Browse Source
[video_core/vic] remove handrolled vector implementation that also uses software prefetching(!!!!); don't try to outsmart compiler - also remove template spam (#2856)
[video_core/vic] remove handrolled vector implementation that also uses software prefetching(!!!!); don't try to outsmart compiler - also remove template spam (#2856)
Also removes sse2neon :) Software prefetching SUCKS and it's evil; don't do it. Signed-off-by: lizzie <lizzie@eden-emu.dev> Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/2856 Reviewed-by: crueter <crueter@eden-emu.dev> Reviewed-by: MaranBr <maranbr@eden-emu.dev> Reviewed-by: Caio Oliveira <caiooliveirafarias0@gmail.com> Co-authored-by: lizzie <lizzie@eden-emu.dev> Co-committed-by: lizzie <lizzie@eden-emu.dev> pull/2885/head
committed by
crueter
No known key found for this signature in database
GPG Key ID: 425ACD2D4830EBC6
8 changed files with 174 additions and 988 deletions
-
2 .ci/license-header.sh
-
129 .patch/sse2neon/0001-Add-support-for-clang-cl-on-Windows-633.patch
-
3 CMakeLists.txt
-
7 externals/CMakeLists.txt
-
9 externals/cpmfile.json
-
4 src/video_core/CMakeLists.txt
-
983 src/video_core/host1x/vic.cpp
-
25 src/video_core/host1x/vic.h
@ -1,129 +0,0 @@ |
|||||
From d765ebed3598ddfd7167fc546474626ac5ef9498 Mon Sep 17 00:00:00 2001 |
|
||||
From: Anthony Roberts <anthony.roberts@linaro.org> |
|
||||
Date: Fri, 2 Aug 2024 16:55:57 +0100 |
|
||||
Subject: [PATCH] Add support for clang-cl on Windows (#633) |
|
||||
|
|
||||
This commit adds support for clang-cl (clang, pretending to be MSVC) to |
|
||||
SSE2NEON on Windows ARM64 platforms. This change is part of some Blender |
|
||||
work, as using clang-cl provides a ~20-40% speedup compared to MSVC. |
|
||||
|
|
||||
Compiled with the following command line (via a VS2022 Native ARM64 Tools |
|
||||
CMD window): |
|
||||
msbuild sse2neon.vcxproj /p:Configuration=Release /p:CLToolExe=clang-cl.exe |
|
||||
/p:CLToolPath="C:\Program Files\LLVM\bin\" |
|
||||
|
|
||||
Known failures in test suite: |
|
||||
Test mm_cvttpd_epi32 |
|
||||
Test rdtsc |
|
||||
|
|
||||
Co-authored-by: Anthony Roberts <anthony.roberts@linaro.org> |
|
||||
---
|
|
||||
sse2neon.h | 22 +++++++++++----------- |
|
||||
1 file changed, 11 insertions(+), 11 deletions(-) |
|
||||
|
|
||||
diff --git a/sse2neon.h b/sse2neon.h
|
|
||||
index 56254b5..76cf8e3 100644
|
|
||||
--- a/sse2neon.h
|
|
||||
+++ b/sse2neon.h
|
|
||||
@@ -180,7 +180,7 @@
|
|
||||
} |
|
||||
|
|
||||
/* Compiler barrier */ |
|
||||
-#if defined(_MSC_VER)
|
|
||||
+#if defined(_MSC_VER) && !defined(__clang__)
|
|
||||
#define SSE2NEON_BARRIER() _ReadWriteBarrier() |
|
||||
#else |
|
||||
#define SSE2NEON_BARRIER() \ |
|
||||
@@ -856,7 +856,7 @@ FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
|
|
||||
{ |
|
||||
poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0); |
|
||||
poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0); |
|
||||
-#if defined(_MSC_VER)
|
|
||||
+#if defined(_MSC_VER) && !defined(__clang__)
|
|
||||
__n64 a1 = {a}, b1 = {b}; |
|
||||
return vreinterpretq_u64_p128(vmull_p64(a1, b1)); |
|
||||
#else |
|
||||
@@ -1767,7 +1767,7 @@ FORCE_INLINE void _mm_free(void *addr)
|
|
||||
FORCE_INLINE uint64_t _sse2neon_get_fpcr(void) |
|
||||
{ |
|
||||
uint64_t value; |
|
||||
-#if defined(_MSC_VER)
|
|
||||
+#if defined(_MSC_VER) && !defined(__clang__)
|
|
||||
value = _ReadStatusReg(ARM64_FPCR); |
|
||||
#else |
|
||||
__asm__ __volatile__("mrs %0, FPCR" : "=r"(value)); /* read */ |
|
||||
@@ -1777,7 +1777,7 @@ FORCE_INLINE uint64_t _sse2neon_get_fpcr(void)
|
|
||||
|
|
||||
FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value) |
|
||||
{ |
|
||||
-#if defined(_MSC_VER)
|
|
||||
+#if defined(_MSC_VER) && !defined(__clang__)
|
|
||||
_WriteStatusReg(ARM64_FPCR, value); |
|
||||
#else |
|
||||
__asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */ |
|
||||
@@ -2246,7 +2246,7 @@ FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
|
|
||||
FORCE_INLINE void _mm_prefetch(char const *p, int i) |
|
||||
{ |
|
||||
(void) i; |
|
||||
-#if defined(_MSC_VER)
|
|
||||
+#if defined(_MSC_VER) && !defined(__clang__)
|
|
||||
switch (i) { |
|
||||
case _MM_HINT_NTA: |
|
||||
__prefetch2(p, 1); |
|
||||
@@ -4817,7 +4817,7 @@ FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
|
|
||||
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause |
|
||||
FORCE_INLINE void _mm_pause(void) |
|
||||
{ |
|
||||
-#if defined(_MSC_VER)
|
|
||||
+#if defined(_MSC_VER) && !defined(__clang__)
|
|
||||
__isb(_ARM64_BARRIER_SY); |
|
||||
#else |
|
||||
__asm__ __volatile__("isb\n"); |
|
||||
@@ -5713,7 +5713,7 @@ FORCE_INLINE __m128d _mm_undefined_pd(void)
|
|
||||
#pragma GCC diagnostic ignored "-Wuninitialized" |
|
||||
#endif |
|
||||
__m128d a; |
|
||||
-#if defined(_MSC_VER)
|
|
||||
+#if defined(_MSC_VER) && !defined(__clang__)
|
|
||||
a = _mm_setzero_pd(); |
|
||||
#endif |
|
||||
return a; |
|
||||
@@ -8127,7 +8127,7 @@ FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
|
|
||||
|
|
||||
FORCE_INLINE int _sse2neon_clz(unsigned int x) |
|
||||
{ |
|
||||
-#ifdef _MSC_VER
|
|
||||
+#if defined(_MSC_VER) && !defined(__clang__)
|
|
||||
unsigned long cnt = 0; |
|
||||
if (_BitScanReverse(&cnt, x)) |
|
||||
return 31 - cnt; |
|
||||
@@ -8139,7 +8139,7 @@ FORCE_INLINE int _sse2neon_clz(unsigned int x)
|
|
||||
|
|
||||
FORCE_INLINE int _sse2neon_ctz(unsigned int x) |
|
||||
{ |
|
||||
-#ifdef _MSC_VER
|
|
||||
+#if defined(_MSC_VER) && !defined(__clang__)
|
|
||||
unsigned long cnt = 0; |
|
||||
if (_BitScanForward(&cnt, x)) |
|
||||
return cnt; |
|
||||
@@ -9055,7 +9055,7 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
|
|
||||
// AESE does ShiftRows and SubBytes on A |
|
||||
uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)); |
|
||||
|
|
||||
-#ifndef _MSC_VER
|
|
||||
+#if !defined(_MSC_VER) || defined(__clang__)
|
|
||||
uint8x16_t dest = { |
|
||||
// Undo ShiftRows step from AESE and extract X1 and X3 |
|
||||
u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1) |
|
||||
@@ -9242,7 +9242,7 @@ FORCE_INLINE uint64_t _rdtsc(void)
|
|
||||
* bits wide and it is attributed with the flag 'cap_user_time_short' |
|
||||
* is true. |
|
||||
*/ |
|
||||
-#if defined(_MSC_VER)
|
|
||||
+#if defined(_MSC_VER) && !defined(__clang__)
|
|
||||
val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2)); |
|
||||
#else |
|
||||
__asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val)); |
|
||||
--
|
|
||||
2.48.1 |
|
||||
|
|
||||
983
src/video_core/host1x/vic.cpp
File diff suppressed because it is too large
View File
File diff suppressed because it is too large
View File
Write
Preview
Loading…
Cancel
Save
Reference in new issue