Browse Source

[video_core/vic] remove hand-rolled vector implementation that also uses software prefetching (!!!!); don't try to outsmart the compiler - also remove template spam (#2856)

Also removes sse2neon :)
Software prefetching SUCKS and it's evil; don't do it.
Signed-off-by: lizzie <lizzie@eden-emu.dev>

Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/2856
Reviewed-by: crueter <crueter@eden-emu.dev>
Reviewed-by: MaranBr <maranbr@eden-emu.dev>
Reviewed-by: Caio Oliveira <caiooliveirafarias0@gmail.com>
Co-authored-by: lizzie <lizzie@eden-emu.dev>
Co-committed-by: lizzie <lizzie@eden-emu.dev>
pull/2885/head
lizzie 2 months ago
committed by crueter
parent
commit
5c6aaa7eb1
No known key found for this signature in database. GPG Key ID: 425ACD2D4830EBC6
  1. 2
      .ci/license-header.sh
  2. 129
      .patch/sse2neon/0001-Add-support-for-clang-cl-on-Windows-633.patch
  3. 3
      CMakeLists.txt
  4. 7
      externals/CMakeLists.txt
  5. 9
      externals/cpmfile.json
  6. 4
      src/video_core/CMakeLists.txt
  7. 983
      src/video_core/host1x/vic.cpp
  8. 25
      src/video_core/host1x/vic.h

2
.ci/license-header.sh

@ -4,7 +4,7 @@
# SPDX-License-Identifier: GPL-3.0-or-later # SPDX-License-Identifier: GPL-3.0-or-later
# specify full path if dupes may exist # specify full path if dupes may exist
EXCLUDE_FILES="CPM.cmake CPMUtil.cmake GetSCMRev.cmake sse2neon.h renderdoc_app.h tools/cpm tools/shellcheck.sh tools/update-cpm.sh externals/stb externals/glad externals/getopt externals/gamemode externals/FidelityFX-FSR externals/demangle externals/bc_decoder"
EXCLUDE_FILES="CPM.cmake CPMUtil.cmake GetSCMRev.cmake renderdoc_app.h tools/cpm tools/shellcheck.sh tools/update-cpm.sh externals/stb externals/glad externals/getopt externals/gamemode externals/FidelityFX-FSR externals/demangle externals/bc_decoder"
# license header constants, please change when needed :)))) # license header constants, please change when needed :))))
YEAR=2025 YEAR=2025

129
.patch/sse2neon/0001-Add-support-for-clang-cl-on-Windows-633.patch

@ -1,129 +0,0 @@
From d765ebed3598ddfd7167fc546474626ac5ef9498 Mon Sep 17 00:00:00 2001
From: Anthony Roberts <anthony.roberts@linaro.org>
Date: Fri, 2 Aug 2024 16:55:57 +0100
Subject: [PATCH] Add support for clang-cl on Windows (#633)
This commit adds support for clang-cl (clang, pretending to be MSVC) to
SSE2NEON on Windows ARM64 platforms. This change is part of some Blender
work, as using clang-cl provides a ~20-40% speedup compared to MSVC.
Compiled with the following command line (via a VS2022 Native ARM64 Tools
CMD window):
msbuild sse2neon.vcxproj /p:Configuration=Release /p:CLToolExe=clang-cl.exe
/p:CLToolPath="C:\Program Files\LLVM\bin\"
Known failures in test suite:
Test mm_cvttpd_epi32
Test rdtsc
Co-authored-by: Anthony Roberts <anthony.roberts@linaro.org>
---
sse2neon.h | 22 +++++++++++-----------
1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/sse2neon.h b/sse2neon.h
index 56254b5..76cf8e3 100644
--- a/sse2neon.h
+++ b/sse2neon.h
@@ -180,7 +180,7 @@
}
/* Compiler barrier */
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__)
#define SSE2NEON_BARRIER() _ReadWriteBarrier()
#else
#define SSE2NEON_BARRIER() \
@@ -856,7 +856,7 @@ FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
{
poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__)
__n64 a1 = {a}, b1 = {b};
return vreinterpretq_u64_p128(vmull_p64(a1, b1));
#else
@@ -1767,7 +1767,7 @@ FORCE_INLINE void _mm_free(void *addr)
FORCE_INLINE uint64_t _sse2neon_get_fpcr(void)
{
uint64_t value;
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__)
value = _ReadStatusReg(ARM64_FPCR);
#else
__asm__ __volatile__("mrs %0, FPCR" : "=r"(value)); /* read */
@@ -1777,7 +1777,7 @@ FORCE_INLINE uint64_t _sse2neon_get_fpcr(void)
FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value)
{
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__)
_WriteStatusReg(ARM64_FPCR, value);
#else
__asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */
@@ -2246,7 +2246,7 @@ FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
FORCE_INLINE void _mm_prefetch(char const *p, int i)
{
(void) i;
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__)
switch (i) {
case _MM_HINT_NTA:
__prefetch2(p, 1);
@@ -4817,7 +4817,7 @@ FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
FORCE_INLINE void _mm_pause(void)
{
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__)
__isb(_ARM64_BARRIER_SY);
#else
__asm__ __volatile__("isb\n");
@@ -5713,7 +5713,7 @@ FORCE_INLINE __m128d _mm_undefined_pd(void)
#pragma GCC diagnostic ignored "-Wuninitialized"
#endif
__m128d a;
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__)
a = _mm_setzero_pd();
#endif
return a;
@@ -8127,7 +8127,7 @@ FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
FORCE_INLINE int _sse2neon_clz(unsigned int x)
{
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
unsigned long cnt = 0;
if (_BitScanReverse(&cnt, x))
return 31 - cnt;
@@ -8139,7 +8139,7 @@ FORCE_INLINE int _sse2neon_clz(unsigned int x)
FORCE_INLINE int _sse2neon_ctz(unsigned int x)
{
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
unsigned long cnt = 0;
if (_BitScanForward(&cnt, x))
return cnt;
@@ -9055,7 +9055,7 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
// AESE does ShiftRows and SubBytes on A
uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
-#ifndef _MSC_VER
+#if !defined(_MSC_VER) || defined(__clang__)
uint8x16_t dest = {
// Undo ShiftRows step from AESE and extract X1 and X3
u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1)
@@ -9242,7 +9242,7 @@ FORCE_INLINE uint64_t _rdtsc(void)
* bits wide and it is attributed with the flag 'cap_user_time_short'
* is true.
*/
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__)
val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2));
#else
__asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val));
--
2.48.1

3
CMakeLists.txt

@ -613,9 +613,6 @@ find_package(VulkanUtilityLibraries)
find_package(SimpleIni) find_package(SimpleIni)
find_package(SPIRV-Tools) find_package(SPIRV-Tools)
find_package(sirit) find_package(sirit)
if (ARCHITECTURE_arm64)
find_package(sse2neon)
endif()
if (ARCHITECTURE_x86 OR ARCHITECTURE_x86_64) if (ARCHITECTURE_x86 OR ARCHITECTURE_x86_64)
find_package(xbyak) find_package(xbyak)

7
externals/CMakeLists.txt

@ -399,10 +399,3 @@ if (ANDROID)
add_library(oboe::oboe ALIAS oboe) add_library(oboe::oboe ALIAS oboe)
endif() endif()
# sse2neon
if (ARCHITECTURE_arm64 AND NOT TARGET sse2neon)
AddJsonPackage(sse2neon)
add_library(sse2neon INTERFACE)
target_include_directories(sse2neon INTERFACE ${sse2neon_SOURCE_DIR})
endif()

9
externals/cpmfile.json

@ -213,14 +213,5 @@
"key": "steamdeck", "key": "steamdeck",
"bundled": true, "bundled": true,
"skip_updates": "true" "skip_updates": "true"
},
"sse2neon": {
"repo": "DLTcollab/sse2neon",
"sha": "66267b52fd",
"hash": "3aed8676e1b8c428acb076464663e3968a721457b08710a7c5f8df2fbdaa5601053c1606169a55e987e7a58dd17e3cc3b7fbf953aa891c5ac5f8ce2941862e4b",
"download_only": "true",
"patches": [
"0001-Add-support-for-clang-cl-on-Windows-633.patch"
]
} }
} }

4
src/video_core/CMakeLists.txt

@ -398,8 +398,4 @@ if (ANDROID AND ARCHITECTURE_arm64)
target_link_libraries(video_core PRIVATE adrenotools) target_link_libraries(video_core PRIVATE adrenotools)
endif() endif()
if (ARCHITECTURE_arm64)
target_link_libraries(video_core PRIVATE sse2neon)
endif()
create_target_directory_groups(video_core) create_target_directory_groups(video_core)

983
src/video_core/host1x/vic.cpp
File diff suppressed because it is too large
View File

25
src/video_core/host1x/vic.h

@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project // SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later // SPDX-License-Identifier: GPL-2.0-or-later
@ -610,24 +613,12 @@ public:
private: private:
void Execute(); void Execute();
void Blend(const ConfigStruct& config, const SlotStruct& slot); void Blend(const ConfigStruct& config, const SlotStruct& slot);
template <bool Planar, bool Interlaced = false>
void ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span<const PlaneOffsets> offsets,
std::shared_ptr<const FFmpeg::Frame> frame);
template <bool Planar, bool TopField>
void ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span<const PlaneOffsets> offsets,
std::shared_ptr<const FFmpeg::Frame> frame);
template <bool Planar>
void ReadY8__V8U8_N420(const SlotStruct& slot, std::span<const PlaneOffsets> offsets,
std::shared_ptr<const FFmpeg::Frame> frame);
void ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span<const PlaneOffsets> offsets, std::shared_ptr<const FFmpeg::Frame> frame, bool planar, bool interlaced);
void ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span<const PlaneOffsets> offsets, std::shared_ptr<const FFmpeg::Frame> frame, bool planar, bool top_field);
void ReadY8__V8U8_N420(const SlotStruct& slot, std::span<const PlaneOffsets> offsets, std::shared_ptr<const FFmpeg::Frame> frame, bool planar);
void WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config); void WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config);
template <VideoPixelFormat Format>
void WriteABGR(const OutputSurfaceConfig& output_surface_config);
void WriteABGR(const OutputSurfaceConfig& output_surface_config, VideoPixelFormat format);
s32 id; s32 id;
s32 nvdec_id{-1}; s32 nvdec_id{-1};
@ -636,8 +627,6 @@ private:
VicRegisters regs{}; VicRegisters regs{};
FrameQueue& frame_queue; FrameQueue& frame_queue;
const bool has_sse41{false};
Common::ScratchBuffer<Pixel> output_surface; Common::ScratchBuffer<Pixel> output_surface;
Common::ScratchBuffer<Pixel> slot_surface; Common::ScratchBuffer<Pixel> slot_surface;
Common::ScratchBuffer<u8> luma_scratch; Common::ScratchBuffer<u8> luma_scratch;

Loading…
Cancel
Save