From 5c6aaa7eb144f0350e53f9ef8fac820ab595d1c7 Mon Sep 17 00:00:00 2001
From: lizzie
Date: Wed, 29 Oct 2025 03:13:24 +0100
Subject: [PATCH] [video_core/vic] remove handrolled vector implementation that
 also uses software prefetching(!!!!); don't try to outsmart compiler - also
 remove template spam (#2856)

Also removes sse2neon :)

Software prefetching SUCKS and it's evil; don't do it.

Signed-off-by: lizzie
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/2856
Reviewed-by: crueter
Reviewed-by: MaranBr
Reviewed-by: Caio Oliveira
Co-authored-by: lizzie
Co-committed-by: lizzie
---
 .ci/license-header.sh                         |   2 +-
 ...-support-for-clang-cl-on-Windows-633.patch | 129 ---
 CMakeLists.txt                                |   3 -
 externals/CMakeLists.txt                      |   7 -
 externals/cpmfile.json                        |   9 -
 src/video_core/CMakeLists.txt                 |   4 -
 src/video_core/host1x/vic.cpp                 | 983 +++---------------
 src/video_core/host1x/vic.h                   |  25 +-
 8 files changed, 174 insertions(+), 988 deletions(-)
 delete mode 100644 .patch/sse2neon/0001-Add-support-for-clang-cl-on-Windows-633.patch

diff --git a/.ci/license-header.sh b/.ci/license-header.sh
index 874f29aa45..f438d59dac 100755
--- a/.ci/license-header.sh
+++ b/.ci/license-header.sh
@@ -4,7 +4,7 @@
 # SPDX-License-Identifier: GPL-3.0-or-later
 
 # specify full path if dupes may exist
-EXCLUDE_FILES="CPM.cmake CPMUtil.cmake GetSCMRev.cmake sse2neon.h renderdoc_app.h tools/cpm tools/shellcheck.sh tools/update-cpm.sh externals/stb externals/glad externals/getopt externals/gamemode externals/FidelityFX-FSR externals/demangle externals/bc_decoder"
+EXCLUDE_FILES="CPM.cmake CPMUtil.cmake GetSCMRev.cmake renderdoc_app.h tools/cpm tools/shellcheck.sh tools/update-cpm.sh externals/stb externals/glad externals/getopt externals/gamemode externals/FidelityFX-FSR externals/demangle externals/bc_decoder"
 
 # license header constants, please change when needed :))))
 YEAR=2025

diff --git a/.patch/sse2neon/0001-Add-support-for-clang-cl-on-Windows-633.patch b/.patch/sse2neon/0001-Add-support-for-clang-cl-on-Windows-633.patch
deleted file mode 100644
index cf86707355..0000000000
--- a/.patch/sse2neon/0001-Add-support-for-clang-cl-on-Windows-633.patch
+++ /dev/null
@@ -1,129 +0,0 @@
-From d765ebed3598ddfd7167fc546474626ac5ef9498 Mon Sep 17 00:00:00 2001
-From: Anthony Roberts
-Date: Fri, 2 Aug 2024 16:55:57 +0100
-Subject: [PATCH] Add support for clang-cl on Windows (#633)
-
-This commit adds support for clang-cl (clang, pretending to be MSVC) to
-SSE2NEON on Windows ARM64 platforms. This change is part of some Blender
-work, as using clang-cl provides a ~20-40% speedup compared to MSVC.
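A minimal sketch of the guard idiom this vendored patch applied, for context:
clang-cl defines both _MSC_VER and __clang__, so every MSVC-only intrinsic
path in the hunks below is narrowed from "MSVC" to "MSVC and not clang",
leaving clang-cl on the GNU-style inline-asm branch. The else-branch here is
abbreviated from the real header:

    #if defined(_MSC_VER) && !defined(__clang__)
    #define SSE2NEON_BARRIER() _ReadWriteBarrier()  /* MSVC compiler barrier */
    #else
    #define SSE2NEON_BARRIER()                     \
        do {                                       \
            __asm__ __volatile__("" ::: "memory"); \
        } while (0)
    #endif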
- -Compiled with the following command line (via a VS2022 Native ARM64 Tools -CMD window): - msbuild sse2neon.vcxproj /p:Configuration=Release /p:CLToolExe=clang-cl.exe - /p:CLToolPath="C:\Program Files\LLVM\bin\" - -Known failures in test suite: - Test mm_cvttpd_epi32 - Test rdtsc - -Co-authored-by: Anthony Roberts ---- - sse2neon.h | 22 +++++++++++----------- - 1 file changed, 11 insertions(+), 11 deletions(-) - -diff --git a/sse2neon.h b/sse2neon.h -index 56254b5..76cf8e3 100644 ---- a/sse2neon.h -+++ b/sse2neon.h -@@ -180,7 +180,7 @@ - } - - /* Compiler barrier */ --#if defined(_MSC_VER) -+#if defined(_MSC_VER) && !defined(__clang__) - #define SSE2NEON_BARRIER() _ReadWriteBarrier() - #else - #define SSE2NEON_BARRIER() \ -@@ -856,7 +856,7 @@ FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) - { - poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0); - poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0); --#if defined(_MSC_VER) -+#if defined(_MSC_VER) && !defined(__clang__) - __n64 a1 = {a}, b1 = {b}; - return vreinterpretq_u64_p128(vmull_p64(a1, b1)); - #else -@@ -1767,7 +1767,7 @@ FORCE_INLINE void _mm_free(void *addr) - FORCE_INLINE uint64_t _sse2neon_get_fpcr(void) - { - uint64_t value; --#if defined(_MSC_VER) -+#if defined(_MSC_VER) && !defined(__clang__) - value = _ReadStatusReg(ARM64_FPCR); - #else - __asm__ __volatile__("mrs %0, FPCR" : "=r"(value)); /* read */ -@@ -1777,7 +1777,7 @@ FORCE_INLINE uint64_t _sse2neon_get_fpcr(void) - - FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value) - { --#if defined(_MSC_VER) -+#if defined(_MSC_VER) && !defined(__clang__) - _WriteStatusReg(ARM64_FPCR, value); - #else - __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */ -@@ -2246,7 +2246,7 @@ FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) - FORCE_INLINE void _mm_prefetch(char const *p, int i) - { - (void) i; --#if defined(_MSC_VER) -+#if defined(_MSC_VER) && !defined(__clang__) - switch (i) { - case _MM_HINT_NTA: - __prefetch2(p, 1); -@@ -4817,7 +4817,7 @@ FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) - // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause - FORCE_INLINE void _mm_pause(void) - { --#if defined(_MSC_VER) -+#if defined(_MSC_VER) && !defined(__clang__) - __isb(_ARM64_BARRIER_SY); - #else - __asm__ __volatile__("isb\n"); -@@ -5713,7 +5713,7 @@ FORCE_INLINE __m128d _mm_undefined_pd(void) - #pragma GCC diagnostic ignored "-Wuninitialized" - #endif - __m128d a; --#if defined(_MSC_VER) -+#if defined(_MSC_VER) && !defined(__clang__) - a = _mm_setzero_pd(); - #endif - return a; -@@ -8127,7 +8127,7 @@ FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound) - - FORCE_INLINE int _sse2neon_clz(unsigned int x) - { --#ifdef _MSC_VER -+#if defined(_MSC_VER) && !defined(__clang__) - unsigned long cnt = 0; - if (_BitScanReverse(&cnt, x)) - return 31 - cnt; -@@ -8139,7 +8139,7 @@ FORCE_INLINE int _sse2neon_clz(unsigned int x) - - FORCE_INLINE int _sse2neon_ctz(unsigned int x) - { --#ifdef _MSC_VER -+#if defined(_MSC_VER) && !defined(__clang__) - unsigned long cnt = 0; - if (_BitScanForward(&cnt, x)) - return cnt; -@@ -9055,7 +9055,7 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) - // AESE does ShiftRows and SubBytes on A - uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)); - --#ifndef _MSC_VER -+#if !defined(_MSC_VER) || defined(__clang__) - uint8x16_t dest = { - // Undo ShiftRows step from AESE and extract X1 and X3 - 
u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1) -@@ -9242,7 +9242,7 @@ FORCE_INLINE uint64_t _rdtsc(void) - * bits wide and it is attributed with the flag 'cap_user_time_short' - * is true. - */ --#if defined(_MSC_VER) -+#if defined(_MSC_VER) && !defined(__clang__) - val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2)); - #else - __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val)); --- -2.48.1 - diff --git a/CMakeLists.txt b/CMakeLists.txt index d29898b819..dda6979911 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -613,9 +613,6 @@ find_package(VulkanUtilityLibraries) find_package(SimpleIni) find_package(SPIRV-Tools) find_package(sirit) -if (ARCHITECTURE_arm64) - find_package(sse2neon) -endif() if (ARCHITECTURE_x86 OR ARCHITECTURE_x86_64) find_package(xbyak) diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt index 8cc04e99a2..096760925f 100644 --- a/externals/CMakeLists.txt +++ b/externals/CMakeLists.txt @@ -399,10 +399,3 @@ if (ANDROID) add_library(oboe::oboe ALIAS oboe) endif() - -# sse2neon -if (ARCHITECTURE_arm64 AND NOT TARGET sse2neon) - AddJsonPackage(sse2neon) - add_library(sse2neon INTERFACE) - target_include_directories(sse2neon INTERFACE ${sse2neon_SOURCE_DIR}) -endif() diff --git a/externals/cpmfile.json b/externals/cpmfile.json index e1eb15fad3..73cdf3e305 100644 --- a/externals/cpmfile.json +++ b/externals/cpmfile.json @@ -213,14 +213,5 @@ "key": "steamdeck", "bundled": true, "skip_updates": "true" - }, - "sse2neon": { - "repo": "DLTcollab/sse2neon", - "sha": "66267b52fd", - "hash": "3aed8676e1b8c428acb076464663e3968a721457b08710a7c5f8df2fbdaa5601053c1606169a55e987e7a58dd17e3cc3b7fbf953aa891c5ac5f8ce2941862e4b", - "download_only": "true", - "patches": [ - "0001-Add-support-for-clang-cl-on-Windows-633.patch" - ] } } diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 444e0461f1..db99c2bcb9 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -398,8 +398,4 @@ if (ANDROID AND ARCHITECTURE_arm64) target_link_libraries(video_core PRIVATE adrenotools) endif() -if (ARCHITECTURE_arm64) - target_link_libraries(video_core PRIVATE sse2neon) -endif() - create_target_directory_groups(video_core) diff --git a/src/video_core/host1x/vic.cpp b/src/video_core/host1x/vic.cpp index 3dbbfa5552..21cf5f4e92 100644 --- a/src/video_core/host1x/vic.cpp +++ b/src/video_core/host1x/vic.cpp @@ -8,21 +8,6 @@ #include #include -#if defined(ARCHITECTURE_x86_64) -#if defined(_MSC_VER) -#include -#else -#include -#endif -#elif defined(ARCHITECTURE_arm64) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wimplicit-int-conversion" -#pragma GCC diagnostic ignored "-Wconversion" -#pragma GCC diagnostic ignored "-Wshadow" -#include -#pragma GCC diagnostic pop -#endif - extern "C" { #if defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic push @@ -55,14 +40,6 @@ extern "C" { namespace Tegra::Host1x { namespace { -static bool HasSSE41() { -#if defined(ARCHITECTURE_x86_64) - const auto& cpu_caps{Common::GetCPUCaps()}; - return cpu_caps.sse4_1; -#else - return false; -#endif -} void SwizzleSurface(std::span output, u32 out_stride, std::span input, u32 in_stride, u32 height) { @@ -100,7 +77,7 @@ void SwizzleSurface(std::span output, u32 out_stride, std::span in Vic::Vic(Host1x& host1x_, s32 id_, u32 syncpt, FrameQueue& frame_queue_) : CDmaPusher{host1x_, id_}, id{id_}, syncpoint{syncpt}, - frame_queue{frame_queue_}, has_sse41{HasSSE41()} { + frame_queue{frame_queue_} { LOG_INFO(HW_GPU, "Created vic {}", id); } @@ 
-110,7 +87,7 @@ Vic::~Vic() { } void Vic::ProcessMethod(u32 method, u32 arg) { - LOG_TRACE(HW_GPU, "Vic {} method {:#X}", id, static_cast(method)); + LOG_TRACE(HW_GPU, "Vic {} method {:#X}", id, u32(method)); regs.reg_array[method] = arg; switch (static_cast(method * sizeof(u32))) { @@ -145,69 +122,57 @@ void Vic::Execute() { nvdec_id = frame_queue.VicFindNvdecFdFromOffset(luma_offset); } - auto frame = frame_queue.GetFrame(nvdec_id, luma_offset); - - if (!frame) { - continue; - } - - if (!frame.get()) { - LOG_ERROR(HW_GPU, "Vic {} failed to get frame with offset {:#X}", id, luma_offset); - continue; - } - - switch (frame->GetPixelFormat()) { - case AV_PIX_FMT_YUV420P: - ReadY8__V8U8_N420(slot_config, regs.surfaces[i], std::move(frame)); - break; - case AV_PIX_FMT_NV12: - ReadY8__V8U8_N420(slot_config, regs.surfaces[i], std::move(frame)); - break; - default: - UNIMPLEMENTED_MSG( - "Unimplemented slot pixel format {}", - static_cast(slot_config.surface_config.slot_pixel_format.Value())); - break; + if (auto frame = frame_queue.GetFrame(nvdec_id, luma_offset); frame) { + if (frame.get()) { + switch (frame->GetPixelFormat()) { + case AV_PIX_FMT_YUV420P: + ReadY8__V8U8_N420(slot_config, regs.surfaces[i], std::move(frame), true); + break; + case AV_PIX_FMT_NV12: + ReadY8__V8U8_N420(slot_config, regs.surfaces[i], std::move(frame), false); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented slot pixel format {}", u32(slot_config.surface_config.slot_pixel_format.Value())); + break; + } + Blend(config, slot_config); + } else { + LOG_ERROR(HW_GPU, "Vic {} failed to get frame with offset {:#X}", id, luma_offset); + } } - - Blend(config, slot_config); } } switch (config.output_surface_config.out_pixel_format) { case VideoPixelFormat::A8B8G8R8: case VideoPixelFormat::X8B8G8R8: - WriteABGR(config.output_surface_config); + WriteABGR(config.output_surface_config, VideoPixelFormat::A8B8G8R8); break; case VideoPixelFormat::A8R8G8B8: - WriteABGR(config.output_surface_config); + WriteABGR(config.output_surface_config, VideoPixelFormat::A8R8G8B8); break; case VideoPixelFormat::Y8__V8U8_N420: WriteY8__V8U8_N420(config.output_surface_config); break; default: - UNIMPLEMENTED_MSG("Unknown video pixel format {}", - config.output_surface_config.out_pixel_format.Value()); + UNIMPLEMENTED_MSG("Unknown video pixel format {}", config.output_surface_config.out_pixel_format.Value()); break; } } -template -void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, - std::span offsets, - std::shared_ptr frame) { +void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span offsets, std::shared_ptr frame, bool planar, bool interlaced) { const auto out_luma_width{slot.surface_config.slot_surface_width + 1}; auto out_luma_height{slot.surface_config.slot_surface_height + 1}; const auto out_luma_stride{out_luma_width}; - if constexpr (Interlaced) { + if(interlaced) { out_luma_height *= 2; } slot_surface.resize_destructive(out_luma_width * out_luma_height); - const auto in_luma_width{(std::min)(frame->GetWidth(), static_cast(out_luma_width))}; - const auto in_luma_height{(std::min)(frame->GetHeight(), static_cast(out_luma_height))}; + const auto in_luma_width{(std::min)(frame->GetWidth(), s32(out_luma_width))}; + const auto in_luma_height{(std::min)(frame->GetHeight(), s32(out_luma_height))}; const auto in_luma_stride{frame->GetStride(0)}; const auto in_chroma_stride{frame->GetStride(1)}; @@ -224,204 +189,29 @@ void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, in_chroma_stride, out_luma_width, 
out_luma_height, out_luma_stride, out_luma_width, out_luma_height, out_luma_stride); - [[maybe_unused]] auto DecodeLinear = [&]() { - const auto alpha{static_cast(slot.config.planar_alpha.Value())}; - - for (s32 y = 0; y < in_luma_height; y++) { - const auto src_luma{y * in_luma_stride}; - const auto src_chroma{(y / 2) * in_chroma_stride}; - const auto dst{y * out_luma_stride}; - for (s32 x = 0; x < in_luma_width; x++) { - slot_surface[dst + x].r = static_cast(luma_buffer[src_luma + x] << 2); - // Chroma samples are duplicated horizontally and vertically. - if constexpr (Planar) { - slot_surface[dst + x].g = - static_cast(chroma_u_buffer[src_chroma + x / 2] << 2); - slot_surface[dst + x].b = - static_cast(chroma_v_buffer[src_chroma + x / 2] << 2); - } else { - slot_surface[dst + x].g = - static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2); - slot_surface[dst + x].b = - static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2); - } - slot_surface[dst + x].a = alpha; - } - } - }; - -#if defined(ARCHITECTURE_x86_64) - if (!has_sse41) { - DecodeLinear(); - return; - } -#endif - -#if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64) - const auto alpha_linear{static_cast(slot.config.planar_alpha.Value())}; - const auto alpha = - _mm_slli_epi64(_mm_set1_epi64x(static_cast(slot.config.planar_alpha.Value())), 48); - - const auto shuffle_mask = _mm_set_epi8(13, 15, 14, 12, 9, 11, 10, 8, 5, 7, 6, 4, 1, 3, 2, 0); - const auto sse_aligned_width = Common::AlignDown(in_luma_width, 16); - + const auto alpha{u16(slot.config.planar_alpha.Value())}; for (s32 y = 0; y < in_luma_height; y++) { const auto src_luma{y * in_luma_stride}; const auto src_chroma{(y / 2) * in_chroma_stride}; const auto dst{y * out_luma_stride}; - s32 x = 0; - for (; x < sse_aligned_width; x += 16) { - // clang-format off - // Prefetch next iteration's memory - _mm_prefetch((const char*)&luma_buffer[src_luma + x + 16], _MM_HINT_T0); - - // Load 8 bytes * 2 of 8-bit luma samples - // luma0 = 00 00 00 00 00 00 00 00 LL LL LL LL LL LL LL LL - auto luma0 = _mm_loadl_epi64((__m128i*)&luma_buffer[src_luma + x + 0]); - auto luma1 = _mm_loadl_epi64((__m128i*)&luma_buffer[src_luma + x + 8]); - - __m128i chroma; - - if constexpr (Planar) { - _mm_prefetch((const char*)&chroma_u_buffer[src_chroma + x / 2 + 8], _MM_HINT_T0); - _mm_prefetch((const char*)&chroma_v_buffer[src_chroma + x / 2 + 8], _MM_HINT_T0); - - // If Chroma is planar, we have separate U and V planes, load 8 bytes of each - // chroma_u0 = 00 00 00 00 00 00 00 00 UU UU UU UU UU UU UU UU - // chroma_v0 = 00 00 00 00 00 00 00 00 VV VV VV VV VV VV VV VV - auto chroma_u0 = _mm_loadl_epi64((__m128i*)&chroma_u_buffer[src_chroma + x / 2]); - auto chroma_v0 = _mm_loadl_epi64((__m128i*)&chroma_v_buffer[src_chroma + x / 2]); - - // Interleave the 8 bytes of U and V into a single 16 byte reg - // chroma = VV UU VV UU VV UU VV UU VV UU VV UU VV UU VV UU - chroma = _mm_unpacklo_epi8(chroma_u0, chroma_v0); - } else { - _mm_prefetch((const char*)&chroma_u_buffer[src_chroma + x / 2 + 8], _MM_HINT_T0); - - // Chroma is already interleaved in semiplanar format, just load 16 bytes - // chroma = VV UU VV UU VV UU VV UU VV UU VV UU VV UU VV UU - chroma = _mm_load_si128((__m128i*)&chroma_u_buffer[src_chroma + x]); - } - - // Convert the low 8 bytes of 8-bit luma into 16-bit luma - // luma0 = [00] [00] [00] [00] [00] [00] [00] [00] [LL] [LL] [LL] [LL] [LL] [LL] [LL] [LL] - // -> - // luma0 = [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] - luma0 = 
_mm_cvtepu8_epi16(luma0); - luma1 = _mm_cvtepu8_epi16(luma1); - - // Treat the 8 bytes of 8-bit chroma as 16-bit channels, this allows us to take both the - // U and V together as one element. Using chroma twice here duplicates the values, as we - // take element 0 from chroma, and then element 0 from chroma again, etc. We need to - // duplicate chroma horitonally as chroma is half the width of luma. - // chroma = [VV8 UU8] [VV7 UU7] [VV6 UU6] [VV5 UU5] [VV4 UU4] [VV3 UU3] [VV2 UU2] [VV1 UU1] - // -> - // chroma00 = [VV4 UU4] [VV4 UU4] [VV3 UU3] [VV3 UU3] [VV2 UU2] [VV2 UU2] [VV1 UU1] [VV1 UU1] - // chroma01 = [VV8 UU8] [VV8 UU8] [VV7 UU7] [VV7 UU7] [VV6 UU6] [VV6 UU6] [VV5 UU5] [VV5 UU5] - auto chroma00 = _mm_unpacklo_epi16(chroma, chroma); - auto chroma01 = _mm_unpackhi_epi16(chroma, chroma); - - // Interleave the 16-bit luma and chroma. - // luma0 = [008 LL8] [007 LL7] [006 LL6] [005 LL5] [004 LL4] [003 LL3] [002 LL2] [001 LL1] - // chroma00 = [VV8 UU8] [VV7 UU7] [VV6 UU6] [VV5 UU5] [VV4 UU4] [VV3 UU3] [VV2 UU2] [VV1 UU1] - // -> - // yuv0 = [VV4 UU4 004 LL4] [VV3 UU3 003 LL3] [VV2 UU2 002 LL2] [VV1 UU1 001 LL1] - // yuv1 = [VV8 UU8 008 LL8] [VV7 UU7 007 LL7] [VV6 UU6 006 LL6] [VV5 UU5 005 LL5] - auto yuv0 = _mm_unpacklo_epi16(luma0, chroma00); - auto yuv1 = _mm_unpackhi_epi16(luma0, chroma00); - auto yuv2 = _mm_unpacklo_epi16(luma1, chroma01); - auto yuv3 = _mm_unpackhi_epi16(luma1, chroma01); - - // Shuffle the luma/chroma into the channel ordering we actually want. The high byte of - // the luma which is now a constant 0 after converting 8-bit -> 16-bit is used as the - // alpha. Luma -> R, U -> G, V -> B, 0 -> A - // yuv0 = [VV4 UU4 004 LL4] [VV3 UU3 003 LL3] [VV2 UU2 002 LL2] [VV1 UU1 001 LL1] - // -> - // yuv0 = [AA4 VV4 UU4 LL4] [AA3 VV3 UU3 LL3] [AA2 VV2 UU2 LL2] [AA1 VV1 UU1 LL1] - yuv0 = _mm_shuffle_epi8(yuv0, shuffle_mask); - yuv1 = _mm_shuffle_epi8(yuv1, shuffle_mask); - yuv2 = _mm_shuffle_epi8(yuv2, shuffle_mask); - yuv3 = _mm_shuffle_epi8(yuv3, shuffle_mask); - - // Extend the 8-bit channels we have into 16-bits, as that's the target surface format. - // Since this turns just the low 8 bytes into 16 bytes, the second of - // each operation here right shifts the register by 8 to get the high pixels. - // yuv0 = [AA4] [VV4] [UU4] [LL4] [AA3] [VV3] [UU3] [LL3] [AA2] [VV2] [UU2] [LL2] [AA1] [VV1] [UU1] [LL1] - // -> - // yuv01 = [002 AA2] [002 VV2] [002 UU2] [002 LL2] [001 AA1] [001 VV1] [001 UU1] [001 LL1] - // yuv23 = [004 AA4] [004 VV4] [004 UU4] [004 LL4] [003 AA3] [003 VV3] ]003 UU3] [003 LL3] - auto yuv01 = _mm_cvtepu8_epi16(yuv0); - auto yuv23 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv0, 8)); - auto yuv45 = _mm_cvtepu8_epi16(yuv1); - auto yuv67 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv1, 8)); - auto yuv89 = _mm_cvtepu8_epi16(yuv2); - auto yuv1011 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv2, 8)); - auto yuv1213 = _mm_cvtepu8_epi16(yuv3); - auto yuv1415 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv3, 8)); - - // Left-shift all 16-bit channels by 2, this is to get us into a 10-bit format instead - // of 8, which is the format alpha is in, as well as other blending values. 
- yuv01 = _mm_slli_epi16(yuv01, 2); - yuv23 = _mm_slli_epi16(yuv23, 2); - yuv45 = _mm_slli_epi16(yuv45, 2); - yuv67 = _mm_slli_epi16(yuv67, 2); - yuv89 = _mm_slli_epi16(yuv89, 2); - yuv1011 = _mm_slli_epi16(yuv1011, 2); - yuv1213 = _mm_slli_epi16(yuv1213, 2); - yuv1415 = _mm_slli_epi16(yuv1415, 2); - - // OR in the planar alpha, this has already been duplicated and shifted into position, - // and just fills in the AA channels with the actual alpha value. - yuv01 = _mm_or_si128(yuv01, alpha); - yuv23 = _mm_or_si128(yuv23, alpha); - yuv45 = _mm_or_si128(yuv45, alpha); - yuv67 = _mm_or_si128(yuv67, alpha); - yuv89 = _mm_or_si128(yuv89, alpha); - yuv1011 = _mm_or_si128(yuv1011, alpha); - yuv1213 = _mm_or_si128(yuv1213, alpha); - yuv1415 = _mm_or_si128(yuv1415, alpha); - - // Store out the pixels. One pixel is now 8 bytes, so each store is 2 pixels. - // [AA AA] [VV VV] [UU UU] [LL LL] [AA AA] [VV VV] [UU UU] [LL LL] - _mm_store_si128((__m128i*)&slot_surface[dst + x + 0], yuv01); - _mm_store_si128((__m128i*)&slot_surface[dst + x + 2], yuv23); - _mm_store_si128((__m128i*)&slot_surface[dst + x + 4], yuv45); - _mm_store_si128((__m128i*)&slot_surface[dst + x + 6], yuv67); - _mm_store_si128((__m128i*)&slot_surface[dst + x + 8], yuv89); - _mm_store_si128((__m128i*)&slot_surface[dst + x + 10], yuv1011); - _mm_store_si128((__m128i*)&slot_surface[dst + x + 12], yuv1213); - _mm_store_si128((__m128i*)&slot_surface[dst + x + 14], yuv1415); - - // clang-format on - } - - for (; x < in_luma_width; x++) { - slot_surface[dst + x].r = static_cast(luma_buffer[src_luma + x] << 2); + for (s32 x = 0; x < in_luma_width; x++) { + slot_surface[dst + x].r = u16(luma_buffer[src_luma + x] << 2); // Chroma samples are duplicated horizontally and vertically. - if constexpr (Planar) { - slot_surface[dst + x].g = - static_cast(chroma_u_buffer[src_chroma + x / 2] << 2); - slot_surface[dst + x].b = - static_cast(chroma_v_buffer[src_chroma + x / 2] << 2); + if(planar) { + slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + x / 2] << 2); + slot_surface[dst + x].b = u16(chroma_v_buffer[src_chroma + x / 2] << 2); } else { - slot_surface[dst + x].g = - static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2); - slot_surface[dst + x].b = - static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2); + slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2); + slot_surface[dst + x].b = u16(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2); } - slot_surface[dst + x].a = alpha_linear; + slot_surface[dst + x].a = alpha; } } -#else - DecodeLinear(); -#endif } -template -void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span offsets, - std::shared_ptr frame) { - if constexpr (!Planar) { - ReadProgressiveY8__V8U8_N420(slot, offsets, std::move(frame)); +void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span offsets, std::shared_ptr frame, bool planar, bool top_field) { + if(!planar) { + ReadProgressiveY8__V8U8_N420(slot, offsets, std::move(frame), planar, true); return; } const auto out_luma_width{slot.surface_config.slot_surface_width + 1}; @@ -430,9 +220,9 @@ void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::spanGetWidth(), static_cast(out_luma_width))}; + const auto in_luma_width{(std::min)(frame->GetWidth(), s32(out_luma_width))}; [[maybe_unused]] const auto in_luma_height{ - (std::min)(frame->GetHeight(), static_cast(out_luma_height))}; + (std::min)(frame->GetHeight(), s32(out_luma_height))}; const auto in_luma_stride{frame->GetStride(0)}; 
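// Aside: a standalone sketch of the scalar read path the patch settles on
// (type and function names here are illustrative, not the emulator's). Each
// 8-bit Y/U/V sample is widened into the 10-bit working range by << 2, and
// the half-resolution 4:2:0 chroma is duplicated across each pair of luma
// columns - a shape compilers can generally auto-vectorize on their own.
#include <cstdint>
struct Pixel16 { std::uint16_t r, g, b, a; };  // Y -> r, U -> g, V -> b
void DecodeRow(Pixel16* dst, const std::uint8_t* luma, const std::uint8_t* cu,
               const std::uint8_t* cv, int width, bool planar, std::uint16_t alpha) {
    for (int x = 0; x < width; x++) {
        dst[x].r = std::uint16_t(luma[x] << 2);              // 8-bit -> 10-bit
        if (planar) {                                        // YUV420P: split U/V planes
            dst[x].g = std::uint16_t(cu[x / 2] << 2);
            dst[x].b = std::uint16_t(cv[x / 2] << 2);
        } else {                                             // NV12: interleaved UV
            dst[x].g = std::uint16_t(cu[(x & ~1) + 0] << 2);
            dst[x].b = std::uint16_t(cu[(x & ~1) + 1] << 2);
        }
        dst[x].a = alpha;                                    // planar alpha (10-bit)
    }
}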
[[maybe_unused]] const auto in_chroma_width{(frame->GetWidth() + 1) / 2}; @@ -451,81 +241,62 @@ void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span(slot.config.planar_alpha.Value())}; - - for (s32 y = static_cast(TopField == false); y < in_chroma_height * 2; y += 2) { - const auto src_luma{y * in_luma_stride}; - const auto src_chroma{(y / 2) * in_chroma_stride}; - const auto dst{y * out_luma_stride}; - for (s32 x = 0; x < in_luma_width; x++) { - slot_surface[dst + x].r = static_cast(luma_buffer[src_luma + x] << 2); - if constexpr (Planar) { - slot_surface[dst + x].g = - static_cast(chroma_u_buffer[src_chroma + x / 2] << 2); - slot_surface[dst + x].b = - static_cast(chroma_v_buffer[src_chroma + x / 2] << 2); - } else { - slot_surface[dst + x].g = - static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2); - slot_surface[dst + x].b = - static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2); - } - slot_surface[dst + x].a = alpha; - } - - s32 other_line{}; - if constexpr (TopField) { - other_line = (y + 1) * out_luma_stride; + auto DecodeBobField = [&]() { + const auto alpha{u16(slot.config.planar_alpha.Value())}; + for (s32 y = s32(top_field == false); y < in_chroma_height * 2; y += 2) { + const auto src_luma{y * in_luma_stride}; + const auto src_chroma{(y / 2) * in_chroma_stride}; + const auto dst{y * out_luma_stride}; + for (s32 x = 0; x < in_luma_width; x++) { + slot_surface[dst + x].r = u16(luma_buffer[src_luma + x] << 2); + if(planar) { + slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + x / 2] << 2); + slot_surface[dst + x].b = u16(chroma_v_buffer[src_chroma + x / 2] << 2); } else { - other_line = (y - 1) * out_luma_stride; + slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2); + slot_surface[dst + x].b = u16(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2); } - std::memcpy(&slot_surface[other_line], &slot_surface[dst], - out_luma_width * sizeof(Pixel)); + slot_surface[dst + x].a = alpha; } - }; - - switch (slot.config.deinterlace_mode) { - case DXVAHD_DEINTERLACE_MODE_PRIVATE::WEAVE: - // Due to the fact that we do not write to memory in nvdec, we cannot use Weave as it - // relies on the previous frame. - DecodeBobField(); - break; - case DXVAHD_DEINTERLACE_MODE_PRIVATE::BOB_FIELD: - DecodeBobField(); - break; - case DXVAHD_DEINTERLACE_MODE_PRIVATE::DISI1: - // Due to the fact that we do not write to memory in nvdec, we cannot use DISI1 as it - // relies on previous/next frames. - DecodeBobField(); - break; - default: - UNIMPLEMENTED_MSG("Deinterlace mode {} not implemented!", - static_cast(slot.config.deinterlace_mode.Value())); - break; + s32 other_line = (top_field ? y + 1 : y - 1) * out_luma_stride; + std::memcpy(&slot_surface[other_line], &slot_surface[dst], out_luma_width * sizeof(Pixel)); } }; - DecodeLinear(); + switch (slot.config.deinterlace_mode) { + case DXVAHD_DEINTERLACE_MODE_PRIVATE::WEAVE: + // Due to the fact that we do not write to memory in nvdec, we cannot use Weave as it + // relies on the previous frame. + DecodeBobField(); + break; + case DXVAHD_DEINTERLACE_MODE_PRIVATE::BOB_FIELD: + DecodeBobField(); + break; + case DXVAHD_DEINTERLACE_MODE_PRIVATE::DISI1: + // Due to the fact that we do not write to memory in nvdec, we cannot use DISI1 as it + // relies on previous/next frames. 
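// Aside: bob deinterlacing as DecodeBobField above implements it, reduced to
// a standalone sketch (helper names are illustrative). The source carries
// only the lines of the current field, so each missing line is filled by
// copying its decoded neighbour: y + 1 for a top field, y - 1 for a bottom
// field. Assumes an even height so the copied neighbour stays in bounds.
#include <cstring>
template <typename Pixel, typename DecodeLineFn>
void BobDeinterlace(Pixel* dst, int width, int height, int stride,
                    bool top_field, DecodeLineFn&& DecodeLine) {
    for (int y = top_field ? 0 : 1; y < height; y += 2) {
        DecodeLine(dst + y * stride, y);              // decode one field line
        const int other = top_field ? y + 1 : y - 1;  // row of the absent field
        std::memcpy(dst + other * stride, dst + y * stride, width * sizeof(Pixel));
    }
}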
+ DecodeBobField(); + break; + default: + UNIMPLEMENTED_MSG("Deinterlace mode {} not implemented!", s32(slot.config.deinterlace_mode.Value())); + break; + } } -template -void Vic::ReadY8__V8U8_N420(const SlotStruct& slot, std::span offsets, - std::shared_ptr frame) { +void Vic::ReadY8__V8U8_N420(const SlotStruct& slot, std::span offsets, std::shared_ptr frame, bool planar) { switch (slot.config.frame_format) { case DXVAHD_FRAME_FORMAT::PROGRESSIVE: - ReadProgressiveY8__V8U8_N420(slot, offsets, std::move(frame)); + ReadProgressiveY8__V8U8_N420(slot, offsets, std::move(frame), planar, false); break; case DXVAHD_FRAME_FORMAT::TOP_FIELD: - ReadInterlacedY8__V8U8_N420(slot, offsets, std::move(frame)); + ReadInterlacedY8__V8U8_N420(slot, offsets, std::move(frame), planar, true); break; case DXVAHD_FRAME_FORMAT::BOTTOM_FIELD: - ReadInterlacedY8__V8U8_N420(slot, offsets, std::move(frame)); + ReadInterlacedY8__V8U8_N420(slot, offsets, std::move(frame), planar, false); break; default: LOG_ERROR(HW_GPU, "Unknown deinterlace format {}", - static_cast(slot.config.frame_format.Value())); + s32(slot.config.frame_format.Value())); break; } } @@ -533,15 +304,15 @@ void Vic::ReadY8__V8U8_N420(const SlotStruct& slot, std::span u32 { return v != 0 ? v + 1 : 0; }); - auto source_left{add_one(static_cast(slot.config.source_rect_left.Value()))}; - auto source_right{add_one(static_cast(slot.config.source_rect_right.Value()))}; - auto source_top{add_one(static_cast(slot.config.source_rect_top.Value()))}; - auto source_bottom{add_one(static_cast(slot.config.source_rect_bottom.Value()))}; + auto source_left{add_one(u32(slot.config.source_rect_left.Value()))}; + auto source_right{add_one(u32(slot.config.source_rect_right.Value()))}; + auto source_top{add_one(u32(slot.config.source_rect_top.Value()))}; + auto source_bottom{add_one(u32(slot.config.source_rect_bottom.Value()))}; - const auto dest_left{add_one(static_cast(slot.config.dest_rect_left.Value()))}; - const auto dest_right{add_one(static_cast(slot.config.dest_rect_right.Value()))}; - const auto dest_top{add_one(static_cast(slot.config.dest_rect_top.Value()))}; - const auto dest_bottom{add_one(static_cast(slot.config.dest_rect_bottom.Value()))}; + const auto dest_left{add_one(u32(slot.config.dest_rect_left.Value()))}; + const auto dest_right{add_one(u32(slot.config.dest_rect_right.Value()))}; + const auto dest_top{add_one(u32(slot.config.dest_rect_top.Value()))}; + const auto dest_bottom{add_one(u32(slot.config.dest_rect_bottom.Value()))}; auto rect_left{add_one(config.output_config.target_rect_left.Value())}; auto rect_right{add_one(config.output_config.target_rect_right.Value())}; @@ -589,220 +360,59 @@ void Vic::Blend(const ConfigStruct& config, const SlotStruct& slot) { // | r1c0 r1c1 r1c2 r1c3 | * | G | = | G | // | r2c0 r2c1 r2c2 r2c3 | | B | | B | // | 1 | - // clang-format on - - [[maybe_unused]] auto DecodeLinear = [&]() { - const auto r0c0 = static_cast(slot.color_matrix.matrix_coeff00.Value()); - const auto r0c1 = static_cast(slot.color_matrix.matrix_coeff01.Value()); - const auto r0c2 = static_cast(slot.color_matrix.matrix_coeff02.Value()); - const auto r0c3 = static_cast(slot.color_matrix.matrix_coeff03.Value()); - const auto r1c0 = static_cast(slot.color_matrix.matrix_coeff10.Value()); - const auto r1c1 = static_cast(slot.color_matrix.matrix_coeff11.Value()); - const auto r1c2 = static_cast(slot.color_matrix.matrix_coeff12.Value()); - const auto r1c3 = static_cast(slot.color_matrix.matrix_coeff13.Value()); - const auto r2c0 = 
static_cast(slot.color_matrix.matrix_coeff20.Value()); - const auto r2c1 = static_cast(slot.color_matrix.matrix_coeff21.Value()); - const auto r2c2 = static_cast(slot.color_matrix.matrix_coeff22.Value()); - const auto r2c3 = static_cast(slot.color_matrix.matrix_coeff23.Value()); - - const auto shift = static_cast(slot.color_matrix.matrix_r_shift.Value()); - const auto clamp_min = static_cast(slot.config.soft_clamp_low.Value()); - const auto clamp_max = static_cast(slot.config.soft_clamp_high.Value()); - - auto MatMul = [&](const Pixel& in_pixel) -> std::tuple { - auto r = static_cast(in_pixel.r); - auto g = static_cast(in_pixel.g); - auto b = static_cast(in_pixel.b); - - r = in_pixel.r * r0c0 + in_pixel.g * r0c1 + in_pixel.b * r0c2; - g = in_pixel.r * r1c0 + in_pixel.g * r1c1 + in_pixel.b * r1c2; - b = in_pixel.r * r2c0 + in_pixel.g * r2c1 + in_pixel.b * r2c2; - - r >>= shift; - g >>= shift; - b >>= shift; - - r += r0c3; - g += r1c3; - b += r2c3; - - r >>= 8; - g >>= 8; - b >>= 8; - - return {r, g, b, static_cast(in_pixel.a)}; - }; - - for (u32 y = source_top; y < source_bottom; y++) { - const auto src{y * in_surface_width + source_left}; - const auto dst{y * out_surface_width + rect_left}; - for (u32 x = source_left; x < source_right; x++) { - auto [r, g, b, a] = MatMul(slot_surface[src + x]); - - r = std::clamp(r, clamp_min, clamp_max); - g = std::clamp(g, clamp_min, clamp_max); - b = std::clamp(b, clamp_min, clamp_max); - a = std::clamp(a, clamp_min, clamp_max); - - output_surface[dst + x] = {static_cast(r), static_cast(g), - static_cast(b), static_cast(a)}; - } - } - }; - -#if defined(ARCHITECTURE_x86_64) - if (!has_sse41) { - DecodeLinear(); - return; - } -#endif - -#if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64) - // Fill the columns, e.g - // c0 = [00 00 00 00] [r2c0 r2c0 r2c0 r2c0] [r1c0 r1c0 r1c0 r1c0] [r0c0 r0c0 r0c0 r0c0] - - const auto c0 = _mm_set_epi32(0, static_cast(slot.color_matrix.matrix_coeff20.Value()), - static_cast(slot.color_matrix.matrix_coeff10.Value()), - static_cast(slot.color_matrix.matrix_coeff00.Value())); - const auto c1 = _mm_set_epi32(0, static_cast(slot.color_matrix.matrix_coeff21.Value()), - static_cast(slot.color_matrix.matrix_coeff11.Value()), - static_cast(slot.color_matrix.matrix_coeff01.Value())); - const auto c2 = _mm_set_epi32(0, static_cast(slot.color_matrix.matrix_coeff22.Value()), - static_cast(slot.color_matrix.matrix_coeff12.Value()), - static_cast(slot.color_matrix.matrix_coeff02.Value())); - const auto c3 = _mm_set_epi32(0, static_cast(slot.color_matrix.matrix_coeff23.Value()), - static_cast(slot.color_matrix.matrix_coeff13.Value()), - static_cast(slot.color_matrix.matrix_coeff03.Value())); - - // Set the matrix right-shift as a single element. - const auto shift = - _mm_set_epi32(0, 0, 0, static_cast(slot.color_matrix.matrix_r_shift.Value())); - - // Set every 16-bit value to the soft clamp values for clamping every 16-bit channel. 
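// Aside: the per-channel arithmetic that both this removed SSE path and the
// surviving scalar MatMul below evaluate, condensed into one sketch. Per the
// removed comments, the constant column is added after the TRM right-shift
// (it ignores r_shift), and the final >> 8 converts the S12.8 fixed-point
// result back to integers before the soft clamp.
static int ConvertChannel(int r, int g, int b, int c0, int c1, int c2, int c3,
                          int r_shift, int lo, int hi) {
    int v = r * c0 + g * c1 + b * c2;        // one row of the 3x4 colour matrix
    v >>= r_shift;                           // TRM-specified right shift
    v += c3;                                 // constant column, added unshifted
    v >>= 8;                                 // S12.8 -> integer
    return v < lo ? lo : (v > hi ? hi : v);  // soft clamp to [lo, hi]
}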
- const auto clamp_min = _mm_set1_epi16(static_cast(slot.config.soft_clamp_low.Value())); - const auto clamp_max = - _mm_set1_epi16(static_cast(slot.config.soft_clamp_high.Value())); - - // clang-format off - - auto MatMul = [](__m128i& p, const __m128i& col0, const __m128i& col1, const __m128i& col2, - const __m128i& col3, const __m128i& trm_shift) -> __m128i { - // Duplicate the 32-bit channels, e.g - // p = [AA AA AA AA] [BB BB BB BB] [GG GG GG GG] [RR RR RR RR] - // -> - // r = [RR4 RR4 RR4 RR4] [RR3 RR3 RR3 RR3] [RR2 RR2 RR2 RR2] [RR1 RR1 RR1 RR1] - auto r = _mm_shuffle_epi32(p, 0x0); - auto g = _mm_shuffle_epi32(p, 0x55); - auto b = _mm_shuffle_epi32(p, 0xAA); - - // Multiply the rows and columns c0 * r, c1 * g, c2 * b, e.g - // r = [RR4 RR4 RR4 RR4] [ RR3 RR3 RR3 RR3] [ RR2 RR2 RR2 RR2] [ RR1 RR1 RR1 RR1] - // * - // c0 = [ 00 00 00 00] [r2c0 r2c0 r2c0 r2c0] [r1c0 r1c0 r1c0 r1c0] [r0c0 r0c0 r0c0 r0c0] - r = _mm_mullo_epi32(r, col0); - g = _mm_mullo_epi32(g, col1); - b = _mm_mullo_epi32(b, col2); - - // Add them all together vertically, such that the 32-bit element - // out[0] = (r[0] * c0[0]) + (g[0] * c1[0]) + (b[0] * c2[0]) - auto out = _mm_add_epi32(_mm_add_epi32(r, g), b); - - // Shift the result by r_shift, as the TRM says - out = _mm_sra_epi32(out, trm_shift); - - // Add the final column. Because the 4x1 matrix has this row as 1, there's no need to - // multiply by it, and as per the TRM this column ignores r_shift, so it's just added - // here after shifting. - out = _mm_add_epi32(out, col3); - - // Shift the result back from S12.8 to integer values - return _mm_srai_epi32(out, 8); + const auto r0c0 = s32(slot.color_matrix.matrix_coeff00.Value()); + const auto r0c1 = s32(slot.color_matrix.matrix_coeff01.Value()); + const auto r0c2 = s32(slot.color_matrix.matrix_coeff02.Value()); + const auto r0c3 = s32(slot.color_matrix.matrix_coeff03.Value()); + const auto r1c0 = s32(slot.color_matrix.matrix_coeff10.Value()); + const auto r1c1 = s32(slot.color_matrix.matrix_coeff11.Value()); + const auto r1c2 = s32(slot.color_matrix.matrix_coeff12.Value()); + const auto r1c3 = s32(slot.color_matrix.matrix_coeff13.Value()); + const auto r2c0 = s32(slot.color_matrix.matrix_coeff20.Value()); + const auto r2c1 = s32(slot.color_matrix.matrix_coeff21.Value()); + const auto r2c2 = s32(slot.color_matrix.matrix_coeff22.Value()); + const auto r2c3 = s32(slot.color_matrix.matrix_coeff23.Value()); + + const auto shift = s32(slot.color_matrix.matrix_r_shift.Value()); + const auto clamp_min = s32(slot.config.soft_clamp_low.Value()); + const auto clamp_max = s32(slot.config.soft_clamp_high.Value()); + + auto MatMul = [&](const Pixel& in_pixel) -> std::tuple { + auto r = s32(in_pixel.r); + auto g = s32(in_pixel.g); + auto b = s32(in_pixel.b); + + r = in_pixel.r * r0c0 + in_pixel.g * r0c1 + in_pixel.b * r0c2; + g = in_pixel.r * r1c0 + in_pixel.g * r1c1 + in_pixel.b * r1c2; + b = in_pixel.r * r2c0 + in_pixel.g * r2c1 + in_pixel.b * r2c2; + + r >>= shift; + g >>= shift; + b >>= shift; + + r += r0c3; + g += r1c3; + b += r2c3; + + r >>= 8; + g >>= 8; + b >>= 8; + + return {r, g, b, s32(in_pixel.a)}; }; for (u32 y = source_top; y < source_bottom; y++) { const auto src{y * in_surface_width + source_left}; const auto dst{y * out_surface_width + rect_left}; - for (u32 x = source_left; x < source_right; x += 8) { - // clang-format off - // Prefetch the next iteration's memory - _mm_prefetch((const char*)&slot_surface[src + x + 8], _MM_HINT_T0); - - // Load in pixels - // p01 = [AA AA] [BB BB] [GG GG] [RR RR] [AA AA] 
[BB BB] [GG GG] [RR RR] - auto p01 = _mm_load_si128((__m128i*)&slot_surface[src + x + 0]); - auto p23 = _mm_load_si128((__m128i*)&slot_surface[src + x + 2]); - auto p45 = _mm_load_si128((__m128i*)&slot_surface[src + x + 4]); - auto p67 = _mm_load_si128((__m128i*)&slot_surface[src + x + 6]); - - // Convert the 16-bit channels into 32-bit (unsigned), as the matrix values are - // 32-bit and to avoid overflow. - // p01 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1] - // -> - // p01_lo = [001 001 AA1 AA1] [001 001 BB1 BB1] [001 001 GG1 GG1] [001 001 RR1 RR1] - // p01_hi = [002 002 AA2 AA2] [002 002 BB2 BB2] [002 002 GG2 GG2] [002 002 RR2 RR2] - auto p01_lo = _mm_cvtepu16_epi32(p01); - auto p01_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p01, 8)); - auto p23_lo = _mm_cvtepu16_epi32(p23); - auto p23_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p23, 8)); - auto p45_lo = _mm_cvtepu16_epi32(p45); - auto p45_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p45, 8)); - auto p67_lo = _mm_cvtepu16_epi32(p67); - auto p67_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p67, 8)); - - // Matrix multiply the pixel, doing the colour conversion. - auto out0 = MatMul(p01_lo, c0, c1, c2, c3, shift); - auto out1 = MatMul(p01_hi, c0, c1, c2, c3, shift); - auto out2 = MatMul(p23_lo, c0, c1, c2, c3, shift); - auto out3 = MatMul(p23_hi, c0, c1, c2, c3, shift); - auto out4 = MatMul(p45_lo, c0, c1, c2, c3, shift); - auto out5 = MatMul(p45_hi, c0, c1, c2, c3, shift); - auto out6 = MatMul(p67_lo, c0, c1, c2, c3, shift); - auto out7 = MatMul(p67_hi, c0, c1, c2, c3, shift); - - // Pack the 32-bit channel pixels back into 16-bit using unsigned saturation - // out0 = [001 001 AA1 AA1] [001 001 BB1 BB1] [001 001 GG1 GG1] [001 001 RR1 RR1] - // out1 = [002 002 AA2 AA2] [002 002 BB2 BB2] [002 002 GG2 GG2] [002 002 RR2 RR2] - // -> - // done0 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1] - auto done0 = _mm_packus_epi32(out0, out1); - auto done1 = _mm_packus_epi32(out2, out3); - auto done2 = _mm_packus_epi32(out4, out5); - auto done3 = _mm_packus_epi32(out6, out7); - - // Blend the original alpha back into the pixel, as the matrix multiply gives us a - // 3-channel output, not 4. - // 0x88 = b10001000, taking RGB from the first argument, A from the second argument. - // done0 = [002 002] [BB2 BB2] [GG2 GG2] [RR2 RR2] [001 001] [BB1 BB1] [GG1 GG1] [RR1 RR1] - // -> - // done0 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1] - done0 = _mm_blend_epi16(done0, p01, 0x88); - done1 = _mm_blend_epi16(done1, p23, 0x88); - done2 = _mm_blend_epi16(done2, p45, 0x88); - done3 = _mm_blend_epi16(done3, p67, 0x88); - - // Clamp the 16-bit channels to the soft-clamp min/max. - done0 = _mm_max_epu16(done0, clamp_min); - done1 = _mm_max_epu16(done1, clamp_min); - done2 = _mm_max_epu16(done2, clamp_min); - done3 = _mm_max_epu16(done3, clamp_min); - - done0 = _mm_min_epu16(done0, clamp_max); - done1 = _mm_min_epu16(done1, clamp_max); - done2 = _mm_min_epu16(done2, clamp_max); - done3 = _mm_min_epu16(done3, clamp_max); - - // Store the pixels to the output surface. 
- _mm_store_si128((__m128i*)&output_surface[dst + x + 0], done0); - _mm_store_si128((__m128i*)&output_surface[dst + x + 2], done1); - _mm_store_si128((__m128i*)&output_surface[dst + x + 4], done2); - _mm_store_si128((__m128i*)&output_surface[dst + x + 6], done3); - + for (u32 x = source_left; x < source_right; x++) { + auto [r, g, b, a] = MatMul(slot_surface[src + x]); + r = std::clamp(r, clamp_min, clamp_max); + g = std::clamp(g, clamp_min, clamp_max); + b = std::clamp(b, clamp_min, clamp_max); + a = std::clamp(a, clamp_min, clamp_max); + output_surface[dst + x] = {u16(r), u16(g), u16(b), u16(a)}; } } - // clang-format on -#else - DecodeLinear(); -#endif } } @@ -826,7 +436,7 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) { surface_width = (std::min)(surface_width, out_luma_width); surface_height = (std::min)(surface_height, out_luma_height); - [[maybe_unused]] auto DecodeLinear = [&](std::span out_luma, std::span out_chroma) { + auto Decode = [&](std::span out_luma, std::span out_chroma) { for (u32 y = 0; y < surface_height; ++y) { const auto src_luma = y * surface_stride; const auto dst_luma = y * out_luma_stride; @@ -834,173 +444,20 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) { const auto dst_chroma = (y / 2) * out_chroma_stride; for (u32 x = 0; x < surface_width; x += 2) { out_luma[dst_luma + x + 0] = - static_cast(output_surface[src_luma + x + 0].r >> 2); + u8(output_surface[src_luma + x + 0].r >> 2); out_luma[dst_luma + x + 1] = - static_cast(output_surface[src_luma + x + 1].r >> 2); - out_chroma[dst_chroma + x + 0] = - static_cast(output_surface[src_chroma + x].g >> 2); - out_chroma[dst_chroma + x + 1] = - static_cast(output_surface[src_chroma + x].b >> 2); - } - } - }; - - auto Decode = [&](std::span out_luma, std::span out_chroma) { -#if defined(ARCHITECTURE_x86_64) - if (!has_sse41) { - DecodeLinear(out_luma, out_chroma); - return; - } -#endif - -#if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64) - // luma_mask = [00 00] [00 00] [00 00] [FF FF] [00 00] [00 00] [00 00] [FF FF] - const auto luma_mask = _mm_set_epi16(0, 0, 0, -1, 0, 0, 0, -1); - - const auto sse_aligned_width = Common::AlignDown(surface_width, 16); - - for (u32 y = 0; y < surface_height; ++y) { - const auto src = y * surface_stride; - const auto dst_luma = y * out_luma_stride; - const auto dst_chroma = (y / 2) * out_chroma_stride; - u32 x = 0; - for (; x < sse_aligned_width; x += 16) { - // clang-format off - // Prefetch the next cache lines, 2 per iteration - _mm_prefetch((const char*)&output_surface[src + x + 16], _MM_HINT_T0); - _mm_prefetch((const char*)&output_surface[src + x + 24], _MM_HINT_T0); - - // Load the 64-bit pixels, 2 per variable. - auto pixel01 = _mm_load_si128((__m128i*)&output_surface[src + x + 0]); - auto pixel23 = _mm_load_si128((__m128i*)&output_surface[src + x + 2]); - auto pixel45 = _mm_load_si128((__m128i*)&output_surface[src + x + 4]); - auto pixel67 = _mm_load_si128((__m128i*)&output_surface[src + x + 6]); - auto pixel89 = _mm_load_si128((__m128i*)&output_surface[src + x + 8]); - auto pixel1011 = _mm_load_si128((__m128i*)&output_surface[src + x + 10]); - auto pixel1213 = _mm_load_si128((__m128i*)&output_surface[src + x + 12]); - auto pixel1415 = _mm_load_si128((__m128i*)&output_surface[src + x + 14]); - - // Split out the luma of each pixel using the luma_mask above. 
- // pixel01 = [AA2 AA2] [VV2 VV2] [UU2 UU2] [LL2 LL2] [AA1 AA1] [VV1 VV1] [UU1 UU1] [LL1 LL1] - // -> - // l01 = [002 002] [002 002] [002 002] [LL2 LL2] [001 001] [001 001] [001 001] [LL1 LL1] - auto l01 = _mm_and_si128(pixel01, luma_mask); - auto l23 = _mm_and_si128(pixel23, luma_mask); - auto l45 = _mm_and_si128(pixel45, luma_mask); - auto l67 = _mm_and_si128(pixel67, luma_mask); - auto l89 = _mm_and_si128(pixel89, luma_mask); - auto l1011 = _mm_and_si128(pixel1011, luma_mask); - auto l1213 = _mm_and_si128(pixel1213, luma_mask); - auto l1415 = _mm_and_si128(pixel1415, luma_mask); - - // Pack 32-bit elements from 2 registers down into 16-bit elements in 1 register. - // l01 = [002 002 002 002] [002 002 LL2 LL2] [001 001 001 001] [001 001 LL1 LL1] - // l23 = [004 004 004 004] [004 004 LL4 LL4] [003 003 003 003] [003 003 LL3 LL3] - // -> - // l0123 = [004 004] [LL4 LL4] [003 003] [LL3 LL3] [002 002] [LL2 LL2] [001 001] [LL1 LL1] - auto l0123 = _mm_packus_epi32(l01, l23); - auto l4567 = _mm_packus_epi32(l45, l67); - auto l891011 = _mm_packus_epi32(l89, l1011); - auto l12131415 = _mm_packus_epi32(l1213, l1415); - - // Pack 32-bit elements from 2 registers down into 16-bit elements in 1 register. - // l0123 = [004 004 LL4 LL4] [003 003 LL3 LL3] [002 002 LL2 LL2] [001 001 LL1 LL1] - // l4567 = [008 008 LL8 LL8] [007 007 LL7 LL7] [006 006 LL6 LL6] [005 005 LL5 LL5] - // -> - // luma_lo = [LL8 LL8] [LL7 LL7] [LL6 LL6] [LL5 LL5] [LL4 LL4] [LL3 LL3] [LL2 LL2] [LL1 LL1] - auto luma_lo = _mm_packus_epi32(l0123, l4567); - auto luma_hi = _mm_packus_epi32(l891011, l12131415); - - // Right-shift the 16-bit elements by 2, un-doing the left shift by 2 on read - // and bringing the range back to 8-bit. - luma_lo = _mm_srli_epi16(luma_lo, 2); - luma_hi = _mm_srli_epi16(luma_hi, 2); - - // Pack with unsigned saturation the 16-bit values in 2 registers into 8-bit values in 1 register. - // luma_lo = [LL8 LL8] [LL7 LL7] [LL6 LL6] [LL5 LL5] [LL4 LL4] [LL3 LL3] [LL2 LL2] [LL1 LL1] - // luma_hi = [LL16 LL16] [LL15 LL15] [LL14 LL14] [LL13 LL13] [LL12 LL12] [LL11 LL11] [LL10 LL10] [LL9 LL9] - // -> - // luma = [LL16] [LL15] [LL14] [LL13] [LL12] [LL11] [LL10] [LL9] [LL8] [LL7] [LL6] [LL5] [LL4] [LL3] [LL2] [LL1] - auto luma = _mm_packus_epi16(luma_lo, luma_hi); - - // Store the 16 bytes of luma - _mm_store_si128((__m128i*)&out_luma[dst_luma + x], luma); - - if (y % 2 == 0) { - // Chroma, done every other line as it's half the height of luma. - - // Shift the register right by 2 bytes (not bits), to kick out the 16-bit luma. - // We can do this instead of &'ing a mask and then shifting. - // pixel01 = [AA2 AA2] [VV2 VV2] [UU2 UU2] [LL2 LL2] [AA1 AA1] [VV1 VV1] [UU1 UU1] [LL1 LL1] - // -> - // c01 = [ 00 00] [AA2 AA2] [VV2 VV2] [UU2 UU2] [LL2 LL2] [AA1 AA1] [VV1 VV1] [UU1 UU1] - auto c01 = _mm_srli_si128(pixel01, 2); - auto c23 = _mm_srli_si128(pixel23, 2); - auto c45 = _mm_srli_si128(pixel45, 2); - auto c67 = _mm_srli_si128(pixel67, 2); - auto c89 = _mm_srli_si128(pixel89, 2); - auto c1011 = _mm_srli_si128(pixel1011, 2); - auto c1213 = _mm_srli_si128(pixel1213, 2); - auto c1415 = _mm_srli_si128(pixel1415, 2); - - // Interleave the lower 8 bytes as 32-bit elements from 2 registers into 1 register. - // This has the effect of skipping every other chroma value horitonally, - // notice the high pixels UU2/UU4 are skipped. - // This is intended as N420 chroma width is half the luma width. 
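// Aside: the scalar write-back the patch keeps, as a standalone sketch (names
// are illustrative). The 10-bit working pixels are narrowed back to 8 bits
// with >> 2; luma is emitted for every pixel, while the interleaved N420
// chroma plane is half resolution in both axes, so it takes U/V from every
// other source pixel and only needs refreshing on every other line.
#include <cstdint>
struct Pixel16 { std::uint16_t r, g, b, a; };  // Y -> r, U -> g, V -> b
void WriteRowN420(const Pixel16* src, std::uint8_t* out_luma,
                  std::uint8_t* out_chroma, int width, int y) {
    for (int x = 0; x < width; x += 2) {
        out_luma[x + 0] = std::uint8_t(src[x + 0].r >> 2);    // 10-bit -> 8-bit
        out_luma[x + 1] = std::uint8_t(src[x + 1].r >> 2);
        if (y % 2 == 0) {                 // chroma rows are half the luma rows
            out_chroma[x + 0] = std::uint8_t(src[x].g >> 2);  // U
            out_chroma[x + 1] = std::uint8_t(src[x].b >> 2);  // V
        }
    }
}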
- // c01 = [ 00 00 AA2 AA2] [VV2 VV2 UU2 UU2] [LL2 LL2 AA1 AA1] [VV1 VV1 UU1 UU1] - // c23 = [ 00 00 AA4 AA4] [VV4 VV4 UU4 UU4] [LL4 LL4 AA3 AA3] [VV3 VV3 UU3 UU3] - // -> - // c0123 = [LL4 LL4 AA3 AA3] [LL2 LL2 AA1 AA1] [VV3 VV3 UU3 UU3] [VV1 VV1 UU1 UU1] - auto c0123 = _mm_unpacklo_epi32(c01, c23); - auto c4567 = _mm_unpacklo_epi32(c45, c67); - auto c891011 = _mm_unpacklo_epi32(c89, c1011); - auto c12131415 = _mm_unpacklo_epi32(c1213, c1415); - - // Interleave the low 64-bit elements from 2 registers into 1. - // c0123 = [LL4 LL4 AA3 AA3 LL2 LL2 AA1 AA1] [VV3 VV3 UU3 UU3 VV1 VV1 UU1 UU1] - // c4567 = [LL8 LL8 AA7 AA7 LL6 LL6 AA5 AA5] [VV7 VV7 UU7 UU7 VV5 VV5 UU5 UU5] - // -> - // chroma_lo = [VV7 VV7 UU7 UU7 VV5 VV5 UU5 UU5] [VV3 VV3 UU3 UU3 VV1 VV1 UU1 UU1] - auto chroma_lo = _mm_unpacklo_epi64(c0123, c4567); - auto chroma_hi = _mm_unpacklo_epi64(c891011, c12131415); - - // Right-shift the 16-bit elements by 2, un-doing the left shift by 2 on read - // and bringing the range back to 8-bit. - chroma_lo = _mm_srli_epi16(chroma_lo, 2); - chroma_hi = _mm_srli_epi16(chroma_hi, 2); - - // Pack with unsigned saturation the 16-bit elements from 2 registers into 8-bit elements in 1 register. - // chroma_lo = [ VV7 VV7] [ UU7 UU7] [ VV5 VV5] [ UU5 UU5] [ VV3 VV3] [ UU3 UU3] [VV1 VV1] [UU1 UU1] - // chroma_hi = [VV15 VV15] [UU15 UU15] [VV13 VV13] [UU13 UU13] [VV11 VV11] [UU11 UU11] [VV9 VV9] [UU9 UU9] - // -> - // chroma = [VV15] [UU15] [VV13] [UU13] [VV11] [UU11] [VV9] [UU9] [VV7] [UU7] [VV5] [UU5] [VV3] [UU3] [VV1] [UU1] - auto chroma = _mm_packus_epi16(chroma_lo, chroma_hi); - - // Store the 16 bytes of chroma. - _mm_store_si128((__m128i*)&out_chroma[dst_chroma + x + 0], chroma); - } - - // clang-format on - } - - const auto src_chroma = y * surface_stride; - for (; x < surface_width; x += 2) { - out_luma[dst_luma + x + 0] = static_cast(output_surface[src + x + 0].r >> 2); - out_luma[dst_luma + x + 1] = static_cast(output_surface[src + x + 1].r >> 2); + u8(output_surface[src_luma + x + 1].r >> 2); out_chroma[dst_chroma + x + 0] = - static_cast(output_surface[src_chroma + x].g >> 2); + u8(output_surface[src_chroma + x].g >> 2); out_chroma[dst_chroma + x + 1] = - static_cast(output_surface[src_chroma + x].b >> 2); + u8(output_surface[src_chroma + x].b >> 2); } } -#else - DecodeLinear(out_luma, out_chroma); -#endif }; switch (output_surface_config.out_block_kind) { case BLK_KIND::GENERIC_16Bx2: { - const u32 block_height = static_cast(output_surface_config.out_block_height); + const u32 block_height = u32(output_surface_config.out_block_height); const auto out_luma_swizzle_size = Texture::CalculateSize( true, BytesPerPixel, out_luma_width, out_luma_height, 1, block_height, 0); const auto out_chroma_swizzle_size = Texture::CalculateSize( @@ -1028,23 +485,18 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) { &swizzle_scratch); if (block_height == 1) { - SwizzleSurface(out_luma, out_luma_stride, luma_scratch, out_luma_stride, - out_luma_height); + SwizzleSurface(out_luma, out_luma_stride, luma_scratch, out_luma_stride, out_luma_height); } else { - Texture::SwizzleTexture(out_luma, luma_scratch, BytesPerPixel, out_luma_width, - out_luma_height, 1, block_height, 0, 1); + Texture::SwizzleTexture(out_luma, luma_scratch, BytesPerPixel, out_luma_width, out_luma_height, 1, block_height, 0, 1); } Tegra::Memory::GpuGuestMemoryScoped - out_chroma(memory_manager, regs.output_surface.chroma_u.Address(), - out_chroma_swizzle_size, &swizzle_scratch); + out_chroma(memory_manager, 
regs.output_surface.chroma_u.Address(), out_chroma_swizzle_size, &swizzle_scratch); if (block_height == 1) { - SwizzleSurface(out_chroma, out_chroma_stride, chroma_scratch, out_chroma_stride, - out_chroma_height); + SwizzleSurface(out_chroma, out_chroma_stride, chroma_scratch, out_chroma_stride, out_chroma_height); } else { - Texture::SwizzleTexture(out_chroma, chroma_scratch, BytesPerPixel, out_chroma_width, - out_chroma_height, 1, block_height, 0, 1); + Texture::SwizzleTexture(out_chroma, chroma_scratch, BytesPerPixel, out_chroma_width, out_chroma_height, 1, block_height, 0, 1); } } break; case BLK_KIND::PITCH: { @@ -1067,13 +519,9 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) { // afterwards to re-overwrite the luma being too large. luma_scratch.resize_destructive(out_luma_size); chroma_scratch.resize_destructive(out_chroma_size); - Decode(luma_scratch, chroma_scratch); - - memory_manager.WriteBlock(regs.output_surface.luma.Address(), luma_scratch.data(), - out_luma_size); - memory_manager.WriteBlock(regs.output_surface.chroma_u.Address(), chroma_scratch.data(), - out_chroma_size); + memory_manager.WriteBlock(regs.output_surface.luma.Address(), luma_scratch.data(), out_luma_size); + memory_manager.WriteBlock(regs.output_surface.chroma_u.Address(), chroma_scratch.data(), out_chroma_size); } break; default: UNREACHABLE(); @@ -1081,8 +529,7 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) { } } -template -void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config) { +void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config, VideoPixelFormat format) { constexpr u32 BytesPerPixel = 4; auto surface_width{output_surface_config.out_surface_width + 1}; @@ -1097,125 +544,29 @@ void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config) { surface_width = (std::min)(surface_width, out_luma_width); surface_height = (std::min)(surface_height, out_luma_height); - [[maybe_unused]] auto DecodeLinear = [&](std::span out_buffer) { - for (u32 y = 0; y < surface_height; y++) { - const auto src = y * surface_stride; - const auto dst = y * out_luma_stride; - for (u32 x = 0; x < surface_width; x++) { - if constexpr (Format == VideoPixelFormat::A8R8G8B8) { - out_buffer[dst + x * 4 + 0] = static_cast(output_surface[src + x].b >> 2); - out_buffer[dst + x * 4 + 1] = static_cast(output_surface[src + x].g >> 2); - out_buffer[dst + x * 4 + 2] = static_cast(output_surface[src + x].r >> 2); - out_buffer[dst + x * 4 + 3] = static_cast(output_surface[src + x].a >> 2); - } else { - out_buffer[dst + x * 4 + 0] = static_cast(output_surface[src + x].r >> 2); - out_buffer[dst + x * 4 + 1] = static_cast(output_surface[src + x].g >> 2); - out_buffer[dst + x * 4 + 2] = static_cast(output_surface[src + x].b >> 2); - out_buffer[dst + x * 4 + 3] = static_cast(output_surface[src + x].a >> 2); - } - } - } - }; - auto Decode = [&](std::span out_buffer) { -#if defined(ARCHITECTURE_x86_64) - if (!has_sse41) { - DecodeLinear(out_buffer); - return; - } -#endif - -#if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64) - constexpr size_t SseAlignment = 16; - const auto sse_aligned_width = Common::AlignDown(surface_width, SseAlignment); - for (u32 y = 0; y < surface_height; y++) { const auto src = y * surface_stride; const auto dst = y * out_luma_stride; - u32 x = 0; - for (; x < sse_aligned_width; x += SseAlignment) { - // clang-format off - // Prefetch the next 2 cache lines - _mm_prefetch((const char*)&output_surface[src + x + 
16], _MM_HINT_T0); - _mm_prefetch((const char*)&output_surface[src + x + 24], _MM_HINT_T0); - - // Load the pixels, 16-bit channels, 8 bytes per pixel, e.g - // pixel01 = [AA AA BB BB GG GG RR RR AA AA BB BB GG GG RR RR - auto pixel01 = _mm_load_si128((__m128i*)&output_surface[src + x + 0]); - auto pixel23 = _mm_load_si128((__m128i*)&output_surface[src + x + 2]); - auto pixel45 = _mm_load_si128((__m128i*)&output_surface[src + x + 4]); - auto pixel67 = _mm_load_si128((__m128i*)&output_surface[src + x + 6]); - auto pixel89 = _mm_load_si128((__m128i*)&output_surface[src + x + 8]); - auto pixel1011 = _mm_load_si128((__m128i*)&output_surface[src + x + 10]); - auto pixel1213 = _mm_load_si128((__m128i*)&output_surface[src + x + 12]); - auto pixel1415 = _mm_load_si128((__m128i*)&output_surface[src + x + 14]); - - // Right-shift the channels by 16 to un-do the left shit on read and bring the range - // back to 8-bit. - pixel01 = _mm_srli_epi16(pixel01, 2); - pixel23 = _mm_srli_epi16(pixel23, 2); - pixel45 = _mm_srli_epi16(pixel45, 2); - pixel67 = _mm_srli_epi16(pixel67, 2); - pixel89 = _mm_srli_epi16(pixel89, 2); - pixel1011 = _mm_srli_epi16(pixel1011, 2); - pixel1213 = _mm_srli_epi16(pixel1213, 2); - pixel1415 = _mm_srli_epi16(pixel1415, 2); - - // Pack with unsigned saturation 16-bit channels from 2 registers into 8-bit channels in 1 register. - // pixel01 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1] - // pixel23 = [AA4 AA4] [BB4 BB4] [GG4 GG4] [RR4 RR4] [AA3 AA3] [BB3 BB3] [GG3 GG3] [RR3 RR3] - // -> - // pixels0_lo = [AA4] [BB4] [GG4] [RR4] [AA3] [BB3] [GG3] [RR3] [AA2] [BB2] [GG2] [RR2] [AA1] [BB1] [GG1] [RR1] - auto pixels0_lo = _mm_packus_epi16(pixel01, pixel23); - auto pixels0_hi = _mm_packus_epi16(pixel45, pixel67); - auto pixels1_lo = _mm_packus_epi16(pixel89, pixel1011); - auto pixels1_hi = _mm_packus_epi16(pixel1213, pixel1415); - - if constexpr (Format == VideoPixelFormat::A8R8G8B8) { - const auto shuffle = - _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2); - - // Our pixels are ABGR (big-endian) by default, if ARGB is needed, we need to shuffle. 
- // pixels0_lo = [AA4 BB4 GG4 RR4] [AA3 BB3 GG3 RR3] [AA2 BB2 GG2 RR2] [AA1 BB1 GG1 RR1] - // -> - // pixels0_lo = [AA4 RR4 GG4 BB4] [AA3 RR3 GG3 BB3] [AA2 RR2 GG2 BB2] [AA1 RR1 GG1 BB1] - pixels0_lo = _mm_shuffle_epi8(pixels0_lo, shuffle); - pixels0_hi = _mm_shuffle_epi8(pixels0_hi, shuffle); - pixels1_lo = _mm_shuffle_epi8(pixels1_lo, shuffle); - pixels1_hi = _mm_shuffle_epi8(pixels1_hi, shuffle); - } - - // Store the pixels - _mm_store_si128((__m128i*)&out_buffer[dst + x * 4 + 0], pixels0_lo); - _mm_store_si128((__m128i*)&out_buffer[dst + x * 4 + 16], pixels0_hi); - _mm_store_si128((__m128i*)&out_buffer[dst + x * 4 + 32], pixels1_lo); - _mm_store_si128((__m128i*)&out_buffer[dst + x * 4 + 48], pixels1_hi); - - // clang-format on - } - - for (; x < surface_width; x++) { - if constexpr (Format == VideoPixelFormat::A8R8G8B8) { - out_buffer[dst + x * 4 + 0] = static_cast(output_surface[src + x].b >> 2); - out_buffer[dst + x * 4 + 1] = static_cast(output_surface[src + x].g >> 2); - out_buffer[dst + x * 4 + 2] = static_cast(output_surface[src + x].r >> 2); - out_buffer[dst + x * 4 + 3] = static_cast(output_surface[src + x].a >> 2); + for (u32 x = 0; x < surface_width; x++) { + if(format == VideoPixelFormat::A8R8G8B8) { + out_buffer[dst + x * 4 + 0] = u8(output_surface[src + x].b >> 2); + out_buffer[dst + x * 4 + 1] = u8(output_surface[src + x].g >> 2); + out_buffer[dst + x * 4 + 2] = u8(output_surface[src + x].r >> 2); + out_buffer[dst + x * 4 + 3] = u8(output_surface[src + x].a >> 2); } else { - out_buffer[dst + x * 4 + 0] = static_cast(output_surface[src + x].r >> 2); - out_buffer[dst + x * 4 + 1] = static_cast(output_surface[src + x].g >> 2); - out_buffer[dst + x * 4 + 2] = static_cast(output_surface[src + x].b >> 2); - out_buffer[dst + x * 4 + 3] = static_cast(output_surface[src + x].a >> 2); + out_buffer[dst + x * 4 + 0] = u8(output_surface[src + x].r >> 2); + out_buffer[dst + x * 4 + 1] = u8(output_surface[src + x].g >> 2); + out_buffer[dst + x * 4 + 2] = u8(output_surface[src + x].b >> 2); + out_buffer[dst + x * 4 + 3] = u8(output_surface[src + x].a >> 2); } } } -#else - DecodeLinear(out_buffer); -#endif }; switch (output_surface_config.out_block_kind) { case BLK_KIND::GENERIC_16Bx2: { - const u32 block_height = static_cast(output_surface_config.out_block_height); + const u32 block_height = u32(output_surface_config.out_block_height); const auto out_swizzle_size = Texture::CalculateSize(true, BytesPerPixel, out_luma_width, out_luma_height, 1, block_height, 0); @@ -1236,22 +587,20 @@ void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config) { memory_manager, regs.output_surface.luma.Address(), out_swizzle_size, &swizzle_scratch); if (block_height == 1) { - SwizzleSurface(out_luma, out_luma_stride, luma_scratch, out_luma_stride, - out_luma_height); + SwizzleSurface(out_luma, out_luma_stride, luma_scratch, out_luma_stride, out_luma_height); } else { - Texture::SwizzleTexture(out_luma, luma_scratch, BytesPerPixel, out_luma_width, - out_luma_height, 1, block_height, 0, 1); + Texture::SwizzleTexture(out_luma, luma_scratch, BytesPerPixel, out_luma_width, out_luma_height, 1, block_height, 0, 1); } } break; case BLK_KIND::PITCH: { LOG_TRACE(HW_GPU, - "Writing ABGR pitch frame\n" - "\tinput surface {}x{} stride {} size {:#X}" - "\toutput surface {}x{} stride {} size {:#X}", - surface_width, surface_height, surface_stride, - surface_stride * surface_height * BytesPerPixel, out_luma_width, out_luma_height, - out_luma_stride, out_luma_size); + "Writing ABGR pitch frame\n" + "\tinput 
surface {}x{} stride {} size {:#X}" + "\toutput surface {}x{} stride {} size {:#X}", + surface_width, surface_height, surface_stride, + surface_stride * surface_height * BytesPerPixel, out_luma_width, out_luma_height, + out_luma_stride, out_luma_size); luma_scratch.resize_destructive(out_luma_size); diff --git a/src/video_core/host1x/vic.h b/src/video_core/host1x/vic.h index e7600941ad..5ea13285a4 100644 --- a/src/video_core/host1x/vic.h +++ b/src/video_core/host1x/vic.h @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + // SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later @@ -610,24 +613,12 @@ public: private: void Execute(); - void Blend(const ConfigStruct& config, const SlotStruct& slot); - - template - void ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span offsets, - std::shared_ptr frame); - template - void ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span offsets, - std::shared_ptr frame); - - template - void ReadY8__V8U8_N420(const SlotStruct& slot, std::span offsets, - std::shared_ptr frame); - + void ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span offsets, std::shared_ptr frame, bool planar, bool interlaced); + void ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span offsets, std::shared_ptr frame, bool planar, bool top_field); + void ReadY8__V8U8_N420(const SlotStruct& slot, std::span offsets, std::shared_ptr frame, bool planar); void WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config); - - template - void WriteABGR(const OutputSurfaceConfig& output_surface_config); + void WriteABGR(const OutputSurfaceConfig& output_surface_config, VideoPixelFormat format); s32 id; s32 nvdec_id{-1}; @@ -636,8 +627,6 @@ private: VicRegisters regs{}; FrameQueue& frame_queue; - const bool has_sse41{false}; - Common::ScratchBuffer output_surface; Common::ScratchBuffer slot_surface; Common::ScratchBuffer luma_scratch;