From 5c6aaa7eb144f0350e53f9ef8fac820ab595d1c7 Mon Sep 17 00:00:00 2001
From: lizzie
Date: Wed, 29 Oct 2025 03:13:24 +0100
Subject: [PATCH] [video_core/vic] remove handrolled vector implementation that
 also uses software prefetching(!!!!); don't try to outsmart compiler - also
 remove template spam (#2856)

Also removes sse2neon :)

Software prefetching SUCKS and it's evil; don't do it.

Signed-off-by: lizzie
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/2856
Reviewed-by: crueter
Reviewed-by: MaranBr
Reviewed-by: Caio Oliveira
Co-authored-by: lizzie
Co-committed-by: lizzie
---
 .ci/license-header.sh                         |   2 +-
 ...-support-for-clang-cl-on-Windows-633.patch | 129 ---
 CMakeLists.txt                                |   3 -
 externals/CMakeLists.txt                      |   7 -
 externals/cpmfile.json                        |   9 -
 src/video_core/CMakeLists.txt                 |   4 -
 src/video_core/host1x/vic.cpp                 | 983 +++---------------
 src/video_core/host1x/vic.h                   |  25 +-
 8 files changed, 174 insertions(+), 988 deletions(-)
 delete mode 100644 .patch/sse2neon/0001-Add-support-for-clang-cl-on-Windows-633.patch

diff --git a/.ci/license-header.sh b/.ci/license-header.sh
index 874f29aa45..f438d59dac 100755
--- a/.ci/license-header.sh
+++ b/.ci/license-header.sh
@@ -4,7 +4,7 @@
 # SPDX-License-Identifier: GPL-3.0-or-later
 
 # specify full path if dupes may exist
-EXCLUDE_FILES="CPM.cmake CPMUtil.cmake GetSCMRev.cmake sse2neon.h renderdoc_app.h tools/cpm tools/shellcheck.sh tools/update-cpm.sh externals/stb externals/glad externals/getopt externals/gamemode externals/FidelityFX-FSR externals/demangle externals/bc_decoder"
+EXCLUDE_FILES="CPM.cmake CPMUtil.cmake GetSCMRev.cmake renderdoc_app.h tools/cpm tools/shellcheck.sh tools/update-cpm.sh externals/stb externals/glad externals/getopt externals/gamemode externals/FidelityFX-FSR externals/demangle externals/bc_decoder"
 
 # license header constants, please change when needed :))))
 YEAR=2025

diff --git a/.patch/sse2neon/0001-Add-support-for-clang-cl-on-Windows-633.patch b/.patch/sse2neon/0001-Add-support-for-clang-cl-on-Windows-633.patch
deleted file mode 100644
index cf86707355..0000000000
--- a/.patch/sse2neon/0001-Add-support-for-clang-cl-on-Windows-633.patch
+++ /dev/null
@@ -1,129 +0,0 @@
-From d765ebed3598ddfd7167fc546474626ac5ef9498 Mon Sep 17 00:00:00 2001
-From: Anthony Roberts
-Date: Fri, 2 Aug 2024 16:55:57 +0100
-Subject: [PATCH] Add support for clang-cl on Windows (#633)
-
-This commit adds support for clang-cl (clang, pretending to be MSVC) to
-SSE2NEON on Windows ARM64 platforms. This change is part of some Blender
-work, as using clang-cl provides a ~20-40% speedup compared to MSVC.
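A minimal sketch of the guard idiom this vendored patch applied, for context:
clang-cl defines both _MSC_VER and __clang__, so every MSVC-only intrinsic
path in the hunks below is narrowed from "MSVC" to "MSVC and not clang",
leaving clang-cl on the GNU-style inline-asm branch. The else-branch here is
abbreviated from the real header:

    #if defined(_MSC_VER) && !defined(__clang__)
    #define SSE2NEON_BARRIER() _ReadWriteBarrier()  /* MSVC compiler barrier */
    #else
    #define SSE2NEON_BARRIER()                     \
        do {                                       \
            __asm__ __volatile__("" ::: "memory"); \
        } while (0)
    #endif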
- -Compiled with the following command line (via a VS2022 Native ARM64 Tools -CMD window): - msbuild sse2neon.vcxproj /p:Configuration=Release /p:CLToolExe=clang-cl.exe - /p:CLToolPath="C:\Program Files\LLVM\bin\" - -Known failures in test suite: - Test mm_cvttpd_epi32 - Test rdtsc - -Co-authored-by: Anthony Roberts ---- - sse2neon.h | 22 +++++++++++----------- - 1 file changed, 11 insertions(+), 11 deletions(-) - -diff --git a/sse2neon.h b/sse2neon.h -index 56254b5..76cf8e3 100644 ---- a/sse2neon.h -+++ b/sse2neon.h -@@ -180,7 +180,7 @@ - } - - /* Compiler barrier */ --#if defined(_MSC_VER) -+#if defined(_MSC_VER) && !defined(__clang__) - #define SSE2NEON_BARRIER() _ReadWriteBarrier() - #else - #define SSE2NEON_BARRIER() \ -@@ -856,7 +856,7 @@ FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) - { - poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0); - poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0); --#if defined(_MSC_VER) -+#if defined(_MSC_VER) && !defined(__clang__) - __n64 a1 = {a}, b1 = {b}; - return vreinterpretq_u64_p128(vmull_p64(a1, b1)); - #else -@@ -1767,7 +1767,7 @@ FORCE_INLINE void _mm_free(void *addr) - FORCE_INLINE uint64_t _sse2neon_get_fpcr(void) - { - uint64_t value; --#if defined(_MSC_VER) -+#if defined(_MSC_VER) && !defined(__clang__) - value = _ReadStatusReg(ARM64_FPCR); - #else - __asm__ __volatile__("mrs %0, FPCR" : "=r"(value)); /* read */ -@@ -1777,7 +1777,7 @@ FORCE_INLINE uint64_t _sse2neon_get_fpcr(void) - - FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value) - { --#if defined(_MSC_VER) -+#if defined(_MSC_VER) && !defined(__clang__) - _WriteStatusReg(ARM64_FPCR, value); - #else - __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */ -@@ -2246,7 +2246,7 @@ FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) - FORCE_INLINE void _mm_prefetch(char const *p, int i) - { - (void) i; --#if defined(_MSC_VER) -+#if defined(_MSC_VER) && !defined(__clang__) - switch (i) { - case _MM_HINT_NTA: - __prefetch2(p, 1); -@@ -4817,7 +4817,7 @@ FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) - // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause - FORCE_INLINE void _mm_pause(void) - { --#if defined(_MSC_VER) -+#if defined(_MSC_VER) && !defined(__clang__) - __isb(_ARM64_BARRIER_SY); - #else - __asm__ __volatile__("isb\n"); -@@ -5713,7 +5713,7 @@ FORCE_INLINE __m128d _mm_undefined_pd(void) - #pragma GCC diagnostic ignored "-Wuninitialized" - #endif - __m128d a; --#if defined(_MSC_VER) -+#if defined(_MSC_VER) && !defined(__clang__) - a = _mm_setzero_pd(); - #endif - return a; -@@ -8127,7 +8127,7 @@ FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound) - - FORCE_INLINE int _sse2neon_clz(unsigned int x) - { --#ifdef _MSC_VER -+#if defined(_MSC_VER) && !defined(__clang__) - unsigned long cnt = 0; - if (_BitScanReverse(&cnt, x)) - return 31 - cnt; -@@ -8139,7 +8139,7 @@ FORCE_INLINE int _sse2neon_clz(unsigned int x) - - FORCE_INLINE int _sse2neon_ctz(unsigned int x) - { --#ifdef _MSC_VER -+#if defined(_MSC_VER) && !defined(__clang__) - unsigned long cnt = 0; - if (_BitScanForward(&cnt, x)) - return cnt; -@@ -9055,7 +9055,7 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) - // AESE does ShiftRows and SubBytes on A - uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)); - --#ifndef _MSC_VER -+#if !defined(_MSC_VER) || defined(__clang__) - uint8x16_t dest = { - // Undo ShiftRows step from AESE and extract X1 and X3 - 
u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1) -@@ -9242,7 +9242,7 @@ FORCE_INLINE uint64_t _rdtsc(void) - * bits wide and it is attributed with the flag 'cap_user_time_short' - * is true. - */ --#if defined(_MSC_VER) -+#if defined(_MSC_VER) && !defined(__clang__) - val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2)); - #else - __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val)); --- -2.48.1 - diff --git a/CMakeLists.txt b/CMakeLists.txt index d29898b819..dda6979911 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -613,9 +613,6 @@ find_package(VulkanUtilityLibraries) find_package(SimpleIni) find_package(SPIRV-Tools) find_package(sirit) -if (ARCHITECTURE_arm64) - find_package(sse2neon) -endif() if (ARCHITECTURE_x86 OR ARCHITECTURE_x86_64) find_package(xbyak) diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt index 8cc04e99a2..096760925f 100644 --- a/externals/CMakeLists.txt +++ b/externals/CMakeLists.txt @@ -399,10 +399,3 @@ if (ANDROID) add_library(oboe::oboe ALIAS oboe) endif() - -# sse2neon -if (ARCHITECTURE_arm64 AND NOT TARGET sse2neon) - AddJsonPackage(sse2neon) - add_library(sse2neon INTERFACE) - target_include_directories(sse2neon INTERFACE ${sse2neon_SOURCE_DIR}) -endif() diff --git a/externals/cpmfile.json b/externals/cpmfile.json index e1eb15fad3..73cdf3e305 100644 --- a/externals/cpmfile.json +++ b/externals/cpmfile.json @@ -213,14 +213,5 @@ "key": "steamdeck", "bundled": true, "skip_updates": "true" - }, - "sse2neon": { - "repo": "DLTcollab/sse2neon", - "sha": "66267b52fd", - "hash": "3aed8676e1b8c428acb076464663e3968a721457b08710a7c5f8df2fbdaa5601053c1606169a55e987e7a58dd17e3cc3b7fbf953aa891c5ac5f8ce2941862e4b", - "download_only": "true", - "patches": [ - "0001-Add-support-for-clang-cl-on-Windows-633.patch" - ] } } diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 444e0461f1..db99c2bcb9 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -398,8 +398,4 @@ if (ANDROID AND ARCHITECTURE_arm64) target_link_libraries(video_core PRIVATE adrenotools) endif() -if (ARCHITECTURE_arm64) - target_link_libraries(video_core PRIVATE sse2neon) -endif() - create_target_directory_groups(video_core) diff --git a/src/video_core/host1x/vic.cpp b/src/video_core/host1x/vic.cpp index 3dbbfa5552..21cf5f4e92 100644 --- a/src/video_core/host1x/vic.cpp +++ b/src/video_core/host1x/vic.cpp @@ -8,21 +8,6 @@ #include #include -#if defined(ARCHITECTURE_x86_64) -#if defined(_MSC_VER) -#include -#else -#include -#endif -#elif defined(ARCHITECTURE_arm64) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wimplicit-int-conversion" -#pragma GCC diagnostic ignored "-Wconversion" -#pragma GCC diagnostic ignored "-Wshadow" -#include -#pragma GCC diagnostic pop -#endif - extern "C" { #if defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic push @@ -55,14 +40,6 @@ extern "C" { namespace Tegra::Host1x { namespace { -static bool HasSSE41() { -#if defined(ARCHITECTURE_x86_64) - const auto& cpu_caps{Common::GetCPUCaps()}; - return cpu_caps.sse4_1; -#else - return false; -#endif -} void SwizzleSurface(std::span output, u32 out_stride, std::span input, u32 in_stride, u32 height) { @@ -100,7 +77,7 @@ void SwizzleSurface(std::span output, u32 out_stride, std::span in Vic::Vic(Host1x& host1x_, s32 id_, u32 syncpt, FrameQueue& frame_queue_) : CDmaPusher{host1x_, id_}, id{id_}, syncpoint{syncpt}, - frame_queue{frame_queue_}, has_sse41{HasSSE41()} { + frame_queue{frame_queue_} { LOG_INFO(HW_GPU, "Created vic {}", id); } @@ 
-110,7 +87,7 @@ Vic::~Vic() { } void Vic::ProcessMethod(u32 method, u32 arg) { - LOG_TRACE(HW_GPU, "Vic {} method {:#X}", id, static_cast(method)); + LOG_TRACE(HW_GPU, "Vic {} method {:#X}", id, u32(method)); regs.reg_array[method] = arg; switch (static_cast(method * sizeof(u32))) { @@ -145,69 +122,57 @@ void Vic::Execute() { nvdec_id = frame_queue.VicFindNvdecFdFromOffset(luma_offset); } - auto frame = frame_queue.GetFrame(nvdec_id, luma_offset); - - if (!frame) { - continue; - } - - if (!frame.get()) { - LOG_ERROR(HW_GPU, "Vic {} failed to get frame with offset {:#X}", id, luma_offset); - continue; - } - - switch (frame->GetPixelFormat()) { - case AV_PIX_FMT_YUV420P: - ReadY8__V8U8_N420(slot_config, regs.surfaces[i], std::move(frame)); - break; - case AV_PIX_FMT_NV12: - ReadY8__V8U8_N420(slot_config, regs.surfaces[i], std::move(frame)); - break; - default: - UNIMPLEMENTED_MSG( - "Unimplemented slot pixel format {}", - static_cast(slot_config.surface_config.slot_pixel_format.Value())); - break; + if (auto frame = frame_queue.GetFrame(nvdec_id, luma_offset); frame) { + if (frame.get()) { + switch (frame->GetPixelFormat()) { + case AV_PIX_FMT_YUV420P: + ReadY8__V8U8_N420(slot_config, regs.surfaces[i], std::move(frame), true); + break; + case AV_PIX_FMT_NV12: + ReadY8__V8U8_N420(slot_config, regs.surfaces[i], std::move(frame), false); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented slot pixel format {}", u32(slot_config.surface_config.slot_pixel_format.Value())); + break; + } + Blend(config, slot_config); + } else { + LOG_ERROR(HW_GPU, "Vic {} failed to get frame with offset {:#X}", id, luma_offset); + } } - - Blend(config, slot_config); } } switch (config.output_surface_config.out_pixel_format) { case VideoPixelFormat::A8B8G8R8: case VideoPixelFormat::X8B8G8R8: - WriteABGR(config.output_surface_config); + WriteABGR(config.output_surface_config, VideoPixelFormat::A8B8G8R8); break; case VideoPixelFormat::A8R8G8B8: - WriteABGR(config.output_surface_config); + WriteABGR(config.output_surface_config, VideoPixelFormat::A8R8G8B8); break; case VideoPixelFormat::Y8__V8U8_N420: WriteY8__V8U8_N420(config.output_surface_config); break; default: - UNIMPLEMENTED_MSG("Unknown video pixel format {}", - config.output_surface_config.out_pixel_format.Value()); + UNIMPLEMENTED_MSG("Unknown video pixel format {}", config.output_surface_config.out_pixel_format.Value()); break; } } -template -void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, - std::span offsets, - std::shared_ptr frame) { +void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span offsets, std::shared_ptr frame, bool planar, bool interlaced) { const auto out_luma_width{slot.surface_config.slot_surface_width + 1}; auto out_luma_height{slot.surface_config.slot_surface_height + 1}; const auto out_luma_stride{out_luma_width}; - if constexpr (Interlaced) { + if(interlaced) { out_luma_height *= 2; } slot_surface.resize_destructive(out_luma_width * out_luma_height); - const auto in_luma_width{(std::min)(frame->GetWidth(), static_cast(out_luma_width))}; - const auto in_luma_height{(std::min)(frame->GetHeight(), static_cast(out_luma_height))}; + const auto in_luma_width{(std::min)(frame->GetWidth(), s32(out_luma_width))}; + const auto in_luma_height{(std::min)(frame->GetHeight(), s32(out_luma_height))}; const auto in_luma_stride{frame->GetStride(0)}; const auto in_chroma_stride{frame->GetStride(1)}; @@ -224,204 +189,29 @@ void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, in_chroma_stride, out_luma_width, 
out_luma_height, out_luma_stride, out_luma_width, out_luma_height, out_luma_stride); - [[maybe_unused]] auto DecodeLinear = [&]() { - const auto alpha{static_cast(slot.config.planar_alpha.Value())}; - - for (s32 y = 0; y < in_luma_height; y++) { - const auto src_luma{y * in_luma_stride}; - const auto src_chroma{(y / 2) * in_chroma_stride}; - const auto dst{y * out_luma_stride}; - for (s32 x = 0; x < in_luma_width; x++) { - slot_surface[dst + x].r = static_cast(luma_buffer[src_luma + x] << 2); - // Chroma samples are duplicated horizontally and vertically. - if constexpr (Planar) { - slot_surface[dst + x].g = - static_cast(chroma_u_buffer[src_chroma + x / 2] << 2); - slot_surface[dst + x].b = - static_cast(chroma_v_buffer[src_chroma + x / 2] << 2); - } else { - slot_surface[dst + x].g = - static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2); - slot_surface[dst + x].b = - static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2); - } - slot_surface[dst + x].a = alpha; - } - } - }; - -#if defined(ARCHITECTURE_x86_64) - if (!has_sse41) { - DecodeLinear(); - return; - } -#endif - -#if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64) - const auto alpha_linear{static_cast(slot.config.planar_alpha.Value())}; - const auto alpha = - _mm_slli_epi64(_mm_set1_epi64x(static_cast(slot.config.planar_alpha.Value())), 48); - - const auto shuffle_mask = _mm_set_epi8(13, 15, 14, 12, 9, 11, 10, 8, 5, 7, 6, 4, 1, 3, 2, 0); - const auto sse_aligned_width = Common::AlignDown(in_luma_width, 16); - + const auto alpha{u16(slot.config.planar_alpha.Value())}; for (s32 y = 0; y < in_luma_height; y++) { const auto src_luma{y * in_luma_stride}; const auto src_chroma{(y / 2) * in_chroma_stride}; const auto dst{y * out_luma_stride}; - s32 x = 0; - for (; x < sse_aligned_width; x += 16) { - // clang-format off - // Prefetch next iteration's memory - _mm_prefetch((const char*)&luma_buffer[src_luma + x + 16], _MM_HINT_T0); - - // Load 8 bytes * 2 of 8-bit luma samples - // luma0 = 00 00 00 00 00 00 00 00 LL LL LL LL LL LL LL LL - auto luma0 = _mm_loadl_epi64((__m128i*)&luma_buffer[src_luma + x + 0]); - auto luma1 = _mm_loadl_epi64((__m128i*)&luma_buffer[src_luma + x + 8]); - - __m128i chroma; - - if constexpr (Planar) { - _mm_prefetch((const char*)&chroma_u_buffer[src_chroma + x / 2 + 8], _MM_HINT_T0); - _mm_prefetch((const char*)&chroma_v_buffer[src_chroma + x / 2 + 8], _MM_HINT_T0); - - // If Chroma is planar, we have separate U and V planes, load 8 bytes of each - // chroma_u0 = 00 00 00 00 00 00 00 00 UU UU UU UU UU UU UU UU - // chroma_v0 = 00 00 00 00 00 00 00 00 VV VV VV VV VV VV VV VV - auto chroma_u0 = _mm_loadl_epi64((__m128i*)&chroma_u_buffer[src_chroma + x / 2]); - auto chroma_v0 = _mm_loadl_epi64((__m128i*)&chroma_v_buffer[src_chroma + x / 2]); - - // Interleave the 8 bytes of U and V into a single 16 byte reg - // chroma = VV UU VV UU VV UU VV UU VV UU VV UU VV UU VV UU - chroma = _mm_unpacklo_epi8(chroma_u0, chroma_v0); - } else { - _mm_prefetch((const char*)&chroma_u_buffer[src_chroma + x / 2 + 8], _MM_HINT_T0); - - // Chroma is already interleaved in semiplanar format, just load 16 bytes - // chroma = VV UU VV UU VV UU VV UU VV UU VV UU VV UU VV UU - chroma = _mm_load_si128((__m128i*)&chroma_u_buffer[src_chroma + x]); - } - - // Convert the low 8 bytes of 8-bit luma into 16-bit luma - // luma0 = [00] [00] [00] [00] [00] [00] [00] [00] [LL] [LL] [LL] [LL] [LL] [LL] [LL] [LL] - // -> - // luma0 = [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] - luma0 = 
_mm_cvtepu8_epi16(luma0); - luma1 = _mm_cvtepu8_epi16(luma1); - - // Treat the 8 bytes of 8-bit chroma as 16-bit channels, this allows us to take both the - // U and V together as one element. Using chroma twice here duplicates the values, as we - // take element 0 from chroma, and then element 0 from chroma again, etc. We need to - // duplicate chroma horitonally as chroma is half the width of luma. - // chroma = [VV8 UU8] [VV7 UU7] [VV6 UU6] [VV5 UU5] [VV4 UU4] [VV3 UU3] [VV2 UU2] [VV1 UU1] - // -> - // chroma00 = [VV4 UU4] [VV4 UU4] [VV3 UU3] [VV3 UU3] [VV2 UU2] [VV2 UU2] [VV1 UU1] [VV1 UU1] - // chroma01 = [VV8 UU8] [VV8 UU8] [VV7 UU7] [VV7 UU7] [VV6 UU6] [VV6 UU6] [VV5 UU5] [VV5 UU5] - auto chroma00 = _mm_unpacklo_epi16(chroma, chroma); - auto chroma01 = _mm_unpackhi_epi16(chroma, chroma); - - // Interleave the 16-bit luma and chroma. - // luma0 = [008 LL8] [007 LL7] [006 LL6] [005 LL5] [004 LL4] [003 LL3] [002 LL2] [001 LL1] - // chroma00 = [VV8 UU8] [VV7 UU7] [VV6 UU6] [VV5 UU5] [VV4 UU4] [VV3 UU3] [VV2 UU2] [VV1 UU1] - // -> - // yuv0 = [VV4 UU4 004 LL4] [VV3 UU3 003 LL3] [VV2 UU2 002 LL2] [VV1 UU1 001 LL1] - // yuv1 = [VV8 UU8 008 LL8] [VV7 UU7 007 LL7] [VV6 UU6 006 LL6] [VV5 UU5 005 LL5] - auto yuv0 = _mm_unpacklo_epi16(luma0, chroma00); - auto yuv1 = _mm_unpackhi_epi16(luma0, chroma00); - auto yuv2 = _mm_unpacklo_epi16(luma1, chroma01); - auto yuv3 = _mm_unpackhi_epi16(luma1, chroma01); - - // Shuffle the luma/chroma into the channel ordering we actually want. The high byte of - // the luma which is now a constant 0 after converting 8-bit -> 16-bit is used as the - // alpha. Luma -> R, U -> G, V -> B, 0 -> A - // yuv0 = [VV4 UU4 004 LL4] [VV3 UU3 003 LL3] [VV2 UU2 002 LL2] [VV1 UU1 001 LL1] - // -> - // yuv0 = [AA4 VV4 UU4 LL4] [AA3 VV3 UU3 LL3] [AA2 VV2 UU2 LL2] [AA1 VV1 UU1 LL1] - yuv0 = _mm_shuffle_epi8(yuv0, shuffle_mask); - yuv1 = _mm_shuffle_epi8(yuv1, shuffle_mask); - yuv2 = _mm_shuffle_epi8(yuv2, shuffle_mask); - yuv3 = _mm_shuffle_epi8(yuv3, shuffle_mask); - - // Extend the 8-bit channels we have into 16-bits, as that's the target surface format. - // Since this turns just the low 8 bytes into 16 bytes, the second of - // each operation here right shifts the register by 8 to get the high pixels. - // yuv0 = [AA4] [VV4] [UU4] [LL4] [AA3] [VV3] [UU3] [LL3] [AA2] [VV2] [UU2] [LL2] [AA1] [VV1] [UU1] [LL1] - // -> - // yuv01 = [002 AA2] [002 VV2] [002 UU2] [002 LL2] [001 AA1] [001 VV1] [001 UU1] [001 LL1] - // yuv23 = [004 AA4] [004 VV4] [004 UU4] [004 LL4] [003 AA3] [003 VV3] ]003 UU3] [003 LL3] - auto yuv01 = _mm_cvtepu8_epi16(yuv0); - auto yuv23 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv0, 8)); - auto yuv45 = _mm_cvtepu8_epi16(yuv1); - auto yuv67 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv1, 8)); - auto yuv89 = _mm_cvtepu8_epi16(yuv2); - auto yuv1011 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv2, 8)); - auto yuv1213 = _mm_cvtepu8_epi16(yuv3); - auto yuv1415 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv3, 8)); - - // Left-shift all 16-bit channels by 2, this is to get us into a 10-bit format instead - // of 8, which is the format alpha is in, as well as other blending values. 
- yuv01 = _mm_slli_epi16(yuv01, 2); - yuv23 = _mm_slli_epi16(yuv23, 2); - yuv45 = _mm_slli_epi16(yuv45, 2); - yuv67 = _mm_slli_epi16(yuv67, 2); - yuv89 = _mm_slli_epi16(yuv89, 2); - yuv1011 = _mm_slli_epi16(yuv1011, 2); - yuv1213 = _mm_slli_epi16(yuv1213, 2); - yuv1415 = _mm_slli_epi16(yuv1415, 2); - - // OR in the planar alpha, this has already been duplicated and shifted into position, - // and just fills in the AA channels with the actual alpha value. - yuv01 = _mm_or_si128(yuv01, alpha); - yuv23 = _mm_or_si128(yuv23, alpha); - yuv45 = _mm_or_si128(yuv45, alpha); - yuv67 = _mm_or_si128(yuv67, alpha); - yuv89 = _mm_or_si128(yuv89, alpha); - yuv1011 = _mm_or_si128(yuv1011, alpha); - yuv1213 = _mm_or_si128(yuv1213, alpha); - yuv1415 = _mm_or_si128(yuv1415, alpha); - - // Store out the pixels. One pixel is now 8 bytes, so each store is 2 pixels. - // [AA AA] [VV VV] [UU UU] [LL LL] [AA AA] [VV VV] [UU UU] [LL LL] - _mm_store_si128((__m128i*)&slot_surface[dst + x + 0], yuv01); - _mm_store_si128((__m128i*)&slot_surface[dst + x + 2], yuv23); - _mm_store_si128((__m128i*)&slot_surface[dst + x + 4], yuv45); - _mm_store_si128((__m128i*)&slot_surface[dst + x + 6], yuv67); - _mm_store_si128((__m128i*)&slot_surface[dst + x + 8], yuv89); - _mm_store_si128((__m128i*)&slot_surface[dst + x + 10], yuv1011); - _mm_store_si128((__m128i*)&slot_surface[dst + x + 12], yuv1213); - _mm_store_si128((__m128i*)&slot_surface[dst + x + 14], yuv1415); - - // clang-format on - } - - for (; x < in_luma_width; x++) { - slot_surface[dst + x].r = static_cast(luma_buffer[src_luma + x] << 2); + for (s32 x = 0; x < in_luma_width; x++) { + slot_surface[dst + x].r = u16(luma_buffer[src_luma + x] << 2); // Chroma samples are duplicated horizontally and vertically. - if constexpr (Planar) { - slot_surface[dst + x].g = - static_cast(chroma_u_buffer[src_chroma + x / 2] << 2); - slot_surface[dst + x].b = - static_cast(chroma_v_buffer[src_chroma + x / 2] << 2); + if(planar) { + slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + x / 2] << 2); + slot_surface[dst + x].b = u16(chroma_v_buffer[src_chroma + x / 2] << 2); } else { - slot_surface[dst + x].g = - static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2); - slot_surface[dst + x].b = - static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2); + slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2); + slot_surface[dst + x].b = u16(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2); } - slot_surface[dst + x].a = alpha_linear; + slot_surface[dst + x].a = alpha; } } -#else - DecodeLinear(); -#endif } -template -void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span offsets, - std::shared_ptr frame) { - if constexpr (!Planar) { - ReadProgressiveY8__V8U8_N420(slot, offsets, std::move(frame)); +void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span offsets, std::shared_ptr frame, bool planar, bool top_field) { + if(!planar) { + ReadProgressiveY8__V8U8_N420(slot, offsets, std::move(frame), planar, true); return; } const auto out_luma_width{slot.surface_config.slot_surface_width + 1}; @@ -430,9 +220,9 @@ void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::spanGetWidth(), static_cast(out_luma_width))}; + const auto in_luma_width{(std::min)(frame->GetWidth(), s32(out_luma_width))}; [[maybe_unused]] const auto in_luma_height{ - (std::min)(frame->GetHeight(), static_cast(out_luma_height))}; + (std::min)(frame->GetHeight(), s32(out_luma_height))}; const auto in_luma_stride{frame->GetStride(0)}; 
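// Aside: a standalone sketch of the scalar read path the patch settles on
// (type and function names here are illustrative, not the emulator's). Each
// 8-bit Y/U/V sample is widened into the 10-bit working range by << 2, and
// the half-resolution 4:2:0 chroma is duplicated across each pair of luma
// columns - a shape compilers can generally auto-vectorize on their own.
#include <cstdint>
struct Pixel16 { std::uint16_t r, g, b, a; };  // Y -> r, U -> g, V -> b
void DecodeRow(Pixel16* dst, const std::uint8_t* luma, const std::uint8_t* cu,
               const std::uint8_t* cv, int width, bool planar, std::uint16_t alpha) {
    for (int x = 0; x < width; x++) {
        dst[x].r = std::uint16_t(luma[x] << 2);              // 8-bit -> 10-bit
        if (planar) {                                        // YUV420P: split U/V planes
            dst[x].g = std::uint16_t(cu[x / 2] << 2);
            dst[x].b = std::uint16_t(cv[x / 2] << 2);
        } else {                                             // NV12: interleaved UV
            dst[x].g = std::uint16_t(cu[(x & ~1) + 0] << 2);
            dst[x].b = std::uint16_t(cu[(x & ~1) + 1] << 2);
        }
        dst[x].a = alpha;                                    // planar alpha (10-bit)
    }
}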
[[maybe_unused]] const auto in_chroma_width{(frame->GetWidth() + 1) / 2}; @@ -451,81 +241,62 @@ void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span(slot.config.planar_alpha.Value())}; - - for (s32 y = static_cast(TopField == false); y < in_chroma_height * 2; y += 2) { - const auto src_luma{y * in_luma_stride}; - const auto src_chroma{(y / 2) * in_chroma_stride}; - const auto dst{y * out_luma_stride}; - for (s32 x = 0; x < in_luma_width; x++) { - slot_surface[dst + x].r = static_cast(luma_buffer[src_luma + x] << 2); - if constexpr (Planar) { - slot_surface[dst + x].g = - static_cast(chroma_u_buffer[src_chroma + x / 2] << 2); - slot_surface[dst + x].b = - static_cast(chroma_v_buffer[src_chroma + x / 2] << 2); - } else { - slot_surface[dst + x].g = - static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2); - slot_surface[dst + x].b = - static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2); - } - slot_surface[dst + x].a = alpha; - } - - s32 other_line{}; - if constexpr (TopField) { - other_line = (y + 1) * out_luma_stride; + auto DecodeBobField = [&]() { + const auto alpha{u16(slot.config.planar_alpha.Value())}; + for (s32 y = s32(top_field == false); y < in_chroma_height * 2; y += 2) { + const auto src_luma{y * in_luma_stride}; + const auto src_chroma{(y / 2) * in_chroma_stride}; + const auto dst{y * out_luma_stride}; + for (s32 x = 0; x < in_luma_width; x++) { + slot_surface[dst + x].r = u16(luma_buffer[src_luma + x] << 2); + if(planar) { + slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + x / 2] << 2); + slot_surface[dst + x].b = u16(chroma_v_buffer[src_chroma + x / 2] << 2); } else { - other_line = (y - 1) * out_luma_stride; + slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2); + slot_surface[dst + x].b = u16(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2); } - std::memcpy(&slot_surface[other_line], &slot_surface[dst], - out_luma_width * sizeof(Pixel)); + slot_surface[dst + x].a = alpha; } - }; - - switch (slot.config.deinterlace_mode) { - case DXVAHD_DEINTERLACE_MODE_PRIVATE::WEAVE: - // Due to the fact that we do not write to memory in nvdec, we cannot use Weave as it - // relies on the previous frame. - DecodeBobField(); - break; - case DXVAHD_DEINTERLACE_MODE_PRIVATE::BOB_FIELD: - DecodeBobField(); - break; - case DXVAHD_DEINTERLACE_MODE_PRIVATE::DISI1: - // Due to the fact that we do not write to memory in nvdec, we cannot use DISI1 as it - // relies on previous/next frames. - DecodeBobField(); - break; - default: - UNIMPLEMENTED_MSG("Deinterlace mode {} not implemented!", - static_cast(slot.config.deinterlace_mode.Value())); - break; + s32 other_line = (top_field ? y + 1 : y - 1) * out_luma_stride; + std::memcpy(&slot_surface[other_line], &slot_surface[dst], out_luma_width * sizeof(Pixel)); } }; - DecodeLinear(); + switch (slot.config.deinterlace_mode) { + case DXVAHD_DEINTERLACE_MODE_PRIVATE::WEAVE: + // Due to the fact that we do not write to memory in nvdec, we cannot use Weave as it + // relies on the previous frame. + DecodeBobField(); + break; + case DXVAHD_DEINTERLACE_MODE_PRIVATE::BOB_FIELD: + DecodeBobField(); + break; + case DXVAHD_DEINTERLACE_MODE_PRIVATE::DISI1: + // Due to the fact that we do not write to memory in nvdec, we cannot use DISI1 as it + // relies on previous/next frames. 
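// Aside: bob deinterlacing as DecodeBobField above implements it, reduced to
// a standalone sketch (helper names are illustrative). The source carries
// only the lines of the current field, so each missing line is filled by
// copying its decoded neighbour: y + 1 for a top field, y - 1 for a bottom
// field. Assumes an even height so the copied neighbour stays in bounds.
#include <cstring>
template <typename Pixel, typename DecodeLineFn>
void BobDeinterlace(Pixel* dst, int width, int height, int stride,
                    bool top_field, DecodeLineFn&& DecodeLine) {
    for (int y = top_field ? 0 : 1; y < height; y += 2) {
        DecodeLine(dst + y * stride, y);              // decode one field line
        const int other = top_field ? y + 1 : y - 1;  // row of the absent field
        std::memcpy(dst + other * stride, dst + y * stride, width * sizeof(Pixel));
    }
}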
+ DecodeBobField(); + break; + default: + UNIMPLEMENTED_MSG("Deinterlace mode {} not implemented!", s32(slot.config.deinterlace_mode.Value())); + break; + } } -template -void Vic::ReadY8__V8U8_N420(const SlotStruct& slot, std::span offsets, - std::shared_ptr frame) { +void Vic::ReadY8__V8U8_N420(const SlotStruct& slot, std::span offsets, std::shared_ptr frame, bool planar) { switch (slot.config.frame_format) { case DXVAHD_FRAME_FORMAT::PROGRESSIVE: - ReadProgressiveY8__V8U8_N420(slot, offsets, std::move(frame)); + ReadProgressiveY8__V8U8_N420(slot, offsets, std::move(frame), planar, false); break; case DXVAHD_FRAME_FORMAT::TOP_FIELD: - ReadInterlacedY8__V8U8_N420(slot, offsets, std::move(frame)); + ReadInterlacedY8__V8U8_N420(slot, offsets, std::move(frame), planar, true); break; case DXVAHD_FRAME_FORMAT::BOTTOM_FIELD: - ReadInterlacedY8__V8U8_N420(slot, offsets, std::move(frame)); + ReadInterlacedY8__V8U8_N420(slot, offsets, std::move(frame), planar, false); break; default: LOG_ERROR(HW_GPU, "Unknown deinterlace format {}", - static_cast(slot.config.frame_format.Value())); + s32(slot.config.frame_format.Value())); break; } } @@ -533,15 +304,15 @@ void Vic::ReadY8__V8U8_N420(const SlotStruct& slot, std::span u32 { return v != 0 ? v + 1 : 0; }); - auto source_left{add_one(static_cast(slot.config.source_rect_left.Value()))}; - auto source_right{add_one(static_cast(slot.config.source_rect_right.Value()))}; - auto source_top{add_one(static_cast(slot.config.source_rect_top.Value()))}; - auto source_bottom{add_one(static_cast(slot.config.source_rect_bottom.Value()))}; + auto source_left{add_one(u32(slot.config.source_rect_left.Value()))}; + auto source_right{add_one(u32(slot.config.source_rect_right.Value()))}; + auto source_top{add_one(u32(slot.config.source_rect_top.Value()))}; + auto source_bottom{add_one(u32(slot.config.source_rect_bottom.Value()))}; - const auto dest_left{add_one(static_cast(slot.config.dest_rect_left.Value()))}; - const auto dest_right{add_one(static_cast(slot.config.dest_rect_right.Value()))}; - const auto dest_top{add_one(static_cast(slot.config.dest_rect_top.Value()))}; - const auto dest_bottom{add_one(static_cast(slot.config.dest_rect_bottom.Value()))}; + const auto dest_left{add_one(u32(slot.config.dest_rect_left.Value()))}; + const auto dest_right{add_one(u32(slot.config.dest_rect_right.Value()))}; + const auto dest_top{add_one(u32(slot.config.dest_rect_top.Value()))}; + const auto dest_bottom{add_one(u32(slot.config.dest_rect_bottom.Value()))}; auto rect_left{add_one(config.output_config.target_rect_left.Value())}; auto rect_right{add_one(config.output_config.target_rect_right.Value())}; @@ -589,220 +360,59 @@ void Vic::Blend(const ConfigStruct& config, const SlotStruct& slot) { // | r1c0 r1c1 r1c2 r1c3 | * | G | = | G | // | r2c0 r2c1 r2c2 r2c3 | | B | | B | // | 1 | - // clang-format on - - [[maybe_unused]] auto DecodeLinear = [&]() { - const auto r0c0 = static_cast(slot.color_matrix.matrix_coeff00.Value()); - const auto r0c1 = static_cast(slot.color_matrix.matrix_coeff01.Value()); - const auto r0c2 = static_cast(slot.color_matrix.matrix_coeff02.Value()); - const auto r0c3 = static_cast(slot.color_matrix.matrix_coeff03.Value()); - const auto r1c0 = static_cast(slot.color_matrix.matrix_coeff10.Value()); - const auto r1c1 = static_cast(slot.color_matrix.matrix_coeff11.Value()); - const auto r1c2 = static_cast(slot.color_matrix.matrix_coeff12.Value()); - const auto r1c3 = static_cast(slot.color_matrix.matrix_coeff13.Value()); - const auto r2c0 = 
static_cast(slot.color_matrix.matrix_coeff20.Value()); - const auto r2c1 = static_cast(slot.color_matrix.matrix_coeff21.Value()); - const auto r2c2 = static_cast(slot.color_matrix.matrix_coeff22.Value()); - const auto r2c3 = static_cast(slot.color_matrix.matrix_coeff23.Value()); - - const auto shift = static_cast(slot.color_matrix.matrix_r_shift.Value()); - const auto clamp_min = static_cast(slot.config.soft_clamp_low.Value()); - const auto clamp_max = static_cast(slot.config.soft_clamp_high.Value()); - - auto MatMul = [&](const Pixel& in_pixel) -> std::tuple { - auto r = static_cast(in_pixel.r); - auto g = static_cast(in_pixel.g); - auto b = static_cast(in_pixel.b); - - r = in_pixel.r * r0c0 + in_pixel.g * r0c1 + in_pixel.b * r0c2; - g = in_pixel.r * r1c0 + in_pixel.g * r1c1 + in_pixel.b * r1c2; - b = in_pixel.r * r2c0 + in_pixel.g * r2c1 + in_pixel.b * r2c2; - - r >>= shift; - g >>= shift; - b >>= shift; - - r += r0c3; - g += r1c3; - b += r2c3; - - r >>= 8; - g >>= 8; - b >>= 8; - - return {r, g, b, static_cast(in_pixel.a)}; - }; - - for (u32 y = source_top; y < source_bottom; y++) { - const auto src{y * in_surface_width + source_left}; - const auto dst{y * out_surface_width + rect_left}; - for (u32 x = source_left; x < source_right; x++) { - auto [r, g, b, a] = MatMul(slot_surface[src + x]); - - r = std::clamp(r, clamp_min, clamp_max); - g = std::clamp(g, clamp_min, clamp_max); - b = std::clamp(b, clamp_min, clamp_max); - a = std::clamp(a, clamp_min, clamp_max); - - output_surface[dst + x] = {static_cast(r), static_cast(g), - static_cast(b), static_cast(a)}; - } - } - }; - -#if defined(ARCHITECTURE_x86_64) - if (!has_sse41) { - DecodeLinear(); - return; - } -#endif - -#if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64) - // Fill the columns, e.g - // c0 = [00 00 00 00] [r2c0 r2c0 r2c0 r2c0] [r1c0 r1c0 r1c0 r1c0] [r0c0 r0c0 r0c0 r0c0] - - const auto c0 = _mm_set_epi32(0, static_cast(slot.color_matrix.matrix_coeff20.Value()), - static_cast(slot.color_matrix.matrix_coeff10.Value()), - static_cast(slot.color_matrix.matrix_coeff00.Value())); - const auto c1 = _mm_set_epi32(0, static_cast(slot.color_matrix.matrix_coeff21.Value()), - static_cast(slot.color_matrix.matrix_coeff11.Value()), - static_cast(slot.color_matrix.matrix_coeff01.Value())); - const auto c2 = _mm_set_epi32(0, static_cast(slot.color_matrix.matrix_coeff22.Value()), - static_cast(slot.color_matrix.matrix_coeff12.Value()), - static_cast(slot.color_matrix.matrix_coeff02.Value())); - const auto c3 = _mm_set_epi32(0, static_cast(slot.color_matrix.matrix_coeff23.Value()), - static_cast(slot.color_matrix.matrix_coeff13.Value()), - static_cast(slot.color_matrix.matrix_coeff03.Value())); - - // Set the matrix right-shift as a single element. - const auto shift = - _mm_set_epi32(0, 0, 0, static_cast(slot.color_matrix.matrix_r_shift.Value())); - - // Set every 16-bit value to the soft clamp values for clamping every 16-bit channel. 
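// Aside: the per-channel arithmetic that both this removed SSE path and the
// surviving scalar MatMul below evaluate, condensed into one sketch. Per the
// removed comments, the constant column is added after the TRM right-shift
// (it ignores r_shift), and the final >> 8 converts the S12.8 fixed-point
// result back to integers before the soft clamp.
static int ConvertChannel(int r, int g, int b, int c0, int c1, int c2, int c3,
                          int r_shift, int lo, int hi) {
    int v = r * c0 + g * c1 + b * c2;        // one row of the 3x4 colour matrix
    v >>= r_shift;                           // TRM-specified right shift
    v += c3;                                 // constant column, added unshifted
    v >>= 8;                                 // S12.8 -> integer
    return v < lo ? lo : (v > hi ? hi : v);  // soft clamp to [lo, hi]
}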
- const auto clamp_min = _mm_set1_epi16(static_cast(slot.config.soft_clamp_low.Value())); - const auto clamp_max = - _mm_set1_epi16(static_cast(slot.config.soft_clamp_high.Value())); - - // clang-format off - - auto MatMul = [](__m128i& p, const __m128i& col0, const __m128i& col1, const __m128i& col2, - const __m128i& col3, const __m128i& trm_shift) -> __m128i { - // Duplicate the 32-bit channels, e.g - // p = [AA AA AA AA] [BB BB BB BB] [GG GG GG GG] [RR RR RR RR] - // -> - // r = [RR4 RR4 RR4 RR4] [RR3 RR3 RR3 RR3] [RR2 RR2 RR2 RR2] [RR1 RR1 RR1 RR1] - auto r = _mm_shuffle_epi32(p, 0x0); - auto g = _mm_shuffle_epi32(p, 0x55); - auto b = _mm_shuffle_epi32(p, 0xAA); - - // Multiply the rows and columns c0 * r, c1 * g, c2 * b, e.g - // r = [RR4 RR4 RR4 RR4] [ RR3 RR3 RR3 RR3] [ RR2 RR2 RR2 RR2] [ RR1 RR1 RR1 RR1] - // * - // c0 = [ 00 00 00 00] [r2c0 r2c0 r2c0 r2c0] [r1c0 r1c0 r1c0 r1c0] [r0c0 r0c0 r0c0 r0c0] - r = _mm_mullo_epi32(r, col0); - g = _mm_mullo_epi32(g, col1); - b = _mm_mullo_epi32(b, col2); - - // Add them all together vertically, such that the 32-bit element - // out[0] = (r[0] * c0[0]) + (g[0] * c1[0]) + (b[0] * c2[0]) - auto out = _mm_add_epi32(_mm_add_epi32(r, g), b); - - // Shift the result by r_shift, as the TRM says - out = _mm_sra_epi32(out, trm_shift); - - // Add the final column. Because the 4x1 matrix has this row as 1, there's no need to - // multiply by it, and as per the TRM this column ignores r_shift, so it's just added - // here after shifting. - out = _mm_add_epi32(out, col3); - - // Shift the result back from S12.8 to integer values - return _mm_srai_epi32(out, 8); + const auto r0c0 = s32(slot.color_matrix.matrix_coeff00.Value()); + const auto r0c1 = s32(slot.color_matrix.matrix_coeff01.Value()); + const auto r0c2 = s32(slot.color_matrix.matrix_coeff02.Value()); + const auto r0c3 = s32(slot.color_matrix.matrix_coeff03.Value()); + const auto r1c0 = s32(slot.color_matrix.matrix_coeff10.Value()); + const auto r1c1 = s32(slot.color_matrix.matrix_coeff11.Value()); + const auto r1c2 = s32(slot.color_matrix.matrix_coeff12.Value()); + const auto r1c3 = s32(slot.color_matrix.matrix_coeff13.Value()); + const auto r2c0 = s32(slot.color_matrix.matrix_coeff20.Value()); + const auto r2c1 = s32(slot.color_matrix.matrix_coeff21.Value()); + const auto r2c2 = s32(slot.color_matrix.matrix_coeff22.Value()); + const auto r2c3 = s32(slot.color_matrix.matrix_coeff23.Value()); + + const auto shift = s32(slot.color_matrix.matrix_r_shift.Value()); + const auto clamp_min = s32(slot.config.soft_clamp_low.Value()); + const auto clamp_max = s32(slot.config.soft_clamp_high.Value()); + + auto MatMul = [&](const Pixel& in_pixel) -> std::tuple { + auto r = s32(in_pixel.r); + auto g = s32(in_pixel.g); + auto b = s32(in_pixel.b); + + r = in_pixel.r * r0c0 + in_pixel.g * r0c1 + in_pixel.b * r0c2; + g = in_pixel.r * r1c0 + in_pixel.g * r1c1 + in_pixel.b * r1c2; + b = in_pixel.r * r2c0 + in_pixel.g * r2c1 + in_pixel.b * r2c2; + + r >>= shift; + g >>= shift; + b >>= shift; + + r += r0c3; + g += r1c3; + b += r2c3; + + r >>= 8; + g >>= 8; + b >>= 8; + + return {r, g, b, s32(in_pixel.a)}; }; for (u32 y = source_top; y < source_bottom; y++) { const auto src{y * in_surface_width + source_left}; const auto dst{y * out_surface_width + rect_left}; - for (u32 x = source_left; x < source_right; x += 8) { - // clang-format off - // Prefetch the next iteration's memory - _mm_prefetch((const char*)&slot_surface[src + x + 8], _MM_HINT_T0); - - // Load in pixels - // p01 = [AA AA] [BB BB] [GG GG] [RR RR] [AA AA] 
[BB BB] [GG GG] [RR RR] - auto p01 = _mm_load_si128((__m128i*)&slot_surface[src + x + 0]); - auto p23 = _mm_load_si128((__m128i*)&slot_surface[src + x + 2]); - auto p45 = _mm_load_si128((__m128i*)&slot_surface[src + x + 4]); - auto p67 = _mm_load_si128((__m128i*)&slot_surface[src + x + 6]); - - // Convert the 16-bit channels into 32-bit (unsigned), as the matrix values are - // 32-bit and to avoid overflow. - // p01 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1] - // -> - // p01_lo = [001 001 AA1 AA1] [001 001 BB1 BB1] [001 001 GG1 GG1] [001 001 RR1 RR1] - // p01_hi = [002 002 AA2 AA2] [002 002 BB2 BB2] [002 002 GG2 GG2] [002 002 RR2 RR2] - auto p01_lo = _mm_cvtepu16_epi32(p01); - auto p01_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p01, 8)); - auto p23_lo = _mm_cvtepu16_epi32(p23); - auto p23_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p23, 8)); - auto p45_lo = _mm_cvtepu16_epi32(p45); - auto p45_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p45, 8)); - auto p67_lo = _mm_cvtepu16_epi32(p67); - auto p67_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p67, 8)); - - // Matrix multiply the pixel, doing the colour conversion. - auto out0 = MatMul(p01_lo, c0, c1, c2, c3, shift); - auto out1 = MatMul(p01_hi, c0, c1, c2, c3, shift); - auto out2 = MatMul(p23_lo, c0, c1, c2, c3, shift); - auto out3 = MatMul(p23_hi, c0, c1, c2, c3, shift); - auto out4 = MatMul(p45_lo, c0, c1, c2, c3, shift); - auto out5 = MatMul(p45_hi, c0, c1, c2, c3, shift); - auto out6 = MatMul(p67_lo, c0, c1, c2, c3, shift); - auto out7 = MatMul(p67_hi, c0, c1, c2, c3, shift); - - // Pack the 32-bit channel pixels back into 16-bit using unsigned saturation - // out0 = [001 001 AA1 AA1] [001 001 BB1 BB1] [001 001 GG1 GG1] [001 001 RR1 RR1] - // out1 = [002 002 AA2 AA2] [002 002 BB2 BB2] [002 002 GG2 GG2] [002 002 RR2 RR2] - // -> - // done0 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1] - auto done0 = _mm_packus_epi32(out0, out1); - auto done1 = _mm_packus_epi32(out2, out3); - auto done2 = _mm_packus_epi32(out4, out5); - auto done3 = _mm_packus_epi32(out6, out7); - - // Blend the original alpha back into the pixel, as the matrix multiply gives us a - // 3-channel output, not 4. - // 0x88 = b10001000, taking RGB from the first argument, A from the second argument. - // done0 = [002 002] [BB2 BB2] [GG2 GG2] [RR2 RR2] [001 001] [BB1 BB1] [GG1 GG1] [RR1 RR1] - // -> - // done0 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1] - done0 = _mm_blend_epi16(done0, p01, 0x88); - done1 = _mm_blend_epi16(done1, p23, 0x88); - done2 = _mm_blend_epi16(done2, p45, 0x88); - done3 = _mm_blend_epi16(done3, p67, 0x88); - - // Clamp the 16-bit channels to the soft-clamp min/max. - done0 = _mm_max_epu16(done0, clamp_min); - done1 = _mm_max_epu16(done1, clamp_min); - done2 = _mm_max_epu16(done2, clamp_min); - done3 = _mm_max_epu16(done3, clamp_min); - - done0 = _mm_min_epu16(done0, clamp_max); - done1 = _mm_min_epu16(done1, clamp_max); - done2 = _mm_min_epu16(done2, clamp_max); - done3 = _mm_min_epu16(done3, clamp_max); - - // Store the pixels to the output surface. 
- _mm_store_si128((__m128i*)&output_surface[dst + x + 0], done0); - _mm_store_si128((__m128i*)&output_surface[dst + x + 2], done1); - _mm_store_si128((__m128i*)&output_surface[dst + x + 4], done2); - _mm_store_si128((__m128i*)&output_surface[dst + x + 6], done3); - + for (u32 x = source_left; x < source_right; x++) { + auto [r, g, b, a] = MatMul(slot_surface[src + x]); + r = std::clamp(r, clamp_min, clamp_max); + g = std::clamp(g, clamp_min, clamp_max); + b = std::clamp(b, clamp_min, clamp_max); + a = std::clamp(a, clamp_min, clamp_max); + output_surface[dst + x] = {u16(r), u16(g), u16(b), u16(a)}; } } - // clang-format on -#else - DecodeLinear(); -#endif } } @@ -826,7 +436,7 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) { surface_width = (std::min)(surface_width, out_luma_width); surface_height = (std::min)(surface_height, out_luma_height); - [[maybe_unused]] auto DecodeLinear = [&](std::span out_luma, std::span out_chroma) { + auto Decode = [&](std::span out_luma, std::span out_chroma) { for (u32 y = 0; y < surface_height; ++y) { const auto src_luma = y * surface_stride; const auto dst_luma = y * out_luma_stride; @@ -834,173 +444,20 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) { const auto dst_chroma = (y / 2) * out_chroma_stride; for (u32 x = 0; x < surface_width; x += 2) { out_luma[dst_luma + x + 0] = - static_cast(output_surface[src_luma + x + 0].r >> 2); + u8(output_surface[src_luma + x + 0].r >> 2); out_luma[dst_luma + x + 1] = - static_cast(output_surface[src_luma + x + 1].r >> 2); - out_chroma[dst_chroma + x + 0] = - static_cast(output_surface[src_chroma + x].g >> 2); - out_chroma[dst_chroma + x + 1] = - static_cast(output_surface[src_chroma + x].b >> 2); - } - } - }; - - auto Decode = [&](std::span out_luma, std::span out_chroma) { -#if defined(ARCHITECTURE_x86_64) - if (!has_sse41) { - DecodeLinear(out_luma, out_chroma); - return; - } -#endif - -#if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64) - // luma_mask = [00 00] [00 00] [00 00] [FF FF] [00 00] [00 00] [00 00] [FF FF] - const auto luma_mask = _mm_set_epi16(0, 0, 0, -1, 0, 0, 0, -1); - - const auto sse_aligned_width = Common::AlignDown(surface_width, 16); - - for (u32 y = 0; y < surface_height; ++y) { - const auto src = y * surface_stride; - const auto dst_luma = y * out_luma_stride; - const auto dst_chroma = (y / 2) * out_chroma_stride; - u32 x = 0; - for (; x < sse_aligned_width; x += 16) { - // clang-format off - // Prefetch the next cache lines, 2 per iteration - _mm_prefetch((const char*)&output_surface[src + x + 16], _MM_HINT_T0); - _mm_prefetch((const char*)&output_surface[src + x + 24], _MM_HINT_T0); - - // Load the 64-bit pixels, 2 per variable. - auto pixel01 = _mm_load_si128((__m128i*)&output_surface[src + x + 0]); - auto pixel23 = _mm_load_si128((__m128i*)&output_surface[src + x + 2]); - auto pixel45 = _mm_load_si128((__m128i*)&output_surface[src + x + 4]); - auto pixel67 = _mm_load_si128((__m128i*)&output_surface[src + x + 6]); - auto pixel89 = _mm_load_si128((__m128i*)&output_surface[src + x + 8]); - auto pixel1011 = _mm_load_si128((__m128i*)&output_surface[src + x + 10]); - auto pixel1213 = _mm_load_si128((__m128i*)&output_surface[src + x + 12]); - auto pixel1415 = _mm_load_si128((__m128i*)&output_surface[src + x + 14]); - - // Split out the luma of each pixel using the luma_mask above. 
- // pixel01 = [AA2 AA2] [VV2 VV2] [UU2 UU2] [LL2 LL2] [AA1 AA1] [VV1 VV1] [UU1 UU1] [LL1 LL1] - // -> - // l01 = [002 002] [002 002] [002 002] [LL2 LL2] [001 001] [001 001] [001 001] [LL1 LL1] - auto l01 = _mm_and_si128(pixel01, luma_mask); - auto l23 = _mm_and_si128(pixel23, luma_mask); - auto l45 = _mm_and_si128(pixel45, luma_mask); - auto l67 = _mm_and_si128(pixel67, luma_mask); - auto l89 = _mm_and_si128(pixel89, luma_mask); - auto l1011 = _mm_and_si128(pixel1011, luma_mask); - auto l1213 = _mm_and_si128(pixel1213, luma_mask); - auto l1415 = _mm_and_si128(pixel1415, luma_mask); - - // Pack 32-bit elements from 2 registers down into 16-bit elements in 1 register. - // l01 = [002 002 002 002] [002 002 LL2 LL2] [001 001 001 001] [001 001 LL1 LL1] - // l23 = [004 004 004 004] [004 004 LL4 LL4] [003 003 003 003] [003 003 LL3 LL3] - // -> - // l0123 = [004 004] [LL4 LL4] [003 003] [LL3 LL3] [002 002] [LL2 LL2] [001 001] [LL1 LL1] - auto l0123 = _mm_packus_epi32(l01, l23); - auto l4567 = _mm_packus_epi32(l45, l67); - auto l891011 = _mm_packus_epi32(l89, l1011); - auto l12131415 = _mm_packus_epi32(l1213, l1415); - - // Pack 32-bit elements from 2 registers down into 16-bit elements in 1 register. - // l0123 = [004 004 LL4 LL4] [003 003 LL3 LL3] [002 002 LL2 LL2] [001 001 LL1 LL1] - // l4567 = [008 008 LL8 LL8] [007 007 LL7 LL7] [006 006 LL6 LL6] [005 005 LL5 LL5] - // -> - // luma_lo = [LL8 LL8] [LL7 LL7] [LL6 LL6] [LL5 LL5] [LL4 LL4] [LL3 LL3] [LL2 LL2] [LL1 LL1] - auto luma_lo = _mm_packus_epi32(l0123, l4567); - auto luma_hi = _mm_packus_epi32(l891011, l12131415); - - // Right-shift the 16-bit elements by 2, un-doing the left shift by 2 on read - // and bringing the range back to 8-bit. - luma_lo = _mm_srli_epi16(luma_lo, 2); - luma_hi = _mm_srli_epi16(luma_hi, 2); - - // Pack with unsigned saturation the 16-bit values in 2 registers into 8-bit values in 1 register. - // luma_lo = [LL8 LL8] [LL7 LL7] [LL6 LL6] [LL5 LL5] [LL4 LL4] [LL3 LL3] [LL2 LL2] [LL1 LL1] - // luma_hi = [LL16 LL16] [LL15 LL15] [LL14 LL14] [LL13 LL13] [LL12 LL12] [LL11 LL11] [LL10 LL10] [LL9 LL9] - // -> - // luma = [LL16] [LL15] [LL14] [LL13] [LL12] [LL11] [LL10] [LL9] [LL8] [LL7] [LL6] [LL5] [LL4] [LL3] [LL2] [LL1] - auto luma = _mm_packus_epi16(luma_lo, luma_hi); - - // Store the 16 bytes of luma - _mm_store_si128((__m128i*)&out_luma[dst_luma + x], luma); - - if (y % 2 == 0) { - // Chroma, done every other line as it's half the height of luma. - - // Shift the register right by 2 bytes (not bits), to kick out the 16-bit luma. - // We can do this instead of &'ing a mask and then shifting. - // pixel01 = [AA2 AA2] [VV2 VV2] [UU2 UU2] [LL2 LL2] [AA1 AA1] [VV1 VV1] [UU1 UU1] [LL1 LL1] - // -> - // c01 = [ 00 00] [AA2 AA2] [VV2 VV2] [UU2 UU2] [LL2 LL2] [AA1 AA1] [VV1 VV1] [UU1 UU1] - auto c01 = _mm_srli_si128(pixel01, 2); - auto c23 = _mm_srli_si128(pixel23, 2); - auto c45 = _mm_srli_si128(pixel45, 2); - auto c67 = _mm_srli_si128(pixel67, 2); - auto c89 = _mm_srli_si128(pixel89, 2); - auto c1011 = _mm_srli_si128(pixel1011, 2); - auto c1213 = _mm_srli_si128(pixel1213, 2); - auto c1415 = _mm_srli_si128(pixel1415, 2); - - // Interleave the lower 8 bytes as 32-bit elements from 2 registers into 1 register. - // This has the effect of skipping every other chroma value horitonally, - // notice the high pixels UU2/UU4 are skipped. - // This is intended as N420 chroma width is half the luma width. 
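// Aside: the scalar write-back the patch keeps, as a standalone sketch (names
// are illustrative). The 10-bit working pixels are narrowed back to 8 bits
// with >> 2; luma is emitted for every pixel, while the interleaved N420
// chroma plane is half resolution in both axes, so it takes U/V from every
// other source pixel and only needs refreshing on every other line.
#include <cstdint>
struct Pixel16 { std::uint16_t r, g, b, a; };  // Y -> r, U -> g, V -> b
void WriteRowN420(const Pixel16* src, std::uint8_t* out_luma,
                  std::uint8_t* out_chroma, int width, int y) {
    for (int x = 0; x < width; x += 2) {
        out_luma[x + 0] = std::uint8_t(src[x + 0].r >> 2);    // 10-bit -> 8-bit
        out_luma[x + 1] = std::uint8_t(src[x + 1].r >> 2);
        if (y % 2 == 0) {                 // chroma rows are half the luma rows
            out_chroma[x + 0] = std::uint8_t(src[x].g >> 2);  // U
            out_chroma[x + 1] = std::uint8_t(src[x].b >> 2);  // V
        }
    }
}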
- // c01 = [ 00 00 AA2 AA2] [VV2 VV2 UU2 UU2] [LL2 LL2 AA1 AA1] [VV1 VV1 UU1 UU1] - // c23 = [ 00 00 AA4 AA4] [VV4 VV4 UU4 UU4] [LL4 LL4 AA3 AA3] [VV3 VV3 UU3 UU3] - // -> - // c0123 = [LL4 LL4 AA3 AA3] [LL2 LL2 AA1 AA1] [VV3 VV3 UU3 UU3] [VV1 VV1 UU1 UU1] - auto c0123 = _mm_unpacklo_epi32(c01, c23); - auto c4567 = _mm_unpacklo_epi32(c45, c67); - auto c891011 = _mm_unpacklo_epi32(c89, c1011); - auto c12131415 = _mm_unpacklo_epi32(c1213, c1415); - - // Interleave the low 64-bit elements from 2 registers into 1. - // c0123 = [LL4 LL4 AA3 AA3 LL2 LL2 AA1 AA1] [VV3 VV3 UU3 UU3 VV1 VV1 UU1 UU1] - // c4567 = [LL8 LL8 AA7 AA7 LL6 LL6 AA5 AA5] [VV7 VV7 UU7 UU7 VV5 VV5 UU5 UU5] - // -> - // chroma_lo = [VV7 VV7 UU7 UU7 VV5 VV5 UU5 UU5] [VV3 VV3 UU3 UU3 VV1 VV1 UU1 UU1] - auto chroma_lo = _mm_unpacklo_epi64(c0123, c4567); - auto chroma_hi = _mm_unpacklo_epi64(c891011, c12131415); - - // Right-shift the 16-bit elements by 2, un-doing the left shift by 2 on read - // and bringing the range back to 8-bit. - chroma_lo = _mm_srli_epi16(chroma_lo, 2); - chroma_hi = _mm_srli_epi16(chroma_hi, 2); - - // Pack with unsigned saturation the 16-bit elements from 2 registers into 8-bit elements in 1 register. - // chroma_lo = [ VV7 VV7] [ UU7 UU7] [ VV5 VV5] [ UU5 UU5] [ VV3 VV3] [ UU3 UU3] [VV1 VV1] [UU1 UU1] - // chroma_hi = [VV15 VV15] [UU15 UU15] [VV13 VV13] [UU13 UU13] [VV11 VV11] [UU11 UU11] [VV9 VV9] [UU9 UU9] - // -> - // chroma = [VV15] [UU15] [VV13] [UU13] [VV11] [UU11] [VV9] [UU9] [VV7] [UU7] [VV5] [UU5] [VV3] [UU3] [VV1] [UU1] - auto chroma = _mm_packus_epi16(chroma_lo, chroma_hi); - - // Store the 16 bytes of chroma. - _mm_store_si128((__m128i*)&out_chroma[dst_chroma + x + 0], chroma); - } - - // clang-format on - } - - const auto src_chroma = y * surface_stride; - for (; x < surface_width; x += 2) { - out_luma[dst_luma + x + 0] = static_cast(output_surface[src + x + 0].r >> 2); - out_luma[dst_luma + x + 1] = static_cast(output_surface[src + x + 1].r >> 2); + u8(output_surface[src_luma + x + 1].r >> 2); out_chroma[dst_chroma + x + 0] = - static_cast(output_surface[src_chroma + x].g >> 2); + u8(output_surface[src_chroma + x].g >> 2); out_chroma[dst_chroma + x + 1] = - static_cast(output_surface[src_chroma + x].b >> 2); + u8(output_surface[src_chroma + x].b >> 2); } } -#else - DecodeLinear(out_luma, out_chroma); -#endif }; switch (output_surface_config.out_block_kind) { case BLK_KIND::GENERIC_16Bx2: { - const u32 block_height = static_cast(output_surface_config.out_block_height); + const u32 block_height = u32(output_surface_config.out_block_height); const auto out_luma_swizzle_size = Texture::CalculateSize( true, BytesPerPixel, out_luma_width, out_luma_height, 1, block_height, 0); const auto out_chroma_swizzle_size = Texture::CalculateSize( @@ -1028,23 +485,18 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) { &swizzle_scratch); if (block_height == 1) { - SwizzleSurface(out_luma, out_luma_stride, luma_scratch, out_luma_stride, - out_luma_height); + SwizzleSurface(out_luma, out_luma_stride, luma_scratch, out_luma_stride, out_luma_height); } else { - Texture::SwizzleTexture(out_luma, luma_scratch, BytesPerPixel, out_luma_width, - out_luma_height, 1, block_height, 0, 1); + Texture::SwizzleTexture(out_luma, luma_scratch, BytesPerPixel, out_luma_width, out_luma_height, 1, block_height, 0, 1); } Tegra::Memory::GpuGuestMemoryScoped - out_chroma(memory_manager, regs.output_surface.chroma_u.Address(), - out_chroma_swizzle_size, &swizzle_scratch); + out_chroma(memory_manager, 
regs.output_surface.chroma_u.Address(), out_chroma_swizzle_size, &swizzle_scratch); if (block_height == 1) { - SwizzleSurface(out_chroma, out_chroma_stride, chroma_scratch, out_chroma_stride, - out_chroma_height); + SwizzleSurface(out_chroma, out_chroma_stride, chroma_scratch, out_chroma_stride, out_chroma_height); } else { - Texture::SwizzleTexture(out_chroma, chroma_scratch, BytesPerPixel, out_chroma_width, - out_chroma_height, 1, block_height, 0, 1); + Texture::SwizzleTexture(out_chroma, chroma_scratch, BytesPerPixel, out_chroma_width, out_chroma_height, 1, block_height, 0, 1); } } break; case BLK_KIND::PITCH: { @@ -1067,13 +519,9 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) { // afterwards to re-overwrite the luma being too large. luma_scratch.resize_destructive(out_luma_size); chroma_scratch.resize_destructive(out_chroma_size); - Decode(luma_scratch, chroma_scratch); - - memory_manager.WriteBlock(regs.output_surface.luma.Address(), luma_scratch.data(), - out_luma_size); - memory_manager.WriteBlock(regs.output_surface.chroma_u.Address(), chroma_scratch.data(), - out_chroma_size); + memory_manager.WriteBlock(regs.output_surface.luma.Address(), luma_scratch.data(), out_luma_size); + memory_manager.WriteBlock(regs.output_surface.chroma_u.Address(), chroma_scratch.data(), out_chroma_size); } break; default: UNREACHABLE(); @@ -1081,8 +529,7 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) { } } -template -void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config) { +void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config, VideoPixelFormat format) { constexpr u32 BytesPerPixel = 4; auto surface_width{output_surface_config.out_surface_width + 1}; @@ -1097,125 +544,29 @@ void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config) { surface_width = (std::min)(surface_width, out_luma_width); surface_height = (std::min)(surface_height, out_luma_height); - [[maybe_unused]] auto DecodeLinear = [&](std::span out_buffer) { - for (u32 y = 0; y < surface_height; y++) { - const auto src = y * surface_stride; - const auto dst = y * out_luma_stride; - for (u32 x = 0; x < surface_width; x++) { - if constexpr (Format == VideoPixelFormat::A8R8G8B8) { - out_buffer[dst + x * 4 + 0] = static_cast(output_surface[src + x].b >> 2); - out_buffer[dst + x * 4 + 1] = static_cast(output_surface[src + x].g >> 2); - out_buffer[dst + x * 4 + 2] = static_cast(output_surface[src + x].r >> 2); - out_buffer[dst + x * 4 + 3] = static_cast(output_surface[src + x].a >> 2); - } else { - out_buffer[dst + x * 4 + 0] = static_cast(output_surface[src + x].r >> 2); - out_buffer[dst + x * 4 + 1] = static_cast(output_surface[src + x].g >> 2); - out_buffer[dst + x * 4 + 2] = static_cast(output_surface[src + x].b >> 2); - out_buffer[dst + x * 4 + 3] = static_cast(output_surface[src + x].a >> 2); - } - } - } - }; - auto Decode = [&](std::span out_buffer) { -#if defined(ARCHITECTURE_x86_64) - if (!has_sse41) { - DecodeLinear(out_buffer); - return; - } -#endif - -#if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64) - constexpr size_t SseAlignment = 16; - const auto sse_aligned_width = Common::AlignDown(surface_width, SseAlignment); - for (u32 y = 0; y < surface_height; y++) { const auto src = y * surface_stride; const auto dst = y * out_luma_stride; - u32 x = 0; - for (; x < sse_aligned_width; x += SseAlignment) { - // clang-format off - // Prefetch the next 2 cache lines - _mm_prefetch((const char*)&output_surface[src + x + 
16], _MM_HINT_T0); - _mm_prefetch((const char*)&output_surface[src + x + 24], _MM_HINT_T0); - - // Load the pixels, 16-bit channels, 8 bytes per pixel, e.g - // pixel01 = [AA AA BB BB GG GG RR RR AA AA BB BB GG GG RR RR - auto pixel01 = _mm_load_si128((__m128i*)&output_surface[src + x + 0]); - auto pixel23 = _mm_load_si128((__m128i*)&output_surface[src + x + 2]); - auto pixel45 = _mm_load_si128((__m128i*)&output_surface[src + x + 4]); - auto pixel67 = _mm_load_si128((__m128i*)&output_surface[src + x + 6]); - auto pixel89 = _mm_load_si128((__m128i*)&output_surface[src + x + 8]); - auto pixel1011 = _mm_load_si128((__m128i*)&output_surface[src + x + 10]); - auto pixel1213 = _mm_load_si128((__m128i*)&output_surface[src + x + 12]); - auto pixel1415 = _mm_load_si128((__m128i*)&output_surface[src + x + 14]); - - // Right-shift the channels by 16 to un-do the left shit on read and bring the range - // back to 8-bit. - pixel01 = _mm_srli_epi16(pixel01, 2); - pixel23 = _mm_srli_epi16(pixel23, 2); - pixel45 = _mm_srli_epi16(pixel45, 2); - pixel67 = _mm_srli_epi16(pixel67, 2); - pixel89 = _mm_srli_epi16(pixel89, 2); - pixel1011 = _mm_srli_epi16(pixel1011, 2); - pixel1213 = _mm_srli_epi16(pixel1213, 2); - pixel1415 = _mm_srli_epi16(pixel1415, 2); - - // Pack with unsigned saturation 16-bit channels from 2 registers into 8-bit channels in 1 register. - // pixel01 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1] - // pixel23 = [AA4 AA4] [BB4 BB4] [GG4 GG4] [RR4 RR4] [AA3 AA3] [BB3 BB3] [GG3 GG3] [RR3 RR3] - // -> - // pixels0_lo = [AA4] [BB4] [GG4] [RR4] [AA3] [BB3] [GG3] [RR3] [AA2] [BB2] [GG2] [RR2] [AA1] [BB1] [GG1] [RR1] - auto pixels0_lo = _mm_packus_epi16(pixel01, pixel23); - auto pixels0_hi = _mm_packus_epi16(pixel45, pixel67); - auto pixels1_lo = _mm_packus_epi16(pixel89, pixel1011); - auto pixels1_hi = _mm_packus_epi16(pixel1213, pixel1415); - - if constexpr (Format == VideoPixelFormat::A8R8G8B8) { - const auto shuffle = - _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2); - - // Our pixels are ABGR (big-endian) by default, if ARGB is needed, we need to shuffle. 
- // pixels0_lo = [AA4 BB4 GG4 RR4] [AA3 BB3 GG3 RR3] [AA2 BB2 GG2 RR2] [AA1 BB1 GG1 RR1] - // -> - // pixels0_lo = [AA4 RR4 GG4 BB4] [AA3 RR3 GG3 BB3] [AA2 RR2 GG2 BB2] [AA1 RR1 GG1 BB1] - pixels0_lo = _mm_shuffle_epi8(pixels0_lo, shuffle); - pixels0_hi = _mm_shuffle_epi8(pixels0_hi, shuffle); - pixels1_lo = _mm_shuffle_epi8(pixels1_lo, shuffle); - pixels1_hi = _mm_shuffle_epi8(pixels1_hi, shuffle); - } - - // Store the pixels - _mm_store_si128((__m128i*)&out_buffer[dst + x * 4 + 0], pixels0_lo); - _mm_store_si128((__m128i*)&out_buffer[dst + x * 4 + 16], pixels0_hi); - _mm_store_si128((__m128i*)&out_buffer[dst + x * 4 + 32], pixels1_lo); - _mm_store_si128((__m128i*)&out_buffer[dst + x * 4 + 48], pixels1_hi); - - // clang-format on - } - - for (; x < surface_width; x++) { - if constexpr (Format == VideoPixelFormat::A8R8G8B8) { - out_buffer[dst + x * 4 + 0] = static_cast(output_surface[src + x].b >> 2); - out_buffer[dst + x * 4 + 1] = static_cast(output_surface[src + x].g >> 2); - out_buffer[dst + x * 4 + 2] = static_cast(output_surface[src + x].r >> 2); - out_buffer[dst + x * 4 + 3] = static_cast(output_surface[src + x].a >> 2); + for (u32 x = 0; x < surface_width; x++) { + if(format == VideoPixelFormat::A8R8G8B8) { + out_buffer[dst + x * 4 + 0] = u8(output_surface[src + x].b >> 2); + out_buffer[dst + x * 4 + 1] = u8(output_surface[src + x].g >> 2); + out_buffer[dst + x * 4 + 2] = u8(output_surface[src + x].r >> 2); + out_buffer[dst + x * 4 + 3] = u8(output_surface[src + x].a >> 2); } else { - out_buffer[dst + x * 4 + 0] = static_cast(output_surface[src + x].r >> 2); - out_buffer[dst + x * 4 + 1] = static_cast(output_surface[src + x].g >> 2); - out_buffer[dst + x * 4 + 2] = static_cast(output_surface[src + x].b >> 2); - out_buffer[dst + x * 4 + 3] = static_cast(output_surface[src + x].a >> 2); + out_buffer[dst + x * 4 + 0] = u8(output_surface[src + x].r >> 2); + out_buffer[dst + x * 4 + 1] = u8(output_surface[src + x].g >> 2); + out_buffer[dst + x * 4 + 2] = u8(output_surface[src + x].b >> 2); + out_buffer[dst + x * 4 + 3] = u8(output_surface[src + x].a >> 2); } } } -#else - DecodeLinear(out_buffer); -#endif }; switch (output_surface_config.out_block_kind) { case BLK_KIND::GENERIC_16Bx2: { - const u32 block_height = static_cast(output_surface_config.out_block_height); + const u32 block_height = u32(output_surface_config.out_block_height); const auto out_swizzle_size = Texture::CalculateSize(true, BytesPerPixel, out_luma_width, out_luma_height, 1, block_height, 0); @@ -1236,22 +587,20 @@ void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config) { memory_manager, regs.output_surface.luma.Address(), out_swizzle_size, &swizzle_scratch); if (block_height == 1) { - SwizzleSurface(out_luma, out_luma_stride, luma_scratch, out_luma_stride, - out_luma_height); + SwizzleSurface(out_luma, out_luma_stride, luma_scratch, out_luma_stride, out_luma_height); } else { - Texture::SwizzleTexture(out_luma, luma_scratch, BytesPerPixel, out_luma_width, - out_luma_height, 1, block_height, 0, 1); + Texture::SwizzleTexture(out_luma, luma_scratch, BytesPerPixel, out_luma_width, out_luma_height, 1, block_height, 0, 1); } } break; case BLK_KIND::PITCH: { LOG_TRACE(HW_GPU, - "Writing ABGR pitch frame\n" - "\tinput surface {}x{} stride {} size {:#X}" - "\toutput surface {}x{} stride {} size {:#X}", - surface_width, surface_height, surface_stride, - surface_stride * surface_height * BytesPerPixel, out_luma_width, out_luma_height, - out_luma_stride, out_luma_size); + "Writing ABGR pitch frame\n" + "\tinput 
surface {}x{} stride {} size {:#X}" + "\toutput surface {}x{} stride {} size {:#X}", + surface_width, surface_height, surface_stride, + surface_stride * surface_height * BytesPerPixel, out_luma_width, out_luma_height, + out_luma_stride, out_luma_size); luma_scratch.resize_destructive(out_luma_size); diff --git a/src/video_core/host1x/vic.h b/src/video_core/host1x/vic.h index e7600941ad..5ea13285a4 100644 --- a/src/video_core/host1x/vic.h +++ b/src/video_core/host1x/vic.h @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + // SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later @@ -610,24 +613,12 @@ public: private: void Execute(); - void Blend(const ConfigStruct& config, const SlotStruct& slot); - - template - void ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span offsets, - std::shared_ptr frame); - template - void ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span offsets, - std::shared_ptr frame); - - template - void ReadY8__V8U8_N420(const SlotStruct& slot, std::span offsets, - std::shared_ptr frame); - + void ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span offsets, std::shared_ptr frame, bool planar, bool interlaced); + void ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span offsets, std::shared_ptr frame, bool planar, bool top_field); + void ReadY8__V8U8_N420(const SlotStruct& slot, std::span offsets, std::shared_ptr frame, bool planar); void WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config); - - template - void WriteABGR(const OutputSurfaceConfig& output_surface_config); + void WriteABGR(const OutputSurfaceConfig& output_surface_config, VideoPixelFormat format); s32 id; s32 nvdec_id{-1}; @@ -636,8 +627,6 @@ private: VicRegisters regs{}; FrameQueue& frame_queue; - const bool has_sse41{false}; - Common::ScratchBuffer output_surface; Common::ScratchBuffer slot_surface; Common::ScratchBuffer luma_scratch;