[video_core/host1x/vic] Fix a regression in Big Brain Academy on Linux (#3412)

Patch from Lizz. Fixes out-of-bounds reads/writes in the vector implementation of the VIC decoder.
Authored-by: lizzie <lizzie@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3412
Reviewed-by: MaranBr <maranbr@eden-emu.dev>
Co-authored-by: John <john@eden-emu.dev>
Co-committed-by: John <john@eden-emu.dev>
1 week ago · 3b81d2e333
1 changed file with 6 additions and 7 deletions
--- a/src/video_core/host1x/vic.cpp
+++ b/src/video_core/host1x/vic.cpp
@@ -188,7 +188,7 @@ void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span<const P
        out_luma_height, out_luma_stride);

    slot_surface.resize_destructive(out_luma_width * out_luma_height);
-    if (COMPILED_HAS_SSE41 && HasSSE41()) {
+    if (COMPILED_HAS_SSE41 && HasSSE41() && in_luma_width % 16 == 0) {
 #if COMPILED_HAS_SSE41
        auto const alpha_linear = u16(slot.config.planar_alpha.Value());
        auto const alpha = _mm_slli_epi64(_mm_set1_epi64x(s64(slot.config.planar_alpha.Value())), 48);
@@ -491,7 +491,7 @@ void Vic::Blend(const ConfigStruct& config, const SlotStruct& slot, VideoPixelFo
    // TODO Alpha blending. No games I've seen use more than a single surface or supply an alpha
    // below max, so it's ignored for now.
    if (slot.color_matrix.matrix_enable) {
-        if (COMPILED_HAS_SSE41 && HasSSE41()) {
+        if (COMPILED_HAS_SSE41 && HasSSE41() && source_left % 8 == 0 && source_right % 8 == 0) {
            // MSVC doesn't define __SSE4_1__
 #if COMPILED_HAS_SSE41
            // Fill the columns, e.g
@@ -707,7 +707,7 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) n
    surface_height = (std::min)(surface_height, out_luma_height);

    auto Decode = [&](u8* out_luma, u8* out_chroma) {
-        if (COMPILED_HAS_SSE41 && HasSSE41()) {
+        if (COMPILED_HAS_SSE41 && HasSSE41() && surface_width % 16 == 0) {
 #if COMPILED_HAS_SSE41
            // luma_mask   = [00 00] [00 00] [00 00] [FF FF] [00 00] [00 00] [00 00] [FF FF]
            auto const luma_mask = _mm_set_epi16(0, 0, 0, -1, 0, 0, 0, -1);
@@ -935,15 +935,14 @@ void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config, VideoPixel
    surface_height = (std::min)(surface_height, out_luma_height);

    auto Decode = [&](u8* out, Pixel const* inp) {
-        if (COMPILED_HAS_SSE41 && HasSSE41()) {
+        if (COMPILED_HAS_SSE41 && HasSSE41() && surface_width % 16 == 0) {
 #if COMPILED_HAS_SSE41
-            size_t const SSE_ALIGNMENT = 16;
-            auto const sse_aligned_width = Common::AlignDown(surface_width, SSE_ALIGNMENT);
+            auto const sse_aligned_width = Common::AlignDown(surface_width, 16);
            for (u32 y = 0; y < surface_height; y++) {
                auto const src = y * surface_stride;
                auto const dst = y * out_luma_stride;
                u32 x = 0;
-                for (; x < sse_aligned_width; x += SSE_ALIGNMENT) {
+                for (; x < sse_aligned_width; x += 16) {
                    // Prefetch the next 2 cache lines
                    _mm_prefetch((const char*)&inp[src + x + 16], _MM_HINT_T0);
                    _mm_prefetch((const char*)&inp[src + x + 24], _MM_HINT_T0);