|
|
@@ -8,6 +8,10 @@
|
|
#include <tuple>
|
|
|
#include <tuple>
|
|
|
#include <stdint.h>
|
|
|
#include <stdint.h>
|
|
|
|
|
|
|
|
|
|
|
|
#if defined(ARCHITECTURE_x86_64)
|
|
|
|
|
|
#include <immintrin.h>
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
extern "C" { |
|
|
extern "C" { |
|
|
#if defined(__GNUC__) || defined(__clang__)
|
|
|
#if defined(__GNUC__) || defined(__clang__)
|
|
|
#pragma GCC diagnostic push
|
|
|
#pragma GCC diagnostic push
|
|
|
@@ -38,9 +42,27 @@ extern "C" {
|
|
#include "common/x64/cpu_detect.h"
|
|
|
#include "common/x64/cpu_detect.h"
|
|
|
#endif
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
#if defined(ARCHITECTURE_x86_64) \
    && ((defined(_MSC_VER) && defined(__AVX__)) \
     || (defined(__GNUC__) && defined(__SSE4_1__)) \
     || (defined(__clang__) && defined(__SSE4_1__)))
|
|
|
|
|
#define COMPILED_HAS_SSE41 1
|
|
|
|
|
|
#else
|
|
|
|
|
|
#define COMPILED_HAS_SSE41 0
|
|
|
|
|
|
#endif
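// MSVC does not define __SSE4_1__, so __AVX__ (/arch:AVX, which implies SSE4.1 support)
// is used as the stand-in compile-time check on that compiler.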
|
|
|
|
|
|
|
|
|
namespace Tegra::Host1x { |
|
|
namespace Tegra::Host1x { |
|
|
namespace { |
|
|
namespace { |
|
|
|
|
|
|
|
|
|
|
|
static bool HasSSE41() { |
|
|
|
|
|
#if defined(ARCHITECTURE_x86_64)
|
|
|
|
|
|
static bool has_sse41 = Common::GetCPUCaps().sse4_1; |
|
|
|
|
|
return has_sse41; |
|
|
|
|
|
#else
|
|
|
|
|
|
return false; |
|
|
|
|
|
#endif
|
|
|
|
|
|
} |
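// Sketch of the dispatch pattern the code below relies on: COMPILED_HAS_SSE41 keeps the
// intrinsics out of builds that cannot target SSE4.1, while HasSSE41() picks the path at
// runtime on the machine actually executing the code.
//     if (COMPILED_HAS_SSE41 && HasSSE41()) {
//     #if COMPILED_HAS_SSE41
//         /* SSE4.1 intrinsics */
//     #endif
//     } else {
//         /* scalar fallback */
//     }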
|
|
|
|
|
|
|
|
void SwizzleSurface(std::span<u8> output, u32 out_stride, std::span<const u8> input, u32 in_stride, u32 height) noexcept { |
|
|
void SwizzleSurface(std::span<u8> output, u32 out_stride, std::span<const u8> input, u32 in_stride, u32 height) noexcept { |
|
|
//// Taken from https://github.com/averne/FFmpeg/blob/nvtegra/libavutil/hwcontext_nvtegra.c#L949
|
|
|
//// Taken from https://github.com/averne/FFmpeg/blob/nvtegra/libavutil/hwcontext_nvtegra.c#L949
|
|
|
/// Can only handle block height == 1.
|
|
|
/// Can only handle block height == 1.
|
|
|
@@ -126,17 +148,18 @@ void Vic::Execute() noexcept {
|
|
std::fill(output_surface.begin(), output_surface.end(), Pixel{}); |
|
|
std::fill(output_surface.begin(), output_surface.end(), Pixel{}); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
switch (config.output_surface_config.out_pixel_format) { |
|
|
|
|
|
|
|
|
auto const format = config.output_surface_config.out_pixel_format; |
|
|
|
|
|
switch (format) { |
|
|
case VideoPixelFormat::A8B8G8R8: |
|
|
case VideoPixelFormat::A8B8G8R8: |
|
|
case VideoPixelFormat::X8B8G8R8: |
|
|
case VideoPixelFormat::X8B8G8R8: |
|
|
case VideoPixelFormat::A8R8G8B8: |
|
|
case VideoPixelFormat::A8R8G8B8: |
|
|
WriteABGR(config.output_surface_config); |
|
|
|
|
|
|
|
|
WriteABGR(config.output_surface_config, format); |
|
|
break; |
|
|
break; |
|
|
case VideoPixelFormat::Y8__V8U8_N420: |
|
|
case VideoPixelFormat::Y8__V8U8_N420: |
|
|
WriteY8__V8U8_N420(config.output_surface_config); |
|
|
WriteY8__V8U8_N420(config.output_surface_config); |
|
|
break; |
|
|
break; |
|
|
default: |
|
|
default: |
|
|
UNIMPLEMENTED_MSG("Unknown video pixel format {}", config.output_surface_config.out_pixel_format.Value()); |
|
|
|
|
|
|
|
|
UNIMPLEMENTED_MSG("Unknown video pixel format {}", format.Value()); |
|
|
break; |
|
|
break; |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
@@ -165,22 +188,173 @@ void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span<const P
|
|
out_luma_height, out_luma_stride); |
|
|
out_luma_height, out_luma_stride); |
|
|
|
|
|
|
|
|
slot_surface.resize_destructive(out_luma_width * out_luma_height); |
|
|
slot_surface.resize_destructive(out_luma_width * out_luma_height); |
|
|
auto const alpha = u16(slot.config.planar_alpha.Value()); |
|
|
|
|
|
for (size_t y = 0; y < size_t(in_luma_height); y++) { |
|
|
|
|
|
auto const src_luma = y * in_luma_stride; |
|
|
|
|
|
auto const src_chroma = (y / 2) * in_chroma_stride; |
|
|
|
|
|
auto const dst = y * out_luma_stride; |
|
|
|
|
|
for (size_t x = 0; x < size_t(in_luma_width); x++) { |
|
|
|
|
|
slot_surface[dst + x].r = u16(luma_buffer[src_luma + x] << 2); |
|
|
|
|
|
// Chroma samples are duplicated horizontally and vertically.
|
|
|
|
|
|
if (planar) {
|
|
|
|
|
slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + x / 2] << 2); |
|
|
|
|
|
slot_surface[dst + x].b = u16(chroma_v_buffer[src_chroma + x / 2] << 2); |
|
|
|
|
|
} else { |
|
|
|
|
|
slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2); |
|
|
|
|
|
slot_surface[dst + x].b = u16(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2); |
|
|
|
|
|
|
|
|
if (COMPILED_HAS_SSE41 && HasSSE41()) { |
|
|
|
|
|
#if COMPILED_HAS_SSE41
|
|
|
|
|
|
auto const alpha_linear = u16(slot.config.planar_alpha.Value()); |
|
|
|
|
|
auto const alpha = _mm_slli_epi64(_mm_set1_epi64x(s64(slot.config.planar_alpha.Value())), 48); |
|
|
|
|
|
|
|
|
|
|
|
auto const shuffle_mask = _mm_set_epi8(13, 15, 14, 12, 9, 11, 10, 8, 5, 7, 6, 4, 1, 3, 2, 0); |
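// Reading the _mm_set_epi8 arguments from last to first gives destination bytes 0..15;
// each 4-byte group follows the pattern (0, 2, 3, 1): keep the luma byte, then U, then V,
// then the zeroed high byte of the widened luma, which later serves as the alpha slot.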
|
|
|
|
|
auto const sse_aligned_width = Common::AlignDown(in_luma_width, 16); |
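// Each SSE iteration below consumes 16 luma samples (and 8 chroma pairs); the scalar
// loop that follows handles any remaining columns when the width isn't a multiple of 16.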
|
|
|
|
|
|
|
|
|
|
|
for (s32 y = 0; y < in_luma_height; y++) { |
|
|
|
|
|
auto const src_luma{y * in_luma_stride}; |
|
|
|
|
|
auto const src_chroma{(y / 2) * in_chroma_stride}; |
|
|
|
|
|
auto const dst{y * out_luma_stride}; |
|
|
|
|
|
s32 x = 0; |
|
|
|
|
|
for (; x < sse_aligned_width; x += 16) { |
|
|
|
|
|
// clang-format off
|
|
|
|
|
|
// Prefetch next iteration's memory
|
|
|
|
|
|
_mm_prefetch((const char*)&luma_buffer[src_luma + x + 16], _MM_HINT_T0); |
|
|
|
|
|
|
|
|
|
|
|
// Load 8 bytes * 2 of 8-bit luma samples
|
|
|
|
|
|
// luma0 = 00 00 00 00 00 00 00 00 LL LL LL LL LL LL LL LL
|
|
|
|
|
|
auto luma0 = _mm_loadl_epi64((__m128i*)&luma_buffer[src_luma + x + 0]); |
|
|
|
|
|
auto luma1 = _mm_loadl_epi64((__m128i*)&luma_buffer[src_luma + x + 8]); |
|
|
|
|
|
|
|
|
|
|
|
__m128i chroma; |
|
|
|
|
|
|
|
|
|
|
|
if (planar) { |
|
|
|
|
|
_mm_prefetch((const char*)&chroma_u_buffer[src_chroma + x / 2 + 8], _MM_HINT_T0); |
|
|
|
|
|
_mm_prefetch((const char*)&chroma_v_buffer[src_chroma + x / 2 + 8], _MM_HINT_T0); |
|
|
|
|
|
|
|
|
|
|
|
// If Chroma is planar, we have separate U and V planes, load 8 bytes of each
|
|
|
|
|
|
// chroma_u0 = 00 00 00 00 00 00 00 00 UU UU UU UU UU UU UU UU
|
|
|
|
|
|
// chroma_v0 = 00 00 00 00 00 00 00 00 VV VV VV VV VV VV VV VV
|
|
|
|
|
|
auto chroma_u0 = _mm_loadl_epi64((__m128i*)&chroma_u_buffer[src_chroma + x / 2]); |
|
|
|
|
|
auto chroma_v0 = _mm_loadl_epi64((__m128i*)&chroma_v_buffer[src_chroma + x / 2]); |
|
|
|
|
|
|
|
|
|
|
|
// Interleave the 8 bytes of U and V into a single 16 byte reg
|
|
|
|
|
|
// chroma = VV UU VV UU VV UU VV UU VV UU VV UU VV UU VV UU
|
|
|
|
|
|
chroma = _mm_unpacklo_epi8(chroma_u0, chroma_v0); |
|
|
|
|
|
} else { |
|
|
|
|
|
_mm_prefetch((const char*)&chroma_u_buffer[src_chroma + x / 2 + 8], _MM_HINT_T0); |
|
|
|
|
|
|
|
|
|
|
|
// Chroma is already interleaved in semiplanar format, just load 16 bytes
|
|
|
|
|
|
// chroma = VV UU VV UU VV UU VV UU VV UU VV UU VV UU VV UU
|
|
|
|
|
|
chroma = _mm_load_si128((__m128i*)&chroma_u_buffer[src_chroma + x]); |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Convert the low 8 bytes of 8-bit luma into 16-bit luma
|
|
|
|
|
|
// luma0 = [00] [00] [00] [00] [00] [00] [00] [00] [LL] [LL] [LL] [LL] [LL] [LL] [LL] [LL]
|
|
|
|
|
|
// ->
|
|
|
|
|
|
// luma0 = [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] [00 LL]
|
|
|
|
|
|
luma0 = _mm_cvtepu8_epi16(luma0); |
|
|
|
|
|
luma1 = _mm_cvtepu8_epi16(luma1); |
|
|
|
|
|
|
|
|
|
|
|
// Treat the 8 bytes of 8-bit chroma as 16-bit channels; this allows us to take both the
|
|
|
|
|
|
// U and V together as one element. Using chroma twice here duplicates the values, as we
|
|
|
|
|
|
// take element 0 from chroma, and then element 0 from chroma again, etc. We need to
|
|
|
|
|
|
// duplicate chroma horizontally as chroma is half the width of luma.
|
|
|
|
|
|
// chroma = [VV8 UU8] [VV7 UU7] [VV6 UU6] [VV5 UU5] [VV4 UU4] [VV3 UU3] [VV2 UU2] [VV1 UU1]
|
|
|
|
|
|
// ->
|
|
|
|
|
|
// chroma00 = [VV4 UU4] [VV4 UU4] [VV3 UU3] [VV3 UU3] [VV2 UU2] [VV2 UU2] [VV1 UU1] [VV1 UU1]
|
|
|
|
|
|
// chroma01 = [VV8 UU8] [VV8 UU8] [VV7 UU7] [VV7 UU7] [VV6 UU6] [VV6 UU6] [VV5 UU5] [VV5 UU5]
|
|
|
|
|
|
auto chroma00 = _mm_unpacklo_epi16(chroma, chroma); |
|
|
|
|
|
auto chroma01 = _mm_unpackhi_epi16(chroma, chroma); |
|
|
|
|
|
|
|
|
|
|
|
// Interleave the 16-bit luma and chroma.
|
|
|
|
|
|
// luma0 = [008 LL8] [007 LL7] [006 LL6] [005 LL5] [004 LL4] [003 LL3] [002 LL2] [001 LL1]
|
|
|
|
|
|
// chroma00 = [VV8 UU8] [VV7 UU7] [VV6 UU6] [VV5 UU5] [VV4 UU4] [VV3 UU3] [VV2 UU2] [VV1 UU1]
|
|
|
|
|
|
// ->
|
|
|
|
|
|
// yuv0 = [VV4 UU4 004 LL4] [VV3 UU3 003 LL3] [VV2 UU2 002 LL2] [VV1 UU1 001 LL1]
|
|
|
|
|
|
// yuv1 = [VV8 UU8 008 LL8] [VV7 UU7 007 LL7] [VV6 UU6 006 LL6] [VV5 UU5 005 LL5]
|
|
|
|
|
|
auto yuv0 = _mm_unpacklo_epi16(luma0, chroma00); |
|
|
|
|
|
auto yuv1 = _mm_unpackhi_epi16(luma0, chroma00); |
|
|
|
|
|
auto yuv2 = _mm_unpacklo_epi16(luma1, chroma01); |
|
|
|
|
|
auto yuv3 = _mm_unpackhi_epi16(luma1, chroma01); |
|
|
|
|
|
|
|
|
|
|
|
// Shuffle the luma/chroma into the channel ordering we actually want. The high byte of
|
|
|
|
|
|
// the luma which is now a constant 0 after converting 8-bit -> 16-bit is used as the
|
|
|
|
|
|
// alpha. Luma -> R, U -> G, V -> B, 0 -> A
|
|
|
|
|
|
// yuv0 = [VV4 UU4 004 LL4] [VV3 UU3 003 LL3] [VV2 UU2 002 LL2] [VV1 UU1 001 LL1]
|
|
|
|
|
|
// ->
|
|
|
|
|
|
// yuv0 = [AA4 VV4 UU4 LL4] [AA3 VV3 UU3 LL3] [AA2 VV2 UU2 LL2] [AA1 VV1 UU1 LL1]
|
|
|
|
|
|
yuv0 = _mm_shuffle_epi8(yuv0, shuffle_mask); |
|
|
|
|
|
yuv1 = _mm_shuffle_epi8(yuv1, shuffle_mask); |
|
|
|
|
|
yuv2 = _mm_shuffle_epi8(yuv2, shuffle_mask); |
|
|
|
|
|
yuv3 = _mm_shuffle_epi8(yuv3, shuffle_mask); |
|
|
|
|
|
|
|
|
|
|
|
// Extend the 8-bit channels we have into 16-bits, as that's the target surface format.
|
|
|
|
|
|
// Since this turns just the low 8 bytes into 16 bytes, the second of
|
|
|
|
|
|
// each operation here shifts the register right by 8 bytes to get the high pixels.
|
|
|
|
|
|
// yuv0 = [AA4] [VV4] [UU4] [LL4] [AA3] [VV3] [UU3] [LL3] [AA2] [VV2] [UU2] [LL2] [AA1] [VV1] [UU1] [LL1]
|
|
|
|
|
|
// ->
|
|
|
|
|
|
// yuv01 = [002 AA2] [002 VV2] [002 UU2] [002 LL2] [001 AA1] [001 VV1] [001 UU1] [001 LL1]
|
|
|
|
|
|
// yuv23 = [004 AA4] [004 VV4] [004 UU4] [004 LL4] [003 AA3] [003 VV3] [003 UU3] [003 LL3]
|
|
|
|
|
|
auto yuv01 = _mm_cvtepu8_epi16(yuv0); |
|
|
|
|
|
auto yuv23 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv0, 8)); |
|
|
|
|
|
auto yuv45 = _mm_cvtepu8_epi16(yuv1); |
|
|
|
|
|
auto yuv67 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv1, 8)); |
|
|
|
|
|
auto yuv89 = _mm_cvtepu8_epi16(yuv2); |
|
|
|
|
|
auto yuv1011 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv2, 8)); |
|
|
|
|
|
auto yuv1213 = _mm_cvtepu8_epi16(yuv3); |
|
|
|
|
|
auto yuv1415 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv3, 8)); |
|
|
|
|
|
|
|
|
|
|
|
// Left-shift all 16-bit channels by 2 to get us into a 10-bit format instead
|
|
|
|
|
|
// of 8-bit, which is the range the planar alpha and other blending values use.
|
|
|
|
|
|
yuv01 = _mm_slli_epi16(yuv01, 2); |
|
|
|
|
|
yuv23 = _mm_slli_epi16(yuv23, 2); |
|
|
|
|
|
yuv45 = _mm_slli_epi16(yuv45, 2); |
|
|
|
|
|
yuv67 = _mm_slli_epi16(yuv67, 2); |
|
|
|
|
|
yuv89 = _mm_slli_epi16(yuv89, 2); |
|
|
|
|
|
yuv1011 = _mm_slli_epi16(yuv1011, 2); |
|
|
|
|
|
yuv1213 = _mm_slli_epi16(yuv1213, 2); |
|
|
|
|
|
yuv1415 = _mm_slli_epi16(yuv1415, 2); |
|
|
|
|
|
|
|
|
|
|
|
// OR in the planar alpha, this has already been duplicated and shifted into position,
|
|
|
|
|
|
// and just fills in the AA channels with the actual alpha value.
|
|
|
|
|
|
yuv01 = _mm_or_si128(yuv01, alpha); |
|
|
|
|
|
yuv23 = _mm_or_si128(yuv23, alpha); |
|
|
|
|
|
yuv45 = _mm_or_si128(yuv45, alpha); |
|
|
|
|
|
yuv67 = _mm_or_si128(yuv67, alpha); |
|
|
|
|
|
yuv89 = _mm_or_si128(yuv89, alpha); |
|
|
|
|
|
yuv1011 = _mm_or_si128(yuv1011, alpha); |
|
|
|
|
|
yuv1213 = _mm_or_si128(yuv1213, alpha); |
|
|
|
|
|
yuv1415 = _mm_or_si128(yuv1415, alpha); |
|
|
|
|
|
|
|
|
|
|
|
// Store out the pixels. One pixel is now 8 bytes, so each store is 2 pixels.
|
|
|
|
|
|
// [AA AA] [VV VV] [UU UU] [LL LL] [AA AA] [VV VV] [UU UU] [LL LL]
|
|
|
|
|
|
_mm_store_si128((__m128i*)&slot_surface[dst + x + 0], yuv01); |
|
|
|
|
|
_mm_store_si128((__m128i*)&slot_surface[dst + x + 2], yuv23); |
|
|
|
|
|
_mm_store_si128((__m128i*)&slot_surface[dst + x + 4], yuv45); |
|
|
|
|
|
_mm_store_si128((__m128i*)&slot_surface[dst + x + 6], yuv67); |
|
|
|
|
|
_mm_store_si128((__m128i*)&slot_surface[dst + x + 8], yuv89); |
|
|
|
|
|
_mm_store_si128((__m128i*)&slot_surface[dst + x + 10], yuv1011); |
|
|
|
|
|
_mm_store_si128((__m128i*)&slot_surface[dst + x + 12], yuv1213); |
|
|
|
|
|
_mm_store_si128((__m128i*)&slot_surface[dst + x + 14], yuv1415); |
|
|
|
|
|
} |
|
|
|
|
|
for (; x < in_luma_width; x++) { |
|
|
|
|
|
slot_surface[dst + x].r = u16(luma_buffer[src_luma + x] << 2); |
|
|
|
|
|
// Chroma samples are duplicated horizontally and vertically.
|
|
|
|
|
|
if (planar) { |
|
|
|
|
|
slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + x / 2] << 2); |
|
|
|
|
|
slot_surface[dst + x].b = u16(chroma_v_buffer[src_chroma + x / 2] << 2); |
|
|
|
|
|
} else { |
|
|
|
|
|
slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2); |
|
|
|
|
|
slot_surface[dst + x].b = u16(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2); |
|
|
|
|
|
} |
|
|
|
|
|
slot_surface[dst + x].a = alpha_linear; |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
#endif
|
|
|
|
|
|
} else { |
|
|
|
|
|
auto const alpha = u16(slot.config.planar_alpha.Value()); |
|
|
|
|
|
for (size_t y = 0; y < size_t(in_luma_height); y++) { |
|
|
|
|
|
auto const src_luma = y * in_luma_stride; |
|
|
|
|
|
auto const src_chroma = (y / 2) * in_chroma_stride; |
|
|
|
|
|
auto const dst = y * out_luma_stride; |
|
|
|
|
|
for (size_t x = 0; x < size_t(in_luma_width); x++) { |
|
|
|
|
|
slot_surface[dst + x].r = u16(luma_buffer[src_luma + x] << 2); |
|
|
|
|
|
// Chroma samples are duplicated horizontally and vertically.
|
|
|
|
|
|
if (planar) { |
|
|
|
|
|
slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + x / 2] << 2); |
|
|
|
|
|
slot_surface[dst + x].b = u16(chroma_v_buffer[src_chroma + x / 2] << 2); |
|
|
|
|
|
} else { |
|
|
|
|
|
slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2); |
|
|
|
|
|
slot_surface[dst + x].b = u16(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2); |
|
|
|
|
|
} |
|
|
|
|
|
slot_surface[dst + x].a = alpha; |
|
|
} |
|
|
} |
|
|
slot_surface[dst + x].a = alpha; |
|
|
|
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
@@ -317,48 +491,182 @@ void Vic::Blend(const ConfigStruct& config, const SlotStruct& slot, VideoPixelFo
|
|
// TODO Alpha blending. No games I've seen use more than a single surface or supply an alpha
|
|
|
// TODO Alpha blending. No games I've seen use more than a single surface or supply an alpha
|
|
|
// below max, so it's ignored for now.
|
|
|
// below max, so it's ignored for now.
|
|
|
if (slot.color_matrix.matrix_enable) { |
|
|
if (slot.color_matrix.matrix_enable) { |
|
|
// clang-format off
|
|
|
|
|
|
// Colour conversion is enabled, this is a 3x4 * 4x1 matrix multiplication, resulting in a 3x1 matrix.
|
|
|
|
|
|
// | r0c0 r0c1 r0c2 r0c3 | | R | | R |
|
|
|
|
|
|
// | r1c0 r1c1 r1c2 r1c3 | * | G | = | G |
|
|
|
|
|
|
// | r2c0 r2c1 r2c2 r2c3 | | B | | B |
|
|
|
|
|
|
// | 1 |
|
|
|
|
|
|
auto const shift = s32(slot.color_matrix.matrix_r_shift.Value()); |
|
|
|
|
|
|
|
|
|
|
|
struct AliasedMatrixType { u64 m[4]; }; |
|
|
|
|
|
static_assert(sizeof(AliasedMatrixType) == sizeof(slot.color_matrix)); |
|
|
|
|
|
u64 const mat_mask = (1 << 20) - 1; |
|
|
|
|
|
auto const* amt = reinterpret_cast<AliasedMatrixType const*>(&slot.color_matrix); |
|
|
|
|
|
|
|
|
|
|
|
constexpr s32 shifts[4] = { 0, 20, 40, 60 }; |
|
|
|
|
|
s32 mr[4][4]; |
|
|
|
|
|
for (u32 j = 0; j < 3; ++j) |
|
|
|
|
|
for (u32 i = 0; i < 4; ++i) |
|
|
|
|
|
mr[j][i] = s32(s64(((amt->m[i] >> shifts[j]) & mat_mask) << (64 - 20)) >> (64 - 20)); |
|
|
|
|
|
|
|
|
|
|
|
auto const clamp_min = s32(slot.config.soft_clamp_low.Value()); |
|
|
|
|
|
auto const clamp_max = s32(slot.config.soft_clamp_high.Value()); |
|
|
|
|
|
for (u32 y = 0; y < work_height; ++y) { |
|
|
|
|
|
auto const src = (y + source_top) * in_surface_width + source_left; |
|
|
|
|
|
auto const dst = (y + source_top) * out_surface_width + rect_left; |
|
|
|
|
|
for (u32 x = 0; x < work_width; ++x) { |
|
|
|
|
|
auto const& in_pixel = slot_surface[src + x]; |
|
|
|
|
|
auto& out_pixel = output_surface[dst + x]; |
|
|
|
|
|
s32 const mul_values[4] = { |
|
|
|
|
|
in_pixel.r * mr[0][0] + in_pixel.g * mr[1][1] + in_pixel.b * mr[0][2], |
|
|
|
|
|
in_pixel.r * mr[1][0] + in_pixel.g * mr[1][1] + in_pixel.b * mr[1][2], |
|
|
|
|
|
in_pixel.r * mr[2][0] + in_pixel.g * mr[2][1] + in_pixel.b * mr[2][2], |
|
|
|
|
|
s32(in_pixel.a) |
|
|
|
|
|
}; |
|
|
|
|
|
s32 const mul_clamp[4] = { |
|
|
|
|
|
std::clamp(((mul_values[0] >> shift) + mr[0][3]) >> 8, clamp_min, clamp_max), |
|
|
|
|
|
std::clamp(((mul_values[1] >> shift) + mr[1][3]) >> 8, clamp_min, clamp_max), |
|
|
|
|
|
std::clamp(((mul_values[2] >> shift) + mr[2][3]) >> 8, clamp_min, clamp_max), |
|
|
|
|
|
std::clamp(mul_values[3], clamp_min, clamp_max) |
|
|
|
|
|
}; |
|
|
|
|
|
out_pixel = format == VideoPixelFormat::A8R8G8B8 |
|
|
|
|
|
? Pixel(u16(mul_clamp[2]), u16(mul_clamp[1]), u16(mul_clamp[0]), u16(mul_clamp[3])) |
|
|
|
|
|
: Pixel(u16(mul_clamp[0]), u16(mul_clamp[1]), u16(mul_clamp[2]), u16(mul_clamp[3])); |
|
|
|
|
|
|
|
|
if (COMPILED_HAS_SSE41 && HasSSE41()) { |
|
|
|
|
|
// MSVC doesn't define __SSE4_1__
|
|
|
|
|
|
#if COMPILED_HAS_SSE41
|
|
|
|
|
|
// Fill the columns, e.g
|
|
|
|
|
|
// c0 = [00 00 00 00] [r2c0 r2c0 r2c0 r2c0] [r1c0 r1c0 r1c0 r1c0] [r0c0 r0c0 r0c0 r0c0]
|
|
|
|
|
|
auto const c0 = _mm_set_epi32(0, s32(slot.color_matrix.matrix_coeff20.Value()), s32(slot.color_matrix.matrix_coeff10.Value()), s32(slot.color_matrix.matrix_coeff00.Value())); |
|
|
|
|
|
auto const c1 = _mm_set_epi32(0, s32(slot.color_matrix.matrix_coeff21.Value()), s32(slot.color_matrix.matrix_coeff11.Value()), s32(slot.color_matrix.matrix_coeff01.Value())); |
|
|
|
|
|
auto const c2 = _mm_set_epi32(0, s32(slot.color_matrix.matrix_coeff22.Value()), s32(slot.color_matrix.matrix_coeff12.Value()), s32(slot.color_matrix.matrix_coeff02.Value())); |
|
|
|
|
|
auto const c3 = _mm_set_epi32(0, s32(slot.color_matrix.matrix_coeff23.Value()), s32(slot.color_matrix.matrix_coeff13.Value()), s32(slot.color_matrix.matrix_coeff03.Value())); |
|
|
|
|
|
|
|
|
|
|
|
// Set the matrix right-shift as a single element.
|
|
|
|
|
|
auto const shift = _mm_set_epi32(0, 0, 0, s32(slot.color_matrix.matrix_r_shift.Value())); |
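// _mm_sra_epi32 takes its shift count from the low 64 bits of this register, so a
// single element is enough here.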
|
|
|
|
|
|
|
|
|
|
|
// Set every 16-bit value to the soft clamp values for clamping every 16-bit channel.
|
|
|
|
|
|
auto const clamp_min = _mm_set1_epi16(u16(slot.config.soft_clamp_low.Value())); |
|
|
|
|
|
auto const clamp_max = _mm_set1_epi16(u16(slot.config.soft_clamp_high.Value())); |
|
|
|
|
|
|
|
|
|
|
|
// clang-format off
|
|
|
|
|
|
|
|
|
|
|
|
auto MatMul = [](__m128i& p, const __m128i& col0, const __m128i& col1, const __m128i& col2, const __m128i& col3, const __m128i& trm_shift) -> __m128i { |
|
|
|
|
|
// Duplicate the 32-bit channels, e.g
|
|
|
|
|
|
// p = [AA AA AA AA] [BB BB BB BB] [GG GG GG GG] [RR RR RR RR]
|
|
|
|
|
|
// ->
|
|
|
|
|
|
// r = [RR4 RR4 RR4 RR4] [RR3 RR3 RR3 RR3] [RR2 RR2 RR2 RR2] [RR1 RR1 RR1 RR1]
|
|
|
|
|
|
auto r = _mm_shuffle_epi32(p, 0x0); |
|
|
|
|
|
auto g = _mm_shuffle_epi32(p, 0x55); |
|
|
|
|
|
auto b = _mm_shuffle_epi32(p, 0xAA); |
|
|
|
|
|
|
|
|
|
|
|
// Multiply the rows and columns c0 * r, c1 * g, c2 * b, e.g
|
|
|
|
|
|
// r = [RR4 RR4 RR4 RR4] [ RR3 RR3 RR3 RR3] [ RR2 RR2 RR2 RR2] [ RR1 RR1 RR1 RR1]
|
|
|
|
|
|
// *
|
|
|
|
|
|
// c0 = [ 00 00 00 00] [r2c0 r2c0 r2c0 r2c0] [r1c0 r1c0 r1c0 r1c0] [r0c0 r0c0 r0c0 r0c0]
|
|
|
|
|
|
r = _mm_mullo_epi32(r, col0); |
|
|
|
|
|
g = _mm_mullo_epi32(g, col1); |
|
|
|
|
|
b = _mm_mullo_epi32(b, col2); |
|
|
|
|
|
|
|
|
|
|
|
// Add them all together vertically, such that the 32-bit element
|
|
|
|
|
|
// out[0] = (r[0] * c0[0]) + (g[0] * c1[0]) + (b[0] * c2[0])
|
|
|
|
|
|
auto out = _mm_add_epi32(_mm_add_epi32(r, g), b); |
|
|
|
|
|
|
|
|
|
|
|
// Shift the result by r_shift, as the TRM says
|
|
|
|
|
|
out = _mm_sra_epi32(out, trm_shift); |
|
|
|
|
|
|
|
|
|
|
|
// Add the final column. Because the last element of the 4x1 input vector is 1, there's no need to
|
|
|
|
|
|
// multiply by it, and as per the TRM this column ignores r_shift, so it's just added
|
|
|
|
|
|
// here after shifting.
|
|
|
|
|
|
out = _mm_add_epi32(out, col3); |
|
|
|
|
|
|
|
|
|
|
|
// Shift the result back from S12.8 to integer values
|
|
|
|
|
|
return _mm_srai_epi32(out, 8); |
|
|
|
|
|
}; |
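// Per 32-bit lane the lambda above computes, roughly:
//     out = (((r * c0 + g * c1 + b * c2) >> matrix_r_shift) + c3) >> 8
// i.e. one row of the 3x4 matrix applied to (R, G, B, 1) in fixed point.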
|
|
|
|
|
|
|
|
|
|
|
for (u32 y = source_top; y < source_bottom; y++) { |
|
|
|
|
|
auto const src{y * in_surface_width + source_left}; |
|
|
|
|
|
auto const dst{y * out_surface_width + rect_left}; |
|
|
|
|
|
for (u32 x = source_left; x < source_right; x += 8) { |
|
|
|
|
|
// clang-format off
|
|
|
|
|
|
// Prefetch the next iteration's memory
|
|
|
|
|
|
_mm_prefetch((const char*)&slot_surface[src + x + 8], _MM_HINT_T0); |
|
|
|
|
|
|
|
|
|
|
|
// Load in pixels
|
|
|
|
|
|
// p01 = [AA AA] [BB BB] [GG GG] [RR RR] [AA AA] [BB BB] [GG GG] [RR RR]
|
|
|
|
|
|
auto p01 = _mm_load_si128((__m128i*)&slot_surface[src + x + 0]); |
|
|
|
|
|
auto p23 = _mm_load_si128((__m128i*)&slot_surface[src + x + 2]); |
|
|
|
|
|
auto p45 = _mm_load_si128((__m128i*)&slot_surface[src + x + 4]); |
|
|
|
|
|
auto p67 = _mm_load_si128((__m128i*)&slot_surface[src + x + 6]); |
|
|
|
|
|
|
|
|
|
|
|
// Convert the 16-bit channels into 32-bit (unsigned), as the matrix values are
|
|
|
|
|
|
// 32-bit and to avoid overflow.
|
|
|
|
|
|
// p01 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1]
|
|
|
|
|
|
// ->
|
|
|
|
|
|
// p01_lo = [001 001 AA1 AA1] [001 001 BB1 BB1] [001 001 GG1 GG1] [001 001 RR1 RR1]
|
|
|
|
|
|
// p01_hi = [002 002 AA2 AA2] [002 002 BB2 BB2] [002 002 GG2 GG2] [002 002 RR2 RR2]
|
|
|
|
|
|
auto p01_lo = _mm_cvtepu16_epi32(p01); |
|
|
|
|
|
auto p01_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p01, 8)); |
|
|
|
|
|
auto p23_lo = _mm_cvtepu16_epi32(p23); |
|
|
|
|
|
auto p23_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p23, 8)); |
|
|
|
|
|
auto p45_lo = _mm_cvtepu16_epi32(p45); |
|
|
|
|
|
auto p45_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p45, 8)); |
|
|
|
|
|
auto p67_lo = _mm_cvtepu16_epi32(p67); |
|
|
|
|
|
auto p67_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p67, 8)); |
|
|
|
|
|
|
|
|
|
|
|
// Matrix multiply the pixel, doing the colour conversion.
|
|
|
|
|
|
auto out0 = MatMul(p01_lo, c0, c1, c2, c3, shift); |
|
|
|
|
|
auto out1 = MatMul(p01_hi, c0, c1, c2, c3, shift); |
|
|
|
|
|
auto out2 = MatMul(p23_lo, c0, c1, c2, c3, shift); |
|
|
|
|
|
auto out3 = MatMul(p23_hi, c0, c1, c2, c3, shift); |
|
|
|
|
|
auto out4 = MatMul(p45_lo, c0, c1, c2, c3, shift); |
|
|
|
|
|
auto out5 = MatMul(p45_hi, c0, c1, c2, c3, shift); |
|
|
|
|
|
auto out6 = MatMul(p67_lo, c0, c1, c2, c3, shift); |
|
|
|
|
|
auto out7 = MatMul(p67_hi, c0, c1, c2, c3, shift); |
|
|
|
|
|
|
|
|
|
|
|
// Pack the 32-bit channel pixels back into 16-bit using unsigned saturation
|
|
|
|
|
|
// out0 = [001 001 AA1 AA1] [001 001 BB1 BB1] [001 001 GG1 GG1] [001 001 RR1 RR1]
|
|
|
|
|
|
// out1 = [002 002 AA2 AA2] [002 002 BB2 BB2] [002 002 GG2 GG2] [002 002 RR2 RR2]
|
|
|
|
|
|
// ->
|
|
|
|
|
|
// done0 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1]
|
|
|
|
|
|
auto done0 = _mm_packus_epi32(out0, out1); |
|
|
|
|
|
auto done1 = _mm_packus_epi32(out2, out3); |
|
|
|
|
|
auto done2 = _mm_packus_epi32(out4, out5); |
|
|
|
|
|
auto done3 = _mm_packus_epi32(out6, out7); |
|
|
|
|
|
|
|
|
|
|
|
// Blend the original alpha back into the pixel, as the matrix multiply gives us a
|
|
|
|
|
|
// 3-channel output, not 4.
|
|
|
|
|
|
// 0x88 = b10001000, taking RGB from the first argument, A from the second argument.
|
|
|
|
|
|
// done0 = [002 002] [BB2 BB2] [GG2 GG2] [RR2 RR2] [001 001] [BB1 BB1] [GG1 GG1] [RR1 RR1]
|
|
|
|
|
|
// ->
|
|
|
|
|
|
// done0 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1]
|
|
|
|
|
|
done0 = _mm_blend_epi16(done0, p01, 0x88); |
|
|
|
|
|
done1 = _mm_blend_epi16(done1, p23, 0x88); |
|
|
|
|
|
done2 = _mm_blend_epi16(done2, p45, 0x88); |
|
|
|
|
|
done3 = _mm_blend_epi16(done3, p67, 0x88); |
|
|
|
|
|
|
|
|
|
|
|
// Clamp the 16-bit channels to the soft-clamp min/max.
|
|
|
|
|
|
done0 = _mm_max_epu16(done0, clamp_min); |
|
|
|
|
|
done1 = _mm_max_epu16(done1, clamp_min); |
|
|
|
|
|
done2 = _mm_max_epu16(done2, clamp_min); |
|
|
|
|
|
done3 = _mm_max_epu16(done3, clamp_min); |
|
|
|
|
|
|
|
|
|
|
|
done0 = _mm_min_epu16(done0, clamp_max); |
|
|
|
|
|
done1 = _mm_min_epu16(done1, clamp_max); |
|
|
|
|
|
done2 = _mm_min_epu16(done2, clamp_max); |
|
|
|
|
|
done3 = _mm_min_epu16(done3, clamp_max); |
|
|
|
|
|
|
|
|
|
|
|
// Store the pixels to the output surface.
|
|
|
|
|
|
_mm_store_si128((__m128i*)&output_surface[dst + x + 0], done0); |
|
|
|
|
|
_mm_store_si128((__m128i*)&output_surface[dst + x + 2], done1); |
|
|
|
|
|
_mm_store_si128((__m128i*)&output_surface[dst + x + 4], done2); |
|
|
|
|
|
_mm_store_si128((__m128i*)&output_surface[dst + x + 6], done3); |
|
|
|
|
|
|
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
#endif
|
|
|
|
|
|
} else { |
|
|
|
|
|
// Colour conversion is enabled; this is a 3x4 * 4x1 matrix multiplication, resulting in a 3x1 matrix.
|
|
|
|
|
|
// | r0c0 r0c1 r0c2 r0c3 | | R | | R |
|
|
|
|
|
|
// | r1c0 r1c1 r1c2 r1c3 | * | G | = | G |
|
|
|
|
|
|
// | r2c0 r2c1 r2c2 r2c3 | | B | | B |
|
|
|
|
|
|
// | 1 |
|
|
|
|
|
|
auto const shift = s32(slot.color_matrix.matrix_r_shift.Value()); |
|
|
|
|
|
|
|
|
|
|
|
struct AliasedMatrixType { u64 m[4]; }; |
|
|
|
|
|
static_assert(sizeof(AliasedMatrixType) == sizeof(slot.color_matrix)); |
|
|
|
|
|
u64 const mat_mask = (1 << 20) - 1; |
|
|
|
|
|
auto const* amt = reinterpret_cast<AliasedMatrixType const*>(&slot.color_matrix); |
|
|
|
|
|
|
|
|
|
|
|
constexpr s32 shifts[4] = { 0, 20, 40, 60 }; |
|
|
|
|
|
s32 mr[4][4]; |
|
|
|
|
|
for (u32 j = 0; j < 3; ++j) |
|
|
|
|
|
for (u32 i = 0; i < 4; ++i) |
|
|
|
|
|
mr[j][i] = s32(s64(((amt->m[i] >> shifts[j]) & mat_mask) << (64 - 20)) >> (64 - 20)); |
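// The shift pair above sign-extends each 20-bit field: e.g. a field of 0xFFFFF
// (-1 in 20-bit two's complement) is moved to the top of the 64-bit value and
// arithmetically shifted back down, yielding -1 as an s32.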
|
|
|
|
|
|
|
|
|
|
|
auto const clamp_min = s32(slot.config.soft_clamp_low.Value()); |
|
|
|
|
|
auto const clamp_max = s32(slot.config.soft_clamp_high.Value()); |
|
|
|
|
|
for (u32 y = 0; y < work_height; ++y) { |
|
|
|
|
|
auto const src = (y + source_top) * in_surface_width + source_left; |
|
|
|
|
|
auto const dst = (y + source_top) * out_surface_width + rect_left; |
|
|
|
|
|
for (u32 x = 0; x < work_width; ++x) { |
|
|
|
|
|
auto const& in_pixel = slot_surface[src + x]; |
|
|
|
|
|
auto& out_pixel = output_surface[dst + x]; |
|
|
|
|
|
s32 const mul_values[4] = { |
|
|
|
|
|
in_pixel.r * mr[0][0] + in_pixel.g * mr[0][1] + in_pixel.b * mr[0][2],
|
|
|
|
|
in_pixel.r * mr[1][0] + in_pixel.g * mr[1][1] + in_pixel.b * mr[1][2], |
|
|
|
|
|
in_pixel.r * mr[2][0] + in_pixel.g * mr[2][1] + in_pixel.b * mr[2][2], |
|
|
|
|
|
s32(in_pixel.a) |
|
|
|
|
|
}; |
|
|
|
|
|
s32 const mul_clamp[4] = { |
|
|
|
|
|
std::clamp(((mul_values[0] >> shift) + mr[0][3]) >> 8, clamp_min, clamp_max), |
|
|
|
|
|
std::clamp(((mul_values[1] >> shift) + mr[1][3]) >> 8, clamp_min, clamp_max), |
|
|
|
|
|
std::clamp(((mul_values[2] >> shift) + mr[2][3]) >> 8, clamp_min, clamp_max), |
|
|
|
|
|
std::clamp(mul_values[3], clamp_min, clamp_max) |
|
|
|
|
|
}; |
|
|
|
|
|
out_pixel = format == VideoPixelFormat::A8R8G8B8 |
|
|
|
|
|
? Pixel(u16(mul_clamp[2]), u16(mul_clamp[1]), u16(mul_clamp[0]), u16(mul_clamp[3])) |
|
|
|
|
|
: Pixel(u16(mul_clamp[0]), u16(mul_clamp[1]), u16(mul_clamp[2]), u16(mul_clamp[3])); |
|
|
|
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} else { |
|
|
} else { |
|
|
@@ -391,17 +699,157 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) n
|
|
surface_width = (std::min)(surface_width, out_luma_width); |
|
|
surface_width = (std::min)(surface_width, out_luma_width); |
|
|
surface_height = (std::min)(surface_height, out_luma_height); |
|
|
surface_height = (std::min)(surface_height, out_luma_height); |
|
|
|
|
|
|
|
|
auto Decode = [&](std::span<u8> out_luma, std::span<u8> out_chroma) { |
|
|
|
|
|
for (size_t y = 0; y < surface_height; ++y) { |
|
|
|
|
|
auto const src_luma = y * surface_stride; |
|
|
|
|
|
auto const dst_luma = y * out_luma_stride; |
|
|
|
|
|
auto const src_chroma = y * surface_stride; |
|
|
|
|
|
auto const dst_chroma = (y / 2) * out_chroma_stride; |
|
|
|
|
|
for (size_t x = 0; x < surface_width; x += 2) { |
|
|
|
|
|
out_luma[dst_luma + x + 0] = u8(output_surface[src_luma + x + 0].r >> 2); |
|
|
|
|
|
out_luma[dst_luma + x + 1] = u8(output_surface[src_luma + x + 1].r >> 2); |
|
|
|
|
|
out_chroma[dst_chroma + x + 0] = u8(output_surface[src_chroma + x].g >> 2); |
|
|
|
|
|
out_chroma[dst_chroma + x + 1] = u8(output_surface[src_chroma + x].b >> 2); |
|
|
|
|
|
|
|
|
auto Decode = [&](u8* out_luma, u8* out_chroma) { |
|
|
|
|
|
if (COMPILED_HAS_SSE41 && HasSSE41()) { |
|
|
|
|
|
#if COMPILED_HAS_SSE41
|
|
|
|
|
|
// luma_mask = [00 00] [00 00] [00 00] [FF FF] [00 00] [00 00] [00 00] [FF FF]
|
|
|
|
|
|
auto const luma_mask = _mm_set_epi16(0, 0, 0, -1, 0, 0, 0, -1); |
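// A Pixel is four u16 channels, so one 128-bit register holds two pixels; the mask
// keeps only the first (luma/R) channel of each of those two pixels.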
|
|
|
|
|
auto const sse_aligned_width = Common::AlignDown(surface_width, 16); |
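// 16 pixels are converted per SSE iteration (eight 2-pixel loads); the scalar loop
// further down covers any width that isn't a multiple of 16.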
|
|
|
|
|
for (u32 y = 0; y < surface_height; ++y) { |
|
|
|
|
|
auto const src = y * surface_stride; |
|
|
|
|
|
auto const dst_luma = y * out_luma_stride; |
|
|
|
|
|
auto const dst_chroma = (y / 2) * out_chroma_stride; |
|
|
|
|
|
u32 x = 0; |
|
|
|
|
|
for (; x < sse_aligned_width; x += 16) { |
|
|
|
|
|
// clang-format off
|
|
|
|
|
|
// Prefetch the next cache lines, 2 per iteration
|
|
|
|
|
|
_mm_prefetch((const char*)&output_surface[src + x + 16], _MM_HINT_T0); |
|
|
|
|
|
_mm_prefetch((const char*)&output_surface[src + x + 24], _MM_HINT_T0); |
|
|
|
|
|
|
|
|
|
|
|
// Load the 64-bit pixels, 2 per variable.
|
|
|
|
|
|
auto pixel01 = _mm_load_si128((__m128i*)&output_surface[src + x + 0]); |
|
|
|
|
|
auto pixel23 = _mm_load_si128((__m128i*)&output_surface[src + x + 2]); |
|
|
|
|
|
auto pixel45 = _mm_load_si128((__m128i*)&output_surface[src + x + 4]); |
|
|
|
|
|
auto pixel67 = _mm_load_si128((__m128i*)&output_surface[src + x + 6]); |
|
|
|
|
|
auto pixel89 = _mm_load_si128((__m128i*)&output_surface[src + x + 8]); |
|
|
|
|
|
auto pixel1011 = _mm_load_si128((__m128i*)&output_surface[src + x + 10]); |
|
|
|
|
|
auto pixel1213 = _mm_load_si128((__m128i*)&output_surface[src + x + 12]); |
|
|
|
|
|
auto pixel1415 = _mm_load_si128((__m128i*)&output_surface[src + x + 14]); |
|
|
|
|
|
|
|
|
|
|
|
// Split out the luma of each pixel using the luma_mask above.
|
|
|
|
|
|
// pixel01 = [AA2 AA2] [VV2 VV2] [UU2 UU2] [LL2 LL2] [AA1 AA1] [VV1 VV1] [UU1 UU1] [LL1 LL1]
|
|
|
|
|
|
// ->
|
|
|
|
|
|
// l01 = [002 002] [002 002] [002 002] [LL2 LL2] [001 001] [001 001] [001 001] [LL1 LL1]
|
|
|
|
|
|
auto l01 = _mm_and_si128(pixel01, luma_mask); |
|
|
|
|
|
auto l23 = _mm_and_si128(pixel23, luma_mask); |
|
|
|
|
|
auto l45 = _mm_and_si128(pixel45, luma_mask); |
|
|
|
|
|
auto l67 = _mm_and_si128(pixel67, luma_mask); |
|
|
|
|
|
auto l89 = _mm_and_si128(pixel89, luma_mask); |
|
|
|
|
|
auto l1011 = _mm_and_si128(pixel1011, luma_mask); |
|
|
|
|
|
auto l1213 = _mm_and_si128(pixel1213, luma_mask); |
|
|
|
|
|
auto l1415 = _mm_and_si128(pixel1415, luma_mask); |
|
|
|
|
|
|
|
|
|
|
|
// Pack 32-bit elements from 2 registers down into 16-bit elements in 1 register.
|
|
|
|
|
|
// l01 = [002 002 002 002] [002 002 LL2 LL2] [001 001 001 001] [001 001 LL1 LL1]
|
|
|
|
|
|
// l23 = [004 004 004 004] [004 004 LL4 LL4] [003 003 003 003] [003 003 LL3 LL3]
|
|
|
|
|
|
// ->
|
|
|
|
|
|
// l0123 = [004 004] [LL4 LL4] [003 003] [LL3 LL3] [002 002] [LL2 LL2] [001 001] [LL1 LL1]
|
|
|
|
|
|
auto l0123 = _mm_packus_epi32(l01, l23); |
|
|
|
|
|
auto l4567 = _mm_packus_epi32(l45, l67); |
|
|
|
|
|
auto l891011 = _mm_packus_epi32(l89, l1011); |
|
|
|
|
|
auto l12131415 = _mm_packus_epi32(l1213, l1415); |
|
|
|
|
|
|
|
|
|
|
|
// Pack 32-bit elements from 2 registers down into 16-bit elements in 1 register.
|
|
|
|
|
|
// l0123 = [004 004 LL4 LL4] [003 003 LL3 LL3] [002 002 LL2 LL2] [001 001 LL1 LL1]
|
|
|
|
|
|
// l4567 = [008 008 LL8 LL8] [007 007 LL7 LL7] [006 006 LL6 LL6] [005 005 LL5 LL5]
|
|
|
|
|
|
// ->
|
|
|
|
|
|
// luma_lo = [LL8 LL8] [LL7 LL7] [LL6 LL6] [LL5 LL5] [LL4 LL4] [LL3 LL3] [LL2 LL2] [LL1 LL1]
|
|
|
|
|
|
auto luma_lo = _mm_packus_epi32(l0123, l4567); |
|
|
|
|
|
auto luma_hi = _mm_packus_epi32(l891011, l12131415); |
|
|
|
|
|
|
|
|
|
|
|
// Right-shift the 16-bit elements by 2, un-doing the left shift by 2 on read
|
|
|
|
|
|
// and bringing the range back to 8-bit.
|
|
|
|
|
|
luma_lo = _mm_srli_epi16(luma_lo, 2); |
|
|
|
|
|
luma_hi = _mm_srli_epi16(luma_hi, 2); |
|
|
|
|
|
|
|
|
|
|
|
// Pack with unsigned saturation the 16-bit values in 2 registers into 8-bit values in 1 register.
|
|
|
|
|
|
// luma_lo = [LL8 LL8] [LL7 LL7] [LL6 LL6] [LL5 LL5] [LL4 LL4] [LL3 LL3] [LL2 LL2] [LL1 LL1]
|
|
|
|
|
|
// luma_hi = [LL16 LL16] [LL15 LL15] [LL14 LL14] [LL13 LL13] [LL12 LL12] [LL11 LL11] [LL10 LL10] [LL9 LL9]
|
|
|
|
|
|
// ->
|
|
|
|
|
|
// luma = [LL16] [LL15] [LL14] [LL13] [LL12] [LL11] [LL10] [LL9] [LL8] [LL7] [LL6] [LL5] [LL4] [LL3] [LL2] [LL1]
|
|
|
|
|
|
auto luma = _mm_packus_epi16(luma_lo, luma_hi); |
|
|
|
|
|
|
|
|
|
|
|
// Store the 16 bytes of luma
|
|
|
|
|
|
_mm_store_si128((__m128i*)&out_luma[dst_luma + x], luma); |
|
|
|
|
|
|
|
|
|
|
|
if (y % 2 == 0) { |
|
|
|
|
|
// Chroma, done every other line as it's half the height of luma.
|
|
|
|
|
|
|
|
|
|
|
|
// Shift the register right by 2 bytes (not bits), to kick out the 16-bit luma.
|
|
|
|
|
|
// We can do this instead of &'ing a mask and then shifting.
|
|
|
|
|
|
// pixel01 = [AA2 AA2] [VV2 VV2] [UU2 UU2] [LL2 LL2] [AA1 AA1] [VV1 VV1] [UU1 UU1] [LL1 LL1]
|
|
|
|
|
|
// ->
|
|
|
|
|
|
// c01 = [ 00 00] [AA2 AA2] [VV2 VV2] [UU2 UU2] [LL2 LL2] [AA1 AA1] [VV1 VV1] [UU1 UU1]
|
|
|
|
|
|
auto c01 = _mm_srli_si128(pixel01, 2); |
|
|
|
|
|
auto c23 = _mm_srli_si128(pixel23, 2); |
|
|
|
|
|
auto c45 = _mm_srli_si128(pixel45, 2); |
|
|
|
|
|
auto c67 = _mm_srli_si128(pixel67, 2); |
|
|
|
|
|
auto c89 = _mm_srli_si128(pixel89, 2); |
|
|
|
|
|
auto c1011 = _mm_srli_si128(pixel1011, 2); |
|
|
|
|
|
auto c1213 = _mm_srli_si128(pixel1213, 2); |
|
|
|
|
|
auto c1415 = _mm_srli_si128(pixel1415, 2); |
|
|
|
|
|
|
|
|
|
|
|
// Interleave the lower 8 bytes as 32-bit elements from 2 registers into 1 register.
|
|
|
|
|
|
// This has the effect of skipping every other chroma value horizontally,
|
|
|
|
|
|
// notice the high pixels UU2/UU4 are skipped.
|
|
|
|
|
|
// This is intended as N420 chroma width is half the luma width.
|
|
|
|
|
|
// c01 = [ 00 00 AA2 AA2] [VV2 VV2 UU2 UU2] [LL2 LL2 AA1 AA1] [VV1 VV1 UU1 UU1]
|
|
|
|
|
|
// c23 = [ 00 00 AA4 AA4] [VV4 VV4 UU4 UU4] [LL4 LL4 AA3 AA3] [VV3 VV3 UU3 UU3]
|
|
|
|
|
|
// ->
|
|
|
|
|
|
// c0123 = [LL4 LL4 AA3 AA3] [LL2 LL2 AA1 AA1] [VV3 VV3 UU3 UU3] [VV1 VV1 UU1 UU1]
|
|
|
|
|
|
auto c0123 = _mm_unpacklo_epi32(c01, c23); |
|
|
|
|
|
auto c4567 = _mm_unpacklo_epi32(c45, c67); |
|
|
|
|
|
auto c891011 = _mm_unpacklo_epi32(c89, c1011); |
|
|
|
|
|
auto c12131415 = _mm_unpacklo_epi32(c1213, c1415); |
|
|
|
|
|
|
|
|
|
|
|
// Interleave the low 64-bit elements from 2 registers into 1.
|
|
|
|
|
|
// c0123 = [LL4 LL4 AA3 AA3 LL2 LL2 AA1 AA1] [VV3 VV3 UU3 UU3 VV1 VV1 UU1 UU1]
|
|
|
|
|
|
// c4567 = [LL8 LL8 AA7 AA7 LL6 LL6 AA5 AA5] [VV7 VV7 UU7 UU7 VV5 VV5 UU5 UU5]
|
|
|
|
|
|
// ->
|
|
|
|
|
|
// chroma_lo = [VV7 VV7 UU7 UU7 VV5 VV5 UU5 UU5] [VV3 VV3 UU3 UU3 VV1 VV1 UU1 UU1]
|
|
|
|
|
|
auto chroma_lo = _mm_unpacklo_epi64(c0123, c4567); |
|
|
|
|
|
auto chroma_hi = _mm_unpacklo_epi64(c891011, c12131415); |
|
|
|
|
|
|
|
|
|
|
|
// Right-shift the 16-bit elements by 2, un-doing the left shift by 2 on read
|
|
|
|
|
|
// and bringing the range back to 8-bit.
|
|
|
|
|
|
chroma_lo = _mm_srli_epi16(chroma_lo, 2); |
|
|
|
|
|
chroma_hi = _mm_srli_epi16(chroma_hi, 2); |
|
|
|
|
|
|
|
|
|
|
|
// Pack with unsigned saturation the 16-bit elements from 2 registers into 8-bit elements in 1 register.
|
|
|
|
|
|
// chroma_lo = [ VV7 VV7] [ UU7 UU7] [ VV5 VV5] [ UU5 UU5] [ VV3 VV3] [ UU3 UU3] [VV1 VV1] [UU1 UU1]
|
|
|
|
|
|
// chroma_hi = [VV15 VV15] [UU15 UU15] [VV13 VV13] [UU13 UU13] [VV11 VV11] [UU11 UU11] [VV9 VV9] [UU9 UU9]
|
|
|
|
|
|
// ->
|
|
|
|
|
|
// chroma = [VV15] [UU15] [VV13] [UU13] [VV11] [UU11] [VV9] [UU9] [VV7] [UU7] [VV5] [UU5] [VV3] [UU3] [VV1] [UU1]
|
|
|
|
|
|
auto chroma = _mm_packus_epi16(chroma_lo, chroma_hi); |
|
|
|
|
|
|
|
|
|
|
|
// Store the 16 bytes of chroma.
|
|
|
|
|
|
_mm_store_si128((__m128i*)&out_chroma[dst_chroma + x + 0], chroma); |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// clang-format on
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
auto const src_chroma = y * surface_stride; |
|
|
|
|
|
for (; x < surface_width; x += 2) { |
|
|
|
|
|
out_luma[dst_luma + x + 0] = u8(output_surface[src + x + 0].r >> 2); |
|
|
|
|
|
out_luma[dst_luma + x + 1] = u8(output_surface[src + x + 1].r >> 2); |
|
|
|
|
|
out_chroma[dst_chroma + x + 0] = u8(output_surface[src_chroma + x].g >> 2); |
|
|
|
|
|
out_chroma[dst_chroma + x + 1] = u8(output_surface[src_chroma + x].b >> 2); |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
#endif
|
|
|
|
|
|
} else { |
|
|
|
|
|
for (size_t y = 0; y < surface_height; ++y) { |
|
|
|
|
|
auto const src_luma = y * surface_stride; |
|
|
|
|
|
auto const dst_luma = y * out_luma_stride; |
|
|
|
|
|
auto const src_chroma = y * surface_stride; |
|
|
|
|
|
auto const dst_chroma = (y / 2) * out_chroma_stride; |
|
|
|
|
|
for (size_t x = 0; x < surface_width; x += 2) { |
|
|
|
|
|
out_luma[dst_luma + x + 0] = u8(output_surface[src_luma + x + 0].r >> 2); |
|
|
|
|
|
out_luma[dst_luma + x + 1] = u8(output_surface[src_luma + x + 1].r >> 2); |
|
|
|
|
|
out_chroma[dst_chroma + x + 0] = u8(output_surface[src_chroma + x].g >> 2); |
|
|
|
|
|
out_chroma[dst_chroma + x + 1] = u8(output_surface[src_chroma + x].b >> 2); |
|
|
|
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
}; |
|
|
}; |
|
|
@@ -424,7 +872,7 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) n
|
|
|
|
|
|
|
|
luma_scratch.resize_destructive(out_luma_size); |
|
|
luma_scratch.resize_destructive(out_luma_size); |
|
|
chroma_scratch.resize_destructive(out_chroma_size); |
|
|
chroma_scratch.resize_destructive(out_chroma_size); |
|
|
Decode(luma_scratch, chroma_scratch); |
|
|
|
|
|
|
|
|
Decode(luma_scratch.data(), chroma_scratch.data()); |
|
|
|
|
|
|
|
|
Tegra::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeWrite> out_luma(memory_manager, regs.output_surface.luma.Address(), out_luma_swizzle_size, &swizzle_scratch); |
|
|
Tegra::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeWrite> out_luma(memory_manager, regs.output_surface.luma.Address(), out_luma_swizzle_size, &swizzle_scratch); |
|
|
Tegra::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeWrite> out_chroma(memory_manager, regs.output_surface.chroma_u.Address(), out_chroma_swizzle_size, &swizzle_scratch); |
|
|
Tegra::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeWrite> out_chroma(memory_manager, regs.output_surface.chroma_u.Address(), out_chroma_swizzle_size, &swizzle_scratch); |
|
|
@@ -454,7 +902,7 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) n
|
|
// afterwards to re-overwrite the luma being too large.
|
|
|
// afterwards to re-overwrite the luma being too large.
|
|
|
luma_scratch.resize_destructive(out_luma_size); |
|
|
luma_scratch.resize_destructive(out_luma_size); |
|
|
chroma_scratch.resize_destructive(out_chroma_size); |
|
|
chroma_scratch.resize_destructive(out_chroma_size); |
|
|
Decode(luma_scratch, chroma_scratch); |
|
|
|
|
|
|
|
|
Decode(luma_scratch.data(), chroma_scratch.data()); |
|
|
memory_manager.WriteBlock(regs.output_surface.luma.Address(), luma_scratch.data(), out_luma_size); |
|
|
memory_manager.WriteBlock(regs.output_surface.luma.Address(), luma_scratch.data(), out_luma_size); |
|
|
memory_manager.WriteBlock(regs.output_surface.chroma_u.Address(), chroma_scratch.data(), out_chroma_size); |
|
|
memory_manager.WriteBlock(regs.output_surface.chroma_u.Address(), chroma_scratch.data(), out_chroma_size); |
|
|
} break; |
|
|
} break; |
|
|
@@ -464,7 +912,7 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) n
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config) noexcept { |
|
|
|
|
|
|
|
|
void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config, VideoPixelFormat format) noexcept { |
|
|
constexpr u32 BytesPerPixel = 4; |
|
|
constexpr u32 BytesPerPixel = 4; |
|
|
|
|
|
|
|
|
auto surface_width = output_surface_config.out_surface_width + 1; |
|
|
auto surface_width = output_surface_config.out_surface_width + 1; |
|
|
@@ -479,14 +927,95 @@ void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config) noexcept {
|
|
surface_width = (std::min)(surface_width, out_luma_width); |
|
|
surface_width = (std::min)(surface_width, out_luma_width); |
|
|
surface_height = (std::min)(surface_height, out_luma_height); |
|
|
surface_height = (std::min)(surface_height, out_luma_height); |
|
|
|
|
|
|
|
|
auto Decode = [&](std::span<u8> s1, std::span<Pixel> s2) { |
|
|
|
|
|
for (size_t y = 0; y < surface_height; ++y) { |
|
|
|
|
|
auto const src = y * surface_stride, dst = y * out_luma_stride; |
|
|
|
|
|
for (size_t x = 0; x < surface_width; ++x) { |
|
|
|
|
|
s1[dst + x * 4 + 0] = u8(s2[src + x].r >> 2); |
|
|
|
|
|
s1[dst + x * 4 + 1] = u8(s2[src + x].g >> 2); |
|
|
|
|
|
s1[dst + x * 4 + 2] = u8(s2[src + x].b >> 2); |
|
|
|
|
|
s1[dst + x * 4 + 3] = u8(s2[src + x].a >> 2); |
|
|
|
|
|
|
|
|
auto Decode = [&](u8* out, Pixel const* inp) { |
|
|
|
|
|
if (COMPILED_HAS_SSE41 && HasSSE41()) { |
|
|
|
|
|
#if COMPILED_HAS_SSE41
|
|
|
|
|
|
size_t const SSE_ALIGNMENT = 16; |
|
|
|
|
|
auto const sse_aligned_width = Common::AlignDown(surface_width, SSE_ALIGNMENT); |
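// Each SSE iteration below turns 16 input pixels into 64 output bytes; the scalar
// loop after it handles any remaining columns.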
|
|
|
|
|
for (u32 y = 0; y < surface_height; y++) { |
|
|
|
|
|
auto const src = y * surface_stride; |
|
|
|
|
|
auto const dst = y * out_luma_stride; |
|
|
|
|
|
u32 x = 0; |
|
|
|
|
|
for (; x < sse_aligned_width; x += SSE_ALIGNMENT) { |
|
|
|
|
|
// Prefetch the next 2 cache lines
|
|
|
|
|
|
_mm_prefetch((const char*)&inp[src + x + 16], _MM_HINT_T0); |
|
|
|
|
|
_mm_prefetch((const char*)&inp[src + x + 24], _MM_HINT_T0); |
|
|
|
|
|
|
|
|
|
|
|
// Load the pixels, 16-bit channels, 8 bytes per pixel, e.g
|
|
|
|
|
|
// pixel01 = [AA AA BB BB GG GG RR RR] [AA AA BB BB GG GG RR RR]
|
|
|
|
|
|
auto pixel01 = _mm_load_si128((__m128i*)&inp[src + x + 0]); |
|
|
|
|
|
auto pixel23 = _mm_load_si128((__m128i*)&inp[src + x + 2]); |
|
|
|
|
|
auto pixel45 = _mm_load_si128((__m128i*)&inp[src + x + 4]); |
|
|
|
|
|
auto pixel67 = _mm_load_si128((__m128i*)&inp[src + x + 6]); |
|
|
|
|
|
auto pixel89 = _mm_load_si128((__m128i*)&inp[src + x + 8]); |
|
|
|
|
|
auto pixel1011 = _mm_load_si128((__m128i*)&inp[src + x + 10]); |
|
|
|
|
|
auto pixel1213 = _mm_load_si128((__m128i*)&inp[src + x + 12]); |
|
|
|
|
|
auto pixel1415 = _mm_load_si128((__m128i*)&inp[src + x + 14]); |
|
|
|
|
|
|
|
|
|
|
|
// Right-shift the 16-bit channels by 2 to undo the left shift on read and bring the range
|
|
|
|
|
|
// back to 8-bit.
|
|
|
|
|
|
pixel01 = _mm_srli_epi16(pixel01, 2); |
|
|
|
|
|
pixel23 = _mm_srli_epi16(pixel23, 2); |
|
|
|
|
|
pixel45 = _mm_srli_epi16(pixel45, 2); |
|
|
|
|
|
pixel67 = _mm_srli_epi16(pixel67, 2); |
|
|
|
|
|
pixel89 = _mm_srli_epi16(pixel89, 2); |
|
|
|
|
|
pixel1011 = _mm_srli_epi16(pixel1011, 2); |
|
|
|
|
|
pixel1213 = _mm_srli_epi16(pixel1213, 2); |
|
|
|
|
|
pixel1415 = _mm_srli_epi16(pixel1415, 2); |
|
|
|
|
|
|
|
|
|
|
|
// Pack with unsigned saturation 16-bit channels from 2 registers into 8-bit channels in 1 register.
|
|
|
|
|
|
// pixel01 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1]
|
|
|
|
|
|
// pixel23 = [AA4 AA4] [BB4 BB4] [GG4 GG4] [RR4 RR4] [AA3 AA3] [BB3 BB3] [GG3 GG3] [RR3 RR3]
|
|
|
|
|
|
// ->
|
|
|
|
|
|
// pixels0_lo = [AA4] [BB4] [GG4] [RR4] [AA3] [BB3] [GG3] [RR3] [AA2] [BB2] [GG2] [RR2] [AA1] [BB1] [GG1] [RR1]
|
|
|
|
|
|
auto pixels0_lo = _mm_packus_epi16(pixel01, pixel23); |
|
|
|
|
|
auto pixels0_hi = _mm_packus_epi16(pixel45, pixel67); |
|
|
|
|
|
auto pixels1_lo = _mm_packus_epi16(pixel89, pixel1011); |
|
|
|
|
|
auto pixels1_hi = _mm_packus_epi16(pixel1213, pixel1415); |
|
|
|
|
|
|
|
|
|
|
|
if (format == VideoPixelFormat::A8R8G8B8) { |
|
|
|
|
|
auto const shuffle = _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2); |
|
|
|
|
|
|
|
|
|
|
|
// Our pixels are ABGR (big-endian) by default; if ARGB is needed, we need to shuffle.
|
|
|
|
|
|
// pixels0_lo = [AA4 BB4 GG4 RR4] [AA3 BB3 GG3 RR3] [AA2 BB2 GG2 RR2] [AA1 BB1 GG1 RR1]
|
|
|
|
|
|
// ->
|
|
|
|
|
|
// pixels0_lo = [AA4 RR4 GG4 BB4] [AA3 RR3 GG3 BB3] [AA2 RR2 GG2 BB2] [AA1 RR1 GG1 BB1]
|
|
|
|
|
|
pixels0_lo = _mm_shuffle_epi8(pixels0_lo, shuffle); |
|
|
|
|
|
pixels0_hi = _mm_shuffle_epi8(pixels0_hi, shuffle); |
|
|
|
|
|
pixels1_lo = _mm_shuffle_epi8(pixels1_lo, shuffle); |
|
|
|
|
|
pixels1_hi = _mm_shuffle_epi8(pixels1_hi, shuffle); |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Store the pixels
|
|
|
|
|
|
_mm_store_si128((__m128i*)&out[dst + x * 4 + 0], pixels0_lo); |
|
|
|
|
|
_mm_store_si128((__m128i*)&out[dst + x * 4 + 16], pixels0_hi); |
|
|
|
|
|
_mm_store_si128((__m128i*)&out[dst + x * 4 + 32], pixels1_lo); |
|
|
|
|
|
_mm_store_si128((__m128i*)&out[dst + x * 4 + 48], pixels1_hi); |
|
|
|
|
|
} |
|
|
|
|
|
for (; x < surface_width; x++) { |
|
|
|
|
|
if (format == VideoPixelFormat::A8R8G8B8) { |
|
|
|
|
|
out[dst + x * 4 + 0] = u8(inp[src + x].b >> 2); |
|
|
|
|
|
out[dst + x * 4 + 1] = u8(inp[src + x].g >> 2); |
|
|
|
|
|
out[dst + x * 4 + 2] = u8(inp[src + x].r >> 2); |
|
|
|
|
|
out[dst + x * 4 + 3] = u8(inp[src + x].a >> 2); |
|
|
|
|
|
} else { |
|
|
|
|
|
out[dst + x * 4 + 0] = u8(inp[src + x].r >> 2); |
|
|
|
|
|
out[dst + x * 4 + 1] = u8(inp[src + x].g >> 2); |
|
|
|
|
|
out[dst + x * 4 + 2] = u8(inp[src + x].b >> 2); |
|
|
|
|
|
out[dst + x * 4 + 3] = u8(inp[src + x].a >> 2); |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
#endif
|
|
|
|
|
|
} else { |
|
|
|
|
|
for (size_t y = 0; y < surface_height; ++y) { |
|
|
|
|
|
auto const src = y * surface_stride, dst = y * out_luma_stride; |
|
|
|
|
|
for (size_t x = 0; x < surface_width; ++x) { |
|
|
|
|
|
out[dst + x * 4 + 0] = u8(inp[src + x].r >> 2); |
|
|
|
|
|
out[dst + x * 4 + 1] = u8(inp[src + x].g >> 2); |
|
|
|
|
|
out[dst + x * 4 + 2] = u8(inp[src + x].b >> 2); |
|
|
|
|
|
out[dst + x * 4 + 3] = u8(inp[src + x].a >> 2); |
|
|
|
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
}; |
|
|
}; |
|
|
@@ -502,7 +1031,7 @@ void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config) noexcept {
|
|
surface_stride * surface_height * BytesPerPixel, out_luma_width, out_luma_height, |
|
|
surface_stride * surface_height * BytesPerPixel, out_luma_width, out_luma_height, |
|
|
out_luma_stride, out_luma_size, block_height, out_swizzle_size); |
|
|
out_luma_stride, out_luma_size, block_height, out_swizzle_size); |
|
|
luma_scratch.resize_destructive(out_luma_size); |
|
|
luma_scratch.resize_destructive(out_luma_size); |
|
|
Decode(luma_scratch, output_surface); |
|
|
|
|
|
|
|
|
Decode(luma_scratch.data(), output_surface.data()); |
|
|
|
|
|
|
|
|
Tegra::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeWrite> out_luma(memory_manager, regs.output_surface.luma.Address(), out_swizzle_size, &swizzle_scratch); |
|
|
Tegra::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeWrite> out_luma(memory_manager, regs.output_surface.luma.Address(), out_swizzle_size, &swizzle_scratch); |
|
|
if (block_height == 1) { |
|
|
if (block_height == 1) { |
|
|
@@ -520,7 +1049,7 @@ void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config) noexcept {
|
|
out_luma_stride, out_luma_size); |
|
|
out_luma_stride, out_luma_size); |
|
|
luma_scratch.resize_destructive(out_luma_size); |
|
|
luma_scratch.resize_destructive(out_luma_size); |
|
|
Tegra::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeWrite> out_luma(memory_manager, regs.output_surface.luma.Address(), out_luma_size, &luma_scratch); |
|
|
Tegra::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeWrite> out_luma(memory_manager, regs.output_surface.luma.Address(), out_luma_size, &luma_scratch); |
|
|
Decode(out_luma, output_surface); |
|
|
|
|
|
|
|
|
Decode(out_luma.data(), output_surface.data()); |
|
|
} break; |
|
|
} break; |
|
|
default: |
|
|
default: |
|
|
UNREACHABLE(); |
|
|
UNREACHABLE(); |
|
|
|