|
|
@ -41,89 +41,69 @@ extern "C" { |
|
|
namespace Tegra::Host1x { |
|
|
namespace Tegra::Host1x { |
|
|
namespace { |
|
|
namespace { |
|
|
|
|
|
|
|
|
void SwizzleSurface(std::span<u8> output, u32 out_stride, std::span<const u8> input, u32 in_stride, |
|
|
|
|
|
u32 height) { |
|
|
|
|
|
/*
|
|
|
|
|
|
* Taken from https://github.com/averne/FFmpeg/blob/nvtegra/libavutil/hwcontext_nvtegra.c#L949
|
|
|
|
|
|
* Can only handle block height == 1. |
|
|
|
|
|
*/ |
|
|
|
|
|
const uint32_t x_mask = 0xFFFFFFD2u; |
|
|
|
|
|
const uint32_t y_mask = 0x2Cu; |
|
|
|
|
|
uint32_t offs_x{}; |
|
|
|
|
|
uint32_t offs_y{}; |
|
|
|
|
|
uint32_t offs_line{}; |
|
|
|
|
|
|
|
|
|
|
|
for (u32 y = 0; y < height; y += 2) { |
|
|
|
|
|
|
|
|
void SwizzleSurface(std::span<u8> output, u32 out_stride, std::span<const u8> input, u32 in_stride, u32 height) noexcept { |
|
|
|
|
|
//// Taken from https://github.com/averne/FFmpeg/blob/nvtegra/libavutil/hwcontext_nvtegra.c#L949
|
|
|
|
|
|
/// Can only handle block height == 1.
|
|
|
|
|
|
u32 const x_mask = 0xFFFFFFD2u, y_mask = 0x2Cu; |
|
|
|
|
|
u32 offs_x = 0, offs_y = 0; |
|
|
|
|
|
for (size_t y = 0; y < height; y += 2) { |
|
|
auto dst_line = output.data() + offs_y * 16; |
|
|
auto dst_line = output.data() + offs_y * 16; |
|
|
const auto src_line = input.data() + y * (in_stride / 16) * 16; |
|
|
|
|
|
|
|
|
auto const src_line = input.data() + u32(y) * (in_stride / 16) * 16; |
|
|
|
|
|
|
|
|
offs_line = offs_x; |
|
|
|
|
|
|
|
|
auto offs_line = offs_x; |
|
|
for (u32 x = 0; x < in_stride; x += 16) { |
|
|
for (u32 x = 0; x < in_stride; x += 16) { |
|
|
std::memcpy(&dst_line[offs_line * 16], &src_line[x], 16); |
|
|
std::memcpy(&dst_line[offs_line * 16], &src_line[x], 16); |
|
|
std::memcpy(&dst_line[offs_line * 16 + 16], &src_line[x + in_stride], 16); |
|
|
std::memcpy(&dst_line[offs_line * 16 + 16], &src_line[x + in_stride], 16); |
|
|
offs_line = (offs_line - x_mask) & x_mask; |
|
|
offs_line = (offs_line - x_mask) & x_mask; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
offs_y = (offs_y - y_mask) & y_mask; |
|
|
offs_y = (offs_y - y_mask) & y_mask; |
|
|
|
|
|
|
|
|
/* Wrap into next tile row */ |
|
|
|
|
|
if (!offs_y) { |
|
|
|
|
|
offs_x += out_stride; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
offs_x += offs_y ? 0 : out_stride; // Wrap into next tile row
|
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
} // namespace
|
|
|
} // namespace
|
|
|
|
|
|
|
|
|
Vic::Vic(Host1x& host1x_, s32 id_, u32 syncpt, FrameQueue& frame_queue_) |
|
|
|
|
|
: CDmaPusher{host1x_, id_}, id{id_}, syncpoint{syncpt}, |
|
|
|
|
|
frame_queue{frame_queue_} { |
|
|
|
|
|
|
|
|
Vic::Vic(Host1x& host1x_, s32 id_, u32 syncpt, FrameQueue& frame_queue_) noexcept : |
|
|
|
|
|
CDmaPusher{host1x_, id_} |
|
|
|
|
|
, id{id_} |
|
|
|
|
|
, syncpoint{syncpt} |
|
|
|
|
|
, frame_queue{frame_queue_} |
|
|
|
|
|
{ |
|
|
LOG_INFO(HW_GPU, "Created vic {}", id); |
|
|
LOG_INFO(HW_GPU, "Created vic {}", id); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
Vic::~Vic() { |
|
|
|
|
|
|
|
|
Vic::~Vic() noexcept { |
|
|
LOG_INFO(HW_GPU, "Destroying vic {}", id); |
|
|
LOG_INFO(HW_GPU, "Destroying vic {}", id); |
|
|
frame_queue.Close(id); |
|
|
frame_queue.Close(id); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
void Vic::ProcessMethod(u32 method, u32 arg) { |
|
|
|
|
|
|
|
|
void Vic::ProcessMethod(u32 method, u32 arg) noexcept { |
|
|
LOG_TRACE(HW_GPU, "Vic {} method {:#X}", id, u32(method)); |
|
|
LOG_TRACE(HW_GPU, "Vic {} method {:#X}", id, u32(method)); |
|
|
regs.reg_array[method] = arg; |
|
|
regs.reg_array[method] = arg; |
|
|
|
|
|
|
|
|
switch (static_cast<Method>(method * sizeof(u32))) { |
|
|
|
|
|
case Method::Execute: { |
|
|
|
|
|
|
|
|
switch (Method(method * sizeof(u32))) { |
|
|
|
|
|
case Method::Execute: |
|
|
Execute(); |
|
|
Execute(); |
|
|
} break; |
|
|
|
|
|
|
|
|
break; |
|
|
default: |
|
|
default: |
|
|
break; |
|
|
break; |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
void Vic::Execute() { |
|
|
|
|
|
|
|
|
void Vic::Execute() noexcept { |
|
|
ConfigStruct config{}; |
|
|
ConfigStruct config{}; |
|
|
memory_manager.ReadBlock(regs.config_struct_offset.Address(), &config, sizeof(ConfigStruct)); |
|
|
memory_manager.ReadBlock(regs.config_struct_offset.Address(), &config, sizeof(ConfigStruct)); |
|
|
|
|
|
|
|
|
auto output_width{config.output_surface_config.out_surface_width + 1}; |
|
|
|
|
|
auto output_height{config.output_surface_config.out_surface_height + 1}; |
|
|
|
|
|
output_surface.resize_destructive(output_width * output_height); |
|
|
|
|
|
|
|
|
auto output_width = config.output_surface_config.out_surface_width + 1; |
|
|
|
|
|
auto output_height = config.output_surface_config.out_surface_height + 1; |
|
|
|
|
|
output_surface.resize(output_width * output_height); |
|
|
|
|
|
|
|
|
if (Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::Off) [[unlikely]] { |
|
|
|
|
|
// Fill the frame with black, as otherwise they can have random data and be very glitchy.
|
|
|
|
|
|
std::fill(output_surface.begin(), output_surface.end(), Pixel{}); |
|
|
|
|
|
} else { |
|
|
|
|
|
|
|
|
if (Settings::values.nvdec_emulation.GetValue() != Settings::NvdecEmulation::Off) { |
|
|
for (size_t i = 0; i < config.slot_structs.size(); i++) { |
|
|
for (size_t i = 0; i < config.slot_structs.size(); i++) { |
|
|
auto& slot_config{config.slot_structs[i]}; |
|
|
|
|
|
if (!slot_config.config.slot_enable) { |
|
|
|
|
|
continue; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
auto luma_offset{regs.surfaces[i][SurfaceIndex::Current].luma.Address()}; |
|
|
|
|
|
if (nvdec_id == -1) { |
|
|
|
|
|
nvdec_id = frame_queue.VicFindNvdecFdFromOffset(luma_offset); |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
if (auto frame = frame_queue.GetFrame(nvdec_id, luma_offset); frame) { |
|
|
|
|
|
if (frame.get()) { |
|
|
|
|
|
|
|
|
if (auto& slot_config = config.slot_structs[i]; slot_config.config.slot_enable) { |
|
|
|
|
|
auto const luma_offset = regs.surfaces[i][SurfaceIndex::Current].luma.Address(); |
|
|
|
|
|
if (nvdec_id == -1) |
|
|
|
|
|
nvdec_id = frame_queue.VicFindNvdecFdFromOffset(luma_offset); |
|
|
|
|
|
if (auto frame = frame_queue.GetFrame(nvdec_id, luma_offset); frame.get()) { |
|
|
switch (frame->GetPixelFormat()) { |
|
|
switch (frame->GetPixelFormat()) { |
|
|
case AV_PIX_FMT_YUV420P: |
|
|
case AV_PIX_FMT_YUV420P: |
|
|
ReadY8__V8U8_N420(slot_config, regs.surfaces[i], std::move(frame), true); |
|
|
ReadY8__V8U8_N420(slot_config, regs.surfaces[i], std::move(frame), true); |
|
|
@ -135,21 +115,22 @@ void Vic::Execute() { |
|
|
UNIMPLEMENTED_MSG("Unimplemented slot pixel format {}", u32(slot_config.surface_config.slot_pixel_format.Value())); |
|
|
UNIMPLEMENTED_MSG("Unimplemented slot pixel format {}", u32(slot_config.surface_config.slot_pixel_format.Value())); |
|
|
break; |
|
|
break; |
|
|
} |
|
|
} |
|
|
Blend(config, slot_config); |
|
|
|
|
|
|
|
|
Blend(config, slot_config, config.output_surface_config.out_pixel_format); |
|
|
} else { |
|
|
} else { |
|
|
LOG_ERROR(HW_GPU, "Vic {} failed to get frame with offset {:#X}", id, luma_offset); |
|
|
LOG_ERROR(HW_GPU, "Vic {} failed to get frame with offset {:#X}", id, luma_offset); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
} else { |
|
|
|
|
|
// Fill the frame with black, as otherwise they can have random data and be very glitchy.
|
|
|
|
|
|
std::fill(output_surface.begin(), output_surface.end(), Pixel{}); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
switch (config.output_surface_config.out_pixel_format) { |
|
|
switch (config.output_surface_config.out_pixel_format) { |
|
|
case VideoPixelFormat::A8B8G8R8: |
|
|
case VideoPixelFormat::A8B8G8R8: |
|
|
case VideoPixelFormat::X8B8G8R8: |
|
|
case VideoPixelFormat::X8B8G8R8: |
|
|
WriteABGR(config.output_surface_config, VideoPixelFormat::A8B8G8R8); |
|
|
|
|
|
break; |
|
|
|
|
|
case VideoPixelFormat::A8R8G8B8: |
|
|
case VideoPixelFormat::A8R8G8B8: |
|
|
WriteABGR(config.output_surface_config, VideoPixelFormat::A8R8G8B8); |
|
|
|
|
|
|
|
|
WriteABGR(config.output_surface_config); |
|
|
break; |
|
|
break; |
|
|
case VideoPixelFormat::Y8__V8U8_N420: |
|
|
case VideoPixelFormat::Y8__V8U8_N420: |
|
|
WriteY8__V8U8_N420(config.output_surface_config); |
|
|
WriteY8__V8U8_N420(config.output_surface_config); |
|
|
@ -160,20 +141,14 @@ void Vic::Execute() { |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span<const PlaneOffsets> offsets, std::shared_ptr<const FFmpeg::Frame> frame, bool planar, bool interlaced) { |
|
|
|
|
|
const auto out_luma_width{slot.surface_config.slot_surface_width + 1}; |
|
|
|
|
|
auto out_luma_height{slot.surface_config.slot_surface_height + 1}; |
|
|
|
|
|
const auto out_luma_stride{out_luma_width}; |
|
|
|
|
|
|
|
|
|
|
|
if(interlaced) { |
|
|
|
|
|
out_luma_height *= 2; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span<const PlaneOffsets> offsets, std::shared_ptr<const FFmpeg::Frame> frame, bool planar, bool interlaced) noexcept { |
|
|
|
|
|
auto const out_luma_width = slot.surface_config.slot_surface_width + 1; |
|
|
|
|
|
auto const out_luma_height = (slot.surface_config.slot_surface_height + 1) * (interlaced ? 2 : 1); |
|
|
|
|
|
auto const out_luma_stride = out_luma_width; |
|
|
|
|
|
|
|
|
slot_surface.resize_destructive(out_luma_width * out_luma_height); |
|
|
|
|
|
|
|
|
|
|
|
const auto in_luma_width{(std::min)(frame->GetWidth(), s32(out_luma_width))}; |
|
|
|
|
|
const auto in_luma_height{(std::min)(frame->GetHeight(), s32(out_luma_height))}; |
|
|
|
|
|
const auto in_luma_stride{frame->GetStride(0)}; |
|
|
|
|
|
|
|
|
const auto in_luma_width = (std::min)(frame->GetWidth(), s32(out_luma_width)); |
|
|
|
|
|
const auto in_luma_height = (std::min)(frame->GetHeight(), s32(out_luma_height)); |
|
|
|
|
|
const auto in_luma_stride = frame->GetStride(0); |
|
|
|
|
|
|
|
|
const auto in_chroma_stride{frame->GetStride(1)}; |
|
|
const auto in_chroma_stride{frame->GetStride(1)}; |
|
|
|
|
|
|
|
|
@ -182,19 +157,20 @@ void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span<const P |
|
|
const auto* chroma_v_buffer{frame->GetPlane(2)}; |
|
|
const auto* chroma_v_buffer{frame->GetPlane(2)}; |
|
|
|
|
|
|
|
|
LOG_TRACE(HW_GPU, |
|
|
LOG_TRACE(HW_GPU, |
|
|
"Reading frame" |
|
|
|
|
|
"\ninput luma {}x{} stride {} chroma {}x{} stride {}\n" |
|
|
|
|
|
"output luma {}x{} stride {} chroma {}x{} stride {}", |
|
|
|
|
|
in_luma_width, in_luma_height, in_luma_stride, in_luma_width / 2, in_luma_height / 2, |
|
|
|
|
|
in_chroma_stride, out_luma_width, out_luma_height, out_luma_stride, out_luma_width, |
|
|
|
|
|
out_luma_height, out_luma_stride); |
|
|
|
|
|
|
|
|
|
|
|
const auto alpha{u16(slot.config.planar_alpha.Value())}; |
|
|
|
|
|
for (s32 y = 0; y < in_luma_height; y++) { |
|
|
|
|
|
const auto src_luma{y * in_luma_stride}; |
|
|
|
|
|
const auto src_chroma{(y / 2) * in_chroma_stride}; |
|
|
|
|
|
const auto dst{y * out_luma_stride}; |
|
|
|
|
|
for (s32 x = 0; x < in_luma_width; x++) { |
|
|
|
|
|
|
|
|
"Reading frame" |
|
|
|
|
|
"\ninput luma {}x{} stride {} chroma {}x{} stride {}\n" |
|
|
|
|
|
"output luma {}x{} stride {} chroma {}x{} stride {}", |
|
|
|
|
|
in_luma_width, in_luma_height, in_luma_stride, in_luma_width / 2, in_luma_height / 2, |
|
|
|
|
|
in_chroma_stride, out_luma_width, out_luma_height, out_luma_stride, out_luma_width, |
|
|
|
|
|
out_luma_height, out_luma_stride); |
|
|
|
|
|
|
|
|
|
|
|
slot_surface.resize_destructive(out_luma_width * out_luma_height); |
|
|
|
|
|
auto const alpha = u16(slot.config.planar_alpha.Value()); |
|
|
|
|
|
for (size_t y = 0; y < size_t(in_luma_height); y++) { |
|
|
|
|
|
auto const src_luma = y * in_luma_stride; |
|
|
|
|
|
auto const src_chroma = (y / 2) * in_chroma_stride; |
|
|
|
|
|
auto const dst = y * out_luma_stride; |
|
|
|
|
|
for (size_t x = 0; x < size_t(in_luma_width); x++) { |
|
|
slot_surface[dst + x].r = u16(luma_buffer[src_luma + x] << 2); |
|
|
slot_surface[dst + x].r = u16(luma_buffer[src_luma + x] << 2); |
|
|
// Chroma samples are duplicated horizontally and vertically.
|
|
|
// Chroma samples are duplicated horizontally and vertically.
|
|
|
if(planar) { |
|
|
if(planar) { |
|
|
@ -209,7 +185,7 @@ void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span<const P |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span<const PlaneOffsets> offsets, std::shared_ptr<const FFmpeg::Frame> frame, bool planar, bool top_field) { |
|
|
|
|
|
|
|
|
void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span<const PlaneOffsets> offsets, std::shared_ptr<const FFmpeg::Frame> frame, bool planar, bool top_field) noexcept { |
|
|
if(!planar) { |
|
|
if(!planar) { |
|
|
ReadProgressiveY8__V8U8_N420(slot, offsets, std::move(frame), planar, true); |
|
|
ReadProgressiveY8__V8U8_N420(slot, offsets, std::move(frame), planar, true); |
|
|
return; |
|
|
return; |
|
|
@ -218,14 +194,13 @@ void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span<const Pl |
|
|
const auto out_luma_height{(slot.surface_config.slot_surface_height + 1) * 2}; |
|
|
const auto out_luma_height{(slot.surface_config.slot_surface_height + 1) * 2}; |
|
|
const auto out_luma_stride{out_luma_width}; |
|
|
const auto out_luma_stride{out_luma_width}; |
|
|
|
|
|
|
|
|
slot_surface.resize_destructive(out_luma_width * out_luma_height); |
|
|
|
|
|
|
|
|
slot_surface.resize(out_luma_width * out_luma_height); |
|
|
|
|
|
|
|
|
const auto in_luma_width{(std::min)(frame->GetWidth(), s32(out_luma_width))}; |
|
|
const auto in_luma_width{(std::min)(frame->GetWidth(), s32(out_luma_width))}; |
|
|
[[maybe_unused]] const auto in_luma_height{ |
|
|
|
|
|
(std::min)(frame->GetHeight(), s32(out_luma_height))}; |
|
|
|
|
|
|
|
|
const auto in_luma_height{(std::min)(frame->GetHeight(), s32(out_luma_height))}; |
|
|
const auto in_luma_stride{frame->GetStride(0)}; |
|
|
const auto in_luma_stride{frame->GetStride(0)}; |
|
|
|
|
|
|
|
|
[[maybe_unused]] const auto in_chroma_width{(frame->GetWidth() + 1) / 2}; |
|
|
|
|
|
|
|
|
const auto in_chroma_width{(frame->GetWidth() + 1) / 2}; |
|
|
const auto in_chroma_height{(frame->GetHeight() + 1) / 2}; |
|
|
const auto in_chroma_height{(frame->GetHeight() + 1) / 2}; |
|
|
const auto in_chroma_stride{frame->GetStride(1)}; |
|
|
const auto in_chroma_stride{frame->GetStride(1)}; |
|
|
|
|
|
|
|
|
@ -233,16 +208,15 @@ void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span<const Pl |
|
|
const auto* chroma_u_buffer{frame->GetPlane(1)}; |
|
|
const auto* chroma_u_buffer{frame->GetPlane(1)}; |
|
|
const auto* chroma_v_buffer{frame->GetPlane(2)}; |
|
|
const auto* chroma_v_buffer{frame->GetPlane(2)}; |
|
|
|
|
|
|
|
|
LOG_TRACE(HW_GPU, |
|
|
|
|
|
"Reading frame" |
|
|
|
|
|
"\ninput luma {}x{} stride {} chroma {}x{} stride {}\n" |
|
|
|
|
|
"output luma {}x{} stride {} chroma {}x{} stride {}", |
|
|
|
|
|
in_luma_width, in_luma_height, in_luma_stride, in_chroma_width, in_chroma_height, |
|
|
|
|
|
in_chroma_stride, out_luma_width, out_luma_height, out_luma_stride, |
|
|
|
|
|
out_luma_width / 2, out_luma_height / 2, out_luma_stride); |
|
|
|
|
|
|
|
|
LOG_TRACE(HW_GPU, "Reading frame" |
|
|
|
|
|
"\ninput luma {}x{} stride {} chroma {}x{} stride {}\n" |
|
|
|
|
|
"output luma {}x{} stride {} chroma {}x{} stride {}", |
|
|
|
|
|
in_luma_width, in_luma_height, in_luma_stride, in_chroma_width, in_chroma_height, |
|
|
|
|
|
in_chroma_stride, out_luma_width, out_luma_height, out_luma_stride, |
|
|
|
|
|
out_luma_width / 2, out_luma_height / 2, out_luma_stride); |
|
|
|
|
|
|
|
|
auto DecodeBobField = [&]() { |
|
|
auto DecodeBobField = [&]() { |
|
|
const auto alpha{u16(slot.config.planar_alpha.Value())}; |
|
|
|
|
|
|
|
|
const auto alpha = u16(slot.config.planar_alpha.Value()); |
|
|
for (s32 y = s32(top_field == false); y < in_chroma_height * 2; y += 2) { |
|
|
for (s32 y = s32(top_field == false); y < in_chroma_height * 2; y += 2) { |
|
|
const auto src_luma{y * in_luma_stride}; |
|
|
const auto src_luma{y * in_luma_stride}; |
|
|
const auto src_chroma{(y / 2) * in_chroma_stride}; |
|
|
const auto src_chroma{(y / 2) * in_chroma_stride}; |
|
|
@ -278,12 +252,12 @@ void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span<const Pl |
|
|
DecodeBobField(); |
|
|
DecodeBobField(); |
|
|
break; |
|
|
break; |
|
|
default: |
|
|
default: |
|
|
UNIMPLEMENTED_MSG("Deinterlace mode {} not implemented!", s32(slot.config.deinterlace_mode.Value())); |
|
|
|
|
|
|
|
|
LOG_ERROR(HW_GPU, "Deinterlace mode {} not implemented!", s32(slot.config.deinterlace_mode.Value())); |
|
|
break; |
|
|
break; |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
void Vic::ReadY8__V8U8_N420(const SlotStruct& slot, std::span<const PlaneOffsets> offsets, std::shared_ptr<const FFmpeg::Frame> frame, bool planar) { |
|
|
|
|
|
|
|
|
void Vic::ReadY8__V8U8_N420(const SlotStruct& slot, std::span<const PlaneOffsets> offsets, std::shared_ptr<const FFmpeg::Frame> frame, bool planar) noexcept { |
|
|
switch (slot.config.frame_format) { |
|
|
switch (slot.config.frame_format) { |
|
|
case DXVAHD_FRAME_FORMAT::PROGRESSIVE: |
|
|
case DXVAHD_FRAME_FORMAT::PROGRESSIVE: |
|
|
ReadProgressiveY8__V8U8_N420(slot, offsets, std::move(frame), planar, false); |
|
|
ReadProgressiveY8__V8U8_N420(slot, offsets, std::move(frame), planar, false); |
|
|
@ -295,29 +269,30 @@ void Vic::ReadY8__V8U8_N420(const SlotStruct& slot, std::span<const PlaneOffsets |
|
|
ReadInterlacedY8__V8U8_N420(slot, offsets, std::move(frame), planar, false); |
|
|
ReadInterlacedY8__V8U8_N420(slot, offsets, std::move(frame), planar, false); |
|
|
break; |
|
|
break; |
|
|
default: |
|
|
default: |
|
|
LOG_ERROR(HW_GPU, "Unknown deinterlace format {}", |
|
|
|
|
|
s32(slot.config.frame_format.Value())); |
|
|
|
|
|
|
|
|
LOG_ERROR(HW_GPU, "Unknown deinterlace format {}", s32(slot.config.frame_format.Value())); |
|
|
break; |
|
|
break; |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
void Vic::Blend(const ConfigStruct& config, const SlotStruct& slot) { |
|
|
|
|
|
constexpr auto add_one([](u32 v) -> u32 { return v != 0 ? v + 1 : 0; }); |
|
|
|
|
|
|
|
|
void Vic::Blend(const ConfigStruct& config, const SlotStruct& slot, VideoPixelFormat format) noexcept { |
|
|
|
|
|
auto const add_one = [](u32 const v) -> u32 { |
|
|
|
|
|
return v + u32(v > 0); |
|
|
|
|
|
}; |
|
|
|
|
|
|
|
|
auto source_left{add_one(u32(slot.config.source_rect_left.Value()))}; |
|
|
|
|
|
auto source_right{add_one(u32(slot.config.source_rect_right.Value()))}; |
|
|
|
|
|
auto source_top{add_one(u32(slot.config.source_rect_top.Value()))}; |
|
|
|
|
|
auto source_bottom{add_one(u32(slot.config.source_rect_bottom.Value()))}; |
|
|
|
|
|
|
|
|
auto source_left = add_one(u32(slot.config.source_rect_left.Value())); |
|
|
|
|
|
auto source_right = add_one(u32(slot.config.source_rect_right.Value())); |
|
|
|
|
|
auto source_top = add_one(u32(slot.config.source_rect_top.Value())); |
|
|
|
|
|
auto source_bottom = add_one(u32(slot.config.source_rect_bottom.Value())); |
|
|
|
|
|
|
|
|
const auto dest_left{add_one(u32(slot.config.dest_rect_left.Value()))}; |
|
|
|
|
|
const auto dest_right{add_one(u32(slot.config.dest_rect_right.Value()))}; |
|
|
|
|
|
const auto dest_top{add_one(u32(slot.config.dest_rect_top.Value()))}; |
|
|
|
|
|
const auto dest_bottom{add_one(u32(slot.config.dest_rect_bottom.Value()))}; |
|
|
|
|
|
|
|
|
auto const dest_left = add_one(u32(slot.config.dest_rect_left.Value())); |
|
|
|
|
|
auto const dest_right = add_one(u32(slot.config.dest_rect_right.Value())); |
|
|
|
|
|
auto const dest_top = add_one(u32(slot.config.dest_rect_top.Value())); |
|
|
|
|
|
auto const dest_bottom = add_one(u32(slot.config.dest_rect_bottom.Value())); |
|
|
|
|
|
|
|
|
auto rect_left{add_one(config.output_config.target_rect_left.Value())}; |
|
|
|
|
|
auto rect_right{add_one(config.output_config.target_rect_right.Value())}; |
|
|
|
|
|
auto rect_top{add_one(config.output_config.target_rect_top.Value())}; |
|
|
|
|
|
auto rect_bottom{add_one(config.output_config.target_rect_bottom.Value())}; |
|
|
|
|
|
|
|
|
auto rect_left = add_one(u32(config.output_config.target_rect_left.Value())); |
|
|
|
|
|
auto rect_right = add_one(u32(config.output_config.target_rect_right.Value())); |
|
|
|
|
|
auto rect_top = add_one(u32(config.output_config.target_rect_top.Value())); |
|
|
|
|
|
auto rect_bottom = add_one(u32(config.output_config.target_rect_bottom.Value())); |
|
|
|
|
|
|
|
|
rect_left = (std::max)(rect_left, dest_left); |
|
|
rect_left = (std::max)(rect_left, dest_left); |
|
|
rect_right = (std::min)(rect_right, dest_right); |
|
|
rect_right = (std::min)(rect_right, dest_right); |
|
|
@ -329,94 +304,74 @@ void Vic::Blend(const ConfigStruct& config, const SlotStruct& slot) { |
|
|
source_top = (std::max)(source_top, rect_top); |
|
|
source_top = (std::max)(source_top, rect_top); |
|
|
source_bottom = (std::min)(source_bottom, rect_bottom); |
|
|
source_bottom = (std::min)(source_bottom, rect_bottom); |
|
|
|
|
|
|
|
|
if (source_left >= source_right || source_top >= source_bottom) { |
|
|
|
|
|
return; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
const auto out_surface_width{config.output_surface_config.out_surface_width + 1}; |
|
|
|
|
|
[[maybe_unused]] const auto out_surface_height{config.output_surface_config.out_surface_height + |
|
|
|
|
|
1}; |
|
|
|
|
|
const auto in_surface_width{slot.surface_config.slot_surface_width + 1}; |
|
|
|
|
|
|
|
|
auto const out_surface_width = config.output_surface_config.out_surface_width + 1; |
|
|
|
|
|
auto const out_surface_height = config.output_surface_config.out_surface_height + 1; |
|
|
|
|
|
auto const in_surface_width = slot.surface_config.slot_surface_width + 1; |
|
|
|
|
|
|
|
|
source_bottom = (std::min)(source_bottom, out_surface_height); |
|
|
source_bottom = (std::min)(source_bottom, out_surface_height); |
|
|
source_right = (std::min)(source_right, out_surface_width); |
|
|
source_right = (std::min)(source_right, out_surface_width); |
|
|
|
|
|
|
|
|
|
|
|
auto const work_width = u32((std::max)(0, s32(source_right) - s32(source_left))); |
|
|
|
|
|
auto const work_height = u32((std::max)(0, s32(source_bottom) - s32(source_top))); |
|
|
|
|
|
|
|
|
// TODO Alpha blending. No games I've seen use more than a single surface or supply an alpha
|
|
|
// TODO Alpha blending. No games I've seen use more than a single surface or supply an alpha
|
|
|
// below max, so it's ignored for now.
|
|
|
// below max, so it's ignored for now.
|
|
|
|
|
|
|
|
|
if (!slot.color_matrix.matrix_enable) { |
|
|
|
|
|
const auto copy_width = (std::min)(source_right - source_left, rect_right - rect_left); |
|
|
|
|
|
|
|
|
|
|
|
for (u32 y = source_top; y < source_bottom; y++) { |
|
|
|
|
|
const auto dst_line = y * out_surface_width; |
|
|
|
|
|
const auto src_line = y * in_surface_width; |
|
|
|
|
|
std::memcpy(&output_surface[dst_line + rect_left], |
|
|
|
|
|
&slot_surface[src_line + source_left], copy_width * sizeof(Pixel)); |
|
|
|
|
|
} |
|
|
|
|
|
} else { |
|
|
|
|
|
|
|
|
if (slot.color_matrix.matrix_enable) { |
|
|
// clang-format off
|
|
|
// clang-format off
|
|
|
// Colour conversion is enabled, this is a 3x4 * 4x1 matrix multiplication, resulting in a 3x1 matrix.
|
|
|
// Colour conversion is enabled, this is a 3x4 * 4x1 matrix multiplication, resulting in a 3x1 matrix.
|
|
|
// | r0c0 r0c1 r0c2 r0c3 | | R | | R |
|
|
|
// | r0c0 r0c1 r0c2 r0c3 | | R | | R |
|
|
|
// | r1c0 r1c1 r1c2 r1c3 | * | G | = | G |
|
|
|
// | r1c0 r1c1 r1c2 r1c3 | * | G | = | G |
|
|
|
// | r2c0 r2c1 r2c2 r2c3 | | B | | B |
|
|
|
// | r2c0 r2c1 r2c2 r2c3 | | B | | B |
|
|
|
// | 1 |
|
|
|
// | 1 |
|
|
|
const auto r0c0 = s32(slot.color_matrix.matrix_coeff00.Value()); |
|
|
|
|
|
const auto r0c1 = s32(slot.color_matrix.matrix_coeff01.Value()); |
|
|
|
|
|
const auto r0c2 = s32(slot.color_matrix.matrix_coeff02.Value()); |
|
|
|
|
|
const auto r0c3 = s32(slot.color_matrix.matrix_coeff03.Value()); |
|
|
|
|
|
const auto r1c0 = s32(slot.color_matrix.matrix_coeff10.Value()); |
|
|
|
|
|
const auto r1c1 = s32(slot.color_matrix.matrix_coeff11.Value()); |
|
|
|
|
|
const auto r1c2 = s32(slot.color_matrix.matrix_coeff12.Value()); |
|
|
|
|
|
const auto r1c3 = s32(slot.color_matrix.matrix_coeff13.Value()); |
|
|
|
|
|
const auto r2c0 = s32(slot.color_matrix.matrix_coeff20.Value()); |
|
|
|
|
|
const auto r2c1 = s32(slot.color_matrix.matrix_coeff21.Value()); |
|
|
|
|
|
const auto r2c2 = s32(slot.color_matrix.matrix_coeff22.Value()); |
|
|
|
|
|
const auto r2c3 = s32(slot.color_matrix.matrix_coeff23.Value()); |
|
|
|
|
|
|
|
|
|
|
|
const auto shift = s32(slot.color_matrix.matrix_r_shift.Value()); |
|
|
|
|
|
const auto clamp_min = s32(slot.config.soft_clamp_low.Value()); |
|
|
|
|
|
const auto clamp_max = s32(slot.config.soft_clamp_high.Value()); |
|
|
|
|
|
|
|
|
|
|
|
auto MatMul = [&](const Pixel& in_pixel) -> std::tuple<s32, s32, s32, s32> { |
|
|
|
|
|
auto r = s32(in_pixel.r); |
|
|
|
|
|
auto g = s32(in_pixel.g); |
|
|
|
|
|
auto b = s32(in_pixel.b); |
|
|
|
|
|
|
|
|
|
|
|
r = in_pixel.r * r0c0 + in_pixel.g * r0c1 + in_pixel.b * r0c2; |
|
|
|
|
|
g = in_pixel.r * r1c0 + in_pixel.g * r1c1 + in_pixel.b * r1c2; |
|
|
|
|
|
b = in_pixel.r * r2c0 + in_pixel.g * r2c1 + in_pixel.b * r2c2; |
|
|
|
|
|
|
|
|
|
|
|
r >>= shift; |
|
|
|
|
|
g >>= shift; |
|
|
|
|
|
b >>= shift; |
|
|
|
|
|
|
|
|
|
|
|
r += r0c3; |
|
|
|
|
|
g += r1c3; |
|
|
|
|
|
b += r2c3; |
|
|
|
|
|
|
|
|
|
|
|
r >>= 8; |
|
|
|
|
|
g >>= 8; |
|
|
|
|
|
b >>= 8; |
|
|
|
|
|
|
|
|
|
|
|
return {r, g, b, s32(in_pixel.a)}; |
|
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
for (u32 y = source_top; y < source_bottom; y++) { |
|
|
|
|
|
const auto src{y * in_surface_width + source_left}; |
|
|
|
|
|
const auto dst{y * out_surface_width + rect_left}; |
|
|
|
|
|
for (u32 x = source_left; x < source_right; x++) { |
|
|
|
|
|
auto [r, g, b, a] = MatMul(slot_surface[src + x]); |
|
|
|
|
|
r = std::clamp(r, clamp_min, clamp_max); |
|
|
|
|
|
g = std::clamp(g, clamp_min, clamp_max); |
|
|
|
|
|
b = std::clamp(b, clamp_min, clamp_max); |
|
|
|
|
|
a = std::clamp(a, clamp_min, clamp_max); |
|
|
|
|
|
output_surface[dst + x] = {u16(r), u16(g), u16(b), u16(a)}; |
|
|
|
|
|
|
|
|
auto const shift = s32(slot.color_matrix.matrix_r_shift.Value()); |
|
|
|
|
|
|
|
|
|
|
|
struct AliasedMatrixType { u64 m[4]; }; |
|
|
|
|
|
static_assert(sizeof(AliasedMatrixType) == sizeof(slot.color_matrix)); |
|
|
|
|
|
u64 const mat_mask = (1 << 20) - 1; |
|
|
|
|
|
auto const* amt = reinterpret_cast<AliasedMatrixType const*>(&slot.color_matrix); |
|
|
|
|
|
|
|
|
|
|
|
constexpr s32 shifts[4] = { 0, 20, 40, 60 }; |
|
|
|
|
|
s32 mr[4][4]; |
|
|
|
|
|
for (u32 j = 0; j < 3; ++j) |
|
|
|
|
|
for (u32 i = 0; i < 4; ++i) |
|
|
|
|
|
mr[j][i] = s64(((amt->m[i] >> shifts[j]) & mat_mask) << (64 - 20)) >> (64 - 20); |
|
|
|
|
|
|
|
|
|
|
|
auto const clamp_min = s32(slot.config.soft_clamp_low.Value()); |
|
|
|
|
|
auto const clamp_max = s32(slot.config.soft_clamp_high.Value()); |
|
|
|
|
|
for (u32 y = 0; y < work_height; ++y) { |
|
|
|
|
|
auto const src = (y + source_top) * in_surface_width + source_left; |
|
|
|
|
|
auto const dst = (y + source_top) * out_surface_width + rect_left; |
|
|
|
|
|
for (u32 x = 0; x < work_width; ++x) { |
|
|
|
|
|
auto const& in_pixel = slot_surface[src + x]; |
|
|
|
|
|
auto& out_pixel = output_surface[dst + x]; |
|
|
|
|
|
s32 const mul_values[4] = { |
|
|
|
|
|
in_pixel.r * mr[0][0] + in_pixel.g * mr[1][1] + in_pixel.b * mr[0][2], |
|
|
|
|
|
in_pixel.r * mr[1][0] + in_pixel.g * mr[1][1] + in_pixel.b * mr[1][2], |
|
|
|
|
|
in_pixel.r * mr[2][0] + in_pixel.g * mr[2][1] + in_pixel.b * mr[2][2], |
|
|
|
|
|
s32(in_pixel.a) |
|
|
|
|
|
}; |
|
|
|
|
|
s32 const mul_clamp[4] = { |
|
|
|
|
|
std::clamp(((mul_values[0] >> shift) + mr[0][3]) >> 8, clamp_min, clamp_max), |
|
|
|
|
|
std::clamp(((mul_values[1] >> shift) + mr[1][3]) >> 8, clamp_min, clamp_max), |
|
|
|
|
|
std::clamp(((mul_values[2] >> shift) + mr[2][3]) >> 8, clamp_min, clamp_max), |
|
|
|
|
|
std::clamp(mul_values[3], clamp_min, clamp_max) |
|
|
|
|
|
}; |
|
|
|
|
|
out_pixel = format == VideoPixelFormat::A8R8G8B8 |
|
|
|
|
|
? Pixel(u16(mul_clamp[2]), u16(mul_clamp[1]), u16(mul_clamp[0]), u16(mul_clamp[3])) |
|
|
|
|
|
: Pixel(u16(mul_clamp[0]), u16(mul_clamp[1]), u16(mul_clamp[2]), u16(mul_clamp[3])); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
} else { |
|
|
|
|
|
auto const copy_width = (std::min)(source_right - source_left, rect_right - rect_left); |
|
|
|
|
|
for (u32 y = 0; y < work_height; y++) { |
|
|
|
|
|
auto const dst_line = (y + source_top) * out_surface_width; |
|
|
|
|
|
auto const src_line = (y + source_top) * in_surface_width; |
|
|
|
|
|
std::memcpy(&output_surface[dst_line + rect_left], &slot_surface[src_line + source_left], copy_width * sizeof(Pixel)); |
|
|
|
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) { |
|
|
|
|
|
|
|
|
void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) noexcept { |
|
|
constexpr u32 BytesPerPixel = 1; |
|
|
constexpr u32 BytesPerPixel = 1; |
|
|
|
|
|
|
|
|
auto surface_width{output_surface_config.out_surface_width + 1}; |
|
|
auto surface_width{output_surface_config.out_surface_width + 1}; |
|
|
@ -437,35 +392,27 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) { |
|
|
surface_height = (std::min)(surface_height, out_luma_height); |
|
|
surface_height = (std::min)(surface_height, out_luma_height); |
|
|
|
|
|
|
|
|
auto Decode = [&](std::span<u8> out_luma, std::span<u8> out_chroma) { |
|
|
auto Decode = [&](std::span<u8> out_luma, std::span<u8> out_chroma) { |
|
|
for (u32 y = 0; y < surface_height; ++y) { |
|
|
|
|
|
const auto src_luma = y * surface_stride; |
|
|
|
|
|
const auto dst_luma = y * out_luma_stride; |
|
|
|
|
|
const auto src_chroma = y * surface_stride; |
|
|
|
|
|
const auto dst_chroma = (y / 2) * out_chroma_stride; |
|
|
|
|
|
for (u32 x = 0; x < surface_width; x += 2) { |
|
|
|
|
|
out_luma[dst_luma + x + 0] = |
|
|
|
|
|
u8(output_surface[src_luma + x + 0].r >> 2); |
|
|
|
|
|
out_luma[dst_luma + x + 1] = |
|
|
|
|
|
u8(output_surface[src_luma + x + 1].r >> 2); |
|
|
|
|
|
out_chroma[dst_chroma + x + 0] = |
|
|
|
|
|
u8(output_surface[src_chroma + x].g >> 2); |
|
|
|
|
|
out_chroma[dst_chroma + x + 1] = |
|
|
|
|
|
u8(output_surface[src_chroma + x].b >> 2); |
|
|
|
|
|
|
|
|
for (size_t y = 0; y < surface_height; ++y) { |
|
|
|
|
|
auto const src_luma = y * surface_stride; |
|
|
|
|
|
auto const dst_luma = y * out_luma_stride; |
|
|
|
|
|
auto const src_chroma = y * surface_stride; |
|
|
|
|
|
auto const dst_chroma = (y / 2) * out_chroma_stride; |
|
|
|
|
|
for (size_t x = 0; x < surface_width; x += 2) { |
|
|
|
|
|
out_luma[dst_luma + x + 0] = u8(output_surface[src_luma + x + 0].r >> 2); |
|
|
|
|
|
out_luma[dst_luma + x + 1] = u8(output_surface[src_luma + x + 1].r >> 2); |
|
|
|
|
|
out_chroma[dst_chroma + x + 0] = u8(output_surface[src_chroma + x].g >> 2); |
|
|
|
|
|
out_chroma[dst_chroma + x + 1] = u8(output_surface[src_chroma + x].b >> 2); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
}; |
|
|
}; |
|
|
|
|
|
|
|
|
switch (output_surface_config.out_block_kind) { |
|
|
switch (output_surface_config.out_block_kind) { |
|
|
case BLK_KIND::GENERIC_16Bx2: { |
|
|
case BLK_KIND::GENERIC_16Bx2: { |
|
|
const u32 block_height = u32(output_surface_config.out_block_height); |
|
|
|
|
|
const auto out_luma_swizzle_size = Texture::CalculateSize( |
|
|
|
|
|
true, BytesPerPixel, out_luma_width, out_luma_height, 1, block_height, 0); |
|
|
|
|
|
const auto out_chroma_swizzle_size = Texture::CalculateSize( |
|
|
|
|
|
true, BytesPerPixel * 2, out_chroma_width, out_chroma_height, 1, block_height, 0); |
|
|
|
|
|
|
|
|
|
|
|
LOG_TRACE( |
|
|
|
|
|
HW_GPU, |
|
|
|
|
|
"Writing Y8__V8U8_N420 swizzled frame\n" |
|
|
|
|
|
|
|
|
u32 const block_height = u32(output_surface_config.out_block_height); |
|
|
|
|
|
auto const out_luma_swizzle_size = Texture::CalculateSize(true, BytesPerPixel, out_luma_width, out_luma_height, 1, block_height, 0); |
|
|
|
|
|
auto const out_chroma_swizzle_size = Texture::CalculateSize(true, BytesPerPixel * 2, out_chroma_width, out_chroma_height, 1, block_height, 0); |
|
|
|
|
|
|
|
|
|
|
|
LOG_TRACE(HW_GPU, "Writing Y8__V8U8_N420 swizzled frame\n" |
|
|
"\tinput surface {}x{} stride {} size {:#X}\n" |
|
|
"\tinput surface {}x{} stride {} size {:#X}\n" |
|
|
"\toutput luma {}x{} stride {} size {:#X} block height {} swizzled size 0x{:X}\n", |
|
|
"\toutput luma {}x{} stride {} size {:#X} block height {} swizzled size 0x{:X}\n", |
|
|
"\toutput chroma {}x{} stride {} size {:#X} block height {} swizzled size 0x{:X}", |
|
|
"\toutput chroma {}x{} stride {} size {:#X} block height {} swizzled size 0x{:X}", |
|
|
@ -477,32 +424,20 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) { |
|
|
|
|
|
|
|
|
luma_scratch.resize_destructive(out_luma_size); |
|
|
luma_scratch.resize_destructive(out_luma_size); |
|
|
chroma_scratch.resize_destructive(out_chroma_size); |
|
|
chroma_scratch.resize_destructive(out_chroma_size); |
|
|
|
|
|
|
|
|
Decode(luma_scratch, chroma_scratch); |
|
|
Decode(luma_scratch, chroma_scratch); |
|
|
|
|
|
|
|
|
Tegra::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeWrite> out_luma( |
|
|
|
|
|
memory_manager, regs.output_surface.luma.Address(), out_luma_swizzle_size, |
|
|
|
|
|
&swizzle_scratch); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Tegra::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeWrite> out_luma(memory_manager, regs.output_surface.luma.Address(), out_luma_swizzle_size, &swizzle_scratch); |
|
|
|
|
|
Tegra::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeWrite> out_chroma(memory_manager, regs.output_surface.chroma_u.Address(), out_chroma_swizzle_size, &swizzle_scratch); |
|
|
if (block_height == 1) { |
|
|
if (block_height == 1) { |
|
|
SwizzleSurface(out_luma, out_luma_stride, luma_scratch, out_luma_stride, out_luma_height); |
|
|
SwizzleSurface(out_luma, out_luma_stride, luma_scratch, out_luma_stride, out_luma_height); |
|
|
} else { |
|
|
|
|
|
Texture::SwizzleTexture(out_luma, luma_scratch, BytesPerPixel, out_luma_width, out_luma_height, 1, block_height, 0, 1); |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
Tegra::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeWrite> |
|
|
|
|
|
out_chroma(memory_manager, regs.output_surface.chroma_u.Address(), out_chroma_swizzle_size, &swizzle_scratch); |
|
|
|
|
|
|
|
|
|
|
|
if (block_height == 1) { |
|
|
|
|
|
SwizzleSurface(out_chroma, out_chroma_stride, chroma_scratch, out_chroma_stride, out_chroma_height); |
|
|
SwizzleSurface(out_chroma, out_chroma_stride, chroma_scratch, out_chroma_stride, out_chroma_height); |
|
|
} else { |
|
|
} else { |
|
|
|
|
|
Texture::SwizzleTexture(out_luma, luma_scratch, BytesPerPixel, out_luma_width, out_luma_height, 1, block_height, 0, 1); |
|
|
Texture::SwizzleTexture(out_chroma, chroma_scratch, BytesPerPixel, out_chroma_width, out_chroma_height, 1, block_height, 0, 1); |
|
|
Texture::SwizzleTexture(out_chroma, chroma_scratch, BytesPerPixel, out_chroma_width, out_chroma_height, 1, block_height, 0, 1); |
|
|
} |
|
|
} |
|
|
} break; |
|
|
} break; |
|
|
case BLK_KIND::PITCH: { |
|
|
case BLK_KIND::PITCH: { |
|
|
LOG_TRACE( |
|
|
|
|
|
HW_GPU, |
|
|
|
|
|
"Writing Y8__V8U8_N420 swizzled frame\n" |
|
|
|
|
|
|
|
|
LOG_TRACE(HW_GPU, "Writing Y8__V8U8_N420 swizzled frame\n" |
|
|
"\tinput surface {}x{} stride {} size {:#X}\n" |
|
|
"\tinput surface {}x{} stride {} size {:#X}\n" |
|
|
"\toutput luma {}x{} stride {} size {:#X} block height {} swizzled size 0x{:X}\n", |
|
|
"\toutput luma {}x{} stride {} size {:#X} block height {} swizzled size 0x{:X}\n", |
|
|
"\toutput chroma {}x{} stride {} size {:#X} block height {} swizzled size 0x{:X}", |
|
|
"\toutput chroma {}x{} stride {} size {:#X} block height {} swizzled size 0x{:X}", |
|
|
@ -529,12 +464,12 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) { |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config, VideoPixelFormat format) { |
|
|
|
|
|
|
|
|
void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config) noexcept { |
|
|
constexpr u32 BytesPerPixel = 4; |
|
|
constexpr u32 BytesPerPixel = 4; |
|
|
|
|
|
|
|
|
auto surface_width{output_surface_config.out_surface_width + 1}; |
|
|
|
|
|
auto surface_height{output_surface_config.out_surface_height + 1}; |
|
|
|
|
|
const auto surface_stride{surface_width}; |
|
|
|
|
|
|
|
|
auto surface_width = output_surface_config.out_surface_width + 1; |
|
|
|
|
|
auto surface_height = output_surface_config.out_surface_height + 1; |
|
|
|
|
|
auto const surface_stride = surface_width; |
|
|
|
|
|
|
|
|
const auto out_luma_width = output_surface_config.out_luma_width + 1; |
|
|
const auto out_luma_width = output_surface_config.out_luma_width + 1; |
|
|
const auto out_luma_height = output_surface_config.out_luma_height + 1; |
|
|
const auto out_luma_height = output_surface_config.out_luma_height + 1; |
|
|
@ -544,22 +479,14 @@ void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config, VideoPixel |
|
|
surface_width = (std::min)(surface_width, out_luma_width); |
|
|
surface_width = (std::min)(surface_width, out_luma_width); |
|
|
surface_height = (std::min)(surface_height, out_luma_height); |
|
|
surface_height = (std::min)(surface_height, out_luma_height); |
|
|
|
|
|
|
|
|
auto Decode = [&](std::span<u8> out_buffer) { |
|
|
|
|
|
for (u32 y = 0; y < surface_height; y++) { |
|
|
|
|
|
const auto src = y * surface_stride; |
|
|
|
|
|
const auto dst = y * out_luma_stride; |
|
|
|
|
|
for (u32 x = 0; x < surface_width; x++) { |
|
|
|
|
|
if(format == VideoPixelFormat::A8R8G8B8) { |
|
|
|
|
|
out_buffer[dst + x * 4 + 0] = u8(output_surface[src + x].b >> 2); |
|
|
|
|
|
out_buffer[dst + x * 4 + 1] = u8(output_surface[src + x].g >> 2); |
|
|
|
|
|
out_buffer[dst + x * 4 + 2] = u8(output_surface[src + x].r >> 2); |
|
|
|
|
|
out_buffer[dst + x * 4 + 3] = u8(output_surface[src + x].a >> 2); |
|
|
|
|
|
} else { |
|
|
|
|
|
out_buffer[dst + x * 4 + 0] = u8(output_surface[src + x].r >> 2); |
|
|
|
|
|
out_buffer[dst + x * 4 + 1] = u8(output_surface[src + x].g >> 2); |
|
|
|
|
|
out_buffer[dst + x * 4 + 2] = u8(output_surface[src + x].b >> 2); |
|
|
|
|
|
out_buffer[dst + x * 4 + 3] = u8(output_surface[src + x].a >> 2); |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
auto Decode = [&](std::span<u8> s1, std::span<Pixel> s2) { |
|
|
|
|
|
for (size_t y = 0; y < surface_height; ++y) { |
|
|
|
|
|
auto const src = y * surface_stride, dst = y * out_luma_stride; |
|
|
|
|
|
for (size_t x = 0; x < surface_width; ++x) { |
|
|
|
|
|
s1[dst + x * 4 + 0] = u8(s2[src + x].r >> 2); |
|
|
|
|
|
s1[dst + x * 4 + 1] = u8(s2[src + x].g >> 2); |
|
|
|
|
|
s1[dst + x * 4 + 2] = u8(s2[src + x].b >> 2); |
|
|
|
|
|
s1[dst + x * 4 + 3] = u8(s2[src + x].a >> 2); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
}; |
|
|
}; |
|
|
@ -567,47 +494,33 @@ void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config, VideoPixel |
|
|
switch (output_surface_config.out_block_kind) { |
|
|
switch (output_surface_config.out_block_kind) { |
|
|
case BLK_KIND::GENERIC_16Bx2: { |
|
|
case BLK_KIND::GENERIC_16Bx2: { |
|
|
const u32 block_height = u32(output_surface_config.out_block_height); |
|
|
const u32 block_height = u32(output_surface_config.out_block_height); |
|
|
const auto out_swizzle_size = Texture::CalculateSize(true, BytesPerPixel, out_luma_width, |
|
|
|
|
|
out_luma_height, 1, block_height, 0); |
|
|
|
|
|
|
|
|
|
|
|
LOG_TRACE( |
|
|
|
|
|
HW_GPU, |
|
|
|
|
|
"Writing ABGR swizzled frame\n" |
|
|
|
|
|
|
|
|
const auto out_swizzle_size = Texture::CalculateSize(true, BytesPerPixel, out_luma_width, out_luma_height, 1, block_height, 0); |
|
|
|
|
|
LOG_TRACE(HW_GPU, "Writing ABGR swizzled frame\n" |
|
|
"\tinput surface {}x{} stride {} size {:#X}\n" |
|
|
"\tinput surface {}x{} stride {} size {:#X}\n" |
|
|
"\toutput surface {}x{} stride {} size {:#X} block height {} swizzled size 0x{:X}", |
|
|
"\toutput surface {}x{} stride {} size {:#X} block height {} swizzled size 0x{:X}", |
|
|
surface_width, surface_height, surface_stride * BytesPerPixel, |
|
|
surface_width, surface_height, surface_stride * BytesPerPixel, |
|
|
surface_stride * surface_height * BytesPerPixel, out_luma_width, out_luma_height, |
|
|
surface_stride * surface_height * BytesPerPixel, out_luma_width, out_luma_height, |
|
|
out_luma_stride, out_luma_size, block_height, out_swizzle_size); |
|
|
out_luma_stride, out_luma_size, block_height, out_swizzle_size); |
|
|
|
|
|
|
|
|
luma_scratch.resize_destructive(out_luma_size); |
|
|
luma_scratch.resize_destructive(out_luma_size); |
|
|
|
|
|
Decode(luma_scratch, output_surface); |
|
|
|
|
|
|
|
|
Decode(luma_scratch); |
|
|
|
|
|
|
|
|
|
|
|
Tegra::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeWrite> out_luma( |
|
|
|
|
|
memory_manager, regs.output_surface.luma.Address(), out_swizzle_size, &swizzle_scratch); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Tegra::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeWrite> out_luma(memory_manager, regs.output_surface.luma.Address(), out_swizzle_size, &swizzle_scratch); |
|
|
if (block_height == 1) { |
|
|
if (block_height == 1) { |
|
|
SwizzleSurface(out_luma, out_luma_stride, luma_scratch, out_luma_stride, out_luma_height); |
|
|
SwizzleSurface(out_luma, out_luma_stride, luma_scratch, out_luma_stride, out_luma_height); |
|
|
} else { |
|
|
} else { |
|
|
Texture::SwizzleTexture(out_luma, luma_scratch, BytesPerPixel, out_luma_width, out_luma_height, 1, block_height, 0, 1); |
|
|
Texture::SwizzleTexture(out_luma, luma_scratch, BytesPerPixel, out_luma_width, out_luma_height, 1, block_height, 0, 1); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
} break; |
|
|
} break; |
|
|
case BLK_KIND::PITCH: { |
|
|
case BLK_KIND::PITCH: { |
|
|
LOG_TRACE(HW_GPU, |
|
|
|
|
|
"Writing ABGR pitch frame\n" |
|
|
|
|
|
|
|
|
LOG_TRACE(HW_GPU, "Writing ABGR pitch frame\n" |
|
|
"\tinput surface {}x{} stride {} size {:#X}" |
|
|
"\tinput surface {}x{} stride {} size {:#X}" |
|
|
"\toutput surface {}x{} stride {} size {:#X}", |
|
|
"\toutput surface {}x{} stride {} size {:#X}", |
|
|
surface_width, surface_height, surface_stride, |
|
|
surface_width, surface_height, surface_stride, |
|
|
surface_stride * surface_height * BytesPerPixel, out_luma_width, out_luma_height, |
|
|
surface_stride * surface_height * BytesPerPixel, out_luma_width, out_luma_height, |
|
|
out_luma_stride, out_luma_size); |
|
|
out_luma_stride, out_luma_size); |
|
|
|
|
|
|
|
|
luma_scratch.resize_destructive(out_luma_size); |
|
|
luma_scratch.resize_destructive(out_luma_size); |
|
|
|
|
|
|
|
|
Tegra::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeWrite> out_luma( |
|
|
|
|
|
memory_manager, regs.output_surface.luma.Address(), out_luma_size, &luma_scratch); |
|
|
|
|
|
|
|
|
|
|
|
Decode(out_luma); |
|
|
|
|
|
|
|
|
Tegra::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeWrite> out_luma(memory_manager, regs.output_surface.luma.Address(), out_luma_size, &luma_scratch); |
|
|
|
|
|
Decode(out_luma, output_surface); |
|
|
} break; |
|
|
} break; |
|
|
default: |
|
|
default: |
|
|
UNREACHABLE(); |
|
|
UNREACHABLE(); |
|
|
|