Browse Source

Added GPU accelerated texture unswizzle

Added broken system that attempts to partial upload sparse textures
Various Vulkan optimizations
Ignore remapped sparse textures (Breaks most games that use them)
pull/3246/head
Forrest Keller 1 month ago
committed by crueter
parent
commit
bda5b973de
  1. 1
      src/video_core/host_shaders/CMakeLists.txt
  2. 160
      src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp
  3. 14
      src/video_core/memory_manager.cpp
  4. 5
      src/video_core/rasterizer_interface.h
  5. 5
      src/video_core/renderer_null/null_rasterizer.cpp
  6. 5
      src/video_core/renderer_null/null_rasterizer.h
  7. 4
      src/video_core/renderer_opengl/gl_rasterizer.cpp
  8. 5
      src/video_core/renderer_opengl/gl_rasterizer.h
  9. 5
      src/video_core/renderer_opengl/gl_texture_cache.cpp
  10. 10
      src/video_core/renderer_opengl/gl_texture_cache.h
  11. 230
      src/video_core/renderer_vulkan/vk_compute_pass.cpp
  12. 23
      src/video_core/renderer_vulkan/vk_compute_pass.h
  13. 4
      src/video_core/renderer_vulkan/vk_rasterizer.cpp
  14. 2
      src/video_core/renderer_vulkan/vk_rasterizer.h
  15. 10
      src/video_core/renderer_vulkan/vk_scheduler.cpp
  16. 2
      src/video_core/renderer_vulkan/vk_scheduler.h
  17. 97
      src/video_core/renderer_vulkan/vk_texture_cache.cpp
  18. 20
      src/video_core/renderer_vulkan/vk_texture_cache.h
  19. 14
      src/video_core/texture_cache/image_base.h
  20. 256
      src/video_core/texture_cache/texture_cache.h
  21. 27
      src/video_core/texture_cache/texture_cache_base.h
  22. 41
      src/video_core/texture_cache/util.cpp
  23. 14
      src/video_core/texture_cache/util.h

1
src/video_core/host_shaders/CMakeLists.txt

@ -18,6 +18,7 @@ set(SHADER_FILES
blit_color_float.frag
block_linear_unswizzle_2d.comp
block_linear_unswizzle_3d.comp
block_linear_unswizzle_3d_bcn.comp
convert_abgr8_srgb_to_d24s8.frag
convert_abgr8_to_d24s8.frag
convert_abgr8_to_d32f.frag

160
src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp

@ -0,0 +1,160 @@
// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
// Unswizzles a block-linear (Tegra tiled) BCn 3D texture into a tightly packed
// linear buffer of compressed blocks; one invocation handles one 4x4 BCn block.
// Shared between Vulkan (push constants) and OpenGL (plain uniforms) via the
// macros below.
#version 430
#ifdef VULKAN
// 8/16-bit storage lets ReadTexel index sub-32-bit elements directly.
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_8bit_storage : require
#define HAS_EXTENDED_TYPES 1
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };
#define UNIFORM(n)
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
#define BINDING_OUTPUT_BUFFER 2
#else
#extension GL_NV_gpu_shader5 : enable
#ifdef GL_NV_gpu_shader5
#define HAS_EXTENDED_TYPES 1
#else
#define HAS_EXTENDED_TYPES 0
#endif
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout(location = n) uniform
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
// NOTE(review): on the GL path the output SSBO shares binding 0 with the
// swizzle-table SSBO declared below -- looks copied from the image-output
// variant of this shader; confirm the GL path is actually exercised.
#define BINDING_OUTPUT_BUFFER 0
#endif
// --- Push Constants / Uniforms ---
#ifdef VULKAN
layout(push_constant) uniform PushConstants {
uvec3 blocks_dim; // Offset 0
uint bytes_per_block_log2; // Offset 12
uvec3 origin; // Offset 16
uint slice_size; // Offset 28
uint block_size; // Offset 32
uint x_shift; // Offset 36
uint block_height; // Offset 40
uint block_height_mask; // Offset 44
uint block_depth; // Offset 48
uint block_depth_mask; // Offset 52
int _pad; // Offset 56
// NOTE(review): std430 aligns ivec3 to 16 bytes, so `destination` actually
// lands at offset 64, not 60 as on the C++ side. Harmless today because
// main() never reads it, but a trap if it is ever used -- confirm.
ivec3 destination; // Offset 60
} pc;
#else
BEGIN_PUSH_CONSTANTS
UNIFORM(0) uvec3 origin;
UNIFORM(1) ivec3 destination;
UNIFORM(2) uint bytes_per_block_log2;
UNIFORM(3) uint slice_size;
UNIFORM(4) uint block_size;
UNIFORM(5) uint x_shift;
UNIFORM(6) uint block_height;
UNIFORM(7) uint block_height_mask;
UNIFORM(8) uint block_depth;
UNIFORM(9) uint block_depth_mask;
UNIFORM(10) uvec3 blocks_dim;
END_PUSH_CONSTANTS
// NOTE(review): expanding `pc` to nothing leaves the trailing `.` behind in
// expressions such as `pc.blocks_dim`, which is not valid GLSL -- the GL
// build of this shader likely fails to compile; confirm before enabling it.
#define pc // Map pc prefix to nothing for OpenGL compatibility
#endif
// --- Buffers ---
// 64x8 lookup table mapping an (x, y) position inside a GOB to its byte
// offset (see SwizzleOffset below).
layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
uint swizzle_table[];
};
// All InputBuffer* blocks alias the same binding: they are different
// element-typed views over the same swizzled source data.
#if HAS_EXTENDED_TYPES
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU8 { uint8_t u8data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU16 { uint16_t u16data[]; };
#endif
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 { uint u32data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU64 { uvec2 u64data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU128 { uvec4 u128data[]; };
// Linear destination; written as 32-bit words regardless of block size.
layout(binding = BINDING_OUTPUT_BUFFER, std430) buffer OutputBuffer {
uint out_u32[];
};
// --- Constants ---
layout(local_size_x = 32, local_size_y = 8, local_size_z = 1) in;
// A GOB ("group of bytes") is the 64x8x1-byte tile Tegra swizzles within.
const uint GOB_SIZE_X = 64;
const uint GOB_SIZE_Y = 8;
const uint GOB_SIZE_Z = 1;
const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z;
const uint GOB_SIZE_X_SHIFT = 6;
const uint GOB_SIZE_Y_SHIFT = 3;
const uint GOB_SIZE_Z_SHIFT = 0;
const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT;
const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1u, GOB_SIZE_Y - 1u);
// --- Helpers ---
// Looks up the swizzled byte offset of an (x, y) byte position inside a GOB.
uint SwizzleOffset(uvec2 pos) {
    const uvec2 gob_pos = pos & SWIZZLE_MASK;
    return swizzle_table[gob_pos.y * 64u + gob_pos.x];
}
// Reads one compressed block (1 to 16 bytes, selected by bytes_per_block_log2)
// from the swizzled input at a byte offset, widened into a uvec4. Components
// beyond the block size are zero.
uvec4 ReadTexel(uint offset) {
uint bpl2 = pc.bytes_per_block_log2;
switch (bpl2) {
#if HAS_EXTENDED_TYPES
// Native 8/16-bit loads when the storage extensions are available.
case 0u: return uvec4(u8data[offset], 0u, 0u, 0u);
case 1u: return uvec4(u16data[offset / 2u], 0u, 0u, 0u);
#else
// Fallback: extract the byte/halfword from its containing 32-bit word.
case 0u: return uvec4(bitfieldExtract(u32data[offset / 4u], int((offset * 8u) & 24u), 8), 0u, 0u, 0u);
case 1u: return uvec4(bitfieldExtract(u32data[offset / 4u], int((offset * 8u) & 16u), 16), 0u, 0u, 0u);
#endif
case 2u: return uvec4(u32data[offset / 4u], 0u, 0u, 0u);
case 3u: return uvec4(u64data[offset / 8u], 0u, 0u);
case 4u: return u128data[offset / 16u];
}
return uvec4(0u);
}
// Unswizzles one 4x4 BCn block per invocation and writes it to the linear
// output buffer. blocks_dim bounds the dispatch; origin.xy are in pixels
// (converted to block space here) and origin.z selects the first Z slice.
void main() {
    uvec3 block_coord = gl_GlobalInvocationID;
    if (any(greaterThanEqual(block_coord, pc.blocks_dim))) {
        return;
    }
    uint bytes_per_block = 1u << pc.bytes_per_block_log2;
    // Origin is in pixels, divide by 4 for block-space (e.g. BCn formats).
    // pos.x is pre-scaled to a byte coordinate within the row.
    uvec3 pos;
    pos.x = (block_coord.x + (pc.origin.x >> 2u)) * bytes_per_block;
    pos.y = block_coord.y + (pc.origin.y >> 2u);
    pos.z = block_coord.z + pc.origin.z;
    uint swizzle = SwizzleOffset(pos.xy);
    uint block_y = pos.y >> GOB_SIZE_Y_SHIFT;
    // Accumulate the block-linear source address: Z super-block, Z within the
    // block, Y super-block, Y within the block, X GOB column, intra-GOB swizzle.
    uint offset = 0u;
    offset += (pos.z >> pc.block_depth) * pc.slice_size;
    offset += (pos.z & pc.block_depth_mask) << (GOB_SIZE_SHIFT + pc.block_height);
    offset += (block_y >> pc.block_height) * pc.block_size;
    offset += (block_y & pc.block_height_mask) << GOB_SIZE_SHIFT;
    offset += (pos.x >> GOB_SIZE_X_SHIFT) << pc.x_shift;
    offset += swizzle;
    uvec4 texel = ReadTexel(offset);
    // Calculate linear output index (in 32-bit words).
    uint block_index = block_coord.x +
                       (block_coord.y * pc.blocks_dim.x) +
                       (block_coord.z * pc.blocks_dim.x * pc.blocks_dim.y);
    uint out_idx = block_index * (bytes_per_block >> 2u);
    out_u32[out_idx] = texel.x;
    // BUGFIX: the second word only exists for blocks of 8+ bytes; writing it
    // unconditionally clobbered the neighbouring block's data for
    // 4-byte-per-block formats (bytes_per_block_log2 == 2, output stride of
    // one word). Sub-4-byte blocks are not representable in this word-based
    // output at all.
    if (pc.bytes_per_block_log2 >= 3u) {
        out_u32[out_idx + 1] = texel.y;
    }
    if (pc.bytes_per_block_log2 == 4u) {
        out_u32[out_idx + 2] = texel.z;
        out_u32[out_idx + 3] = texel.w;
    }
}

14
src/video_core/memory_manager.cpp

@ -123,7 +123,12 @@ GPUVAddr MemoryManager::PageTableOp(GPUVAddr gpu_addr, [[maybe_unused]] DAddr de
[[maybe_unused]] const auto current_entry_type = GetEntry<false>(current_gpu_addr);
SetEntry<false>(current_gpu_addr, entry_type);
if (current_entry_type != entry_type) {
rasterizer->ModifyGPUMemory(unique_identifier, current_gpu_addr, page_size);
if constexpr (entry_type == EntryType::Mapped) {
const DAddr current_dev_addr = dev_addr + offset;
rasterizer->ModifyGPUMemory(unique_identifier, current_gpu_addr, page_size, current_dev_addr);
} else {
rasterizer->ModifyGPUMemory(unique_identifier, current_gpu_addr, page_size, 0u);
}
}
if constexpr (entry_type == EntryType::Mapped) {
const DAddr current_dev_addr = dev_addr + offset;
@ -146,7 +151,12 @@ GPUVAddr MemoryManager::BigPageTableOp(GPUVAddr gpu_addr, [[maybe_unused]] DAddr
[[maybe_unused]] const auto current_entry_type = GetEntry<true>(current_gpu_addr);
SetEntry<true>(current_gpu_addr, entry_type);
if (current_entry_type != entry_type) {
rasterizer->ModifyGPUMemory(unique_identifier, current_gpu_addr, big_page_size);
if constexpr (entry_type == EntryType::Mapped) {
const DAddr current_dev_addr = dev_addr + offset;
rasterizer->ModifyGPUMemory(unique_identifier, current_gpu_addr, big_page_size, current_dev_addr);
} else {
rasterizer->ModifyGPUMemory(unique_identifier, current_gpu_addr, big_page_size, 0u);
}
}
if constexpr (entry_type == EntryType::Mapped) {
const DAddr current_dev_addr = dev_addr + offset;

5
src/video_core/rasterizer_interface.h

@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
@ -120,7 +123,7 @@ public:
virtual void UnmapMemory(DAddr addr, u64 size) = 0;
/// Remap GPU memory range. This means underneath backing memory changed
virtual void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) = 0;
virtual void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size, DAddr d_addr) = 0;
/// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
/// and invalidated

5
src/video_core/renderer_null/null_rasterizer.cpp

@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
@ -62,7 +65,7 @@ VideoCore::RasterizerDownloadArea RasterizerNull::GetFlushArea(PAddr addr, u64 s
}
void RasterizerNull::InvalidateGPUCache() {}
void RasterizerNull::UnmapMemory(DAddr addr, u64 size) {}
void RasterizerNull::ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) {}
void RasterizerNull::ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size, DAddr d_addr) {}
void RasterizerNull::SignalFence(std::function<void()>&& func) {
func();
}

5
src/video_core/renderer_null/null_rasterizer.h

@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
@ -58,7 +61,7 @@ public:
VideoCore::RasterizerDownloadArea GetFlushArea(DAddr addr, u64 size) override;
void InvalidateGPUCache() override;
void UnmapMemory(DAddr addr, u64 size) override;
void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) override;
void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size, DAddr d_addr) override;
void SignalFence(std::function<void()>&& func) override;
void SyncOperation(std::function<void()>&& func) override;
void SignalSyncPoint(u32 value) override;

4
src/video_core/renderer_opengl/gl_rasterizer.cpp

@ -600,10 +600,10 @@ void RasterizerOpenGL::UnmapMemory(DAddr addr, u64 size) {
shader_cache.OnCacheInvalidation(addr, size);
}
void RasterizerOpenGL::ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) {
void RasterizerOpenGL::ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size, DAddr d_addr) {
{
std::scoped_lock lock{texture_cache.mutex};
texture_cache.UnmapGPUMemory(as_id, addr, size);
texture_cache.UnmapGPUMemory(as_id, addr, size, d_addr);
}
}

5
src/video_core/renderer_opengl/gl_rasterizer.h

@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: 2015 Citra Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
@ -103,7 +106,7 @@ public:
bool OnCPUWrite(PAddr addr, u64 size) override;
void InvalidateGPUCache() override;
void UnmapMemory(DAddr addr, u64 size) override;
void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) override;
void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size, DAddr d_addr) override;
void SignalFence(std::function<void()>&& func) override;
void SyncOperation(std::function<void()>&& func) override;
void SignalSyncPoint(u32 value) override;

5
src/video_core/renderer_opengl/gl_texture_cache.cpp

@ -556,7 +556,7 @@ void TextureCacheRuntime::Finish() {
glFinish();
}
StagingBufferMap TextureCacheRuntime::UploadStagingBuffer(size_t size) {
StagingBufferMap TextureCacheRuntime::UploadStagingBuffer(size_t size, bool deferred) {
return staging_buffer_pool.RequestUploadBuffer(size);
}
@ -651,7 +651,8 @@ void TextureCacheRuntime::BlitFramebuffer(Framebuffer* dst, Framebuffer* src,
}
void TextureCacheRuntime::AccelerateImageUpload(Image& image, const StagingBufferMap& map,
std::span<const SwizzleParameters> swizzles) {
std::span<const SwizzleParameters> swizzles,
u32 z_start, u32 z_count) {
switch (image.info.type) {
case ImageType::e2D:
if (IsPixelFormatASTC(image.info.format)) {

10
src/video_core/renderer_opengl/gl_texture_cache.h

@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
@ -72,7 +75,7 @@ public:
void Finish();
StagingBufferMap UploadStagingBuffer(size_t size);
StagingBufferMap UploadStagingBuffer(size_t size, bool deferred = false);
StagingBufferMap DownloadStagingBuffer(size_t size, bool deferred = false);
@ -116,7 +119,10 @@ public:
Tegra::Engines::Fermi2D::Operation operation);
void AccelerateImageUpload(Image& image, const StagingBufferMap& map,
std::span<const VideoCommon::SwizzleParameters> swizzles);
std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count);
void ClearImage(Image& image, u32 clear_value);
void InsertUploadMemoryBarrier();

230
src/video_core/renderer_vulkan/vk_compute_pass.cpp

@ -24,6 +24,7 @@
#include "video_core/host_shaders/resolve_conditional_render_comp_spv.h"
#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h"
#include "video_core/host_shaders/vulkan_uint8_comp_spv.h"
#include "video_core/host_shaders/block_linear_unswizzle_3d_bcn_comp_spv.h"
#include "video_core/renderer_vulkan/vk_compute_pass.h"
#include "video_core/renderer_vulkan/vk_descriptor_pool.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
@ -642,6 +643,235 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
scheduler.Finish();
}
// Descriptor layout for the 3D BCn unswizzle pass: three storage buffers
// matching the BINDING_* constants in block_linear_unswizzle_3d_bcn.comp.
constexpr u32 BL3D_BINDING_SWIZZLE_TABLE = 0;
constexpr u32 BL3D_BINDING_INPUT_BUFFER = 1;
constexpr u32 BL3D_BINDING_OUTPUT_BUFFER = 2;
constexpr std::array<VkDescriptorSetLayoutBinding, 3> BL3D_DESCRIPTOR_SET_BINDINGS{{
{
.binding = BL3D_BINDING_SWIZZLE_TABLE,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // swizzle_table[]
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = nullptr,
},
{
.binding = BL3D_BINDING_INPUT_BUFFER,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // block-linear input
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = nullptr,
},
{
.binding = BL3D_BINDING_OUTPUT_BUFFER,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // linear output
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = nullptr,
},
}};
// Descriptor-pool sizing hint for this pass: three storage buffers only.
constexpr DescriptorBankInfo BL3D_BANK_INFO{
.uniform_buffers = 0,
.storage_buffers = 3,
.texture_buffers = 0,
.image_buffers = 0,
.textures = 0,
.images = 0,
.score = 3,
};
// Update template: one DescriptorUpdateEntry per binding, packed in binding
// order, hence the `binding * sizeof(DescriptorUpdateEntry)` offsets.
constexpr std::array<VkDescriptorUpdateTemplateEntry, 3>
BL3D_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{
{
.dstBinding = BL3D_BINDING_SWIZZLE_TABLE,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.offset = BL3D_BINDING_SWIZZLE_TABLE * sizeof(DescriptorUpdateEntry),
.stride = sizeof(DescriptorUpdateEntry),
},
{
.dstBinding = BL3D_BINDING_INPUT_BUFFER,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.offset = BL3D_BINDING_INPUT_BUFFER * sizeof(DescriptorUpdateEntry),
.stride = sizeof(DescriptorUpdateEntry),
},
{
.dstBinding = BL3D_BINDING_OUTPUT_BUFFER,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.offset = BL3D_BINDING_OUTPUT_BUFFER * sizeof(DescriptorUpdateEntry),
.stride = sizeof(DescriptorUpdateEntry),
}
}};
// CPU-side mirror of the push-constant block in
// block_linear_unswizzle_3d_bcn.comp.
// NOTE(review): GLSL std430 aligns ivec3 to 16 bytes, so on the shader side
// `destination` lives at offset 64, not 60 as laid out here. The shader's
// main() never reads `destination`, so the mismatch is currently harmless --
// confirm before anything starts consuming it.
struct alignas(16) BlockLinearUnswizzle3DPushConstants {
u32 blocks_dim[3]; // Offset 0
u32 bytes_per_block_log2; // Offset 12
u32 origin[3]; // Offset 16
u32 slice_size; // Offset 28
u32 block_size; // Offset 32
u32 x_shift; // Offset 36
u32 block_height; // Offset 40
u32 block_height_mask; // Offset 44
u32 block_depth; // Offset 48
u32 block_depth_mask; // Offset 52
s32 _pad; // Offset 56
s32 destination[3]; // Offset 60
s32 _pad_end; // Offset 72
};
// Must fit the minimum guaranteed Vulkan push-constant budget (128 bytes).
static_assert(sizeof(BlockLinearUnswizzle3DPushConstants) <= 128);
// Builds the compute pass from the descriptor tables above and the
// precompiled SPIR-V; stores only references to the shared scheduler,
// staging pool and descriptor queue.
BlockLinearUnswizzle3DPass::BlockLinearUnswizzle3DPass(
const Device& device_, Scheduler& scheduler_,
DescriptorPool& descriptor_pool_,
StagingBufferPool& staging_buffer_pool_,
ComputePassDescriptorQueue& compute_pass_descriptor_queue_)
: ComputePass(
device_, descriptor_pool_,
BL3D_DESCRIPTOR_SET_BINDINGS,
BL3D_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY,
BL3D_BANK_INFO,
COMPUTE_PUSH_CONSTANT_RANGE<sizeof(BlockLinearUnswizzle3DPushConstants)>,
BLOCK_LINEAR_UNSWIZZLE_3D_BCN_COMP_SPV),
scheduler{scheduler_},
staging_buffer_pool{staging_buffer_pool_},
compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {}
BlockLinearUnswizzle3DPass::~BlockLinearUnswizzle3DPass() = default;
// GPU-side unswizzle of a BCn 3D texture: dispatches the compute shader to
// linearize slices [z_start, z_start + z_count) from the swizzled staging
// buffer into the image's scratch buffer, then copies that range into the
// destination image. Expects a single mip level / layer (see the guard in
// TextureCacheRuntime::AccelerateImageUpload).
void BlockLinearUnswizzle3DPass::Unswizzle(
Image& image,
const StagingBufferRef& swizzled,
std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count)
{
using namespace VideoCommon::Accelerated;
// Lazily create the per-image linear scratch buffer the shader writes into.
if (!image.has_compute_unswizzle_buffer) {
image.AllocateComputeUnswizzleBuffer();
}
ASSERT(swizzles.size() == 1);
const auto& sw = swizzles[0];
const auto params = MakeBlockLinearSwizzle3DParams(sw, image.info);
BlockLinearUnswizzle3DPushConstants pc{};
pc.origin[0] = params.origin[0];
pc.origin[1] = params.origin[1];
pc.origin[2] = z_start; // Start at the current Z-slice
pc.destination[0] = params.destination[0];
pc.destination[1] = params.destination[1];
pc.destination[2] = 0; // Shader writes to start of output buffer
pc.bytes_per_block_log2 = params.bytes_per_block_log2;
pc.slice_size = params.slice_size;
pc.block_size = params.block_size;
pc.x_shift = params.x_shift;
pc.block_height = params.block_height;
pc.block_height_mask = params.block_height_mask;
pc.block_depth = params.block_depth;
pc.block_depth_mask = params.block_depth_mask;
// BCn encodes 4x4 pixel blocks; round up at the image edges.
const u32 blocks_x = (image.info.size.width + 3) / 4;
const u32 blocks_y = (image.info.size.height + 3) / 4;
pc.blocks_dim[0] = blocks_x;
pc.blocks_dim[1] = blocks_y;
pc.blocks_dim[2] = z_count; // Only process the count
// Bindings: swizzle table, swizzled source (offset within staging), scratch.
compute_pass_descriptor_queue.Acquire();
compute_pass_descriptor_queue.AddBuffer(*image.runtime->swizzle_table_buffer, 0, image.runtime->swizzle_table_size);
compute_pass_descriptor_queue.AddBuffer(swizzled.buffer, sw.buffer_offset + swizzled.offset, image.guest_size_bytes - sw.buffer_offset);
compute_pass_descriptor_queue.AddBuffer(*image.compute_unswizzle_buffer, 0, image.compute_unswizzle_buffer_size);
const void* descriptor_data = compute_pass_descriptor_queue.UpdateData();
const VkDescriptorSet set = descriptor_allocator.Commit();
// Group counts must match the shader's local_size of (32, 8, 1).
const u32 gx = Common::DivCeil(blocks_x, 32u);
const u32 gy = Common::DivCeil(blocks_y, 8u);
const u32 gz = Common::DivCeil(z_count, 1u);
scheduler.RequestOutsideRenderPassOperationContext();
// NOTE(review): `image` is captured by reference and dereferenced when the
// command buffer is recorded -- assumes the texture cache keeps the Image
// alive until then; confirm against the cache's sentencing/tick rules.
scheduler.Record([this, &image, set, descriptor_data, pc, gx, gy, gz, z_start, z_count](vk::CommandBuffer cmdbuf) {
const VkBuffer out_buffer = *image.compute_unswizzle_buffer;
const VkImage dst_image = image.Handle();
const VkImageAspectFlags aspect = image.AspectMask();
device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data);
cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {});
cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
cmdbuf.Dispatch(gx, gy, gz);
// OPTIMIZATION: Combined barrier - merge buffer and image barriers when possible
// The first slice batch may discard prior contents (UNDEFINED); later
// batches must preserve the slices uploaded so far.
const bool is_first = (z_start == 0);
// Single barrier for compute -> transfer (buffer ready, image transition)
// NOTE(review): queue-family indices are value-initialized to 0 rather than
// VK_QUEUE_FAMILY_IGNORED; equal src/dst means no ownership transfer, so
// this is spec-legal, but the explicit sentinel is the conventional form.
const VkBufferMemoryBarrier buffer_barrier{
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
.buffer = out_buffer,
.offset = 0,
.size = (VkDeviceSize)z_count * pc.slice_size,
};
const VkImageMemoryBarrier pre_barrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.srcAccessMask = is_first ? VkAccessFlags{} : static_cast<VkAccessFlags>(VK_ACCESS_SHADER_READ_BIT),
.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.oldLayout = is_first ? VK_IMAGE_LAYOUT_UNDEFINED : VK_IMAGE_LAYOUT_GENERAL,
.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.image = dst_image,
.subresourceRange = {aspect, 0, 1, 0, 1},
};
// Single barrier handles both buffer and image
cmdbuf.PipelineBarrier(
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_TRANSFER_BIT,
0,
nullptr, buffer_barrier, pre_barrier
);
// Copy only the freshly unswizzled slice range into the 3D image.
const VkBufferImageCopy copy{
.bufferOffset = 0,
.imageSubresource = {aspect, 0, 0, 1},
.imageOffset = {0, 0, (s32)z_start},
.imageExtent = {image.info.size.width, image.info.size.height, z_count},
};
cmdbuf.CopyBufferToImage(out_buffer, dst_image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy);
// Post-copy transition back to GENERAL for shader reads/writes.
const VkImageMemoryBarrier post_barrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.newLayout = VK_IMAGE_LAYOUT_GENERAL,
.image = dst_image,
.subresourceRange = {aspect, 0, 1, 0, 1},
};
cmdbuf.PipelineBarrier(
VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
0,
nullptr, nullptr, post_barrier
);
});
}
MSAACopyPass::MSAACopyPass(const Device& device_, Scheduler& scheduler_,
DescriptorPool& descriptor_pool_,
StagingBufferPool& staging_buffer_pool_,

23
src/video_core/renderer_vulkan/vk_compute_pass.h

@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
@ -131,6 +134,26 @@ private:
MemoryAllocator& memory_allocator;
};
class BlockLinearUnswizzle3DPass final : public ComputePass {
public:
explicit BlockLinearUnswizzle3DPass(const Device& device_, Scheduler& scheduler_,
DescriptorPool& descriptor_pool_,
StagingBufferPool& staging_buffer_pool_,
ComputePassDescriptorQueue& compute_pass_descriptor_queue_);
~BlockLinearUnswizzle3DPass();
void Unswizzle(Image& image,
const StagingBufferRef& swizzled,
std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count);
private:
Scheduler& scheduler;
StagingBufferPool& staging_buffer_pool;
ComputePassDescriptorQueue& compute_pass_descriptor_queue;
};
class MSAACopyPass final : public ComputePass {
public:
explicit MSAACopyPass(const Device& device_, Scheduler& scheduler_,

4
src/video_core/renderer_vulkan/vk_rasterizer.cpp

@ -748,10 +748,10 @@ void RasterizerVulkan::UnmapMemory(DAddr addr, u64 size) {
pipeline_cache.OnCacheInvalidation(addr, size);
}
void RasterizerVulkan::ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) {
void RasterizerVulkan::ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size, DAddr d_addr) {
{
std::scoped_lock lock{texture_cache.mutex};
texture_cache.UnmapGPUMemory(as_id, addr, size);
texture_cache.UnmapGPUMemory(as_id, addr, size, d_addr);
}
}

2
src/video_core/renderer_vulkan/vk_rasterizer.h

@ -108,7 +108,7 @@ public:
bool OnCPUWrite(DAddr addr, u64 size) override;
void InvalidateGPUCache() override;
void UnmapMemory(DAddr addr, u64 size) override;
void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) override;
void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size, DAddr d_addr) override;
void SignalFence(std::function<void()>&& func) override;
void SyncOperation(std::function<void()>&& func) override;
void SignalSyncPoint(u32 value) override;

10
src/video_core/renderer_vulkan/vk_scheduler.cpp

@ -43,6 +43,16 @@ Scheduler::Scheduler(const Device& device_, StateTracker& state_tracker_)
: device{device_}, state_tracker{state_tracker_},
master_semaphore{std::make_unique<MasterSemaphore>(device)},
command_pool{std::make_unique<CommandPool>(*master_semaphore, device)} {
// PRE-OPTIMIZATION: Warm up the pool to prevent mid-frame spikes
{
std::scoped_lock rl{reserve_mutex};
chunk_reserve.reserve(2048); // Prevent vector resizing
for (int i = 0; i < 1024; ++i) {
chunk_reserve.push_back(std::make_unique<CommandChunk>());
}
}
AcquireNewChunk();
AllocateWorkerCommandBuffer();
worker_thread = std::jthread([this](std::stop_token token) { WorkerThread(token); });

2
src/video_core/renderer_vulkan/vk_scheduler.h

@ -207,7 +207,7 @@ private:
size_t command_offset = 0;
bool submit = false;
alignas(std::max_align_t) std::array<u8, 0x8000> data{};
alignas(std::max_align_t) std::array<u8, 0x40000> data{};
};
struct State {

97
src/video_core/renderer_vulkan/vk_texture_cache.cpp

@ -24,12 +24,14 @@
#include "video_core/renderer_vulkan/vk_render_pass_cache.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
#include "video_core/surface.h"
#include "video_core/texture_cache/formatter.h"
#include "video_core/texture_cache/samples_helper.h"
#include "video_core/texture_cache/util.h"
#include "video_core/vulkan_common/vulkan_device.h"
#include "video_core/vulkan_common/vulkan_memory_allocator.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
#include "video_core/textures/decoders.h"
namespace Vulkan {
@ -878,14 +880,51 @@ TextureCacheRuntime::TextureCacheRuntime(const Device& device_, Scheduler& sched
}
}
}
bl3d_unswizzle_pass.emplace(device, scheduler, descriptor_pool,
staging_buffer_pool, compute_pass_descriptor_queue);
// --- Create swizzle table buffer ---
{
auto table = Tegra::Texture::MakeSwizzleTable();
swizzle_table_size = static_cast<VkDeviceSize>(table.size() * sizeof(table[0]));
auto staging = staging_buffer_pool.Request(swizzle_table_size, MemoryUsage::Upload);
std::memcpy(staging.mapped_span.data(), table.data(), static_cast<size_t>(swizzle_table_size));
VkBufferCreateInfo ci{
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.size = swizzle_table_size,
.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
VK_BUFFER_USAGE_TRANSFER_DST_BIT |
VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
};
swizzle_table_buffer = memory_allocator.CreateBuffer(ci, MemoryUsage::DeviceLocal);
scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([staging_buf = staging.buffer,
dst_buf = *swizzle_table_buffer,
size = swizzle_table_size,
src_off = staging.offset](vk::CommandBuffer cmdbuf) {
const VkBufferCopy region{
.srcOffset = src_off,
.dstOffset = 0,
.size = size,
};
cmdbuf.CopyBuffer(staging_buf, dst_buf, region);
});
}
}
// Drains pending GPU work by delegating to Scheduler::Finish.
void TextureCacheRuntime::Finish() {
scheduler.Finish();
}
StagingBufferRef TextureCacheRuntime::UploadStagingBuffer(size_t size) {
return staging_buffer_pool.Request(size, MemoryUsage::Upload);
StagingBufferRef TextureCacheRuntime::UploadStagingBuffer(size_t size, bool deferred) {
return staging_buffer_pool.Request(size, MemoryUsage::Upload, deferred);
}
StagingBufferRef TextureCacheRuntime::DownloadStagingBuffer(size_t size, bool deferred) {
@ -1581,6 +1620,46 @@ Image::Image(const VideoCommon::NullImageParams& params) : VideoCommon::ImageBas
Image::~Image() = default;
void Image::AllocateComputeUnswizzleBuffer() {
if (has_compute_unswizzle_buffer)
return;
using VideoCore::Surface::BytesPerBlock;
const u32 block_bytes = BytesPerBlock(info.format); // 8 for BC1, 16 for BC6H
const u32 block_width = 4;
const u32 block_height = 4;
// BCn is 4x4x1 blocks
const u32 blocks_x = (info.size.width + block_width - 1) / block_width;
const u32 blocks_y = (info.size.height + block_height - 1) / block_height;
const u32 blocks_z = info.size.depth;
const u64 block_count =
static_cast<u64>(blocks_x) *
static_cast<u64>(blocks_y) *
static_cast<u64>(blocks_z);
compute_unswizzle_buffer_size = block_count * block_bytes;
VkBufferCreateInfo ci{
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.pNext = nullptr,
.flags = 0,
.size = compute_unswizzle_buffer_size,
.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
.queueFamilyIndexCount = 0,
.pQueueFamilyIndices = nullptr,
};
compute_unswizzle_buffer =
runtime->memory_allocator.CreateBuffer(ci, MemoryUsage::DeviceLocal);
has_compute_unswizzle_buffer = true;
}
void Image::UploadMemory(VkBuffer buffer, VkDeviceSize offset,
std::span<const VideoCommon::BufferImageCopy> copies) {
// TODO: Move this to another API
@ -2397,10 +2476,22 @@ void Framebuffer::CreateFramebuffer(TextureCacheRuntime& runtime,
void TextureCacheRuntime::AccelerateImageUpload(
Image& image, const StagingBufferRef& map,
std::span<const VideoCommon::SwizzleParameters> swizzles) {
std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count) {
if (IsPixelFormatASTC(image.info.format)) {
return astc_decoder_pass->Assemble(image, map, swizzles);
}
if (bl3d_unswizzle_pass &&
IsPixelFormatBCn(image.info.format) &&
image.info.type == ImageType::e3D &&
image.info.resources.levels == 1 &&
image.info.resources.layers == 1) {
return bl3d_unswizzle_pass->Unswizzle(image, map, swizzles, z_start, z_count);
}
ASSERT(false);
}

20
src/video_core/renderer_vulkan/vk_texture_cache.h

@ -51,7 +51,7 @@ public:
void Finish();
StagingBufferRef UploadStagingBuffer(size_t size);
StagingBufferRef UploadStagingBuffer(size_t size, bool deferred = false);
StagingBufferRef DownloadStagingBuffer(size_t size, bool deferred = false);
@ -91,7 +91,8 @@ public:
}
void AccelerateImageUpload(Image&, const StagingBufferRef&,
std::span<const VideoCommon::SwizzleParameters>);
std::span<const VideoCommon::SwizzleParameters>,
u32 z_start, u32 z_count);
void InsertUploadMemoryBarrier() {}
@ -127,6 +128,11 @@ public:
BlitImageHelper& blit_image_helper;
RenderPassCache& render_pass_cache;
std::optional<ASTCDecoderPass> astc_decoder_pass;
std::optional<BlockLinearUnswizzle3DPass> bl3d_unswizzle_pass;
vk::Buffer swizzle_table_buffer;
VkDeviceSize swizzle_table_size = 0;
std::unique_ptr<MSAACopyPass> msaa_copy_pass;
const Settings::ResolutionScalingInfo& resolution;
std::array<std::vector<VkFormat>, VideoCore::Surface::MaxPixelFormat> view_formats;
@ -163,6 +169,8 @@ public:
void DownloadMemory(const StagingBufferRef& map,
std::span<const VideoCommon::BufferImageCopy> copies);
void AllocateComputeUnswizzleImage();
[[nodiscard]] VkImage Handle() const noexcept {
return *(this->*current_image);
@ -188,6 +196,8 @@ public:
bool ScaleUp(bool ignore = false);
bool ScaleDown(bool ignore = false);
friend class BlockLinearUnswizzle3DPass;
private:
bool BlitScaleHelper(bool scale_up);
@ -199,6 +209,12 @@ private:
vk::Image original_image;
vk::Image scaled_image;
vk::Buffer compute_unswizzle_buffer;
VkDeviceSize compute_unswizzle_buffer_size = 0;
bool has_compute_unswizzle_buffer = false;
void AllocateComputeUnswizzleBuffer();
// Use a pointer to field because it is relative, so that the object can be
// moved without breaking the reference.

14
src/video_core/texture_cache/image_base.h

@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
@ -52,6 +55,13 @@ struct AliasedImage {
ImageId id;
};
// Records one tile of a sparse (tiled) texture: where the tile sits in the
// GPU virtual address space, which device memory backs it, and its position
// within the texture.
struct SparseBinding {
GPUVAddr gpu_addr; // Virtual GPU address of this tile
DAddr device_addr; // Physical device memory address
u64 tile_index; // Linear tile index in the texture
Extent3D tile_coord; // 3D coordinate of this tile
};
struct NullImageParams {};
struct ImageBase {
@ -115,6 +125,10 @@ struct ImageBase {
std::vector<AliasedImage> aliased_images;
std::vector<ImageId> overlapping_images;
ImageMapId map_view_id{};
boost::container::small_vector<u64, 16> dirty_offsets;
std::unordered_map<GPUVAddr, SparseBinding> sparse_bindings;
u32 sparse_tile_size = 65536;
};
struct ImageMapView {

256
src/video_core/texture_cache/texture_cache.h

@ -8,6 +8,7 @@
#include <limits>
#include <optional>
#include <bit>
#include <unordered_set>
#include <boost/container/small_vector.hpp>
@ -22,6 +23,7 @@
#include "video_core/texture_cache/samples_helper.h"
#include "video_core/texture_cache/texture_cache_base.h"
#include "video_core/texture_cache/util.h"
#include "video_core/textures/decoders.h"
namespace VideoCommon {
@ -160,6 +162,7 @@ void TextureCache<P>::TickFrame() {
sentenced_framebuffers.Tick();
sentenced_image_view.Tick();
TickAsyncDecode();
TickAsyncUnswizzle();
runtime.TickFrame();
++frame_tick;
@ -615,7 +618,36 @@ void TextureCache<P>::UnmapMemory(DAddr cpu_addr, size_t size) {
}
template <class P>
void TextureCache<P>::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t size) {
std::optional<SparseBinding> TextureCache<P>::CalculateSparseBinding(
const Image& image, GPUVAddr gpu_addr, DAddr dev_addr) {
if (!image.info.is_sparse) {
return std::nullopt;
}
const u64 offset = gpu_addr - image.gpu_addr;
const u64 tile_index = offset / image.sparse_tile_size;
const u32 tile_width_blocks = 128;
const u32 tile_height_blocks = 32;
const u32 width_in_tiles = (image.info.size.width / 4 + tile_width_blocks - 1) / tile_width_blocks;
const u32 height_in_tiles = (image.info.size.height / 4 + tile_height_blocks - 1) / tile_height_blocks;
const u32 tile_x = static_cast<u32>((tile_index % width_in_tiles) * tile_width_blocks * 4);
const u32 tile_y = static_cast<u32>(((tile_index / width_in_tiles) % height_in_tiles) * tile_height_blocks * 4);
const u32 tile_z = static_cast<u32>(tile_index / (width_in_tiles * height_in_tiles));
return SparseBinding{
.gpu_addr = gpu_addr,
.device_addr = dev_addr,
.tile_index = tile_index,
.tile_coord = {tile_x, tile_y, tile_z}
};
}
template <class P>
void TextureCache<P>::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t size, DAddr dev_addr) {
boost::container::small_vector<ImageId, 16> deleted_images;
ForEachImageInRegionGPU(as_id, gpu_addr, size,
[&](ImageId id, Image&) { deleted_images.push_back(id); });
@ -627,11 +659,19 @@ void TextureCache<P>::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t siz
UntrackImage(image, id);
}
}
if (True(image.flags & ImageFlagBits::Remapped)) {
continue;
}
image.flags |= ImageFlagBits::Remapped;
if (image.info.is_sparse && dev_addr != 0) {
// Calculate and store the binding
auto binding = CalculateSparseBinding(image, gpu_addr, dev_addr);
if (binding) {
image.sparse_bindings[gpu_addr] = *binding;
image.dirty_offsets.push_back(binding->tile_index);
}
}
}
}
@ -1055,9 +1095,29 @@ void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
// Only upload modified images
return;
}
image.flags &= ~ImageFlagBits::CpuModified;
TrackImage(image, image_id);
// If it's sparse and remapped, we treat it as a partial update trigger
if (image.info.is_sparse && True(image.flags & ImageFlagBits::Remapped)) {
image.flags &= ~ImageFlagBits::Remapped;
if (!image.dirty_offsets.empty() && !image.sparse_bindings.empty()) {
/*constexpr u64 page_size = 64_KiB;
size_t dirty_size = image.dirty_offsets.size() * page_size;
auto staging = runtime.UploadStagingBuffer(dirty_size);
UploadSparseDirtyTiles(image, staging);
runtime.InsertUploadMemoryBarrier();
return;*/
image.dirty_offsets.clear();
image.sparse_bindings.clear();
return;
}
}
if (image.info.num_samples > 1 && !runtime.CanUploadMSAA()) {
LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented");
runtime.TransitionImageLayout(image);
@ -1067,11 +1127,102 @@ void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
QueueAsyncDecode(image, image_id);
return;
}
if (IsPixelFormatBCn(image.info.format) &&
image.info.type == ImageType::e3D &&
image.info.resources.levels == 1 &&
image.info.resources.layers == 1 &&
MapSizeBytes(image) >= 32_MiB &&
False(image.flags & ImageFlagBits::GpuModified)) {
QueueAsyncUnswizzle(image, image_id);
return;
}
auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image));
UploadImageContents(image, staging);
runtime.InsertUploadMemoryBarrier();
}
template <class P>
template <typename StagingBuffer>
void TextureCache<P>::UploadSparseDirtyTiles(Image& image, StagingBuffer& staging) {
    using namespace VideoCommon;
    using namespace Tegra::Texture;

    // Build a reverse index (tile index -> binding) once; the previous
    // revision linearly scanned every binding for every dirty tile,
    // i.e. O(dirty_tiles * bindings).
    std::unordered_map<u64, const SparseBinding*> binding_by_tile;
    binding_by_tile.reserve(image.sparse_bindings.size());
    for (const auto& [addr, bind] : image.sparse_bindings) {
        binding_by_tile.emplace(bind.tile_index, &bind);
    }

    // Scratch buffer for one swizzled tile, sized from the actual tile size.
    // A fixed 64 KiB stack array would overflow if sparse_tile_size were
    // ever larger than 65536 (it is a mutable field, not a constant).
    std::vector<u8> tile_swizzled_data(image.sparse_tile_size);

    std::vector<BufferImageCopy> all_copies;
    size_t total_upload_size = 0;

    for (const u64 dirty_tile_index : image.dirty_offsets) {
        const auto it = binding_by_tile.find(dirty_tile_index);
        if (it == binding_by_tile.end()) {
            // Dirty tile with no recorded binding: nothing to read from.
            continue;
        }
        const SparseBinding& binding = *it->second;
        const auto& coord = binding.tile_coord;

        // Tile dimensions in texels (128x32 blocks of 4x4 texels), clamped
        // at the image borders.
        constexpr u32 tile_width_blocks = 128;
        constexpr u32 tile_height_blocks = 32;
        const u32 tile_width =
            std::min(tile_width_blocks * 4, image.info.size.width - coord.width);
        const u32 tile_height =
            std::min(tile_height_blocks * 4, image.info.size.height - coord.height);
        const u32 tile_depth = std::min(1u, image.info.size.depth - coord.depth);

        const u32 bytes_per_block = BytesPerBlock(image.info.format);
        const u32 blocks_wide = (tile_width + 3) / 4;
        const u32 blocks_high = (tile_height + 3) / 4;
        const size_t tile_unswizzled_size =
            static_cast<size_t>(blocks_wide) * blocks_high * tile_depth * bytes_per_block;

        if (total_upload_size + tile_unswizzled_size > staging.mapped_span.size()) {
            LOG_ERROR(HW_GPU, "Staging buffer too small");
            break;
        }

        // Read the swizzled tile from guest memory.
        gpu_memory->ReadBlockUnsafe(binding.gpu_addr, tile_swizzled_data.data(),
                                    image.sparse_tile_size);

        // Unswizzle directly into the staging buffer at the running offset.
        auto tile_output = staging.mapped_span.subspan(total_upload_size, tile_unswizzled_size);
        const auto result = UnswizzleSparseTextureTile(tile_output, tile_swizzled_data, image.info,
                                                       tile_width, tile_height, tile_depth);

        // Describe the buffer -> image copy for this tile.
        all_copies.push_back(BufferImageCopy{
            .buffer_offset = total_upload_size,
            .buffer_size = tile_unswizzled_size,
            .buffer_row_length = result.buffer_row_length,
            .buffer_image_height = result.buffer_image_height,
            .image_subresource = {
                .base_level = 0,
                .base_layer = 0,
                .num_layers = 1,
            },
            .image_offset = {
                static_cast<s32>(coord.width),
                static_cast<s32>(coord.height),
                static_cast<s32>(coord.depth)
            },
            .image_extent = {tile_width, tile_height, tile_depth}
        });
        total_upload_size += tile_unswizzled_size;
    }

    if (!all_copies.empty()) {
        image.UploadMemory(staging, all_copies);
    }
    image.dirty_offsets.clear();
}
template <class P>
template <typename StagingBuffer>
void TextureCache<P>::UploadImageContents(Image& image, StagingBuffer& staging) {
@ -1082,7 +1233,7 @@ void TextureCache<P>::UploadImageContents(Image& image, StagingBuffer& staging)
gpu_memory->ReadBlock(gpu_addr, mapped_span.data(), mapped_span.size_bytes(),
VideoCommon::CacheType::NoTextureCache);
const auto uploads = FullUploadSwizzles(image.info);
runtime.AccelerateImageUpload(image, staging, FixSmallVectorADL(uploads));
runtime.AccelerateImageUpload(image, staging, FixSmallVectorADL(uploads), 0, 0);
return;
}
@ -1311,6 +1462,20 @@ void TextureCache<P>::QueueAsyncDecode(Image& image, ImageId image_id) {
texture_decode_worker.QueueWork(std::move(func));
}
template <class P>
void TextureCache<P>::QueueAsyncUnswizzle(Image& image, ImageId image_id) {
    // Bail out if the image already has decode/unswizzle work in flight.
    const bool busy = True(image.flags & ImageFlagBits::IsDecoding);
    if (busy) {
        return;
    }
    // Mark the image busy so it cannot be queued twice, then enqueue a task
    // snapshotting its current description.
    image.flags |= ImageFlagBits::IsDecoding;
    PendingUnswizzle task{};
    task.image_id = image_id;
    task.info = image.info;
    unswizzle_queue.push_back(std::move(task));
}
template <class P>
void TextureCache<P>::TickAsyncDecode() {
bool has_uploads{};
@ -1336,6 +1501,90 @@ void TextureCache<P>::TickAsyncDecode() {
}
}
template <class P>
void TextureCache<P>::TickAsyncUnswizzle() {
    if (unswizzle_queue.empty()) {
        current_unswizzle_frame = 0;
        return;
    }
    // Don't process every frame - allow more data to accumulate
    if (current_unswizzle_frame++ < 2) {
        return;
    }

    PendingUnswizzle& task = unswizzle_queue.front();
    Image& image = slot_images[task.image_id];

    if (!task.initialized) {
        // Lazily allocate the deferred staging buffer and compute the
        // swizzled byte size of one Z slice (64-byte-aligned stride times the
        // block-height-aligned row count).
        task.total_size = MapSizeBytes(image);
        task.staging_buffer = runtime.UploadStagingBuffer(task.total_size, true);

        const auto& info = image.info;
        const u32 bytes_per_block = BytesPerBlock(info.format);
        const u32 width_blocks = Common::DivCeil(info.size.width, 4u);
        const u32 height_blocks = Common::DivCeil(info.size.height, 4u);
        const u32 stride = Common::AlignUp(width_blocks * bytes_per_block, 64u);
        const u32 aligned_height = Common::AlignUp(height_blocks, 8u << task.info.block.height);

        task.bytes_per_slice = static_cast<size_t>(stride) * aligned_height;
        task.last_submitted_offset = 0;
        task.initialized = true;
    }

    // ToDo: Make these configurable
    const size_t CHUNK_SIZE = 64_MiB;
    const u32 SLICES_PER_BATCH = 512;

    // NOTE: a previous revision kept a function-local static scratch vector
    // here that was resized but never read or written; guest reads go
    // straight into the staging buffer, so that dead (and thread-unsafe)
    // static has been removed.

    // Stream the next chunk of swizzled guest data into staging memory.
    if (task.current_offset < task.total_size) {
        const size_t remaining = task.total_size - task.current_offset;
        const size_t copy_amount = std::min(CHUNK_SIZE, remaining);
        gpu_memory->ReadBlock(image.gpu_addr + task.current_offset,
                              task.staging_buffer.mapped_span.data() + task.current_offset,
                              copy_amount,
                              VideoCommon::CacheType::NoTextureCache);
        task.current_offset += copy_amount;
    }

    // Submit once a full batch of slices is buffered, or on the final chunk.
    const size_t batch_threshold = task.bytes_per_slice * SLICES_PER_BATCH;
    const size_t ready_to_submit = task.current_offset - task.last_submitted_offset;
    const bool is_final_batch = task.current_offset >= task.total_size;
    const bool should_submit = ready_to_submit >= batch_threshold ||
                               (is_final_batch && task.last_submitted_offset < task.total_size);

    if (should_submit) {
        const u32 z_start = static_cast<u32>(task.last_submitted_offset / task.bytes_per_slice);
        const u32 total_depth = image.info.size.depth;
        u32 z_count = static_cast<u32>(ready_to_submit / task.bytes_per_slice);
        // Clamp to the image depth; the staging tail may cover padding past
        // the last slice.
        if (z_start + z_count > total_depth) {
            z_count = total_depth - z_start;
        }
        if (z_count > 0) {
            const auto uploads = FullUploadSwizzles(task.info);
            runtime.AccelerateImageUpload(image, task.staging_buffer, FixSmallVectorADL(uploads),
                                          z_start, z_count);
            task.last_submitted_offset += static_cast<size_t>(z_count) * task.bytes_per_slice;
        }
    }

    // Done once everything has been read and every whole slice submitted.
    if (task.current_offset >= task.total_size &&
        task.last_submitted_offset >= (task.total_size - (task.total_size % task.bytes_per_slice))) {
        runtime.FreeDeferredStagingBuffer(task.staging_buffer);
        image.flags &= ~ImageFlagBits::IsDecoding;
        unswizzle_queue.pop_front();
        current_unswizzle_frame = 0;
    }
}
template <class P>
bool TextureCache<P>::ScaleUp(Image& image) {
const bool has_copy = image.HasScaled();
@ -2423,6 +2672,7 @@ void TextureCache<P>::SynchronizeAliases(ImageId image_id) {
template <class P>
void TextureCache<P>::PrepareImage(ImageId image_id, bool is_modification, bool invalidate) {
Image& image = slot_images[image_id];
runtime.TransitionImageLayout(image);
if (invalidate) {
image.flags &= ~(ImageFlagBits::CpuModified | ImageFlagBits::GpuModified);
if (False(image.flags & ImageFlagBits::Tracked)) {

27
src/video_core/texture_cache/texture_cache_base.h

@ -129,6 +129,17 @@ class TextureCache : public VideoCommon::ChannelSetupCaches<TextureCacheChannelI
using AsyncBuffer = typename P::AsyncBuffer;
using BufferType = typename P::BufferType;
// State for one incremental GPU-unswizzle upload: guest data is streamed
// into a persistent staging buffer and submitted to the compute unswizzle
// pass a batch of Z slices at a time, spread across frames
// (see TickAsyncUnswizzle).
struct PendingUnswizzle {
    ImageId image_id;                 // Target image in slot_images
    VideoCommon::ImageInfo info;      // Snapshot of the image description
    size_t current_offset = 0;        // Bytes read from guest memory so far
    size_t total_size = 0;            // Total guest size of the image
    AsyncBuffer staging_buffer;       // Deferred staging buffer, freed on completion
    size_t last_submitted_offset = 0; // Bytes already handed to the GPU pass
    // Swizzled bytes per Z slice. Was u32, but it is assigned a
    // size_t stride * height product, so keep it size_t to avoid silent
    // narrowing on very large slices.
    size_t bytes_per_slice = 0;
    bool initialized = false;         // Lazy setup done on first tick
};
struct BlitImages {
ImageId dst_id;
ImageId src_id;
@ -212,7 +223,11 @@ public:
void UnmapMemory(DAddr cpu_addr, size_t size);
/// Remove images in a region
void UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t size);
void UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t size, DAddr dev_addr);
/// Basic sparse binding
std::optional<SparseBinding> CalculateSparseBinding(
const Image& image, GPUVAddr gpu_addr, DAddr dev_addr);
/// Blit an image with the given parameters
bool BlitImage(const Tegra::Engines::Fermi2D::Surface& dst,
@ -312,6 +327,10 @@ private:
/// Refresh the contents (pixel data) of an image
void RefreshContents(Image& image, ImageId image_id);
/// Sparse texture partial upload
template <typename StagingBuffer>
void UploadSparseDirtyTiles(Image& image, StagingBuffer& staging);
/// Upload data from guest to an image
template <typename StagingBuffer>
@ -433,6 +452,9 @@ private:
void TrimInactiveSamplers(size_t budget);
std::optional<size_t> QuerySamplerBudget() const;
void QueueAsyncUnswizzle(Image& image, ImageId image_id);
void TickAsyncUnswizzle();
Runtime& runtime;
Tegra::MaxwellDeviceMemoryManager& device_memory;
@ -508,6 +530,9 @@ private:
Common::ThreadWorker texture_decode_worker{1, "TextureDecoder"};
std::vector<std::unique_ptr<AsyncDecodeContext>> async_decodes;
std::deque<PendingUnswizzle> unswizzle_queue;
u8 current_unswizzle_frame;
// Join caching
boost::container::small_vector<ImageId, 4> join_overlap_ids;
std::unordered_set<ImageId> join_overlaps_found;

41
src/video_core/texture_cache/util.cpp

@ -55,6 +55,7 @@ using Tegra::Texture::TextureFormat;
using Tegra::Texture::TextureType;
using Tegra::Texture::TICEntry;
using Tegra::Texture::UnswizzleTexture;
using Tegra::Texture::UnswizzleSubrect;
using VideoCore::Surface::BytesPerBlock;
using VideoCore::Surface::DefaultBlockHeight;
using VideoCore::Surface::DefaultBlockWidth;
@ -922,6 +923,46 @@ boost::container::small_vector<BufferImageCopy, 16> UnswizzleImage(Tegra::Memory
return copies;
}
SparseTileUnswizzleResult UnswizzleSparseTextureTile(std::span<u8> output,
                                                    std::span<const u8> input,
                                                    const ImageInfo& info,
                                                    u32 tile_width,
                                                    u32 tile_height,
                                                    u32 tile_depth) {
    const Extent2D block_size = DefaultBlockSize(info.format);
    const u32 bpp = BytesPerBlock(info.format);
    const u32 width_blocks = (tile_width + block_size.width - 1) / block_size.width;
    const u32 height_blocks = (tile_height + block_size.height - 1) / block_size.height;

    // GOBs (64-byte-wide rows) needed to span one row of compressed blocks.
    const u32 bytes_per_row = width_blocks * bpp;
    const u32 gobs_per_row = (bytes_per_row + 63) / 64;

    // A 64 KiB sparse tile stacks TILE_SIZE / (gobs_per_row * 512) GOBs
    // vertically. Guard both the divisor and the result against zero, and
    // take floor(log2) with bit_width: countr_zero only equals log2 for
    // exact powers of two (countr_zero(42) == 1, not 5) and countr_zero(0)
    // is the full bit count.
    constexpr u32 TILE_SIZE = 65536;
    const u32 gob_row_count = gobs_per_row == 0 ? 1u : gobs_per_row;
    u32 gobs_tall = TILE_SIZE / (gob_row_count * 512);
    if (gobs_tall == 0) {
        gobs_tall = 1;
    }
    const u32 tile_block_height = static_cast<u32>(std::bit_width(gobs_tall)) - 1;

    const u32 pitch_linear = width_blocks * bpp;

    // Unswizzle the whole tile into a tightly packed linear layout.
    UnswizzleSubrect(
        output, input, bpp,
        width_blocks, height_blocks, tile_depth,
        0, 0,
        width_blocks, height_blocks,
        tile_block_height, 0,
        pitch_linear
    );

    // Reported in texels, rounded up to whole compressed blocks, for use as
    // BufferImageCopy::buffer_row_length / buffer_image_height.
    return {
        .buffer_row_length = Common::AlignUp(tile_width, block_size.width),
        .buffer_image_height = Common::AlignUp(tile_height, block_size.height)
    };
}
void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8> output,
std::span<BufferImageCopy> copies) {
u32 output_offset = 0;

14
src/video_core/texture_cache/util.h

@ -30,6 +30,11 @@ struct OverlapResult {
SubresourceExtent resources;
};
// Linear-layout description returned by UnswizzleSparseTextureTile; the
// values feed BufferImageCopy::buffer_row_length / buffer_image_height.
struct SparseTileUnswizzleResult {
    u32 buffer_row_length;   // Row length of the unswizzled tile, in texels
    u32 buffer_image_height; // Image height of the unswizzled tile, in texels
};
[[nodiscard]] u32 CalculateGuestSizeInBytes(const ImageInfo& info) noexcept;
[[nodiscard]] u32 CalculateUnswizzledSizeBytes(const ImageInfo& info) noexcept;
@ -68,7 +73,14 @@ struct OverlapResult {
[[nodiscard]] boost::container::small_vector<BufferImageCopy, 16> UnswizzleImage(
Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const ImageInfo& info,
std::span<const u8> input, std::span<u8> output);
[[nodiscard]] SparseTileUnswizzleResult UnswizzleSparseTextureTile(std::span<u8> output,
std::span<const u8> input,
const ImageInfo& info,
u32 tile_width,
u32 tile_height,
u32 tile_depth);
void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8> output,
std::span<BufferImageCopy> copies);

Loading…
Cancel
Save