diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index fef9a5b16e..2a1bdca3ea 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -18,6 +18,7 @@ set(SHADER_FILES blit_color_float.frag block_linear_unswizzle_2d.comp block_linear_unswizzle_3d.comp + block_linear_unswizzle_3d_bcn.comp convert_abgr8_srgb_to_d24s8.frag convert_abgr8_to_d24s8.frag convert_abgr8_to_d32f.frag diff --git a/src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp b/src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp new file mode 100644 index 0000000000..2c77937f30 --- /dev/null +++ b/src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp @@ -0,0 +1,160 @@ +// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#version 430 + +#ifdef VULKAN + #extension GL_EXT_shader_16bit_storage : require + #extension GL_EXT_shader_8bit_storage : require + #define HAS_EXTENDED_TYPES 1 + #define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { + #define END_PUSH_CONSTANTS }; + #define UNIFORM(n) + #define BINDING_SWIZZLE_BUFFER 0 + #define BINDING_INPUT_BUFFER 1 + #define BINDING_OUTPUT_BUFFER 2 +#else + #extension GL_NV_gpu_shader5 : enable + #ifdef GL_NV_gpu_shader5 + #define HAS_EXTENDED_TYPES 1 + #else + #define HAS_EXTENDED_TYPES 0 + #endif + #define BEGIN_PUSH_CONSTANTS + #define END_PUSH_CONSTANTS + #define UNIFORM(n) layout(location = n) uniform + #define BINDING_SWIZZLE_BUFFER 0 + #define BINDING_INPUT_BUFFER 1 + #define BINDING_OUTPUT_BUFFER 0 +#endif + +// --- Push Constants / Uniforms --- +#ifdef VULKAN +layout(push_constant) uniform PushConstants { + uvec3 blocks_dim; // Offset 0 + uint bytes_per_block_log2; // Offset 12 + + uvec3 origin; // Offset 16 + uint slice_size; // Offset 28 + + uint block_size; // Offset 32 + uint x_shift; // Offset 36 + uint block_height; // Offset 40 + uint 
block_height_mask; // Offset 44 + + uint block_depth; // Offset 48 + uint block_depth_mask; // Offset 52 + int _pad; // Offset 56 + + ivec3 destination; // Offset 60 +} pc; +#else +BEGIN_PUSH_CONSTANTS + UNIFORM(0) uvec3 origin; + UNIFORM(1) ivec3 destination; + UNIFORM(2) uint bytes_per_block_log2; + UNIFORM(3) uint slice_size; + UNIFORM(4) uint block_size; + UNIFORM(5) uint x_shift; + UNIFORM(6) uint block_height; + UNIFORM(7) uint block_height_mask; + UNIFORM(8) uint block_depth; + UNIFORM(9) uint block_depth_mask; + UNIFORM(10) uvec3 blocks_dim; +END_PUSH_CONSTANTS +#define pc // Map pc prefix to nothing for OpenGL compatibility +#endif + +// --- Buffers --- +layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable { + uint swizzle_table[]; +}; + +#if HAS_EXTENDED_TYPES + layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU8 { uint8_t u8data[]; }; + layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU16 { uint16_t u16data[]; }; +#endif +layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 { uint u32data[]; }; +layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU64 { uvec2 u64data[]; }; +layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU128 { uvec4 u128data[]; }; + +layout(binding = BINDING_OUTPUT_BUFFER, std430) buffer OutputBuffer { + uint out_u32[]; +}; + +// --- Constants --- +layout(local_size_x = 32, local_size_y = 8, local_size_z = 1) in; + +const uint GOB_SIZE_X = 64; +const uint GOB_SIZE_Y = 8; +const uint GOB_SIZE_Z = 1; +const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z; + +const uint GOB_SIZE_X_SHIFT = 6; +const uint GOB_SIZE_Y_SHIFT = 3; +const uint GOB_SIZE_Z_SHIFT = 0; +const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT; +const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1u, GOB_SIZE_Y - 1u); + +// --- Helpers --- +uint SwizzleOffset(uvec2 pos) { + pos &= SWIZZLE_MASK; + return swizzle_table[pos.y * 64u + pos.x]; +} + 
+uvec4 ReadTexel(uint offset) { + uint bpl2 = pc.bytes_per_block_log2; + switch (bpl2) { +#if HAS_EXTENDED_TYPES + case 0u: return uvec4(u8data[offset], 0u, 0u, 0u); + case 1u: return uvec4(u16data[offset / 2u], 0u, 0u, 0u); +#else + case 0u: return uvec4(bitfieldExtract(u32data[offset / 4u], int((offset * 8u) & 24u), 8), 0u, 0u, 0u); + case 1u: return uvec4(bitfieldExtract(u32data[offset / 4u], int((offset * 8u) & 16u), 16), 0u, 0u, 0u); +#endif + case 2u: return uvec4(u32data[offset / 4u], 0u, 0u, 0u); + case 3u: return uvec4(u64data[offset / 8u], 0u, 0u); + case 4u: return u128data[offset / 16u]; + } + return uvec4(0u); +} + +void main() { + uvec3 block_coord = gl_GlobalInvocationID; + if (any(greaterThanEqual(block_coord, pc.blocks_dim))) { + return; + } + + uint bytes_per_block = 1u << pc.bytes_per_block_log2; + // Origin is in pixels, divide by 4 for block-space (e.g. BCn formats) + uvec3 pos; + pos.x = (block_coord.x + (pc.origin.x >> 2u)) * bytes_per_block; + pos.y = block_coord.y + (pc.origin.y >> 2u); + pos.z = block_coord.z + pc.origin.z; + + uint swizzle = SwizzleOffset(pos.xy); + uint block_y = pos.y >> GOB_SIZE_Y_SHIFT; + uint offset = 0u; + // Apply block-linear offsets + offset += (pos.z >> pc.block_depth) * pc.slice_size; + offset += (pos.z & pc.block_depth_mask) << (GOB_SIZE_SHIFT + pc.block_height); + offset += (block_y >> pc.block_height) * pc.block_size; + offset += (block_y & pc.block_height_mask) << GOB_SIZE_SHIFT; + offset += (pos.x >> GOB_SIZE_X_SHIFT) << pc.x_shift; + offset += swizzle; + + uvec4 texel = ReadTexel(offset); + + // Calculate linear output index + uint block_index = block_coord.x + + (block_coord.y * pc.blocks_dim.x) + + (block_coord.z * pc.blocks_dim.x * pc.blocks_dim.y); + uint out_idx = block_index * (bytes_per_block >> 2u); + + out_u32[out_idx] = texel.x; + out_u32[out_idx + 1] = texel.y; + if (pc.bytes_per_block_log2 == 4u) { + out_u32[out_idx + 2] = texel.z; + out_u32[out_idx + 3] = texel.w; + } +} \ No newline at end 
of file diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index be97f5ab05..cb1c0fab83 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -123,7 +123,12 @@ GPUVAddr MemoryManager::PageTableOp(GPUVAddr gpu_addr, [[maybe_unused]] DAddr de [[maybe_unused]] const auto current_entry_type = GetEntry(current_gpu_addr); SetEntry(current_gpu_addr, entry_type); if (current_entry_type != entry_type) { - rasterizer->ModifyGPUMemory(unique_identifier, current_gpu_addr, page_size); + if constexpr (entry_type == EntryType::Mapped) { + const DAddr current_dev_addr = dev_addr + offset; + rasterizer->ModifyGPUMemory(unique_identifier, current_gpu_addr, page_size, current_dev_addr); + } else { + rasterizer->ModifyGPUMemory(unique_identifier, current_gpu_addr, page_size, 0u); + } } if constexpr (entry_type == EntryType::Mapped) { const DAddr current_dev_addr = dev_addr + offset; @@ -146,7 +151,12 @@ GPUVAddr MemoryManager::BigPageTableOp(GPUVAddr gpu_addr, [[maybe_unused]] DAddr [[maybe_unused]] const auto current_entry_type = GetEntry(current_gpu_addr); SetEntry(current_gpu_addr, entry_type); if (current_entry_type != entry_type) { + if constexpr (entry_type == EntryType::Mapped) { + const DAddr current_dev_addr = dev_addr + offset; + rasterizer->ModifyGPUMemory(unique_identifier, current_gpu_addr, big_page_size, current_dev_addr); + } else { + rasterizer->ModifyGPUMemory(unique_identifier, current_gpu_addr, big_page_size, 0u); + } - rasterizer->ModifyGPUMemory(unique_identifier, current_gpu_addr, big_page_size); } if constexpr (entry_type == EntryType::Mapped) { const DAddr current_dev_addr = dev_addr + offset; diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index 481efbf53b..9b08f47ef3 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// 
SPDX-License-Identifier: GPL-3.0-or-later + + // SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later @@ -120,7 +123,7 @@ public: virtual void UnmapMemory(DAddr addr, u64 size) = 0; /// Remap GPU memory range. This means underneath backing memory changed - virtual void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) = 0; + virtual void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size, DAddr d_addr) = 0; /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory /// and invalidated diff --git a/src/video_core/renderer_null/null_rasterizer.cpp b/src/video_core/renderer_null/null_rasterizer.cpp index a5cda0f389..0f9d1a01d5 100644 --- a/src/video_core/renderer_null/null_rasterizer.cpp +++ b/src/video_core/renderer_null/null_rasterizer.cpp @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + // SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later @@ -62,7 +65,7 @@ VideoCore::RasterizerDownloadArea RasterizerNull::GetFlushArea(PAddr addr, u64 s } void RasterizerNull::InvalidateGPUCache() {} void RasterizerNull::UnmapMemory(DAddr addr, u64 size) {} -void RasterizerNull::ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) {} +void RasterizerNull::ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size, DAddr d_addr) {} void RasterizerNull::SignalFence(std::function<void()>&& func) { func(); } diff --git a/src/video_core/renderer_null/null_rasterizer.h b/src/video_core/renderer_null/null_rasterizer.h index c7f5849c75..905a22cef8 100644 --- a/src/video_core/renderer_null/null_rasterizer.h +++ b/src/video_core/renderer_null/null_rasterizer.h @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + // SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project // 
SPDX-License-Identifier: GPL-2.0-or-later @@ -58,7 +61,7 @@ public: VideoCore::RasterizerDownloadArea GetFlushArea(DAddr addr, u64 size) override; void InvalidateGPUCache() override; void UnmapMemory(DAddr addr, u64 size) override; - void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) override; + void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size, DAddr d_addr) override; void SignalFence(std::function<void()>&& func) override; void SyncOperation(std::function<void()>&& func) override; void SignalSyncPoint(u32 value) override; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 602509bfdb..7bfd00f7bd 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -600,10 +600,10 @@ void RasterizerOpenGL::UnmapMemory(DAddr addr, u64 size) { shader_cache.OnCacheInvalidation(addr, size); } -void RasterizerOpenGL::ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) { +void RasterizerOpenGL::ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size, DAddr d_addr) { { std::scoped_lock lock{texture_cache.mutex}; - texture_cache.UnmapGPUMemory(as_id, addr, size); + texture_cache.UnmapGPUMemory(as_id, addr, size, d_addr); } } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 6eae51ff7d..f41c64b002 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + // SPDX-FileCopyrightText: 2015 Citra Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later @@ -103,7 +106,7 @@ public: bool OnCPUWrite(PAddr addr, u64 size) override; void InvalidateGPUCache() override; void UnmapMemory(DAddr addr, u64 size) override; - void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) override; + void ModifyGPUMemory(size_t 
as_id, GPUVAddr addr, u64 size, DAddr d_addr) override; void SignalFence(std::function<void()>&& func) override; void SyncOperation(std::function<void()>&& func) override; void SignalSyncPoint(u32 value) override; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index a3957e4d9f..75254049a6 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -556,7 +556,7 @@ void TextureCacheRuntime::Finish() { glFinish(); } -StagingBufferMap TextureCacheRuntime::UploadStagingBuffer(size_t size) { +StagingBufferMap TextureCacheRuntime::UploadStagingBuffer(size_t size, bool deferred) { return staging_buffer_pool.RequestUploadBuffer(size); } @@ -651,7 +651,8 @@ void TextureCacheRuntime::BlitFramebuffer(Framebuffer* dst, Framebuffer* src, } void TextureCacheRuntime::AccelerateImageUpload(Image& image, const StagingBufferMap& map, - std::span<const VideoCommon::SwizzleParameters> swizzles) { + std::span<const VideoCommon::SwizzleParameters> swizzles, + u32 z_start, u32 z_count) { switch (image.info.type) { case ImageType::e2D: if (IsPixelFormatASTC(image.info.format)) { diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index d4165d8e4d..8e1d04e3fb 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + // SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later @@ -72,7 +75,7 @@ public: void Finish(); - StagingBufferMap UploadStagingBuffer(size_t size); + StagingBufferMap UploadStagingBuffer(size_t size, bool deferred = false); StagingBufferMap DownloadStagingBuffer(size_t size, bool deferred = false); @@ -116,7 +119,10 @@ public: void AccelerateImageUpload(Image& image, const 
StagingBufferMap& map, - std::span<const VideoCommon::SwizzleParameters> swizzles); + std::span<const VideoCommon::SwizzleParameters> swizzles, + u32 z_start, u32 z_count); + + void ClearImage(Image& image, u32 clear_value); void InsertUploadMemoryBarrier(); diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index 5b41dc225f..e419c3833a 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -24,6 +24,7 @@ #include "video_core/host_shaders/resolve_conditional_render_comp_spv.h" #include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" #include "video_core/host_shaders/vulkan_uint8_comp_spv.h" +#include "video_core/host_shaders/block_linear_unswizzle_3d_bcn_comp_spv.h" #include "video_core/renderer_vulkan/vk_compute_pass.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_scheduler.h" @@ -642,6 +643,235 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, scheduler.Finish(); } +constexpr u32 BL3D_BINDING_SWIZZLE_TABLE = 0; +constexpr u32 BL3D_BINDING_INPUT_BUFFER = 1; +constexpr u32 BL3D_BINDING_OUTPUT_BUFFER = 2; + +constexpr std::array<VkDescriptorSetLayoutBinding, 3> BL3D_DESCRIPTOR_SET_BINDINGS{{ + { + .binding = BL3D_BINDING_SWIZZLE_TABLE, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // swizzle_table[] + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, + { + .binding = BL3D_BINDING_INPUT_BUFFER, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // block-linear input + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, + { + .binding = BL3D_BINDING_OUTPUT_BUFFER, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, +}}; + +constexpr DescriptorBankInfo BL3D_BANK_INFO{ + .uniform_buffers = 0, + .storage_buffers = 3, 
.texture_buffers = 0, + .image_buffers = 0, + .textures = 0, + .images = 0, + .score = 3, +}; + +constexpr std::array<VkDescriptorUpdateTemplateEntry, 3> + BL3D_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{ + { + .dstBinding = BL3D_BINDING_SWIZZLE_TABLE, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .offset = BL3D_BINDING_SWIZZLE_TABLE * sizeof(DescriptorUpdateEntry), + .stride = sizeof(DescriptorUpdateEntry), + }, + { + .dstBinding = BL3D_BINDING_INPUT_BUFFER, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .offset = BL3D_BINDING_INPUT_BUFFER * sizeof(DescriptorUpdateEntry), + .stride = sizeof(DescriptorUpdateEntry), + }, + { + .dstBinding = BL3D_BINDING_OUTPUT_BUFFER, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .offset = BL3D_BINDING_OUTPUT_BUFFER * sizeof(DescriptorUpdateEntry), + .stride = sizeof(DescriptorUpdateEntry), + } + }}; + +struct alignas(16) BlockLinearUnswizzle3DPushConstants { + u32 blocks_dim[3]; // Offset 0 + u32 bytes_per_block_log2; // Offset 12 + + u32 origin[3]; // Offset 16 + u32 slice_size; // Offset 28 + + u32 block_size; // Offset 32 + u32 x_shift; // Offset 36 + u32 block_height; // Offset 40 + u32 block_height_mask; // Offset 44 + + u32 block_depth; // Offset 48 + u32 block_depth_mask; // Offset 52 + s32 _pad; // Offset 56 + + s32 destination[3]; // Offset 60 + s32 _pad_end; // Offset 72 +}; +static_assert(sizeof(BlockLinearUnswizzle3DPushConstants) <= 128); + +BlockLinearUnswizzle3DPass::BlockLinearUnswizzle3DPass( + const Device& device_, Scheduler& scheduler_, + DescriptorPool& descriptor_pool_, + StagingBufferPool& staging_buffer_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue_) + : ComputePass( + device_, descriptor_pool_, + BL3D_DESCRIPTOR_SET_BINDINGS, + BL3D_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY, + BL3D_BANK_INFO, + COMPUTE_PUSH_CONSTANT_RANGE, + BLOCK_LINEAR_UNSWIZZLE_3D_BCN_COMP_SPV), + 
scheduler{scheduler_}, + staging_buffer_pool{staging_buffer_pool_}, + compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {} + +BlockLinearUnswizzle3DPass::~BlockLinearUnswizzle3DPass() = default; + +// Unswizzles a range of Z slices of a BCn block-linear image with a compute dispatch, +// then copies the linear result into the destination image. +void BlockLinearUnswizzle3DPass::Unswizzle( + Image& image, + const StagingBufferRef& swizzled, + std::span<const VideoCommon::SwizzleParameters> swizzles, + u32 z_start, u32 z_count) +{ + using namespace VideoCommon::Accelerated; + + if (!image.has_compute_unswizzle_buffer) { + image.AllocateComputeUnswizzleBuffer(); + } + + ASSERT(swizzles.size() == 1); + const auto& sw = swizzles[0]; + const auto params = MakeBlockLinearSwizzle3DParams(sw, image.info); + + BlockLinearUnswizzle3DPushConstants pc{}; + pc.origin[0] = params.origin[0]; + pc.origin[1] = params.origin[1]; + pc.origin[2] = z_start; // Start at the current Z-slice + + pc.destination[0] = params.destination[0]; + pc.destination[1] = params.destination[1]; + pc.destination[2] = 0; // Shader writes to start of output buffer + + pc.bytes_per_block_log2 = params.bytes_per_block_log2; + pc.slice_size = params.slice_size; + pc.block_size = params.block_size; + pc.x_shift = params.x_shift; + pc.block_height = params.block_height; + pc.block_height_mask = params.block_height_mask; + pc.block_depth = params.block_depth; + pc.block_depth_mask = params.block_depth_mask; + + const u32 blocks_x = (image.info.size.width + 3) / 4; + const u32 blocks_y = (image.info.size.height + 3) / 4; + pc.blocks_dim[0] = blocks_x; + pc.blocks_dim[1] = blocks_y; + pc.blocks_dim[2] = z_count; // Only process the count + + compute_pass_descriptor_queue.Acquire(); + compute_pass_descriptor_queue.AddBuffer(*image.runtime->swizzle_table_buffer, 0, image.runtime->swizzle_table_size); + compute_pass_descriptor_queue.AddBuffer(swizzled.buffer, sw.buffer_offset + swizzled.offset, image.guest_size_bytes - sw.buffer_offset); + compute_pass_descriptor_queue.AddBuffer(*image.compute_unswizzle_buffer, 0, image.compute_unswizzle_buffer_size); + + 
const void* descriptor_data = compute_pass_descriptor_queue.UpdateData(); + const VkDescriptorSet set = descriptor_allocator.Commit(); + + const u32 gx = Common::DivCeil(blocks_x, 32u); + const u32 gy = Common::DivCeil(blocks_y, 8u); + const u32 gz = Common::DivCeil(z_count, 1u); + + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([this, &image, set, descriptor_data, pc, gx, gy, gz, z_start, z_count](vk::CommandBuffer cmdbuf) { + const VkBuffer out_buffer = *image.compute_unswizzle_buffer; + const VkImage dst_image = image.Handle(); + const VkImageAspectFlags aspect = image.AspectMask(); + + device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data); + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); + cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {}); + cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc); + cmdbuf.Dispatch(gx, gy, gz); + + // OPTIMIZATION: Combined barrier - merge buffer and image barriers when possible + const bool is_first = (z_start == 0); + + // Single barrier for compute -> transfer (buffer ready, image transition) + const VkBufferMemoryBarrier buffer_barrier{ + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, + .buffer = out_buffer, + .offset = 0, + .size = (VkDeviceSize)z_count * pc.slice_size, + }; + + const VkImageMemoryBarrier pre_barrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .srcAccessMask = is_first ? VkAccessFlags{} : static_cast(VK_ACCESS_SHADER_READ_BIT), + .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .oldLayout = is_first ? 
VK_IMAGE_LAYOUT_UNDEFINED : VK_IMAGE_LAYOUT_GENERAL, + .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + .image = dst_image, + .subresourceRange = {aspect, 0, 1, 0, 1}, + }; + + // Single barrier handles both buffer and image + cmdbuf.PipelineBarrier( + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_TRANSFER_BIT, + 0, + nullptr, buffer_barrier, pre_barrier + ); + + const VkBufferImageCopy copy{ + .bufferOffset = 0, + .imageSubresource = {aspect, 0, 0, 1}, + .imageOffset = {0, 0, (s32)z_start}, + .imageExtent = {image.info.size.width, image.info.size.height, z_count}, + }; + cmdbuf.CopyBufferToImage(out_buffer, dst_image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy); + + // Post-copy transition + const VkImageMemoryBarrier post_barrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, + .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .image = dst_image, + .subresourceRange = {aspect, 0, 1, 0, 1}, + }; + + cmdbuf.PipelineBarrier( + VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + 0, + nullptr, nullptr, post_barrier + ); + }); +} + MSAACopyPass::MSAACopyPass(const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, StagingBufferPool& staging_buffer_pool_, diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h index 7b8f938c1c..812504e540 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.h +++ b/src/video_core/renderer_vulkan/vk_compute_pass.h @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + // SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later @@ -131,6 +134,26 @@ private: MemoryAllocator& 
memory_allocator; }; +class BlockLinearUnswizzle3DPass final : public ComputePass { +public: + explicit BlockLinearUnswizzle3DPass(const Device& device_, Scheduler& scheduler_, + DescriptorPool& descriptor_pool_, + StagingBufferPool& staging_buffer_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue_); + ~BlockLinearUnswizzle3DPass(); + + void Unswizzle(Image& image, + const StagingBufferRef& swizzled, + std::span swizzles, + u32 z_start, u32 z_count); + +private: + Scheduler& scheduler; + StagingBufferPool& staging_buffer_pool; + ComputePassDescriptorQueue& compute_pass_descriptor_queue; +}; + + class MSAACopyPass final : public ComputePass { public: explicit MSAACopyPass(const Device& device_, Scheduler& scheduler_, diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 101a884fd7..521e19621d 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -748,10 +748,10 @@ void RasterizerVulkan::UnmapMemory(DAddr addr, u64 size) { pipeline_cache.OnCacheInvalidation(addr, size); } -void RasterizerVulkan::ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) { +void RasterizerVulkan::ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size, DAddr d_addr) { { std::scoped_lock lock{texture_cache.mutex}; - texture_cache.UnmapGPUMemory(as_id, addr, size); + texture_cache.UnmapGPUMemory(as_id, addr, size, d_addr); } } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index b689c6b660..f076cbc42b 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -108,7 +108,7 @@ public: bool OnCPUWrite(DAddr addr, u64 size) override; void InvalidateGPUCache() override; void UnmapMemory(DAddr addr, u64 size) override; - void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) override; + void ModifyGPUMemory(size_t as_id, GPUVAddr 
addr, u64 size, DAddr d_addr) override; void SignalFence(std::function&& func) override; void SyncOperation(std::function&& func) override; void SignalSyncPoint(u32 value) override; diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index 04684b5246..8eb232af03 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -43,6 +43,16 @@ Scheduler::Scheduler(const Device& device_, StateTracker& state_tracker_) : device{device_}, state_tracker{state_tracker_}, master_semaphore{std::make_unique(device)}, command_pool{std::make_unique(*master_semaphore, device)} { + + // PRE-OPTIMIZATION: Warm up the pool to prevent mid-frame spikes + { + std::scoped_lock rl{reserve_mutex}; + chunk_reserve.reserve(2048); // Prevent vector resizing + for (int i = 0; i < 1024; ++i) { + chunk_reserve.push_back(std::make_unique()); + } + } + AcquireNewChunk(); AllocateWorkerCommandBuffer(); worker_thread = std::jthread([this](std::stop_token token) { WorkerThread(token); }); diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index 5216a436c8..795d538faa 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -207,7 +207,7 @@ private: size_t command_offset = 0; bool submit = false; - alignas(std::max_align_t) std::array data{}; + alignas(std::max_align_t) std::array data{}; }; struct State { diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 64d2f6b586..b52f12a648 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -24,12 +24,14 @@ #include "video_core/renderer_vulkan/vk_render_pass_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" +#include 
"video_core/surface.h" #include "video_core/texture_cache/formatter.h" #include "video_core/texture_cache/samples_helper.h" #include "video_core/texture_cache/util.h" #include "video_core/vulkan_common/vulkan_device.h" #include "video_core/vulkan_common/vulkan_memory_allocator.h" #include "video_core/vulkan_common/vulkan_wrapper.h" +#include "video_core/textures/decoders.h" namespace Vulkan { @@ -878,14 +880,51 @@ TextureCacheRuntime::TextureCacheRuntime(const Device& device_, Scheduler& sched } } } + + bl3d_unswizzle_pass.emplace(device, scheduler, descriptor_pool, + staging_buffer_pool, compute_pass_descriptor_queue); + + // --- Create swizzle table buffer --- + { + auto table = Tegra::Texture::MakeSwizzleTable(); + + swizzle_table_size = static_cast(table.size() * sizeof(table[0])); + + auto staging = staging_buffer_pool.Request(swizzle_table_size, MemoryUsage::Upload); + std::memcpy(staging.mapped_span.data(), table.data(), static_cast(swizzle_table_size)); + + VkBufferCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .size = swizzle_table_size, + .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_TRANSFER_DST_BIT | + VK_BUFFER_USAGE_TRANSFER_SRC_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + }; + swizzle_table_buffer = memory_allocator.CreateBuffer(ci, MemoryUsage::DeviceLocal); + + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([staging_buf = staging.buffer, + dst_buf = *swizzle_table_buffer, + size = swizzle_table_size, + src_off = staging.offset](vk::CommandBuffer cmdbuf) { + + const VkBufferCopy region{ + .srcOffset = src_off, + .dstOffset = 0, + .size = size, + }; + cmdbuf.CopyBuffer(staging_buf, dst_buf, region); + }); + } } void TextureCacheRuntime::Finish() { scheduler.Finish(); } -StagingBufferRef TextureCacheRuntime::UploadStagingBuffer(size_t size) { - return staging_buffer_pool.Request(size, MemoryUsage::Upload); +StagingBufferRef TextureCacheRuntime::UploadStagingBuffer(size_t size, bool 
deferred) { + return staging_buffer_pool.Request(size, MemoryUsage::Upload, deferred); } StagingBufferRef TextureCacheRuntime::DownloadStagingBuffer(size_t size, bool deferred) { @@ -1581,6 +1620,46 @@ Image::Image(const VideoCommon::NullImageParams& params) : VideoCommon::ImageBas Image::~Image() = default; +void Image::AllocateComputeUnswizzleBuffer() { + if (has_compute_unswizzle_buffer) + return; + + using VideoCore::Surface::BytesPerBlock; + + const u32 block_bytes = BytesPerBlock(info.format); // 8 for BC1, 16 for BC6H + const u32 block_width = 4; + const u32 block_height = 4; + + // BCn is 4x4x1 blocks + const u32 blocks_x = (info.size.width + block_width - 1) / block_width; + const u32 blocks_y = (info.size.height + block_height - 1) / block_height; + const u32 blocks_z = info.size.depth; + + const u64 block_count = + static_cast(blocks_x) * + static_cast(blocks_y) * + static_cast(blocks_z); + + compute_unswizzle_buffer_size = block_count * block_bytes; + + VkBufferCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = compute_unswizzle_buffer_size, + .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_TRANSFER_SRC_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }; + + compute_unswizzle_buffer = + runtime->memory_allocator.CreateBuffer(ci, MemoryUsage::DeviceLocal); + + has_compute_unswizzle_buffer = true; +} + void Image::UploadMemory(VkBuffer buffer, VkDeviceSize offset, std::span copies) { // TODO: Move this to another API @@ -2397,10 +2476,22 @@ void Framebuffer::CreateFramebuffer(TextureCacheRuntime& runtime, void TextureCacheRuntime::AccelerateImageUpload( Image& image, const StagingBufferRef& map, - std::span swizzles) { + std::span swizzles, + u32 z_start, u32 z_count) { + if (IsPixelFormatASTC(image.info.format)) { return astc_decoder_pass->Assemble(image, map, swizzles); } + + if (bl3d_unswizzle_pass && + 
IsPixelFormatBCn(image.info.format) && + image.info.type == ImageType::e3D && + image.info.resources.levels == 1 && + image.info.resources.layers == 1) { + + return bl3d_unswizzle_pass->Unswizzle(image, map, swizzles, z_start, z_count); + } + ASSERT(false); } diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index 570a3cb335..f692a78c3d 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -51,7 +51,7 @@ public: void Finish(); - StagingBufferRef UploadStagingBuffer(size_t size); + StagingBufferRef UploadStagingBuffer(size_t size, bool deferred = false); StagingBufferRef DownloadStagingBuffer(size_t size, bool deferred = false); @@ -91,7 +91,8 @@ public: } void AccelerateImageUpload(Image&, const StagingBufferRef&, - std::span); + std::span, + u32 z_start, u32 z_count); void InsertUploadMemoryBarrier() {} @@ -127,6 +128,11 @@ public: BlitImageHelper& blit_image_helper; RenderPassCache& render_pass_cache; std::optional astc_decoder_pass; + + std::optional bl3d_unswizzle_pass; + vk::Buffer swizzle_table_buffer; + VkDeviceSize swizzle_table_size = 0; + std::unique_ptr msaa_copy_pass; const Settings::ResolutionScalingInfo& resolution; std::array, VideoCore::Surface::MaxPixelFormat> view_formats; @@ -163,6 +169,8 @@ public: void DownloadMemory(const StagingBufferRef& map, std::span copies); + + void AllocateComputeUnswizzleImage(); [[nodiscard]] VkImage Handle() const noexcept { return *(this->*current_image); @@ -188,6 +196,8 @@ public: bool ScaleUp(bool ignore = false); bool ScaleDown(bool ignore = false); + + friend class BlockLinearUnswizzle3DPass; private: bool BlitScaleHelper(bool scale_up); @@ -199,6 +209,12 @@ private: vk::Image original_image; vk::Image scaled_image; + + vk::Buffer compute_unswizzle_buffer; + VkDeviceSize compute_unswizzle_buffer_size = 0; + bool has_compute_unswizzle_buffer = false; + + void 
AllocateComputeUnswizzleBuffer(); // Use a pointer to field because it is relative, so that the object can be // moved without breaking the reference. diff --git a/src/video_core/texture_cache/image_base.h b/src/video_core/texture_cache/image_base.h index 0587d7b724..78b49e8610 100644 --- a/src/video_core/texture_cache/image_base.h +++ b/src/video_core/texture_cache/image_base.h @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + // SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later @@ -52,6 +55,13 @@ struct AliasedImage { ImageId id; }; +struct SparseBinding { + GPUVAddr gpu_addr; // Virtual GPU address of this tile + DAddr device_addr; // Physical device memory address + u64 tile_index; // Linear tile index in the texture + Extent3D tile_coord; // 3D coordinate of this tile +}; + struct NullImageParams {}; struct ImageBase { @@ -115,6 +125,10 @@ struct ImageBase { std::vector aliased_images; std::vector overlapping_images; ImageMapId map_view_id{}; + + boost::container::small_vector dirty_offsets; + std::unordered_map sparse_bindings; + u32 sparse_tile_size = 65536; }; struct ImageMapView { diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index c580fb10ef..263a9c0630 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -22,6 +23,7 @@ #include "video_core/texture_cache/samples_helper.h" #include "video_core/texture_cache/texture_cache_base.h" #include "video_core/texture_cache/util.h" +#include "video_core/textures/decoders.h" namespace VideoCommon { @@ -160,6 +162,7 @@ void TextureCache

::TickFrame() { sentenced_framebuffers.Tick(); sentenced_image_view.Tick(); TickAsyncDecode(); + TickAsyncUnswizzle(); runtime.TickFrame(); ++frame_tick; @@ -615,7 +618,36 @@ void TextureCache

::UnmapMemory(DAddr cpu_addr, size_t size) { } template -void TextureCache

::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t size) { +std::optional TextureCache

::CalculateSparseBinding( + const Image& image, GPUVAddr gpu_addr, DAddr dev_addr) { + + if (!image.info.is_sparse) { + return std::nullopt; + } + + const u64 offset = gpu_addr - image.gpu_addr; + const u64 tile_index = offset / image.sparse_tile_size; + + const u32 tile_width_blocks = 128; + const u32 tile_height_blocks = 32; + + const u32 width_in_tiles = (image.info.size.width / 4 + tile_width_blocks - 1) / tile_width_blocks; + const u32 height_in_tiles = (image.info.size.height / 4 + tile_height_blocks - 1) / tile_height_blocks; + + const u32 tile_x = static_cast((tile_index % width_in_tiles) * tile_width_blocks * 4); + const u32 tile_y = static_cast(((tile_index / width_in_tiles) % height_in_tiles) * tile_height_blocks * 4); + const u32 tile_z = static_cast(tile_index / (width_in_tiles * height_in_tiles)); + + return SparseBinding{ + .gpu_addr = gpu_addr, + .device_addr = dev_addr, + .tile_index = tile_index, + .tile_coord = {tile_x, tile_y, tile_z} + }; +} + +template +void TextureCache

::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t size, DAddr dev_addr) { boost::container::small_vector deleted_images; ForEachImageInRegionGPU(as_id, gpu_addr, size, [&](ImageId id, Image&) { deleted_images.push_back(id); }); @@ -627,11 +659,19 @@ void TextureCache

::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t siz UntrackImage(image, id); } } - if (True(image.flags & ImageFlagBits::Remapped)) { continue; } image.flags |= ImageFlagBits::Remapped; + + if (image.info.is_sparse && dev_addr != 0) { + // Calculate and store the binding + auto binding = CalculateSparseBinding(image, gpu_addr, dev_addr); + if (binding) { + image.sparse_bindings[gpu_addr] = *binding; + image.dirty_offsets.push_back(binding->tile_index); + } + } } } @@ -1055,9 +1095,29 @@ void TextureCache

::RefreshContents(Image& image, ImageId image_id) { // Only upload modified images return; } + image.flags &= ~ImageFlagBits::CpuModified; TrackImage(image, image_id); + // If it's sparse and remapped, we treat it as a partial update trigger + if (image.info.is_sparse && True(image.flags & ImageFlagBits::Remapped)) { + image.flags &= ~ImageFlagBits::Remapped; + + if (!image.dirty_offsets.empty() && !image.sparse_bindings.empty()) { + /*constexpr u64 page_size = 64_KiB; + size_t dirty_size = image.dirty_offsets.size() * page_size; + + auto staging = runtime.UploadStagingBuffer(dirty_size); + UploadSparseDirtyTiles(image, staging); + runtime.InsertUploadMemoryBarrier(); + + return;*/ + image.dirty_offsets.clear(); + image.sparse_bindings.clear(); + return; + } + } + if (image.info.num_samples > 1 && !runtime.CanUploadMSAA()) { LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented"); runtime.TransitionImageLayout(image); @@ -1067,11 +1127,102 @@ void TextureCache

::RefreshContents(Image& image, ImageId image_id) { QueueAsyncDecode(image, image_id); return; } + if (IsPixelFormatBCn(image.info.format) && + image.info.type == ImageType::e3D && + image.info.resources.levels == 1 && + image.info.resources.layers == 1 && + MapSizeBytes(image) >= 32_MiB && + False(image.flags & ImageFlagBits::GpuModified)) { + + QueueAsyncUnswizzle(image, image_id); + return; + } auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image)); UploadImageContents(image, staging); runtime.InsertUploadMemoryBarrier(); } +template +template +void TextureCache

::UploadSparseDirtyTiles(Image& image, StagingBuffer& staging) { + using namespace VideoCommon; + using namespace Tegra::Texture; + + std::vector all_copies; + size_t total_upload_size = 0; + + for (u64 dirty_tile_index : image.dirty_offsets) { + SparseBinding* binding = nullptr; + for (auto& [addr, bind] : image.sparse_bindings) { + if (bind.tile_index == dirty_tile_index) { + binding = &bind; + break; + } + } + + if (!binding) { + continue; + } + + const auto& coord = binding->tile_coord; + + // Calculate tile dimensions + const u32 tile_width_blocks = 128; + const u32 tile_height_blocks = 32; + const u32 tile_width = std::min(tile_width_blocks * 4, image.info.size.width - coord.width); + const u32 tile_height = std::min(tile_height_blocks * 4, image.info.size.height - coord.height); + const u32 tile_depth = std::min(1u, image.info.size.depth - coord.depth); + + const u32 bytes_per_block = BytesPerBlock(image.info.format); + const u32 blocks_wide = (tile_width + 3) / 4; + const u32 blocks_high = (tile_height + 3) / 4; + const size_t tile_unswizzled_size = blocks_wide * blocks_high * tile_depth * bytes_per_block; + + if (total_upload_size + tile_unswizzled_size > staging.mapped_span.size()) { + LOG_ERROR(HW_GPU, "Staging buffer too small"); + break; + } + + std::array tile_swizzled_data; + gpu_memory->ReadBlockUnsafe(binding->gpu_addr, tile_swizzled_data.data(), image.sparse_tile_size); + + // Get output span + auto tile_output = staging.mapped_span.subspan(total_upload_size, tile_unswizzled_size); + + // Unswizzle the tile + auto result = UnswizzleSparseTextureTile(tile_output, tile_swizzled_data, + image.info, tile_width, tile_height, tile_depth); + + // Create the copy descriptor + BufferImageCopy copy{ + .buffer_offset = total_upload_size, + .buffer_size = tile_unswizzled_size, + .buffer_row_length = result.buffer_row_length, + .buffer_image_height = result.buffer_image_height, + .image_subresource = { + .base_level = 0, + .base_layer = 0, + .num_layers = 1, + 
}, + .image_offset = { + static_cast(coord.width), + static_cast(coord.height), + static_cast(coord.depth) + }, + .image_extent = {tile_width, tile_height, tile_depth} + }; + + all_copies.push_back(copy); + total_upload_size += tile_unswizzled_size; + } + + if (!all_copies.empty()) { + image.UploadMemory(staging, all_copies); + } + + image.dirty_offsets.clear(); +} + template template void TextureCache

::UploadImageContents(Image& image, StagingBuffer& staging) { @@ -1082,7 +1233,7 @@ void TextureCache

::UploadImageContents(Image& image, StagingBuffer& staging) gpu_memory->ReadBlock(gpu_addr, mapped_span.data(), mapped_span.size_bytes(), VideoCommon::CacheType::NoTextureCache); const auto uploads = FullUploadSwizzles(image.info); - runtime.AccelerateImageUpload(image, staging, FixSmallVectorADL(uploads)); + runtime.AccelerateImageUpload(image, staging, FixSmallVectorADL(uploads), 0, 0); return; } @@ -1311,6 +1462,20 @@ void TextureCache

::QueueAsyncDecode(Image& image, ImageId image_id) { texture_decode_worker.QueueWork(std::move(func)); } +template +void TextureCache

::QueueAsyncUnswizzle(Image& image, ImageId image_id) { + if (True(image.flags & ImageFlagBits::IsDecoding)) { + return; + } + + image.flags |= ImageFlagBits::IsDecoding; + + unswizzle_queue.push_back({ + .image_id = image_id, + .info = image.info + }); +} + template void TextureCache

::TickAsyncDecode() { bool has_uploads{}; @@ -1336,6 +1501,90 @@ void TextureCache

::TickAsyncDecode() { } } +template +void TextureCache

::TickAsyncUnswizzle() { + if (unswizzle_queue.empty()) { + current_unswizzle_frame = 0; + return; + } + + // Don't process every frame - allow more data to accumulate + if (current_unswizzle_frame++ < 2) return; + + PendingUnswizzle& task = unswizzle_queue.front(); + Image& image = slot_images[task.image_id]; + + if (!task.initialized) { + task.total_size = MapSizeBytes(image); + task.staging_buffer = runtime.UploadStagingBuffer(task.total_size, true); + + const auto& info = image.info; + const u32 bytes_per_block = BytesPerBlock(info.format); + const u32 width_blocks = Common::DivCeil(info.size.width, 4u); + const u32 height_blocks = Common::DivCeil(info.size.height, 4u); + + const u32 stride = Common::AlignUp(width_blocks * bytes_per_block, 64u); + const u32 aligned_height = Common::AlignUp(height_blocks, 8u << task.info.block.height); + + task.bytes_per_slice = static_cast(stride) * aligned_height; + task.last_submitted_offset = 0; + task.initialized = true; + } + + // ToDo: Make these configurable + const size_t CHUNK_SIZE = 64_MiB; + const u32 SLICES_PER_BATCH = 512; + + static std::vector temp_buffer; + if (temp_buffer.size() < CHUNK_SIZE) { + temp_buffer.resize(CHUNK_SIZE); + } + + // Read data + if (task.current_offset < task.total_size) { + const size_t remaining = task.total_size - task.current_offset; + const size_t copy_amount = std::min(CHUNK_SIZE, remaining); + + gpu_memory->ReadBlock(image.gpu_addr + task.current_offset, + task.staging_buffer.mapped_span.data() + task.current_offset, + copy_amount, + VideoCommon::CacheType::NoTextureCache); + task.current_offset += copy_amount; + } + + const size_t batch_threshold = task.bytes_per_slice * SLICES_PER_BATCH; + size_t ready_to_submit = task.current_offset - task.last_submitted_offset; + + const bool is_final_batch = task.current_offset >= task.total_size; + const bool should_submit = ready_to_submit >= batch_threshold || + (is_final_batch && task.last_submitted_offset < task.total_size); + + if 
(should_submit) { + const u32 z_start = static_cast(task.last_submitted_offset / task.bytes_per_slice); + const u32 total_depth = image.info.size.depth; + + u32 z_count = static_cast(ready_to_submit / task.bytes_per_slice); + if (z_start + z_count > total_depth) { + z_count = total_depth - z_start; + } + + if (z_count > 0) { + const auto uploads = FullUploadSwizzles(task.info); + runtime.AccelerateImageUpload(image, task.staging_buffer, FixSmallVectorADL(uploads), z_start, z_count); + task.last_submitted_offset += (static_cast(z_count) * task.bytes_per_slice); + } + } + + // Check if complete + if (task.current_offset >= task.total_size && + task.last_submitted_offset >= (task.total_size - (task.total_size % task.bytes_per_slice))) { + runtime.FreeDeferredStagingBuffer(task.staging_buffer); + image.flags &= ~ImageFlagBits::IsDecoding; + unswizzle_queue.pop_front(); + current_unswizzle_frame = 0; + } +} + template bool TextureCache

::ScaleUp(Image& image) { const bool has_copy = image.HasScaled(); @@ -2423,6 +2672,7 @@ void TextureCache

::SynchronizeAliases(ImageId image_id) { template void TextureCache

::PrepareImage(ImageId image_id, bool is_modification, bool invalidate) { Image& image = slot_images[image_id]; + runtime.TransitionImageLayout(image); if (invalidate) { image.flags &= ~(ImageFlagBits::CpuModified | ImageFlagBits::GpuModified); if (False(image.flags & ImageFlagBits::Tracked)) { diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index 5146a8c291..123bfca6ce 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -129,6 +129,17 @@ class TextureCache : public VideoCommon::ChannelSetupCaches CalculateSparseBinding( + const Image& image, GPUVAddr gpu_addr, DAddr dev_addr); /// Blit an image with the given parameters bool BlitImage(const Tegra::Engines::Fermi2D::Surface& dst, @@ -312,6 +327,10 @@ private: /// Refresh the contents (pixel data) of an image void RefreshContents(Image& image, ImageId image_id); + + /// Sparse texture partial upload + template + void UploadSparseDirtyTiles(Image& image, StagingBuffer& staging); /// Upload data from guest to an image template @@ -433,6 +452,9 @@ private: void TrimInactiveSamplers(size_t budget); std::optional QuerySamplerBudget() const; + void QueueAsyncUnswizzle(Image& image, ImageId image_id); + void TickAsyncUnswizzle(); + Runtime& runtime; Tegra::MaxwellDeviceMemoryManager& device_memory; @@ -508,6 +530,9 @@ private: Common::ThreadWorker texture_decode_worker{1, "TextureDecoder"}; std::vector> async_decodes; + std::deque unswizzle_queue; + u8 current_unswizzle_frame; + // Join caching boost::container::small_vector join_overlap_ids; std::unordered_set join_overlaps_found; diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp index e55d0752ec..03c0c589c1 100644 --- a/src/video_core/texture_cache/util.cpp +++ b/src/video_core/texture_cache/util.cpp @@ -55,6 +55,7 @@ using Tegra::Texture::TextureFormat; using Tegra::Texture::TextureType; using 
Tegra::Texture::TICEntry; using Tegra::Texture::UnswizzleTexture; +using Tegra::Texture::UnswizzleSubrect; using VideoCore::Surface::BytesPerBlock; using VideoCore::Surface::DefaultBlockHeight; using VideoCore::Surface::DefaultBlockWidth; @@ -922,6 +923,46 @@ boost::container::small_vector UnswizzleImage(Tegra::Memory return copies; } +SparseTileUnswizzleResult UnswizzleSparseTextureTile(std::span output, + std::span input, + const ImageInfo& info, + u32 tile_width, + u32 tile_height, + u32 tile_depth) { + const Extent2D block_size = DefaultBlockSize(info.format); + const u32 bpp = BytesPerBlock(info.format); + const u32 width_blocks = (tile_width + block_size.width - 1) / block_size.width; + const u32 height_blocks = (tile_height + block_size.height - 1) / block_size.height; + + // Calculate GOBs per row + const u32 bytes_per_row = width_blocks * bpp; + const u32 gobs_per_row = (bytes_per_row + 63) / 64; + + // Calculate block_height for 64KB tiles + // 64KB / (gobs_per_row × 512 bytes) = GOBs tall + constexpr u32 TILE_SIZE = 65536; + const u32 gobs_tall = TILE_SIZE / (gobs_per_row * 512); + + // block_height = log2(gobs_tall) + const u32 tile_block_height = std::countr_zero(gobs_tall); + + const u32 pitch_linear = width_blocks * bpp; + + UnswizzleSubrect( + output, input, bpp, + width_blocks, height_blocks, tile_depth, + 0, 0, + width_blocks, height_blocks, + tile_block_height, 0, + pitch_linear + ); + + return { + .buffer_row_length = Common::AlignUp(tile_width, block_size.width), + .buffer_image_height = Common::AlignUp(tile_height, block_size.height) + }; +} + void ConvertImage(std::span input, const ImageInfo& info, std::span output, std::span copies) { u32 output_offset = 0; diff --git a/src/video_core/texture_cache/util.h b/src/video_core/texture_cache/util.h index 3e8bb00032..dbf44d73c3 100644 --- a/src/video_core/texture_cache/util.h +++ b/src/video_core/texture_cache/util.h @@ -30,6 +30,11 @@ struct OverlapResult { SubresourceExtent resources; }; +struct 
SparseTileUnswizzleResult { + u32 buffer_row_length; + u32 buffer_image_height; +}; + [[nodiscard]] u32 CalculateGuestSizeInBytes(const ImageInfo& info) noexcept; [[nodiscard]] u32 CalculateUnswizzledSizeBytes(const ImageInfo& info) noexcept; @@ -68,7 +73,14 @@ struct OverlapResult { [[nodiscard]] boost::container::small_vector UnswizzleImage( Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const ImageInfo& info, std::span input, std::span output); - + +[[nodiscard]] SparseTileUnswizzleResult UnswizzleSparseTextureTile(std::span output, + std::span input, + const ImageInfo& info, + u32 tile_width, + u32 tile_height, + u32 tile_depth); + void ConvertImage(std::span input, const ImageInfo& info, std::span output, std::span copies);