From c81921d37ec8a56d88130fb07d3ff7b64ec87265 Mon Sep 17 00:00:00 2001 From: Forrest Mark X Date: Fri, 19 Jun 2026 23:33:56 -0500 Subject: [PATCH] Process sparse textures better by finding pages that have data --- .../renderer_opengl/gl_texture_cache.cpp | 2 +- .../renderer_opengl/gl_texture_cache.h | 4 +- .../renderer_vulkan/vk_compute_pass.cpp | 135 +++++++++++++++++- .../renderer_vulkan/vk_compute_pass.h | 9 +- .../renderer_vulkan/vk_texture_cache.cpp | 17 ++- .../renderer_vulkan/vk_texture_cache.h | 4 +- src/video_core/texture_cache/texture_cache.h | 89 ++++++++++-- .../texture_cache/texture_cache_base.h | 3 + 8 files changed, 232 insertions(+), 31 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 958988f27e..f2093d2f4e 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -652,7 +652,7 @@ void TextureCacheRuntime::BlitFramebuffer(Framebuffer* dst, Framebuffer* src, void TextureCacheRuntime::AccelerateImageUpload(Image& image, const StagingBufferMap& map, std::span swizzles, - u32 z_start, u32 z_count) { + u32 z_start, u32 z_count, std::span slice_has_data) { switch (image.info.type) { case ImageType::e2D: if (IsPixelFormatASTC(image.info.format)) { diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index dfcef4b0b6..0521966b93 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -118,9 +118,9 @@ public: const Region2D& src_region, Tegra::Engines::Fermi2D::Filter filter, Tegra::Engines::Fermi2D::Operation operation); - void AccelerateImageUpload(Image& image, const StagingBufferMap& map, + void AccelerateImageUpload(Image &image, const StagingBufferMap &map, std::span swizzles, - u32 z_start, u32 z_count); + u32 z_start, u32 z_count, std::span slice_has_data={}); void InsertUploadMemoryBarrier(); diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index c24b7a5757..5fd1912577 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -763,31 +763,59 @@ void BlockLinearUnswizzle3DPass::Unswizzle( Image& image, const StagingBufferRef& swizzled, std::span swizzles, - u32 z_start, u32 z_count) + u32 z_start, u32 z_count, + std::span slice_has_data) { using namespace VideoCommon::Accelerated; const u32 MAX_BATCH_SLICES = (std::min)(z_count, image.info.size.depth); - if (!image.has_compute_unswizzle_buffer) { + if (image.has_compute_unswizzle_buffer) { // Allocate exactly what this batch needs - image.AllocateComputeUnswizzleBuffer(MAX_BATCH_SLICES); + using VideoCore::Surface::BytesPerBlock; + const u32 bx = (image.info.size.width + 3) / 4; + const u32 by = (image.info.size.height + 3) / 4; + const VkDeviceSize needed = + static_cast(bx) * by * MAX_BATCH_SLICES * + BytesPerBlock(image.info.format); + if (image.compute_unswizzle_buffer_size < needed) { + scheduler.Finish(); + } } + image.AllocateComputeUnswizzleBuffer(MAX_BATCH_SLICES); + ASSERT(swizzles.size() == 1); const auto& sw = swizzles[0]; const auto params = MakeBlockLinearSwizzle3DParams(sw, image.info); const u32 blocks_x = (image.info.size.width + 3) / 4; const u32 blocks_y = (image.info.size.height + 3) / 4; + const u32 bytes_per_block = 1u << params.bytes_per_block_log2; scheduler.RequestOutsideRenderPassOperationContext(); for (u32 z_offset = 0; z_offset < z_count; z_offset += MAX_BATCH_SLICES) { const u32 current_chunk_slices = (std::min)(MAX_BATCH_SLICES, z_count - z_offset); const u32 current_z_start = z_start + z_offset; - UnswizzleChunk(image, swizzled, sw, params, blocks_x, blocks_y, - current_z_start, current_chunk_slices); + bool chunk_has_data = slice_has_data.empty(); + if (!chunk_has_data) { + const u32 z_end = current_z_start + current_chunk_slices; + for (u32 z = current_z_start; z < z_end; ++z) { + if (z < static_cast(slice_has_data.size()) && slice_has_data[z] != 0) { + chunk_has_data = true; + break; + } + } + } + + if (chunk_has_data) { + UnswizzleChunk(image, swizzled, sw, params, blocks_x, blocks_y, + current_z_start, current_chunk_slices); + } else { + UnswizzleZeroChunk(image, blocks_x, blocks_y, bytes_per_block, + current_z_start, current_chunk_slices); + } } } @@ -937,6 +965,103 @@ void BlockLinearUnswizzle3DPass::UnswizzleChunk( }); } +// I feel there was a better way to do this like a image.Clear or something but I couldn't find anything or am blind. +// So enjoy this mess +void BlockLinearUnswizzle3DPass::UnswizzleZeroChunk( + Image& image, + u32 blocks_x, u32 blocks_y, + u32 bytes_per_block, + u32 z_start, u32 z_count) +{ + ASSERT(image.has_compute_unswizzle_buffer); + + const VkBuffer out_buffer = *image.compute_unswizzle_buffer; + const VkImage dst_image = image.Handle(); + const VkImageAspectFlags aspect = image.AspectMask(); + const u32 image_width = image.info.size.width; + const u32 image_height = image.info.size.height; + const bool is_first_chunk = (z_start == 0); + + // Size of one unswizzled z-slice in the output buffer (bytes). + const VkDeviceSize output_slice_bytes = + static_cast(blocks_x) * blocks_y * bytes_per_block; + const VkDeviceSize fill_size = output_slice_bytes * z_count; + + scheduler.Record([out_buffer, dst_image, aspect, z_start, z_count, + fill_size, is_first_chunk, image_width, image_height + ](vk::CommandBuffer cmdbuf) { + + if (dst_image == VK_NULL_HANDLE || out_buffer == VK_NULL_HANDLE) { + return; + } + + // Zero the output buffer region that CopyBufferToImage will read. + cmdbuf.FillBuffer(out_buffer, 0, fill_size, 0u); + + const VkBufferMemoryBarrier buffer_barrier{ + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = out_buffer, + .offset = 0, + .size = fill_size, + }; + const VkImageMemoryBarrier pre_barrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = is_first_chunk ? VkAccessFlags{} + : static_cast(VK_ACCESS_TRANSFER_WRITE_BIT), + .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .oldLayout = is_first_chunk ? VK_IMAGE_LAYOUT_UNDEFINED + : VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = dst_image, + .subresourceRange = {aspect, 0, 1, 0, 1}, + }; + cmdbuf.PipelineBarrier( + VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_TRANSFER_BIT, + 0, nullptr, buffer_barrier, pre_barrier + ); + + // Copy the zeroed buffer region into the correct Z position of the image. + const VkBufferImageCopy copy{ + .bufferOffset = 0, + .bufferRowLength = 0, + .bufferImageHeight = 0, + .imageSubresource = {aspect, 0, 0, 1}, + .imageOffset = {0, 0, static_cast(z_start)}, + .imageExtent = {image_width, image_height, z_count}, + }; + cmdbuf.CopyBufferToImage(out_buffer, dst_image, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy); + + // Transition image to GENERAL for subsequent shader reads/writes. + const VkImageMemoryBarrier post_barrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, + .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = dst_image, + .subresourceRange = {aspect, 0, 1, 0, 1}, + }; + cmdbuf.PipelineBarrier( + VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + 0, nullptr, nullptr, post_barrier + ); + }); +} + MSAACopyPass::MSAACopyPass(const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, StagingBufferPool& staging_buffer_pool_, diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h index cb213dae7e..eb63923180 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.h +++ b/src/video_core/renderer_vulkan/vk_compute_pass.h @@ -148,7 +148,8 @@ public: void Unswizzle(Image& image, const StagingBufferRef& swizzled, std::span swizzles, - u32 z_start, u32 z_count); + u32 z_start, u32 z_count, + std::span slice_has_data = {}); void UnswizzleChunk( Image& image, @@ -158,6 +159,12 @@ public: u32 blocks_x, u32 blocks_y, u32 z_start, u32 z_count); + void UnswizzleZeroChunk( + Image& image, + u32 blocks_x, u32 blocks_y, + u32 bytes_per_block, + u32 z_start, u32 z_count); + private: Scheduler& scheduler; StagingBufferPool& staging_buffer_pool; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 7cd6aa3e7c..8a4203385b 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -1635,9 +1635,6 @@ Image::Image(const VideoCommon::NullImageParams& params) : VideoCommon::ImageBas Image::~Image() = default; void Image::AllocateComputeUnswizzleBuffer(u32 max_slices) { - if (has_compute_unswizzle_buffer) - return; - using VideoCore::Surface::BytesPerBlock; const u32 block_bytes = BytesPerBlock(info.format); // 8 for BC1, 16 for BC6H @@ -1654,7 +1651,13 @@ void Image::AllocateComputeUnswizzleBuffer(u32 max_slices) { static_cast(blocks_y) * static_cast(blocks_z); - compute_unswizzle_buffer_size = block_count * block_bytes; + const VkDeviceSize needed_size = block_count * block_bytes; + + if (has_compute_unswizzle_buffer && compute_unswizzle_buffer_size >= needed_size) { + return; + } + + compute_unswizzle_buffer_size = needed_size; VkBufferCreateInfo ci{ .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, @@ -2475,8 +2478,8 @@ void Framebuffer::CreateFramebuffer(TextureCacheRuntime& runtime, void TextureCacheRuntime::AccelerateImageUpload( Image& image, const StagingBufferRef& map, - std::span swizzles, - u32 z_start, u32 z_count) { + std::span swizzles, u32 z_start, u32 z_count, + std::span slice_has_data) { if (IsPixelFormatASTC(image.info.format)) { return astc_decoder_pass->Assemble(image, map, swizzles); @@ -2496,7 +2499,7 @@ void TextureCacheRuntime::AccelerateImageUpload( image.info.resources.levels == 1 && image.info.resources.layers == 1) { - return bl3d_unswizzle_pass->Unswizzle(image, map, swizzles, z_start, z_count); + return bl3d_unswizzle_pass->Unswizzle(image, map, swizzles, z_start, z_count, slice_has_data); } ASSERT(false); diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index 4bb9687ab0..368c7e1ea9 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -90,9 +90,9 @@ public: return msaa_copy_pass.operator bool(); } - void AccelerateImageUpload(Image&, const StagingBufferRef&, + void AccelerateImageUpload(Image &, const StagingBufferRef &, std::span, - u32 z_start, u32 z_count); + u32 z_start, u32 z_count, std::span slice_has_data={}); void InsertUploadMemoryBarrier() {} diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 3c7c630ee1..4ec0bf4cbf 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -24,6 +24,7 @@ #include "video_core/texture_cache/samples_helper.h" #include "video_core/texture_cache/texture_cache_base.h" #include "video_core/texture_cache/util.h" +#include "video_core/texture_cache/accelerated_swizzle.h" #include "video_core/textures/decoders.h" namespace VideoCommon { @@ -1479,6 +1480,40 @@ void TextureCache

::TickAsyncUnswizzle() { const u32 aligned_height = height_blocks; task.bytes_per_slice = static_cast(stride) * aligned_height; task.last_submitted_offset = 0; + + task.is_sparse = True(image.flags & ImageFlagBits::Sparse); + if (task.is_sparse) { + std::memset(task.staging_buffer.mapped_span.data(), 0, task.total_size); + + const auto segs = + gpu_memory->GetSubmappedRange(image.gpu_addr, image.guest_size_bytes); + task.sparse_segments.assign(segs.begin(), segs.end()); + + task.slice_has_data.assign(image.info.size.depth, 0u); + + if (image.info.size.depth > 1 && !image.slice_offsets.empty()) { + const auto uploads = FullUploadSwizzles(task.info); + const auto sp = Accelerated::MakeBlockLinearSwizzle3DParams( + uploads[0], task.info); + const u64 swizzled_slice_size = sp.slice_size; + + for (const auto& [seg_gpu_addr, seg_size] : task.sparse_segments) { + const u64 seg_start = seg_gpu_addr - image.gpu_addr; + const u64 seg_end = seg_start + seg_size; + for (u32 z = 0; z < static_cast(image.info.size.depth); ++z) { + if (task.slice_has_data[z]) continue; // already marked, skip + const u64 slice_start = image.slice_offsets[z]; + const u64 slice_end = slice_start + swizzled_slice_size; + if (slice_start < seg_end && slice_end > seg_start) { + task.slice_has_data[z] = 1u; + } + } + } + } else { + std::fill(task.slice_has_data.begin(), task.slice_has_data.end(), 1u); + } + } + task.initialized = true; } @@ -1494,14 +1529,31 @@ void TextureCache

::TickAsyncUnswizzle() { copy_amount = (std::min)(dynamic_chunk, remaining); } - if (remaining > swizzle_chunk_size) { + if (swizzle_chunk_size > 0 && remaining > swizzle_chunk_size) { copy_amount = (copy_amount / task.bytes_per_slice) * task.bytes_per_slice; if (copy_amount == 0) copy_amount = task.bytes_per_slice; } - gpu_memory->ReadBlock(image.gpu_addr + task.current_offset, - task.staging_buffer.mapped_span.data() + task.current_offset, - copy_amount); + if (task.is_sparse) { + const size_t read_start = task.current_offset; + const size_t read_end = task.current_offset + copy_amount; + u8* const staging_base = task.staging_buffer.mapped_span.data(); + for (const auto& [seg_gpu_addr, seg_size] : task.sparse_segments) { + const size_t seg_start = static_cast(seg_gpu_addr - image.gpu_addr); + const size_t seg_end = seg_start + seg_size; + const size_t ol_start = (std::max)(seg_start, read_start); + const size_t ol_end = (std::min)(seg_end, read_end); + if (ol_start < ol_end) { + gpu_memory->ReadBlock(image.gpu_addr + ol_start, + staging_base + ol_start, + ol_end - ol_start); + } + } + } else { + gpu_memory->ReadBlock(image.gpu_addr + task.current_offset, + task.staging_buffer.mapped_span.data() + task.current_offset, + copy_amount); + } task.current_offset += copy_amount; } @@ -1513,9 +1565,17 @@ void TextureCache

::TickAsyncUnswizzle() { const size_t bytes_ready = task.current_offset - task.last_submitted_offset; const u32 complete_slices = static_cast(bytes_ready / task.bytes_per_slice); - if( swizzle_slices_per_batch <= 0 ) { - runtime.AccelerateImageUpload(image, task.staging_buffer, FixSmallVectorADL(FullUploadSwizzles(task.info)), 0, image.info.size.depth); - task.last_submitted_offset += (static_cast(image.info.size.depth) * task.bytes_per_slice); + const std::span sparse_hint = + task.is_sparse ? std::span(task.slice_has_data) + : std::span{}; + + if (swizzle_slices_per_batch <= 0 || swizzle_chunk_size == 0) { + const u32 z_start_full = static_cast(task.last_submitted_offset / task.bytes_per_slice); + const u32 remaining_slices_full = image.info.size.depth - z_start_full; + if (remaining_slices_full > 0) { + runtime.AccelerateImageUpload(image, task.staging_buffer, FixSmallVectorADL(FullUploadSwizzles(task.info)), z_start_full, remaining_slices_full, sparse_hint); + task.last_submitted_offset += (static_cast(remaining_slices_full) * task.bytes_per_slice); + } } else { const u32 adaptive_batch = GetAdaptiveBatchSize(task, unswizzle_queue.size()); @@ -1525,17 +1585,20 @@ void TextureCache

::TickAsyncUnswizzle() { const u32 slices_to_process = (std::min)(complete_slices, adaptive_batch); if (whole_texture) { - runtime.AccelerateImageUpload(image, task.staging_buffer, - FixSmallVectorADL(FullUploadSwizzles(task.info)), 0, - image.info.size.depth); - task.last_submitted_offset += - (static_cast(image.info.size.depth) * task.bytes_per_slice); + const u32 remaining_slices = task.info.size.depth - z_start; + if (remaining_slices > 0) { + runtime.AccelerateImageUpload(image, task.staging_buffer, + FixSmallVectorADL(FullUploadSwizzles(task.info)), z_start, + remaining_slices, sparse_hint); + task.last_submitted_offset += + (static_cast(remaining_slices) * task.bytes_per_slice); + } } else if (complete_slices >= slices_to_process || (is_final_batch && complete_slices > 0)) { const u32 z_count = (std::min)(slices_to_process, task.info.size.depth - z_start); if (z_count > 0) { const auto uploads = FullUploadSwizzles(task.info); runtime.AccelerateImageUpload(image, task.staging_buffer, FixSmallVectorADL(uploads), - z_start, z_count); + z_start, z_count, sparse_hint); task.last_submitted_offset += (static_cast(z_count) * task.bytes_per_slice); } } diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index b10f4e5f52..160d0864ce 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -139,6 +139,9 @@ class TextureCache : public VideoCommon::ChannelSetupCaches slice_has_data; + std::vector> sparse_segments; }; struct BlitImages {