
Small improvement to sparse texture loading

pull/3246/head
Forrest Keller, 1 month ago (committed by crueter)
commit ad99a0769c

Changed files:
  1. src/video_core/renderer_vulkan/vk_compute_pass.cpp (27 changed lines)
  2. src/video_core/texture_cache/texture_cache.h (105 changed lines)
  3. src/video_core/texture_cache/util.cpp (41 changed lines)
  4. src/video_core/texture_cache/util.h (7 changed lines)

src/video_core/renderer_vulkan/vk_compute_pass.cpp (27 changed lines)

@@ -801,8 +801,14 @@ void BlockLinearUnswizzle3DPass::Unswizzle(
const u32 gy = Common::DivCeil(blocks_y, 8u);
const u32 gz = Common::DivCeil(z_count, 1u);
const u32 bytes_per_block = 1u << pc.bytes_per_block_log2;
const VkDeviceSize output_slice_size =
static_cast<VkDeviceSize>(blocks_x) * blocks_y * bytes_per_block;
const VkDeviceSize barrier_size = output_slice_size * z_count;
scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([this, &image, set, descriptor_data, pc, gx, gy, gz, z_start, z_count](vk::CommandBuffer cmdbuf) {
scheduler.Record([this, &image, set, descriptor_data, pc, gx, gy, gz, z_start, z_count,
barrier_size](vk::CommandBuffer cmdbuf) {
const VkBuffer out_buffer = *image.compute_unswizzle_buffer;
const VkImage dst_image = image.Handle();
const VkImageAspectFlags aspect = image.AspectMask();
@@ -818,19 +824,26 @@ void BlockLinearUnswizzle3DPass::Unswizzle(
// Single barrier for compute -> transfer (buffer ready, image transition)
const VkBufferMemoryBarrier buffer_barrier{
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.buffer = out_buffer,
.offset = 0,
.size = VK_WHOLE_SIZE,
.size = barrier_size,
};
// Image layout transition
const VkImageMemoryBarrier pre_barrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = is_first ? VkAccessFlags{} : static_cast<VkAccessFlags>(VK_ACCESS_SHADER_READ_BIT),
.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.oldLayout = is_first ? VK_IMAGE_LAYOUT_UNDEFINED : VK_IMAGE_LAYOUT_GENERAL,
.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.image = dst_image,
.subresourceRange = {aspect, 0, 1, 0, 1},
};
@@ -845,19 +858,25 @@ void BlockLinearUnswizzle3DPass::Unswizzle(
const VkBufferImageCopy copy{
.bufferOffset = 0,
.bufferRowLength = 0,
.bufferImageHeight = 0,
.imageSubresource = {aspect, 0, 0, 1},
.imageOffset = {0, 0, (s32)z_start},
.imageOffset = {0, 0, static_cast<s32>(z_start)},
.imageExtent = {image.info.size.width, image.info.size.height, z_count},
};
cmdbuf.CopyBufferToImage(out_buffer, dst_image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy);
cmdbuf.CopyBufferToImage(out_buffer, dst_image,
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy);
// Post-copy transition
const VkImageMemoryBarrier post_barrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.newLayout = VK_IMAGE_LAYOUT_GENERAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.image = dst_image,
.subresourceRange = {aspect, 0, 1, 0, 1},
};
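Note on the vk_compute_pass.cpp hunks above: the buffer barrier between the compute unswizzle and the buffer-to-image copy is now sized to the bytes the dispatch actually writes for this batch of slices instead of VK_WHOLE_SIZE, and the size is computed once outside the recorded lambda. A minimal sketch of that sizing, assuming blocks_x, blocks_y, z_count and bytes_per_block_log2 mean what they do in the diff; the helper name is illustrative, u32 is the codebase alias for uint32_t, and the Vulkan types come from <vulkan/vulkan.h>:

VkBufferMemoryBarrier MakeUnswizzleOutputBarrier(VkBuffer out_buffer, u32 blocks_x,
                                                 u32 blocks_y, u32 z_count,
                                                 u32 bytes_per_block_log2) {
    const u32 bytes_per_block = 1u << bytes_per_block_log2;
    // One unswizzled slice in the output buffer.
    const VkDeviceSize output_slice_size =
        static_cast<VkDeviceSize>(blocks_x) * blocks_y * bytes_per_block;
    // The dispatch writes z_count consecutive slices starting at offset 0.
    const VkDeviceSize barrier_size = output_slice_size * z_count;
    return VkBufferMemoryBarrier{
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
        .pNext = nullptr,
        .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,   // the compute shader wrote the buffer
        .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,  // the upcoming copy reads it
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .buffer = out_buffer,
        .offset = 0,
        .size = barrier_size,  // previously VK_WHOLE_SIZE
    };
}

Restricting the barrier to the written range can let the driver avoid synchronizing the untouched tail of the staging buffer.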

src/video_core/texture_cache/texture_cache.h (105 changed lines)

@@ -1099,22 +1099,6 @@ void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
image.flags &= ~ImageFlagBits::CpuModified;
TrackImage(image, image_id);
/*// If it's sparse and remapped, we treat it as a partial update trigger
if (image.info.is_sparse && True(image.flags & ImageFlagBits::Remapped)) {
image.flags &= ~ImageFlagBits::Remapped;
if (!image.dirty_offsets.empty() && !image.sparse_bindings.empty()) {
constexpr u64 page_size = 64_KiB;
size_t dirty_size = image.dirty_offsets.size() * page_size;
auto staging = runtime.UploadStagingBuffer(dirty_size);
UploadSparseDirtyTiles(image, staging);
runtime.InsertUploadMemoryBarrier();
return;
}
}*/
if (image.info.num_samples > 1 && !runtime.CanUploadMSAA()) {
LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented");
runtime.TransitionImageLayout(image);
@@ -1139,87 +1123,6 @@ void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
runtime.InsertUploadMemoryBarrier();
}
template <class P>
template <typename StagingBuffer>
void TextureCache<P>::UploadSparseDirtyTiles(Image& image, StagingBuffer& staging) {
using namespace VideoCommon;
using namespace Tegra::Texture;
std::vector<BufferImageCopy> all_copies;
size_t total_upload_size = 0;
for (u64 dirty_tile_index : image.dirty_offsets) {
SparseBinding* binding = nullptr;
for (auto& [addr, bind] : image.sparse_bindings) {
if (bind.tile_index == dirty_tile_index) {
binding = &bind;
break;
}
}
if (!binding) {
continue;
}
const auto& coord = binding->tile_coord;
// Calculate tile dimensions
const u32 tile_width_blocks = 128;
const u32 tile_height_blocks = 32;
const u32 tile_width = std::min(tile_width_blocks * 4, image.info.size.width - coord.width);
const u32 tile_height = std::min(tile_height_blocks * 4, image.info.size.height - coord.height);
const u32 tile_depth = std::min(1u, image.info.size.depth - coord.depth);
const u32 bytes_per_block = BytesPerBlock(image.info.format);
const u32 blocks_wide = (tile_width + 3) / 4;
const u32 blocks_high = (tile_height + 3) / 4;
const size_t tile_unswizzled_size = blocks_wide * blocks_high * tile_depth * bytes_per_block;
if (total_upload_size + tile_unswizzled_size > staging.mapped_span.size()) {
LOG_ERROR(HW_GPU, "Staging buffer too small");
break;
}
std::array<u8, 65536> tile_swizzled_data;
gpu_memory->ReadBlockUnsafe(binding->gpu_addr, tile_swizzled_data.data(), image.sparse_tile_size);
// Get output span
auto tile_output = staging.mapped_span.subspan(total_upload_size, tile_unswizzled_size);
// Unswizzle the tile
auto result = UnswizzleSparseTextureTile(tile_output, tile_swizzled_data,
image.info, tile_width, tile_height, tile_depth);
// Create the copy descriptor
BufferImageCopy copy{
.buffer_offset = total_upload_size,
.buffer_size = tile_unswizzled_size,
.buffer_row_length = result.buffer_row_length,
.buffer_image_height = result.buffer_image_height,
.image_subresource = {
.base_level = 0,
.base_layer = 0,
.num_layers = 1,
},
.image_offset = {
static_cast<s32>(coord.width),
static_cast<s32>(coord.height),
static_cast<s32>(coord.depth)
},
.image_extent = {tile_width, tile_height, tile_depth}
};
all_copies.push_back(copy);
total_upload_size += tile_unswizzled_size;
}
if (!all_copies.empty()) {
image.UploadMemory(staging, all_copies);
}
image.dirty_offsets.clear();
}
template <class P>
template <typename StagingBuffer>
void TextureCache<P>::UploadImageContents(Image& image, StagingBuffer& staging) {
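A side note on the deleted UploadSparseDirtyTiles path above: it assumed a fixed 128x32-block tile footprint and staged each tile through a 64 KiB stack buffer (std::array<u8, 65536>). A quick compile-time check of that arithmetic; the block counts mirror the deleted code, the per-block byte sizes are illustrative assumptions:

#include <cstddef>
#include <cstdint>

// The deleted code used a fixed footprint of 128 x 32 compressed blocks per tile.
constexpr std::uint32_t tile_width_blocks = 128;
constexpr std::uint32_t tile_height_blocks = 32;

constexpr std::size_t TileBytes(std::uint32_t bytes_per_block) {
    return static_cast<std::size_t>(tile_width_blocks) * tile_height_blocks * bytes_per_block;
}

// 16 B/block formats (BC3/BC5/BC7 class) fill a 64 KiB sparse tile exactly...
static_assert(TileBytes(16) == 64 * 1024);
// ...while 8 B/block formats (BC1/BC4 class) cover only half of one.
static_assert(TileBytes(8) == 32 * 1024);

Since the trigger in RefreshContents was already commented out, deleting this helper removes dead code rather than changing the sparse upload behavior.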
@@ -1505,7 +1408,7 @@ void TextureCache<P>::TickAsyncUnswizzle() {
}
// Don't process every frame - allow more data to accumulate
if (++current_unswizzle_frame < 2) return;
//if (++current_unswizzle_frame < 2) return;
PendingUnswizzle& task = unswizzle_queue.front();
Image& image = slot_images[task.image_id];
@@ -1529,8 +1432,8 @@ void TextureCache<P>::TickAsyncUnswizzle() {
}
// ToDo: Make these configurable
static constexpr size_t CHUNK_SIZE = 64_MiB;
static constexpr u32 SLICES_PER_BATCH = 512u;
static constexpr size_t CHUNK_SIZE = 16_MiB;
static constexpr u32 SLICES_PER_BATCH = 64u;
// Read data
if (task.current_offset < task.total_size) {
@@ -1569,7 +1472,7 @@ void TextureCache<P>::TickAsyncUnswizzle() {
unswizzle_queue.pop_front();
}
current_unswizzle_frame = 0;
//current_unswizzle_frame = 0;
}
template <class P>
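On the TickAsyncUnswizzle hunks above: the every-other-frame throttle is commented out and the per-tick workload shrinks (CHUNK_SIZE 64 MiB -> 16 MiB, SLICES_PER_BATCH 512 -> 64). Back-of-the-envelope numbers implied by those constants, as a sketch only (assumes one tick per frame and ignores the GPU-side cost):

#include <cstddef>

constexpr std::size_t MiB = 1024 * 1024;
// Before: a 64 MiB chunk was processed every second frame.
constexpr std::size_t old_avg_bytes_per_frame = 64 * MiB / 2;
// After: a 16 MiB chunk is processed every frame.
constexpr std::size_t new_avg_bytes_per_frame = 16 * MiB;
static_assert(old_avg_bytes_per_frame == 32 * MiB);
static_assert(new_avg_bytes_per_frame == 16 * MiB);

Average guest-memory traffic per frame is halved and the worst-case single-frame burst drops from 64 MiB to 16 MiB, trading raw unswizzle throughput for steadier pacing; the "ToDo: Make these configurable" note still applies.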

src/video_core/texture_cache/util.cpp (41 changed lines)

@@ -55,7 +55,6 @@ using Tegra::Texture::TextureFormat;
using Tegra::Texture::TextureType;
using Tegra::Texture::TICEntry;
using Tegra::Texture::UnswizzleTexture;
using Tegra::Texture::UnswizzleSubrect;
using VideoCore::Surface::BytesPerBlock;
using VideoCore::Surface::DefaultBlockHeight;
using VideoCore::Surface::DefaultBlockWidth;
@@ -923,46 +922,6 @@ boost::container::small_vector<BufferImageCopy, 16> UnswizzleImage(Tegra::Memory
return copies;
}
SparseTileUnswizzleResult UnswizzleSparseTextureTile(std::span<u8> output,
std::span<const u8> input,
const ImageInfo& info,
u32 tile_width,
u32 tile_height,
u32 tile_depth) {
const Extent2D block_size = DefaultBlockSize(info.format);
const u32 bpp = BytesPerBlock(info.format);
const u32 width_blocks = (tile_width + block_size.width - 1) / block_size.width;
const u32 height_blocks = (tile_height + block_size.height - 1) / block_size.height;
// Calculate GOBs per row
const u32 bytes_per_row = width_blocks * bpp;
const u32 gobs_per_row = (bytes_per_row + 63) / 64;
// Calculate block_height for 64KB tiles
// 64KB / (gobs_per_row × 512 bytes) = GOBs tall
constexpr u32 TILE_SIZE = 65536;
const u32 gobs_tall = TILE_SIZE / (gobs_per_row * 512);
// block_height = log2(gobs_tall)
const u32 tile_block_height = std::countr_zero(gobs_tall);
const u32 pitch_linear = width_blocks * bpp;
UnswizzleSubrect(
output, input, bpp,
width_blocks, height_blocks, tile_depth,
0, 0,
width_blocks, height_blocks,
tile_block_height, 0,
pitch_linear
);
return {
.buffer_row_length = Common::AlignUp(tile_width, block_size.width),
.buffer_image_height = Common::AlignUp(tile_height, block_size.height)
};
}
void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8> output,
std::span<BufferImageCopy> copies) {
u32 output_offset = 0;
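For reference, the block-height derivation in the removed UnswizzleSparseTextureTile can be traced numerically. A minimal sketch, assuming a 4x4 block-compressed format with 16-byte blocks and the usual 64-byte-wide, 512-byte GOB; the helper name is mine, while the constants (64, 512, 64 KiB) mirror the deleted code:

#include <bit>
#include <cstdint>

constexpr std::uint32_t GOB_ROW_BYTES = 64;     // one GOB row is 64 bytes wide
constexpr std::uint32_t GOB_SIZE_BYTES = 512;   // a full GOB is 64 B x 8 rows
constexpr std::uint32_t TILE_SIZE = 64 * 1024;  // one sparse tile

constexpr std::uint32_t TileBlockHeightLog2(std::uint32_t width_blocks,
                                            std::uint32_t bytes_per_block) {
    const std::uint32_t bytes_per_row = width_blocks * bytes_per_block;
    const std::uint32_t gobs_per_row = (bytes_per_row + GOB_ROW_BYTES - 1) / GOB_ROW_BYTES;
    const std::uint32_t gobs_tall = TILE_SIZE / (gobs_per_row * GOB_SIZE_BYTES);
    return std::countr_zero(gobs_tall);  // block height is passed to UnswizzleSubrect as log2(GOBs)
}

// 128 blocks wide at 16 B/block: 2048 B per row -> 32 GOBs per row
// -> 65536 / (32 * 512) = 4 GOBs tall -> block height log2 = 2.
static_assert(TileBlockHeightLog2(128, 16) == 2);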

src/video_core/texture_cache/util.h (7 changed lines)

@@ -74,13 +74,6 @@ struct SparseTileUnswizzleResult {
Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const ImageInfo& info,
std::span<const u8> input, std::span<u8> output);
[[nodiscard]] SparseTileUnswizzleResult UnswizzleSparseTextureTile(std::span<u8> output,
std::span<const u8> input,
const ImageInfo& info,
u32 tile_width,
u32 tile_height,
u32 tile_depth);
void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8> output,
std::span<BufferImageCopy> copies);
