
Added more logs and barriers for garbage collection

Attempt to improve VRAM usage of the unswizzle by only allocating what is needed instead of the whole image
Branch: pull/3246/head
Author: Forrest Keller, 1 month ago; committed by crueter
Commit: 224ea76724
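
The change caps the compute-unswizzle staging buffer at the number of Z slices the current batch actually needs (std::min(z_count, depth)) and then dispatches and copies the image in 64-slice chunks. A minimal sketch of that sizing arithmetic, assuming 4x4 BCn blocks as the diff does; the struct and function below are standalone stand-ins for illustration, not the project's actual API:

// Sketch only: sizes a staging buffer for a batch of Z slices instead of the
// whole 3D image, mirroring AllocateComputeUnswizzleBuffer(max_slices) in the
// diff below. All names here are local stand-ins.
#include <algorithm>
#include <cstdint>

struct Extent3D {
    std::uint32_t width, height, depth;
};

// Bytes needed to hold `slices` unswizzled slices of a BCn image
// (4x4 texel blocks, `bytes_per_block` bytes each).
std::uint64_t SliceBufferSize(const Extent3D& size, std::uint32_t slices,
                              std::uint32_t bytes_per_block) {
    const std::uint32_t blocks_x = (size.width + 3) / 4;
    const std::uint32_t blocks_y = (size.height + 3) / 4;
    // Cap at the image depth, like std::min(max_slices, info.size.depth) in the diff.
    const std::uint32_t blocks_z = std::min(slices, size.depth);
    return static_cast<std::uint64_t>(blocks_x) * blocks_y * blocks_z * bytes_per_block;
}

int main() {
    // Example: a 2048x2048x256 BC7 image (16 bytes per 4x4 block).
    const Extent3D size{2048, 2048, 256};
    const std::uint64_t whole_image = SliceBufferSize(size, size.depth, 16); // 1 GiB
    const std::uint64_t one_batch = SliceBufferSize(size, 64, 16);           // 256 MiB
    return one_batch < whole_image ? 0 : 1;
}

For this example, a 64-slice batch needs a 256 MiB buffer instead of 1 GiB for the full depth; the actual saving depends on how many slices the caller passes per Unswizzle call, and the SLICES_PER_CHUNK loop in vk_compute_pass.cpp then dispatches and copies 64 slices at a time into the destination image's Z range.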
Changed files:

  1. src/video_core/renderer_opengl/gl_texture_cache.h (2 changes)
  2. src/video_core/renderer_vulkan/vk_compute_pass.cpp (62 changes)
  3. src/video_core/renderer_vulkan/vk_compute_pass.h (11 changes)
  4. src/video_core/renderer_vulkan/vk_texture_cache.cpp (4 changes)
  5. src/video_core/renderer_vulkan/vk_texture_cache.h (4 changes)
  6. src/video_core/texture_cache/texture_cache.h (52 changes)

src/video_core/renderer_opengl/gl_texture_cache.h (2 changes)

@@ -229,6 +229,8 @@ public:
bool ScaleDown(bool ignore = false);
u64 allocation_tick;
private:
void CopyBufferToImage(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset);

src/video_core/renderer_vulkan/vk_compute_pass.cpp (62 changes)

@@ -753,22 +753,50 @@ void BlockLinearUnswizzle3DPass::Unswizzle(
Image& image,
const StagingBufferRef& swizzled,
std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count)
{
using namespace VideoCommon::Accelerated;
// Leaving this here in case instances are found where slices_needed causes device loss
// Tune this for a balance between speed and size; I don't own a Deck so can't tune it myself
// constexpr u32 MAX_BATCH_SLICES = 64;
if (!image.has_compute_unswizzle_buffer) {
-image.AllocateComputeUnswizzleBuffer();
+// Allocate exactly what this batch needs
+const u32 slices_needed = std::min(z_count, image.info.size.depth);
+image.AllocateComputeUnswizzleBuffer(slices_needed);
}
ASSERT(swizzles.size() == 1);
const auto& sw = swizzles[0];
const auto params = MakeBlockLinearSwizzle3DParams(sw, image.info);
const u32 blocks_x = (image.info.size.width + 3) / 4;
const u32 blocks_y = (image.info.size.height + 3) / 4;
constexpr u32 SLICES_PER_CHUNK = 64;
for (u32 z_offset = 0; z_offset < z_count; z_offset += SLICES_PER_CHUNK) {
const u32 current_chunk_slices = std::min(SLICES_PER_CHUNK, z_count - z_offset);
const u32 current_z_start = z_start + z_offset;
UnswizzleChunk(image, swizzled, sw, params, blocks_x, blocks_y,
current_z_start, current_chunk_slices);
}
}
void BlockLinearUnswizzle3DPass::UnswizzleChunk(
Image& image,
const StagingBufferRef& swizzled,
const VideoCommon::SwizzleParameters& sw,
const BlockLinearSwizzle3DParams& params,
u32 blocks_x, u32 blocks_y,
u32 z_start, u32 z_count)
{
BlockLinearUnswizzle3DPushConstants pc{};
pc.origin[0] = params.origin[0];
pc.origin[1] = params.origin[1];
-pc.origin[2] = z_start; // Start at the current Z-slice
+pc.origin[2] = z_start; // Current chunk's Z start
pc.destination[0] = params.destination[0];
pc.destination[1] = params.destination[1];
@@ -783,16 +811,18 @@ void BlockLinearUnswizzle3DPass::Unswizzle(
pc.block_depth = params.block_depth;
pc.block_depth_mask = params.block_depth_mask;
-const u32 blocks_x = (image.info.size.width + 3) / 4;
-const u32 blocks_y = (image.info.size.height + 3) / 4;
pc.blocks_dim[0] = blocks_x;
pc.blocks_dim[1] = blocks_y;
pc.blocks_dim[2] = z_count; // Only process the count
compute_pass_descriptor_queue.Acquire();
-compute_pass_descriptor_queue.AddBuffer(*image.runtime->swizzle_table_buffer, 0, image.runtime->swizzle_table_size);
-compute_pass_descriptor_queue.AddBuffer(swizzled.buffer, sw.buffer_offset + swizzled.offset, image.guest_size_bytes - sw.buffer_offset);
-compute_pass_descriptor_queue.AddBuffer(*image.compute_unswizzle_buffer, 0, image.compute_unswizzle_buffer_size);
+compute_pass_descriptor_queue.AddBuffer(*image.runtime->swizzle_table_buffer, 0,
+image.runtime->swizzle_table_size);
+compute_pass_descriptor_queue.AddBuffer(swizzled.buffer,
+sw.buffer_offset + swizzled.offset,
+image.guest_size_bytes - sw.buffer_offset);
+compute_pass_descriptor_queue.AddBuffer(*image.compute_unswizzle_buffer, 0,
+image.compute_unswizzle_buffer_size);
const void* descriptor_data = compute_pass_descriptor_queue.UpdateData();
const VkDescriptorSet set = descriptor_allocator.Commit();
@@ -806,9 +836,11 @@ void BlockLinearUnswizzle3DPass::Unswizzle(
static_cast<VkDeviceSize>(blocks_x) * blocks_y * bytes_per_block;
const VkDeviceSize barrier_size = output_slice_size * z_count;
const bool is_first_chunk = (z_start == 0);
scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([this, &image, set, descriptor_data, pc, gx, gy, gz, z_start, z_count,
-barrier_size](vk::CommandBuffer cmdbuf) {
+barrier_size, is_first_chunk](vk::CommandBuffer cmdbuf) {
const VkBuffer out_buffer = *image.compute_unswizzle_buffer;
const VkImage dst_image = image.Handle();
const VkImageAspectFlags aspect = image.AspectMask();
@@ -819,8 +851,6 @@ void BlockLinearUnswizzle3DPass::Unswizzle(
cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
cmdbuf.Dispatch(gx, gy, gz);
-const bool is_first = (z_start == 0);
// Single barrier for compute -> transfer (buffer ready, image transition)
const VkBufferMemoryBarrier buffer_barrier{
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
@@ -838,9 +868,10 @@ void BlockLinearUnswizzle3DPass::Unswizzle(
const VkImageMemoryBarrier pre_barrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
-.srcAccessMask = is_first ? VkAccessFlags{} : static_cast<VkAccessFlags>(VK_ACCESS_SHADER_READ_BIT),
+.srcAccessMask = is_first_chunk ? VkAccessFlags{} :
+static_cast<VkAccessFlags>(VK_ACCESS_SHADER_READ_BIT),
.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
-.oldLayout = is_first ? VK_IMAGE_LAYOUT_UNDEFINED : VK_IMAGE_LAYOUT_GENERAL,
+.oldLayout = is_first_chunk ? VK_IMAGE_LAYOUT_UNDEFINED : VK_IMAGE_LAYOUT_GENERAL,
.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
@@ -856,12 +887,13 @@ void BlockLinearUnswizzle3DPass::Unswizzle(
nullptr, buffer_barrier, pre_barrier
);
// Copy chunk to correct Z position in image
const VkBufferImageCopy copy{
-.bufferOffset = 0,
+.bufferOffset = 0, // Read from start of staging buffer
.bufferRowLength = 0,
.bufferImageHeight = 0,
.imageSubresource = {aspect, 0, 0, 1},
-.imageOffset = {0, 0, static_cast<s32>(z_start)},
+.imageOffset = {0, 0, static_cast<s32>(z_start)}, // Write to correct Z
.imageExtent = {image.info.size.width, image.info.size.height, z_count},
};
cmdbuf.CopyBufferToImage(out_buffer, dst_image,

src/video_core/renderer_vulkan/vk_compute_pass.h (11 changes)

@@ -17,6 +17,7 @@
#include "video_core/texture_cache/types.h"
#include "video_core/vulkan_common/vulkan_memory_allocator.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
#include "video_core/texture_cache/accelerated_swizzle.h"
namespace VideoCommon {
struct SwizzleParameters;
@@ -24,6 +25,8 @@ struct SwizzleParameters;
namespace Vulkan {
using VideoCommon::Accelerated::BlockLinearSwizzle3DParams;
class Device;
class StagingBufferPool;
class Scheduler;
@@ -146,6 +149,14 @@ public:
const StagingBufferRef& swizzled,
std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count);
void UnswizzleChunk(
Image& image,
const StagingBufferRef& swizzled,
const VideoCommon::SwizzleParameters& sw,
const BlockLinearSwizzle3DParams& params,
u32 blocks_x, u32 blocks_y,
u32 z_start, u32 z_count);
private:
Scheduler& scheduler;

src/video_core/renderer_vulkan/vk_texture_cache.cpp (4 changes)

@@ -1620,7 +1620,7 @@ Image::Image(const VideoCommon::NullImageParams& params) : VideoCommon::ImageBas
Image::~Image() = default;
-void Image::AllocateComputeUnswizzleBuffer() {
+void Image::AllocateComputeUnswizzleBuffer(u32 max_slices) {
if (has_compute_unswizzle_buffer)
return;
@@ -1633,7 +1633,7 @@ void Image::AllocateComputeUnswizzleBuffer() {
// BCn is 4x4x1 blocks
const u32 blocks_x = (info.size.width + block_width - 1) / block_width;
const u32 blocks_y = (info.size.height + block_height - 1) / block_height;
-const u32 blocks_z = info.size.depth;
+const u32 blocks_z = std::min(max_slices, info.size.depth);
const u64 block_count =
static_cast<u64>(blocks_x) *

src/video_core/renderer_vulkan/vk_texture_cache.h (4 changes)

@@ -196,6 +196,8 @@ public:
bool ScaleUp(bool ignore = false);
bool ScaleDown(bool ignore = false);
u64 allocation_tick;
friend class BlockLinearUnswizzle3DPass;
@@ -214,7 +216,7 @@ private:
VkDeviceSize compute_unswizzle_buffer_size = 0;
bool has_compute_unswizzle_buffer = false;
-void AllocateComputeUnswizzleBuffer();
+void AllocateComputeUnswizzleBuffer(u32 max_slices);
// Use a pointer to field because it is relative, so that the object can be
// moved without breaking the reference.

src/video_core/texture_cache/texture_cache.h (52 changes)

@@ -98,6 +98,13 @@ void TextureCache<P>::RunGarbageCollector() {
}
--num_iterations;
auto& image = slot_images[image_id];
// Never delete recently allocated sparse textures (within 3 frames)
const bool is_recently_allocated = image.allocation_tick >= frame_tick - 3;
if (is_recently_allocated && image.info.is_sparse) {
return false;
}
if (True(image.flags & ImageFlagBits::IsDecoding)) {
// This image is still being decoded, deleting it will invalidate the slot
// used by the async decoder thread.
@@ -153,7 +160,13 @@ void TextureCache<P>::RunGarbageCollector() {
if (total_used_memory >= expected_memory) {
lru_cache.ForEachItemBelow(frame_tick, [&](ImageId image_id) {
auto& image = slot_images[image_id];
-if (image.info.is_sparse && image.guest_size_bytes >= 256_MiB) {
+// Only target sparse textures that are old enough
+if (image.info.is_sparse &&
+image.guest_size_bytes >= 256_MiB &&
+image.allocation_tick < frame_tick - 3) {
LOG_DEBUG(HW_GPU, "GC targeting old sparse texture at 0x{:X} ({} MiB, age: {} frames)",
image.gpu_addr, image.guest_size_bytes / (1024 * 1024),
frame_tick - image.allocation_tick);
return Cleanup(image_id);
}
return false;
@@ -1428,6 +1441,11 @@ void TextureCache<P>::TickAsyncUnswizzle() {
return;
}
if(current_unswizzle_frame > 0) {
current_unswizzle_frame--;
return;
}
PendingUnswizzle& task = unswizzle_queue.front();
Image& image = slot_images[task.image_id];
@@ -1492,6 +1510,9 @@ void TextureCache<P>::TickAsyncUnswizzle() {
if (total_used_memory >= expected_memory) {
RunGarbageCollector();
}
// Wait 4 frames to process the next entry
current_unswizzle_frame = 4u;
}
}
@@ -1534,24 +1555,29 @@ ImageId TextureCache<P>::InsertImage(const ImageInfo& info, GPUVAddr gpu_addr,
}
ASSERT_MSG(cpu_addr, "Tried to insert an image to an invalid gpu_addr=0x{:x}", gpu_addr);
-// For large sparse textures, aggressively clean up old allocation at same address
+// For large sparse textures, aggressively clean up old allocations at same address
if (info.is_sparse && CalculateGuestSizeInBytes(info) >= 256_MiB) {
const auto alloc_it = image_allocs_table.find(gpu_addr);
if (alloc_it != image_allocs_table.end()) {
const ImageAllocId alloc_id = alloc_it->second;
auto& alloc_images = slot_image_allocs[alloc_id].images;
-// Immediately delete old images at this address before allocating new one
+// Collect old images at this address that were created more than 2 frames ago
boost::container::small_vector<ImageId, 4> to_delete;
for (ImageId old_image_id : alloc_images) {
Image& old_image = slot_images[old_image_id];
-if (old_image.info.is_sparse && old_image.gpu_addr == gpu_addr) {
+if (old_image.info.is_sparse &&
+old_image.gpu_addr == gpu_addr &&
+old_image.allocation_tick < frame_tick - 2) { // Try not to delete fresh textures
to_delete.push_back(old_image_id);
}
}
// Delete old images immediately
for (ImageId old_id : to_delete) {
Image& old_image = slot_images[old_id];
LOG_INFO(HW_GPU, "Immediately deleting old sparse texture at 0x{:X} ({} MiB)",
gpu_addr, old_image.guest_size_bytes / (1024 * 1024));
if (True(old_image.flags & ImageFlagBits::Tracked)) {
UntrackImage(old_image, old_id);
}
@@ -1577,13 +1603,23 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, DA
ImageInfo new_info = info;
const size_t size_bytes = CalculateGuestSizeInBytes(new_info);
// Proactive cleanup for large sparse texture allocations
if (new_info.is_sparse && size_bytes >= 256_MiB) {
const u64 estimated_alloc_size = size_bytes;
if (total_used_memory + estimated_alloc_size >= critical_memory) {
LOG_WARNING(HW_GPU, "Large sparse texture allocation ({} MiB) - running aggressive GC",
size_bytes / (1024 * 1024));
LOG_WARNING(HW_GPU, "Large sparse texture allocation ({} MiB) - running aggressive GC. "
"Current memory: {} MiB, Critical: {} MiB",
size_bytes / (1024 * 1024),
total_used_memory / (1024 * 1024),
critical_memory / (1024 * 1024));
RunGarbageCollector();
// If still over threshold after GC, try one more aggressive pass
if (total_used_memory + estimated_alloc_size >= critical_memory) {
LOG_WARNING(HW_GPU, "Still critically low on memory, running second GC pass");
RunGarbageCollector();
}
}
}
@@ -1682,6 +1718,8 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, DA
const ImageId new_image_id = slot_images.insert(runtime, new_info, gpu_addr, cpu_addr);
Image& new_image = slot_images[new_image_id];
new_image.allocation_tick = frame_tick;
if (!gpu_memory->IsContinuousRange(new_image.gpu_addr, new_image.guest_size_bytes) &&
new_info.is_sparse) {
