From 224ea7672487a4a36c1d8245f8815ea365063ff3 Mon Sep 17 00:00:00 2001 From: Forrest Keller Date: Fri, 2 Jan 2026 17:34:35 -0600 Subject: [PATCH] Added more logs and barriers for garbage collection Attempts to improve VRAM usage of the unswizzle pass by allocating only what is needed instead of the whole image --- .../renderer_opengl/gl_texture_cache.h | 2 + .../renderer_vulkan/vk_compute_pass.cpp | 62 ++++++++++++++----- .../renderer_vulkan/vk_compute_pass.h | 11 ++++ .../renderer_vulkan/vk_texture_cache.cpp | 4 +- .../renderer_vulkan/vk_texture_cache.h | 4 +- src/video_core/texture_cache/texture_cache.h | 52 +++++++++++++--- 6 files changed, 110 insertions(+), 25 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 8e1d04e3fb..9693a97954 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -229,6 +229,8 @@ public: bool ScaleDown(bool ignore = false); + u64 allocation_tick = 0; + private: void CopyBufferToImage(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset); diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index b7684b6a87..d97a35352a 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -753,22 +753,50 @@ void BlockLinearUnswizzle3DPass::Unswizzle( Image& image, const StagingBufferRef& swizzled, std::span<const VideoCommon::SwizzleParameters> swizzles, - u32 z_start, u32 z_count) + u32 z_start, u32 z_count) { using namespace VideoCommon::Accelerated; + // Leaving this here in case instances are found where slices_needed causes device loss + // Tune this for a balance between speed and size; I don't own a Deck, so I can't tune it myself + // constexpr u32 MAX_BATCH_SLICES = 64; + if (!image.has_compute_unswizzle_buffer) { - image.AllocateComputeUnswizzleBuffer(); + // Allocate exactly what this batch needs + const u32 slices_needed = std::min(z_count, image.info.size.depth); + image.AllocateComputeUnswizzleBuffer(slices_needed); } ASSERT(swizzles.size() == 1); const auto& sw = swizzles[0]; const auto params = MakeBlockLinearSwizzle3DParams(sw, image.info); + const u32 blocks_x = (image.info.size.width + 3) / 4; + const u32 blocks_y = (image.info.size.height + 3) / 4; + + constexpr u32 SLICES_PER_CHUNK = 64; + + for (u32 z_offset = 0; z_offset < z_count; z_offset += SLICES_PER_CHUNK) { + const u32 current_chunk_slices = std::min(SLICES_PER_CHUNK, z_count - z_offset); + const u32 current_z_start = z_start + z_offset; + + UnswizzleChunk(image, swizzled, sw, params, blocks_x, blocks_y, + current_z_start, current_chunk_slices); + } +} + +void BlockLinearUnswizzle3DPass::UnswizzleChunk( + Image& image, + const StagingBufferRef& swizzled, + const VideoCommon::SwizzleParameters& sw, + const BlockLinearSwizzle3DParams& params, + u32 blocks_x, u32 blocks_y, + u32 z_start, u32 z_count) +{ BlockLinearUnswizzle3DPushConstants pc{}; pc.origin[0] = params.origin[0]; pc.origin[1] = params.origin[1]; - pc.origin[2] = z_start; // Start at the current Z-slice + pc.origin[2] = z_start; // Current chunk's Z start pc.destination[0] = params.destination[0]; pc.destination[1] = params.destination[1]; @@ -783,16 +811,18 @@ void BlockLinearUnswizzle3DPass::Unswizzle( pc.block_depth = params.block_depth; pc.block_depth_mask = params.block_depth_mask; - const u32 blocks_x = (image.info.size.width + 3) / 4; - const u32 blocks_y = (image.info.size.height + 3) / 4;
pc.blocks_dim[0] = blocks_x; pc.blocks_dim[1] = blocks_y; pc.blocks_dim[2] = z_count; // Only process this chunk's slice count compute_pass_descriptor_queue.Acquire(); - compute_pass_descriptor_queue.AddBuffer(*image.runtime->swizzle_table_buffer, 0, image.runtime->swizzle_table_size); - compute_pass_descriptor_queue.AddBuffer(swizzled.buffer, sw.buffer_offset + swizzled.offset, image.guest_size_bytes - sw.buffer_offset); - compute_pass_descriptor_queue.AddBuffer(*image.compute_unswizzle_buffer, 0, image.compute_unswizzle_buffer_size); + compute_pass_descriptor_queue.AddBuffer(*image.runtime->swizzle_table_buffer, 0, + image.runtime->swizzle_table_size); + compute_pass_descriptor_queue.AddBuffer(swizzled.buffer, + sw.buffer_offset + swizzled.offset, + image.guest_size_bytes - sw.buffer_offset); + compute_pass_descriptor_queue.AddBuffer(*image.compute_unswizzle_buffer, 0, + image.compute_unswizzle_buffer_size); const void* descriptor_data = compute_pass_descriptor_queue.UpdateData(); const VkDescriptorSet set = descriptor_allocator.Commit(); @@ -806,9 +836,11 @@ void BlockLinearUnswizzle3DPass::Unswizzle( static_cast<VkDeviceSize>(blocks_x) * blocks_y * bytes_per_block; const VkDeviceSize barrier_size = output_slice_size * z_count; + const bool is_first_chunk = (z_start == 0); + scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([this, &image, set, descriptor_data, pc, gx, gy, gz, z_start, z_count, - barrier_size](vk::CommandBuffer cmdbuf) { + barrier_size, is_first_chunk](vk::CommandBuffer cmdbuf) { const VkBuffer out_buffer = *image.compute_unswizzle_buffer; const VkImage dst_image = image.Handle(); const VkImageAspectFlags aspect = image.AspectMask(); @@ -819,8 +851,6 @@ void BlockLinearUnswizzle3DPass::Unswizzle( cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc); cmdbuf.Dispatch(gx, gy, gz); - const bool is_first = (z_start == 0); - // Single barrier for compute -> transfer (buffer ready, image transition) const VkBufferMemoryBarrier buffer_barrier{ .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, .pNext = nullptr, @@ -838,9 +868,10 @@ void BlockLinearUnswizzle3DPass::Unswizzle( const VkImageMemoryBarrier pre_barrier{ .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, .pNext = nullptr, - .srcAccessMask = is_first ? VkAccessFlags{} : static_cast<VkAccessFlags>(VK_ACCESS_SHADER_READ_BIT), + .srcAccessMask = is_first_chunk ? VkAccessFlags{} : + static_cast<VkAccessFlags>(VK_ACCESS_SHADER_READ_BIT), .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, - .oldLayout = is_first ? VK_IMAGE_LAYOUT_UNDEFINED : VK_IMAGE_LAYOUT_GENERAL, + .oldLayout = is_first_chunk ?
VK_IMAGE_LAYOUT_UNDEFINED : VK_IMAGE_LAYOUT_GENERAL, .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, @@ -856,12 +887,13 @@ void BlockLinearUnswizzle3DPass::Unswizzle( nullptr, buffer_barrier, pre_barrier ); + // Copy chunk to correct Z position in image const VkBufferImageCopy copy{ - .bufferOffset = 0, + .bufferOffset = 0, // Read from start of staging buffer .bufferRowLength = 0, .bufferImageHeight = 0, .imageSubresource = {aspect, 0, 0, 1}, - .imageOffset = {0, 0, static_cast<s32>(z_start)}, + .imageOffset = {0, 0, static_cast<s32>(z_start)}, // Write to correct Z .imageExtent = {image.info.size.width, image.info.size.height, z_count}, }; cmdbuf.CopyBufferToImage(out_buffer, dst_image, diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h index 812504e540..3edcf020a2 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.h +++ b/src/video_core/renderer_vulkan/vk_compute_pass.h @@ -17,6 +17,7 @@ #include "video_core/texture_cache/types.h" #include "video_core/vulkan_common/vulkan_memory_allocator.h" #include "video_core/vulkan_common/vulkan_wrapper.h" +#include "video_core/texture_cache/accelerated_swizzle.h" namespace VideoCommon { struct SwizzleParameters; @@ -24,6 +25,8 @@ struct SwizzleParameters; namespace Vulkan { +using VideoCommon::Accelerated::BlockLinearSwizzle3DParams; + class Device; class StagingBufferPool; class Scheduler; @@ -146,6 +149,14 @@ public: const StagingBufferRef& swizzled, std::span<const VideoCommon::SwizzleParameters> swizzles, u32 z_start, u32 z_count); + + void UnswizzleChunk( + Image& image, + const StagingBufferRef& swizzled, + const VideoCommon::SwizzleParameters& sw, + const BlockLinearSwizzle3DParams& params, + u32 blocks_x, u32 blocks_y, + u32 z_start, u32 z_count); private: Scheduler& scheduler; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index b52f12a648..6437cc4c5b 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -1620,7 +1620,7 @@ Image::Image(const VideoCommon::NullImageParams& params) : VideoCommon::ImageBas Image::~Image() = default; -void Image::AllocateComputeUnswizzleBuffer() { +void Image::AllocateComputeUnswizzleBuffer(u32 max_slices) { if (has_compute_unswizzle_buffer) return; @@ -1633,7 +1633,7 @@ void Image::AllocateComputeUnswizzleBuffer() { // BCn is 4x4x1 blocks const u32 blocks_x = (info.size.width + block_width - 1) / block_width; const u32 blocks_y = (info.size.height + block_height - 1) / block_height; - const u32 blocks_z = info.size.depth; + const u32 blocks_z = std::min(max_slices, info.size.depth); const u64 block_count = static_cast<u64>(blocks_x) * diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index f692a78c3d..b545ba8669 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -196,6 +196,8 @@ public: bool ScaleUp(bool ignore = false); bool ScaleDown(bool ignore = false); + + u64 allocation_tick = 0; friend class BlockLinearUnswizzle3DPass; @@ -214,7 +216,7 @@ private: VkDeviceSize compute_unswizzle_buffer_size = 0; bool has_compute_unswizzle_buffer = false; - void AllocateComputeUnswizzleBuffer(); + void AllocateComputeUnswizzleBuffer(u32 max_slices); // Use a pointer to field because it is relative, so that the object can be moved
without breaking the reference. diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 8a57a7b247..82443d6599 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -98,6 +98,13 @@ void TextureCache
<P>
::RunGarbageCollector() { } --num_iterations; auto& image = slot_images[image_id]; + + // Never delete recently allocated sparse textures (within 3 frames) + const bool is_recently_allocated = image.allocation_tick >= frame_tick - 3; + if (is_recently_allocated && image.info.is_sparse) { + return false; + } + if (True(image.flags & ImageFlagBits::IsDecoding)) { // This image is still being decoded, deleting it will invalidate the slot // used by the async decoder thread. @@ -153,7 +160,13 @@ void TextureCache
<P>
::RunGarbageCollector() { if (total_used_memory >= expected_memory) { lru_cache.ForEachItemBelow(frame_tick, [&](ImageId image_id) { auto& image = slot_images[image_id]; - if (image.info.is_sparse && image.guest_size_bytes >= 256_MiB) { + // Only target sparse textures that are old enough + if (image.info.is_sparse && + image.guest_size_bytes >= 256_MiB && + image.allocation_tick < frame_tick - 3) { + LOG_DEBUG(HW_GPU, "GC targeting old sparse texture at 0x{:X} ({} MiB, age: {} frames)", + image.gpu_addr, image.guest_size_bytes / (1024 * 1024), + frame_tick - image.allocation_tick); return Cleanup(image_id); } return false; @@ -1428,6 +1441,11 @@ void TextureCache
<P>
::TickAsyncUnswizzle() { return; } + if (current_unswizzle_frame > 0) { + current_unswizzle_frame--; + return; + } + PendingUnswizzle& task = unswizzle_queue.front(); Image& image = slot_images[task.image_id]; @@ -1492,6 +1510,9 @@ void TextureCache
<P>
::TickAsyncUnswizzle() { if (total_used_memory >= expected_memory) { RunGarbageCollector(); } + + // Wait 4 frames before processing the next queued entry + current_unswizzle_frame = 4u; } } @@ -1534,24 +1555,29 @@ ImageId TextureCache
<P>
::InsertImage(const ImageInfo& info, GPUVAddr gpu_addr, } ASSERT_MSG(cpu_addr, "Tried to insert an image to an invalid gpu_addr=0x{:x}", gpu_addr); - // For large sparse textures, aggressively clean up old allocation at same address + // For large sparse textures, aggressively clean up old allocations at same address if (info.is_sparse && CalculateGuestSizeInBytes(info) >= 256_MiB) { const auto alloc_it = image_allocs_table.find(gpu_addr); if (alloc_it != image_allocs_table.end()) { const ImageAllocId alloc_id = alloc_it->second; auto& alloc_images = slot_image_allocs[alloc_id].images; - // Immediately delete old images at this address before allocating new one + // Collect old images at this address that were created more than 2 frames ago boost::container::small_vector to_delete; for (ImageId old_image_id : alloc_images) { Image& old_image = slot_images[old_image_id]; - if (old_image.info.is_sparse && old_image.gpu_addr == gpu_addr) { + if (old_image.info.is_sparse && + old_image.gpu_addr == gpu_addr && + old_image.allocation_tick < frame_tick - 2) { // Try not to delete fresh textures to_delete.push_back(old_image_id); } } + // Delete old images immediately for (ImageId old_id : to_delete) { Image& old_image = slot_images[old_id]; + LOG_INFO(HW_GPU, "Immediately deleting old sparse texture at 0x{:X} ({} MiB)", + gpu_addr, old_image.guest_size_bytes / (1024 * 1024)); if (True(old_image.flags & ImageFlagBits::Tracked)) { UntrackImage(old_image, old_id); } @@ -1577,13 +1603,23 @@ ImageId TextureCache
<P>
::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, DA ImageInfo new_info = info; const size_t size_bytes = CalculateGuestSizeInBytes(new_info); + // Proactive cleanup for large sparse texture allocations if (new_info.is_sparse && size_bytes >= 256_MiB) { const u64 estimated_alloc_size = size_bytes; - + if (total_used_memory + estimated_alloc_size >= critical_memory) { - LOG_WARNING(HW_GPU, "Large sparse texture allocation ({} MiB) - running aggressive GC", - size_bytes / (1024 * 1024)); + LOG_WARNING(HW_GPU, "Large sparse texture allocation ({} MiB) - running aggressive GC. " + "Current memory: {} MiB, Critical: {} MiB", + size_bytes / (1024 * 1024), + total_used_memory / (1024 * 1024), + critical_memory / (1024 * 1024)); RunGarbageCollector(); + + // If still over threshold after GC, try one more aggressive pass + if (total_used_memory + estimated_alloc_size >= critical_memory) { + LOG_WARNING(HW_GPU, "Still critically low on memory, running second GC pass"); + RunGarbageCollector(); + } } } @@ -1682,6 +1718,8 @@ ImageId TextureCache
<P>
::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, DA const ImageId new_image_id = slot_images.insert(runtime, new_info, gpu_addr, cpu_addr); Image& new_image = slot_images[new_image_id]; + + new_image.allocation_tick = frame_tick; if (!gpu_memory->IsContinuousRange(new_image.gpu_addr, new_image.guest_size_bytes) && new_info.is_sparse) {
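The chunked path keeps the unswizzle staging buffer proportional to the slice count it is sized for rather than to the full image depth. Below is a minimal standalone sketch of that sizing arithmetic, mirroring the (dim + 3) / 4 BCn block rounding used by the pass; it is not part of the patch, and the 16-byte block size (BC7), the example dimensions, and the StagingBytes helper are illustrative assumptions.

// Standalone sketch (not from the patch): estimates the staging-buffer size for a
// given slice count versus the previous whole-image allocation. BCn formats use
// 4x4 texel blocks, matching the (dim + 3) / 4 rounding in BlockLinearUnswizzle3DPass.
#include <algorithm>
#include <cstdint>
#include <cstdio>

constexpr std::uint32_t SLICES_PER_CHUNK = 64; // same constant the pass dispatches with

std::uint64_t StagingBytes(std::uint32_t width, std::uint32_t height, std::uint32_t depth,
                           std::uint32_t slices, std::uint32_t bytes_per_block) {
    const std::uint64_t blocks_x = (width + 3) / 4;
    const std::uint64_t blocks_y = (height + 3) / 4;
    const std::uint64_t blocks_z = std::min(slices, depth);
    return blocks_x * blocks_y * blocks_z * bytes_per_block;
}

int main() {
    // Hypothetical 2048x2048x1024 BC7 sparse texture (16 bytes per 4x4 block).
    const std::uint64_t whole = StagingBytes(2048, 2048, 1024, 1024, 16);
    const std::uint64_t chunk = StagingBytes(2048, 2048, 1024, SLICES_PER_CHUNK, 16);
    std::printf("whole image: %llu MiB, per chunk: %llu MiB\n",
                static_cast<unsigned long long>(whole >> 20),
                static_cast<unsigned long long>(chunk >> 20));
}

For that example the whole-image buffer works out to 4096 MiB while a 64-slice allocation is 256 MiB, which is the saving the commit message describes.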
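Both RunGarbageCollector and InsertImage gate deletion on allocation_tick being at least a few frames older than frame_tick. A minimal sketch of that age check follows, assuming a guard for the first frames where the u64 subtraction frame_tick - 3 would otherwise wrap around; the IsOlderThan helper name is hypothetical and not taken from the patch.

// Standalone sketch (not from the patch): the age gate used when deciding whether a
// sparse texture is old enough to collect or to evict at InsertImage time.
#include <cstdint>

using u64 = std::uint64_t;

constexpr bool IsOlderThan(u64 allocation_tick, u64 frame_tick, u64 age) {
    // Treat everything as recent until enough frames have elapsed to compare safely.
    return frame_tick >= age && allocation_tick < frame_tick - age;
}

static_assert(!IsOlderThan(0, 2, 3));   // early frames: never considered old
static_assert(IsOlderThan(10, 20, 3));  // allocated 10 frames ago: old enough to collect
static_assert(!IsOlderThan(19, 20, 3)); // allocated last frame: keep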