From 224ea7672487a4a36c1d8245f8815ea365063ff3 Mon Sep 17 00:00:00 2001 From: Forrest Keller Date: Fri, 2 Jan 2026 17:34:35 -0600 Subject: [PATCH] Added more logs and barriers for garbage collection Attempts to improve VRAM usage of the unswizzle pass by allocating only what is needed instead of the whole image --- .../renderer_opengl/gl_texture_cache.h | 2 + .../renderer_vulkan/vk_compute_pass.cpp | 62 ++++++++++++++----- .../renderer_vulkan/vk_compute_pass.h | 11 ++++ .../renderer_vulkan/vk_texture_cache.cpp | 4 +- .../renderer_vulkan/vk_texture_cache.h | 4 +- src/video_core/texture_cache/texture_cache.h | 52 +++++++++++++--- 6 files changed, 110 insertions(+), 25 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 8e1d04e3fb..9693a97954 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -229,6 +229,8 @@ public: bool ScaleDown(bool ignore = false); + u64 allocation_tick = 0; + private: void CopyBufferToImage(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset); diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index b7684b6a87..d97a35352a 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -753,22 +753,50 @@ void BlockLinearUnswizzle3DPass::Unswizzle( Image& image, const StagingBufferRef& swizzled, std::span<const VideoCommon::SwizzleParameters> swizzles, - u32 z_start, u32 z_count) + u32 z_start, u32 z_count) { using namespace VideoCommon::Accelerated; + // Leaving this here in case instances are found where slices_needed causes device loss + // Tune this for a balance between speed and size; I don't own a Deck, so I can't tune it myself + // constexpr u32 MAX_BATCH_SLICES = 64; + if (!image.has_compute_unswizzle_buffer) { - image.AllocateComputeUnswizzleBuffer(); + // Allocate exactly what this batch needs + const u32 slices_needed = std::min(z_count, image.info.size.depth); + image.AllocateComputeUnswizzleBuffer(slices_needed); } ASSERT(swizzles.size() == 1); const auto& sw = swizzles[0]; const auto params = MakeBlockLinearSwizzle3DParams(sw, image.info); + const u32 blocks_x = (image.info.size.width + 3) / 4; + const u32 blocks_y = (image.info.size.height + 3) / 4; + + constexpr u32 SLICES_PER_CHUNK = 64; + + for (u32 z_offset = 0; z_offset < z_count; z_offset += SLICES_PER_CHUNK) { + const u32 current_chunk_slices = std::min(SLICES_PER_CHUNK, z_count - z_offset); + const u32 current_z_start = z_start + z_offset; + + UnswizzleChunk(image, swizzled, sw, params, blocks_x, blocks_y, + current_z_start, current_chunk_slices); + } +} + +void BlockLinearUnswizzle3DPass::UnswizzleChunk( + Image& image, + const StagingBufferRef& swizzled, + const VideoCommon::SwizzleParameters& sw, + const BlockLinearSwizzle3DParams& params, + u32 blocks_x, u32 blocks_y, + u32 z_start, u32 z_count) +{ BlockLinearUnswizzle3DPushConstants pc{}; pc.origin[0] = params.origin[0]; pc.origin[1] = params.origin[1]; - pc.origin[2] = z_start; // Start at the current Z-slice + pc.origin[2] = z_start; // Current chunk's Z start pc.destination[0] = params.destination[0]; pc.destination[1] = params.destination[1]; @@ -783,16 +811,18 @@ void BlockLinearUnswizzle3DPass::Unswizzle( pc.block_depth = params.block_depth; pc.block_depth_mask = params.block_depth_mask; - const u32 blocks_x = (image.info.size.width + 3) / 4; - const u32 blocks_y = (image.info.size.height + 3) / 4;
pc.blocks_dim[0] = blocks_x; pc.blocks_dim[1] = blocks_y; pc.blocks_dim[2] = z_count; // Only process this chunk's slice count compute_pass_descriptor_queue.Acquire(); - compute_pass_descriptor_queue.AddBuffer(*image.runtime->swizzle_table_buffer, 0, image.runtime->swizzle_table_size); - compute_pass_descriptor_queue.AddBuffer(swizzled.buffer, sw.buffer_offset + swizzled.offset, image.guest_size_bytes - sw.buffer_offset); - compute_pass_descriptor_queue.AddBuffer(*image.compute_unswizzle_buffer, 0, image.compute_unswizzle_buffer_size); + compute_pass_descriptor_queue.AddBuffer(*image.runtime->swizzle_table_buffer, 0, + image.runtime->swizzle_table_size); + compute_pass_descriptor_queue.AddBuffer(swizzled.buffer, + sw.buffer_offset + swizzled.offset, + image.guest_size_bytes - sw.buffer_offset); + compute_pass_descriptor_queue.AddBuffer(*image.compute_unswizzle_buffer, 0, + image.compute_unswizzle_buffer_size); const void* descriptor_data = compute_pass_descriptor_queue.UpdateData(); const VkDescriptorSet set = descriptor_allocator.Commit(); @@ -806,9 +836,11 @@ void BlockLinearUnswizzle3DPass::Unswizzle( static_cast<VkDeviceSize>(blocks_x) * blocks_y * bytes_per_block; const VkDeviceSize barrier_size = output_slice_size * z_count; + const bool is_first_chunk = (z_start == 0); + scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([this, &image, set, descriptor_data, pc, gx, gy, gz, z_start, z_count, - barrier_size](vk::CommandBuffer cmdbuf) { + barrier_size, is_first_chunk](vk::CommandBuffer cmdbuf) { const VkBuffer out_buffer = *image.compute_unswizzle_buffer; const VkImage dst_image = image.Handle(); const VkImageAspectFlags aspect = image.AspectMask(); @@ -819,8 +851,6 @@ void BlockLinearUnswizzle3DPass::Unswizzle( cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc); cmdbuf.Dispatch(gx, gy, gz); - const bool is_first = (z_start == 0); - // Single barrier for compute -> transfer (buffer ready, image transition) const VkBufferMemoryBarrier buffer_barrier{ .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, .pNext = nullptr, @@ -838,9 +868,10 @@ void BlockLinearUnswizzle3DPass::Unswizzle( const VkImageMemoryBarrier pre_barrier{ .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, .pNext = nullptr, - .srcAccessMask = is_first ? VkAccessFlags{} : static_cast<VkAccessFlags>(VK_ACCESS_SHADER_READ_BIT), + .srcAccessMask = is_first_chunk ? VkAccessFlags{} : + static_cast<VkAccessFlags>(VK_ACCESS_SHADER_READ_BIT), .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, - .oldLayout = is_first ? VK_IMAGE_LAYOUT_UNDEFINED : VK_IMAGE_LAYOUT_GENERAL, + .oldLayout = is_first_chunk ?
VK_IMAGE_LAYOUT_UNDEFINED : VK_IMAGE_LAYOUT_GENERAL, .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, @@ -856,12 +887,13 @@ void BlockLinearUnswizzle3DPass::Unswizzle( nullptr, buffer_barrier, pre_barrier ); + // Copy chunk to correct Z position in image const VkBufferImageCopy copy{ - .bufferOffset = 0, + .bufferOffset = 0, // Read from start of staging buffer .bufferRowLength = 0, .bufferImageHeight = 0, .imageSubresource = {aspect, 0, 0, 1}, - .imageOffset = {0, 0, static_cast<s32>(z_start)}, + .imageOffset = {0, 0, static_cast<s32>(z_start)}, // Write to correct Z .imageExtent = {image.info.size.width, image.info.size.height, z_count}, }; cmdbuf.CopyBufferToImage(out_buffer, dst_image, diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h index 812504e540..3edcf020a2 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.h +++ b/src/video_core/renderer_vulkan/vk_compute_pass.h @@ -17,6 +17,7 @@ #include "video_core/texture_cache/types.h" #include "video_core/vulkan_common/vulkan_memory_allocator.h" #include "video_core/vulkan_common/vulkan_wrapper.h" +#include "video_core/texture_cache/accelerated_swizzle.h" namespace VideoCommon { struct SwizzleParameters; @@ -24,6 +25,8 @@ struct SwizzleParameters; namespace Vulkan { +using VideoCommon::Accelerated::BlockLinearSwizzle3DParams; + class Device; class StagingBufferPool; class Scheduler; @@ -146,6 +149,14 @@ public: const StagingBufferRef& swizzled, std::span<const VideoCommon::SwizzleParameters> swizzles, u32 z_start, u32 z_count); + + void UnswizzleChunk( + Image& image, + const StagingBufferRef& swizzled, + const VideoCommon::SwizzleParameters& sw, + const BlockLinearSwizzle3DParams& params, + u32 blocks_x, u32 blocks_y, + u32 z_start, u32 z_count); private: Scheduler& scheduler; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index b52f12a648..6437cc4c5b 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -1620,7 +1620,7 @@ Image::Image(const VideoCommon::NullImageParams& params) : VideoCommon::ImageBas Image::~Image() = default; -void Image::AllocateComputeUnswizzleBuffer() { +void Image::AllocateComputeUnswizzleBuffer(u32 max_slices) { if (has_compute_unswizzle_buffer) return; @@ -1633,7 +1633,7 @@ void Image::AllocateComputeUnswizzleBuffer() { // BCn is 4x4x1 blocks const u32 blocks_x = (info.size.width + block_width - 1) / block_width; const u32 blocks_y = (info.size.height + block_height - 1) / block_height; - const u32 blocks_z = info.size.depth; + const u32 blocks_z = std::min(max_slices, info.size.depth); const u64 block_count = static_cast<u64>(blocks_x) * diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index f692a78c3d..b545ba8669 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -196,6 +196,8 @@ public: bool ScaleUp(bool ignore = false); bool ScaleDown(bool ignore = false); + + u64 allocation_tick = 0; friend class BlockLinearUnswizzle3DPass; @@ -214,7 +216,7 @@ private: VkDeviceSize compute_unswizzle_buffer_size = 0; bool has_compute_unswizzle_buffer = false; - void AllocateComputeUnswizzleBuffer(); + void AllocateComputeUnswizzleBuffer(u32 max_slices); // Use a pointer to field because it is relative, so that the object can be moved
without breaking the reference. diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 8a57a7b247..82443d6599 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -98,6 +98,13 @@ void TextureCache
<P>
::RunGarbageCollector() { } --num_iterations; auto& image = slot_images[image_id]; + + // Never delete recently allocated sparse textures (within 3 frames) + const bool is_recently_allocated = image.allocation_tick >= frame_tick - 3; + if (is_recently_allocated && image.info.is_sparse) { + return false; + } + if (True(image.flags & ImageFlagBits::IsDecoding)) { // This image is still being decoded, deleting it will invalidate the slot // used by the async decoder thread. @@ -153,7 +160,13 @@ void TextureCache
<P>
::RunGarbageCollector() { if (total_used_memory >= expected_memory) { lru_cache.ForEachItemBelow(frame_tick, [&](ImageId image_id) { auto& image = slot_images[image_id]; - if (image.info.is_sparse && image.guest_size_bytes >= 256_MiB) { + // Only target sparse textures that are old enough + if (image.info.is_sparse && + image.guest_size_bytes >= 256_MiB && + image.allocation_tick < frame_tick - 3) { + LOG_DEBUG(HW_GPU, "GC targeting old sparse texture at 0x{:X} ({} MiB, age: {} frames)", + image.gpu_addr, image.guest_size_bytes / (1024 * 1024), + frame_tick - image.allocation_tick); return Cleanup(image_id); } return false; @@ -1428,6 +1441,11 @@ void TextureCache
<P>
::TickAsyncUnswizzle() { return; } + if (current_unswizzle_frame > 0) { + current_unswizzle_frame--; + return; + } + PendingUnswizzle& task = unswizzle_queue.front(); Image& image = slot_images[task.image_id]; @@ -1492,6 +1510,9 @@ void TextureCache
<P>
::TickAsyncUnswizzle() { if (total_used_memory >= expected_memory) { RunGarbageCollector(); } + + // Wait 4 frames before processing the next queued entry + current_unswizzle_frame = 4u; } } @@ -1534,24 +1555,29 @@ ImageId TextureCache
<P>
::InsertImage(const ImageInfo& info, GPUVAddr gpu_addr, } ASSERT_MSG(cpu_addr, "Tried to insert an image to an invalid gpu_addr=0x{:x}", gpu_addr); - // For large sparse textures, aggressively clean up old allocation at same address + // For large sparse textures, aggressively clean up old allocations at same address if (info.is_sparse && CalculateGuestSizeInBytes(info) >= 256_MiB) { const auto alloc_it = image_allocs_table.find(gpu_addr); if (alloc_it != image_allocs_table.end()) { const ImageAllocId alloc_id = alloc_it->second; auto& alloc_images = slot_image_allocs[alloc_id].images; - // Immediately delete old images at this address before allocating new one + // Collect old images at this address that were created more than 2 frames ago boost::container::small_vector to_delete; for (ImageId old_image_id : alloc_images) { Image& old_image = slot_images[old_image_id]; - if (old_image.info.is_sparse && old_image.gpu_addr == gpu_addr) { + if (old_image.info.is_sparse && + old_image.gpu_addr == gpu_addr && + old_image.allocation_tick < frame_tick - 2) { // Try not to delete fresh textures to_delete.push_back(old_image_id); } } + // Delete old images immediately for (ImageId old_id : to_delete) { Image& old_image = slot_images[old_id]; + LOG_INFO(HW_GPU, "Immediately deleting old sparse texture at 0x{:X} ({} MiB)", + gpu_addr, old_image.guest_size_bytes / (1024 * 1024)); if (True(old_image.flags & ImageFlagBits::Tracked)) { UntrackImage(old_image, old_id); } @@ -1577,13 +1603,23 @@ ImageId TextureCache
<P>
::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, DA ImageInfo new_info = info; const size_t size_bytes = CalculateGuestSizeInBytes(new_info); + // Proactive cleanup for large sparse texture allocations if (new_info.is_sparse && size_bytes >= 256_MiB) { const u64 estimated_alloc_size = size_bytes; - + if (total_used_memory + estimated_alloc_size >= critical_memory) { - LOG_WARNING(HW_GPU, "Large sparse texture allocation ({} MiB) - running aggressive GC", - size_bytes / (1024 * 1024)); + LOG_WARNING(HW_GPU, "Large sparse texture allocation ({} MiB) - running aggressive GC. " + "Current memory: {} MiB, Critical: {} MiB", + size_bytes / (1024 * 1024), + total_used_memory / (1024 * 1024), + critical_memory / (1024 * 1024)); RunGarbageCollector(); + + // If still over threshold after GC, try one more aggressive pass + if (total_used_memory + estimated_alloc_size >= critical_memory) { + LOG_WARNING(HW_GPU, "Still critically low on memory, running second GC pass"); + RunGarbageCollector(); + } } } @@ -1682,6 +1718,8 @@ ImageId TextureCache
<P>
::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, DA const ImageId new_image_id = slot_images.insert(runtime, new_info, gpu_addr, cpu_addr); Image& new_image = slot_images[new_image_id]; + + new_image.allocation_tick = frame_tick; if (!gpu_memory->IsContinuousRange(new_image.gpu_addr, new_image.guest_size_bytes) && new_info.is_sparse) {
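The chunked path keeps the unswizzle staging buffer proportional to the slice count it is sized for rather than to the full image depth. Below is a minimal standalone sketch of that sizing arithmetic, mirroring the (dim + 3) / 4 BCn block rounding used by the pass; it is not part of the patch, and the 16-byte block size (BC7), the example dimensions, and the StagingBytes helper are illustrative assumptions.

// Standalone sketch (not from the patch): estimates the staging-buffer size for a
// given slice count versus the previous whole-image allocation. BCn formats use
// 4x4 texel blocks, matching the (dim + 3) / 4 rounding in BlockLinearUnswizzle3DPass.
#include <algorithm>
#include <cstdint>
#include <cstdio>

constexpr std::uint32_t SLICES_PER_CHUNK = 64; // same constant the pass dispatches with

std::uint64_t StagingBytes(std::uint32_t width, std::uint32_t height, std::uint32_t depth,
                           std::uint32_t slices, std::uint32_t bytes_per_block) {
    const std::uint64_t blocks_x = (width + 3) / 4;
    const std::uint64_t blocks_y = (height + 3) / 4;
    const std::uint64_t blocks_z = std::min(slices, depth);
    return blocks_x * blocks_y * blocks_z * bytes_per_block;
}

int main() {
    // Hypothetical 2048x2048x1024 BC7 sparse texture (16 bytes per 4x4 block).
    const std::uint64_t whole = StagingBytes(2048, 2048, 1024, 1024, 16);
    const std::uint64_t chunk = StagingBytes(2048, 2048, 1024, SLICES_PER_CHUNK, 16);
    std::printf("whole image: %llu MiB, per chunk: %llu MiB\n",
                static_cast<unsigned long long>(whole >> 20),
                static_cast<unsigned long long>(chunk >> 20));
}

For that example the whole-image buffer works out to 4096 MiB while a 64-slice allocation is 256 MiB, which is the saving the commit message describes.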
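Both RunGarbageCollector and InsertImage gate deletion on allocation_tick being at least a few frames older than frame_tick. A minimal sketch of that age check follows, assuming a guard for the first frames where the u64 subtraction frame_tick - 3 would otherwise wrap around; the IsOlderThan helper name is hypothetical and not taken from the patch.

// Standalone sketch (not from the patch): the age gate used when deciding whether a
// sparse texture is old enough to collect or to evict at InsertImage time.
#include <cstdint>

using u64 = std::uint64_t;

constexpr bool IsOlderThan(u64 allocation_tick, u64 frame_tick, u64 age) {
    // Treat everything as recent until enough frames have elapsed to compare safely.
    return frame_tick >= age && allocation_tick < frame_tick - age;
}

static_assert(!IsOlderThan(0, 2, 3));   // early frames: never considered old
static_assert(IsOlderThan(10, 20, 3));  // allocated 10 frames ago: old enough to collect
static_assert(!IsOlderThan(19, 20, 3)); // allocated last frame: keep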