
Added more logs and barriers for garbage collection

Attempt to improve VRAM usage of the unswizzle by only allocating what is needed instead of the whole image
Branch: pull/3246/head
Author: Forrest Keller, 1 month ago; committed by crueter
Commit: 224ea76724
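
The change caps the compute-unswizzle staging buffer at the number of Z slices the current batch actually needs (std::min(z_count, depth)) and then dispatches and copies the image in 64-slice chunks. A minimal sketch of that sizing arithmetic, assuming 4x4 BCn blocks as the diff does; the struct and function below are standalone stand-ins for illustration, not the project's actual API:

// Sketch only: sizes a staging buffer for a batch of Z slices instead of the
// whole 3D image, mirroring AllocateComputeUnswizzleBuffer(max_slices) in the
// diff below. All names here are local stand-ins.
#include <algorithm>
#include <cstdint>

struct Extent3D {
    std::uint32_t width, height, depth;
};

// Bytes needed to hold `slices` unswizzled slices of a BCn image
// (4x4 texel blocks, `bytes_per_block` bytes each).
std::uint64_t SliceBufferSize(const Extent3D& size, std::uint32_t slices,
                              std::uint32_t bytes_per_block) {
    const std::uint32_t blocks_x = (size.width + 3) / 4;
    const std::uint32_t blocks_y = (size.height + 3) / 4;
    // Cap at the image depth, like std::min(max_slices, info.size.depth) in the diff.
    const std::uint32_t blocks_z = std::min(slices, size.depth);
    return static_cast<std::uint64_t>(blocks_x) * blocks_y * blocks_z * bytes_per_block;
}

int main() {
    // Example: a 2048x2048x256 BC7 image (16 bytes per 4x4 block).
    const Extent3D size{2048, 2048, 256};
    const std::uint64_t whole_image = SliceBufferSize(size, size.depth, 16); // 1 GiB
    const std::uint64_t one_batch = SliceBufferSize(size, 64, 16);           // 256 MiB
    return one_batch < whole_image ? 0 : 1;
}

For this example, a 64-slice batch needs a 256 MiB buffer instead of 1 GiB for the full depth; the actual saving depends on how many slices the caller passes per Unswizzle call, and the SLICES_PER_CHUNK loop in vk_compute_pass.cpp then dispatches and copies 64 slices at a time into the destination image's Z range.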
Changed files:

  1. src/video_core/renderer_opengl/gl_texture_cache.h (2 changes)
  2. src/video_core/renderer_vulkan/vk_compute_pass.cpp (62 changes)
  3. src/video_core/renderer_vulkan/vk_compute_pass.h (11 changes)
  4. src/video_core/renderer_vulkan/vk_texture_cache.cpp (4 changes)
  5. src/video_core/renderer_vulkan/vk_texture_cache.h (4 changes)
  6. src/video_core/texture_cache/texture_cache.h (52 changes)

src/video_core/renderer_opengl/gl_texture_cache.h (2 changes)

@@ -229,6 +229,8 @@ public:
bool ScaleDown(bool ignore = false);
u64 allocation_tick;
private:
void CopyBufferToImage(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset);

src/video_core/renderer_vulkan/vk_compute_pass.cpp (62 changes)

@@ -753,22 +753,50 @@ void BlockLinearUnswizzle3DPass::Unswizzle(
Image& image,
const StagingBufferRef& swizzled,
std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count)
{
using namespace VideoCommon::Accelerated;
// Leaving this here in case instances are found where slices_needed causes device loss
// Tune this for a balance between speed and size; I don't own a Deck so can't tune it myself
// constexpr u32 MAX_BATCH_SLICES = 64;
if (!image.has_compute_unswizzle_buffer) {
-image.AllocateComputeUnswizzleBuffer();
+// Allocate exactly what this batch needs
+const u32 slices_needed = std::min(z_count, image.info.size.depth);
+image.AllocateComputeUnswizzleBuffer(slices_needed);
}
ASSERT(swizzles.size() == 1);
const auto& sw = swizzles[0];
const auto params = MakeBlockLinearSwizzle3DParams(sw, image.info);
const u32 blocks_x = (image.info.size.width + 3) / 4;
const u32 blocks_y = (image.info.size.height + 3) / 4;
constexpr u32 SLICES_PER_CHUNK = 64;
for (u32 z_offset = 0; z_offset < z_count; z_offset += SLICES_PER_CHUNK) {
const u32 current_chunk_slices = std::min(SLICES_PER_CHUNK, z_count - z_offset);
const u32 current_z_start = z_start + z_offset;
UnswizzleChunk(image, swizzled, sw, params, blocks_x, blocks_y,
current_z_start, current_chunk_slices);
}
}
void BlockLinearUnswizzle3DPass::UnswizzleChunk(
Image& image,
const StagingBufferRef& swizzled,
const VideoCommon::SwizzleParameters& sw,
const BlockLinearSwizzle3DParams& params,
u32 blocks_x, u32 blocks_y,
u32 z_start, u32 z_count)
{
BlockLinearUnswizzle3DPushConstants pc{};
pc.origin[0] = params.origin[0];
pc.origin[1] = params.origin[1];
-pc.origin[2] = z_start; // Start at the current Z-slice
+pc.origin[2] = z_start; // Current chunk's Z start
pc.destination[0] = params.destination[0];
pc.destination[1] = params.destination[1];
@@ -783,16 +811,18 @@ void BlockLinearUnswizzle3DPass::Unswizzle(
pc.block_depth = params.block_depth;
pc.block_depth_mask = params.block_depth_mask;
-const u32 blocks_x = (image.info.size.width + 3) / 4;
-const u32 blocks_y = (image.info.size.height + 3) / 4;
pc.blocks_dim[0] = blocks_x;
pc.blocks_dim[1] = blocks_y;
pc.blocks_dim[2] = z_count; // Only process the count
compute_pass_descriptor_queue.Acquire();
-compute_pass_descriptor_queue.AddBuffer(*image.runtime->swizzle_table_buffer, 0, image.runtime->swizzle_table_size);
-compute_pass_descriptor_queue.AddBuffer(swizzled.buffer, sw.buffer_offset + swizzled.offset, image.guest_size_bytes - sw.buffer_offset);
-compute_pass_descriptor_queue.AddBuffer(*image.compute_unswizzle_buffer, 0, image.compute_unswizzle_buffer_size);
+compute_pass_descriptor_queue.AddBuffer(*image.runtime->swizzle_table_buffer, 0,
+image.runtime->swizzle_table_size);
+compute_pass_descriptor_queue.AddBuffer(swizzled.buffer,
+sw.buffer_offset + swizzled.offset,
+image.guest_size_bytes - sw.buffer_offset);
+compute_pass_descriptor_queue.AddBuffer(*image.compute_unswizzle_buffer, 0,
+image.compute_unswizzle_buffer_size);
const void* descriptor_data = compute_pass_descriptor_queue.UpdateData();
const VkDescriptorSet set = descriptor_allocator.Commit();
@@ -806,9 +836,11 @@ void BlockLinearUnswizzle3DPass::Unswizzle(
static_cast<VkDeviceSize>(blocks_x) * blocks_y * bytes_per_block;
const VkDeviceSize barrier_size = output_slice_size * z_count;
const bool is_first_chunk = (z_start == 0);
scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([this, &image, set, descriptor_data, pc, gx, gy, gz, z_start, z_count,
-barrier_size](vk::CommandBuffer cmdbuf) {
+barrier_size, is_first_chunk](vk::CommandBuffer cmdbuf) {
const VkBuffer out_buffer = *image.compute_unswizzle_buffer;
const VkImage dst_image = image.Handle();
const VkImageAspectFlags aspect = image.AspectMask();
@@ -819,8 +851,6 @@ void BlockLinearUnswizzle3DPass::Unswizzle(
cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
cmdbuf.Dispatch(gx, gy, gz);
-const bool is_first = (z_start == 0);
// Single barrier for compute -> transfer (buffer ready, image transition)
const VkBufferMemoryBarrier buffer_barrier{
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
@@ -838,9 +868,10 @@ void BlockLinearUnswizzle3DPass::Unswizzle(
const VkImageMemoryBarrier pre_barrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
-.srcAccessMask = is_first ? VkAccessFlags{} : static_cast<VkAccessFlags>(VK_ACCESS_SHADER_READ_BIT),
+.srcAccessMask = is_first_chunk ? VkAccessFlags{} :
+static_cast<VkAccessFlags>(VK_ACCESS_SHADER_READ_BIT),
.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
-.oldLayout = is_first ? VK_IMAGE_LAYOUT_UNDEFINED : VK_IMAGE_LAYOUT_GENERAL,
+.oldLayout = is_first_chunk ? VK_IMAGE_LAYOUT_UNDEFINED : VK_IMAGE_LAYOUT_GENERAL,
.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
@@ -856,12 +887,13 @@ void BlockLinearUnswizzle3DPass::Unswizzle(
nullptr, buffer_barrier, pre_barrier
);
// Copy chunk to correct Z position in image
const VkBufferImageCopy copy{
-.bufferOffset = 0,
+.bufferOffset = 0, // Read from start of staging buffer
.bufferRowLength = 0,
.bufferImageHeight = 0,
.imageSubresource = {aspect, 0, 0, 1},
-.imageOffset = {0, 0, static_cast<s32>(z_start)},
+.imageOffset = {0, 0, static_cast<s32>(z_start)}, // Write to correct Z
.imageExtent = {image.info.size.width, image.info.size.height, z_count},
};
cmdbuf.CopyBufferToImage(out_buffer, dst_image,

src/video_core/renderer_vulkan/vk_compute_pass.h (11 changes)

@@ -17,6 +17,7 @@
#include "video_core/texture_cache/types.h"
#include "video_core/vulkan_common/vulkan_memory_allocator.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
#include "video_core/texture_cache/accelerated_swizzle.h"
namespace VideoCommon {
struct SwizzleParameters;
@@ -24,6 +25,8 @@ struct SwizzleParameters;
namespace Vulkan {
using VideoCommon::Accelerated::BlockLinearSwizzle3DParams;
class Device;
class StagingBufferPool;
class Scheduler;
@@ -146,6 +149,14 @@ public:
const StagingBufferRef& swizzled,
std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count);
void UnswizzleChunk(
Image& image,
const StagingBufferRef& swizzled,
const VideoCommon::SwizzleParameters& sw,
const BlockLinearSwizzle3DParams& params,
u32 blocks_x, u32 blocks_y,
u32 z_start, u32 z_count);
private:
Scheduler& scheduler;

src/video_core/renderer_vulkan/vk_texture_cache.cpp (4 changes)

@@ -1620,7 +1620,7 @@ Image::Image(const VideoCommon::NullImageParams& params) : VideoCommon::ImageBas
Image::~Image() = default;
-void Image::AllocateComputeUnswizzleBuffer() {
+void Image::AllocateComputeUnswizzleBuffer(u32 max_slices) {
if (has_compute_unswizzle_buffer)
return;
@@ -1633,7 +1633,7 @@ void Image::AllocateComputeUnswizzleBuffer() {
// BCn is 4x4x1 blocks
const u32 blocks_x = (info.size.width + block_width - 1) / block_width;
const u32 blocks_y = (info.size.height + block_height - 1) / block_height;
-const u32 blocks_z = info.size.depth;
+const u32 blocks_z = std::min(max_slices, info.size.depth);
const u64 block_count =
static_cast<u64>(blocks_x) *

src/video_core/renderer_vulkan/vk_texture_cache.h (4 changes)

@@ -196,6 +196,8 @@ public:
bool ScaleUp(bool ignore = false);
bool ScaleDown(bool ignore = false);
u64 allocation_tick;
friend class BlockLinearUnswizzle3DPass;
@@ -214,7 +216,7 @@ private:
VkDeviceSize compute_unswizzle_buffer_size = 0;
bool has_compute_unswizzle_buffer = false;
-void AllocateComputeUnswizzleBuffer();
+void AllocateComputeUnswizzleBuffer(u32 max_slices);
// Use a pointer to field because it is relative, so that the object can be
// moved without breaking the reference.

src/video_core/texture_cache/texture_cache.h (52 changes)

@@ -98,6 +98,13 @@ void TextureCache<P>::RunGarbageCollector() {
}
--num_iterations;
auto& image = slot_images[image_id];
// Never delete recently allocated sparse textures (within 3 frames)
const bool is_recently_allocated = image.allocation_tick >= frame_tick - 3;
if (is_recently_allocated && image.info.is_sparse) {
return false;
}
if (True(image.flags & ImageFlagBits::IsDecoding)) {
// This image is still being decoded, deleting it will invalidate the slot
// used by the async decoder thread.
@@ -153,7 +160,13 @@ void TextureCache<P>::RunGarbageCollector() {
if (total_used_memory >= expected_memory) {
lru_cache.ForEachItemBelow(frame_tick, [&](ImageId image_id) {
auto& image = slot_images[image_id];
-if (image.info.is_sparse && image.guest_size_bytes >= 256_MiB) {
+// Only target sparse textures that are old enough
+if (image.info.is_sparse &&
+image.guest_size_bytes >= 256_MiB &&
+image.allocation_tick < frame_tick - 3) {
LOG_DEBUG(HW_GPU, "GC targeting old sparse texture at 0x{:X} ({} MiB, age: {} frames)",
image.gpu_addr, image.guest_size_bytes / (1024 * 1024),
frame_tick - image.allocation_tick);
return Cleanup(image_id);
}
return false;
@@ -1428,6 +1441,11 @@ void TextureCache<P>::TickAsyncUnswizzle() {
return;
}
if(current_unswizzle_frame > 0) {
current_unswizzle_frame--;
return;
}
PendingUnswizzle& task = unswizzle_queue.front();
Image& image = slot_images[task.image_id];
@@ -1492,6 +1510,9 @@ void TextureCache<P>::TickAsyncUnswizzle() {
if (total_used_memory >= expected_memory) {
RunGarbageCollector();
}
// Wait 4 frames to process the next entry
current_unswizzle_frame = 4u;
}
}
@@ -1534,24 +1555,29 @@ ImageId TextureCache<P>::InsertImage(const ImageInfo& info, GPUVAddr gpu_addr,
}
ASSERT_MSG(cpu_addr, "Tried to insert an image to an invalid gpu_addr=0x{:x}", gpu_addr);
-// For large sparse textures, aggressively clean up old allocation at same address
+// For large sparse textures, aggressively clean up old allocations at same address
if (info.is_sparse && CalculateGuestSizeInBytes(info) >= 256_MiB) {
const auto alloc_it = image_allocs_table.find(gpu_addr);
if (alloc_it != image_allocs_table.end()) {
const ImageAllocId alloc_id = alloc_it->second;
auto& alloc_images = slot_image_allocs[alloc_id].images;
-// Immediately delete old images at this address before allocating new one
+// Collect old images at this address that were created more than 2 frames ago
boost::container::small_vector<ImageId, 4> to_delete;
for (ImageId old_image_id : alloc_images) {
Image& old_image = slot_images[old_image_id];
-if (old_image.info.is_sparse && old_image.gpu_addr == gpu_addr) {
+if (old_image.info.is_sparse &&
+old_image.gpu_addr == gpu_addr &&
+old_image.allocation_tick < frame_tick - 2) { // Try not to delete fresh textures
to_delete.push_back(old_image_id);
}
}
// Delete old images immediately
for (ImageId old_id : to_delete) {
Image& old_image = slot_images[old_id];
LOG_INFO(HW_GPU, "Immediately deleting old sparse texture at 0x{:X} ({} MiB)",
gpu_addr, old_image.guest_size_bytes / (1024 * 1024));
if (True(old_image.flags & ImageFlagBits::Tracked)) {
UntrackImage(old_image, old_id);
}
@@ -1577,13 +1603,23 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, DA
ImageInfo new_info = info;
const size_t size_bytes = CalculateGuestSizeInBytes(new_info);
// Proactive cleanup for large sparse texture allocations
if (new_info.is_sparse && size_bytes >= 256_MiB) {
const u64 estimated_alloc_size = size_bytes;
if (total_used_memory + estimated_alloc_size >= critical_memory) {
LOG_WARNING(HW_GPU, "Large sparse texture allocation ({} MiB) - running aggressive GC",
size_bytes / (1024 * 1024));
LOG_WARNING(HW_GPU, "Large sparse texture allocation ({} MiB) - running aggressive GC. "
"Current memory: {} MiB, Critical: {} MiB",
size_bytes / (1024 * 1024),
total_used_memory / (1024 * 1024),
critical_memory / (1024 * 1024));
RunGarbageCollector();
// If still over threshold after GC, try one more aggressive pass
if (total_used_memory + estimated_alloc_size >= critical_memory) {
LOG_WARNING(HW_GPU, "Still critically low on memory, running second GC pass");
RunGarbageCollector();
}
}
}
@@ -1682,6 +1718,8 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, DA
const ImageId new_image_id = slot_images.insert(runtime, new_info, gpu_addr, cpu_addr);
Image& new_image = slot_images[new_image_id];
new_image.allocation_tick = frame_tick;
if (!gpu_memory->IsContinuousRange(new_image.gpu_addr, new_image.guest_size_bytes) &&
new_info.is_sparse) {
