From 4de2faf940362f1fd016673caf21a1267bfc9bf5 Mon Sep 17 00:00:00 2001 From: wildcard Date: Thu, 14 Aug 2025 00:18:13 +0200 Subject: [PATCH] [VK] BufferCache fixes not to be merged yet just for benchmarks DMAClear now treats amount as bytes and marks the region GPU-modified (not subtracting — one of the reasons why we need benchmarking). The page-scanner in IsRegionRegistered() now advances with buf_end_addr so there should be no infinite loop now. Overlap growth now expands left with expand_begin and right with expand_end (should stop stream buffers thrashing). Also skipping zero-fill on new buffers to save bandwidth; this is probably safe since we are doing sync in upload paths before GPU reads. --- src/video_core/buffer_cache/buffer_cache.h | 24 +++++++++++----------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 0cd6861b6d..22fc1a326b 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -249,22 +249,22 @@ bool BufferCache

::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am template bool BufferCache

::DMAClear(GPUVAddr dst_address, u64 amount, u32 value) { - const std::optional cpu_dst_address = gpu_memory->GpuToCpuAddress(dst_address); + const auto cpu_dst_address = gpu_memory->GpuToCpuAddress(dst_address); if (!cpu_dst_address) { return false; } - const bool dest_dirty = IsRegionRegistered(*cpu_dst_address, amount); - if (!dest_dirty) { + const size_t size = static_cast(amount);// amount should be BYTES (same as DMACopy) + if (!IsRegionRegistered(*cpu_dst_address, size)) { return false; } - const size_t size = amount * sizeof(u32); - ClearDownload(*cpu_dst_address, size); - gpu_modified_ranges.Subtract(*cpu_dst_address, size); + ClearDownload(*cpu_dst_address, size);// cancel pending downloads const BufferId buffer = FindBuffer(*cpu_dst_address, static_cast(size)); Buffer& dest_buffer = slot_buffers[buffer]; const u32 offset = dest_buffer.Offset(*cpu_dst_address); + // GPU is writing here (a DMA clear is a GPU write, not a CPU write, so don't subtract); mark as GPU-modified (same as DMACopy and MarkWrittenBuffer) + MarkWrittenBuffer(buffer, *cpu_dst_address, static_cast(size)); // adds + tracks uncommitted runtime.ClearBuffer(dest_buffer, offset, size, value); dest_buffer.MarkUsage(offset, size); return true; @@ -671,7 +671,7 @@ bool BufferCache

::IsRegionRegistered(DAddr addr, size_t size) { if (buf_start_addr < end_addr && addr < buf_end_addr) { return true; } - page = Common::DivCeil(end_addr, CACHING_PAGESIZE); + page = Common::DivCeil(buf_end_addr, CACHING_PAGESIZE); //fix the scanning } return false; } @@ -1349,10 +1349,10 @@ typename BufferCache

::OverlapResult BufferCache

::ResolveOverlaps(DAddr dev // as a stream buffer. Increase the size to skip constantly recreating buffers. has_stream_leap = true; if (expands_right) { - expand_begin(CACHING_PAGESIZE * 128); + expand_end(CACHING_PAGESIZE * 128); // previous logic seemed inverted; this fixes it } if (expands_left) { - expand_end(CACHING_PAGESIZE * 128); + expand_begin(CACHING_PAGESIZE * 128); // same as above } } } @@ -1393,9 +1393,9 @@ BufferId BufferCache

::CreateBuffer(DAddr device_addr, u32 wanted_size) { const u32 size = static_cast(overlap.end - overlap.begin); const BufferId new_buffer_id = slot_buffers.insert(runtime, overlap.begin, size); auto& new_buffer = slot_buffers[new_buffer_id]; - const size_t size_bytes = new_buffer.SizeBytes(); - runtime.ClearBuffer(new_buffer, 0, size_bytes, 0); - new_buffer.MarkUsage(0, size_bytes); + //const size_t size_bytes = new_buffer.SizeBytes(); Vulkan does not require new buffers to be filled with zeros; save some bandwidth + //runtime.ClearBuffer(new_buffer, 0, size_bytes, 0); + //new_buffer.MarkUsage(0, size_bytes); for (const BufferId overlap_id : overlap.ids) { JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap); }