From ad99a0769c59f26a9c8b5f8326ecd79c80232367 Mon Sep 17 00:00:00 2001 From: Forrest Keller Date: Fri, 2 Jan 2026 01:59:55 -0600 Subject: [PATCH] Small improvement to sparse texture loading --- .../renderer_vulkan/vk_compute_pass.cpp | 27 ++++- src/video_core/texture_cache/texture_cache.h | 107 +----------------- src/video_core/texture_cache/util.cpp | 41 ------- src/video_core/texture_cache/util.h | 7 -- 4 files changed, 28 insertions(+), 154 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index d60c6d0cd4..b7684b6a87 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -801,8 +801,14 @@ void BlockLinearUnswizzle3DPass::Unswizzle( const u32 gy = Common::DivCeil(blocks_y, 8u); const u32 gz = Common::DivCeil(z_count, 1u); + const u32 bytes_per_block = 1u << pc.bytes_per_block_log2; + const VkDeviceSize output_slice_size = + static_cast(blocks_x) * blocks_y * bytes_per_block; + const VkDeviceSize barrier_size = output_slice_size * z_count; + scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([this, &image, set, descriptor_data, pc, gx, gy, gz, z_start, z_count](vk::CommandBuffer cmdbuf) { + scheduler.Record([this, &image, set, descriptor_data, pc, gx, gy, gz, z_start, z_count, + barrier_size](vk::CommandBuffer cmdbuf) { const VkBuffer out_buffer = *image.compute_unswizzle_buffer; const VkImage dst_image = image.Handle(); const VkImageAspectFlags aspect = image.AspectMask(); @@ -818,19 +824,26 @@ void BlockLinearUnswizzle3DPass::Unswizzle( // Single barrier for compute -> transfer (buffer ready, image transition) const VkBufferMemoryBarrier buffer_barrier{ .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, + .pNext = nullptr, .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .buffer = out_buffer, .offset = 0, - .size = VK_WHOLE_SIZE, + .size = barrier_size, }; + // Image layout transition const VkImageMemoryBarrier pre_barrier{ .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, .srcAccessMask = is_first ? VkAccessFlags{} : static_cast(VK_ACCESS_SHADER_READ_BIT), .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, .oldLayout = is_first ? VK_IMAGE_LAYOUT_UNDEFINED : VK_IMAGE_LAYOUT_GENERAL, .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .image = dst_image, .subresourceRange = {aspect, 0, 1, 0, 1}, }; @@ -845,19 +858,25 @@ void BlockLinearUnswizzle3DPass::Unswizzle( const VkBufferImageCopy copy{ .bufferOffset = 0, + .bufferRowLength = 0, + .bufferImageHeight = 0, .imageSubresource = {aspect, 0, 0, 1}, - .imageOffset = {0, 0, (s32)z_start}, + .imageOffset = {0, 0, static_cast(z_start)}, .imageExtent = {image.info.size.width, image.info.size.height, z_count}, }; - cmdbuf.CopyBufferToImage(out_buffer, dst_image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy); + cmdbuf.CopyBufferToImage(out_buffer, dst_image, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy); // Post-copy transition const VkImageMemoryBarrier post_barrier{ .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .image = dst_image, .subresourceRange = {aspect, 0, 1, 0, 1}, }; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 1d4000efbf..68f8e843c7 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -1098,22 +1098,6 @@ void TextureCache

::RefreshContents(Image& image, ImageId image_id) { image.flags &= ~ImageFlagBits::CpuModified; TrackImage(image, image_id); - - /*// If it's sparse and remapped, we treat it as a partial update trigger - if (image.info.is_sparse && True(image.flags & ImageFlagBits::Remapped)) { - image.flags &= ~ImageFlagBits::Remapped; - - if (!image.dirty_offsets.empty() && !image.sparse_bindings.empty()) { - constexpr u64 page_size = 64_KiB; - size_t dirty_size = image.dirty_offsets.size() * page_size; - - auto staging = runtime.UploadStagingBuffer(dirty_size); - UploadSparseDirtyTiles(image, staging); - runtime.InsertUploadMemoryBarrier(); - - return; - } - }*/ if (image.info.num_samples > 1 && !runtime.CanUploadMSAA()) { LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented"); @@ -1139,87 +1123,6 @@ void TextureCache

::RefreshContents(Image& image, ImageId image_id) { runtime.InsertUploadMemoryBarrier(); } -template -template -void TextureCache

::UploadSparseDirtyTiles(Image& image, StagingBuffer& staging) { - using namespace VideoCommon; - using namespace Tegra::Texture; - - std::vector all_copies; - size_t total_upload_size = 0; - - for (u64 dirty_tile_index : image.dirty_offsets) { - SparseBinding* binding = nullptr; - for (auto& [addr, bind] : image.sparse_bindings) { - if (bind.tile_index == dirty_tile_index) { - binding = &bind; - break; - } - } - - if (!binding) { - continue; - } - - const auto& coord = binding->tile_coord; - - // Calculate tile dimensions - const u32 tile_width_blocks = 128; - const u32 tile_height_blocks = 32; - const u32 tile_width = std::min(tile_width_blocks * 4, image.info.size.width - coord.width); - const u32 tile_height = std::min(tile_height_blocks * 4, image.info.size.height - coord.height); - const u32 tile_depth = std::min(1u, image.info.size.depth - coord.depth); - - const u32 bytes_per_block = BytesPerBlock(image.info.format); - const u32 blocks_wide = (tile_width + 3) / 4; - const u32 blocks_high = (tile_height + 3) / 4; - const size_t tile_unswizzled_size = blocks_wide * blocks_high * tile_depth * bytes_per_block; - - if (total_upload_size + tile_unswizzled_size > staging.mapped_span.size()) { - LOG_ERROR(HW_GPU, "Staging buffer too small"); - break; - } - - std::array tile_swizzled_data; - gpu_memory->ReadBlockUnsafe(binding->gpu_addr, tile_swizzled_data.data(), image.sparse_tile_size); - - // Get output span - auto tile_output = staging.mapped_span.subspan(total_upload_size, tile_unswizzled_size); - - // Unswizzle the tile - auto result = UnswizzleSparseTextureTile(tile_output, tile_swizzled_data, - image.info, tile_width, tile_height, tile_depth); - - // Create the copy descriptor - BufferImageCopy copy{ - .buffer_offset = total_upload_size, - .buffer_size = tile_unswizzled_size, - .buffer_row_length = result.buffer_row_length, - .buffer_image_height = result.buffer_image_height, - .image_subresource = { - .base_level = 0, - .base_layer = 0, - .num_layers = 1, - }, - .image_offset = { - static_cast(coord.width), - static_cast(coord.height), - static_cast(coord.depth) - }, - .image_extent = {tile_width, tile_height, tile_depth} - }; - - all_copies.push_back(copy); - total_upload_size += tile_unswizzled_size; - } - - if (!all_copies.empty()) { - image.UploadMemory(staging, all_copies); - } - - image.dirty_offsets.clear(); -} - template template void TextureCache

::UploadImageContents(Image& image, StagingBuffer& staging) { @@ -1503,9 +1406,9 @@ void TextureCache

::TickAsyncUnswizzle() { if (unswizzle_queue.empty()) { return; } - + // Don't process every frame - allow more data to accumulate - if (++current_unswizzle_frame < 2) return; + //if (++current_unswizzle_frame < 2) return; PendingUnswizzle& task = unswizzle_queue.front(); Image& image = slot_images[task.image_id]; @@ -1529,8 +1432,8 @@ void TextureCache

::TickAsyncUnswizzle() { } // ToDo: Make these configurable - static constexpr size_t CHUNK_SIZE = 64_MiB; - static constexpr u32 SLICES_PER_BATCH = 512u; + static constexpr size_t CHUNK_SIZE = 16_MiB; + static constexpr u32 SLICES_PER_BATCH = 64u; // Read data if (task.current_offset < task.total_size) { @@ -1569,7 +1472,7 @@ void TextureCache

::TickAsyncUnswizzle() { unswizzle_queue.pop_front(); } - current_unswizzle_frame = 0; + //current_unswizzle_frame = 0; } template diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp index 03c0c589c1..e55d0752ec 100644 --- a/src/video_core/texture_cache/util.cpp +++ b/src/video_core/texture_cache/util.cpp @@ -55,7 +55,6 @@ using Tegra::Texture::TextureFormat; using Tegra::Texture::TextureType; using Tegra::Texture::TICEntry; using Tegra::Texture::UnswizzleTexture; -using Tegra::Texture::UnswizzleSubrect; using VideoCore::Surface::BytesPerBlock; using VideoCore::Surface::DefaultBlockHeight; using VideoCore::Surface::DefaultBlockWidth; @@ -923,46 +922,6 @@ boost::container::small_vector UnswizzleImage(Tegra::Memory return copies; } -SparseTileUnswizzleResult UnswizzleSparseTextureTile(std::span output, - std::span input, - const ImageInfo& info, - u32 tile_width, - u32 tile_height, - u32 tile_depth) { - const Extent2D block_size = DefaultBlockSize(info.format); - const u32 bpp = BytesPerBlock(info.format); - const u32 width_blocks = (tile_width + block_size.width - 1) / block_size.width; - const u32 height_blocks = (tile_height + block_size.height - 1) / block_size.height; - - // Calculate GOBs per row - const u32 bytes_per_row = width_blocks * bpp; - const u32 gobs_per_row = (bytes_per_row + 63) / 64; - - // Calculate block_height for 64KB tiles - // 64KB / (gobs_per_row × 512 bytes) = GOBs tall - constexpr u32 TILE_SIZE = 65536; - const u32 gobs_tall = TILE_SIZE / (gobs_per_row * 512); - - // block_height = log2(gobs_tall) - const u32 tile_block_height = std::countr_zero(gobs_tall); - - const u32 pitch_linear = width_blocks * bpp; - - UnswizzleSubrect( - output, input, bpp, - width_blocks, height_blocks, tile_depth, - 0, 0, - width_blocks, height_blocks, - tile_block_height, 0, - pitch_linear - ); - - return { - .buffer_row_length = Common::AlignUp(tile_width, block_size.width), - .buffer_image_height = Common::AlignUp(tile_height, block_size.height) - }; -} - void ConvertImage(std::span input, const ImageInfo& info, std::span output, std::span copies) { u32 output_offset = 0; diff --git a/src/video_core/texture_cache/util.h b/src/video_core/texture_cache/util.h index dbf44d73c3..e9300cbae5 100644 --- a/src/video_core/texture_cache/util.h +++ b/src/video_core/texture_cache/util.h @@ -74,13 +74,6 @@ struct SparseTileUnswizzleResult { Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const ImageInfo& info, std::span input, std::span output); -[[nodiscard]] SparseTileUnswizzleResult UnswizzleSparseTextureTile(std::span output, - std::span input, - const ImageInfo& info, - u32 tile_width, - u32 tile_height, - u32 tile_depth); - void ConvertImage(std::span input, const ImageInfo& info, std::span output, std::span copies);