
Small improvement to sparse texture loading

pull/3246/head
Forrest Keller, 1 month ago (committed by crueter)
commit ad99a0769c

Changed files:
  1. src/video_core/renderer_vulkan/vk_compute_pass.cpp (27 changed lines)
  2. src/video_core/texture_cache/texture_cache.h (105 changed lines)
  3. src/video_core/texture_cache/util.cpp (41 changed lines)
  4. src/video_core/texture_cache/util.h (7 changed lines)

src/video_core/renderer_vulkan/vk_compute_pass.cpp (27 changed lines)

@@ -801,8 +801,14 @@ void BlockLinearUnswizzle3DPass::Unswizzle(
const u32 gy = Common::DivCeil(blocks_y, 8u);
const u32 gz = Common::DivCeil(z_count, 1u);
const u32 bytes_per_block = 1u << pc.bytes_per_block_log2;
const VkDeviceSize output_slice_size =
static_cast<VkDeviceSize>(blocks_x) * blocks_y * bytes_per_block;
const VkDeviceSize barrier_size = output_slice_size * z_count;
scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([this, &image, set, descriptor_data, pc, gx, gy, gz, z_start, z_count](vk::CommandBuffer cmdbuf) {
scheduler.Record([this, &image, set, descriptor_data, pc, gx, gy, gz, z_start, z_count,
barrier_size](vk::CommandBuffer cmdbuf) {
const VkBuffer out_buffer = *image.compute_unswizzle_buffer;
const VkImage dst_image = image.Handle();
const VkImageAspectFlags aspect = image.AspectMask();
@@ -818,19 +824,26 @@ void BlockLinearUnswizzle3DPass::Unswizzle(
// Single barrier for compute -> transfer (buffer ready, image transition)
const VkBufferMemoryBarrier buffer_barrier{
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.buffer = out_buffer,
.offset = 0,
.size = VK_WHOLE_SIZE,
.size = barrier_size,
};
// Image layout transition
const VkImageMemoryBarrier pre_barrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = is_first ? VkAccessFlags{} : static_cast<VkAccessFlags>(VK_ACCESS_SHADER_READ_BIT),
.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.oldLayout = is_first ? VK_IMAGE_LAYOUT_UNDEFINED : VK_IMAGE_LAYOUT_GENERAL,
.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.image = dst_image,
.subresourceRange = {aspect, 0, 1, 0, 1},
};
@@ -845,19 +858,25 @@ void BlockLinearUnswizzle3DPass::Unswizzle(
const VkBufferImageCopy copy{
.bufferOffset = 0,
.bufferRowLength = 0,
.bufferImageHeight = 0,
.imageSubresource = {aspect, 0, 0, 1},
.imageOffset = {0, 0, (s32)z_start},
.imageOffset = {0, 0, static_cast<s32>(z_start)},
.imageExtent = {image.info.size.width, image.info.size.height, z_count},
};
cmdbuf.CopyBufferToImage(out_buffer, dst_image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy);
cmdbuf.CopyBufferToImage(out_buffer, dst_image,
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy);
// Post-copy transition
const VkImageMemoryBarrier post_barrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.newLayout = VK_IMAGE_LAYOUT_GENERAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.image = dst_image,
.subresourceRange = {aspect, 0, 1, 0, 1},
};
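Note on the vk_compute_pass.cpp hunks above: the buffer barrier between the compute unswizzle and the buffer-to-image copy is now sized to the bytes the dispatch actually writes for this batch of slices instead of VK_WHOLE_SIZE, and the size is computed once outside the recorded lambda. A minimal sketch of that sizing, assuming blocks_x, blocks_y, z_count and bytes_per_block_log2 mean what they do in the diff; the helper name is illustrative, u32 is the codebase alias for uint32_t, and the Vulkan types come from <vulkan/vulkan.h>:

VkBufferMemoryBarrier MakeUnswizzleOutputBarrier(VkBuffer out_buffer, u32 blocks_x,
                                                 u32 blocks_y, u32 z_count,
                                                 u32 bytes_per_block_log2) {
    const u32 bytes_per_block = 1u << bytes_per_block_log2;
    // One unswizzled slice in the output buffer.
    const VkDeviceSize output_slice_size =
        static_cast<VkDeviceSize>(blocks_x) * blocks_y * bytes_per_block;
    // The dispatch writes z_count consecutive slices starting at offset 0.
    const VkDeviceSize barrier_size = output_slice_size * z_count;
    return VkBufferMemoryBarrier{
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
        .pNext = nullptr,
        .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,   // the compute shader wrote the buffer
        .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,  // the upcoming copy reads it
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .buffer = out_buffer,
        .offset = 0,
        .size = barrier_size,  // previously VK_WHOLE_SIZE
    };
}

Restricting the barrier to the written range can let the driver avoid synchronizing the untouched tail of the staging buffer.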

src/video_core/texture_cache/texture_cache.h (105 changed lines)

@@ -1099,22 +1099,6 @@ void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
image.flags &= ~ImageFlagBits::CpuModified;
TrackImage(image, image_id);
/*// If it's sparse and remapped, we treat it as a partial update trigger
if (image.info.is_sparse && True(image.flags & ImageFlagBits::Remapped)) {
image.flags &= ~ImageFlagBits::Remapped;
if (!image.dirty_offsets.empty() && !image.sparse_bindings.empty()) {
constexpr u64 page_size = 64_KiB;
size_t dirty_size = image.dirty_offsets.size() * page_size;
auto staging = runtime.UploadStagingBuffer(dirty_size);
UploadSparseDirtyTiles(image, staging);
runtime.InsertUploadMemoryBarrier();
return;
}
}*/
if (image.info.num_samples > 1 && !runtime.CanUploadMSAA()) {
LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented");
runtime.TransitionImageLayout(image);
@@ -1139,87 +1123,6 @@ void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
runtime.InsertUploadMemoryBarrier();
}
template <class P>
template <typename StagingBuffer>
void TextureCache<P>::UploadSparseDirtyTiles(Image& image, StagingBuffer& staging) {
using namespace VideoCommon;
using namespace Tegra::Texture;
std::vector<BufferImageCopy> all_copies;
size_t total_upload_size = 0;
for (u64 dirty_tile_index : image.dirty_offsets) {
SparseBinding* binding = nullptr;
for (auto& [addr, bind] : image.sparse_bindings) {
if (bind.tile_index == dirty_tile_index) {
binding = &bind;
break;
}
}
if (!binding) {
continue;
}
const auto& coord = binding->tile_coord;
// Calculate tile dimensions
const u32 tile_width_blocks = 128;
const u32 tile_height_blocks = 32;
const u32 tile_width = std::min(tile_width_blocks * 4, image.info.size.width - coord.width);
const u32 tile_height = std::min(tile_height_blocks * 4, image.info.size.height - coord.height);
const u32 tile_depth = std::min(1u, image.info.size.depth - coord.depth);
const u32 bytes_per_block = BytesPerBlock(image.info.format);
const u32 blocks_wide = (tile_width + 3) / 4;
const u32 blocks_high = (tile_height + 3) / 4;
const size_t tile_unswizzled_size = blocks_wide * blocks_high * tile_depth * bytes_per_block;
if (total_upload_size + tile_unswizzled_size > staging.mapped_span.size()) {
LOG_ERROR(HW_GPU, "Staging buffer too small");
break;
}
std::array<u8, 65536> tile_swizzled_data;
gpu_memory->ReadBlockUnsafe(binding->gpu_addr, tile_swizzled_data.data(), image.sparse_tile_size);
// Get output span
auto tile_output = staging.mapped_span.subspan(total_upload_size, tile_unswizzled_size);
// Unswizzle the tile
auto result = UnswizzleSparseTextureTile(tile_output, tile_swizzled_data,
image.info, tile_width, tile_height, tile_depth);
// Create the copy descriptor
BufferImageCopy copy{
.buffer_offset = total_upload_size,
.buffer_size = tile_unswizzled_size,
.buffer_row_length = result.buffer_row_length,
.buffer_image_height = result.buffer_image_height,
.image_subresource = {
.base_level = 0,
.base_layer = 0,
.num_layers = 1,
},
.image_offset = {
static_cast<s32>(coord.width),
static_cast<s32>(coord.height),
static_cast<s32>(coord.depth)
},
.image_extent = {tile_width, tile_height, tile_depth}
};
all_copies.push_back(copy);
total_upload_size += tile_unswizzled_size;
}
if (!all_copies.empty()) {
image.UploadMemory(staging, all_copies);
}
image.dirty_offsets.clear();
}
template <class P>
template <typename StagingBuffer>
void TextureCache<P>::UploadImageContents(Image& image, StagingBuffer& staging) {
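A side note on the deleted UploadSparseDirtyTiles path above: it assumed a fixed 128x32-block tile footprint and staged each tile through a 64 KiB stack buffer (std::array<u8, 65536>). A quick compile-time check of that arithmetic; the block counts mirror the deleted code, the per-block byte sizes are illustrative assumptions:

#include <cstddef>
#include <cstdint>

// The deleted code used a fixed footprint of 128 x 32 compressed blocks per tile.
constexpr std::uint32_t tile_width_blocks = 128;
constexpr std::uint32_t tile_height_blocks = 32;

constexpr std::size_t TileBytes(std::uint32_t bytes_per_block) {
    return static_cast<std::size_t>(tile_width_blocks) * tile_height_blocks * bytes_per_block;
}

// 16 B/block formats (BC3/BC5/BC7 class) fill a 64 KiB sparse tile exactly...
static_assert(TileBytes(16) == 64 * 1024);
// ...while 8 B/block formats (BC1/BC4 class) cover only half of one.
static_assert(TileBytes(8) == 32 * 1024);

Since the trigger in RefreshContents was already commented out, deleting this helper removes dead code rather than changing the sparse upload behavior.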
@@ -1505,7 +1408,7 @@ void TextureCache<P>::TickAsyncUnswizzle() {
}
// Don't process every frame - allow more data to accumulate
if (++current_unswizzle_frame < 2) return;
//if (++current_unswizzle_frame < 2) return;
PendingUnswizzle& task = unswizzle_queue.front();
Image& image = slot_images[task.image_id];
@@ -1529,8 +1432,8 @@ void TextureCache<P>::TickAsyncUnswizzle() {
}
// ToDo: Make these configurable
static constexpr size_t CHUNK_SIZE = 64_MiB;
static constexpr u32 SLICES_PER_BATCH = 512u;
static constexpr size_t CHUNK_SIZE = 16_MiB;
static constexpr u32 SLICES_PER_BATCH = 64u;
// Read data
if (task.current_offset < task.total_size) {
@@ -1569,7 +1472,7 @@ void TextureCache<P>::TickAsyncUnswizzle() {
unswizzle_queue.pop_front();
}
current_unswizzle_frame = 0;
//current_unswizzle_frame = 0;
}
template <class P>
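On the TickAsyncUnswizzle hunks above: the every-other-frame throttle is commented out and the per-tick workload shrinks (CHUNK_SIZE 64 MiB -> 16 MiB, SLICES_PER_BATCH 512 -> 64). Back-of-the-envelope numbers implied by those constants, as a sketch only (assumes one tick per frame and ignores the GPU-side cost):

#include <cstddef>

constexpr std::size_t MiB = 1024 * 1024;
// Before: a 64 MiB chunk was processed every second frame.
constexpr std::size_t old_avg_bytes_per_frame = 64 * MiB / 2;
// After: a 16 MiB chunk is processed every frame.
constexpr std::size_t new_avg_bytes_per_frame = 16 * MiB;
static_assert(old_avg_bytes_per_frame == 32 * MiB);
static_assert(new_avg_bytes_per_frame == 16 * MiB);

Average guest-memory traffic per frame is halved and the worst-case single-frame burst drops from 64 MiB to 16 MiB, trading raw unswizzle throughput for steadier pacing; the "ToDo: Make these configurable" note still applies.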

src/video_core/texture_cache/util.cpp (41 changed lines)

@@ -55,7 +55,6 @@ using Tegra::Texture::TextureFormat;
using Tegra::Texture::TextureType;
using Tegra::Texture::TICEntry;
using Tegra::Texture::UnswizzleTexture;
using Tegra::Texture::UnswizzleSubrect;
using VideoCore::Surface::BytesPerBlock;
using VideoCore::Surface::DefaultBlockHeight;
using VideoCore::Surface::DefaultBlockWidth;
@@ -923,46 +922,6 @@ boost::container::small_vector<BufferImageCopy, 16> UnswizzleImage(Tegra::Memory
return copies;
}
SparseTileUnswizzleResult UnswizzleSparseTextureTile(std::span<u8> output,
std::span<const u8> input,
const ImageInfo& info,
u32 tile_width,
u32 tile_height,
u32 tile_depth) {
const Extent2D block_size = DefaultBlockSize(info.format);
const u32 bpp = BytesPerBlock(info.format);
const u32 width_blocks = (tile_width + block_size.width - 1) / block_size.width;
const u32 height_blocks = (tile_height + block_size.height - 1) / block_size.height;
// Calculate GOBs per row
const u32 bytes_per_row = width_blocks * bpp;
const u32 gobs_per_row = (bytes_per_row + 63) / 64;
// Calculate block_height for 64KB tiles
// 64KB / (gobs_per_row × 512 bytes) = GOBs tall
constexpr u32 TILE_SIZE = 65536;
const u32 gobs_tall = TILE_SIZE / (gobs_per_row * 512);
// block_height = log2(gobs_tall)
const u32 tile_block_height = std::countr_zero(gobs_tall);
const u32 pitch_linear = width_blocks * bpp;
UnswizzleSubrect(
output, input, bpp,
width_blocks, height_blocks, tile_depth,
0, 0,
width_blocks, height_blocks,
tile_block_height, 0,
pitch_linear
);
return {
.buffer_row_length = Common::AlignUp(tile_width, block_size.width),
.buffer_image_height = Common::AlignUp(tile_height, block_size.height)
};
}
void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8> output,
std::span<BufferImageCopy> copies) {
u32 output_offset = 0;
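For reference, the block-height derivation in the removed UnswizzleSparseTextureTile can be traced numerically. A minimal sketch, assuming a 4x4 block-compressed format with 16-byte blocks and the usual 64-byte-wide, 512-byte GOB; the helper name is mine, while the constants (64, 512, 64 KiB) mirror the deleted code:

#include <bit>
#include <cstdint>

constexpr std::uint32_t GOB_ROW_BYTES = 64;     // one GOB row is 64 bytes wide
constexpr std::uint32_t GOB_SIZE_BYTES = 512;   // a full GOB is 64 B x 8 rows
constexpr std::uint32_t TILE_SIZE = 64 * 1024;  // one sparse tile

constexpr std::uint32_t TileBlockHeightLog2(std::uint32_t width_blocks,
                                            std::uint32_t bytes_per_block) {
    const std::uint32_t bytes_per_row = width_blocks * bytes_per_block;
    const std::uint32_t gobs_per_row = (bytes_per_row + GOB_ROW_BYTES - 1) / GOB_ROW_BYTES;
    const std::uint32_t gobs_tall = TILE_SIZE / (gobs_per_row * GOB_SIZE_BYTES);
    return std::countr_zero(gobs_tall);  // block height is passed to UnswizzleSubrect as log2(GOBs)
}

// 128 blocks wide at 16 B/block: 2048 B per row -> 32 GOBs per row
// -> 65536 / (32 * 512) = 4 GOBs tall -> block height log2 = 2.
static_assert(TileBlockHeightLog2(128, 16) == 2);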

src/video_core/texture_cache/util.h (7 changed lines)

@@ -74,13 +74,6 @@ struct SparseTileUnswizzleResult {
Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const ImageInfo& info,
std::span<const u8> input, std::span<u8> output);
[[nodiscard]] SparseTileUnswizzleResult UnswizzleSparseTextureTile(std::span<u8> output,
std::span<const u8> input,
const ImageInfo& info,
u32 tile_width,
u32 tile_height,
u32 tile_depth);
void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8> output,
std::span<BufferImageCopy> copies);
