Browse Source

Process sparse textures better by finding pages that have data

pull/3737/head
Forrest Mark X 5 days ago
committed by crueter
parent
commit
c81921d37e
  1. 2
      src/video_core/renderer_opengl/gl_texture_cache.cpp
  2. 4
      src/video_core/renderer_opengl/gl_texture_cache.h
  3. 135
      src/video_core/renderer_vulkan/vk_compute_pass.cpp
  4. 9
      src/video_core/renderer_vulkan/vk_compute_pass.h
  5. 17
      src/video_core/renderer_vulkan/vk_texture_cache.cpp
  6. 4
      src/video_core/renderer_vulkan/vk_texture_cache.h
  7. 89
      src/video_core/texture_cache/texture_cache.h
  8. 3
      src/video_core/texture_cache/texture_cache_base.h

2
src/video_core/renderer_opengl/gl_texture_cache.cpp

@ -652,7 +652,7 @@ void TextureCacheRuntime::BlitFramebuffer(Framebuffer* dst, Framebuffer* src,
void TextureCacheRuntime::AccelerateImageUpload(Image& image, const StagingBufferMap& map, void TextureCacheRuntime::AccelerateImageUpload(Image& image, const StagingBufferMap& map,
std::span<const SwizzleParameters> swizzles, std::span<const SwizzleParameters> swizzles,
u32 z_start, u32 z_count) {
u32 z_start, u32 z_count, std::span<const u8> slice_has_data) {
switch (image.info.type) { switch (image.info.type) {
case ImageType::e2D: case ImageType::e2D:
if (IsPixelFormatASTC(image.info.format)) { if (IsPixelFormatASTC(image.info.format)) {

4
src/video_core/renderer_opengl/gl_texture_cache.h

@ -118,9 +118,9 @@ public:
const Region2D& src_region, Tegra::Engines::Fermi2D::Filter filter, const Region2D& src_region, Tegra::Engines::Fermi2D::Filter filter,
Tegra::Engines::Fermi2D::Operation operation); Tegra::Engines::Fermi2D::Operation operation);
void AccelerateImageUpload(Image& image, const StagingBufferMap& map,
void AccelerateImageUpload(Image &image, const StagingBufferMap &map,
std::span<const VideoCommon::SwizzleParameters> swizzles, std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count);
u32 z_start, u32 z_count, std::span<const u8> slice_has_data={});
void InsertUploadMemoryBarrier(); void InsertUploadMemoryBarrier();

135
src/video_core/renderer_vulkan/vk_compute_pass.cpp

@ -763,31 +763,59 @@ void BlockLinearUnswizzle3DPass::Unswizzle(
Image& image, Image& image,
const StagingBufferRef& swizzled, const StagingBufferRef& swizzled,
std::span<const VideoCommon::SwizzleParameters> swizzles, std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count)
u32 z_start, u32 z_count,
std::span<const u8> slice_has_data)
{ {
using namespace VideoCommon::Accelerated; using namespace VideoCommon::Accelerated;
const u32 MAX_BATCH_SLICES = (std::min)(z_count, image.info.size.depth); const u32 MAX_BATCH_SLICES = (std::min)(z_count, image.info.size.depth);
if (!image.has_compute_unswizzle_buffer) {
if (image.has_compute_unswizzle_buffer) {
// Allocate exactly what this batch needs // Allocate exactly what this batch needs
image.AllocateComputeUnswizzleBuffer(MAX_BATCH_SLICES);
using VideoCore::Surface::BytesPerBlock;
const u32 bx = (image.info.size.width + 3) / 4;
const u32 by = (image.info.size.height + 3) / 4;
const VkDeviceSize needed =
static_cast<VkDeviceSize>(bx) * by * MAX_BATCH_SLICES *
BytesPerBlock(image.info.format);
if (image.compute_unswizzle_buffer_size < needed) {
scheduler.Finish();
}
} }
image.AllocateComputeUnswizzleBuffer(MAX_BATCH_SLICES);
ASSERT(swizzles.size() == 1); ASSERT(swizzles.size() == 1);
const auto& sw = swizzles[0]; const auto& sw = swizzles[0];
const auto params = MakeBlockLinearSwizzle3DParams(sw, image.info); const auto params = MakeBlockLinearSwizzle3DParams(sw, image.info);
const u32 blocks_x = (image.info.size.width + 3) / 4; const u32 blocks_x = (image.info.size.width + 3) / 4;
const u32 blocks_y = (image.info.size.height + 3) / 4; const u32 blocks_y = (image.info.size.height + 3) / 4;
const u32 bytes_per_block = 1u << params.bytes_per_block_log2;
scheduler.RequestOutsideRenderPassOperationContext(); scheduler.RequestOutsideRenderPassOperationContext();
for (u32 z_offset = 0; z_offset < z_count; z_offset += MAX_BATCH_SLICES) { for (u32 z_offset = 0; z_offset < z_count; z_offset += MAX_BATCH_SLICES) {
const u32 current_chunk_slices = (std::min)(MAX_BATCH_SLICES, z_count - z_offset); const u32 current_chunk_slices = (std::min)(MAX_BATCH_SLICES, z_count - z_offset);
const u32 current_z_start = z_start + z_offset; const u32 current_z_start = z_start + z_offset;
UnswizzleChunk(image, swizzled, sw, params, blocks_x, blocks_y,
current_z_start, current_chunk_slices);
bool chunk_has_data = slice_has_data.empty();
if (!chunk_has_data) {
const u32 z_end = current_z_start + current_chunk_slices;
for (u32 z = current_z_start; z < z_end; ++z) {
if (z < static_cast<u32>(slice_has_data.size()) && slice_has_data[z] != 0) {
chunk_has_data = true;
break;
}
}
}
if (chunk_has_data) {
UnswizzleChunk(image, swizzled, sw, params, blocks_x, blocks_y,
current_z_start, current_chunk_slices);
} else {
UnswizzleZeroChunk(image, blocks_x, blocks_y, bytes_per_block,
current_z_start, current_chunk_slices);
}
} }
} }
@ -937,6 +965,103 @@ void BlockLinearUnswizzle3DPass::UnswizzleChunk(
}); });
} }
// Zero-fills the z-slices [z_start, z_start + z_count) of `image` without dispatching the
// unswizzle shader.
//
// Records commands that (1) fill the first blocks_x * blocks_y * bytes_per_block * z_count bytes
// of the image's compute unswizzle buffer with zeros, (2) copy that range into mip level 0,
// array layer 0 of the image at depth offset z_start, and (3) transition the image to GENERAL.
//
// There may be a more direct way to do this (something like an image clear helper), but none was
// found when this was written, hence the fill-then-copy approach.
//
// Parameters:
//   image            - destination image; must already own a compute unswizzle buffer (asserted).
//   blocks_x/y       - slice dimensions in blocks, used only to size the zero fill.
//   bytes_per_block  - bytes per block, used only to size the zero fill.
//   z_start, z_count - first depth slice and number of depth slices to zero.
//
// NOTE(review): the fill always starts at buffer offset 0 and nothing here checks that the buffer
// holds at least `fill_size` bytes — confirm the caller sizes the buffer for z_count slices.
void BlockLinearUnswizzle3DPass::UnswizzleZeroChunk(
Image& image,
u32 blocks_x, u32 blocks_y,
u32 bytes_per_block,
u32 z_start, u32 z_count)
{
ASSERT(image.has_compute_unswizzle_buffer);
// Snapshot plain handles and sizes so the recorded lambda captures values only, not `image`.
const VkBuffer out_buffer = *image.compute_unswizzle_buffer;
const VkImage dst_image = image.Handle();
const VkImageAspectFlags aspect = image.AspectMask();
const u32 image_width = image.info.size.width;
const u32 image_height = image.info.size.height;
// A chunk starting at slice 0 is treated as the first write to the image; it selects the
// UNDEFINED old layout in the pre-copy barrier below.
const bool is_first_chunk = (z_start == 0);
// Size of one unswizzled z-slice in the output buffer (bytes).
const VkDeviceSize output_slice_bytes =
static_cast<VkDeviceSize>(blocks_x) * blocks_y * bytes_per_block;
const VkDeviceSize fill_size = output_slice_bytes * z_count;
scheduler.Record([out_buffer, dst_image, aspect, z_start, z_count,
fill_size, is_first_chunk, image_width, image_height
](vk::CommandBuffer cmdbuf) {
// Record nothing if either handle is null.
if (dst_image == VK_NULL_HANDLE || out_buffer == VK_NULL_HANDLE) {
return;
}
// Zero the output buffer region that CopyBufferToImage will read.
cmdbuf.FillBuffer(out_buffer, 0, fill_size, 0u);
// Make the fill (transfer write) visible to the copy's transfer read of the same range.
const VkBufferMemoryBarrier buffer_barrier{
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.buffer = out_buffer,
.offset = 0,
.size = fill_size,
};
// Put the image (mip 0, layer 0) into TRANSFER_DST_OPTIMAL for the copy. The first chunk
// transitions from UNDEFINED; later chunks declare TRANSFER_DST_OPTIMAL as the old layout.
// NOTE(review): post_barrier below always leaves the image in GENERAL, so a non-first chunk
// that follows a call to this function would see GENERAL, not TRANSFER_DST_OPTIMAL. Unless
// something else restores TRANSFER_DST_OPTIMAL between chunks, oldLayout here disagrees with
// the actual layout — verify against UnswizzleChunk.
const VkImageMemoryBarrier pre_barrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = is_first_chunk ? VkAccessFlags{}
: static_cast<VkAccessFlags>(VK_ACCESS_TRANSFER_WRITE_BIT),
.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.oldLayout = is_first_chunk ? VK_IMAGE_LAYOUT_UNDEFINED
: VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.image = dst_image,
.subresourceRange = {aspect, 0, 1, 0, 1},
};
// Transfer -> transfer dependency carrying both the buffer barrier and the image barrier.
cmdbuf.PipelineBarrier(
VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_PIPELINE_STAGE_TRANSFER_BIT,
0, nullptr, buffer_barrier, pre_barrier
);
// Copy the zeroed buffer region into the correct Z position of the image.
// Row length / image height of 0 mean the buffer data is tightly packed to imageExtent.
const VkBufferImageCopy copy{
.bufferOffset = 0,
.bufferRowLength = 0,
.bufferImageHeight = 0,
.imageSubresource = {aspect, 0, 0, 1},
.imageOffset = {0, 0, static_cast<s32>(z_start)},
.imageExtent = {image_width, image_height, z_count},
};
cmdbuf.CopyBufferToImage(out_buffer, dst_image,
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy);
// Transition image to GENERAL for subsequent shader reads/writes.
const VkImageMemoryBarrier post_barrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.newLayout = VK_IMAGE_LAYOUT_GENERAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.image = dst_image,
.subresourceRange = {aspect, 0, 1, 0, 1},
};
// Transfer -> fragment/compute dependency; only the image barrier is passed here.
cmdbuf.PipelineBarrier(
VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
0, nullptr, nullptr, post_barrier
);
});
}
MSAACopyPass::MSAACopyPass(const Device& device_, Scheduler& scheduler_, MSAACopyPass::MSAACopyPass(const Device& device_, Scheduler& scheduler_,
DescriptorPool& descriptor_pool_, DescriptorPool& descriptor_pool_,
StagingBufferPool& staging_buffer_pool_, StagingBufferPool& staging_buffer_pool_,

9
src/video_core/renderer_vulkan/vk_compute_pass.h

@ -148,7 +148,8 @@ public:
void Unswizzle(Image& image, void Unswizzle(Image& image,
const StagingBufferRef& swizzled, const StagingBufferRef& swizzled,
std::span<const VideoCommon::SwizzleParameters> swizzles, std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count);
u32 z_start, u32 z_count,
std::span<const u8> slice_has_data = {});
void UnswizzleChunk( void UnswizzleChunk(
Image& image, Image& image,
@ -158,6 +159,12 @@ public:
u32 blocks_x, u32 blocks_y, u32 blocks_x, u32 blocks_y,
u32 z_start, u32 z_count); u32 z_start, u32 z_count);
void UnswizzleZeroChunk(
Image& image,
u32 blocks_x, u32 blocks_y,
u32 bytes_per_block,
u32 z_start, u32 z_count);
private: private:
Scheduler& scheduler; Scheduler& scheduler;
StagingBufferPool& staging_buffer_pool; StagingBufferPool& staging_buffer_pool;

17
src/video_core/renderer_vulkan/vk_texture_cache.cpp

@ -1635,9 +1635,6 @@ Image::Image(const VideoCommon::NullImageParams& params) : VideoCommon::ImageBas
Image::~Image() = default; Image::~Image() = default;
void Image::AllocateComputeUnswizzleBuffer(u32 max_slices) { void Image::AllocateComputeUnswizzleBuffer(u32 max_slices) {
if (has_compute_unswizzle_buffer)
return;
using VideoCore::Surface::BytesPerBlock; using VideoCore::Surface::BytesPerBlock;
const u32 block_bytes = BytesPerBlock(info.format); // 8 for BC1, 16 for BC6H const u32 block_bytes = BytesPerBlock(info.format); // 8 for BC1, 16 for BC6H
@ -1654,7 +1651,13 @@ void Image::AllocateComputeUnswizzleBuffer(u32 max_slices) {
static_cast<u64>(blocks_y) * static_cast<u64>(blocks_y) *
static_cast<u64>(blocks_z); static_cast<u64>(blocks_z);
compute_unswizzle_buffer_size = block_count * block_bytes;
const VkDeviceSize needed_size = block_count * block_bytes;
if (has_compute_unswizzle_buffer && compute_unswizzle_buffer_size >= needed_size) {
return;
}
compute_unswizzle_buffer_size = needed_size;
VkBufferCreateInfo ci{ VkBufferCreateInfo ci{
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
@ -2475,8 +2478,8 @@ void Framebuffer::CreateFramebuffer(TextureCacheRuntime& runtime,
void TextureCacheRuntime::AccelerateImageUpload( void TextureCacheRuntime::AccelerateImageUpload(
Image& image, const StagingBufferRef& map, Image& image, const StagingBufferRef& map,
std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count) {
std::span<const VideoCommon::SwizzleParameters> swizzles, u32 z_start, u32 z_count,
std::span<const u8> slice_has_data) {
if (IsPixelFormatASTC(image.info.format)) { if (IsPixelFormatASTC(image.info.format)) {
return astc_decoder_pass->Assemble(image, map, swizzles); return astc_decoder_pass->Assemble(image, map, swizzles);
@ -2496,7 +2499,7 @@ void TextureCacheRuntime::AccelerateImageUpload(
image.info.resources.levels == 1 && image.info.resources.levels == 1 &&
image.info.resources.layers == 1) { image.info.resources.layers == 1) {
return bl3d_unswizzle_pass->Unswizzle(image, map, swizzles, z_start, z_count);
return bl3d_unswizzle_pass->Unswizzle(image, map, swizzles, z_start, z_count, slice_has_data);
} }
ASSERT(false); ASSERT(false);

4
src/video_core/renderer_vulkan/vk_texture_cache.h

@ -90,9 +90,9 @@ public:
return msaa_copy_pass.operator bool(); return msaa_copy_pass.operator bool();
} }
void AccelerateImageUpload(Image&, const StagingBufferRef&,
void AccelerateImageUpload(Image &, const StagingBufferRef &,
std::span<const VideoCommon::SwizzleParameters>, std::span<const VideoCommon::SwizzleParameters>,
u32 z_start, u32 z_count);
u32 z_start, u32 z_count, std::span<const u8> slice_has_data={});
void InsertUploadMemoryBarrier() {} void InsertUploadMemoryBarrier() {}

89
src/video_core/texture_cache/texture_cache.h

@ -24,6 +24,7 @@
#include "video_core/texture_cache/samples_helper.h" #include "video_core/texture_cache/samples_helper.h"
#include "video_core/texture_cache/texture_cache_base.h" #include "video_core/texture_cache/texture_cache_base.h"
#include "video_core/texture_cache/util.h" #include "video_core/texture_cache/util.h"
#include "video_core/texture_cache/accelerated_swizzle.h"
#include "video_core/textures/decoders.h" #include "video_core/textures/decoders.h"
namespace VideoCommon { namespace VideoCommon {
@ -1479,6 +1480,40 @@ void TextureCache<P>::TickAsyncUnswizzle() {
const u32 aligned_height = height_blocks; const u32 aligned_height = height_blocks;
task.bytes_per_slice = static_cast<size_t>(stride) * aligned_height; task.bytes_per_slice = static_cast<size_t>(stride) * aligned_height;
task.last_submitted_offset = 0; task.last_submitted_offset = 0;
task.is_sparse = True(image.flags & ImageFlagBits::Sparse);
if (task.is_sparse) {
std::memset(task.staging_buffer.mapped_span.data(), 0, task.total_size);
const auto segs =
gpu_memory->GetSubmappedRange(image.gpu_addr, image.guest_size_bytes);
task.sparse_segments.assign(segs.begin(), segs.end());
task.slice_has_data.assign(image.info.size.depth, 0u);
if (image.info.size.depth > 1 && !image.slice_offsets.empty()) {
const auto uploads = FullUploadSwizzles(task.info);
const auto sp = Accelerated::MakeBlockLinearSwizzle3DParams(
uploads[0], task.info);
const u64 swizzled_slice_size = sp.slice_size;
for (const auto& [seg_gpu_addr, seg_size] : task.sparse_segments) {
const u64 seg_start = seg_gpu_addr - image.gpu_addr;
const u64 seg_end = seg_start + seg_size;
for (u32 z = 0; z < static_cast<u32>(image.info.size.depth); ++z) {
if (task.slice_has_data[z]) continue; // already marked, skip
const u64 slice_start = image.slice_offsets[z];
const u64 slice_end = slice_start + swizzled_slice_size;
if (slice_start < seg_end && slice_end > seg_start) {
task.slice_has_data[z] = 1u;
}
}
}
} else {
std::fill(task.slice_has_data.begin(), task.slice_has_data.end(), 1u);
}
}
task.initialized = true; task.initialized = true;
} }
@ -1494,14 +1529,31 @@ void TextureCache<P>::TickAsyncUnswizzle() {
copy_amount = (std::min)(dynamic_chunk, remaining); copy_amount = (std::min)(dynamic_chunk, remaining);
} }
if (remaining > swizzle_chunk_size) {
if (swizzle_chunk_size > 0 && remaining > swizzle_chunk_size) {
copy_amount = (copy_amount / task.bytes_per_slice) * task.bytes_per_slice; copy_amount = (copy_amount / task.bytes_per_slice) * task.bytes_per_slice;
if (copy_amount == 0) copy_amount = task.bytes_per_slice; if (copy_amount == 0) copy_amount = task.bytes_per_slice;
} }
gpu_memory->ReadBlock(image.gpu_addr + task.current_offset,
task.staging_buffer.mapped_span.data() + task.current_offset,
copy_amount);
if (task.is_sparse) {
const size_t read_start = task.current_offset;
const size_t read_end = task.current_offset + copy_amount;
u8* const staging_base = task.staging_buffer.mapped_span.data();
for (const auto& [seg_gpu_addr, seg_size] : task.sparse_segments) {
const size_t seg_start = static_cast<size_t>(seg_gpu_addr - image.gpu_addr);
const size_t seg_end = seg_start + seg_size;
const size_t ol_start = (std::max)(seg_start, read_start);
const size_t ol_end = (std::min)(seg_end, read_end);
if (ol_start < ol_end) {
gpu_memory->ReadBlock(image.gpu_addr + ol_start,
staging_base + ol_start,
ol_end - ol_start);
}
}
} else {
gpu_memory->ReadBlock(image.gpu_addr + task.current_offset,
task.staging_buffer.mapped_span.data() + task.current_offset,
copy_amount);
}
task.current_offset += copy_amount; task.current_offset += copy_amount;
} }
@ -1513,9 +1565,17 @@ void TextureCache<P>::TickAsyncUnswizzle() {
const size_t bytes_ready = task.current_offset - task.last_submitted_offset; const size_t bytes_ready = task.current_offset - task.last_submitted_offset;
const u32 complete_slices = static_cast<u32>(bytes_ready / task.bytes_per_slice); const u32 complete_slices = static_cast<u32>(bytes_ready / task.bytes_per_slice);
if( swizzle_slices_per_batch <= 0 ) {
runtime.AccelerateImageUpload(image, task.staging_buffer, FixSmallVectorADL(FullUploadSwizzles(task.info)), 0, image.info.size.depth);
task.last_submitted_offset += (static_cast<size_t>(image.info.size.depth) * task.bytes_per_slice);
const std::span<const u8> sparse_hint =
task.is_sparse ? std::span<const u8>(task.slice_has_data)
: std::span<const u8>{};
if (swizzle_slices_per_batch <= 0 || swizzle_chunk_size == 0) {
const u32 z_start_full = static_cast<u32>(task.last_submitted_offset / task.bytes_per_slice);
const u32 remaining_slices_full = image.info.size.depth - z_start_full;
if (remaining_slices_full > 0) {
runtime.AccelerateImageUpload(image, task.staging_buffer, FixSmallVectorADL(FullUploadSwizzles(task.info)), z_start_full, remaining_slices_full, sparse_hint);
task.last_submitted_offset += (static_cast<size_t>(remaining_slices_full) * task.bytes_per_slice);
}
} }
else { else {
const u32 adaptive_batch = GetAdaptiveBatchSize(task, unswizzle_queue.size()); const u32 adaptive_batch = GetAdaptiveBatchSize(task, unswizzle_queue.size());
@ -1525,17 +1585,20 @@ void TextureCache<P>::TickAsyncUnswizzle() {
const u32 slices_to_process = (std::min)(complete_slices, adaptive_batch); const u32 slices_to_process = (std::min)(complete_slices, adaptive_batch);
if (whole_texture) { if (whole_texture) {
runtime.AccelerateImageUpload(image, task.staging_buffer,
FixSmallVectorADL(FullUploadSwizzles(task.info)), 0,
image.info.size.depth);
task.last_submitted_offset +=
(static_cast<size_t>(image.info.size.depth) * task.bytes_per_slice);
const u32 remaining_slices = task.info.size.depth - z_start;
if (remaining_slices > 0) {
runtime.AccelerateImageUpload(image, task.staging_buffer,
FixSmallVectorADL(FullUploadSwizzles(task.info)), z_start,
remaining_slices, sparse_hint);
task.last_submitted_offset +=
(static_cast<size_t>(remaining_slices) * task.bytes_per_slice);
}
} else if (complete_slices >= slices_to_process || (is_final_batch && complete_slices > 0)) { } else if (complete_slices >= slices_to_process || (is_final_batch && complete_slices > 0)) {
const u32 z_count = (std::min)(slices_to_process, task.info.size.depth - z_start); const u32 z_count = (std::min)(slices_to_process, task.info.size.depth - z_start);
if (z_count > 0) { if (z_count > 0) {
const auto uploads = FullUploadSwizzles(task.info); const auto uploads = FullUploadSwizzles(task.info);
runtime.AccelerateImageUpload(image, task.staging_buffer, FixSmallVectorADL(uploads), runtime.AccelerateImageUpload(image, task.staging_buffer, FixSmallVectorADL(uploads),
z_start, z_count);
z_start, z_count, sparse_hint);
task.last_submitted_offset += (static_cast<size_t>(z_count) * task.bytes_per_slice); task.last_submitted_offset += (static_cast<size_t>(z_count) * task.bytes_per_slice);
} }
} }

3
src/video_core/texture_cache/texture_cache_base.h

@ -139,6 +139,9 @@ class TextureCache : public VideoCommon::ChannelSetupCaches<TextureCacheChannelI
size_t last_submitted_offset = 0; size_t last_submitted_offset = 0;
size_t bytes_per_slice; size_t bytes_per_slice;
bool initialized = false; bool initialized = false;
bool is_sparse = false;
std::vector<u8> slice_has_data;
std::vector<std::pair<GPUVAddr, size_t>> sparse_segments;
}; };
struct BlitImages { struct BlitImages {

Loading…
Cancel
Save