Browse Source

Better ASTC GPU decoding

Reverted the swizzle compose shader, as the code just did the same thing and added unneeded complexity
Fixed an issue where the GPU unswizzle code sent more slices than were allocated, causing problems
pull/3246/head
Forrest Keller 4 weeks ago
committed by crueter
parent
commit
db5a37f304
  1. 20
      src/video_core/host_shaders/astc_decoder.comp
  2. 23
      src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp
  3. 7
      src/video_core/renderer_vulkan/vk_compute_pass.cpp
  4. 10
      src/video_core/texture_cache/texture_cache.h

20
src/video_core/host_shaders/astc_decoder.comp

@ -62,13 +62,6 @@ const uint encoding_values[22] = uint[](
(QUINT | (4u << 8u)), (TRIT | (5u << 8u)), (JUST_BITS | (7u << 8u)), (QUINT | (5u << 8u)),
(TRIT | (6u << 8u)), (JUST_BITS | (8u << 8u)));
// Precomputed weight tables
const uint WEIGHT_TABLE_1BIT[2] = uint[](0, 64);
const uint WEIGHT_TABLE_2BIT[4] = uint[](0, 21, 43, 64);
const uint WEIGHT_TABLE_3BIT[8] = uint[](0, 9, 18, 27, 37, 46, 55, 64);
const uint WEIGHT_TABLE_4BIT[16] = uint[](0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64);
const uint WEIGHT_TABLE_5BIT[32] = uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64);
// Input ASTC texture globals
int total_bitsread = 0;
uvec4 local_buff;
@ -739,13 +732,14 @@ uint UnquantizeTexelWeight(EncodingData val) {
const uint bitval = BitValue(val);
if (encoding == JUST_BITS) {
uint z = bitval;
switch (bitlen) {
case 1: return WEIGHT_TABLE_1BIT[bitval];
case 2: return WEIGHT_TABLE_2BIT[bitval];
case 3: return WEIGHT_TABLE_3BIT[bitval];
case 4: return WEIGHT_TABLE_4BIT[bitval];
case 5: return WEIGHT_TABLE_5BIT[bitval];
default: return FastReplicateTo6(bitval, bitlen);
case 1: return z * 64;
case 2: return uint(floor(float(z) * 21.5f));
case 3: return uint(floor(float(z) * 9.25f));
case 4: return uint(floor(float(z) * 4.125f));
case 5: return uint(floor(float(z) * 2.0625f));
default: return FastReplicateTo6(z, bitlen);
}
}

23
src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp

@ -152,23 +152,12 @@ void main() {
uint block_index = block_coord.x +
(block_coord.y * pc.blocks_dim.x) +
(block_coord.z * pc.blocks_dim.x * pc.blocks_dim.y);
uint out_idx = block_index * (bytes_per_block >> 2u);
if (bytes_per_block == 16u) {
// BC6H/BC7
uvec4 out_data[1];
out_data[0] = texel;
out_u32[block_index * 4u] = texel.x;
out_u32[block_index * 4u + 1u] = texel.y;
out_u32[block_index * 4u + 2u] = texel.z;
out_u32[block_index * 4u + 3u] = texel.w;
} else if (bytes_per_block == 8u) {
// BC1/BC4
uint out_idx = block_index * 2u;
out_u32[out_idx] = texel.x;
out_u32[out_idx + 1u] = texel.y;
} else {
uint out_idx = block_index * (bytes_per_block >> 2u);
out_u32[out_idx] = texel.x;
if (bytes_per_block > 4u) out_u32[out_idx + 1u] = texel.y;
out_u32[out_idx] = texel.x;
out_u32[out_idx + 1u] = texel.y;
if (pc.bytes_per_block_log2 == 4u) {
out_u32[out_idx + 2u] = texel.z;
out_u32[out_idx + 3u] = texel.w;
}
}

7
src/video_core/renderer_vulkan/vk_compute_pass.cpp

@ -756,8 +756,6 @@ void BlockLinearUnswizzle3DPass::Unswizzle(
{
using namespace VideoCommon::Accelerated;
// Leaving this here incase instances are found where slices_needed causes device loss
// Tune this for a balance between speed and size, I don't own a deck so can't self tune it
const u32 MAX_BATCH_SLICES = std::min(z_count, image.info.size.depth);
if (!image.has_compute_unswizzle_buffer) {
@ -874,9 +872,10 @@ void BlockLinearUnswizzle3DPass::UnswizzleChunk(
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = is_first_chunk ? VkAccessFlags{} :
static_cast<VkAccessFlags>(VK_ACCESS_SHADER_READ_BIT),
static_cast<VkAccessFlags>(VK_ACCESS_TRANSFER_WRITE_BIT),
.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.oldLayout = is_first_chunk ? VK_IMAGE_LAYOUT_UNDEFINED : VK_IMAGE_LAYOUT_GENERAL,
.oldLayout = is_first_chunk ? VK_IMAGE_LAYOUT_UNDEFINED :
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,

10
src/video_core/texture_cache/texture_cache.h

@ -1510,13 +1510,14 @@ void TextureCache<P>::TickAsyncUnswizzle() {
task.current_offset += copy_amount;
}
const bool is_final_batch = task.current_offset >= task.total_size;
const size_t bytes_ready = task.current_offset - task.last_submitted_offset;
const u32 complete_slices = static_cast<u32>(bytes_ready / task.bytes_per_slice);
const bool is_final_batch = task.current_offset >= task.total_size;
if (complete_slices >= SLICES_PER_BATCH || (is_final_batch && complete_slices > 0)) {
const u32 z_start = static_cast<u32>(task.last_submitted_offset / task.bytes_per_slice);
const u32 z_count = std::min(complete_slices, image.info.size.depth - z_start);
const u32 slices_to_process = std::min(complete_slices, SLICES_PER_BATCH);
const u32 z_count = std::min(slices_to_process, image.info.size.depth - z_start);
if (z_count > 0) {
const auto uploads = FullUploadSwizzles(task.info);
@ -1526,7 +1527,10 @@ void TextureCache<P>::TickAsyncUnswizzle() {
}
// Check if complete
if (is_final_batch && task.last_submitted_offset >= task.total_size) {
const u32 slices_submitted = static_cast<u32>(task.last_submitted_offset / task.bytes_per_slice);
const bool all_slices_submitted = slices_submitted >= image.info.size.depth;
if (is_final_batch && all_slices_submitted) {
runtime.FreeDeferredStagingBuffer(task.staging_buffer);
image.flags &= ~ImageFlagBits::IsDecoding;
unswizzle_queue.pop_front();

Loading…
Cancel
Save