Browse Source

Better ASTC GPU decoding

Reverted the swizzle compose shader, as the code just did the same thing and added unneeded complexity
Fixed an issue where the GPU unswizzle code sent more slices than were allocated, causing problems
pull/3246/head
Forrest Keller 4 weeks ago
committed by crueter
parent
commit
db5a37f304
  1. 20
      src/video_core/host_shaders/astc_decoder.comp
  2. 23
      src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp
  3. 7
      src/video_core/renderer_vulkan/vk_compute_pass.cpp
  4. 10
      src/video_core/texture_cache/texture_cache.h

20
src/video_core/host_shaders/astc_decoder.comp

@ -62,13 +62,6 @@ const uint encoding_values[22] = uint[](
(QUINT | (4u << 8u)), (TRIT | (5u << 8u)), (JUST_BITS | (7u << 8u)), (QUINT | (5u << 8u)),
(TRIT | (6u << 8u)), (JUST_BITS | (8u << 8u)));
// Precomputed weight tables
const uint WEIGHT_TABLE_1BIT[2] = uint[](0, 64);
const uint WEIGHT_TABLE_2BIT[4] = uint[](0, 21, 43, 64);
const uint WEIGHT_TABLE_3BIT[8] = uint[](0, 9, 18, 27, 37, 46, 55, 64);
const uint WEIGHT_TABLE_4BIT[16] = uint[](0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64);
const uint WEIGHT_TABLE_5BIT[32] = uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64);
// Input ASTC texture globals
int total_bitsread = 0;
uvec4 local_buff;
@ -739,13 +732,14 @@ uint UnquantizeTexelWeight(EncodingData val) {
const uint bitval = BitValue(val);
if (encoding == JUST_BITS) {
uint z = bitval;
switch (bitlen) {
case 1: return WEIGHT_TABLE_1BIT[bitval];
case 2: return WEIGHT_TABLE_2BIT[bitval];
case 3: return WEIGHT_TABLE_3BIT[bitval];
case 4: return WEIGHT_TABLE_4BIT[bitval];
case 5: return WEIGHT_TABLE_5BIT[bitval];
default: return FastReplicateTo6(bitval, bitlen);
case 1: return z * 64;
case 2: return uint(floor(float(z) * 21.5f));
case 3: return uint(floor(float(z) * 9.25f));
case 4: return uint(floor(float(z) * 4.125f));
case 5: return uint(floor(float(z) * 2.0625f));
default: return FastReplicateTo6(z, bitlen);
}
}

23
src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp

@ -152,23 +152,12 @@ void main() {
uint block_index = block_coord.x +
(block_coord.y * pc.blocks_dim.x) +
(block_coord.z * pc.blocks_dim.x * pc.blocks_dim.y);
uint out_idx = block_index * (bytes_per_block >> 2u);
if (bytes_per_block == 16u) {
// BC6H/BC7
uvec4 out_data[1];
out_data[0] = texel;
out_u32[block_index * 4u] = texel.x;
out_u32[block_index * 4u + 1u] = texel.y;
out_u32[block_index * 4u + 2u] = texel.z;
out_u32[block_index * 4u + 3u] = texel.w;
} else if (bytes_per_block == 8u) {
// BC1/BC4
uint out_idx = block_index * 2u;
out_u32[out_idx] = texel.x;
out_u32[out_idx + 1u] = texel.y;
} else {
uint out_idx = block_index * (bytes_per_block >> 2u);
out_u32[out_idx] = texel.x;
if (bytes_per_block > 4u) out_u32[out_idx + 1u] = texel.y;
out_u32[out_idx] = texel.x;
out_u32[out_idx + 1u] = texel.y;
if (pc.bytes_per_block_log2 == 4u) {
out_u32[out_idx + 2u] = texel.z;
out_u32[out_idx + 3u] = texel.w;
}
}

7
src/video_core/renderer_vulkan/vk_compute_pass.cpp

@ -756,8 +756,6 @@ void BlockLinearUnswizzle3DPass::Unswizzle(
{
using namespace VideoCommon::Accelerated;
// Leaving this here incase instances are found where slices_needed causes device loss
// Tune this for a balance between speed and size, I don't own a deck so can't self tune it
const u32 MAX_BATCH_SLICES = std::min(z_count, image.info.size.depth);
if (!image.has_compute_unswizzle_buffer) {
@ -874,9 +872,10 @@ void BlockLinearUnswizzle3DPass::UnswizzleChunk(
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = is_first_chunk ? VkAccessFlags{} :
static_cast<VkAccessFlags>(VK_ACCESS_SHADER_READ_BIT),
static_cast<VkAccessFlags>(VK_ACCESS_TRANSFER_WRITE_BIT),
.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.oldLayout = is_first_chunk ? VK_IMAGE_LAYOUT_UNDEFINED : VK_IMAGE_LAYOUT_GENERAL,
.oldLayout = is_first_chunk ? VK_IMAGE_LAYOUT_UNDEFINED :
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,

10
src/video_core/texture_cache/texture_cache.h

@ -1510,13 +1510,14 @@ void TextureCache<P>::TickAsyncUnswizzle() {
task.current_offset += copy_amount;
}
const bool is_final_batch = task.current_offset >= task.total_size;
const size_t bytes_ready = task.current_offset - task.last_submitted_offset;
const u32 complete_slices = static_cast<u32>(bytes_ready / task.bytes_per_slice);
const bool is_final_batch = task.current_offset >= task.total_size;
if (complete_slices >= SLICES_PER_BATCH || (is_final_batch && complete_slices > 0)) {
const u32 z_start = static_cast<u32>(task.last_submitted_offset / task.bytes_per_slice);
const u32 z_count = std::min(complete_slices, image.info.size.depth - z_start);
const u32 slices_to_process = std::min(complete_slices, SLICES_PER_BATCH);
const u32 z_count = std::min(slices_to_process, image.info.size.depth - z_start);
if (z_count > 0) {
const auto uploads = FullUploadSwizzles(task.info);
@ -1526,7 +1527,10 @@ void TextureCache<P>::TickAsyncUnswizzle() {
}
// Check if complete
if (is_final_batch && task.last_submitted_offset >= task.total_size) {
const u32 slices_submitted = static_cast<u32>(task.last_submitted_offset / task.bytes_per_slice);
const bool all_slices_submitted = slices_submitted >= image.info.size.depth;
if (is_final_batch && all_slices_submitted) {
runtime.FreeDeferredStagingBuffer(task.staging_buffer);
image.flags &= ~ImageFlagBits::IsDecoding;
unswizzle_queue.pop_front();

Loading…
Cancel
Save