From db5a37f3044e652ea623531030a7b4f06934f966 Mon Sep 17 00:00:00 2001
From: Forrest Keller <forrestmarkx@outlook.com>
Date: Fri, 9 Jan 2026 21:03:58 -0600
Subject: [PATCH] Better ASTC GPU decoding

Reverted the swizzle compose shader, as the code just did the same thing and added unneeded complexity.
Fixed an issue where the GPU unswizzle code sent more slices than were allocated, causing issues.
---
 src/video_core/host_shaders/astc_decoder.comp | 20 ++++++----------
 .../block_linear_unswizzle_3d_bcn.comp        | 23 +++++--------------
 .../renderer_vulkan/vk_compute_pass.cpp       |  7 +++---
 src/video_core/texture_cache/texture_cache.h  | 12 ++++++----
 4 files changed, 24 insertions(+), 38 deletions(-)

diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp
index 761b62582d..ef3f407c33 100644
--- a/src/video_core/host_shaders/astc_decoder.comp
+++ b/src/video_core/host_shaders/astc_decoder.comp
@@ -62,13 +62,6 @@ const uint encoding_values[22] = uint[](
     (QUINT | (4u << 8u)), (TRIT | (5u << 8u)), (JUST_BITS | (7u << 8u)), (QUINT | (5u << 8u)),
     (TRIT | (6u << 8u)), (JUST_BITS | (8u << 8u)));
     
-// Precomputed weight tables
-const uint WEIGHT_TABLE_1BIT[2] = uint[](0, 64);
-const uint WEIGHT_TABLE_2BIT[4] = uint[](0, 21, 43, 64);
-const uint WEIGHT_TABLE_3BIT[8] = uint[](0, 9, 18, 27, 37, 46, 55, 64);
-const uint WEIGHT_TABLE_4BIT[16] = uint[](0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64);
-const uint WEIGHT_TABLE_5BIT[32] = uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64);
-
 // Input ASTC texture globals
 int total_bitsread = 0;
 uvec4 local_buff;
@@ -739,13 +732,14 @@ uint UnquantizeTexelWeight(EncodingData val) {
     const uint bitval = BitValue(val);
 
     if (encoding == JUST_BITS) {
+        uint z = bitval;
         switch (bitlen) {
-            case 1: return WEIGHT_TABLE_1BIT[bitval];
-            case 2: return WEIGHT_TABLE_2BIT[bitval];
-            case 3: return WEIGHT_TABLE_3BIT[bitval];
-            case 4: return WEIGHT_TABLE_4BIT[bitval];
-            case 5: return WEIGHT_TABLE_5BIT[bitval];
-            default: return FastReplicateTo6(bitval, bitlen);
+            case 1: return z * 64;
+            case 2: return uint(floor(float(z) * 21.5f));
+            case 3: return uint(floor(float(z) * 9.25f));
+            case 4: return uint(floor(float(z) * 4.125f));
+            case 5: return uint(floor(float(z) * 2.0625f));
+            default: return FastReplicateTo6(z, bitlen);
         }
     }
     
diff --git a/src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp b/src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp
index 455e99e019..a25eb52327 100644
--- a/src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp
+++ b/src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp
@@ -152,23 +152,12 @@ void main() {
     uint block_index = block_coord.x +
                        (block_coord.y * pc.blocks_dim.x) +
                        (block_coord.z * pc.blocks_dim.x * pc.blocks_dim.y);
+    uint out_idx = block_index * (bytes_per_block >> 2u);
 
-    if (bytes_per_block == 16u) {
-        // BC6H/BC7
-        uvec4 out_data[1];
-        out_data[0] = texel;
-        out_u32[block_index * 4u] = texel.x;
-        out_u32[block_index * 4u + 1u] = texel.y;
-        out_u32[block_index * 4u + 2u] = texel.z;
-        out_u32[block_index * 4u + 3u] = texel.w;
-    } else if (bytes_per_block == 8u) {
-        // BC1/BC4
-        uint out_idx = block_index * 2u;
-        out_u32[out_idx] = texel.x;
-        out_u32[out_idx + 1u] = texel.y;
-    } else {
-        uint out_idx = block_index * (bytes_per_block >> 2u);
-        out_u32[out_idx] = texel.x;
-        if (bytes_per_block > 4u) out_u32[out_idx + 1u] = texel.y;
+    out_u32[out_idx]     = texel.x;
+    out_u32[out_idx + 1u] = texel.y;
+    if (pc.bytes_per_block_log2 == 4u) {
+        out_u32[out_idx + 2u] = texel.z;
+        out_u32[out_idx + 3u] = texel.w;
     }
 }
\ No newline at end of file
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
index bc5ab23d06..1874d4002c 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
@@ -756,8 +756,6 @@ void BlockLinearUnswizzle3DPass::Unswizzle(
 {
     using namespace VideoCommon::Accelerated;
     
-    // Leaving this here incase instances are found where slices_needed causes device loss
-    // Tune this for a balance between speed and size, I don't own a deck so can't self tune it
     const u32 MAX_BATCH_SLICES = std::min(z_count, image.info.size.depth);
     
     if (!image.has_compute_unswizzle_buffer) {
@@ -874,9 +872,10 @@ void BlockLinearUnswizzle3DPass::UnswizzleChunk(
             .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
             .pNext = nullptr,
             .srcAccessMask = is_first_chunk ? VkAccessFlags{} : 
-                            static_cast<VkAccessFlags>(VK_ACCESS_SHADER_READ_BIT),
+                            static_cast<VkAccessFlags>(VK_ACCESS_TRANSFER_WRITE_BIT),
             .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
-            .oldLayout = is_first_chunk ? VK_IMAGE_LAYOUT_UNDEFINED : VK_IMAGE_LAYOUT_GENERAL,
+            .oldLayout = is_first_chunk ? VK_IMAGE_LAYOUT_UNDEFINED : 
+                        VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
             .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
             .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
             .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 74a42e5c87..74edeec3e2 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -1510,14 +1510,15 @@ void TextureCache<P>::TickAsyncUnswizzle() {
         task.current_offset += copy_amount;
     }
     
+    const bool is_final_batch = task.current_offset >= task.total_size;
     const size_t bytes_ready = task.current_offset - task.last_submitted_offset;
     const u32 complete_slices = static_cast<u32>(bytes_ready / task.bytes_per_slice);
-    const bool is_final_batch = task.current_offset >= task.total_size;
     
     if (complete_slices >= SLICES_PER_BATCH || (is_final_batch && complete_slices > 0)) {
         const u32 z_start = static_cast<u32>(task.last_submitted_offset / task.bytes_per_slice);
-        const u32 z_count = std::min(complete_slices, image.info.size.depth - z_start);
-        
+        const u32 slices_to_process = std::min(complete_slices, SLICES_PER_BATCH);
+        const u32 z_count = std::min(slices_to_process, image.info.size.depth - z_start);
+
         if (z_count > 0) {
             const auto uploads = FullUploadSwizzles(task.info);
             runtime.AccelerateImageUpload(image, task.staging_buffer, FixSmallVectorADL(uploads), z_start, z_count);
@@ -1526,7 +1527,10 @@ void TextureCache<P>::TickAsyncUnswizzle() {
     }
     
     // Check if complete
-    if (is_final_batch && task.last_submitted_offset >= task.total_size) {
+    const u32 slices_submitted = static_cast<u32>(task.last_submitted_offset / task.bytes_per_slice);
+    const bool all_slices_submitted = slices_submitted >= image.info.size.depth;
+    
+    if (is_final_batch && all_slices_submitted) {
         runtime.FreeDeferredStagingBuffer(task.staging_buffer);
         image.flags &= ~ImageFlagBits::IsDecoding;
         unswizzle_queue.pop_front();