Attempt to make GPU ASTC decoding faster

Added device-aware memory saving for MP4
4 weeks ago · 44c12fa216
4 changed files with 68 additions and 24 deletions
--- a/src/video_core/host_shaders/astc_decoder.comp
+++ b/src/video_core/host_shaders/astc_decoder.comp
@ -61,6 +61,13 @@ const uint encoding_values[22] = uint[](
    (JUST_BITS | (5u << 8u)), (QUINT | (3u << 8u)), (TRIT | (4u << 8u)), (JUST_BITS | (6u << 8u)),
    (QUINT | (4u << 8u)), (TRIT | (5u << 8u)), (JUST_BITS | (7u << 8u)), (QUINT | (5u << 8u)),
    (TRIT | (6u << 8u)), (JUST_BITS | (8u << 8u)));
+    
+// Precomputed unquantized weights for JUST_BITS encodings, indexed by the raw bit value: the value bit-replicated to 6 bits, plus 1 when that exceeds 32 (so results span 0..64)
+const uint WEIGHT_TABLE_1BIT[2] = uint[](0, 64);
+const uint WEIGHT_TABLE_2BIT[4] = uint[](0, 21, 43, 64);
+const uint WEIGHT_TABLE_3BIT[8] = uint[](0, 9, 18, 27, 37, 46, 55, 64);
+const uint WEIGHT_TABLE_4BIT[16] = uint[](0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64);
+const uint WEIGHT_TABLE_5BIT[32] = uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64);

 // Input ASTC texture globals
 int total_bitsread = 0;
@ -730,18 +737,30 @@ uint UnquantizeTexelWeight(EncodingData val) {
    const uint encoding = Encoding(val);
    const uint bitlen = NumBits(val);
    const uint bitval = BitValue(val);
+
+    if (encoding == JUST_BITS) { // plain-bit weights are resolved here, before any trit/quint setup below
+        switch (bitlen) { // one precomputed table per bit length; bitval is the table index
+            case 1: return WEIGHT_TABLE_1BIT[bitval];
+            case 2: return WEIGHT_TABLE_2BIT[bitval];
+            case 3: return WEIGHT_TABLE_3BIT[bitval];
+            case 4: return WEIGHT_TABLE_4BIT[bitval];
+            case 5: return WEIGHT_TABLE_5BIT[bitval];
+            default: return FastReplicateTo6(bitval, bitlen); // NOTE(review): bit lengths without a table return the helper's result as-is, skipping the tables' "+1 above 32" step - confirm such lengths cannot occur for weights
+        }
+    }
+    
    const uint A = ReplicateBitTo7((bitval & 1));
    uint B = 0, C = 0, D = 0;
    uint result = 0;
-    const uint bitlen_0_results[5] = {0, 16, 32, 48, 64};
+    
    switch (encoding) {
-    case JUST_BITS:
-        return FastReplicateTo6(bitval, bitlen);
    case TRIT: {
        D = QuintTritValue(val);
        switch (bitlen) {
-        case 0:
-            return bitlen_0_results[D * 2];
+        case 0: {
+            const uint trit_base[3] = uint[](0, 32, 64);
+            return trit_base[D];
+        }
        case 1: {
            C = 50;
            break;
@ -758,16 +777,16 @@ uint UnquantizeTexelWeight(EncodingData val) {
            B = (cb << 5) | cb;
            break;
        }
-        default:
-            break;
        }
        break;
    }
    case QUINT: {
        D = QuintTritValue(val);
        switch (bitlen) {
-        case 0:
-            return bitlen_0_results[D];
+        case 0: {
+            const uint quint_base[5] = uint[](0, 16, 32, 48, 64);
+            return quint_base[D];
+        }
        case 1: {
            C = 28;
            break;
@ -782,14 +801,17 @@ uint UnquantizeTexelWeight(EncodingData val) {
        break;
    }
    }
-    if (encoding != JUST_BITS && bitlen > 0) {
+    
+    if (bitlen > 0) {
        result = D * C + B;
        result ^= A;
        result = (A & 0x20) | (result >> 2);
    }
+    
    if (result > 32) {
        result += 1;
    }
+    
    return result;
 }

@ -1159,10 +1181,11 @@ void DecompressBlock(ivec3 coord) {
 }

 uint SwizzleOffset(uvec2 pos) {
-    const uint x = pos.x;
-    const uint y = pos.y;
-    return ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 +
-            ((x % 32) / 16) * 32 + (y % 2) * 16 + (x % 16);
+    return ((pos.x & 32u) << 3u) | // x bit 5 -> offset bit 8 (256); the masked fields never overlap, so OR equals the former sum
+           ((pos.y & 6u)  << 5u) | // y bits 1..2 -> offset bits 6..7 (multiples of 64)
+           ((pos.x & 16u) << 1u) | // x bit 4 -> offset bit 5 (32)
+           ((pos.y & 1u)  << 4u) | // y bit 0 -> offset bit 4 (16)
+           (pos.x & 15u); // x bits 0..3 pass through as offset bits 0..3
 }

 void main() {
--- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
@ -623,7 +623,7 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
            .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
            .pNext = nullptr,
            .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
-            .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
+            .dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
            .oldLayout = VK_IMAGE_LAYOUT_GENERAL,
            .newLayout = VK_IMAGE_LAYOUT_GENERAL,
            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
@ -638,9 +638,8 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
            },
        };
        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
-                               VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, image_barrier);
+                               VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, image_barrier); // NOTE(review): destination scope is narrowed from ALL_COMMANDS to fragment-shader reads only, and the scheduler.Finish() that followed is dropped - confirm no compute/transfer work consumes the decoded image first
    });
-    scheduler.Finish();
 }

 constexpr u32 BL3D_BINDING_SWIZZLE_TABLE = 0;
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@ -70,10 +70,29 @@ TextureCache<P>::TextureCache(Runtime& runtime_, Tegra::MaxwellDeviceMemoryManag
            (std::max)((std::min)(device_local_memory - min_vacancy_critical, min_spacing_critical),
                     DEFAULT_CRITICAL_MEMORY));
        minimum_memory = static_cast<u64>((device_local_memory - mem_threshold) / 2);
+        
+        const u64 device_memory = static_cast<u64>(device_local_memory); // u64 copy used for the tier comparisons below
+        if (device_memory <= 4_GiB) { // tier 1: at most 4 GiB -> 16 MiB chunks, 16 slices per batch
+            chunk_size = 16_MiB;
+            slices_per_batch = 16;
+        } else if (device_memory <= 8_GiB) { // tier 2: above 4 GiB, at most 8 GiB -> 32 MiB chunks, 32 slices per batch
+            chunk_size = 32_MiB;
+            slices_per_batch = 32;
+        } else { // tier 3: above 8 GiB -> 64 MiB chunks, 64 slices per batch
+            chunk_size = 64_MiB;
+            slices_per_batch = 64;
+        }
+        
+        lowmemorydevice = True(device_memory <= 4_GiB); // NOTE(review): True(...) receives a bool expression here - confirm an overload accepting bool is in scope; the comparison result could be assigned directly
    } else {
        expected_memory = DEFAULT_EXPECTED_MEMORY + 512_MiB;
        critical_memory = DEFAULT_CRITICAL_MEMORY + 1_GiB;
        minimum_memory = 0;
+        
+        chunk_size = 32_MiB;
+        slices_per_batch = 32;
+        
+        lowmemorydevice = true;
    }
 }

@ -1131,6 +1150,10 @@ void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
    }
    
    image.flags &= ~ImageFlagBits::CpuModified;
+    if( lowmemorydevice && image.info.format == PixelFormat::BC1_RGBA_UNORM && MapSizeBytes(image) >= 256_MiB ) { // low-memory devices only: BC1_RGBA_UNORM images whose MapSizeBytes is at least 256 MiB
+        return; // NOTE(review): exits after CpuModified was cleared but before TrackImage(image, image_id) and the rest of the function run - confirm leaving the image untracked is intended
+    }
+    
    TrackImage(image, image_id);
    
    if (image.info.num_samples > 1 && !runtime.CanUploadMSAA()) {
@ -1465,17 +1488,13 @@ void TextureCache<P>::TickAsyncUnswizzle() {
        task.initialized = true;
    }
    
-    // ToDo: Make these configurable
-    static constexpr size_t CHUNK_SIZE = 48_MiB;
-    static constexpr u32 SLICES_PER_BATCH = 48u;
-
    // Read data
    if (task.current_offset < task.total_size) {
        const size_t remaining = task.total_size - task.current_offset;
        
-        size_t copy_amount = std::min(CHUNK_SIZE, remaining);
+        size_t copy_amount = std::min(chunk_size, remaining);
        
-        if (remaining > CHUNK_SIZE) {
+        if (remaining > chunk_size) {
            copy_amount = (copy_amount / task.bytes_per_slice) * task.bytes_per_slice;
            if (copy_amount == 0) copy_amount = task.bytes_per_slice;
        }
@ -1490,7 +1509,7 @@ void TextureCache<P>::TickAsyncUnswizzle() {
    const u32 complete_slices = static_cast<u32>(bytes_ready / task.bytes_per_slice);
    const bool is_final_batch = task.current_offset >= task.total_size;
    
-    if (complete_slices >= SLICES_PER_BATCH || (is_final_batch && complete_slices > 0)) {
+    if (complete_slices >= slices_per_batch || (is_final_batch && complete_slices > 0)) {
        const u32 z_start = static_cast<u32>(task.last_submitted_offset / task.bytes_per_slice);
        const u32 z_count = std::min(complete_slices, image.info.size.depth - z_start);
        
--- a/src/video_core/texture_cache/texture_cache_base.h
+++ b/src/video_core/texture_cache/texture_cache_base.h
@ -475,6 +475,9 @@ private:
    u64 minimum_memory;
    u64 expected_memory;
    u64 critical_memory;
+    size_t chunk_size;
+    size_t slices_per_batch;
+    bool lowmemorydevice = false;

    struct BufferDownload {
        GPUVAddr address;