From 44c12fa216889720472fb9c961d899e8e59ed805 Mon Sep 17 00:00:00 2001 From: Forrest Keller Date: Sun, 4 Jan 2026 19:59:05 -0600 Subject: [PATCH] Attempt to make GPU ASTC decoding faster Added device aware memory saving for MP4 --- src/video_core/host_shaders/astc_decoder.comp | 51 ++++++++++++++----- .../renderer_vulkan/vk_compute_pass.cpp | 5 +- src/video_core/texture_cache/texture_cache.h | 33 +++++++++--- .../texture_cache/texture_cache_base.h | 3 ++ 4 files changed, 68 insertions(+), 24 deletions(-) diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp index 6e4535d459..761b62582d 100644 --- a/src/video_core/host_shaders/astc_decoder.comp +++ b/src/video_core/host_shaders/astc_decoder.comp @@ -61,6 +61,13 @@ const uint encoding_values[22] = uint[]( (JUST_BITS | (5u << 8u)), (QUINT | (3u << 8u)), (TRIT | (4u << 8u)), (JUST_BITS | (6u << 8u)), (QUINT | (4u << 8u)), (TRIT | (5u << 8u)), (JUST_BITS | (7u << 8u)), (QUINT | (5u << 8u)), (TRIT | (6u << 8u)), (JUST_BITS | (8u << 8u))); + +// Precomputed weight tables +const uint WEIGHT_TABLE_1BIT[2] = uint[](0, 64); +const uint WEIGHT_TABLE_2BIT[4] = uint[](0, 21, 43, 64); +const uint WEIGHT_TABLE_3BIT[8] = uint[](0, 9, 18, 27, 37, 46, 55, 64); +const uint WEIGHT_TABLE_4BIT[16] = uint[](0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64); +const uint WEIGHT_TABLE_5BIT[32] = uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64); // Input ASTC texture globals int total_bitsread = 0; @@ -730,18 +737,30 @@ uint UnquantizeTexelWeight(EncodingData val) { const uint encoding = Encoding(val); const uint bitlen = NumBits(val); const uint bitval = BitValue(val); + + if (encoding == JUST_BITS) { + switch (bitlen) { + case 1: return WEIGHT_TABLE_1BIT[bitval]; + case 2: return WEIGHT_TABLE_2BIT[bitval]; + case 3: return WEIGHT_TABLE_3BIT[bitval]; + case 4: return WEIGHT_TABLE_4BIT[bitval]; + 
case 5: return WEIGHT_TABLE_5BIT[bitval]; + default: return FastReplicateTo6(bitval, bitlen); + } + } + const uint A = ReplicateBitTo7((bitval & 1)); uint B = 0, C = 0, D = 0; uint result = 0; - const uint bitlen_0_results[5] = {0, 16, 32, 48, 64}; + switch (encoding) { - case JUST_BITS: - return FastReplicateTo6(bitval, bitlen); case TRIT: { D = QuintTritValue(val); switch (bitlen) { - case 0: - return bitlen_0_results[D * 2]; + case 0: { + const uint trit_base[3] = uint[](0, 32, 64); + return trit_base[D]; + } case 1: { C = 50; break; @@ -758,16 +777,16 @@ uint UnquantizeTexelWeight(EncodingData val) { B = (cb << 5) | cb; break; } - default: - break; } break; } case QUINT: { D = QuintTritValue(val); switch (bitlen) { - case 0: - return bitlen_0_results[D]; + case 0: { + const uint quint_base[5] = uint[](0, 16, 32, 48, 64); + return quint_base[D]; + } case 1: { C = 28; break; @@ -782,14 +801,17 @@ uint UnquantizeTexelWeight(EncodingData val) { break; } } - if (encoding != JUST_BITS && bitlen > 0) { + + if (bitlen > 0) { result = D * C + B; result ^= A; result = (A & 0x20) | (result >> 2); } + if (result > 32) { result += 1; } + return result; } @@ -1159,10 +1181,11 @@ void DecompressBlock(ivec3 coord) { } uint SwizzleOffset(uvec2 pos) { - const uint x = pos.x; - const uint y = pos.y; - return ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 + - ((x % 32) / 16) * 32 + (y % 2) * 16 + (x % 16); + return ((pos.x & 32u) << 3u) | + ((pos.y & 6u) << 5u) | + ((pos.x & 16u) << 1u) | + ((pos.y & 1u) << 4u) | + (pos.x & 15u); } void main() { diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index d97a35352a..411e33f06a 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -623,7 +623,7 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, .pNext = nullptr, .srcAccessMask = 
VK_ACCESS_SHADER_WRITE_BIT, - .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT, .oldLayout = VK_IMAGE_LAYOUT_GENERAL, .newLayout = VK_IMAGE_LAYOUT_GENERAL, .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, @@ -638,9 +638,8 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, }, }; cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, image_barrier); + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, image_barrier); }); - scheduler.Finish(); } constexpr u32 BL3D_BINDING_SWIZZLE_TABLE = 0; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 82443d6599..f00997611a 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -70,10 +70,29 @@ TextureCache

::TextureCache(Runtime& runtime_, Tegra::MaxwellDeviceMemoryManag (std::max)((std::min)(device_local_memory - min_vacancy_critical, min_spacing_critical), DEFAULT_CRITICAL_MEMORY)); minimum_memory = static_cast((device_local_memory - mem_threshold) / 2); + + const u64 device_memory = static_cast(device_local_memory); + if (device_memory <= 4_GiB) { + chunk_size = 16_MiB; + slices_per_batch = 16; + } else if (device_memory <= 8_GiB) { + chunk_size = 32_MiB; + slices_per_batch = 32; + } else { + chunk_size = 64_MiB; + slices_per_batch = 64; + } + + lowmemorydevice = device_memory <= 4_GiB; /* comparison already yields a bool */ } else { expected_memory = DEFAULT_EXPECTED_MEMORY + 512_MiB; critical_memory = DEFAULT_CRITICAL_MEMORY + 1_GiB; minimum_memory = 0; + + chunk_size = 32_MiB; + slices_per_batch = 32; + + lowmemorydevice = true; } } @@ -1131,6 +1150,10 @@ void TextureCache

::RefreshContents(Image& image, ImageId image_id) { } image.flags &= ~ImageFlagBits::CpuModified; + if( lowmemorydevice && image.info.format == PixelFormat::BC1_RGBA_UNORM && MapSizeBytes(image) >= 256_MiB ) { + return; + } + TrackImage(image, image_id); if (image.info.num_samples > 1 && !runtime.CanUploadMSAA()) { @@ -1465,17 +1488,13 @@ void TextureCache

::TickAsyncUnswizzle() { task.initialized = true; } - // ToDo: Make these configurable - static constexpr size_t CHUNK_SIZE = 48_MiB; - static constexpr u32 SLICES_PER_BATCH = 48u; - // Read data if (task.current_offset < task.total_size) { const size_t remaining = task.total_size - task.current_offset; - size_t copy_amount = std::min(CHUNK_SIZE, remaining); + size_t copy_amount = std::min(chunk_size, remaining); - if (remaining > CHUNK_SIZE) { + if (remaining > chunk_size) { copy_amount = (copy_amount / task.bytes_per_slice) * task.bytes_per_slice; if (copy_amount == 0) copy_amount = task.bytes_per_slice; } @@ -1490,7 +1509,7 @@ void TextureCache

::TickAsyncUnswizzle() { const u32 complete_slices = static_cast(bytes_ready / task.bytes_per_slice); const bool is_final_batch = task.current_offset >= task.total_size; - if (complete_slices >= SLICES_PER_BATCH || (is_final_batch && complete_slices > 0)) { + if (complete_slices >= slices_per_batch || (is_final_batch && complete_slices > 0)) { const u32 z_start = static_cast(task.last_submitted_offset / task.bytes_per_slice); const u32 z_count = std::min(complete_slices, image.info.size.depth - z_start); diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index c72a4a5858..7b642850fd 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -475,6 +475,9 @@ private: u64 minimum_memory; u64 expected_memory; u64 critical_memory; + size_t chunk_size; + size_t slices_per_batch; + bool lowmemorydevice = false; struct BufferDownload { GPUVAddr address;