diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp index 761b62582d..ef3f407c33 100644 --- a/src/video_core/host_shaders/astc_decoder.comp +++ b/src/video_core/host_shaders/astc_decoder.comp @@ -62,13 +62,6 @@ const uint encoding_values[22] = uint[]( (QUINT | (4u << 8u)), (TRIT | (5u << 8u)), (JUST_BITS | (7u << 8u)), (QUINT | (5u << 8u)), (TRIT | (6u << 8u)), (JUST_BITS | (8u << 8u))); -// Precomputed weight tables -const uint WEIGHT_TABLE_1BIT[2] = uint[](0, 64); -const uint WEIGHT_TABLE_2BIT[4] = uint[](0, 21, 43, 64); -const uint WEIGHT_TABLE_3BIT[8] = uint[](0, 9, 18, 27, 37, 46, 55, 64); -const uint WEIGHT_TABLE_4BIT[16] = uint[](0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64); -const uint WEIGHT_TABLE_5BIT[32] = uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64); - // Input ASTC texture globals int total_bitsread = 0; uvec4 local_buff; @@ -739,13 +732,14 @@ uint UnquantizeTexelWeight(EncodingData val) { const uint bitval = BitValue(val); if (encoding == JUST_BITS) { + uint z = bitval; switch (bitlen) { - case 1: return WEIGHT_TABLE_1BIT[bitval]; - case 2: return WEIGHT_TABLE_2BIT[bitval]; - case 3: return WEIGHT_TABLE_3BIT[bitval]; - case 4: return WEIGHT_TABLE_4BIT[bitval]; - case 5: return WEIGHT_TABLE_5BIT[bitval]; - default: return FastReplicateTo6(bitval, bitlen); + case 1: return z * 64; + case 2: return uint(floor(float(z) * 21.5f)); + case 3: return uint(floor(float(z) * 9.25f)); + case 4: return uint(floor(float(z) * 4.125f)); + case 5: return uint(floor(float(z) * 2.0625f)); + default: return FastReplicateTo6(z, bitlen); } } diff --git a/src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp b/src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp index 455e99e019..a25eb52327 100644 --- a/src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp +++ b/src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp @@ -152,23 +152,12 @@ void main() { uint block_index = block_coord.x + (block_coord.y * pc.blocks_dim.x) + (block_coord.z * pc.blocks_dim.x * pc.blocks_dim.y); + uint out_idx = block_index * (bytes_per_block >> 2u); - if (bytes_per_block == 16u) { - // BC6H/BC7 - uvec4 out_data[1]; - out_data[0] = texel; - out_u32[block_index * 4u] = texel.x; - out_u32[block_index * 4u + 1u] = texel.y; - out_u32[block_index * 4u + 2u] = texel.z; - out_u32[block_index * 4u + 3u] = texel.w; - } else if (bytes_per_block == 8u) { - // BC1/BC4 - uint out_idx = block_index * 2u; - out_u32[out_idx] = texel.x; - out_u32[out_idx + 1u] = texel.y; - } else { - uint out_idx = block_index * (bytes_per_block >> 2u); - out_u32[out_idx] = texel.x; - if (bytes_per_block > 4u) out_u32[out_idx + 1u] = texel.y; + out_u32[out_idx] = texel.x; + out_u32[out_idx + 1u] = texel.y; + if (pc.bytes_per_block_log2 == 4u) { + out_u32[out_idx + 2u] = texel.z; + out_u32[out_idx + 3u] = texel.w; } } \ No newline at end of file diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index bc5ab23d06..1874d4002c 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -756,8 +756,6 @@ void BlockLinearUnswizzle3DPass::Unswizzle( { using namespace VideoCommon::Accelerated; - // Leaving this here incase instances are found where slices_needed causes device loss - // Tune this for a balance between speed and size, I don't own a deck so can't self tune it const u32 MAX_BATCH_SLICES = std::min(z_count, image.info.size.depth); if (!image.has_compute_unswizzle_buffer) { @@ -874,9 +872,10 @@ void BlockLinearUnswizzle3DPass::UnswizzleChunk( .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, .pNext = nullptr, .srcAccessMask = is_first_chunk ? VkAccessFlags{} : - static_cast(VK_ACCESS_SHADER_READ_BIT), + static_cast(VK_ACCESS_TRANSFER_WRITE_BIT), .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, - .oldLayout = is_first_chunk ? VK_IMAGE_LAYOUT_UNDEFINED : VK_IMAGE_LAYOUT_GENERAL, + .oldLayout = is_first_chunk ? VK_IMAGE_LAYOUT_UNDEFINED : + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 74a42e5c87..74edeec3e2 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -1510,14 +1510,15 @@ void TextureCache

::TickAsyncUnswizzle() { task.current_offset += copy_amount; } + const bool is_final_batch = task.current_offset >= task.total_size; const size_t bytes_ready = task.current_offset - task.last_submitted_offset; const u32 complete_slices = static_cast(bytes_ready / task.bytes_per_slice); - const bool is_final_batch = task.current_offset >= task.total_size; if (complete_slices >= SLICES_PER_BATCH || (is_final_batch && complete_slices > 0)) { const u32 z_start = static_cast(task.last_submitted_offset / task.bytes_per_slice); - const u32 z_count = std::min(complete_slices, image.info.size.depth - z_start); - + const u32 slices_to_process = std::min(complete_slices, SLICES_PER_BATCH); + const u32 z_count = std::min(slices_to_process, image.info.size.depth - z_start); + if (z_count > 0) { const auto uploads = FullUploadSwizzles(task.info); runtime.AccelerateImageUpload(image, task.staging_buffer, FixSmallVectorADL(uploads), z_start, z_count); @@ -1526,7 +1527,10 @@ void TextureCache

::TickAsyncUnswizzle() { } // Check if complete - if (is_final_batch && task.last_submitted_offset >= task.total_size) { + const u32 slices_submitted = static_cast(task.last_submitted_offset / task.bytes_per_slice); + const bool all_slices_submitted = slices_submitted >= image.info.size.depth; + + if (is_final_batch && all_slices_submitted) { runtime.FreeDeferredStagingBuffer(task.staging_buffer); image.flags &= ~ImageFlagBits::IsDecoding; unswizzle_queue.pop_front();