Browse Source

Added GPU accelerated texture unswizzle

Added broken system that attempts to partial upload sparse textures
Various Vulkan optimizations
Ignore remapped sparse textures (Breaks most games that use them)
pull/3246/head
Forrest Keller 1 month ago
committed by crueter
parent
commit
bda5b973de
  1. 1
      src/video_core/host_shaders/CMakeLists.txt
  2. 160
      src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp
  3. 14
      src/video_core/memory_manager.cpp
  4. 5
      src/video_core/rasterizer_interface.h
  5. 5
      src/video_core/renderer_null/null_rasterizer.cpp
  6. 5
      src/video_core/renderer_null/null_rasterizer.h
  7. 4
      src/video_core/renderer_opengl/gl_rasterizer.cpp
  8. 5
      src/video_core/renderer_opengl/gl_rasterizer.h
  9. 5
      src/video_core/renderer_opengl/gl_texture_cache.cpp
  10. 10
      src/video_core/renderer_opengl/gl_texture_cache.h
  11. 230
      src/video_core/renderer_vulkan/vk_compute_pass.cpp
  12. 23
      src/video_core/renderer_vulkan/vk_compute_pass.h
  13. 4
      src/video_core/renderer_vulkan/vk_rasterizer.cpp
  14. 2
      src/video_core/renderer_vulkan/vk_rasterizer.h
  15. 10
      src/video_core/renderer_vulkan/vk_scheduler.cpp
  16. 2
      src/video_core/renderer_vulkan/vk_scheduler.h
  17. 97
      src/video_core/renderer_vulkan/vk_texture_cache.cpp
  18. 20
      src/video_core/renderer_vulkan/vk_texture_cache.h
  19. 14
      src/video_core/texture_cache/image_base.h
  20. 256
      src/video_core/texture_cache/texture_cache.h
  21. 27
      src/video_core/texture_cache/texture_cache_base.h
  22. 41
      src/video_core/texture_cache/util.cpp
  23. 14
      src/video_core/texture_cache/util.h

1
src/video_core/host_shaders/CMakeLists.txt

@ -18,6 +18,7 @@ set(SHADER_FILES
blit_color_float.frag
block_linear_unswizzle_2d.comp
block_linear_unswizzle_3d.comp
block_linear_unswizzle_3d_bcn.comp
convert_abgr8_srgb_to_d24s8.frag
convert_abgr8_to_d24s8.frag
convert_abgr8_to_d32f.frag

160
src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp

@ -0,0 +1,160 @@
// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
// Unswizzles a block-linear (Tegra tiled) BCn 3D texture into a tightly packed
// linear buffer of compressed blocks; one invocation handles one 4x4 BCn block.
// Shared between Vulkan (push constants) and OpenGL (plain uniforms) via the
// macros below.
#version 430
#ifdef VULKAN
// 8/16-bit storage lets ReadTexel index sub-32-bit elements directly.
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_8bit_storage : require
#define HAS_EXTENDED_TYPES 1
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };
#define UNIFORM(n)
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
#define BINDING_OUTPUT_BUFFER 2
#else
#extension GL_NV_gpu_shader5 : enable
#ifdef GL_NV_gpu_shader5
#define HAS_EXTENDED_TYPES 1
#else
#define HAS_EXTENDED_TYPES 0
#endif
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout(location = n) uniform
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
// NOTE(review): on the GL path the output SSBO shares binding 0 with the
// swizzle-table SSBO declared below -- looks copied from the image-output
// variant of this shader; confirm the GL path is actually exercised.
#define BINDING_OUTPUT_BUFFER 0
#endif
// --- Push Constants / Uniforms ---
#ifdef VULKAN
layout(push_constant) uniform PushConstants {
uvec3 blocks_dim; // Offset 0
uint bytes_per_block_log2; // Offset 12
uvec3 origin; // Offset 16
uint slice_size; // Offset 28
uint block_size; // Offset 32
uint x_shift; // Offset 36
uint block_height; // Offset 40
uint block_height_mask; // Offset 44
uint block_depth; // Offset 48
uint block_depth_mask; // Offset 52
int _pad; // Offset 56
// NOTE(review): std430 aligns ivec3 to 16 bytes, so `destination` actually
// lands at offset 64, not 60 as on the C++ side. Harmless today because
// main() never reads it, but a trap if it is ever used -- confirm.
ivec3 destination; // Offset 60
} pc;
#else
BEGIN_PUSH_CONSTANTS
UNIFORM(0) uvec3 origin;
UNIFORM(1) ivec3 destination;
UNIFORM(2) uint bytes_per_block_log2;
UNIFORM(3) uint slice_size;
UNIFORM(4) uint block_size;
UNIFORM(5) uint x_shift;
UNIFORM(6) uint block_height;
UNIFORM(7) uint block_height_mask;
UNIFORM(8) uint block_depth;
UNIFORM(9) uint block_depth_mask;
UNIFORM(10) uvec3 blocks_dim;
END_PUSH_CONSTANTS
// NOTE(review): expanding `pc` to nothing leaves the trailing `.` behind in
// expressions such as `pc.blocks_dim`, which is not valid GLSL -- the GL
// build of this shader likely fails to compile; confirm before enabling it.
#define pc // Map pc prefix to nothing for OpenGL compatibility
#endif
// --- Buffers ---
// 64x8 lookup table mapping an (x, y) position inside a GOB to its byte
// offset (see SwizzleOffset below).
layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
uint swizzle_table[];
};
// All InputBuffer* blocks alias the same binding: they are different
// element-typed views over the same swizzled source data.
#if HAS_EXTENDED_TYPES
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU8 { uint8_t u8data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU16 { uint16_t u16data[]; };
#endif
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 { uint u32data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU64 { uvec2 u64data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU128 { uvec4 u128data[]; };
// Linear destination; written as 32-bit words regardless of block size.
layout(binding = BINDING_OUTPUT_BUFFER, std430) buffer OutputBuffer {
uint out_u32[];
};
// --- Constants ---
layout(local_size_x = 32, local_size_y = 8, local_size_z = 1) in;
// A GOB ("group of bytes") is the 64x8x1-byte tile Tegra swizzles within.
const uint GOB_SIZE_X = 64;
const uint GOB_SIZE_Y = 8;
const uint GOB_SIZE_Z = 1;
const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z;
const uint GOB_SIZE_X_SHIFT = 6;
const uint GOB_SIZE_Y_SHIFT = 3;
const uint GOB_SIZE_Z_SHIFT = 0;
const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT;
const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1u, GOB_SIZE_Y - 1u);
// --- Helpers ---
// Looks up the swizzled byte offset of an (x, y) byte position inside a GOB.
uint SwizzleOffset(uvec2 pos) {
    const uvec2 gob_pos = pos & SWIZZLE_MASK;
    return swizzle_table[gob_pos.y * 64u + gob_pos.x];
}
// Reads one compressed block (1 to 16 bytes, selected by bytes_per_block_log2)
// from the swizzled input at a byte offset, widened into a uvec4. Components
// beyond the block size are zero.
uvec4 ReadTexel(uint offset) {
uint bpl2 = pc.bytes_per_block_log2;
switch (bpl2) {
#if HAS_EXTENDED_TYPES
// Native 8/16-bit loads when the storage extensions are available.
case 0u: return uvec4(u8data[offset], 0u, 0u, 0u);
case 1u: return uvec4(u16data[offset / 2u], 0u, 0u, 0u);
#else
// Fallback: extract the byte/halfword from its containing 32-bit word.
case 0u: return uvec4(bitfieldExtract(u32data[offset / 4u], int((offset * 8u) & 24u), 8), 0u, 0u, 0u);
case 1u: return uvec4(bitfieldExtract(u32data[offset / 4u], int((offset * 8u) & 16u), 16), 0u, 0u, 0u);
#endif
case 2u: return uvec4(u32data[offset / 4u], 0u, 0u, 0u);
case 3u: return uvec4(u64data[offset / 8u], 0u, 0u);
case 4u: return u128data[offset / 16u];
}
return uvec4(0u);
}
// Unswizzles one 4x4 BCn block per invocation and writes it to the linear
// output buffer. blocks_dim bounds the dispatch; origin.xy are in pixels
// (converted to block space here) and origin.z selects the first Z slice.
void main() {
    uvec3 block_coord = gl_GlobalInvocationID;
    if (any(greaterThanEqual(block_coord, pc.blocks_dim))) {
        return;
    }
    uint bytes_per_block = 1u << pc.bytes_per_block_log2;
    // Origin is in pixels, divide by 4 for block-space (e.g. BCn formats).
    // pos.x is pre-scaled to a byte coordinate within the row.
    uvec3 pos;
    pos.x = (block_coord.x + (pc.origin.x >> 2u)) * bytes_per_block;
    pos.y = block_coord.y + (pc.origin.y >> 2u);
    pos.z = block_coord.z + pc.origin.z;
    uint swizzle = SwizzleOffset(pos.xy);
    uint block_y = pos.y >> GOB_SIZE_Y_SHIFT;
    // Accumulate the block-linear source address: Z super-block, Z within the
    // block, Y super-block, Y within the block, X GOB column, intra-GOB swizzle.
    uint offset = 0u;
    offset += (pos.z >> pc.block_depth) * pc.slice_size;
    offset += (pos.z & pc.block_depth_mask) << (GOB_SIZE_SHIFT + pc.block_height);
    offset += (block_y >> pc.block_height) * pc.block_size;
    offset += (block_y & pc.block_height_mask) << GOB_SIZE_SHIFT;
    offset += (pos.x >> GOB_SIZE_X_SHIFT) << pc.x_shift;
    offset += swizzle;
    uvec4 texel = ReadTexel(offset);
    // Calculate linear output index (in 32-bit words).
    uint block_index = block_coord.x +
                       (block_coord.y * pc.blocks_dim.x) +
                       (block_coord.z * pc.blocks_dim.x * pc.blocks_dim.y);
    uint out_idx = block_index * (bytes_per_block >> 2u);
    out_u32[out_idx] = texel.x;
    // BUGFIX: the second word only exists for blocks of 8+ bytes; writing it
    // unconditionally clobbered the neighbouring block's data for
    // 4-byte-per-block formats (bytes_per_block_log2 == 2, output stride of
    // one word). Sub-4-byte blocks are not representable in this word-based
    // output at all.
    if (pc.bytes_per_block_log2 >= 3u) {
        out_u32[out_idx + 1] = texel.y;
    }
    if (pc.bytes_per_block_log2 == 4u) {
        out_u32[out_idx + 2] = texel.z;
        out_u32[out_idx + 3] = texel.w;
    }
}

14
src/video_core/memory_manager.cpp

@ -123,7 +123,12 @@ GPUVAddr MemoryManager::PageTableOp(GPUVAddr gpu_addr, [[maybe_unused]] DAddr de
[[maybe_unused]] const auto current_entry_type = GetEntry<false>(current_gpu_addr);
SetEntry<false>(current_gpu_addr, entry_type);
if (current_entry_type != entry_type) {
rasterizer->ModifyGPUMemory(unique_identifier, current_gpu_addr, page_size);
if constexpr (entry_type == EntryType::Mapped) {
const DAddr current_dev_addr = dev_addr + offset;
rasterizer->ModifyGPUMemory(unique_identifier, current_gpu_addr, page_size, current_dev_addr);
} else {
rasterizer->ModifyGPUMemory(unique_identifier, current_gpu_addr, page_size, 0u);
}
}
if constexpr (entry_type == EntryType::Mapped) {
const DAddr current_dev_addr = dev_addr + offset;
@ -146,7 +151,12 @@ GPUVAddr MemoryManager::BigPageTableOp(GPUVAddr gpu_addr, [[maybe_unused]] DAddr
[[maybe_unused]] const auto current_entry_type = GetEntry<true>(current_gpu_addr);
SetEntry<true>(current_gpu_addr, entry_type);
if (current_entry_type != entry_type) {
rasterizer->ModifyGPUMemory(unique_identifier, current_gpu_addr, big_page_size);
if constexpr (entry_type == EntryType::Mapped) {
const DAddr current_dev_addr = dev_addr + offset;
rasterizer->ModifyGPUMemory(unique_identifier, current_gpu_addr, big_page_size, current_dev_addr);
} else {
rasterizer->ModifyGPUMemory(unique_identifier, current_gpu_addr, big_page_size, 0u);
}
}
if constexpr (entry_type == EntryType::Mapped) {
const DAddr current_dev_addr = dev_addr + offset;

5
src/video_core/rasterizer_interface.h

@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
@ -120,7 +123,7 @@ public:
virtual void UnmapMemory(DAddr addr, u64 size) = 0;
/// Remap GPU memory range. This means underneath backing memory changed
virtual void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) = 0;
virtual void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size, DAddr d_addr) = 0;
/// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
/// and invalidated

5
src/video_core/renderer_null/null_rasterizer.cpp

@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
@ -62,7 +65,7 @@ VideoCore::RasterizerDownloadArea RasterizerNull::GetFlushArea(PAddr addr, u64 s
}
void RasterizerNull::InvalidateGPUCache() {}
void RasterizerNull::UnmapMemory(DAddr addr, u64 size) {}
void RasterizerNull::ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) {}
void RasterizerNull::ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size, DAddr d_addr) {}
void RasterizerNull::SignalFence(std::function<void()>&& func) {
func();
}

5
src/video_core/renderer_null/null_rasterizer.h

@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
@ -58,7 +61,7 @@ public:
VideoCore::RasterizerDownloadArea GetFlushArea(DAddr addr, u64 size) override;
void InvalidateGPUCache() override;
void UnmapMemory(DAddr addr, u64 size) override;
void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) override;
void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size, DAddr d_addr) override;
void SignalFence(std::function<void()>&& func) override;
void SyncOperation(std::function<void()>&& func) override;
void SignalSyncPoint(u32 value) override;

4
src/video_core/renderer_opengl/gl_rasterizer.cpp

@ -600,10 +600,10 @@ void RasterizerOpenGL::UnmapMemory(DAddr addr, u64 size) {
shader_cache.OnCacheInvalidation(addr, size);
}
void RasterizerOpenGL::ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) {
void RasterizerOpenGL::ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size, DAddr d_addr) {
{
std::scoped_lock lock{texture_cache.mutex};
texture_cache.UnmapGPUMemory(as_id, addr, size);
texture_cache.UnmapGPUMemory(as_id, addr, size, d_addr);
}
}

5
src/video_core/renderer_opengl/gl_rasterizer.h

@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: 2015 Citra Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
@ -103,7 +106,7 @@ public:
bool OnCPUWrite(PAddr addr, u64 size) override;
void InvalidateGPUCache() override;
void UnmapMemory(DAddr addr, u64 size) override;
void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) override;
void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size, DAddr d_addr) override;
void SignalFence(std::function<void()>&& func) override;
void SyncOperation(std::function<void()>&& func) override;
void SignalSyncPoint(u32 value) override;

5
src/video_core/renderer_opengl/gl_texture_cache.cpp

@ -556,7 +556,7 @@ void TextureCacheRuntime::Finish() {
glFinish();
}
StagingBufferMap TextureCacheRuntime::UploadStagingBuffer(size_t size) {
StagingBufferMap TextureCacheRuntime::UploadStagingBuffer(size_t size, bool deferred) {
return staging_buffer_pool.RequestUploadBuffer(size);
}
@ -651,7 +651,8 @@ void TextureCacheRuntime::BlitFramebuffer(Framebuffer* dst, Framebuffer* src,
}
void TextureCacheRuntime::AccelerateImageUpload(Image& image, const StagingBufferMap& map,
std::span<const SwizzleParameters> swizzles) {
std::span<const SwizzleParameters> swizzles,
u32 z_start, u32 z_count) {
switch (image.info.type) {
case ImageType::e2D:
if (IsPixelFormatASTC(image.info.format)) {

10
src/video_core/renderer_opengl/gl_texture_cache.h

@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
@ -72,7 +75,7 @@ public:
void Finish();
StagingBufferMap UploadStagingBuffer(size_t size);
StagingBufferMap UploadStagingBuffer(size_t size, bool deferred = false);
StagingBufferMap DownloadStagingBuffer(size_t size, bool deferred = false);
@ -116,7 +119,10 @@ public:
Tegra::Engines::Fermi2D::Operation operation);
void AccelerateImageUpload(Image& image, const StagingBufferMap& map,
std::span<const VideoCommon::SwizzleParameters> swizzles);
std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count);
void ClearImage(Image& image, u32 clear_value);
void InsertUploadMemoryBarrier();

230
src/video_core/renderer_vulkan/vk_compute_pass.cpp

@ -24,6 +24,7 @@
#include "video_core/host_shaders/resolve_conditional_render_comp_spv.h"
#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h"
#include "video_core/host_shaders/vulkan_uint8_comp_spv.h"
#include "video_core/host_shaders/block_linear_unswizzle_3d_bcn_comp_spv.h"
#include "video_core/renderer_vulkan/vk_compute_pass.h"
#include "video_core/renderer_vulkan/vk_descriptor_pool.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
@ -642,6 +643,235 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
scheduler.Finish();
}
// Descriptor layout for the 3D BCn unswizzle pass: three storage buffers
// matching the BINDING_* constants in block_linear_unswizzle_3d_bcn.comp.
constexpr u32 BL3D_BINDING_SWIZZLE_TABLE = 0;
constexpr u32 BL3D_BINDING_INPUT_BUFFER = 1;
constexpr u32 BL3D_BINDING_OUTPUT_BUFFER = 2;
constexpr std::array<VkDescriptorSetLayoutBinding, 3> BL3D_DESCRIPTOR_SET_BINDINGS{{
{
.binding = BL3D_BINDING_SWIZZLE_TABLE,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // swizzle_table[]
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = nullptr,
},
{
.binding = BL3D_BINDING_INPUT_BUFFER,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // block-linear input
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = nullptr,
},
{
.binding = BL3D_BINDING_OUTPUT_BUFFER,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // linear output
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = nullptr,
},
}};
// Descriptor-pool sizing hint for this pass: three storage buffers only.
constexpr DescriptorBankInfo BL3D_BANK_INFO{
.uniform_buffers = 0,
.storage_buffers = 3,
.texture_buffers = 0,
.image_buffers = 0,
.textures = 0,
.images = 0,
.score = 3,
};
// Update template: one DescriptorUpdateEntry per binding, packed in binding
// order, hence the `binding * sizeof(DescriptorUpdateEntry)` offsets.
constexpr std::array<VkDescriptorUpdateTemplateEntry, 3>
BL3D_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{
{
.dstBinding = BL3D_BINDING_SWIZZLE_TABLE,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.offset = BL3D_BINDING_SWIZZLE_TABLE * sizeof(DescriptorUpdateEntry),
.stride = sizeof(DescriptorUpdateEntry),
},
{
.dstBinding = BL3D_BINDING_INPUT_BUFFER,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.offset = BL3D_BINDING_INPUT_BUFFER * sizeof(DescriptorUpdateEntry),
.stride = sizeof(DescriptorUpdateEntry),
},
{
.dstBinding = BL3D_BINDING_OUTPUT_BUFFER,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.offset = BL3D_BINDING_OUTPUT_BUFFER * sizeof(DescriptorUpdateEntry),
.stride = sizeof(DescriptorUpdateEntry),
}
}};
// CPU-side mirror of the push-constant block in
// block_linear_unswizzle_3d_bcn.comp.
// NOTE(review): GLSL std430 aligns ivec3 to 16 bytes, so on the shader side
// `destination` lives at offset 64, not 60 as laid out here. The shader's
// main() never reads `destination`, so the mismatch is currently harmless --
// confirm before anything starts consuming it.
struct alignas(16) BlockLinearUnswizzle3DPushConstants {
u32 blocks_dim[3]; // Offset 0
u32 bytes_per_block_log2; // Offset 12
u32 origin[3]; // Offset 16
u32 slice_size; // Offset 28
u32 block_size; // Offset 32
u32 x_shift; // Offset 36
u32 block_height; // Offset 40
u32 block_height_mask; // Offset 44
u32 block_depth; // Offset 48
u32 block_depth_mask; // Offset 52
s32 _pad; // Offset 56
s32 destination[3]; // Offset 60
s32 _pad_end; // Offset 72
};
// Must fit the minimum guaranteed Vulkan push-constant budget (128 bytes).
static_assert(sizeof(BlockLinearUnswizzle3DPushConstants) <= 128);
// Builds the compute pass from the descriptor tables above and the
// precompiled SPIR-V; stores only references to the shared scheduler,
// staging pool and descriptor queue.
BlockLinearUnswizzle3DPass::BlockLinearUnswizzle3DPass(
const Device& device_, Scheduler& scheduler_,
DescriptorPool& descriptor_pool_,
StagingBufferPool& staging_buffer_pool_,
ComputePassDescriptorQueue& compute_pass_descriptor_queue_)
: ComputePass(
device_, descriptor_pool_,
BL3D_DESCRIPTOR_SET_BINDINGS,
BL3D_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY,
BL3D_BANK_INFO,
COMPUTE_PUSH_CONSTANT_RANGE<sizeof(BlockLinearUnswizzle3DPushConstants)>,
BLOCK_LINEAR_UNSWIZZLE_3D_BCN_COMP_SPV),
scheduler{scheduler_},
staging_buffer_pool{staging_buffer_pool_},
compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {}
BlockLinearUnswizzle3DPass::~BlockLinearUnswizzle3DPass() = default;
// GPU-side unswizzle of a BCn 3D texture: dispatches the compute shader to
// linearize slices [z_start, z_start + z_count) from the swizzled staging
// buffer into the image's scratch buffer, then copies that range into the
// destination image. Expects a single mip level / layer (see the guard in
// TextureCacheRuntime::AccelerateImageUpload).
void BlockLinearUnswizzle3DPass::Unswizzle(
Image& image,
const StagingBufferRef& swizzled,
std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count)
{
using namespace VideoCommon::Accelerated;
// Lazily create the per-image linear scratch buffer the shader writes into.
if (!image.has_compute_unswizzle_buffer) {
image.AllocateComputeUnswizzleBuffer();
}
ASSERT(swizzles.size() == 1);
const auto& sw = swizzles[0];
const auto params = MakeBlockLinearSwizzle3DParams(sw, image.info);
BlockLinearUnswizzle3DPushConstants pc{};
pc.origin[0] = params.origin[0];
pc.origin[1] = params.origin[1];
pc.origin[2] = z_start; // Start at the current Z-slice
pc.destination[0] = params.destination[0];
pc.destination[1] = params.destination[1];
pc.destination[2] = 0; // Shader writes to start of output buffer
pc.bytes_per_block_log2 = params.bytes_per_block_log2;
pc.slice_size = params.slice_size;
pc.block_size = params.block_size;
pc.x_shift = params.x_shift;
pc.block_height = params.block_height;
pc.block_height_mask = params.block_height_mask;
pc.block_depth = params.block_depth;
pc.block_depth_mask = params.block_depth_mask;
// BCn encodes 4x4 pixel blocks; round up at the image edges.
const u32 blocks_x = (image.info.size.width + 3) / 4;
const u32 blocks_y = (image.info.size.height + 3) / 4;
pc.blocks_dim[0] = blocks_x;
pc.blocks_dim[1] = blocks_y;
pc.blocks_dim[2] = z_count; // Only process the count
// Bindings: swizzle table, swizzled source (offset within staging), scratch.
compute_pass_descriptor_queue.Acquire();
compute_pass_descriptor_queue.AddBuffer(*image.runtime->swizzle_table_buffer, 0, image.runtime->swizzle_table_size);
compute_pass_descriptor_queue.AddBuffer(swizzled.buffer, sw.buffer_offset + swizzled.offset, image.guest_size_bytes - sw.buffer_offset);
compute_pass_descriptor_queue.AddBuffer(*image.compute_unswizzle_buffer, 0, image.compute_unswizzle_buffer_size);
const void* descriptor_data = compute_pass_descriptor_queue.UpdateData();
const VkDescriptorSet set = descriptor_allocator.Commit();
// Group counts must match the shader's local_size of (32, 8, 1).
const u32 gx = Common::DivCeil(blocks_x, 32u);
const u32 gy = Common::DivCeil(blocks_y, 8u);
const u32 gz = Common::DivCeil(z_count, 1u);
scheduler.RequestOutsideRenderPassOperationContext();
// NOTE(review): `image` is captured by reference and dereferenced when the
// command buffer is recorded -- assumes the texture cache keeps the Image
// alive until then; confirm against the cache's sentencing/tick rules.
scheduler.Record([this, &image, set, descriptor_data, pc, gx, gy, gz, z_start, z_count](vk::CommandBuffer cmdbuf) {
const VkBuffer out_buffer = *image.compute_unswizzle_buffer;
const VkImage dst_image = image.Handle();
const VkImageAspectFlags aspect = image.AspectMask();
device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data);
cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {});
cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
cmdbuf.Dispatch(gx, gy, gz);
// OPTIMIZATION: Combined barrier - merge buffer and image barriers when possible
// The first slice batch may discard prior contents (UNDEFINED); later
// batches must preserve the slices uploaded so far.
const bool is_first = (z_start == 0);
// Single barrier for compute -> transfer (buffer ready, image transition)
// NOTE(review): queue-family indices are value-initialized to 0 rather than
// VK_QUEUE_FAMILY_IGNORED; equal src/dst means no ownership transfer, so
// this is spec-legal, but the explicit sentinel is the conventional form.
const VkBufferMemoryBarrier buffer_barrier{
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
.buffer = out_buffer,
.offset = 0,
.size = (VkDeviceSize)z_count * pc.slice_size,
};
const VkImageMemoryBarrier pre_barrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.srcAccessMask = is_first ? VkAccessFlags{} : static_cast<VkAccessFlags>(VK_ACCESS_SHADER_READ_BIT),
.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.oldLayout = is_first ? VK_IMAGE_LAYOUT_UNDEFINED : VK_IMAGE_LAYOUT_GENERAL,
.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.image = dst_image,
.subresourceRange = {aspect, 0, 1, 0, 1},
};
// Single barrier handles both buffer and image
cmdbuf.PipelineBarrier(
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_TRANSFER_BIT,
0,
nullptr, buffer_barrier, pre_barrier
);
// Copy only the freshly unswizzled slice range into the 3D image.
const VkBufferImageCopy copy{
.bufferOffset = 0,
.imageSubresource = {aspect, 0, 0, 1},
.imageOffset = {0, 0, (s32)z_start},
.imageExtent = {image.info.size.width, image.info.size.height, z_count},
};
cmdbuf.CopyBufferToImage(out_buffer, dst_image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy);
// Post-copy transition back to GENERAL for shader reads/writes.
const VkImageMemoryBarrier post_barrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.newLayout = VK_IMAGE_LAYOUT_GENERAL,
.image = dst_image,
.subresourceRange = {aspect, 0, 1, 0, 1},
};
cmdbuf.PipelineBarrier(
VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
0,
nullptr, nullptr, post_barrier
);
});
}
MSAACopyPass::MSAACopyPass(const Device& device_, Scheduler& scheduler_,
DescriptorPool& descriptor_pool_,
StagingBufferPool& staging_buffer_pool_,

23
src/video_core/renderer_vulkan/vk_compute_pass.h

@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
@ -131,6 +134,26 @@ private:
MemoryAllocator& memory_allocator;
};
class BlockLinearUnswizzle3DPass final : public ComputePass {
public:
explicit BlockLinearUnswizzle3DPass(const Device& device_, Scheduler& scheduler_,
DescriptorPool& descriptor_pool_,
StagingBufferPool& staging_buffer_pool_,
ComputePassDescriptorQueue& compute_pass_descriptor_queue_);
~BlockLinearUnswizzle3DPass();
void Unswizzle(Image& image,
const StagingBufferRef& swizzled,
std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count);
private:
Scheduler& scheduler;
StagingBufferPool& staging_buffer_pool;
ComputePassDescriptorQueue& compute_pass_descriptor_queue;
};
class MSAACopyPass final : public ComputePass {
public:
explicit MSAACopyPass(const Device& device_, Scheduler& scheduler_,

4
src/video_core/renderer_vulkan/vk_rasterizer.cpp

@ -748,10 +748,10 @@ void RasterizerVulkan::UnmapMemory(DAddr addr, u64 size) {
pipeline_cache.OnCacheInvalidation(addr, size);
}
void RasterizerVulkan::ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) {
void RasterizerVulkan::ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size, DAddr d_addr) {
{
std::scoped_lock lock{texture_cache.mutex};
texture_cache.UnmapGPUMemory(as_id, addr, size);
texture_cache.UnmapGPUMemory(as_id, addr, size, d_addr);
}
}

2
src/video_core/renderer_vulkan/vk_rasterizer.h

@ -108,7 +108,7 @@ public:
bool OnCPUWrite(DAddr addr, u64 size) override;
void InvalidateGPUCache() override;
void UnmapMemory(DAddr addr, u64 size) override;
void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) override;
void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size, DAddr d_addr) override;
void SignalFence(std::function<void()>&& func) override;
void SyncOperation(std::function<void()>&& func) override;
void SignalSyncPoint(u32 value) override;

10
src/video_core/renderer_vulkan/vk_scheduler.cpp

@ -43,6 +43,16 @@ Scheduler::Scheduler(const Device& device_, StateTracker& state_tracker_)
: device{device_}, state_tracker{state_tracker_},
master_semaphore{std::make_unique<MasterSemaphore>(device)},
command_pool{std::make_unique<CommandPool>(*master_semaphore, device)} {
// PRE-OPTIMIZATION: Warm up the pool to prevent mid-frame spikes
{
std::scoped_lock rl{reserve_mutex};
chunk_reserve.reserve(2048); // Prevent vector resizing
for (int i = 0; i < 1024; ++i) {
chunk_reserve.push_back(std::make_unique<CommandChunk>());
}
}
AcquireNewChunk();
AllocateWorkerCommandBuffer();
worker_thread = std::jthread([this](std::stop_token token) { WorkerThread(token); });

2
src/video_core/renderer_vulkan/vk_scheduler.h

@ -207,7 +207,7 @@ private:
size_t command_offset = 0;
bool submit = false;
alignas(std::max_align_t) std::array<u8, 0x8000> data{};
alignas(std::max_align_t) std::array<u8, 0x40000> data{};
};
struct State {

97
src/video_core/renderer_vulkan/vk_texture_cache.cpp

@ -24,12 +24,14 @@
#include "video_core/renderer_vulkan/vk_render_pass_cache.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
#include "video_core/surface.h"
#include "video_core/texture_cache/formatter.h"
#include "video_core/texture_cache/samples_helper.h"
#include "video_core/texture_cache/util.h"
#include "video_core/vulkan_common/vulkan_device.h"
#include "video_core/vulkan_common/vulkan_memory_allocator.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
#include "video_core/textures/decoders.h"
namespace Vulkan {
@ -878,14 +880,51 @@ TextureCacheRuntime::TextureCacheRuntime(const Device& device_, Scheduler& sched
}
}
}
bl3d_unswizzle_pass.emplace(device, scheduler, descriptor_pool,
staging_buffer_pool, compute_pass_descriptor_queue);
// --- Create swizzle table buffer ---
{
auto table = Tegra::Texture::MakeSwizzleTable();
swizzle_table_size = static_cast<VkDeviceSize>(table.size() * sizeof(table[0]));
auto staging = staging_buffer_pool.Request(swizzle_table_size, MemoryUsage::Upload);
std::memcpy(staging.mapped_span.data(), table.data(), static_cast<size_t>(swizzle_table_size));
VkBufferCreateInfo ci{
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.size = swizzle_table_size,
.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
VK_BUFFER_USAGE_TRANSFER_DST_BIT |
VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
};
swizzle_table_buffer = memory_allocator.CreateBuffer(ci, MemoryUsage::DeviceLocal);
scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([staging_buf = staging.buffer,
dst_buf = *swizzle_table_buffer,
size = swizzle_table_size,
src_off = staging.offset](vk::CommandBuffer cmdbuf) {
const VkBufferCopy region{
.srcOffset = src_off,
.dstOffset = 0,
.size = size,
};
cmdbuf.CopyBuffer(staging_buf, dst_buf, region);
});
}
}
// Drains pending GPU work by delegating to Scheduler::Finish.
void TextureCacheRuntime::Finish() {
scheduler.Finish();
}
StagingBufferRef TextureCacheRuntime::UploadStagingBuffer(size_t size) {
return staging_buffer_pool.Request(size, MemoryUsage::Upload);
StagingBufferRef TextureCacheRuntime::UploadStagingBuffer(size_t size, bool deferred) {
return staging_buffer_pool.Request(size, MemoryUsage::Upload, deferred);
}
StagingBufferRef TextureCacheRuntime::DownloadStagingBuffer(size_t size, bool deferred) {
@ -1581,6 +1620,46 @@ Image::Image(const VideoCommon::NullImageParams& params) : VideoCommon::ImageBas
Image::~Image() = default;
void Image::AllocateComputeUnswizzleBuffer() {
if (has_compute_unswizzle_buffer)
return;
using VideoCore::Surface::BytesPerBlock;
const u32 block_bytes = BytesPerBlock(info.format); // 8 for BC1, 16 for BC6H
const u32 block_width = 4;
const u32 block_height = 4;
// BCn is 4x4x1 blocks
const u32 blocks_x = (info.size.width + block_width - 1) / block_width;
const u32 blocks_y = (info.size.height + block_height - 1) / block_height;
const u32 blocks_z = info.size.depth;
const u64 block_count =
static_cast<u64>(blocks_x) *
static_cast<u64>(blocks_y) *
static_cast<u64>(blocks_z);
compute_unswizzle_buffer_size = block_count * block_bytes;
VkBufferCreateInfo ci{
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.pNext = nullptr,
.flags = 0,
.size = compute_unswizzle_buffer_size,
.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
.queueFamilyIndexCount = 0,
.pQueueFamilyIndices = nullptr,
};
compute_unswizzle_buffer =
runtime->memory_allocator.CreateBuffer(ci, MemoryUsage::DeviceLocal);
has_compute_unswizzle_buffer = true;
}
void Image::UploadMemory(VkBuffer buffer, VkDeviceSize offset,
std::span<const VideoCommon::BufferImageCopy> copies) {
// TODO: Move this to another API
@ -2397,10 +2476,22 @@ void Framebuffer::CreateFramebuffer(TextureCacheRuntime& runtime,
void TextureCacheRuntime::AccelerateImageUpload(
Image& image, const StagingBufferRef& map,
std::span<const VideoCommon::SwizzleParameters> swizzles) {
std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count) {
if (IsPixelFormatASTC(image.info.format)) {
return astc_decoder_pass->Assemble(image, map, swizzles);
}
if (bl3d_unswizzle_pass &&
IsPixelFormatBCn(image.info.format) &&
image.info.type == ImageType::e3D &&
image.info.resources.levels == 1 &&
image.info.resources.layers == 1) {
return bl3d_unswizzle_pass->Unswizzle(image, map, swizzles, z_start, z_count);
}
ASSERT(false);
}

20
src/video_core/renderer_vulkan/vk_texture_cache.h

@ -51,7 +51,7 @@ public:
void Finish();
StagingBufferRef UploadStagingBuffer(size_t size);
StagingBufferRef UploadStagingBuffer(size_t size, bool deferred = false);
StagingBufferRef DownloadStagingBuffer(size_t size, bool deferred = false);
@ -91,7 +91,8 @@ public:
}
void AccelerateImageUpload(Image&, const StagingBufferRef&,
std::span<const VideoCommon::SwizzleParameters>);
std::span<const VideoCommon::SwizzleParameters>,
u32 z_start, u32 z_count);
void InsertUploadMemoryBarrier() {}
@ -127,6 +128,11 @@ public:
BlitImageHelper& blit_image_helper;
RenderPassCache& render_pass_cache;
std::optional<ASTCDecoderPass> astc_decoder_pass;
std::optional<BlockLinearUnswizzle3DPass> bl3d_unswizzle_pass;
vk::Buffer swizzle_table_buffer;
VkDeviceSize swizzle_table_size = 0;
std::unique_ptr<MSAACopyPass> msaa_copy_pass;
const Settings::ResolutionScalingInfo& resolution;
std::array<std::vector<VkFormat>, VideoCore::Surface::MaxPixelFormat> view_formats;
@ -163,6 +169,8 @@ public:
void DownloadMemory(const StagingBufferRef& map,
std::span<const VideoCommon::BufferImageCopy> copies);
void AllocateComputeUnswizzleImage();
[[nodiscard]] VkImage Handle() const noexcept {
return *(this->*current_image);
@ -188,6 +196,8 @@ public:
bool ScaleUp(bool ignore = false);
bool ScaleDown(bool ignore = false);
friend class BlockLinearUnswizzle3DPass;
private:
bool BlitScaleHelper(bool scale_up);
@ -199,6 +209,12 @@ private:
vk::Image original_image;
vk::Image scaled_image;
vk::Buffer compute_unswizzle_buffer;
VkDeviceSize compute_unswizzle_buffer_size = 0;
bool has_compute_unswizzle_buffer = false;
void AllocateComputeUnswizzleBuffer();
// Use a pointer to field because it is relative, so that the object can be
// moved without breaking the reference.

14
src/video_core/texture_cache/image_base.h

@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
@ -52,6 +55,13 @@ struct AliasedImage {
ImageId id;
};
// Records one tile of a sparse (tiled) texture: where the tile sits in the
// GPU virtual address space, which device memory backs it, and its position
// within the texture.
struct SparseBinding {
GPUVAddr gpu_addr; // Virtual GPU address of this tile
DAddr device_addr; // Physical device memory address
u64 tile_index; // Linear tile index in the texture
Extent3D tile_coord; // 3D coordinate of this tile
};
struct NullImageParams {};
struct ImageBase {
@ -115,6 +125,10 @@ struct ImageBase {
std::vector<AliasedImage> aliased_images;
std::vector<ImageId> overlapping_images;
ImageMapId map_view_id{};
boost::container::small_vector<u64, 16> dirty_offsets;
std::unordered_map<GPUVAddr, SparseBinding> sparse_bindings;
u32 sparse_tile_size = 65536;
};
struct ImageMapView {

256
src/video_core/texture_cache/texture_cache.h

@ -8,6 +8,7 @@
#include <limits>
#include <optional>
#include <bit>
#include <unordered_set>
#include <boost/container/small_vector.hpp>
@ -22,6 +23,7 @@
#include "video_core/texture_cache/samples_helper.h"
#include "video_core/texture_cache/texture_cache_base.h"
#include "video_core/texture_cache/util.h"
#include "video_core/textures/decoders.h"
namespace VideoCommon {
@ -160,6 +162,7 @@ void TextureCache<P>::TickFrame() {
sentenced_framebuffers.Tick();
sentenced_image_view.Tick();
TickAsyncDecode();
TickAsyncUnswizzle();
runtime.TickFrame();
++frame_tick;
@ -615,7 +618,36 @@ void TextureCache<P>::UnmapMemory(DAddr cpu_addr, size_t size) {
}
template <class P>
void TextureCache<P>::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t size) {
std::optional<SparseBinding> TextureCache<P>::CalculateSparseBinding(
const Image& image, GPUVAddr gpu_addr, DAddr dev_addr) {
if (!image.info.is_sparse) {
return std::nullopt;
}
const u64 offset = gpu_addr - image.gpu_addr;
const u64 tile_index = offset / image.sparse_tile_size;
const u32 tile_width_blocks = 128;
const u32 tile_height_blocks = 32;
const u32 width_in_tiles = (image.info.size.width / 4 + tile_width_blocks - 1) / tile_width_blocks;
const u32 height_in_tiles = (image.info.size.height / 4 + tile_height_blocks - 1) / tile_height_blocks;
const u32 tile_x = static_cast<u32>((tile_index % width_in_tiles) * tile_width_blocks * 4);
const u32 tile_y = static_cast<u32>(((tile_index / width_in_tiles) % height_in_tiles) * tile_height_blocks * 4);
const u32 tile_z = static_cast<u32>(tile_index / (width_in_tiles * height_in_tiles));
return SparseBinding{
.gpu_addr = gpu_addr,
.device_addr = dev_addr,
.tile_index = tile_index,
.tile_coord = {tile_x, tile_y, tile_z}
};
}
template <class P>
void TextureCache<P>::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t size, DAddr dev_addr) {
boost::container::small_vector<ImageId, 16> deleted_images;
ForEachImageInRegionGPU(as_id, gpu_addr, size,
[&](ImageId id, Image&) { deleted_images.push_back(id); });
@ -627,11 +659,19 @@ void TextureCache<P>::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t siz
UntrackImage(image, id);
}
}
if (True(image.flags & ImageFlagBits::Remapped)) {
continue;
}
image.flags |= ImageFlagBits::Remapped;
if (image.info.is_sparse && dev_addr != 0) {
// Calculate and store the binding
auto binding = CalculateSparseBinding(image, gpu_addr, dev_addr);
if (binding) {
image.sparse_bindings[gpu_addr] = *binding;
image.dirty_offsets.push_back(binding->tile_index);
}
}
}
}
@ -1055,9 +1095,29 @@ void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
// Only upload modified images
return;
}
image.flags &= ~ImageFlagBits::CpuModified;
TrackImage(image, image_id);
// If it's sparse and remapped, we treat it as a partial update trigger
if (image.info.is_sparse && True(image.flags & ImageFlagBits::Remapped)) {
image.flags &= ~ImageFlagBits::Remapped;
if (!image.dirty_offsets.empty() && !image.sparse_bindings.empty()) {
/*constexpr u64 page_size = 64_KiB;
size_t dirty_size = image.dirty_offsets.size() * page_size;
auto staging = runtime.UploadStagingBuffer(dirty_size);
UploadSparseDirtyTiles(image, staging);
runtime.InsertUploadMemoryBarrier();
return;*/
image.dirty_offsets.clear();
image.sparse_bindings.clear();
return;
}
}
if (image.info.num_samples > 1 && !runtime.CanUploadMSAA()) {
LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented");
runtime.TransitionImageLayout(image);
@ -1067,11 +1127,102 @@ void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
QueueAsyncDecode(image, image_id);
return;
}
if (IsPixelFormatBCn(image.info.format) &&
image.info.type == ImageType::e3D &&
image.info.resources.levels == 1 &&
image.info.resources.layers == 1 &&
MapSizeBytes(image) >= 32_MiB &&
False(image.flags & ImageFlagBits::GpuModified)) {
QueueAsyncUnswizzle(image, image_id);
return;
}
auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image));
UploadImageContents(image, staging);
runtime.InsertUploadMemoryBarrier();
}
template <class P>
template <typename StagingBuffer>
void TextureCache<P>::UploadSparseDirtyTiles(Image& image, StagingBuffer& staging) {
    using namespace VideoCommon;
    using namespace Tegra::Texture;

    // Build a reverse index (tile index -> binding) once; the previous
    // revision linearly scanned every binding for every dirty tile,
    // i.e. O(dirty_tiles * bindings).
    std::unordered_map<u64, const SparseBinding*> binding_by_tile;
    binding_by_tile.reserve(image.sparse_bindings.size());
    for (const auto& [addr, bind] : image.sparse_bindings) {
        binding_by_tile.emplace(bind.tile_index, &bind);
    }

    // Scratch buffer for one swizzled tile, sized from the actual tile size.
    // A fixed 64 KiB stack array would overflow if sparse_tile_size were
    // ever larger than 65536 (it is a mutable field, not a constant).
    std::vector<u8> tile_swizzled_data(image.sparse_tile_size);

    std::vector<BufferImageCopy> all_copies;
    size_t total_upload_size = 0;

    for (const u64 dirty_tile_index : image.dirty_offsets) {
        const auto it = binding_by_tile.find(dirty_tile_index);
        if (it == binding_by_tile.end()) {
            // Dirty tile with no recorded binding: nothing to read from.
            continue;
        }
        const SparseBinding& binding = *it->second;
        const auto& coord = binding.tile_coord;

        // Tile dimensions in texels (128x32 blocks of 4x4 texels), clamped
        // at the image borders.
        constexpr u32 tile_width_blocks = 128;
        constexpr u32 tile_height_blocks = 32;
        const u32 tile_width =
            std::min(tile_width_blocks * 4, image.info.size.width - coord.width);
        const u32 tile_height =
            std::min(tile_height_blocks * 4, image.info.size.height - coord.height);
        const u32 tile_depth = std::min(1u, image.info.size.depth - coord.depth);

        const u32 bytes_per_block = BytesPerBlock(image.info.format);
        const u32 blocks_wide = (tile_width + 3) / 4;
        const u32 blocks_high = (tile_height + 3) / 4;
        const size_t tile_unswizzled_size =
            static_cast<size_t>(blocks_wide) * blocks_high * tile_depth * bytes_per_block;

        if (total_upload_size + tile_unswizzled_size > staging.mapped_span.size()) {
            LOG_ERROR(HW_GPU, "Staging buffer too small");
            break;
        }

        // Read the swizzled tile from guest memory.
        gpu_memory->ReadBlockUnsafe(binding.gpu_addr, tile_swizzled_data.data(),
                                    image.sparse_tile_size);

        // Unswizzle directly into the staging buffer at the running offset.
        auto tile_output = staging.mapped_span.subspan(total_upload_size, tile_unswizzled_size);
        const auto result = UnswizzleSparseTextureTile(tile_output, tile_swizzled_data, image.info,
                                                       tile_width, tile_height, tile_depth);

        // Describe the buffer -> image copy for this tile.
        all_copies.push_back(BufferImageCopy{
            .buffer_offset = total_upload_size,
            .buffer_size = tile_unswizzled_size,
            .buffer_row_length = result.buffer_row_length,
            .buffer_image_height = result.buffer_image_height,
            .image_subresource = {
                .base_level = 0,
                .base_layer = 0,
                .num_layers = 1,
            },
            .image_offset = {
                static_cast<s32>(coord.width),
                static_cast<s32>(coord.height),
                static_cast<s32>(coord.depth)
            },
            .image_extent = {tile_width, tile_height, tile_depth}
        });
        total_upload_size += tile_unswizzled_size;
    }

    if (!all_copies.empty()) {
        image.UploadMemory(staging, all_copies);
    }
    image.dirty_offsets.clear();
}
template <class P>
template <typename StagingBuffer>
void TextureCache<P>::UploadImageContents(Image& image, StagingBuffer& staging) {
@ -1082,7 +1233,7 @@ void TextureCache<P>::UploadImageContents(Image& image, StagingBuffer& staging)
gpu_memory->ReadBlock(gpu_addr, mapped_span.data(), mapped_span.size_bytes(),
VideoCommon::CacheType::NoTextureCache);
const auto uploads = FullUploadSwizzles(image.info);
runtime.AccelerateImageUpload(image, staging, FixSmallVectorADL(uploads));
runtime.AccelerateImageUpload(image, staging, FixSmallVectorADL(uploads), 0, 0);
return;
}
@ -1311,6 +1462,20 @@ void TextureCache<P>::QueueAsyncDecode(Image& image, ImageId image_id) {
texture_decode_worker.QueueWork(std::move(func));
}
template <class P>
void TextureCache<P>::QueueAsyncUnswizzle(Image& image, ImageId image_id) {
    // Bail out if the image already has decode/unswizzle work in flight.
    const bool busy = True(image.flags & ImageFlagBits::IsDecoding);
    if (busy) {
        return;
    }
    // Mark the image busy so it cannot be queued twice, then enqueue a task
    // snapshotting its current description.
    image.flags |= ImageFlagBits::IsDecoding;
    PendingUnswizzle task{};
    task.image_id = image_id;
    task.info = image.info;
    unswizzle_queue.push_back(std::move(task));
}
template <class P>
void TextureCache<P>::TickAsyncDecode() {
bool has_uploads{};
@ -1336,6 +1501,90 @@ void TextureCache<P>::TickAsyncDecode() {
}
}
template <class P>
void TextureCache<P>::TickAsyncUnswizzle() {
    if (unswizzle_queue.empty()) {
        current_unswizzle_frame = 0;
        return;
    }
    // Don't process every frame - allow more data to accumulate
    if (current_unswizzle_frame++ < 2) {
        return;
    }

    PendingUnswizzle& task = unswizzle_queue.front();
    Image& image = slot_images[task.image_id];

    if (!task.initialized) {
        // Lazily allocate the deferred staging buffer and compute the
        // swizzled byte size of one Z slice (64-byte-aligned stride times the
        // block-height-aligned row count).
        task.total_size = MapSizeBytes(image);
        task.staging_buffer = runtime.UploadStagingBuffer(task.total_size, true);

        const auto& info = image.info;
        const u32 bytes_per_block = BytesPerBlock(info.format);
        const u32 width_blocks = Common::DivCeil(info.size.width, 4u);
        const u32 height_blocks = Common::DivCeil(info.size.height, 4u);
        const u32 stride = Common::AlignUp(width_blocks * bytes_per_block, 64u);
        const u32 aligned_height = Common::AlignUp(height_blocks, 8u << task.info.block.height);

        task.bytes_per_slice = static_cast<size_t>(stride) * aligned_height;
        task.last_submitted_offset = 0;
        task.initialized = true;
    }

    // ToDo: Make these configurable
    const size_t CHUNK_SIZE = 64_MiB;
    const u32 SLICES_PER_BATCH = 512;

    // NOTE: a previous revision kept a function-local static scratch vector
    // here that was resized but never read or written; guest reads go
    // straight into the staging buffer, so that dead (and thread-unsafe)
    // static has been removed.

    // Stream the next chunk of swizzled guest data into staging memory.
    if (task.current_offset < task.total_size) {
        const size_t remaining = task.total_size - task.current_offset;
        const size_t copy_amount = std::min(CHUNK_SIZE, remaining);
        gpu_memory->ReadBlock(image.gpu_addr + task.current_offset,
                              task.staging_buffer.mapped_span.data() + task.current_offset,
                              copy_amount,
                              VideoCommon::CacheType::NoTextureCache);
        task.current_offset += copy_amount;
    }

    // Submit once a full batch of slices is buffered, or on the final chunk.
    const size_t batch_threshold = task.bytes_per_slice * SLICES_PER_BATCH;
    const size_t ready_to_submit = task.current_offset - task.last_submitted_offset;
    const bool is_final_batch = task.current_offset >= task.total_size;
    const bool should_submit = ready_to_submit >= batch_threshold ||
                               (is_final_batch && task.last_submitted_offset < task.total_size);

    if (should_submit) {
        const u32 z_start = static_cast<u32>(task.last_submitted_offset / task.bytes_per_slice);
        const u32 total_depth = image.info.size.depth;
        u32 z_count = static_cast<u32>(ready_to_submit / task.bytes_per_slice);
        // Clamp to the image depth; the staging tail may cover padding past
        // the last slice.
        if (z_start + z_count > total_depth) {
            z_count = total_depth - z_start;
        }
        if (z_count > 0) {
            const auto uploads = FullUploadSwizzles(task.info);
            runtime.AccelerateImageUpload(image, task.staging_buffer, FixSmallVectorADL(uploads),
                                          z_start, z_count);
            task.last_submitted_offset += static_cast<size_t>(z_count) * task.bytes_per_slice;
        }
    }

    // Done once everything has been read and every whole slice submitted.
    if (task.current_offset >= task.total_size &&
        task.last_submitted_offset >= (task.total_size - (task.total_size % task.bytes_per_slice))) {
        runtime.FreeDeferredStagingBuffer(task.staging_buffer);
        image.flags &= ~ImageFlagBits::IsDecoding;
        unswizzle_queue.pop_front();
        current_unswizzle_frame = 0;
    }
}
template <class P>
bool TextureCache<P>::ScaleUp(Image& image) {
const bool has_copy = image.HasScaled();
@ -2423,6 +2672,7 @@ void TextureCache<P>::SynchronizeAliases(ImageId image_id) {
template <class P>
void TextureCache<P>::PrepareImage(ImageId image_id, bool is_modification, bool invalidate) {
Image& image = slot_images[image_id];
runtime.TransitionImageLayout(image);
if (invalidate) {
image.flags &= ~(ImageFlagBits::CpuModified | ImageFlagBits::GpuModified);
if (False(image.flags & ImageFlagBits::Tracked)) {

27
src/video_core/texture_cache/texture_cache_base.h

@ -129,6 +129,17 @@ class TextureCache : public VideoCommon::ChannelSetupCaches<TextureCacheChannelI
using AsyncBuffer = typename P::AsyncBuffer;
using BufferType = typename P::BufferType;
// State for one incremental GPU-unswizzle upload: guest data is streamed
// into a persistent staging buffer and submitted to the compute unswizzle
// pass a batch of Z slices at a time, spread across frames
// (see TickAsyncUnswizzle).
struct PendingUnswizzle {
    ImageId image_id;                 // Target image in slot_images
    VideoCommon::ImageInfo info;      // Snapshot of the image description
    size_t current_offset = 0;        // Bytes read from guest memory so far
    size_t total_size = 0;            // Total guest size of the image
    AsyncBuffer staging_buffer;       // Deferred staging buffer, freed on completion
    size_t last_submitted_offset = 0; // Bytes already handed to the GPU pass
    // Swizzled bytes per Z slice. Was u32, but it is assigned a
    // size_t stride * height product, so keep it size_t to avoid silent
    // narrowing on very large slices.
    size_t bytes_per_slice = 0;
    bool initialized = false;         // Lazy setup done on first tick
};
struct BlitImages {
ImageId dst_id;
ImageId src_id;
@ -212,7 +223,11 @@ public:
void UnmapMemory(DAddr cpu_addr, size_t size);
/// Remove images in a region
void UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t size);
void UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t size, DAddr dev_addr);
/// Basic sparse binding
std::optional<SparseBinding> CalculateSparseBinding(
const Image& image, GPUVAddr gpu_addr, DAddr dev_addr);
/// Blit an image with the given parameters
bool BlitImage(const Tegra::Engines::Fermi2D::Surface& dst,
@ -312,6 +327,10 @@ private:
/// Refresh the contents (pixel data) of an image
void RefreshContents(Image& image, ImageId image_id);
/// Sparse texture partial upload
template <typename StagingBuffer>
void UploadSparseDirtyTiles(Image& image, StagingBuffer& staging);
/// Upload data from guest to an image
template <typename StagingBuffer>
@ -433,6 +452,9 @@ private:
void TrimInactiveSamplers(size_t budget);
std::optional<size_t> QuerySamplerBudget() const;
void QueueAsyncUnswizzle(Image& image, ImageId image_id);
void TickAsyncUnswizzle();
Runtime& runtime;
Tegra::MaxwellDeviceMemoryManager& device_memory;
@ -508,6 +530,9 @@ private:
Common::ThreadWorker texture_decode_worker{1, "TextureDecoder"};
std::vector<std::unique_ptr<AsyncDecodeContext>> async_decodes;
std::deque<PendingUnswizzle> unswizzle_queue;
u8 current_unswizzle_frame;
// Join caching
boost::container::small_vector<ImageId, 4> join_overlap_ids;
std::unordered_set<ImageId> join_overlaps_found;

41
src/video_core/texture_cache/util.cpp

@ -55,6 +55,7 @@ using Tegra::Texture::TextureFormat;
using Tegra::Texture::TextureType;
using Tegra::Texture::TICEntry;
using Tegra::Texture::UnswizzleTexture;
using Tegra::Texture::UnswizzleSubrect;
using VideoCore::Surface::BytesPerBlock;
using VideoCore::Surface::DefaultBlockHeight;
using VideoCore::Surface::DefaultBlockWidth;
@ -922,6 +923,46 @@ boost::container::small_vector<BufferImageCopy, 16> UnswizzleImage(Tegra::Memory
return copies;
}
SparseTileUnswizzleResult UnswizzleSparseTextureTile(std::span<u8> output,
                                                    std::span<const u8> input,
                                                    const ImageInfo& info,
                                                    u32 tile_width,
                                                    u32 tile_height,
                                                    u32 tile_depth) {
    const Extent2D block_size = DefaultBlockSize(info.format);
    const u32 bpp = BytesPerBlock(info.format);
    const u32 width_blocks = (tile_width + block_size.width - 1) / block_size.width;
    const u32 height_blocks = (tile_height + block_size.height - 1) / block_size.height;

    // GOBs (64-byte-wide rows) needed to span one row of compressed blocks.
    const u32 bytes_per_row = width_blocks * bpp;
    const u32 gobs_per_row = (bytes_per_row + 63) / 64;

    // A 64 KiB sparse tile stacks TILE_SIZE / (gobs_per_row * 512) GOBs
    // vertically. Guard both the divisor and the result against zero, and
    // take floor(log2) with bit_width: countr_zero only equals log2 for
    // exact powers of two (countr_zero(42) == 1, not 5) and countr_zero(0)
    // is the full bit count.
    constexpr u32 TILE_SIZE = 65536;
    const u32 gob_row_count = gobs_per_row == 0 ? 1u : gobs_per_row;
    u32 gobs_tall = TILE_SIZE / (gob_row_count * 512);
    if (gobs_tall == 0) {
        gobs_tall = 1;
    }
    const u32 tile_block_height = static_cast<u32>(std::bit_width(gobs_tall)) - 1;

    const u32 pitch_linear = width_blocks * bpp;

    // Unswizzle the whole tile into a tightly packed linear layout.
    UnswizzleSubrect(
        output, input, bpp,
        width_blocks, height_blocks, tile_depth,
        0, 0,
        width_blocks, height_blocks,
        tile_block_height, 0,
        pitch_linear
    );

    // Reported in texels, rounded up to whole compressed blocks, for use as
    // BufferImageCopy::buffer_row_length / buffer_image_height.
    return {
        .buffer_row_length = Common::AlignUp(tile_width, block_size.width),
        .buffer_image_height = Common::AlignUp(tile_height, block_size.height)
    };
}
void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8> output,
std::span<BufferImageCopy> copies) {
u32 output_offset = 0;

14
src/video_core/texture_cache/util.h

@ -30,6 +30,11 @@ struct OverlapResult {
SubresourceExtent resources;
};
// Linear-layout description returned by UnswizzleSparseTextureTile; the
// values feed BufferImageCopy::buffer_row_length / buffer_image_height.
struct SparseTileUnswizzleResult {
    u32 buffer_row_length;   // Row length of the unswizzled tile, in texels
    u32 buffer_image_height; // Image height of the unswizzled tile, in texels
};
[[nodiscard]] u32 CalculateGuestSizeInBytes(const ImageInfo& info) noexcept;
[[nodiscard]] u32 CalculateUnswizzledSizeBytes(const ImageInfo& info) noexcept;
@ -68,7 +73,14 @@ struct OverlapResult {
[[nodiscard]] boost::container::small_vector<BufferImageCopy, 16> UnswizzleImage(
Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const ImageInfo& info,
std::span<const u8> input, std::span<u8> output);
[[nodiscard]] SparseTileUnswizzleResult UnswizzleSparseTextureTile(std::span<u8> output,
std::span<const u8> input,
const ImageInfo& info,
u32 tile_width,
u32 tile_height,
u32 tile_depth);
void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8> output,
std::span<BufferImageCopy> copies);

Loading…
Cancel
Save