Fixed broken shader for BCn unswizzle

Added setting to allow users to adjust the streaming system for sparse textures
3 months ago · 4b8de68d0b
11 changed files with 103 additions and 38 deletions
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/IntSetting.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/IntSetting.kt
@ -47,6 +47,8 @@ enum class IntSetting(override val key: String) : AbstractIntSetting {
    FAST_CPU_TIME("fast_cpu_time"),
    CPU_TICKS("cpu_ticks"),
    FAST_GPU_TIME("fast_gpu_time"),
    GPU_UNZWIZZLE_STREAM_SIZE("gpu_unzwizzle_stream_size"),
    GPU_UNZWIZZLE_CHUNK_SIZE("gpu_unzwizzle_chunk_size"),
    BAT_TEMPERATURE_UNIT("bat_temperature_unit"),
    CABINET_APPLET("cabinet_applet_mode"),
    CONTROLLER_APPLET("controller_applet_mode"),
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/view/SettingsItem.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/view/SettingsItem.kt
@ -655,6 +655,24 @@ abstract class SettingsItem(
                    valuesId = R.array.gpuValues
                )
            )
            put(
                SingleChoiceSetting(
                    IntSetting.GPU_UNZWIZZLE_STREAM_SIZE,
                    titleId = R.string.gpu_unzwizzle_stream_size,
                    descriptionId = R.string.gpu_unzwizzle_stream_size_description,
                    choicesId = R.array.gpuSwizzleEntries,
                    valuesId = R.array.gpuSwizzleValues
                )
            )
            put(
                SingleChoiceSetting(
                    IntSetting.GPU_UNZWIZZLE_CHUNK_SIZE,
                    titleId = R.string.gpu_unzwizzle_chunk_size,
                    descriptionId = R.string.gpu_unzwizzle_chunk_size_description,
                    choicesId = R.array.gpuSwizzleChunkEntries,
                    valuesId = R.array.gpuSwizzleChunkValues
                )
            )
            put(
                SingleChoiceSetting(
                    IntSetting.FAST_CPU_TIME,
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragmentPresenter.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragmentPresenter.kt
@ -280,6 +280,8 @@ class SettingsFragmentPresenter(
            add(IntSetting.FAST_GPU_TIME.key)
            add(BooleanSetting.SKIP_CPU_INNER_INVALIDATION.key)
            add(BooleanSetting.RENDERER_ASYNCHRONOUS_SHADERS.key)
            add(IntSetting.GPU_UNZWIZZLE_STREAM_SIZE.key)
            add(IntSetting.GPU_UNZWIZZLE_CHUNK_SIZE.key)
            add(HeaderSetting(R.string.extensions))
--- a/src/android/app/src/main/res/values/strings.xml
+++ b/src/android/app/src/main/res/values/strings.xml
@ -504,7 +504,10 @@
    <string name="skip_cpu_inner_invalidation_description">Skips certain CPU-side cache invalidations during memory updates, reducing CPU usage and improving its performance. This may cause glitches or crashes on some games.</string>
    <string name="renderer_asynchronous_shaders">Use asynchronous shaders</string>
    <string name="renderer_asynchronous_shaders_description">Compiles shaders asynchronously. This may reduce stutters but may also introduce glitches.</string>
    <string name="gpu_unzwizzle_stream_size">GPU Unswizzle Stream Size</string>
    <string name="gpu_unzwizzle_stream_size_description">Sets the data limit per frame for unswizzling large textures. Higher values speed up texture loading at the cost of higher frame latency; lower values reduce GPU overhead but may cause visible texture pop-in.</string>
    <string name="gpu_unzwizzle_chunk_size">GPU Unswizzle Chunk Size</string>
    <string name="gpu_unzwizzle_chunk_size_description">Defines the number of depth slices processed per batch for 3D textures. Increasing this improves throughput efficiency on powerful GPUs but may cause stuttering or driver timeouts on weaker hardware.</string>
    <string name="extensions">Extensions</string>
    <string name="dyna_state">Extended Dynamic State</string>
--- a/src/common/settings.h
+++ b/src/common/settings.h
@ -513,6 +513,18 @@ struct Values {
    SwitchableSetting<bool> use_asynchronous_shaders{linkage, false, "use_asynchronous_shaders",
                                                     Category::RendererHacks};
    // Upper bound on how much texture data the async unswizzle path reads per tick.
    // TextureCache::TickAsyncUnswizzle maps the enum to a byte budget:
    // VeryLow = 4 MiB, Low = 8 MiB, Normal = 16 MiB, Medium = 32 MiB (default), High = 64 MiB.
    // NOTE: the key is spelled "unzwizzle"; the Android IntSetting uses the same string,
    // so the spelling must stay in sync if it is ever corrected.
    SwitchableSetting<GpuUnswizzle> gpu_unzwizzle_stream_size{linkage,
                                                  GpuUnswizzle::Medium,
                                                  "gpu_unzwizzle_stream_size",
                                                  Category::RendererHacks,
                                                  Specialization::Default};
    // Number of complete depth slices that must be staged before a batch is submitted
    // for unswizzling. TextureCache::TickAsyncUnswizzle maps the enum to a slice count:
    // VeryLow = 32, Low = 64, Normal = 128, Medium = 256 (default), High = 512.
    // NOTE: the key is spelled "unzwizzle"; the Android IntSetting uses the same string,
    // so the spelling must stay in sync if it is ever corrected.
    SwitchableSetting<GpuUnswizzleChunk> gpu_unzwizzle_chunk_size{linkage,
                                                  GpuUnswizzleChunk::Medium,
                                                  "gpu_unzwizzle_chunk_size",
                                                  Category::RendererHacks,
                                                  Specialization::Default};
    SwitchableSetting<ExtendedDynamicState> dyna_state{linkage,
 #if defined (_WIN32)
                                           ExtendedDynamicState::EDS3,
--- a/src/common/settings_enums.h
+++ b/src/common/settings_enums.h
@ -150,6 +150,8 @@ ENUM(ConsoleMode, Handheld, Docked);
 ENUM(AppletMode, HLE, LLE);
 ENUM(SpirvOptimizeMode, Never, OnLoad, Always);
 ENUM(GpuOverclock, Normal, Medium, High)
 // Per-tick streaming budget for async texture unswizzling; the texture cache maps
 // VeryLow/Low/Normal/Medium/High to 4/8/16/32/64 MiB.
 ENUM(GpuUnswizzle, VeryLow, Low, Normal, Medium, High)
 // Depth slices per unswizzle batch; the texture cache maps
 // VeryLow/Low/Normal/Medium/High to 32/64/128/256/512 slices.
 ENUM(GpuUnswizzleChunk, VeryLow, Low, Normal, Medium, High)
 ENUM(TemperatureUnits, Celsius, Fahrenheit)
 ENUM(ExtendedDynamicState, Disabled, EDS1, EDS2, EDS3);
--- a/src/qt_common/config/shared_translation.cpp
+++ b/src/qt_common/config/shared_translation.cpp
@ -288,6 +288,16 @@ std::unique_ptr<TranslationMap> InitializeTranslations(QObject* parent)
           tr("Fast GPU Time"),
           tr("Overclocks the emulated GPU to increase dynamic resolution and render "
              "distance.\nUse 256 for maximal performance and 512 for maximal graphics fidelity."));
    INSERT(Settings,
           gpu_unzwizzle_stream_size,
           tr("GPU Unswizzle Stream Size"),
           tr("Sets the maximum amount of texture data (in MiB) processed per frame.\n"
              "Higher values can reduce stutter during texture loading but may impact frame consistency."));
    INSERT(Settings,
           gpu_unzwizzle_chunk_size,
           tr("GPU Unswizzle Chunk Size"),
           tr("Determines the number of depth slices processed in a single dispatch.\n"
              "Increasing this can improve throughput on high-end GPUs but may cause TDR or driver timeouts on weaker hardware."));
    INSERT(Settings,
           use_vulkan_driver_pipeline_cache,
@ -719,6 +729,22 @@ std::unique_ptr<ComboboxTranslationMap> ComboboxEnumeration(QObject* parent)
                              PAIR(GpuOverclock, Medium, tr("Medium (256)")),
                              PAIR(GpuOverclock, High, tr("High (512)")),
                          }});
    translations->insert({Settings::EnumMetadata<Settings::GpuUnswizzle>::Index(),
                          {
                              PAIR(GpuUnswizzle, VeryLow, tr("Very Low (4)")),
                              PAIR(GpuUnswizzle, Low, tr("Low (8)")),
                              PAIR(GpuUnswizzle, Normal, tr("Normal (16)")),
                              PAIR(GpuUnswizzle, Medium, tr("Medium (32)")),
                              PAIR(GpuUnswizzle, High, tr("High (64)")),
                          }});
    translations->insert({Settings::EnumMetadata<Settings::GpuUnswizzleChunk>::Index(),
                          {
                              PAIR(GpuUnswizzleChunk, VeryLow, tr("Very Low (32)")),
                              PAIR(GpuUnswizzleChunk, Low, tr("Low (64)")),
                              PAIR(GpuUnswizzleChunk, Normal, tr("Normal (128)")),
                              PAIR(GpuUnswizzleChunk, Medium, tr("Medium (256)")),
                              PAIR(GpuUnswizzleChunk, High, tr("High (512)")),
                          }});
    translations->insert({Settings::EnumMetadata<Settings::ExtendedDynamicState>::Index(),
                          {
--- a/src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp
+++ b/src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp
@ -100,18 +100,9 @@ const uint GOB_SIZE_SHIFT   = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_S
 const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1u, GOB_SIZE_Y - 1u);
 // --- Helpers ---
 shared uint swizzle_cache[64 * 8]; // Cache 64x8 GOB worth of swizzle data
 // Returns the swizzle_table entry for a texel position.
 //
 // `pos` is first reduced with SWIZZLE_MASK (GOB_SIZE_X - 1, GOB_SIZE_Y - 1) so that only
 // the position inside the current GOB selects the entry; rows of the table are 64 wide.
 //
 // The table is read directly. Staging it in shared memory from inside this helper required
 // a barrier() here, which is only well-defined when every invocation of the workgroup
 // reaches the call in uniform control flow, and it repeated the copy on every lookup.
 uint SwizzleOffset(uvec2 pos) {
    pos &= SWIZZLE_MASK;
    return swizzle_table[pos.y * 64u + pos.x];
 }
 uvec4 ReadTexel(uint offset) {
--- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
@ -756,14 +756,13 @@ void BlockLinearUnswizzle3DPass::Unswizzle(
 {
    using namespace VideoCommon::Accelerated;
    // Leaving this hear incase instances are found where slices_needed causes device loss
    // Leaving this here in case instances are found where slices_needed causes device loss
    // Tune this for a balance between speed and size, I don't own a deck so can't self tune it
    // constexpr u32 MAX_BATCH_SLICES = 64;
    const u32 MAX_BATCH_SLICES = std::min(z_count, image.info.size.depth);
    if (!image.has_compute_unswizzle_buffer) {
        // Allocate exactly what this batch needs
        const u32 slices_needed = std::min(z_count, image.info.size.depth);
        image.AllocateComputeUnswizzleBuffer(slices_needed);
        image.AllocateComputeUnswizzleBuffer(MAX_BATCH_SLICES);
    }
    ASSERT(swizzles.size() == 1);
@ -773,11 +772,9 @@ void BlockLinearUnswizzle3DPass::Unswizzle(
    const u32 blocks_x = (image.info.size.width  + 3) / 4;
    const u32 blocks_y = (image.info.size.height + 3) / 4;
    constexpr u32 SLICES_PER_CHUNK = 64;
    scheduler.RequestOutsideRenderPassOperationContext();
    for (u32 z_offset = 0; z_offset < z_count; z_offset += SLICES_PER_CHUNK) {
        const u32 current_chunk_slices = std::min(SLICES_PER_CHUNK, z_count - z_offset);
    for (u32 z_offset = 0; z_offset < z_count; z_offset += MAX_BATCH_SLICES) {
        const u32 current_chunk_slices = std::min(MAX_BATCH_SLICES, z_count - z_offset);
        const u32 current_z_start = z_start + z_offset;
        UnswizzleChunk(image, swizzled, sw, params, blocks_x, blocks_y,
@ -849,6 +846,10 @@ void BlockLinearUnswizzle3DPass::UnswizzleChunk(
                      image_width, image_height
                      ](vk::CommandBuffer cmdbuf) {
        if (dst_image == VK_NULL_HANDLE || out_buffer == VK_NULL_HANDLE) {
            return;
        }
        device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data);
        cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
        cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {});
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@ -71,18 +71,12 @@ TextureCache<P>::TextureCache(Runtime& runtime_, Tegra::MaxwellDeviceMemoryManag
                     DEFAULT_CRITICAL_MEMORY));
        minimum_memory = static_cast<u64>((device_local_memory - mem_threshold) / 2);
        chunk_size = 64_MiB;
        slices_per_batch = 64;
        lowmemorydevice = runtime.IsSteamDeck();
    } else {
        expected_memory = DEFAULT_EXPECTED_MEMORY + 512_MiB;
        critical_memory = DEFAULT_CRITICAL_MEMORY + 1_GiB;
        minimum_memory = 0;
        chunk_size = 32_MiB;
        slices_per_batch = 32;
        lowmemorydevice = true;
    }
 }
@ -1479,18 +1473,38 @@ void TextureCache<P>::TickAsyncUnswizzle() {
        task.initialized = true;
    }
    // Translate the user-facing gpu_unzwizzle_stream_size setting into the byte budget
    // that caps how much of the task is read on this tick (see copy_amount below).
    size_t CHUNK_SIZE;
    switch (Settings::values.gpu_unzwizzle_stream_size.GetValue()) {
        case Settings::GpuUnswizzle::VeryLow: CHUNK_SIZE = 4_MiB; break;
        case Settings::GpuUnswizzle::Low:     CHUNK_SIZE = 8_MiB; break;
        case Settings::GpuUnswizzle::Normal:  CHUNK_SIZE = 16_MiB; break;
        case Settings::GpuUnswizzle::Medium:  CHUNK_SIZE = 32_MiB; break;
        case Settings::GpuUnswizzle::High:    CHUNK_SIZE = 64_MiB; break;
        // Any value outside the enum falls back to the Normal budget.
        default:                              CHUNK_SIZE = 16_MiB;
    }
    // Translate the user-facing gpu_unzwizzle_chunk_size setting into the number of
    // complete slices that must be ready before a batch is submitted.
    u32 SLICES_PER_BATCH;
    switch (Settings::values.gpu_unzwizzle_chunk_size.GetValue()) {
        case Settings::GpuUnswizzleChunk::VeryLow: SLICES_PER_BATCH = 32; break;
        case Settings::GpuUnswizzleChunk::Low:     SLICES_PER_BATCH = 64; break;
        case Settings::GpuUnswizzleChunk::Normal:  SLICES_PER_BATCH = 128; break;
        case Settings::GpuUnswizzleChunk::Medium:  SLICES_PER_BATCH = 256; break;
        case Settings::GpuUnswizzleChunk::High:    SLICES_PER_BATCH = 512; break;
        // Any value outside the enum falls back to the Normal slice count.
        default:                                   SLICES_PER_BATCH = 128;
    }
    // Read data
    if (task.current_offset < task.total_size) {
        const size_t remaining = task.total_size - task.current_offset;
        size_t copy_amount = std::min(chunk_size, remaining);
        size_t copy_amount = std::min(CHUNK_SIZE, remaining);
        if (remaining > chunk_size) {
        if (remaining > CHUNK_SIZE) {
            copy_amount = (copy_amount / task.bytes_per_slice) * task.bytes_per_slice;
            if (copy_amount == 0) copy_amount = task.bytes_per_slice;
        }
        gpu_memory->ReadBlockUnsafe(image.gpu_addr + task.current_offset, 
        gpu_memory->ReadBlock(image.gpu_addr + task.current_offset, 
                              task.staging_buffer.mapped_span.data() + task.current_offset, 
                              copy_amount);
        task.current_offset += copy_amount;
@ -1500,7 +1514,7 @@ void TextureCache<P>::TickAsyncUnswizzle() {
    const u32 complete_slices = static_cast<u32>(bytes_ready / task.bytes_per_slice);
    const bool is_final_batch = task.current_offset >= task.total_size;
    if (complete_slices >= slices_per_batch || (is_final_batch && complete_slices > 0)) {
    if (complete_slices >= SLICES_PER_BATCH || (is_final_batch && complete_slices > 0)) {
        const u32 z_start = static_cast<u32>(task.last_submitted_offset / task.bytes_per_slice);
        const u32 z_count = std::min(complete_slices, image.info.size.depth - z_start);
@ -1517,10 +1531,6 @@ void TextureCache<P>::TickAsyncUnswizzle() {
        image.flags &= ~ImageFlagBits::IsDecoding;
        unswizzle_queue.pop_front();
        if (total_used_memory >= expected_memory) {
            RunGarbageCollector();
        }
        // Wait 4 frames to process the next entry
        current_unswizzle_frame = 4u;
    }
--- a/src/video_core/texture_cache/texture_cache_base.h
+++ b/src/video_core/texture_cache/texture_cache_base.h
@ -475,8 +475,6 @@ private:
    u64 minimum_memory;
    u64 expected_memory;
    u64 critical_memory;
    size_t chunk_size;
    size_t slices_per_batch;
    bool lowmemorydevice = false;
    struct BufferDownload {