diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/IntSetting.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/IntSetting.kt index 8ec498ad22..3926498c25 100644 --- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/IntSetting.kt +++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/IntSetting.kt @@ -47,6 +47,8 @@ enum class IntSetting(override val key: String) : AbstractIntSetting { FAST_CPU_TIME("fast_cpu_time"), CPU_TICKS("cpu_ticks"), FAST_GPU_TIME("fast_gpu_time"), + GPU_UNZWIZZLE_STREAM_SIZE("gpu_unzwizzle_stream_size"), + GPU_UNZWIZZLE_CHUNK_SIZE("gpu_unzwizzle_chunk_size"), BAT_TEMPERATURE_UNIT("bat_temperature_unit"), CABINET_APPLET("cabinet_applet_mode"), CONTROLLER_APPLET("controller_applet_mode"), diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/view/SettingsItem.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/view/SettingsItem.kt index 50143a449e..5b37824451 100644 --- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/view/SettingsItem.kt +++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/view/SettingsItem.kt @@ -655,6 +655,24 @@ abstract class SettingsItem( valuesId = R.array.gpuValues ) ) + put( + SingleChoiceSetting( + IntSetting.GPU_UNZWIZZLE_STREAM_SIZE, + titleId = R.string.gpu_unzwizzle_stream_size, + descriptionId = R.string.gpu_unzwizzle_stream_size_description, + choicesId = R.array.gpuSwizzleEntries, + valuesId = R.array.gpuSwizzleValues + ) + ) + put( + SingleChoiceSetting( + IntSetting.GPU_UNZWIZZLE_CHUNK_SIZE, + titleId = R.string.gpu_unzwizzle_chunk_size, + descriptionId = R.string.gpu_unzwizzle_chunk_size_description, + choicesId = R.array.gpuSwizzleChunkEntries, + valuesId = R.array.gpuSwizzleChunkValues + ) + ) put( SingleChoiceSetting( IntSetting.FAST_CPU_TIME, diff --git 
a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragmentPresenter.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragmentPresenter.kt index dc58e7d23b..c268d3f8ae 100644 --- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragmentPresenter.kt +++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragmentPresenter.kt @@ -280,6 +280,8 @@ class SettingsFragmentPresenter( add(IntSetting.FAST_GPU_TIME.key) add(BooleanSetting.SKIP_CPU_INNER_INVALIDATION.key) add(BooleanSetting.RENDERER_ASYNCHRONOUS_SHADERS.key) + add(IntSetting.GPU_UNZWIZZLE_STREAM_SIZE.key) + add(IntSetting.GPU_UNZWIZZLE_CHUNK_SIZE.key) add(HeaderSetting(R.string.extensions)) diff --git a/src/android/app/src/main/res/values/strings.xml b/src/android/app/src/main/res/values/strings.xml index 4bfa5afd01..b9e95cce34 100644 --- a/src/android/app/src/main/res/values/strings.xml +++ b/src/android/app/src/main/res/values/strings.xml @@ -504,7 +504,10 @@ Skips certain CPU-side cache invalidations during memory updates, reducing CPU usage and improving it\'s performance. This may cause glitches or crashes on some games. Use asynchronous shaders Compiles shaders asynchronously. This may reduce stutters but may also introduce glitches. - + GPU Unswizzle Stream Size + Sets the data limit per frame for unswizzling large textures. Higher values speed up texture loading at the cost of higher frame latency; lower values reduce GPU overhead but may cause visible texture pop-in. + GPU Unswizzle Chunk Size + Defines the number of depth slices processed per batch for 3D textures. Increasing this improves throughput efficiency on powerful GPUs but may cause stuttering or driver timeouts on weaker hardware. 
Extensions Extended Dynamic State diff --git a/src/common/settings.h b/src/common/settings.h index 874ba7aee2..60c19997dd 100644 --- a/src/common/settings.h +++ b/src/common/settings.h @@ -512,6 +512,18 @@ struct Values { SwitchableSetting use_asynchronous_shaders{linkage, false, "use_asynchronous_shaders", Category::RendererHacks}; + + SwitchableSetting gpu_unzwizzle_stream_size{linkage, + GpuUnswizzle::Medium, + "gpu_unzwizzle_stream_size", + Category::RendererHacks, + Specialization::Default}; + + SwitchableSetting gpu_unzwizzle_chunk_size{linkage, + GpuUnswizzleChunk::Medium, + "gpu_unzwizzle_chunk_size", + Category::RendererHacks, + Specialization::Default}; SwitchableSetting dyna_state{linkage, #if defined (_WIN32) diff --git a/src/common/settings_enums.h b/src/common/settings_enums.h index bff5f0ec5a..31e11b7319 100644 --- a/src/common/settings_enums.h +++ b/src/common/settings_enums.h @@ -150,6 +150,8 @@ ENUM(ConsoleMode, Handheld, Docked); ENUM(AppletMode, HLE, LLE); ENUM(SpirvOptimizeMode, Never, OnLoad, Always); ENUM(GpuOverclock, Normal, Medium, High) +ENUM(GpuUnswizzle, VeryLow, Low, Normal, Medium, High) +ENUM(GpuUnswizzleChunk, VeryLow, Low, Normal, Medium, High) ENUM(TemperatureUnits, Celsius, Fahrenheit) ENUM(ExtendedDynamicState, Disabled, EDS1, EDS2, EDS3); diff --git a/src/qt_common/config/shared_translation.cpp b/src/qt_common/config/shared_translation.cpp index a3a720db8e..885a8c02bc 100644 --- a/src/qt_common/config/shared_translation.cpp +++ b/src/qt_common/config/shared_translation.cpp @@ -288,6 +288,16 @@ std::unique_ptr InitializeTranslations(QObject* parent) tr("Fast GPU Time"), tr("Overclocks the emulated GPU to increase dynamic resolution and render " "distance.\nUse 256 for maximal performance and 512 for maximal graphics fidelity.")); + INSERT(Settings, + gpu_unzwizzle_stream_size, + tr("GPU Unswizzle Stream Size"), + tr("Sets the maximum amount of texture data (in MiB) processed per frame.\n" + "Higher values can reduce stutter 
during texture loading but may impact frame consistency.")); + INSERT(Settings, + gpu_unzwizzle_chunk_size, + tr("GPU Unswizzle Chunk Size"), + tr("Determines the number of depth slices processed in a single dispatch.\n" + "Increasing this can improve throughput on high-end GPUs but may cause TDR or driver timeouts on weaker hardware.")); INSERT(Settings, use_vulkan_driver_pipeline_cache, @@ -719,6 +729,22 @@ std::unique_ptr ComboboxEnumeration(QObject* parent) PAIR(GpuOverclock, Medium, tr("Medium (256)")), PAIR(GpuOverclock, High, tr("High (512)")), }}); + translations->insert({Settings::EnumMetadata::Index(), + { + PAIR(GpuUnswizzle, VeryLow, tr("Very Low (4)")), + PAIR(GpuUnswizzle, Low, tr("Low (8)")), + PAIR(GpuUnswizzle, Normal, tr("Normal (16)")), + PAIR(GpuUnswizzle, Medium, tr("Medium (32)")), + PAIR(GpuUnswizzle, High, tr("High (64)")), + }}); + translations->insert({Settings::EnumMetadata::Index(), + { + PAIR(GpuUnswizzleChunk, VeryLow, tr("Very Low (32)")), + PAIR(GpuUnswizzleChunk, Low, tr("Low (64)")), + PAIR(GpuUnswizzleChunk, Normal, tr("Normal (128)")), + PAIR(GpuUnswizzleChunk, Medium, tr("Medium (256)")), + PAIR(GpuUnswizzleChunk, High, tr("High (512)")), + }}); translations->insert({Settings::EnumMetadata::Index(), { diff --git a/src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp b/src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp index 19969688ab..455e99e019 100644 --- a/src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp +++ b/src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp @@ -100,18 +100,9 @@ const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_S const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1u, GOB_SIZE_Y - 1u); // --- Helpers --- -shared uint swizzle_cache[64 * 8]; // Cache 64x8 GOB worth of swizzle data - uint SwizzleOffset(uvec2 pos) { pos &= SWIZZLE_MASK; - uint local_id = gl_LocalInvocationIndex; - - if (local_id < 512u) { - swizzle_cache[local_id] = 
swizzle_table[local_id]; - } - barrier(); - - return swizzle_cache[pos.y * 64u + pos.x]; + return swizzle_table[pos.y * 64u + pos.x]; } uvec4 ReadTexel(uint offset) { diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index a83d2931fd..bc5ab23d06 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -755,15 +755,14 @@ void BlockLinearUnswizzle3DPass::Unswizzle( u32 z_start, u32 z_count) { using namespace VideoCommon::Accelerated; - - // Leaving this hear incase instances are found where slices_needed causes device loss + + // Leaving this here in case instances are found where slices_needed causes device loss // Tune this for a balance between speed and size, I don't own a deck so can't self tune it - // constexpr u32 MAX_BATCH_SLICES = 64; + const u32 MAX_BATCH_SLICES = std::min(z_count, image.info.size.depth); if (!image.has_compute_unswizzle_buffer) { // Allocate exactly what this batch needs - const u32 slices_needed = std::min(z_count, image.info.size.depth); - image.AllocateComputeUnswizzleBuffer(slices_needed); + image.AllocateComputeUnswizzleBuffer(MAX_BATCH_SLICES); } ASSERT(swizzles.size() == 1); @@ -772,12 +771,10 @@ void BlockLinearUnswizzle3DPass::Unswizzle( const u32 blocks_x = (image.info.size.width + 3) / 4; const u32 blocks_y = (image.info.size.height + 3) / 4; - - constexpr u32 SLICES_PER_CHUNK = 64; scheduler.RequestOutsideRenderPassOperationContext(); - for (u32 z_offset = 0; z_offset < z_count; z_offset += SLICES_PER_CHUNK) { - const u32 current_chunk_slices = std::min(SLICES_PER_CHUNK, z_count - z_offset); + for (u32 z_offset = 0; z_offset < z_count; z_offset += MAX_BATCH_SLICES) { + const u32 current_chunk_slices = std::min(MAX_BATCH_SLICES, z_count - z_offset); const u32 current_z_start = z_start + z_offset; UnswizzleChunk(image, swizzled, sw, params, blocks_x, blocks_y, @@ -848,7 +845,11 @@ void
BlockLinearUnswizzle3DPass::UnswizzleChunk( barrier_size, is_first_chunk, out_buffer, dst_image, aspect, image_width, image_height ](vk::CommandBuffer cmdbuf) { - + + if (dst_image == VK_NULL_HANDLE || out_buffer == VK_NULL_HANDLE) { + return; + } + device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data); cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {}); diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index f07699ea70..74a42e5c87 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -70,9 +70,6 @@ TextureCache

::TextureCache(Runtime& runtime_, Tegra::MaxwellDeviceMemoryManag (std::max)((std::min)(device_local_memory - min_vacancy_critical, min_spacing_critical), DEFAULT_CRITICAL_MEMORY)); minimum_memory = static_cast((device_local_memory - mem_threshold) / 2); - - chunk_size = 64_MiB; - slices_per_batch = 64; lowmemorydevice = runtime.IsSteamDeck(); } else { @@ -80,9 +77,6 @@ TextureCache

::TextureCache(Runtime& runtime_, Tegra::MaxwellDeviceMemoryManag critical_memory = DEFAULT_CRITICAL_MEMORY + 1_GiB; minimum_memory = 0; - chunk_size = 32_MiB; - slices_per_batch = 32; - lowmemorydevice = true; } } @@ -1479,18 +1473,38 @@ void TextureCache

::TickAsyncUnswizzle() { task.initialized = true; } + size_t CHUNK_SIZE; + switch (Settings::values.gpu_unzwizzle_stream_size.GetValue()) { + case Settings::GpuUnswizzle::VeryLow: CHUNK_SIZE = 4_MiB; break; + case Settings::GpuUnswizzle::Low: CHUNK_SIZE = 8_MiB; break; + case Settings::GpuUnswizzle::Normal: CHUNK_SIZE = 16_MiB; break; + case Settings::GpuUnswizzle::Medium: CHUNK_SIZE = 32_MiB; break; + case Settings::GpuUnswizzle::High: CHUNK_SIZE = 64_MiB; break; + default: CHUNK_SIZE = 16_MiB; + } + + u32 SLICES_PER_BATCH; + switch (Settings::values.gpu_unzwizzle_chunk_size.GetValue()) { + case Settings::GpuUnswizzleChunk::VeryLow: SLICES_PER_BATCH = 32; break; + case Settings::GpuUnswizzleChunk::Low: SLICES_PER_BATCH = 64; break; + case Settings::GpuUnswizzleChunk::Normal: SLICES_PER_BATCH = 128; break; + case Settings::GpuUnswizzleChunk::Medium: SLICES_PER_BATCH = 256; break; + case Settings::GpuUnswizzleChunk::High: SLICES_PER_BATCH = 512; break; + default: SLICES_PER_BATCH = 128; + } + // Read data if (task.current_offset < task.total_size) { const size_t remaining = task.total_size - task.current_offset; - size_t copy_amount = std::min(chunk_size, remaining); - - if (remaining > chunk_size) { + size_t copy_amount = std::min(CHUNK_SIZE, remaining); + + if (remaining > CHUNK_SIZE) { copy_amount = (copy_amount / task.bytes_per_slice) * task.bytes_per_slice; if (copy_amount == 0) copy_amount = task.bytes_per_slice; } - gpu_memory->ReadBlockUnsafe(image.gpu_addr + task.current_offset, + gpu_memory->ReadBlock(image.gpu_addr + task.current_offset, task.staging_buffer.mapped_span.data() + task.current_offset, copy_amount); task.current_offset += copy_amount; @@ -1500,7 +1514,7 @@ void TextureCache

::TickAsyncUnswizzle() { const u32 complete_slices = static_cast(bytes_ready / task.bytes_per_slice); const bool is_final_batch = task.current_offset >= task.total_size; - if (complete_slices >= slices_per_batch || (is_final_batch && complete_slices > 0)) { + if (complete_slices >= SLICES_PER_BATCH || (is_final_batch && complete_slices > 0)) { const u32 z_start = static_cast(task.last_submitted_offset / task.bytes_per_slice); const u32 z_count = std::min(complete_slices, image.info.size.depth - z_start); @@ -1517,10 +1531,6 @@ void TextureCache

::TickAsyncUnswizzle() { image.flags &= ~ImageFlagBits::IsDecoding; unswizzle_queue.pop_front(); - if (total_used_memory >= expected_memory) { - RunGarbageCollector(); - } - // Wait 4 frames to process the next entry current_unswizzle_frame = 4u; } diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index 7b642850fd..616d77776a 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -475,8 +475,6 @@ private: u64 minimum_memory; u64 expected_memory; u64 critical_memory; - size_t chunk_size; - size_t slices_per_batch; bool lowmemorydevice = false; struct BufferDownload {