Browse Source

Fixed broken shader for BCn unswizzle

Added setting to allow users to adjust the streaming system for sparse textures
pull/3246/head
Forrest Keller 3 months ago
committed by crueter
parent
commit
4b8de68d0b
  1. 2
      src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/IntSetting.kt
  2. 18
      src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/view/SettingsItem.kt
  3. 2
      src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragmentPresenter.kt
  4. 5
      src/android/app/src/main/res/values/strings.xml
  5. 12
      src/common/settings.h
  6. 2
      src/common/settings_enums.h
  7. 26
      src/qt_common/config/shared_translation.cpp
  8. 11
      src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp
  9. 17
      src/video_core/renderer_vulkan/vk_compute_pass.cpp
  10. 38
      src/video_core/texture_cache/texture_cache.h
  11. 2
      src/video_core/texture_cache/texture_cache_base.h

2
src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/IntSetting.kt

@ -47,6 +47,8 @@ enum class IntSetting(override val key: String) : AbstractIntSetting {
FAST_CPU_TIME("fast_cpu_time"), FAST_CPU_TIME("fast_cpu_time"),
CPU_TICKS("cpu_ticks"), CPU_TICKS("cpu_ticks"),
FAST_GPU_TIME("fast_gpu_time"), FAST_GPU_TIME("fast_gpu_time"),
GPU_UNZWIZZLE_STREAM_SIZE("gpu_unzwizzle_stream_size"),
GPU_UNZWIZZLE_CHUNK_SIZE("gpu_unzwizzle_chunk_size"),
BAT_TEMPERATURE_UNIT("bat_temperature_unit"), BAT_TEMPERATURE_UNIT("bat_temperature_unit"),
CABINET_APPLET("cabinet_applet_mode"), CABINET_APPLET("cabinet_applet_mode"),
CONTROLLER_APPLET("controller_applet_mode"), CONTROLLER_APPLET("controller_applet_mode"),

18
src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/view/SettingsItem.kt

@ -655,6 +655,24 @@ abstract class SettingsItem(
valuesId = R.array.gpuValues valuesId = R.array.gpuValues
) )
) )
put(
SingleChoiceSetting(
IntSetting.GPU_UNZWIZZLE_STREAM_SIZE,
titleId = R.string.gpu_unzwizzle_stream_size,
descriptionId = R.string.gpu_unzwizzle_stream_size_description,
choicesId = R.array.gpuSwizzleEntries,
valuesId = R.array.gpuSwizzleValues
)
)
put(
SingleChoiceSetting(
IntSetting.GPU_UNZWIZZLE_CHUNK_SIZE,
titleId = R.string.gpu_unzwizzle_chunk_size,
descriptionId = R.string.gpu_unzwizzle_chunk_size_description,
choicesId = R.array.gpuSwizzleChunkEntries,
valuesId = R.array.gpuSwizzleChunkValues
)
)
put( put(
SingleChoiceSetting( SingleChoiceSetting(
IntSetting.FAST_CPU_TIME, IntSetting.FAST_CPU_TIME,

2
src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragmentPresenter.kt

@ -280,6 +280,8 @@ class SettingsFragmentPresenter(
add(IntSetting.FAST_GPU_TIME.key) add(IntSetting.FAST_GPU_TIME.key)
add(BooleanSetting.SKIP_CPU_INNER_INVALIDATION.key) add(BooleanSetting.SKIP_CPU_INNER_INVALIDATION.key)
add(BooleanSetting.RENDERER_ASYNCHRONOUS_SHADERS.key) add(BooleanSetting.RENDERER_ASYNCHRONOUS_SHADERS.key)
add(IntSetting.GPU_UNZWIZZLE_STREAM_SIZE.key)
add(IntSetting.GPU_UNZWIZZLE_CHUNK_SIZE.key)
add(HeaderSetting(R.string.extensions)) add(HeaderSetting(R.string.extensions))

5
src/android/app/src/main/res/values/strings.xml

@ -504,7 +504,10 @@
<string name="skip_cpu_inner_invalidation_description">Skips certain CPU-side cache invalidations during memory updates, reducing CPU usage and improving it\'s performance. This may cause glitches or crashes on some games.</string> <string name="skip_cpu_inner_invalidation_description">Skips certain CPU-side cache invalidations during memory updates, reducing CPU usage and improving it\'s performance. This may cause glitches or crashes on some games.</string>
<string name="renderer_asynchronous_shaders">Use asynchronous shaders</string> <string name="renderer_asynchronous_shaders">Use asynchronous shaders</string>
<string name="renderer_asynchronous_shaders_description">Compiles shaders asynchronously. This may reduce stutters but may also introduce glitches.</string> <string name="renderer_asynchronous_shaders_description">Compiles shaders asynchronously. This may reduce stutters but may also introduce glitches.</string>
<string name="gpu_unzwizzle_stream_size">GPU Unswizzle Stream Size</string>
<string name="gpu_unzwizzle_stream_size_description">Sets the data limit per frame for unswizzling large textures. Higher values speed up texture loading at the cost of higher frame latency; lower values reduce GPU overhead but may cause visible texture pop-in.</string>
<string name="gpu_unzwizzle_chunk_size">GPU Unswizzle Chunk Size</string>
<string name="gpu_unzwizzle_chunk_size_description">Defines the number of depth slices processed per batch for 3D textures. Increasing this improves throughput efficiency on powerful GPUs but may cause stuttering or driver timeouts on weaker hardware.</string>
<string name="extensions">Extensions</string> <string name="extensions">Extensions</string>
<string name="dyna_state">Extended Dynamic State</string> <string name="dyna_state">Extended Dynamic State</string>

12
src/common/settings.h

@ -513,6 +513,18 @@ struct Values {
SwitchableSetting<bool> use_asynchronous_shaders{linkage, false, "use_asynchronous_shaders", SwitchableSetting<bool> use_asynchronous_shaders{linkage, false, "use_asynchronous_shaders",
Category::RendererHacks}; Category::RendererHacks};
SwitchableSetting<GpuUnswizzle> gpu_unzwizzle_stream_size{linkage,
GpuUnswizzle::Medium,
"gpu_unzwizzle_stream_size",
Category::RendererHacks,
Specialization::Default};
SwitchableSetting<GpuUnswizzleChunk> gpu_unzwizzle_chunk_size{linkage,
GpuUnswizzleChunk::Medium,
"gpu_unzwizzle_chunk_size",
Category::RendererHacks,
Specialization::Default};
SwitchableSetting<ExtendedDynamicState> dyna_state{linkage, SwitchableSetting<ExtendedDynamicState> dyna_state{linkage,
#if defined (_WIN32) #if defined (_WIN32)
ExtendedDynamicState::EDS3, ExtendedDynamicState::EDS3,

2
src/common/settings_enums.h

@ -150,6 +150,8 @@ ENUM(ConsoleMode, Handheld, Docked);
ENUM(AppletMode, HLE, LLE); ENUM(AppletMode, HLE, LLE);
ENUM(SpirvOptimizeMode, Never, OnLoad, Always); ENUM(SpirvOptimizeMode, Never, OnLoad, Always);
ENUM(GpuOverclock, Normal, Medium, High) ENUM(GpuOverclock, Normal, Medium, High)
ENUM(GpuUnswizzle, VeryLow, Low, Normal, Medium, High)
ENUM(GpuUnswizzleChunk, VeryLow, Low, Normal, Medium, High)
ENUM(TemperatureUnits, Celsius, Fahrenheit) ENUM(TemperatureUnits, Celsius, Fahrenheit)
ENUM(ExtendedDynamicState, Disabled, EDS1, EDS2, EDS3); ENUM(ExtendedDynamicState, Disabled, EDS1, EDS2, EDS3);

26
src/qt_common/config/shared_translation.cpp

@ -288,6 +288,16 @@ std::unique_ptr<TranslationMap> InitializeTranslations(QObject* parent)
tr("Fast GPU Time"), tr("Fast GPU Time"),
tr("Overclocks the emulated GPU to increase dynamic resolution and render " tr("Overclocks the emulated GPU to increase dynamic resolution and render "
"distance.\nUse 256 for maximal performance and 512 for maximal graphics fidelity.")); "distance.\nUse 256 for maximal performance and 512 for maximal graphics fidelity."));
INSERT(Settings,
gpu_unzwizzle_stream_size,
tr("GPU Unswizzle Stream Size"),
tr("Sets the maximum amount of texture data (in MiB) processed per frame.\n"
"Higher values can reduce stutter during texture loading but may impact frame consistency."));
INSERT(Settings,
gpu_unzwizzle_chunk_size,
tr("GPU Unswizzle Chunk Size"),
tr("Determines the number of depth slices processed in a single dispatch.\n"
"Increasing this can improve throughput on high-end GPUs but may cause TDR or driver timeouts on weaker hardware."));
INSERT(Settings, INSERT(Settings,
use_vulkan_driver_pipeline_cache, use_vulkan_driver_pipeline_cache,
@ -719,6 +729,22 @@ std::unique_ptr<ComboboxTranslationMap> ComboboxEnumeration(QObject* parent)
PAIR(GpuOverclock, Medium, tr("Medium (256)")), PAIR(GpuOverclock, Medium, tr("Medium (256)")),
PAIR(GpuOverclock, High, tr("High (512)")), PAIR(GpuOverclock, High, tr("High (512)")),
}}); }});
translations->insert({Settings::EnumMetadata<Settings::GpuUnswizzle>::Index(),
{
PAIR(GpuUnswizzle, VeryLow, tr("Very Low (4)")),
PAIR(GpuUnswizzle, Low, tr("Low (8)")),
PAIR(GpuUnswizzle, Normal, tr("Normal (16)")),
PAIR(GpuUnswizzle, Medium, tr("Medium (32)")),
PAIR(GpuUnswizzle, High, tr("High (64)")),
}});
translations->insert({Settings::EnumMetadata<Settings::GpuUnswizzleChunk>::Index(),
{
PAIR(GpuUnswizzleChunk, VeryLow, tr("Very Low (32)")),
PAIR(GpuUnswizzleChunk, Low, tr("Low (64)")),
PAIR(GpuUnswizzleChunk, Normal, tr("Normal (128)")),
PAIR(GpuUnswizzleChunk, Medium, tr("Medium (256)")),
PAIR(GpuUnswizzleChunk, High, tr("High (512)")),
}});
translations->insert({Settings::EnumMetadata<Settings::ExtendedDynamicState>::Index(), translations->insert({Settings::EnumMetadata<Settings::ExtendedDynamicState>::Index(),
{ {

11
src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp

@ -100,18 +100,9 @@ const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_S
const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1u, GOB_SIZE_Y - 1u); const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1u, GOB_SIZE_Y - 1u);
// --- Helpers --- // --- Helpers ---
shared uint swizzle_cache[64 * 8]; // Cache 64x8 GOB worth of swizzle data
uint SwizzleOffset(uvec2 pos) { uint SwizzleOffset(uvec2 pos) {
pos &= SWIZZLE_MASK; pos &= SWIZZLE_MASK;
uint local_id = gl_LocalInvocationIndex;
if (local_id < 512u) {
swizzle_cache[local_id] = swizzle_table[local_id];
}
barrier();
return swizzle_cache[pos.y * 64u + pos.x];
return swizzle_table[pos.y * 64u + pos.x];
} }
uvec4 ReadTexel(uint offset) { uvec4 ReadTexel(uint offset) {

17
src/video_core/renderer_vulkan/vk_compute_pass.cpp

@ -756,14 +756,13 @@ void BlockLinearUnswizzle3DPass::Unswizzle(
{ {
using namespace VideoCommon::Accelerated; using namespace VideoCommon::Accelerated;
// Leaving this hear incase instances are found where slices_needed causes device loss
// Leaving this here in case instances are found where slices_needed causes device loss
// Tune this for a balance between speed and size, I don't own a deck so can't self tune it // Tune this for a balance between speed and size, I don't own a deck so can't self tune it
// constexpr u32 MAX_BATCH_SLICES = 64;
const u32 MAX_BATCH_SLICES = std::min(z_count, image.info.size.depth);
if (!image.has_compute_unswizzle_buffer) { if (!image.has_compute_unswizzle_buffer) {
// Allocate exactly what this batch needs // Allocate exactly what this batch needs
const u32 slices_needed = std::min(z_count, image.info.size.depth);
image.AllocateComputeUnswizzleBuffer(slices_needed);
image.AllocateComputeUnswizzleBuffer(MAX_BATCH_SLICES);
} }
ASSERT(swizzles.size() == 1); ASSERT(swizzles.size() == 1);
@ -773,11 +772,9 @@ void BlockLinearUnswizzle3DPass::Unswizzle(
const u32 blocks_x = (image.info.size.width + 3) / 4; const u32 blocks_x = (image.info.size.width + 3) / 4;
const u32 blocks_y = (image.info.size.height + 3) / 4; const u32 blocks_y = (image.info.size.height + 3) / 4;
constexpr u32 SLICES_PER_CHUNK = 64;
scheduler.RequestOutsideRenderPassOperationContext(); scheduler.RequestOutsideRenderPassOperationContext();
for (u32 z_offset = 0; z_offset < z_count; z_offset += SLICES_PER_CHUNK) {
const u32 current_chunk_slices = std::min(SLICES_PER_CHUNK, z_count - z_offset);
for (u32 z_offset = 0; z_offset < z_count; z_offset += MAX_BATCH_SLICES) {
const u32 current_chunk_slices = std::min(MAX_BATCH_SLICES, z_count - z_offset);
const u32 current_z_start = z_start + z_offset; const u32 current_z_start = z_start + z_offset;
UnswizzleChunk(image, swizzled, sw, params, blocks_x, blocks_y, UnswizzleChunk(image, swizzled, sw, params, blocks_x, blocks_y,
@ -849,6 +846,10 @@ void BlockLinearUnswizzle3DPass::UnswizzleChunk(
image_width, image_height image_width, image_height
](vk::CommandBuffer cmdbuf) { ](vk::CommandBuffer cmdbuf) {
if (dst_image == VK_NULL_HANDLE || out_buffer == VK_NULL_HANDLE) {
return;
}
device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data); device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data);
cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {}); cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {});

38
src/video_core/texture_cache/texture_cache.h

@ -71,18 +71,12 @@ TextureCache<P>::TextureCache(Runtime& runtime_, Tegra::MaxwellDeviceMemoryManag
DEFAULT_CRITICAL_MEMORY)); DEFAULT_CRITICAL_MEMORY));
minimum_memory = static_cast<u64>((device_local_memory - mem_threshold) / 2); minimum_memory = static_cast<u64>((device_local_memory - mem_threshold) / 2);
chunk_size = 64_MiB;
slices_per_batch = 64;
lowmemorydevice = runtime.IsSteamDeck(); lowmemorydevice = runtime.IsSteamDeck();
} else { } else {
expected_memory = DEFAULT_EXPECTED_MEMORY + 512_MiB; expected_memory = DEFAULT_EXPECTED_MEMORY + 512_MiB;
critical_memory = DEFAULT_CRITICAL_MEMORY + 1_GiB; critical_memory = DEFAULT_CRITICAL_MEMORY + 1_GiB;
minimum_memory = 0; minimum_memory = 0;
chunk_size = 32_MiB;
slices_per_batch = 32;
lowmemorydevice = true; lowmemorydevice = true;
} }
} }
@ -1479,18 +1473,38 @@ void TextureCache<P>::TickAsyncUnswizzle() {
task.initialized = true; task.initialized = true;
} }
size_t CHUNK_SIZE;
switch (Settings::values.gpu_unzwizzle_stream_size.GetValue()) {
case Settings::GpuUnswizzle::VeryLow: CHUNK_SIZE = 4_MiB; break;
case Settings::GpuUnswizzle::Low: CHUNK_SIZE = 8_MiB; break;
case Settings::GpuUnswizzle::Normal: CHUNK_SIZE = 16_MiB; break;
case Settings::GpuUnswizzle::Medium: CHUNK_SIZE = 32_MiB; break;
case Settings::GpuUnswizzle::High: CHUNK_SIZE = 64_MiB; break;
default: CHUNK_SIZE = 16_MiB;
}
u32 SLICES_PER_BATCH;
switch (Settings::values.gpu_unzwizzle_chunk_size.GetValue()) {
case Settings::GpuUnswizzleChunk::VeryLow: SLICES_PER_BATCH = 32; break;
case Settings::GpuUnswizzleChunk::Low: SLICES_PER_BATCH = 64; break;
case Settings::GpuUnswizzleChunk::Normal: SLICES_PER_BATCH = 128; break;
case Settings::GpuUnswizzleChunk::Medium: SLICES_PER_BATCH = 256; break;
case Settings::GpuUnswizzleChunk::High: SLICES_PER_BATCH = 512; break;
default: SLICES_PER_BATCH = 128;
}
// Read data // Read data
if (task.current_offset < task.total_size) { if (task.current_offset < task.total_size) {
const size_t remaining = task.total_size - task.current_offset; const size_t remaining = task.total_size - task.current_offset;
size_t copy_amount = std::min(chunk_size, remaining);
size_t copy_amount = std::min(CHUNK_SIZE, remaining);
if (remaining > chunk_size) {
if (remaining > CHUNK_SIZE) {
copy_amount = (copy_amount / task.bytes_per_slice) * task.bytes_per_slice; copy_amount = (copy_amount / task.bytes_per_slice) * task.bytes_per_slice;
if (copy_amount == 0) copy_amount = task.bytes_per_slice; if (copy_amount == 0) copy_amount = task.bytes_per_slice;
} }
gpu_memory->ReadBlockUnsafe(image.gpu_addr + task.current_offset,
gpu_memory->ReadBlock(image.gpu_addr + task.current_offset,
task.staging_buffer.mapped_span.data() + task.current_offset, task.staging_buffer.mapped_span.data() + task.current_offset,
copy_amount); copy_amount);
task.current_offset += copy_amount; task.current_offset += copy_amount;
@ -1500,7 +1514,7 @@ void TextureCache<P>::TickAsyncUnswizzle() {
const u32 complete_slices = static_cast<u32>(bytes_ready / task.bytes_per_slice); const u32 complete_slices = static_cast<u32>(bytes_ready / task.bytes_per_slice);
const bool is_final_batch = task.current_offset >= task.total_size; const bool is_final_batch = task.current_offset >= task.total_size;
if (complete_slices >= slices_per_batch || (is_final_batch && complete_slices > 0)) {
if (complete_slices >= SLICES_PER_BATCH || (is_final_batch && complete_slices > 0)) {
const u32 z_start = static_cast<u32>(task.last_submitted_offset / task.bytes_per_slice); const u32 z_start = static_cast<u32>(task.last_submitted_offset / task.bytes_per_slice);
const u32 z_count = std::min(complete_slices, image.info.size.depth - z_start); const u32 z_count = std::min(complete_slices, image.info.size.depth - z_start);
@ -1517,10 +1531,6 @@ void TextureCache<P>::TickAsyncUnswizzle() {
image.flags &= ~ImageFlagBits::IsDecoding; image.flags &= ~ImageFlagBits::IsDecoding;
unswizzle_queue.pop_front(); unswizzle_queue.pop_front();
if (total_used_memory >= expected_memory) {
RunGarbageCollector();
}
// Wait 4 frames to process the next entry // Wait 4 frames to process the next entry
current_unswizzle_frame = 4u; current_unswizzle_frame = 4u;
} }

2
src/video_core/texture_cache/texture_cache_base.h

@ -475,8 +475,6 @@ private:
u64 minimum_memory; u64 minimum_memory;
u64 expected_memory; u64 expected_memory;
u64 critical_memory; u64 critical_memory;
size_t chunk_size;
size_t slices_per_batch;
bool lowmemorydevice = false; bool lowmemorydevice = false;
struct BufferDownload { struct BufferDownload {

Loading…
Cancel
Save