From 1fc9c3f6ff0362dfaf5b7caecd5a58257e2535f4 Mon Sep 17 00:00:00 2001
From: CamilleLaVey
Date: Wed, 5 Nov 2025 22:10:02 -0400
Subject: [PATCH] Added FTZ optimizations and adjust pipeline float control handling for Qualcomm drivers

---
Review notes (text between "---" and the diffstat is not part of the commit):
 * This patch was rebuilt from a copy whose line breaks and <...> spans were
   lost. Template arguments and the whitespace/line breaks of context lines are
   best-effort reconstructions, and the author e-mail is missing. Run
   `git apply --check` (or regenerate with `git format-patch`) before use. The
   `index` lines still name the blobs of the original revision of this patch.
 * GraphicsPipelineCacheKey changes layout. If keys are written to an on-disk
   pipeline cache, that cache's version needs a bump (not visible here; confirm).
 * The mul+add fallback is keyed on Profile::need_fastmath_off. Check whether
   any other user of the SPIR-V backend already sets that flag for a different
   reason, because it would now get split FMAs as well.
 * Only the graphics pipeline path forwards the per-pipeline profile; compute
   pipelines are not touched by this patch.

 src/common/settings.h                              |  4 +++
 .../spirv/emit_spirv_floating_point.cpp            | 16 +++++++--
 .../renderer_vulkan/vk_graphics_pipeline.h         |  8 ++++-
 .../renderer_vulkan/vk_pipeline_cache.cpp          | 38 +++++++++++++++++++++-
 4 files changed, 61 insertions(+), 5 deletions(-)

diff --git a/src/common/settings.h b/src/common/settings.h
index 0b61da88f9..c52c4ca345 100644
--- a/src/common/settings.h
+++ b/src/common/settings.h
@@ -547,6 +547,10 @@ struct Values {
                                                  Specialization::Scalar};
     SwitchableSetting<bool> force_unsupported_extensions{linkage, false, "force_unsupported_extensions",
                                                          Category::RendererExtensions};
+    // Developer toggle for FTZ (flush-to-zero) testing. Defaults to false for safety.
+    // It only takes effect on the Qualcomm proprietary Vulkan driver, where it lets
+    // the pipeline cache select FTZ per pipeline and allow fast-math when emitting SPIR-V.
+    SwitchableSetting<bool> enable_ftz{linkage, false, "enable_ftz", Category::RendererExtensions};
     SwitchableSetting<bool> provoking_vertex{linkage, false, "provoking_vertex", Category::RendererExtensions};
     SwitchableSetting<bool> descriptor_indexing{linkage, false, "descriptor_indexing", Category::RendererExtensions};
     SwitchableSetting<bool> sample_shading{linkage, false, "sample_shading", Category::RendererExtensions, Specialization::Paired};
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_floating_point.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_floating_point.cpp
index d921913b4a..9f3ceb09ed 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_floating_point.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_floating_point.cpp
@@ -73,15 +73,25 @@ Id EmitFPAdd64(EmitContext& ctx, IR::Inst* inst, Id a, Id b) {
 }
 
+// Emits a * b + c for the given float type. A single OpFma is used unless
+// ctx.profile.need_fastmath_off is set; in that case a separate OpFMul and
+// OpFAdd are emitted and only the final add is passed through Decorate.
+static Id FusedOrSplitFma(EmitContext& ctx, IR::Inst* inst, Id type, Id a, Id b, Id c) {
+    if (ctx.profile.need_fastmath_off) {
+        const Id product{ctx.OpFMul(type, a, b)};
+        return Decorate(ctx, inst, ctx.OpFAdd(type, product, c));
+    }
+    return Decorate(ctx, inst, ctx.OpFma(type, a, b, c));
+}
+
 Id EmitFPFma16(EmitContext& ctx, IR::Inst* inst, Id a, Id b, Id c) {
-    return Decorate(ctx, inst, ctx.OpFma(ctx.F16[1], a, b, c));
+    return FusedOrSplitFma(ctx, inst, ctx.F16[1], a, b, c);
 }
 
 Id EmitFPFma32(EmitContext& ctx, IR::Inst* inst, Id a, Id b, Id c) {
-    return Decorate(ctx, inst, ctx.OpFma(ctx.F32[1], a, b, c));
+    return FusedOrSplitFma(ctx, inst, ctx.F32[1], a, b, c);
 }
 
 Id EmitFPFma64(EmitContext& ctx, IR::Inst* inst, Id a, Id b, Id c) {
-    return Decorate(ctx, inst, ctx.OpFma(ctx.F64[1], a, b, c));
+    return FusedOrSplitFma(ctx, inst, ctx.F64[1], a, b, c);
 }
 
 Id EmitFPMax32(EmitContext& ctx, Id a, Id b) {
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h
index 650c8e07ed..6703c8d8c5 100644
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h
@@ -31,6 +31,12 @@ namespace Vulkan {
 struct GraphicsPipelineCacheKey {
     std::array<u64, 6> unique_hashes;
+    // Per-pipeline float control choices, selected at pipeline key time
+    // (0 = disabled, 1 = enabled). Kept ahead of `state`, since Size() counts only
+    // state.Size() bytes of it, and 32 bits wide each so the pair adds 8 bytes
+    // without padding for the static_assert below (TODO confirm state alignment).
+    uint32_t use_ftz_f32{};
+    uint32_t use_ftz_f16{};
     FixedPipelineState state;
 
     size_t Hash() const noexcept;
 
@@ -41,7 +47,7 @@ struct GraphicsPipelineCacheKey {
     }
 
     size_t Size() const noexcept {
-        return sizeof(unique_hashes) + state.Size();
+        return sizeof(unique_hashes) + sizeof(use_ftz_f32) + sizeof(use_ftz_f16) + state.Size();
     }
 };
 static_assert(std::has_unique_object_representations_v<GraphicsPipelineCacheKey>);
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index fbcad44d5f..22245f91f6 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -446,6 +446,34 @@ GraphicsPipeline* PipelineCache::CurrentGraphicsPipeline() {
     }
     graphics_key.state.Refresh(*maxwell3d, dynamic_features);
 
+    // Decide per-pipeline FTZ (flush-to-zero) usage from the device float-controls
+    // properties and the enable_ftz developer toggle. FTZ is gated to the Qualcomm
+    // proprietary driver for initial testing to avoid widespread regressions.
+    const bool allow_ftz = Settings::values.enable_ftz.GetValue() &&
+                           device.GetDriverID() == VK_DRIVER_ID_QUALCOMM_PROPRIETARY &&
+                           device.IsKhrShaderFloatControlsSupported();
+    const auto& float_control = device.FloatControlProperties();
+    const auto independence = float_control.denormBehaviorIndependence;
+    const bool denorm_indep_all = independence == VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL;
+    const bool denorm_indep_32 =
+        denorm_indep_all || independence == VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_32_BIT_ONLY;
+
+    graphics_key.use_ftz_f32 =
+        (allow_ftz && denorm_indep_32 && float_control.shaderDenormFlushToZeroFloat32 == VK_TRUE) ? 1 : 0;
+    graphics_key.use_ftz_f16 =
+        (allow_ftz && denorm_indep_all && float_control.shaderDenormFlushToZeroFloat16 == VK_TRUE) ? 1 : 0;
+
+    if (graphics_key.use_ftz_f32 != 0 || graphics_key.use_ftz_f16 != 0) {
+        // This lookup is expected to run once per draw (TODO confirm), so report the
+        // selection a single time instead of on every call.
+        static bool ftz_selection_logged = false;
+        if (!ftz_selection_logged) {
+            ftz_selection_logged = true;
+            LOG_INFO(Render_Vulkan, "Enabling per-pipeline FTZ (fast-math) for Qualcomm device: f32={} f16={}",
+                     graphics_key.use_ftz_f32, graphics_key.use_ftz_f16);
+        }
+    }
+
     if (current_pipeline) {
         GraphicsPipeline* const next{current_pipeline->Next(graphics_key)};
         if (next) {
@@ -690,7 +718,15 @@ std::unique_ptr<GraphicsPipeline> PipelineCache::CreateGraphicsPipeline(
 
         const auto runtime_info{MakeRuntimeInfo(programs, key, program, previous_stage)};
         ConvertLegacyToGeneric(program, runtime_info);
-        const std::vector<u32> code{EmitSPIRV(profile, runtime_info, program, binding, this->optimize_spirv_output)};
+        // Start from the cache-wide profile and only relax it for pipelines whose key
+        // selected FTZ. Pipelines without FTZ keep the profile untouched instead of
+        // having need_fastmath_off forced on, so the default enable_ftz=false setup
+        // and non-Qualcomm drivers are not affected by this override.
+        Shader::Profile emit_profile = profile;
+        if (key.use_ftz_f32 != 0) {
+            emit_profile.need_fastmath_off = false;
+        }
+        const std::vector<u32> code{EmitSPIRV(emit_profile, runtime_info, program, binding, this->optimize_spirv_output)};
         device.SaveShader(code);
         modules[stage_index] = BuildShader(device, code);
         if (device.HasDebuggingToolAttached()) {