From 535b33bc6be608b3ffe1ad1abaf631cce8fc5d19 Mon Sep 17 00:00:00 2001 From: CamilleLaVey Date: Thu, 27 Nov 2025 14:02:03 -0400 Subject: [PATCH] [spv, qcom] Implement warp intrinsics support --- .../backend/spirv/emit_spirv.cpp | 15 ++++- .../backend/spirv/emit_spirv_warp.cpp | 62 +++++++++++++++++++ .../backend/spirv/spirv_emit_context.cpp | 10 +-- src/shader_recompiler/profile.h | 13 ++++ .../renderer_opengl/gl_shader_cache.cpp | 1 + .../renderer_vulkan/vk_compute_pass.cpp | 12 ++-- .../renderer_vulkan/vk_pipeline_cache.cpp | 41 ++++++++++++ .../vulkan_common/vulkan_device.cpp | 43 +++++++++++++ src/video_core/vulkan_common/vulkan_device.h | 21 ++++++- 9 files changed, 204 insertions(+), 14 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.cpp b/src/shader_recompiler/backend/spirv/emit_spirv.cpp index 313a1deb30..4aa211089f 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv.cpp @@ -11,6 +11,7 @@ #include #include +#include "common/logging/log.h" #include "common/settings.h" #include "shader_recompiler/backend/spirv/emit_spirv.h" #include "shader_recompiler/backend/spirv/emit_spirv_instructions.h" @@ -439,15 +440,23 @@ void SetupCapabilities(const Profile& profile, const Info& info, EmitContext& ct ctx.AddExtension("SPV_KHR_shader_draw_parameters"); ctx.AddCapability(spv::Capability::DrawParameters); } - if ((info.uses_subgroup_vote || info.uses_subgroup_invocation_id || - info.uses_subgroup_shuffles) && - profile.support_vote) { + const bool stage_supports_warp = profile.SupportsWarpIntrinsics(ctx.stage); + const bool needs_warp_intrinsics = info.uses_subgroup_vote || + info.uses_subgroup_invocation_id || + info.uses_subgroup_shuffles; + + if (needs_warp_intrinsics && profile.support_vote && stage_supports_warp) { ctx.AddCapability(spv::Capability::GroupNonUniformBallot); ctx.AddCapability(spv::Capability::GroupNonUniformShuffle); if 
(!profile.warp_size_potentially_larger_than_guest) { // vote ops are only used when not taking the long path ctx.AddCapability(spv::Capability::GroupNonUniformVote); } + } else if (needs_warp_intrinsics && !stage_supports_warp) { + LOG_WARNING(Shader, + "Warp intrinsics requested in stage {} but the device does not report subgroup " + "support; falling back to scalar approximations", + static_cast(ctx.stage)); } if (info.uses_int64_bit_atomics && profile.support_int64_atomics) { ctx.AddCapability(spv::Capability::Int64Atomics); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp index 77ff8c5731..d9850965f7 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + // SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later @@ -78,9 +81,25 @@ Id AddPartitionBase(EmitContext& ctx, Id thread_id) { const Id partition_base{ctx.OpShiftLeftLogical(ctx.U32[1], partition_idx, ctx.Const(5u))}; return ctx.OpIAdd(ctx.U32[1], thread_id, partition_base); } + +bool SupportsWarpIntrinsics(const EmitContext& ctx) { + return ctx.profile.SupportsWarpIntrinsics(ctx.stage); +} + +void SetAlwaysInBounds(EmitContext& ctx, IR::Inst* inst) { + SetInBoundsFlag(inst, ctx.true_value); +} + +Id FallbackBallotMask(EmitContext& ctx, Id pred) { + const Id full_mask{ctx.Const(0xFFFFFFFFu)}; + return ctx.OpSelect(ctx.U32[1], pred, full_mask, ctx.u32_zero_value); +} } // Anonymous namespace Id EmitLaneId(EmitContext& ctx) { + if (!SupportsWarpIntrinsics(ctx)) { + return ctx.u32_zero_value; + } const Id id{GetThreadId(ctx)}; if (!ctx.profile.warp_size_potentially_larger_than_guest) { return id; @@ -89,6 +108,9 @@ Id EmitLaneId(EmitContext& ctx) { } Id EmitVoteAll(EmitContext& ctx, 
Id pred) { + if (!SupportsWarpIntrinsics(ctx)) { + return pred; + } if (!ctx.profile.warp_size_potentially_larger_than_guest) { return ctx.OpGroupNonUniformAll(ctx.U1, SubgroupScope(ctx), pred); } @@ -102,6 +124,9 @@ Id EmitVoteAll(EmitContext& ctx, Id pred) { } Id EmitVoteAny(EmitContext& ctx, Id pred) { + if (!SupportsWarpIntrinsics(ctx)) { + return pred; + } if (!ctx.profile.warp_size_potentially_larger_than_guest) { return ctx.OpGroupNonUniformAny(ctx.U1, SubgroupScope(ctx), pred); } @@ -115,6 +140,9 @@ Id EmitVoteAny(EmitContext& ctx, Id pred) { } Id EmitVoteEqual(EmitContext& ctx, Id pred) { + if (!SupportsWarpIntrinsics(ctx)) { + return pred; + } if (!ctx.profile.warp_size_potentially_larger_than_guest) { return ctx.OpGroupNonUniformAllEqual(ctx.U1, SubgroupScope(ctx), pred); } @@ -129,6 +157,9 @@ Id EmitVoteEqual(EmitContext& ctx, Id pred) { } Id EmitSubgroupBallot(EmitContext& ctx, Id pred) { + if (!SupportsWarpIntrinsics(ctx)) { + return FallbackBallotMask(ctx, pred); + } const Id ballot{ctx.OpGroupNonUniformBallot(ctx.U32[4], SubgroupScope(ctx), pred)}; if (!ctx.profile.warp_size_potentially_larger_than_guest) { return ctx.OpCompositeExtract(ctx.U32[1], ballot, 0U); @@ -137,27 +168,46 @@ Id EmitSubgroupBallot(EmitContext& ctx, Id pred) { } Id EmitSubgroupEqMask(EmitContext& ctx) { + if (!SupportsWarpIntrinsics(ctx)) { + return ctx.u32_zero_value; + } return LoadMask(ctx, ctx.subgroup_mask_eq); } Id EmitSubgroupLtMask(EmitContext& ctx) { + if (!SupportsWarpIntrinsics(ctx)) { + return ctx.u32_zero_value; + } return LoadMask(ctx, ctx.subgroup_mask_lt); } Id EmitSubgroupLeMask(EmitContext& ctx) { + if (!SupportsWarpIntrinsics(ctx)) { + return ctx.u32_zero_value; + } return LoadMask(ctx, ctx.subgroup_mask_le); } Id EmitSubgroupGtMask(EmitContext& ctx) { + if (!SupportsWarpIntrinsics(ctx)) { + return ctx.u32_zero_value; + } return LoadMask(ctx, ctx.subgroup_mask_gt); } Id EmitSubgroupGeMask(EmitContext& ctx) { + if (!SupportsWarpIntrinsics(ctx)) { + return 
ctx.u32_zero_value; + } return LoadMask(ctx, ctx.subgroup_mask_ge); } Id EmitShuffleIndex(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp, Id segmentation_mask) { + if (!SupportsWarpIntrinsics(ctx)) { + SetAlwaysInBounds(ctx, inst); + return value; + } const Id not_seg_mask{ctx.OpNot(ctx.U32[1], segmentation_mask)}; const Id thread_id{EmitLaneId(ctx)}; const Id min_thread_id{ComputeMinThreadId(ctx, thread_id, segmentation_mask)}; @@ -177,6 +227,10 @@ Id EmitShuffleIndex(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id cla Id EmitShuffleUp(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp, Id segmentation_mask) { + if (!SupportsWarpIntrinsics(ctx)) { + SetAlwaysInBounds(ctx, inst); + return value; + } const Id thread_id{EmitLaneId(ctx)}; const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)}; Id src_thread_id{ctx.OpISub(ctx.U32[1], thread_id, index)}; @@ -192,6 +246,10 @@ Id EmitShuffleUp(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp, Id EmitShuffleDown(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp, Id segmentation_mask) { + if (!SupportsWarpIntrinsics(ctx)) { + SetAlwaysInBounds(ctx, inst); + return value; + } const Id thread_id{EmitLaneId(ctx)}; const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)}; Id src_thread_id{ctx.OpIAdd(ctx.U32[1], thread_id, index)}; @@ -207,6 +265,10 @@ Id EmitShuffleDown(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clam Id EmitShuffleButterfly(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp, Id segmentation_mask) { + if (!SupportsWarpIntrinsics(ctx)) { + SetAlwaysInBounds(ctx, inst); + return value; + } const Id thread_id{EmitLaneId(ctx)}; const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)}; Id src_thread_id{ctx.OpBitwiseXor(ctx.U32[1], thread_id, index)}; diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp 
b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index d4e5441469..3f014a6a75 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -1419,6 +1419,7 @@ void EmitContext::DefineImages(const Info& info, u32& binding, u32& scaling_inde void EmitContext::DefineInputs(const IR::Program& program) { const Info& info{program.info}; const VaryingState loads{info.loads.mask | info.passthrough.mask}; + const bool stage_supports_warp = profile.SupportsWarpIntrinsics(stage); if (info.uses_workgroup_id) { workgroup_id = DefineInput(*this, U32[3], false, spv::BuiltIn::WorkgroupId); @@ -1442,7 +1443,7 @@ void EmitContext::DefineInputs(const IR::Program& program) { if (info.uses_is_helper_invocation) { is_helper_invocation = DefineInput(*this, U1, false, spv::BuiltIn::HelperInvocation); } - if (info.uses_subgroup_mask) { + if (info.uses_subgroup_mask && stage_supports_warp) { subgroup_mask_eq = DefineInput(*this, U32[4], false, spv::BuiltIn::SubgroupEqMaskKHR); subgroup_mask_lt = DefineInput(*this, U32[4], false, spv::BuiltIn::SubgroupLtMaskKHR); subgroup_mask_le = DefineInput(*this, U32[4], false, spv::BuiltIn::SubgroupLeMaskKHR); @@ -1456,9 +1457,10 @@ void EmitContext::DefineInputs(const IR::Program& program) { Decorate(subgroup_mask_ge, spv::Decoration::Flat); } } - if (info.uses_fswzadd || info.uses_subgroup_invocation_id || info.uses_subgroup_shuffles || - (profile.warp_size_potentially_larger_than_guest && - (info.uses_subgroup_vote || info.uses_subgroup_mask))) { + if (stage_supports_warp && + (info.uses_fswzadd || info.uses_subgroup_invocation_id || info.uses_subgroup_shuffles || + (profile.warp_size_potentially_larger_than_guest && + (info.uses_subgroup_vote || info.uses_subgroup_mask)))) { AddCapability(spv::Capability::GroupNonUniform); subgroup_local_invocation_id = DefineInput(*this, U32[1], false, spv::BuiltIn::SubgroupLocalInvocationId); diff --git 
a/src/shader_recompiler/profile.h b/src/shader_recompiler/profile.h index 90e46bb1ba..ea7f5cc76f 100644 --- a/src/shader_recompiler/profile.h +++ b/src/shader_recompiler/profile.h @@ -1,9 +1,15 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + // SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later #pragma once +#include <limits> + #include "common/common_types.h" +#include "shader_recompiler/stage.h" namespace Shader { @@ -46,6 +52,8 @@ struct Profile { bool support_multi_viewport{}; bool support_geometry_streams{}; + u32 warp_stage_support_mask{std::numeric_limits<u32>::max()}; + bool warp_size_potentially_larger_than_guest{}; bool lower_left_origin_mode{}; @@ -90,6 +98,11 @@ struct Profile { u64 min_ssbo_alignment{}; u32 max_user_clip_distances{}; + + bool SupportsWarpIntrinsics(Stage stage) const { + const u32 bit = 1u << static_cast<u32>(stage); + return (warp_stage_support_mask & bit) != 0; + } }; } // namespace Shader diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 45f729698e..f3c17e2e91 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -220,6 +220,7 @@ ShaderCache::ShaderCache(Tegra::MaxwellDeviceMemoryManager& device_memory_, .support_gl_sparse_textures = device.HasSparseTexture2(), .support_gl_derivative_control = device.HasDerivativeControl(), .support_geometry_streams = true, + .warp_stage_support_mask = 0xFFFFFFFFu, .warp_size_potentially_larger_than_guest = device.IsWarpSizePotentiallyLargerThanGuest(), diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index 5b41dc225f..2dd4d08fa3 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -462,10 +462,14 @@
QueriesPrefixScanPass::QueriesPrefixScanPass( device_, descriptor_pool_, QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS, QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE, QUERIES_SCAN_BANK_INFO, COMPUTE_PUSH_CONSTANT_RANGE, - device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_BASIC_BIT) && - device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_ARITHMETIC_BIT) && - device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_BIT) && - device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT) + device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_BASIC_BIT, + VK_SHADER_STAGE_COMPUTE_BIT) && + device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_ARITHMETIC_BIT, + VK_SHADER_STAGE_COMPUTE_BIT) && + device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_BIT, + VK_SHADER_STAGE_COMPUTE_BIT) && + device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT, + VK_SHADER_STAGE_COMPUTE_BIT) ? std::span(QUERIES_PREFIX_SCAN_SUM_COMP_SPV) : std::span(QUERIES_PREFIX_SCAN_SUM_NOSUBGROUPS_COMP_SPV)), scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {} diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 57ea9dcc61..aea1cd827e 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -105,6 +105,26 @@ Shader::CompareFunction MaxwellToCompareFunction(Maxwell::ComparisonOp compariso return {}; } +VkShaderStageFlagBits StageToVkStage(Shader::Stage stage) { + switch (stage) { + case Shader::Stage::VertexA: + case Shader::Stage::VertexB: + return VK_SHADER_STAGE_VERTEX_BIT; + case Shader::Stage::TessellationControl: + return VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT; + case Shader::Stage::TessellationEval: + return VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT; + case Shader::Stage::Geometry: + return VK_SHADER_STAGE_GEOMETRY_BIT; + case Shader::Stage::Fragment: + return 
VK_SHADER_STAGE_FRAGMENT_BIT; + case Shader::Stage::Compute: + return VK_SHADER_STAGE_COMPUTE_BIT; + default: + return VK_SHADER_STAGE_VERTEX_BIT; + } +} + Shader::AttributeType CastAttributeType(const FixedPipelineState::VertexAttribute& attr) { if (attr.enabled == 0) { return Shader::AttributeType::Disabled; } @@ -395,6 +415,27 @@ PipelineCache::PipelineCache(Tegra::MaxwellDeviceMemoryManager& device_memory_, .support_conditional_barrier = device.SupportsConditionalBarriers(), }; + profile.warp_stage_support_mask = 0; + static constexpr std::array kAllStages{ + Shader::Stage::VertexA, Shader::Stage::VertexB, + Shader::Stage::TessellationControl, Shader::Stage::TessellationEval, + Shader::Stage::Geometry, Shader::Stage::Fragment, + Shader::Stage::Compute, + }; + for (const auto stage : kAllStages) { + const auto vk_stage = StageToVkStage(stage); + if (device.SupportsWarpIntrinsics(vk_stage)) { + profile.warp_stage_support_mask |= 1u << static_cast<u32>(stage); + } + } + profile.support_vote = profile.warp_stage_support_mask != 0; + + if (!profile.SupportsWarpIntrinsics(Shader::Stage::Fragment)) { + LOG_WARNING(Render_Vulkan, + "Fragment shaders lack subgroup support on this driver; warp intrinsics will be " + "approximated and visual artifacts may remain"); + } + if (device.GetMaxVertexInputAttributes() < Maxwell::NumVertexAttributes) { LOG_WARNING(Render_Vulkan, "maxVertexInputAttributes is too low: {} < {}", device.GetMaxVertexInputAttributes(), Maxwell::NumVertexAttributes); diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index 1352808d91..07781d30b3 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -92,6 +92,11 @@ constexpr std::array VK_FORMAT_A4B4G4R4_UNORM_PACK16{ } // namespace Alternatives +constexpr VkShaderStageFlags GraphicsStageMask = + VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT | + 
VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT | VK_SHADER_STAGE_GEOMETRY_BIT | + VK_SHADER_STAGE_FRAGMENT_BIT; + template <typename T> void SetNext(void**& next, T& data) { *next = &data; @@ -1544,6 +1549,44 @@ void Device::RemoveUnsuitableExtensions() { RemoveExtensionIfUnsuitable(extensions.maintenance9, VK_KHR_MAINTENANCE_9_EXTENSION_NAME); } +bool Device::SupportsSubgroupStage(VkShaderStageFlags stage_mask) const { + if (stage_mask == 0) { + return true; + } + const VkShaderStageFlags supported = properties.subgroup_properties.supportedStages; + if ((supported & stage_mask) == stage_mask) { + return true; + } + if ((stage_mask & GraphicsStageMask) != 0 && + (supported & (VK_SHADER_STAGE_ALL_GRAPHICS | VK_SHADER_STAGE_ALL)) != 0) { + return true; + } + if ((stage_mask & VK_SHADER_STAGE_COMPUTE_BIT) != 0 && + (supported & (VK_SHADER_STAGE_COMPUTE_BIT | VK_SHADER_STAGE_ALL)) != 0) { + return true; + } + return (supported & VK_SHADER_STAGE_ALL) != 0; +} + +bool Device::IsSubgroupFeatureSupported(VkSubgroupFeatureFlagBits feature, + VkShaderStageFlags stage_mask) const { + if ((properties.subgroup_properties.supportedOperations & feature) == 0) { + return false; + } + return SupportsSubgroupStage(stage_mask); +} + +bool Device::SupportsWarpIntrinsics(VkShaderStageFlagBits stage) const { + constexpr VkSubgroupFeatureFlags required_ops = + VK_SUBGROUP_FEATURE_BASIC_BIT | VK_SUBGROUP_FEATURE_VOTE_BIT | + VK_SUBGROUP_FEATURE_ARITHMETIC_BIT | VK_SUBGROUP_FEATURE_BALLOT_BIT | + VK_SUBGROUP_FEATURE_SHUFFLE_BIT | VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT; + if ((properties.subgroup_properties.supportedOperations & required_ops) != required_ops) { + return false; + } + return SupportsSubgroupStage(stage); +} + void Device::SetupFamilies(VkSurfaceKHR surface) { const std::vector<VkQueueFamilyProperties> queue_family_properties = physical.GetQueueFamilyProperties(); std::optional<u32> graphics; diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index 
621a2b607d..f1674e32a1 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -401,11 +401,26 @@ public: return properties.subgroup_size_control.requiredSubgroupSizeStages & stage; } - /// Returns true if the device supports the provided subgroup feature. - bool IsSubgroupFeatureSupported(VkSubgroupFeatureFlagBits feature) const { - return properties.subgroup_properties.supportedOperations & feature; + /// Returns true if the device supports the provided subgroup feature for the given stages. + bool IsSubgroupFeatureSupported(VkSubgroupFeatureFlagBits feature, + VkShaderStageFlags stage_mask = 0) const; + + /// Returns true if the device reports subgroup support for the provided shader stages. + bool SupportsSubgroupStage(VkShaderStageFlags stage_mask) const; + + /// Returns the set of stages that report subgroup support. + VkShaderStageFlags GetSubgroupSupportedStages() const { + return properties.subgroup_properties.supportedStages; } + /// Returns the set of subgroup operations reported by the driver. + VkSubgroupFeatureFlags GetSubgroupSupportedOperations() const { + return properties.subgroup_properties.supportedOperations; + } + + /// Returns true if the driver can execute all warp intrinsics for the given shader stage. + bool SupportsWarpIntrinsics(VkShaderStageFlagBits stage) const; + /// Returns the maximum number of push descriptors. u32 MaxPushDescriptors() const { return properties.push_descriptor.maxPushDescriptors;