From 91596c6d1b267cdcfbf305f60993f4147f7d22a0 Mon Sep 17 00:00:00 2001
From: lizzie <lizzie@eden-emu.dev>
Date: Tue, 13 Jan 2026 01:53:50 +0100
Subject: [PATCH] [dynarmic] fix ender magnolia crash on pre-AVX2 cpus (#3297)

supersedes #3295

Intel Atom N450 6 billion fps

also for the note, host calls are expensive as fuck

please test i didn't break other games kthx

Test with both a pre-AVX2 CPU (like an i5 3rd gen) AND an AVX2 CPU (like an i7 4th gen)

Signed-off-by: lizzie <lizzie@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3297
Reviewed-by: DraVee
Reviewed-by: MaranBr
Co-authored-by: lizzie <lizzie@eden-emu.dev>
Co-committed-by: lizzie <lizzie@eden-emu.dev>
---
 .../dynarmic/backend/x64/emit_x64_vector.cpp | 429 ++++++------
 1 file changed, 147 insertions(+), 282 deletions(-)

diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp
index 2fa07cc946..e940c190e7 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp
@@ -569,32 +569,24 @@ void EmitX64::EmitVectorArithmeticShiftRight64(EmitContext& ctx, IR::Inst* inst)
     ctx.reg_alloc.DefineValue(code, inst, result);
 }

-template<typename T>
-static constexpr T VShift(T x, T y) {
-    const s8 shift_amount = static_cast<s8>(static_cast<u8>(y));
-    const s64 bit_size = static_cast<s64>(mcl::bitsizeof<T>);
-
-    if constexpr (std::is_signed_v<T>) {
-        if (shift_amount >= bit_size) {
+template<typename T> constexpr T VShift(T x, T y) {
+    s8 const shift_amount = s8(u8(y));
+    s64 const bit_size = s64(mcl::bitsizeof<T>);
+    if (std::is_signed_v<T>) {
+        if (shift_amount >= bit_size)
             return 0;
-        }
-
-        if (shift_amount <= -bit_size) {
-            // Parentheses necessary, as MSVC doesn't appear to consider cast parentheses
-            // as a grouping in terms of precedence, causing warning C4554 to fire. See:
-            // https://developercommunity.visualstudio.com/content/problem/144783/msvc-2017-does-not-understand-that-static-cast-cou.html
+        // Parentheses necessary, as MSVC doesn't appear to consider cast parentheses
+        // as a grouping in terms of precedence, causing warning C4554 to fire.
See: + // https://developercommunity.visualstudio.com/content/problem/144783/msvc-2017-does-not-understand-that-static-cast-cou.html + if (shift_amount <= -bit_size) return x >> (T(bit_size - 1)); - } } else if (shift_amount <= -bit_size || shift_amount >= bit_size) { return 0; } - - if (shift_amount < 0) { + if (shift_amount < 0) return x >> T(-shift_amount); - } - using unsigned_type = std::make_unsigned_t; - return static_cast(static_cast(x) << static_cast(shift_amount)); + return T(unsigned_type(x) << unsigned_type(shift_amount)); } void EmitX64::EmitVectorArithmeticVShift8(EmitContext& ctx, IR::Inst* inst) { @@ -606,145 +598,83 @@ void EmitX64::EmitVectorArithmeticVShift8(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorArithmeticVShift16(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]); - const Xbyak::Xmm right_shift = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); - + auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const right_shift = ctx.reg_alloc.ScratchXmm(code); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.vmovdqa32(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF)); code.vpxord(right_shift, right_shift, right_shift); code.vpsubw(right_shift, right_shift, left_shift); - code.vpsllw(xmm0, left_shift, 8); code.vpsraw(xmm0, xmm0, 15); - - const Xbyak::Opmask mask = k1; - code.vpmovb2m(mask, xmm0); - + code.vpmovb2m(k1, xmm0); code.vpandd(right_shift, right_shift, tmp); code.vpandd(left_shift, left_shift, tmp); - code.vpsravw(tmp, result, right_shift); code.vpsllvw(result, result, left_shift); - code.vpblendmb(result | mask, result, tmp); - + code.vpblendmb(result | k1, result, tmp); ctx.reg_alloc.DefineValue(code, inst, result); - return; + } else { + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { + std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift); + }); } - - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { - std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift); - }); } void EmitX64::EmitVectorArithmeticVShift32(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::AVX2)) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); - - // store sign bit of lowest byte of each element of b to select left/right shift later - code.vpslld(xmm0, b, 24); - - // sse/avx shifts are only positive, with dedicated left/right forms - shift by lowest byte of abs(b) - code.vpabsb(b, b); - code.vpand(b, b, code.BConst<32>(xword, 0xFF)); - - // calculate shifts - code.vpsllvd(result, a, b); - code.vpsravd(a, a, b); - - code.blendvps(result, a); // implicit argument: xmm0 (sign of lowest byte of b) - - ctx.reg_alloc.DefineValue(code, inst, result); - return; + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp2 = 
ctx.reg_alloc.ScratchXmm(code); + auto const tmp3 = ctx.reg_alloc.ScratchXmm(code); + code.vpabsb(tmp2, tmp1); + code.vpslld(tmp1, tmp1, 24); + code.vpand(tmp2, tmp2, code.Const(xword, 0x000000FF000000FF, 0x000000FF000000FF)); + code.vpsravd(tmp3, tmp0, tmp2); + code.vpsllvd(tmp0, tmp0, tmp2); + code.vblendvps(tmp0, tmp0, tmp3, tmp1); + ctx.reg_alloc.DefineValue(code, inst, tmp0); + } else { + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { + std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift); + }); } - - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { - std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift); - }); } void EmitX64::EmitVectorArithmeticVShift64(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]); - const Xbyak::Xmm right_shift = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); - + auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const right_shift = ctx.reg_alloc.ScratchXmm(code); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.vmovdqa32(tmp, code.Const(xword, 0x00000000000000FF, 0x00000000000000FF)); code.vpxorq(right_shift, right_shift, right_shift); code.vpsubq(right_shift, right_shift, left_shift); - code.vpsllq(xmm0, left_shift, 56); - const Xbyak::Opmask mask = k1; - code.vpmovq2m(mask, xmm0); - + code.vpmovq2m(k1, xmm0); code.vpandq(right_shift, right_shift, tmp); code.vpandq(left_shift, left_shift, tmp); - code.vpsravq(tmp, result, right_shift); code.vpsllvq(result, result, left_shift); - code.vpblendmq(result | mask, result, tmp); - + code.vpblendmq(result | k1, result, tmp); ctx.reg_alloc.DefineValue(code, inst, result); - return; - } - - if (code.HasHostFeature(HostFeature::AVX2)) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm negative_mask = ctx.reg_alloc.ScratchXmm(code); - - // negative_mask = a < 0 ? 
1s : 0s - code.vpxor(xmm0, xmm0, xmm0); - code.vpcmpgtq(negative_mask, xmm0, a); - - // store sign bit of lowest byte of each element of b to select left/right shift later - code.vpsllq(xmm0, b, 56); - - // sse/avx shifts are only positive, with dedicated left/right forms - shift by lowest byte of abs(b) - code.vpabsb(b, b); - code.vpand(b, b, code.BConst<64>(xword, 0xFF)); - - // calculate shifts - code.vpsllvq(result, a, b); - - // implement variable arithmetic shift in terms of logical shift - // if a is negative, invert it, shift in leading 0s, then invert it again - noop if positive - code.vpxor(a, a, negative_mask); - code.vpsrlvq(a, a, b); - code.vpxor(a, a, negative_mask); - - code.blendvpd(result, a); // implicit argument: xmm0 (sign of lowest byte of b) - - ctx.reg_alloc.DefineValue(code, inst, result); - return; + } else { + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { + std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift); + }); } - - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { - std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift); - }); } void EmitX64::EmitVectorBroadcastLower8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - if (code.HasHostFeature(HostFeature::AVX2)) { code.vpbroadcastb(a, a); code.vmovq(a, a); } else if (code.HasHostFeature(HostFeature::SSSE3)) { const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); - code.pxor(tmp, tmp); code.pshufb(a, tmp); code.movq(a, a); @@ -752,7 +682,6 @@ void EmitX64::EmitVectorBroadcastLower8(EmitContext& ctx, IR::Inst* inst) { code.punpcklbw(a, a); code.pshuflw(a, a, 0); } - ctx.reg_alloc.DefineValue(code, inst, a); } @@ -777,12 +706,10 @@ void EmitX64::EmitVectorBroadcastLower32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorBroadcast8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - if (code.HasHostFeature(HostFeature::AVX2)) { code.vpbroadcastb(a, a); } else if (code.HasHostFeature(HostFeature::SSSE3)) { const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); - code.pxor(tmp, tmp); code.pshufb(a, tmp); } else { @@ -790,47 +717,40 @@ void EmitX64::EmitVectorBroadcast8(EmitContext& ctx, IR::Inst* inst) { code.pshuflw(a, a, 0); code.punpcklqdq(a, a); } - ctx.reg_alloc.DefineValue(code, inst, a); } void EmitX64::EmitVectorBroadcast16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - if (code.HasHostFeature(HostFeature::AVX2)) { code.vpbroadcastw(a, a); } else { code.pshuflw(a, a, 0); code.punpcklqdq(a, a); } - ctx.reg_alloc.DefineValue(code, inst, a); } void EmitX64::EmitVectorBroadcast32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - if (code.HasHostFeature(HostFeature::AVX2)) { code.vpbroadcastd(a, a); } else { code.pshufd(a, a, 0); } - ctx.reg_alloc.DefineValue(code, inst, a); } void EmitX64::EmitVectorBroadcast64(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - if (code.HasHostFeature(HostFeature::AVX2)) { code.vpbroadcastq(a, a); } else { 
code.punpcklqdq(a, a); } - ctx.reg_alloc.DefineValue(code, inst, a); } @@ -840,17 +760,14 @@ void EmitX64::EmitVectorBroadcastElementLower8(EmitContext& ctx, IR::Inst* inst) ASSERT(args[1].IsImmediate()); const u8 index = args[1].GetImmediateU8(); ASSERT(index < 16); - if (index > 0) { code.psrldq(a, index); } - if (code.HasHostFeature(HostFeature::AVX2)) { code.vpbroadcastb(a, a); code.vmovq(a, a); } else if (code.HasHostFeature(HostFeature::SSSE3)) { const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); - code.pxor(tmp, tmp); code.pshufb(a, tmp); code.movq(a, a); @@ -858,7 +775,6 @@ void EmitX64::EmitVectorBroadcastElementLower8(EmitContext& ctx, IR::Inst* inst) code.punpcklbw(a, a); code.pshuflw(a, a, 0); } - ctx.reg_alloc.DefineValue(code, inst, a); } @@ -868,13 +784,10 @@ void EmitX64::EmitVectorBroadcastElementLower16(EmitContext& ctx, IR::Inst* inst ASSERT(args[1].IsImmediate()); const u8 index = args[1].GetImmediateU8(); ASSERT(index < 8); - if (index > 0) { code.psrldq(a, u8(index * 2)); } - code.pshuflw(a, a, 0); - ctx.reg_alloc.DefineValue(code, inst, a); } @@ -900,11 +813,9 @@ void EmitX64::EmitVectorBroadcastElement8(EmitContext& ctx, IR::Inst* inst) { ASSERT(args[1].IsImmediate()); const u8 index = args[1].GetImmediateU8(); ASSERT(index < 16); - if (index > 0) { code.psrldq(a, index); } - if (code.HasHostFeature(HostFeature::AVX2)) { code.vpbroadcastb(a, a); } else if (code.HasHostFeature(HostFeature::SSSE3)) { @@ -926,22 +837,17 @@ void EmitX64::EmitVectorBroadcastElement16(EmitContext& ctx, IR::Inst* inst) { ASSERT(args[1].IsImmediate()); const u8 index = args[1].GetImmediateU8(); ASSERT(index < 8); - if (index == 0 && code.HasHostFeature(HostFeature::AVX2)) { code.vpbroadcastw(a, a); - - ctx.reg_alloc.DefineValue(code, inst, a); - return; - } - - if (index < 4) { - code.pshuflw(a, a, mcl::bit::replicate_element<2, u8>(index)); - code.punpcklqdq(a, a); } else { - code.pshufhw(a, a, mcl::bit::replicate_element<2, u8>(u8(index - 4))); - code.punpckhqdq(a, a); + if (index < 4) { + code.pshuflw(a, a, mcl::bit::replicate_element<2, u8>(index)); + code.punpcklqdq(a, a); + } else { + code.pshufhw(a, a, mcl::bit::replicate_element<2, u8>(u8(index - 4))); + code.punpckhqdq(a, a); + } } - ctx.reg_alloc.DefineValue(code, inst, a); } @@ -994,13 +900,10 @@ static void EmitVectorCountLeadingZeros(VectorArray& result, const VectorArra void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::GFNI)) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); - + auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const result = ctx.reg_alloc.ScratchXmm(code); // Reverse bits: code.gf2p8affineqb(data, code.BConst<64>(xword, 0x8040201008040201), 0); - // Perform a tzcnt: // Isolate lowest set bit code.pcmpeqb(result, result); @@ -1008,29 +911,22 @@ void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) { code.pandn(result, data); // Convert lowest set bit into an index code.gf2p8affineqb(result, code.BConst<64>(xword, 0xaaccf0ff'00000000), 8); - ctx.reg_alloc.DefineValue(code, inst, result); } else if (code.HasHostFeature(HostFeature::SSSE3)) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm tmp2 = 
ctx.reg_alloc.ScratchXmm(code); - + auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp1 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); code.movdqa(tmp1, code.Const(xword, 0x0101010102020304, 0x0000000000000000)); code.movdqa(tmp2, tmp1); - code.pshufb(tmp2, data); code.psrlw(data, 4); code.pand(data, code.Const(xword, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F)); code.pshufb(tmp1, data); - code.movdqa(data, code.Const(xword, 0x0404040404040404, 0x0404040404040404)); - code.pcmpeqb(data, tmp1); code.pand(data, tmp2); code.paddb(data, tmp1); - ctx.reg_alloc.DefineValue(code, inst, data); } else { EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros); @@ -1040,12 +936,10 @@ void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::AVX)) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); - + auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const result = ctx.reg_alloc.ScratchXmm(code); + auto const zeros = ctx.reg_alloc.ScratchXmm(code); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.vpsrlw(tmp, data, 1); code.vpor(data, data, tmp); code.vpsrlw(tmp, data, 2); @@ -1065,16 +959,13 @@ void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) { code.vpor(tmp, tmp, zeros); code.vpor(data, data, tmp); code.vpshufb(result, result, data); - ctx.reg_alloc.DefineValue(code, inst, result); } else if (code.HasHostFeature(HostFeature::SSSE3)) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); - + auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const result = ctx.reg_alloc.ScratchXmm(code); + auto const zeros = ctx.reg_alloc.ScratchXmm(code); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.movdqa(tmp, data); code.psrlw(tmp, 1); code.por(data, tmp); @@ -1098,7 +989,6 @@ void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) { code.por(tmp, zeros); code.por(data, tmp); code.pshufb(result, data); - ctx.reg_alloc.DefineValue(code, inst, result); } else { EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros); @@ -1108,13 +998,13 @@ void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512CD)) { - const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]); code.vplzcntd(data, data); ctx.reg_alloc.DefineValue(code, inst, data); - // See https://stackoverflow.com/questions/58823140/count-leading-zero-bits-for-each-element-in-avx2-vector-emulate-mm256-lzcnt-ep/58827596#58827596 } else if (code.HasHostFeature(HostFeature::AVX2)) { - const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const 
Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(code); + // See https://stackoverflow.com/questions/58823140/count-leading-zero-bits-for-each-element-in-avx2-vector-emulate-mm256-lzcnt-ep/58827596#58827596 + auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const temp = ctx.reg_alloc.ScratchXmm(code); code.vmovdqa(temp, data); code.vpsrld(data, data, 8); code.vpandn(data, data, temp); @@ -1125,7 +1015,24 @@ void EmitX64::EmitVectorCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) { code.vpminsw(data, data, code.Const(xword, 0x0000002000000020, 0x0000002000000020)); ctx.reg_alloc.DefineValue(code, inst, data); } else { - EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros); + // See https://stackoverflow.com/a/58829453 + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp1 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + code.pxor(tmp1, tmp1); + code.movdqa(tmp2, tmp0); + code.pcmpeqd(tmp1, tmp0); + code.psrld(tmp0, 1); + code.psrld(tmp2, 2); + code.pandn(tmp2, tmp0); + code.cvtdq2ps(tmp0, tmp2); + code.addps(tmp0, tmp0); + code.addps(tmp0, code.Const(xword, 0x3f8000003f800000, 0x3f8000003f800000)); + code.psrld(tmp0, 23); + code.paddd(tmp1, tmp0); + code.movdqa(tmp0, code.Const(xword, 0x0000009E0000009E, 0x0000009E0000009E)); + code.psubd(tmp0, tmp1); + ctx.reg_alloc.DefineValue(code, inst, tmp0); } } @@ -1892,14 +1799,12 @@ static void EmitVectorLogicalVShiftAVX2(BlockOfCode& code, EmitContext& ctx, IR: ICODE(vpsrlv)(a, a, b); // implicit argument: xmm0 (sign of lowest byte of b) - if constexpr (esize == 32) { + if (esize == 32) { code.blendvps(result, a); } else { code.blendvpd(result, a); } - ctx.reg_alloc.DefineValue(code, inst, result); - return; } void EmitX64::EmitVectorLogicalVShift8(EmitContext& ctx, IR::Inst* inst) { @@ -1942,9 +1847,9 @@ void EmitX64::EmitVectorLogicalVShift8(EmitContext& ctx, IR::Inst* inst) { ctx.reg_alloc.DefineValue(code, inst, result); } else { - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { - std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift); - }); + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { + std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift); + }); } } @@ -1969,32 +1874,30 @@ void EmitX64::EmitVectorLogicalVShift16(EmitContext& ctx, IR::Inst* inst) { ctx.reg_alloc.DefineValue(code, inst, result); } else { - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { - std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift); - }); + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { + std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift); + }); } } void EmitX64::EmitVectorLogicalVShift32(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::AVX2)) { EmitVectorLogicalVShiftAVX2<32>(code, ctx, inst); - return; + } else { + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { + std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift); + }); } - - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { - std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift); - }); } void 
EmitX64::EmitVectorLogicalVShift64(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::AVX2)) { EmitVectorLogicalVShiftAVX2<64>(code, ctx, inst); - return; + } else { + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { + std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift); + }); } - - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { - std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift); - }); } namespace { @@ -3731,25 +3634,21 @@ template static void RoundingShiftLeft(VectorArray& out, const VectorArray& lhs, const VectorArray& rhs) { using signed_type = std::make_signed_t; using unsigned_type = std::make_unsigned_t; - - constexpr auto bit_size = static_cast(mcl::bitsizeof); - + constexpr auto bit_size = s64(mcl::bitsizeof); for (size_t i = 0; i < out.size(); i++) { - const s64 extended_shift = static_cast(mcl::bit::sign_extend<8, u64>(rhs[i] & 0xFF)); - + const s64 extended_shift = s64(mcl::bit::sign_extend<8, u64>(rhs[i] & 0xFF)); if (extended_shift >= 0) { if (extended_shift >= bit_size) { out[i] = 0; } else { - out[i] = static_cast(static_cast(lhs[i]) << extended_shift); + out[i] = T(unsigned_type(lhs[i]) << extended_shift); } } else { if ((std::is_unsigned_v && extended_shift < -bit_size) || (std::is_signed_v && extended_shift <= -bit_size)) { out[i] = 0; } else { const s64 shift_value = -extended_shift - 1; - const T shifted = (lhs[i] & (static_cast(1) << shift_value)) >> shift_value; - + const T shifted = (lhs[i] & (signed_type(1) << shift_value)) >> shift_value; if (extended_shift == -bit_size) { out[i] = shifted; } else { @@ -3810,7 +3709,6 @@ static void EmitUnsignedRoundingShiftLeft(BlockOfCode& code, EmitContext& ctx, I } ctx.reg_alloc.DefineValue(code, inst, left_shift); - return; } void EmitX64::EmitVectorRoundingShiftLeftS8(EmitContext& ctx, IR::Inst* inst) { @@ -3852,23 +3750,21 @@ void EmitX64::EmitVectorRoundingShiftLeftU16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorRoundingShiftLeftU32(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::AVX2)) { EmitUnsignedRoundingShiftLeft<32>(code, ctx, inst); - return; + } else { + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& lhs, const VectorArray& rhs) { + RoundingShiftLeft(result, lhs, rhs); + }); } - - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& lhs, const VectorArray& rhs) { - RoundingShiftLeft(result, lhs, rhs); - }); } void EmitX64::EmitVectorRoundingShiftLeftU64(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::AVX2)) { EmitUnsignedRoundingShiftLeft<64>(code, ctx, inst); - return; + } else { + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& lhs, const VectorArray& rhs) { + RoundingShiftLeft(result, lhs, rhs); + }); } - - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& lhs, const VectorArray& rhs) { - RoundingShiftLeft(result, lhs, rhs); - }); } void EmitX64::EmitVectorSignExtend8(EmitContext& ctx, IR::Inst* inst) { @@ -5270,10 +5166,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) { code.vmovdqu8(defaults | k2, indicies); ctx.reg_alloc.DefineValue(code, inst, defaults); } - return; - } - - if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 3) { + 
} else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 3) { const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]); code.vpcmpub(k1, indicies, code.BConst<8>(xword, 2 * 16), CmpInt::LessThan); @@ -5302,10 +5195,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) { code.vmovdqu8(defaults | k2, indicies); ctx.reg_alloc.DefineValue(code, inst, defaults); } - return; - } - - if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 2) { + } else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 2) { const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]); const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]); const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(code, table[1]); @@ -5321,15 +5211,10 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) { code.vmovdqu8(result | k1, indicies); ctx.reg_alloc.DefineValue(code, inst, result); } - return; - } - - if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 1) { + } else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 1) { const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]); const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]); - code.vpcmpub(k1, indicies, code.BConst<8>(xword, 1 * 16), CmpInt::LessThan); - if (is_defaults_zero) { const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); code.vpermb(result | k1 | T_z, indicies, xmm_table0); @@ -5339,10 +5224,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) { code.vpermb(result | k1, indicies, xmm_table0); ctx.reg_alloc.DefineValue(code, inst, result); } - return; - } - - if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 1) { + } else if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 1) { const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]); const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]); @@ -5350,10 +5232,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) { code.pshufb(xmm_table0, indicies); ctx.reg_alloc.DefineValue(code, inst, xmm_table0); - return; - } - - if (code.HasHostFeature(HostFeature::SSE41) && table_size == 1) { + } else if (code.HasHostFeature(HostFeature::SSE41) && table_size == 1) { const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]); const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]); const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]); @@ -5368,10 +5247,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) { code.pblendvb(xmm_table0, defaults); ctx.reg_alloc.DefineValue(code, inst, xmm_table0); - return; - } - - if (code.HasHostFeature(HostFeature::SSE41) && is_defaults_zero && table_size == 2) { + } else if (code.HasHostFeature(HostFeature::SSE41) && is_defaults_zero && table_size == 2) { const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]); const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]); const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[1]); @@ -5389,9 +5265,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) { 
ctx.reg_alloc.DefineValue(code, inst, xmm_table0); return; - } - - if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) { + } else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) { const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]); const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); const Xbyak::Xmm masked = ctx.reg_alloc.ScratchXmm(code); @@ -5415,10 +5289,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) { } ctx.reg_alloc.DefineValue(code, inst, result); - return; - } - - if (code.HasHostFeature(HostFeature::SSE41)) { + } else if (code.HasHostFeature(HostFeature::SSE41)) { const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]); const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); const Xbyak::Xmm masked = ctx.reg_alloc.ScratchXmm(code); @@ -5447,31 +5318,26 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) { } ctx.reg_alloc.DefineValue(code, inst, result); - return; - } - - const u32 stack_space = static_cast((table_size + 2) * 16); - ctx.reg_alloc.AllocStackSpace(code, stack_space + ABI_SHADOW_SPACE); - for (size_t i = 0; i < table_size; ++i) { - const Xbyak::Xmm table_value = ctx.reg_alloc.UseXmm(code, table[i]); - code.movaps(xword[rsp + ABI_SHADOW_SPACE + i * 16], table_value); - ctx.reg_alloc.Release(table_value); - } - const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); - ctx.reg_alloc.EndOfAllocScope(); - ctx.reg_alloc.HostCall(code, nullptr); - - code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE]); - code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + (table_size + 0) * 16]); - code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + (table_size + 1) * 16]); - code.mov(code.ABI_PARAM4.cvt32(), table_size); - code.movaps(xword[code.ABI_PARAM2], defaults); - code.movaps(xword[code.ABI_PARAM3], indicies); - - code.CallLambda( - [](const VectorArray* table, VectorArray& result, const VectorArray& indicies, size_t table_size) { + } else { + const u32 stack_space = static_cast((table_size + 2) * 16); + ctx.reg_alloc.AllocStackSpace(code, stack_space + ABI_SHADOW_SPACE); + for (size_t i = 0; i < table_size; ++i) { + const Xbyak::Xmm table_value = ctx.reg_alloc.UseXmm(code, table[i]); + code.movaps(xword[rsp + ABI_SHADOW_SPACE + i * 16], table_value); + ctx.reg_alloc.Release(table_value); + } + const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]); + const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + ctx.reg_alloc.EndOfAllocScope(); + ctx.reg_alloc.HostCall(code, nullptr); + code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE]); + code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + (table_size + 0) * 16]); + code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + (table_size + 1) * 16]); + code.mov(code.ABI_PARAM4.cvt32(), table_size); + code.movaps(xword[code.ABI_PARAM2], defaults); + code.movaps(xword[code.ABI_PARAM3], indicies); + code.CallLambda([](const VectorArray* table, VectorArray& result, const VectorArray& indicies, size_t table_size) { for (size_t i = 0; i < result.size(); ++i) { const size_t index = indicies[i] / table[0].size(); const size_t elem = indicies[i] % table[0].size(); @@ -5480,11 +5346,10 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) { } } }); - 
-    code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + (table_size + 0) * 16]);
-    ctx.reg_alloc.ReleaseStackSpace(code, stack_space + ABI_SHADOW_SPACE);
-
-    ctx.reg_alloc.DefineValue(code, inst, result);
+        code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + (table_size + 0) * 16]);
+        ctx.reg_alloc.ReleaseStackSpace(code, stack_space + ABI_SHADOW_SPACE);
+        ctx.reg_alloc.DefineValue(code, inst, result);
+    }
 }

 void EmitX64::EmitVectorTranspose8(EmitContext& ctx, IR::Inst* inst) {
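As an aside on the new pre-AVX2 EmitVectorCountLeadingZeros32 path earlier in this diff (the https://stackoverflow.com/a/58829453 trick): the SSE2 sequence recovers the leading-zero count of each 32-bit lane from the exponent field produced by an int-to-float conversion. A rough scalar sketch of what one lane computes, kept to what the patch itself does; the helper name clz32_via_float and the std::memcpy bit-cast are illustrative, not code from the patch:

#include <cstdint>
#include <cstring>

// Scalar model of the SSE2 lzcnt emulation: keep only the bit just below the
// leading one so the float conversion cannot round across a power of two,
// then read the leading-bit index back out of the IEEE-754 exponent field.
static std::uint32_t clz32_via_float(std::uint32_t x) {
    const std::uint32_t zero_adjust = (x == 0) ? 1u : 0u; // models the pcmpeqd all-zero mask
    const std::uint32_t m = (x >> 1) & ~(x >> 2);         // psrld/psrld/pandn: leading bit moved down one, bit below it cleared
    const float f = static_cast<float>(m) * 2.0f + 1.0f;  // cvtdq2ps, addps (doubling), addps with 0x3f800000 (1.0f)
    std::uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    const std::uint32_t exponent = bits >> 23;            // biased exponent = 127 + index of the leading bit
    return 158u - (exponent - zero_adjust);               // 0x9E - exponent, with the x == 0 correction giving 32
}

With AVX-512CD the vplzcntd path is taken instead; the point of inlining this SSE2 version is to avoid the EmitOneArgumentFallback host call the old code used on pre-AVX2 machines, which is what the commit message's note about host calls being expensive refers to.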