
[dynarmic] Fix Ender Magnolia crash on pre-AVX2 CPUs (#3297)

Supersedes #3295

Intel Atom N450 6 billion fps
Also of note: host calls are very expensive, which is why the pre-AVX2 CountLeadingZeros32 fallback now emits inline SSE code instead of calling back into the host.
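
Below is a hypothetical scalar mirror of that emitted per-lane math (the float-exponent trick from the Stack Overflow answer linked in the diff). It is not part of this patch; it is only a sanity-check you can run on any host (C++20 for std::countl_zero, build without -ffast-math):

    #include <bit>
    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Hypothetical scalar mirror of the emitted SSE2 lane math (not in the patch).
    // 158 = 127 (float exponent bias) + 31.
    static uint32_t Clz32ViaFloatTrick(uint32_t x) {
        const uint32_t zero_mask = (x == 0) ? 0xFFFFFFFFu : 0u;   // pcmpeqd
        const uint32_t t = (x >> 1) & ~(x >> 2);                  // psrld, psrld, pandn
        float f = static_cast<float>(static_cast<int32_t>(t));    // cvtdq2ps (t < 2^31, so the signed convert is safe)
        f += f;                                                    // addps
        f += 1.0f;                                                 // addps with 0x3f800000
        uint32_t bits;
        std::memcpy(&bits, &f, sizeof(bits));
        const uint32_t exponent = (bits >> 23) + zero_mask;        // psrld 23, paddd
        return 158u - exponent;                                    // psubd from 0x9E
    }

    int main() {
        // Sparse sweep that still hits 0 and 0xFFFFFFFF exactly.
        for (uint64_t v = 0; v <= 0xFFFFFFFFull; v += 0x101) {
            const uint32_t x = static_cast<uint32_t>(v);
            assert(Clz32ViaFloatTrick(x) == static_cast<uint32_t>(std::countl_zero(x)));
        }
    }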

Please test that I didn't break other games.

Test with both a pre-AVX2 CPU (e.g. a 3rd-gen Core i5) and an AVX2-capable CPU (e.g. a 4th-gen Core i7).
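
If it helps with that testing, here is a minimal scalar reference of the variable-shift semantics that all of the VShift paths (AVX-512, AVX2, and the fallbacks) are supposed to agree on. It is a hypothetical, self-contained restatement of the VShift<T> helper from the diff (C++17), not code from this patch:

    #include <cstdint>
    #include <type_traits>

    // Per-element variable shift: the low byte of y is a signed shift amount;
    // negative shifts go right (arithmetic for signed T), and out-of-range
    // amounts saturate to 0 or to all copies of the sign bit.
    template<typename T>
    constexpr T RefVShift(T x, T y) {
        const auto shift = static_cast<int8_t>(static_cast<uint8_t>(y));
        const int bits = static_cast<int>(sizeof(T) * 8);
        if constexpr (std::is_signed_v<T>) {
            if (shift >= bits)
                return 0;
            if (shift <= -bits)
                return static_cast<T>(x >> (bits - 1));  // all sign bits
        } else if (shift <= -bits || shift >= bits) {
            return 0;
        }
        if (shift < 0)
            return static_cast<T>(x >> static_cast<T>(-shift));
        using U = std::make_unsigned_t<T>;
        return static_cast<T>(static_cast<U>(x) << static_cast<U>(shift));
    }

    static_assert(RefVShift<uint32_t>(0x80u, uint32_t(-1)) == 0x40u);  // a shift of -1 is a right shift by 1
    static_assert(RefVShift<uint32_t>(1u, 40u) == 0u);                  // amounts >= element width give 0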

Signed-off-by: lizzie <lizzie@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3297
Reviewed-by: DraVee <dravee@eden-emu.dev>
Reviewed-by: MaranBr <maranbr@eden-emu.dev>
Co-authored-by: lizzie <lizzie@eden-emu.dev>
Co-committed-by: lizzie <lizzie@eden-emu.dev>
Branch: lizzie/stack-fibers-vector
Authored by lizzie 3 days ago, committed by crueter
commit 91596c6d1b
1 changed file: src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp (429 changed lines)

@@ -569,32 +569,24 @@ void EmitX64::EmitVectorArithmeticShiftRight64(EmitContext& ctx, IR::Inst* inst)
ctx.reg_alloc.DefineValue(code, inst, result);
}
template<typename T>
static constexpr T VShift(T x, T y) {
const s8 shift_amount = static_cast<s8>(static_cast<u8>(y));
const s64 bit_size = static_cast<s64>(mcl::bitsizeof<T>);
if constexpr (std::is_signed_v<T>) {
if (shift_amount >= bit_size) {
template<typename T> constexpr T VShift(T x, T y) {
s8 const shift_amount = s8(u8(y));
s64 const bit_size = s64(mcl::bitsizeof<T>);
if (std::is_signed_v<T>) {
if (shift_amount >= bit_size)
return 0;
}
if (shift_amount <= -bit_size) {
// Parentheses necessary, as MSVC doesn't appear to consider cast parentheses
// as a grouping in terms of precedence, causing warning C4554 to fire. See:
// https://developercommunity.visualstudio.com/content/problem/144783/msvc-2017-does-not-understand-that-static-cast-cou.html
// Parentheses necessary, as MSVC doesn't appear to consider cast parentheses
// as a grouping in terms of precedence, causing warning C4554 to fire. See:
// https://developercommunity.visualstudio.com/content/problem/144783/msvc-2017-does-not-understand-that-static-cast-cou.html
if (shift_amount <= -bit_size)
return x >> (T(bit_size - 1));
}
} else if (shift_amount <= -bit_size || shift_amount >= bit_size) {
return 0;
}
if (shift_amount < 0) {
if (shift_amount < 0)
return x >> T(-shift_amount);
}
using unsigned_type = std::make_unsigned_t<T>;
return static_cast<T>(static_cast<unsigned_type>(x) << static_cast<unsigned_type>(shift_amount));
return T(unsigned_type(x) << unsigned_type(shift_amount));
}
void EmitX64::EmitVectorArithmeticVShift8(EmitContext& ctx, IR::Inst* inst) {
@@ -606,145 +598,83 @@ void EmitX64::EmitVectorArithmeticVShift8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorArithmeticVShift16(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]);
const Xbyak::Xmm right_shift = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
auto const left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]);
auto const right_shift = ctx.reg_alloc.ScratchXmm(code);
auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.vmovdqa32(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
code.vpxord(right_shift, right_shift, right_shift);
code.vpsubw(right_shift, right_shift, left_shift);
code.vpsllw(xmm0, left_shift, 8);
code.vpsraw(xmm0, xmm0, 15);
const Xbyak::Opmask mask = k1;
code.vpmovb2m(mask, xmm0);
code.vpmovb2m(k1, xmm0);
code.vpandd(right_shift, right_shift, tmp);
code.vpandd(left_shift, left_shift, tmp);
code.vpsravw(tmp, result, right_shift);
code.vpsllvw(result, result, left_shift);
code.vpblendmb(result | mask, result, tmp);
code.vpblendmb(result | k1, result, tmp);
ctx.reg_alloc.DefineValue(code, inst, result);
return;
} else {
EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s16>& result, const VectorArray<s16>& a, const VectorArray<s16>& b) {
std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<s16>);
});
}
EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s16>& result, const VectorArray<s16>& a, const VectorArray<s16>& b) {
std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<s16>);
});
}
void EmitX64::EmitVectorArithmeticVShift32(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::AVX2)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
// store sign bit of lowest byte of each element of b to select left/right shift later
code.vpslld(xmm0, b, 24);
// sse/avx shifts are only positive, with dedicated left/right forms - shift by lowest byte of abs(b)
code.vpabsb(b, b);
code.vpand(b, b, code.BConst<32>(xword, 0xFF));
// calculate shifts
code.vpsllvd(result, a, b);
code.vpsravd(a, a, b);
code.blendvps(result, a); // implicit argument: xmm0 (sign of lowest byte of b)
ctx.reg_alloc.DefineValue(code, inst, result);
return;
auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
code.vpabsb(tmp2, tmp1);
code.vpslld(tmp1, tmp1, 24);
code.vpand(tmp2, tmp2, code.Const(xword, 0x000000FF000000FF, 0x000000FF000000FF));
code.vpsravd(tmp3, tmp0, tmp2);
code.vpsllvd(tmp0, tmp0, tmp2);
code.vblendvps(tmp0, tmp0, tmp3, tmp1);
ctx.reg_alloc.DefineValue(code, inst, tmp0);
} else {
EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s32>& result, const VectorArray<s32>& a, const VectorArray<s32>& b) {
std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<s32>);
});
}
EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s32>& result, const VectorArray<s32>& a, const VectorArray<s32>& b) {
std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<s32>);
});
}
void EmitX64::EmitVectorArithmeticVShift64(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]);
const Xbyak::Xmm right_shift = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
auto const left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]);
auto const right_shift = ctx.reg_alloc.ScratchXmm(code);
auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.vmovdqa32(tmp, code.Const(xword, 0x00000000000000FF, 0x00000000000000FF));
code.vpxorq(right_shift, right_shift, right_shift);
code.vpsubq(right_shift, right_shift, left_shift);
code.vpsllq(xmm0, left_shift, 56);
const Xbyak::Opmask mask = k1;
code.vpmovq2m(mask, xmm0);
code.vpmovq2m(k1, xmm0);
code.vpandq(right_shift, right_shift, tmp);
code.vpandq(left_shift, left_shift, tmp);
code.vpsravq(tmp, result, right_shift);
code.vpsllvq(result, result, left_shift);
code.vpblendmq(result | mask, result, tmp);
code.vpblendmq(result | k1, result, tmp);
ctx.reg_alloc.DefineValue(code, inst, result);
return;
}
if (code.HasHostFeature(HostFeature::AVX2)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm negative_mask = ctx.reg_alloc.ScratchXmm(code);
// negative_mask = a < 0 ? 1s : 0s
code.vpxor(xmm0, xmm0, xmm0);
code.vpcmpgtq(negative_mask, xmm0, a);
// store sign bit of lowest byte of each element of b to select left/right shift later
code.vpsllq(xmm0, b, 56);
// sse/avx shifts are only positive, with dedicated left/right forms - shift by lowest byte of abs(b)
code.vpabsb(b, b);
code.vpand(b, b, code.BConst<64>(xword, 0xFF));
// calculate shifts
code.vpsllvq(result, a, b);
// implement variable arithmetic shift in terms of logical shift
// if a is negative, invert it, shift in leading 0s, then invert it again - noop if positive
code.vpxor(a, a, negative_mask);
code.vpsrlvq(a, a, b);
code.vpxor(a, a, negative_mask);
code.blendvpd(result, a); // implicit argument: xmm0 (sign of lowest byte of b)
ctx.reg_alloc.DefineValue(code, inst, result);
return;
} else {
EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s64>& result, const VectorArray<s64>& a, const VectorArray<s64>& b) {
std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<s64>);
});
}
EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s64>& result, const VectorArray<s64>& a, const VectorArray<s64>& b) {
std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<s64>);
});
}
void EmitX64::EmitVectorBroadcastLower8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::AVX2)) {
code.vpbroadcastb(a, a);
code.vmovq(a, a);
} else if (code.HasHostFeature(HostFeature::SSSE3)) {
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
code.pxor(tmp, tmp);
code.pshufb(a, tmp);
code.movq(a, a);
@@ -752,7 +682,6 @@ void EmitX64::EmitVectorBroadcastLower8(EmitContext& ctx, IR::Inst* inst) {
code.punpcklbw(a, a);
code.pshuflw(a, a, 0);
}
ctx.reg_alloc.DefineValue(code, inst, a);
}
@@ -777,12 +706,10 @@ void EmitX64::EmitVectorBroadcastLower32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorBroadcast8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::AVX2)) {
code.vpbroadcastb(a, a);
} else if (code.HasHostFeature(HostFeature::SSSE3)) {
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
code.pxor(tmp, tmp);
code.pshufb(a, tmp);
} else {
@@ -790,47 +717,40 @@ void EmitX64::EmitVectorBroadcast8(EmitContext& ctx, IR::Inst* inst) {
code.pshuflw(a, a, 0);
code.punpcklqdq(a, a);
}
ctx.reg_alloc.DefineValue(code, inst, a);
}
void EmitX64::EmitVectorBroadcast16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::AVX2)) {
code.vpbroadcastw(a, a);
} else {
code.pshuflw(a, a, 0);
code.punpcklqdq(a, a);
}
ctx.reg_alloc.DefineValue(code, inst, a);
}
void EmitX64::EmitVectorBroadcast32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::AVX2)) {
code.vpbroadcastd(a, a);
} else {
code.pshufd(a, a, 0);
}
ctx.reg_alloc.DefineValue(code, inst, a);
}
void EmitX64::EmitVectorBroadcast64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::AVX2)) {
code.vpbroadcastq(a, a);
} else {
code.punpcklqdq(a, a);
}
ctx.reg_alloc.DefineValue(code, inst, a);
}
@@ -840,17 +760,14 @@ void EmitX64::EmitVectorBroadcastElementLower8(EmitContext& ctx, IR::Inst* inst)
ASSERT(args[1].IsImmediate());
const u8 index = args[1].GetImmediateU8();
ASSERT(index < 16);
if (index > 0) {
code.psrldq(a, index);
}
if (code.HasHostFeature(HostFeature::AVX2)) {
code.vpbroadcastb(a, a);
code.vmovq(a, a);
} else if (code.HasHostFeature(HostFeature::SSSE3)) {
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
code.pxor(tmp, tmp);
code.pshufb(a, tmp);
code.movq(a, a);
@@ -858,7 +775,6 @@ void EmitX64::EmitVectorBroadcastElementLower8(EmitContext& ctx, IR::Inst* inst)
code.punpcklbw(a, a);
code.pshuflw(a, a, 0);
}
ctx.reg_alloc.DefineValue(code, inst, a);
}
@@ -868,13 +784,10 @@ void EmitX64::EmitVectorBroadcastElementLower16(EmitContext& ctx, IR::Inst* inst
ASSERT(args[1].IsImmediate());
const u8 index = args[1].GetImmediateU8();
ASSERT(index < 8);
if (index > 0) {
code.psrldq(a, u8(index * 2));
}
code.pshuflw(a, a, 0);
ctx.reg_alloc.DefineValue(code, inst, a);
}
@@ -900,11 +813,9 @@ void EmitX64::EmitVectorBroadcastElement8(EmitContext& ctx, IR::Inst* inst) {
ASSERT(args[1].IsImmediate());
const u8 index = args[1].GetImmediateU8();
ASSERT(index < 16);
if (index > 0) {
code.psrldq(a, index);
}
if (code.HasHostFeature(HostFeature::AVX2)) {
code.vpbroadcastb(a, a);
} else if (code.HasHostFeature(HostFeature::SSSE3)) {
@@ -926,22 +837,17 @@ void EmitX64::EmitVectorBroadcastElement16(EmitContext& ctx, IR::Inst* inst) {
ASSERT(args[1].IsImmediate());
const u8 index = args[1].GetImmediateU8();
ASSERT(index < 8);
if (index == 0 && code.HasHostFeature(HostFeature::AVX2)) {
code.vpbroadcastw(a, a);
ctx.reg_alloc.DefineValue(code, inst, a);
return;
}
if (index < 4) {
code.pshuflw(a, a, mcl::bit::replicate_element<2, u8>(index));
code.punpcklqdq(a, a);
} else {
code.pshufhw(a, a, mcl::bit::replicate_element<2, u8>(u8(index - 4)));
code.punpckhqdq(a, a);
if (index < 4) {
code.pshuflw(a, a, mcl::bit::replicate_element<2, u8>(index));
code.punpcklqdq(a, a);
} else {
code.pshufhw(a, a, mcl::bit::replicate_element<2, u8>(u8(index - 4)));
code.punpckhqdq(a, a);
}
}
ctx.reg_alloc.DefineValue(code, inst, a);
}
@@ -994,13 +900,10 @@ static void EmitVectorCountLeadingZeros(VectorArray<T>& result, const VectorArra
void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::GFNI)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
auto const result = ctx.reg_alloc.ScratchXmm(code);
// Reverse bits:
code.gf2p8affineqb(data, code.BConst<64>(xword, 0x8040201008040201), 0);
// Perform a tzcnt:
// Isolate lowest set bit
code.pcmpeqb(result, result);
@@ -1008,29 +911,22 @@ void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) {
code.pandn(result, data);
// Convert lowest set bit into an index
code.gf2p8affineqb(result, code.BConst<64>(xword, 0xaaccf0ff'00000000), 8);
ctx.reg_alloc.DefineValue(code, inst, result);
} else if (code.HasHostFeature(HostFeature::SSSE3)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(code);
auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp1, code.Const(xword, 0x0101010102020304, 0x0000000000000000));
code.movdqa(tmp2, tmp1);
code.pshufb(tmp2, data);
code.psrlw(data, 4);
code.pand(data, code.Const(xword, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F));
code.pshufb(tmp1, data);
code.movdqa(data, code.Const(xword, 0x0404040404040404, 0x0404040404040404));
code.pcmpeqb(data, tmp1);
code.pand(data, tmp2);
code.paddb(data, tmp1);
ctx.reg_alloc.DefineValue(code, inst, data);
} else {
EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros<u8>);
@@ -1040,12 +936,10 @@ void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::AVX)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
auto const result = ctx.reg_alloc.ScratchXmm(code);
auto const zeros = ctx.reg_alloc.ScratchXmm(code);
auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.vpsrlw(tmp, data, 1);
code.vpor(data, data, tmp);
code.vpsrlw(tmp, data, 2);
@@ -1065,16 +959,13 @@ void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) {
code.vpor(tmp, tmp, zeros);
code.vpor(data, data, tmp);
code.vpshufb(result, result, data);
ctx.reg_alloc.DefineValue(code, inst, result);
} else if (code.HasHostFeature(HostFeature::SSSE3)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
auto const result = ctx.reg_alloc.ScratchXmm(code);
auto const zeros = ctx.reg_alloc.ScratchXmm(code);
auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp, data);
code.psrlw(tmp, 1);
code.por(data, tmp);
@@ -1098,7 +989,6 @@ void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) {
code.por(tmp, zeros);
code.por(data, tmp);
code.pshufb(result, data);
ctx.reg_alloc.DefineValue(code, inst, result);
} else {
EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros<u16>);
@@ -1108,13 +998,13 @@ void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512CD)) {
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.vplzcntd(data, data);
ctx.reg_alloc.DefineValue(code, inst, data);
// See https://stackoverflow.com/questions/58823140/count-leading-zero-bits-for-each-element-in-avx2-vector-emulate-mm256-lzcnt-ep/58827596#58827596
} else if (code.HasHostFeature(HostFeature::AVX2)) {
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(code);
// See https://stackoverflow.com/questions/58823140/count-leading-zero-bits-for-each-element-in-avx2-vector-emulate-mm256-lzcnt-ep/58827596#58827596
auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
auto const temp = ctx.reg_alloc.ScratchXmm(code);
code.vmovdqa(temp, data);
code.vpsrld(data, data, 8);
code.vpandn(data, data, temp);
@@ -1125,7 +1015,24 @@ void EmitX64::EmitVectorCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) {
code.vpminsw(data, data, code.Const(xword, 0x0000002000000020, 0x0000002000000020));
ctx.reg_alloc.DefineValue(code, inst, data);
} else {
EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros<u32>);
// See https://stackoverflow.com/a/58829453
auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
code.pxor(tmp1, tmp1);
code.movdqa(tmp2, tmp0);
code.pcmpeqd(tmp1, tmp0);
code.psrld(tmp0, 1);
code.psrld(tmp2, 2);
code.pandn(tmp2, tmp0);
code.cvtdq2ps(tmp0, tmp2);
code.addps(tmp0, tmp0);
code.addps(tmp0, code.Const(xword, 0x3f8000003f800000, 0x3f8000003f800000));
code.psrld(tmp0, 23);
code.paddd(tmp1, tmp0);
code.movdqa(tmp0, code.Const(xword, 0x0000009E0000009E, 0x0000009E0000009E));
code.psubd(tmp0, tmp1);
ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
}
@@ -1892,14 +1799,12 @@ static void EmitVectorLogicalVShiftAVX2(BlockOfCode& code, EmitContext& ctx, IR:
ICODE(vpsrlv)(a, a, b);
// implicit argument: xmm0 (sign of lowest byte of b)
if constexpr (esize == 32) {
if (esize == 32) {
code.blendvps(result, a);
} else {
code.blendvpd(result, a);
}
ctx.reg_alloc.DefineValue(code, inst, result);
return;
}
void EmitX64::EmitVectorLogicalVShift8(EmitContext& ctx, IR::Inst* inst) {
@@ -1942,9 +1847,9 @@ void EmitX64::EmitVectorLogicalVShift8(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(code, inst, result);
} else {
EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u8>& a, const VectorArray<u8>& b) {
std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<u8>);
});
EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u8>& a, const VectorArray<u8>& b) {
std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<u8>);
});
}
}
@@ -1969,32 +1874,30 @@ void EmitX64::EmitVectorLogicalVShift16(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(code, inst, result);
} else {
EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u16>& a, const VectorArray<u16>& b) {
std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<u16>);
});
EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u16>& a, const VectorArray<u16>& b) {
std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<u16>);
});
}
}
void EmitX64::EmitVectorLogicalVShift32(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::AVX2)) {
EmitVectorLogicalVShiftAVX2<32>(code, ctx, inst);
return;
} else {
EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u32>& a, const VectorArray<u32>& b) {
std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<u32>);
});
}
EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u32>& a, const VectorArray<u32>& b) {
std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<u32>);
});
}
void EmitX64::EmitVectorLogicalVShift64(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::AVX2)) {
EmitVectorLogicalVShiftAVX2<64>(code, ctx, inst);
return;
} else {
EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& a, const VectorArray<u64>& b) {
std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<u64>);
});
}
EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& a, const VectorArray<u64>& b) {
std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<u64>);
});
}
namespace {
@@ -3731,25 +3634,21 @@ template<typename T, typename U>
static void RoundingShiftLeft(VectorArray<T>& out, const VectorArray<T>& lhs, const VectorArray<U>& rhs) {
using signed_type = std::make_signed_t<T>;
using unsigned_type = std::make_unsigned_t<T>;
constexpr auto bit_size = static_cast<s64>(mcl::bitsizeof<T>);
constexpr auto bit_size = s64(mcl::bitsizeof<T>);
for (size_t i = 0; i < out.size(); i++) {
const s64 extended_shift = static_cast<s64>(mcl::bit::sign_extend<8, u64>(rhs[i] & 0xFF));
const s64 extended_shift = s64(mcl::bit::sign_extend<8, u64>(rhs[i] & 0xFF));
if (extended_shift >= 0) {
if (extended_shift >= bit_size) {
out[i] = 0;
} else {
out[i] = static_cast<T>(static_cast<unsigned_type>(lhs[i]) << extended_shift);
out[i] = T(unsigned_type(lhs[i]) << extended_shift);
}
} else {
if ((std::is_unsigned_v<T> && extended_shift < -bit_size) || (std::is_signed_v<T> && extended_shift <= -bit_size)) {
out[i] = 0;
} else {
const s64 shift_value = -extended_shift - 1;
const T shifted = (lhs[i] & (static_cast<signed_type>(1) << shift_value)) >> shift_value;
const T shifted = (lhs[i] & (signed_type(1) << shift_value)) >> shift_value;
if (extended_shift == -bit_size) {
out[i] = shifted;
} else {
@@ -3810,7 +3709,6 @@ static void EmitUnsignedRoundingShiftLeft(BlockOfCode& code, EmitContext& ctx, I
}
ctx.reg_alloc.DefineValue(code, inst, left_shift);
return;
}
void EmitX64::EmitVectorRoundingShiftLeftS8(EmitContext& ctx, IR::Inst* inst) {
@@ -3852,23 +3750,21 @@ void EmitX64::EmitVectorRoundingShiftLeftU16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorRoundingShiftLeftU32(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::AVX2)) {
EmitUnsignedRoundingShiftLeft<32>(code, ctx, inst);
return;
} else {
EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u32>& lhs, const VectorArray<s32>& rhs) {
RoundingShiftLeft(result, lhs, rhs);
});
}
EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u32>& lhs, const VectorArray<s32>& rhs) {
RoundingShiftLeft(result, lhs, rhs);
});
}
void EmitX64::EmitVectorRoundingShiftLeftU64(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::AVX2)) {
EmitUnsignedRoundingShiftLeft<64>(code, ctx, inst);
return;
} else {
EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& lhs, const VectorArray<s64>& rhs) {
RoundingShiftLeft(result, lhs, rhs);
});
}
EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& lhs, const VectorArray<s64>& rhs) {
RoundingShiftLeft(result, lhs, rhs);
});
}
void EmitX64::EmitVectorSignExtend8(EmitContext& ctx, IR::Inst* inst) {
@@ -5270,10 +5166,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
code.vmovdqu8(defaults | k2, indicies);
ctx.reg_alloc.DefineValue(code, inst, defaults);
}
return;
}
if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 3) {
} else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 3) {
const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
code.vpcmpub(k1, indicies, code.BConst<8>(xword, 2 * 16), CmpInt::LessThan);
@@ -5302,10 +5195,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
code.vmovdqu8(defaults | k2, indicies);
ctx.reg_alloc.DefineValue(code, inst, defaults);
}
return;
}
if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 2) {
} else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 2) {
const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(code, table[1]);
@@ -5321,15 +5211,10 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
code.vmovdqu8(result | k1, indicies);
ctx.reg_alloc.DefineValue(code, inst, result);
}
return;
}
if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 1) {
} else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 1) {
const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
code.vpcmpub(k1, indicies, code.BConst<8>(xword, 1 * 16), CmpInt::LessThan);
if (is_defaults_zero) {
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
code.vpermb(result | k1 | T_z, indicies, xmm_table0);
@@ -5339,10 +5224,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
code.vpermb(result | k1, indicies, xmm_table0);
ctx.reg_alloc.DefineValue(code, inst, result);
}
return;
}
if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 1) {
} else if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 1) {
const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
@@ -5350,10 +5232,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
code.pshufb(xmm_table0, indicies);
ctx.reg_alloc.DefineValue(code, inst, xmm_table0);
return;
}
if (code.HasHostFeature(HostFeature::SSE41) && table_size == 1) {
} else if (code.HasHostFeature(HostFeature::SSE41) && table_size == 1) {
const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
@@ -5368,10 +5247,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
code.pblendvb(xmm_table0, defaults);
ctx.reg_alloc.DefineValue(code, inst, xmm_table0);
return;
}
if (code.HasHostFeature(HostFeature::SSE41) && is_defaults_zero && table_size == 2) {
} else if (code.HasHostFeature(HostFeature::SSE41) && is_defaults_zero && table_size == 2) {
const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[1]);
@@ -5389,9 +5265,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(code, inst, xmm_table0);
return;
}
if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) {
} else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) {
const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm masked = ctx.reg_alloc.ScratchXmm(code);
@@ -5415,10 +5289,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
}
ctx.reg_alloc.DefineValue(code, inst, result);
return;
}
if (code.HasHostFeature(HostFeature::SSE41)) {
} else if (code.HasHostFeature(HostFeature::SSE41)) {
const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm masked = ctx.reg_alloc.ScratchXmm(code);
@@ -5447,31 +5318,26 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
}
ctx.reg_alloc.DefineValue(code, inst, result);
return;
}
const u32 stack_space = static_cast<u32>((table_size + 2) * 16);
ctx.reg_alloc.AllocStackSpace(code, stack_space + ABI_SHADOW_SPACE);
for (size_t i = 0; i < table_size; ++i) {
const Xbyak::Xmm table_value = ctx.reg_alloc.UseXmm(code, table[i]);
code.movaps(xword[rsp + ABI_SHADOW_SPACE + i * 16], table_value);
ctx.reg_alloc.Release(table_value);
}
const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
ctx.reg_alloc.EndOfAllocScope();
ctx.reg_alloc.HostCall(code, nullptr);
code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE]);
code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + (table_size + 0) * 16]);
code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + (table_size + 1) * 16]);
code.mov(code.ABI_PARAM4.cvt32(), table_size);
code.movaps(xword[code.ABI_PARAM2], defaults);
code.movaps(xword[code.ABI_PARAM3], indicies);
code.CallLambda(
[](const VectorArray<u8>* table, VectorArray<u8>& result, const VectorArray<u8>& indicies, size_t table_size) {
} else {
const u32 stack_space = static_cast<u32>((table_size + 2) * 16);
ctx.reg_alloc.AllocStackSpace(code, stack_space + ABI_SHADOW_SPACE);
for (size_t i = 0; i < table_size; ++i) {
const Xbyak::Xmm table_value = ctx.reg_alloc.UseXmm(code, table[i]);
code.movaps(xword[rsp + ABI_SHADOW_SPACE + i * 16], table_value);
ctx.reg_alloc.Release(table_value);
}
const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
ctx.reg_alloc.EndOfAllocScope();
ctx.reg_alloc.HostCall(code, nullptr);
code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE]);
code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + (table_size + 0) * 16]);
code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + (table_size + 1) * 16]);
code.mov(code.ABI_PARAM4.cvt32(), table_size);
code.movaps(xword[code.ABI_PARAM2], defaults);
code.movaps(xword[code.ABI_PARAM3], indicies);
code.CallLambda([](const VectorArray<u8>* table, VectorArray<u8>& result, const VectorArray<u8>& indicies, size_t table_size) {
for (size_t i = 0; i < result.size(); ++i) {
const size_t index = indicies[i] / table[0].size();
const size_t elem = indicies[i] % table[0].size();
@@ -5480,11 +5346,10 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
}
}
});
code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + (table_size + 0) * 16]);
ctx.reg_alloc.ReleaseStackSpace(code, stack_space + ABI_SHADOW_SPACE);
ctx.reg_alloc.DefineValue(code, inst, result);
code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + (table_size + 0) * 16]);
ctx.reg_alloc.ReleaseStackSpace(code, stack_space + ABI_SHADOW_SPACE);
ctx.reg_alloc.DefineValue(code, inst, result);
}
}
void EmitX64::EmitVectorTranspose8(EmitContext& ctx, IR::Inst* inst) {
