@@ -598,29 +598,21 @@ void EmitX64::EmitVectorArithmeticVShift8(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorArithmeticVShift16(EmitContext& ctx, IR::Inst* inst) {
     if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) {
         auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-        const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-        const Xbyak::Xmm right_shift = ctx.reg_alloc.ScratchXmm(code);
-        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
-
+        auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const right_shift = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
         code.vmovdqa32(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
         code.vpxord(right_shift, right_shift, right_shift);
         code.vpsubw(right_shift, right_shift, left_shift);
-
         code.vpsllw(xmm0, left_shift, 8);
         code.vpsraw(xmm0, xmm0, 15);
-        const Xbyak::Opmask mask = k1;
-        code.vpmovb2m(mask, xmm0);
-
+        code.vpmovb2m(k1, xmm0);
         code.vpandd(right_shift, right_shift, tmp);
         code.vpandd(left_shift, left_shift, tmp);
-
         code.vpsravw(tmp, result, right_shift);
         code.vpsllvw(result, result, left_shift);
-        code.vpblendmb(result | mask, result, tmp);
-
+        code.vpblendmb(result | k1, result, tmp);
         ctx.reg_alloc.DefineValue(code, inst, result);
     } else {
         EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s16>& result, const VectorArray<s16>& a, const VectorArray<s16>& b) {
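A minimal scalar sketch of the per-lane behaviour the AVX-512 path above implements (illustrative only, not dynarmic code): the shift count is the signed low byte of each lane of the second operand, positive counts shift left, negative counts shift right arithmetically, and oversized counts behave like the saturating counts of the vpsllvw/vpsravw instructions used above.

#include <cstdint>

// One s16 lane of an arithmetic variable shift (sketch).
static int16_t arithmetic_vshift16(int16_t value, int16_t shift_operand) {
    const int8_t shift = static_cast<int8_t>(shift_operand & 0xFF);  // low byte only
    if (shift >= 16) {
        return 0;                                   // all bits shifted out
    }
    if (shift <= -16) {
        return value < 0 ? -1 : 0;                  // only the sign remains
    }
    if (shift >= 0) {
        return static_cast<int16_t>(static_cast<uint16_t>(value) << shift);
    }
    return static_cast<int16_t>(value >> -shift);   // arithmetic right shift
}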
@@ -632,49 +624,18 @@ void EmitX64::EmitVectorArithmeticVShift32(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorArithmeticVShift32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     if (code.HasHostFeature(HostFeature::AVX2)) {
-        // auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        // auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-        // auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
-        // auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
-        // auto const result = ctx.reg_alloc.ScratchXmm(code);
-        // code.vpabsd(tmp3, tmp1);
-        // code.vpsllvd(tmp2, tmp0, tmp1);
-        // code.vpsravd(tmp0, tmp0, tmp3);
-        // code.blendvps(result, tmp0, tmp2, tmp1);
-        // ctx.reg_alloc.DefineValue(code, inst, result);
-        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-        auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-        auto const result = ctx.reg_alloc.ScratchXmm(code);
-        // store sign bit of lowest byte of each element of b to select left/right shift later
-        code.vpslld(xmm0, b, 24);
-        // sse/avx shifts are only positive, with dedicated left/right forms - shift by lowest byte of abs(b)
-        code.vpabsb(b, b);
-        code.vpand(b, b, code.BConst<32>(xword, 0xFF));
-        // calculate shifts
-        code.vpsllvd(result, a, b);
-        code.vpsravd(a, a, b);
-        code.blendvps(result, a); // implicit argument: xmm0 (sign of lowest byte of b)
-        ctx.reg_alloc.DefineValue(code, inst, result);
+        auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+        code.vpabsb(tmp2, tmp1);
+        code.vpslld(tmp1, tmp1, 24);
+        code.vpand(tmp2, tmp2, code.Const(xword, 0x000000FF000000FF, 0x000000FF000000FF));
+        code.vpsravd(tmp3, tmp0, tmp2);
+        code.vpsllvd(tmp0, tmp0, tmp2);
+        code.vblendvps(tmp0, tmp0, tmp3, tmp1);
+        ctx.reg_alloc.DefineValue(code, inst, tmp0);
     } else {
-        /*
-        template<typename T = s32>
-        ve::vector vshift(ve::vector x, ve::vector y) {
-            auto const bit_size_scalar = sizeof(T) * 8;
-            auto const shift_amount = (y << (bit_size_scalar - 8)) >> (bit_size_scalar - 8);
-            auto const bit_size = ve::vector::broadcast(bit_size_scalar);
-
-            using unsigned_type = std::make_unsigned_t<T>;
-            auto const m0 = shift_amount <= -bit_size;
-            auto const m1 = shift_amount > -bit_size
-                && shift_amount < bit_size && shift_amount < ve::vector::zeros();
-            auto const m2 = shift_amount > -bit_size
-                && shift_amount < bit_size && shift_amount >= ve::vector::zeros();
-            return ve::select(m0, x >> (bit_size_scalar - 1),
-                ve::select(m1, x >> -shift_amount,
-                    ve::select(m2, x << shift_amount, ve::vector::zeros())));
-        }
-        */
         auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
         auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
         auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
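Both AVX2 variants of VShift32 in this hunk follow the same idea: shift the value both ways by the low byte of abs(b), then pick one of the two results with a blend keyed on the sign of b's low byte (blendvps selects on bit 31 of each lane). A hedged intrinsics sketch of that technique, assuming an AVX2 target; the function and variable names are illustrative:

#include <immintrin.h>

// Sketch only: per-lane arithmetic variable shift for four s32 lanes.
static __m128i arithmetic_vshift32_avx2(__m128i a, __m128i b) {
    const __m128i select = _mm_slli_epi32(b, 24);                        // sign of low byte -> bit 31
    const __m128i amount = _mm_and_si128(_mm_abs_epi8(b), _mm_set1_epi32(0xFF));
    const __m128i left   = _mm_sllv_epi32(a, amount);                    // used when b >= 0
    const __m128i right  = _mm_srav_epi32(a, amount);                    // used when b < 0
    return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(left),
                                          _mm_castsi128_ps(right),
                                          _mm_castsi128_ps(select)));
}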
@@ -682,126 +643,115 @@ void EmitX64::EmitVectorArithmeticVShift32(EmitContext& ctx, IR::Inst* inst) {
         auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
         auto const tmp5 = ctx.reg_alloc.ScratchXmm(code);
         auto const tmp6 = ctx.reg_alloc.ScratchXmm(code);
-        code.pxor(tmp2, tmp2);
-        code.movdqa(tmp4, tmp1);
-        code.movdqa(tmp3, tmp1);
-        code.movdqa(tmp5, tmp0);
-        code.pshufd(tmp6, tmp0, 245);
-        code.pcmpgtd(tmp2, tmp1);
-        code.pslld(tmp1, 23);
-        code.psrad(tmp4, 31);
-        code.paddd(tmp1, code.Const(xword, 0x3F8000003F800000, 0x3F8000003F800000));
-        code.pxor(tmp3, tmp4);
-        code.psubd(tmp3, tmp4);
-        code.movdqa(tmp4, tmp0);
-        code.cvttps2dq(tmp1, tmp1);
-        code.pmuludq(tmp5, tmp1);
-        code.pshufd(tmp1, tmp1, 245);
-        code.pmuludq(tmp1, tmp6);
-        code.pshufd(tmp5, tmp5, 232);
-        code.movdqa(tmp6, tmp0);
-        code.pshufd(tmp1, tmp1, 232);
-        code.punpckldq(tmp5, tmp1);
-        code.pshuflw(tmp1, tmp3, 254);
-        code.psrad(tmp4, tmp1);
-        code.pshuflw(tmp1, tmp3, 84);
-        code.pand(tmp5, tmp2);
-        code.psrad(tmp6, tmp1);
-        code.pshufd(tmp1, tmp3, 238);
-        code.punpcklqdq(tmp6, tmp4);
-        code.pshuflw(tmp3, tmp1, 254);
-        code.movdqa(tmp4, tmp0);
-        code.pshuflw(tmp1, tmp1, 84);
-        code.psrad(tmp4, tmp3);
-        code.psrad(tmp0, tmp1);
-        code.punpckhqdq(tmp0, tmp4);
-        code.shufps(tmp6, tmp0, 204);
-        code.pandn(tmp2, tmp6);
-        code.por(tmp2, tmp5);
-        ctx.reg_alloc.DefineValue(code, inst, tmp2);
+        code.pxor(tmp3, tmp3);
+        code.movdqa(tmp2, tmp0);
+        code.psubb(tmp3, tmp1);
+        code.movdqa(tmp4, tmp2);
+        code.movdqa(tmp6, tmp2);
+        code.pminub(tmp3, tmp1);
+        code.pslld(tmp1, 24);
+        code.pand(tmp3, code.Const(xword, 0x000000FF000000FF, 0x000000FF000000FF));
+        code.psrad(tmp1, 31);
+        code.pshuflw(tmp0, tmp3, 254);
+        code.pshuflw(tmp5, tmp3, 84);
+        code.psrad(tmp4, tmp0);
+        code.movdqa(tmp0, tmp2);
+        code.psrad(tmp0, tmp5);
+        code.punpcklqdq(tmp0, tmp4);
+        code.pshufd(tmp4, tmp3, 238);
+        code.pslld(tmp3, 23);
+        code.paddd(tmp3, code.Const(xword, 0x3F800000'3F800000, 0x3F800000'3F800000));
+        code.pshuflw(tmp5, tmp4, 254);
+        code.pshuflw(tmp4, tmp4, 84);
+        code.psrad(tmp6, tmp5);
+        code.movdqa(tmp5, tmp2);
+        code.psrad(tmp5, tmp4);
+        code.pshufd(tmp4, tmp2, 245);
+        code.punpckhqdq(tmp5, tmp6);
+        code.cvttps2dq(tmp3, tmp3);
+        code.shufps(tmp0, tmp5, 204);
+        code.pmuludq(tmp2, tmp3);
+        code.pshufd(tmp3, tmp3, 245);
+        code.andps(tmp0, tmp1);
+        code.pmuludq(tmp3, tmp4);
+        code.pshufd(tmp2, tmp2, 232);
+        code.pshufd(tmp3, tmp3, 232);
+        code.punpckldq(tmp2, tmp3);
+        code.pandn(tmp1, tmp2);
+        code.orps(tmp0, tmp1);
+        ctx.reg_alloc.DefineValue(code, inst, tmp0);
     }
 }
 
 void EmitX64::EmitVectorArithmeticVShift64(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
-        const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-        const Xbyak::Xmm right_shift = ctx.reg_alloc.ScratchXmm(code);
-        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+        auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const right_shift = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
         code.vmovdqa32(tmp, code.Const(xword, 0x00000000000000FF, 0x00000000000000FF));
         code.vpxorq(right_shift, right_shift, right_shift);
         code.vpsubq(right_shift, right_shift, left_shift);
         code.vpsllq(xmm0, left_shift, 56);
         code.vpmovq2m(k1, xmm0);
         code.vpandq(right_shift, right_shift, tmp);
         code.vpandq(left_shift, left_shift, tmp);
         code.vpsravq(tmp, result, right_shift);
         code.vpsllvq(result, result, left_shift);
         code.vpblendmq(result | k1, result, tmp);
         ctx.reg_alloc.DefineValue(code, inst, result);
     } else if (code.HasHostFeature(HostFeature::AVX2)) {
-        const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
-        const Xbyak::Xmm negative_mask = ctx.reg_alloc.ScratchXmm(code);
-
-        // negative_mask = a < 0 ? 1s : 0s
-        code.vpxor(xmm0, xmm0, xmm0);
-        code.vpcmpgtq(negative_mask, xmm0, a);
-
-        // store sign bit of lowest byte of each element of b to select left/right shift later
-        code.vpsllq(xmm0, b, 56);
-
-        // sse/avx shifts are only positive, with dedicated left/right forms - shift by lowest byte of abs(b)
-        code.vpabsb(b, b);
-        code.vpand(b, b, code.BConst<64>(xword, 0xFF));
-
-        // calculate shifts
-        code.vpsllvq(result, a, b);
-
-        // implement variable arithmetic shift in terms of logical shift
-        // if a is negative, invert it, shift in leading 0s, then invert it again - noop if positive
-        code.vpxor(a, a, negative_mask);
-        code.vpsrlvq(a, a, b);
-        code.vpxor(a, a, negative_mask);
-
-        code.blendvpd(result, a); // implicit argument: xmm0 (sign of lowest byte of b)
-        ctx.reg_alloc.DefineValue(code, inst, result);
+        auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
+        code.vpabsb(tmp2, tmp1);
+        code.vpxor(tmp3, tmp3, tmp3);
+        code.vpsllq(tmp1, tmp1, 56);
+        code.vpand(tmp2, tmp2, code.Const(xword, 255, 255));
+        code.vpcmpgtq(tmp3, tmp3, tmp0);
+        code.vpsllvq(tmp4, tmp0, tmp2);
+        code.vpxor(tmp0, tmp3, tmp0);
+        code.vpsrlvq(tmp0, tmp0, tmp2);
+        code.vpxor(tmp0, tmp0, tmp3);
+        code.vblendvpd(tmp0, tmp4, tmp0, tmp1);
+        ctx.reg_alloc.DefineValue(code, inst, tmp0);
     } else {
         auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
         auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
         auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
-        auto const tmp_ax = ctx.reg_alloc.ScratchGpr(code);
-        auto const tmp_cx = ctx.reg_alloc.ScratchGpr(code);
-        auto const tmp_dx = ctx.reg_alloc.ScratchGpr(code);
-        auto const tmp_si = ctx.reg_alloc.ScratchGpr(code);
-        auto const tmp_di = ctx.reg_alloc.ScratchGpr(code);
-        code.movq(tmp_ax, tmp0);
-        code.movq(tmp_cx, tmp1);
-        code.pshufd(tmp0, tmp1, 238);
-        code.mov(tmp_si, tmp_ax);
-        code.sar(tmp_ax, tmp_cx.cvt8());
-        code.movq(tmp_dx, tmp0);
-        code.shl(tmp_si, tmp_cx.cvt8());
-        code.test(tmp_cx, tmp_cx);
-        code.mov(tmp_di, tmp_ax);
-        code.cmovg(tmp_di, tmp_si);
-        code.test(tmp_dx, tmp_dx);
-        code.cmovg(tmp_ax, tmp_si);
-        code.movq(tmp0, tmp_di);
-        code.mov(tmp_dx.cvt32(), tmp_ax.cvt32());
-        code.shr(tmp_ax, 32);
-        code.movd(tmp1, tmp_dx.cvt32());
-        code.movd(tmp2, tmp_ax.cvt32());
-        code.punpcklqdq(tmp2, tmp1);
-        code.shufps(tmp0, tmp2, 36);
+        auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp5 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp6 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp7 = ctx.reg_alloc.ScratchXmm(code);
+        code.pxor(tmp3, tmp3);
+        code.pshufd(tmp4, tmp0, 245);
+        code.movdqa(tmp5, tmp0);
+        code.psubb(tmp3, tmp1);
+        code.psrad(tmp4, 31);
+        code.pminub(tmp3, tmp1);
+        code.movdqa(tmp2, tmp4);
+        code.psllq(tmp1, 56);
+        code.pand(tmp3, code.Const(xword, 255, 255));
+        code.pxor(tmp2, tmp0);
+        code.pshufd(tmp1, tmp1, 245);
+        code.movdqa(tmp7, tmp2);
+        code.psrad(tmp1, 31);
+        code.pshufd(tmp6, tmp3, 238);
+        code.psrlq(tmp7, tmp3);
+        code.psllq(tmp5, tmp3);
+        code.psrlq(tmp2, tmp6);
+        code.psllq(tmp0, tmp6);
+        code.movsd(tmp2, tmp7);
+        code.movsd(tmp0, tmp5);
+        code.xorpd(tmp2, tmp4);
+        code.andpd(tmp2, tmp1);
+        code.andnpd(tmp1, tmp0);
+        code.orpd(tmp2, tmp1);
+        code.movapd(tmp0, tmp2);
         ctx.reg_alloc.DefineValue(code, inst, tmp0);
     }
 }
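The comments removed above spell out the trick the AVX2 and SSE2 paths still rely on for the 64-bit case: there is no variable arithmetic right shift below AVX-512 (no vpsravq), so it is built from a logical shift by conditionally complementing the value on both sides of the shift. A scalar sketch of that identity, with illustrative names:

#include <cstdint>

// Arithmetic right shift built from a logical one (sketch).
// For negative values, XOR with all-ones complements the value, the logical
// shift then pulls in zeros, and the second XOR turns those zeros back into
// copies of the sign bit. For non-negative values the mask is zero and both
// XORs are no-ops.
static int64_t arithmetic_shift_right64(int64_t value, unsigned amount) {
    const uint64_t negative_mask = value < 0 ? ~uint64_t{0} : 0;   // vpcmpgtq(0, a)
    uint64_t bits = static_cast<uint64_t>(value) ^ negative_mask;  // first vpxor
    bits = amount < 64 ? bits >> amount : 0;                       // vpsrlvq
    return static_cast<int64_t>(bits ^ negative_mask);             // second vpxor
}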
@@ -1039,13 +989,10 @@ static void EmitVectorCountLeadingZeros(VectorArray<T>& result, const VectorArra
 void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) {
     if (code.HasHostFeature(HostFeature::GFNI)) {
         auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-        const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
-
+        auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const result = ctx.reg_alloc.ScratchXmm(code);
         // Reverse bits:
         code.gf2p8affineqb(data, code.BConst<64>(xword, 0x8040201008040201), 0);
-
         // Perform a tzcnt:
         // Isolate lowest set bit
         code.pcmpeqb(result, result);
@@ -1053,29 +1000,22 @@ void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) {
         code.pandn(result, data);
         // Convert lowest set bit into an index
         code.gf2p8affineqb(result, code.BConst<64>(xword, 0xaaccf0ff'00000000), 8);
-
         ctx.reg_alloc.DefineValue(code, inst, result);
     } else if (code.HasHostFeature(HostFeature::SSSE3)) {
         auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-        const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code);
-        const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(code);
-
+        auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
         code.movdqa(tmp1, code.Const(xword, 0x0101010102020304, 0x0000000000000000));
         code.movdqa(tmp2, tmp1);
-
         code.pshufb(tmp2, data);
         code.psrlw(data, 4);
         code.pand(data, code.Const(xword, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F));
         code.pshufb(tmp1, data);
-
         code.movdqa(data, code.Const(xword, 0x0404040404040404, 0x0404040404040404));
-
         code.pcmpeqb(data, tmp1);
         code.pand(data, tmp2);
         code.paddb(data, tmp1);
-
         ctx.reg_alloc.DefineValue(code, inst, data);
     } else {
         EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros<u8>);
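The GFNI path above computes a per-byte clz as a tzcnt of the bit-reversed byte: the first gf2p8affineqb reverses the bits, pcmpeqb/paddb/pandn isolate the lowest set bit (x & ~(x - 1)), and the second gf2p8affineqb maps that single-bit value to its index. A scalar sketch of the same idea (illustrative; loops stand in for the affine matrices):

#include <cstdint>

// clz of one byte via bit reversal + trailing-zero count (sketch).
static int clz8(uint8_t x) {
    uint8_t reversed = 0;
    for (int i = 0; i < 8; ++i) {
        reversed = static_cast<uint8_t>((reversed << 1) | ((x >> i) & 1));
    }
    const uint8_t lowest = reversed & static_cast<uint8_t>(-reversed);  // isolate lowest set bit
    int index = 0;
    while (index < 8 && lowest != (1 << index)) {
        ++index;
    }
    return index;  // 8 when x == 0
}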
@@ -1085,12 +1025,10 @@ void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) {
     if (code.HasHostFeature(HostFeature::AVX)) {
         auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-        const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
-        const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code);
-        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
-
+        auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const result = ctx.reg_alloc.ScratchXmm(code);
+        auto const zeros = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
         code.vpsrlw(tmp, data, 1);
         code.vpor(data, data, tmp);
         code.vpsrlw(tmp, data, 2);
@@ -1110,16 +1048,13 @@ void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) {
         code.vpor(tmp, tmp, zeros);
         code.vpor(data, data, tmp);
         code.vpshufb(result, result, data);
-
         ctx.reg_alloc.DefineValue(code, inst, result);
     } else if (code.HasHostFeature(HostFeature::SSSE3)) {
         auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-        const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
-        const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code);
-        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
-
+        auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const result = ctx.reg_alloc.ScratchXmm(code);
+        auto const zeros = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
         code.movdqa(tmp, data);
         code.psrlw(tmp, 1);
         code.por(data, tmp);
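Both CountLeadingZeros16 paths begin by smearing the highest set bit into every lower position with a shift/OR cascade, after which the value depends only on the position of that bit and can be resolved with pshufb lookups. A scalar sketch of the smear (illustrative; the final loop stands in for the table lookups):

#include <cstdint>

// clz of a 16-bit value via bit smearing (sketch).
static int clz16(uint16_t x) {
    x |= static_cast<uint16_t>(x >> 1);
    x |= static_cast<uint16_t>(x >> 2);
    x |= static_cast<uint16_t>(x >> 4);
    x |= static_cast<uint16_t>(x >> 8);
    int count = 0;
    for (uint16_t bit = 0x8000; bit != 0 && (x & bit) == 0; bit >>= 1) {
        ++count;
    }
    return count;  // 16 when x == 0
}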
@@ -5320,10 +5255,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
             code.vmovdqu8(defaults | k2, indicies);
             ctx.reg_alloc.DefineValue(code, inst, defaults);
         }
-        return;
-    }
-
-    if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 3) {
+    } else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 3) {
         const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
 
         code.vpcmpub(k1, indicies, code.BConst<8>(xword, 2 * 16), CmpInt::LessThan);
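The AVX-512 table-lookup paths lean on opmask registers: vpcmpub writes one predicate bit per byte lane, and a destination written as result | k1 updates only the selected lanes (adding | T_z zeroes the unselected ones instead). A scalar sketch of that merge-masking behaviour, with illustrative names and fixed 16-byte vectors:

#include <array>
#include <cstddef>
#include <cstdint>

// Merge-masked byte blend (sketch): lanes whose mask bit is set take src,
// the others keep their previous contents.
static std::array<uint8_t, 16> masked_blend(std::array<uint8_t, 16> dst,
                                            const std::array<uint8_t, 16>& src,
                                            uint16_t mask) {
    for (std::size_t i = 0; i < dst.size(); ++i) {
        if ((mask >> i) & 1) {
            dst[i] = src[i];
        }
    }
    return dst;
}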
@@ -5352,10 +5284,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
             code.vmovdqu8(defaults | k2, indicies);
             ctx.reg_alloc.DefineValue(code, inst, defaults);
         }
-        return;
-    }
-
-    if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 2) {
+    } else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 2) {
         const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
         const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
         const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(code, table[1]);
@@ -5371,15 +5300,10 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
             code.vmovdqu8(result | k1, indicies);
             ctx.reg_alloc.DefineValue(code, inst, result);
         }
-        return;
-    }
-
-    if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 1) {
+    } else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 1) {
         const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
         const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
-
         code.vpcmpub(k1, indicies, code.BConst<8>(xword, 1 * 16), CmpInt::LessThan);
-
         if (is_defaults_zero) {
             const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
             code.vpermb(result | k1 | T_z, indicies, xmm_table0);
@@ -5389,10 +5313,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
             code.vpermb(result | k1, indicies, xmm_table0);
             ctx.reg_alloc.DefineValue(code, inst, result);
         }
-        return;
-    }
-
-    if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 1) {
+    } else if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 1) {
         const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
         const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
 
@@ -5400,10 +5321,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
         code.pshufb(xmm_table0, indicies);
 
         ctx.reg_alloc.DefineValue(code, inst, xmm_table0);
-        return;
-    }
-
-    if (code.HasHostFeature(HostFeature::SSE41) && table_size == 1) {
+    } else if (code.HasHostFeature(HostFeature::SSE41) && table_size == 1) {
         const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
         const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]);
         const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
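The SSSE3 and SSE4.1 paths rely on pshufb's out-of-range rule: an index byte with its top bit set produces zero, which directly implements the one-table, zero-defaults case; the SSE4.1 variant then patches the remaining lanes from the real defaults with pblendvb. A scalar model of that pshufb behaviour (illustrative):

#include <array>
#include <cstddef>
#include <cstdint>

// Byte-wise model of pshufb on 128-bit registers (sketch).
static std::array<uint8_t, 16> pshufb_model(const std::array<uint8_t, 16>& table,
                                            const std::array<uint8_t, 16>& indices) {
    std::array<uint8_t, 16> result{};
    for (std::size_t i = 0; i < result.size(); ++i) {
        // Top bit set -> zero; otherwise only the low four bits select a byte.
        result[i] = (indices[i] & 0x80) ? uint8_t{0} : table[indices[i] & 0x0F];
    }
    return result;
}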
@@ -5418,10 +5336,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
         code.pblendvb(xmm_table0, defaults);
 
         ctx.reg_alloc.DefineValue(code, inst, xmm_table0);
-        return;
-    }
-
-    if (code.HasHostFeature(HostFeature::SSE41) && is_defaults_zero && table_size == 2) {
+    } else if (code.HasHostFeature(HostFeature::SSE41) && is_defaults_zero && table_size == 2) {
         const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
         const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
         const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[1]);
@@ -5439,9 +5354,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
 
         ctx.reg_alloc.DefineValue(code, inst, xmm_table0);
         return;
-    }
-
-    if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) {
+    } else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) {
         const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
         const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
         const Xbyak::Xmm masked = ctx.reg_alloc.ScratchXmm(code);
@@ -5465,10 +5378,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
         }
 
         ctx.reg_alloc.DefineValue(code, inst, result);
-        return;
-    }
-
-    if (code.HasHostFeature(HostFeature::SSE41)) {
+    } else if (code.HasHostFeature(HostFeature::SSE41)) {
         const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
         const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
         const Xbyak::Xmm masked = ctx.reg_alloc.ScratchXmm(code);
@@ -5497,31 +5407,26 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
         }
 
         ctx.reg_alloc.DefineValue(code, inst, result);
-        return;
-    }
-
-    const u32 stack_space = static_cast<u32>((table_size + 2) * 16);
-    ctx.reg_alloc.AllocStackSpace(code, stack_space + ABI_SHADOW_SPACE);
-    for (size_t i = 0; i < table_size; ++i) {
-        const Xbyak::Xmm table_value = ctx.reg_alloc.UseXmm(code, table[i]);
-        code.movaps(xword[rsp + ABI_SHADOW_SPACE + i * 16], table_value);
-        ctx.reg_alloc.Release(table_value);
-    }
-
-    const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]);
-    const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
-    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
-    ctx.reg_alloc.EndOfAllocScope();
-    ctx.reg_alloc.HostCall(code, nullptr);
-    code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE]);
-    code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + (table_size + 0) * 16]);
-    code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + (table_size + 1) * 16]);
-    code.mov(code.ABI_PARAM4.cvt32(), table_size);
-    code.movaps(xword[code.ABI_PARAM2], defaults);
-    code.movaps(xword[code.ABI_PARAM3], indicies);
-
-    code.CallLambda(
-        [](const VectorArray<u8>* table, VectorArray<u8>& result, const VectorArray<u8>& indicies, size_t table_size) {
+    } else {
+        const u32 stack_space = static_cast<u32>((table_size + 2) * 16);
+        ctx.reg_alloc.AllocStackSpace(code, stack_space + ABI_SHADOW_SPACE);
+        for (size_t i = 0; i < table_size; ++i) {
+            const Xbyak::Xmm table_value = ctx.reg_alloc.UseXmm(code, table[i]);
+            code.movaps(xword[rsp + ABI_SHADOW_SPACE + i * 16], table_value);
+            ctx.reg_alloc.Release(table_value);
+        }
+        const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]);
+        const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
+        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+        ctx.reg_alloc.EndOfAllocScope();
+        ctx.reg_alloc.HostCall(code, nullptr);
+        code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE]);
+        code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + (table_size + 0) * 16]);
+        code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + (table_size + 1) * 16]);
+        code.mov(code.ABI_PARAM4.cvt32(), table_size);
+        code.movaps(xword[code.ABI_PARAM2], defaults);
+        code.movaps(xword[code.ABI_PARAM3], indicies);
+        code.CallLambda([](const VectorArray<u8>* table, VectorArray<u8>& result, const VectorArray<u8>& indicies, size_t table_size) {
             for (size_t i = 0; i < result.size(); ++i) {
                 const size_t index = indicies[i] / table[0].size();
                 const size_t elem = indicies[i] % table[0].size();
@@ -5530,11 +5435,10 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
                 }
             }
         });
-
-    code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + (table_size + 0) * 16]);
-    ctx.reg_alloc.ReleaseStackSpace(code, stack_space + ABI_SHADOW_SPACE);
-
-    ctx.reg_alloc.DefineValue(code, inst, result);
+        code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + (table_size + 0) * 16]);
+        ctx.reg_alloc.ReleaseStackSpace(code, stack_space + ABI_SHADOW_SPACE);
+        ctx.reg_alloc.DefineValue(code, inst, result);
+    }
 }
 
 void EmitX64::EmitVectorTranspose8(EmitContext& ctx, IR::Inst* inst) {
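For reference, the CallLambda fallback in the preceding hunks amounts to the following plain C++ (an illustrative restatement, not the emitter's own code): indices address the concatenation of the 16-byte tables, and any index past the end keeps the corresponding byte of the defaults operand.

#include <array>
#include <cstddef>
#include <cstdint>

using Vector = std::array<uint8_t, 16>;

// Table lookup with defaults (sketch of the fallback lambda's behaviour).
static Vector table_lookup_fallback(const Vector* table, std::size_t table_size,
                                    const Vector& defaults, const Vector& indices) {
    Vector result = defaults;
    for (std::size_t i = 0; i < result.size(); ++i) {
        const std::size_t index = indices[i] / result.size();  // which table
        const std::size_t elem = indices[i] % result.size();   // which byte within it
        if (index < table_size) {
            result[i] = table[index][elem];
        }
    }
    return result;
}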