@@ -569,32 +569,24 @@ void EmitX64::EmitVectorArithmeticShiftRight64(EmitContext& ctx, IR::Inst* inst)
    ctx.reg_alloc.DefineValue(code, inst, result);
}
template<typename T>
static constexpr T VShift(T x, T y) {
    const s8 shift_amount = static_cast<s8>(static_cast<u8>(y));
    const s64 bit_size = static_cast<s64>(mcl::bitsizeof<T>);
    if constexpr (std::is_signed_v<T>) {
        if (shift_amount >= bit_size) {
template<typename T> constexpr T VShift(T x, T y) {
    s8 const shift_amount = s8(u8(y));
    s64 const bit_size = s64(mcl::bitsizeof<T>);
    if (std::is_signed_v<T>) {
        if (shift_amount >= bit_size)
            return 0;
        }
        if (shift_amount <= -bit_size) {
            // Parentheses necessary, as MSVC doesn't appear to consider cast parentheses
            // as a grouping in terms of precedence, causing warning C4554 to fire. See:
            // https://developercommunity.visualstudio.com/content/problem/144783/msvc-2017-does-not-understand-that-static-cast-cou.html
        // Parentheses necessary, as MSVC doesn't appear to consider cast parentheses
        // as a grouping in terms of precedence, causing warning C4554 to fire. See:
        // https://developercommunity.visualstudio.com/content/problem/144783/msvc-2017-does-not-understand-that-static-cast-cou.html
        if (shift_amount <= -bit_size)
            return x >> (T(bit_size - 1));
        }
    } else if (shift_amount <= -bit_size || shift_amount >= bit_size) {
        return 0;
    }
    if (shift_amount < 0) {
    if (shift_amount < 0)
        return x >> T(-shift_amount);
    }
    using unsigned_type = std::make_unsigned_t<T>;
    return static_cast<T>(static_cast<unsigned_type>(x) << static_cast<unsigned_type>(shift_amount));
    return T(unsigned_type(x) << unsigned_type(shift_amount));
}
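// Worked examples of the variable-shift semantics above (the shift amount is the signed low byte
// of y; non-negative amounts shift left, negative amounts shift right, which appears to match the
// AArch64 SSHL/USHL per-element rules):
//   VShift<s16>(0x0010, 4)             == 0x0100  // logical shift left by 4
//   VShift<s16>(s16(-32768), s16(-15)) == -1      // arithmetic shift right by 15
//   VShift<s16>(0x1234, 16)            == 0       // amount >= element width yields zero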
void EmitX64::EmitVectorArithmeticVShift8(EmitContext& ctx, IR::Inst* inst) {
@@ -606,145 +598,83 @@ void EmitX64::EmitVectorArithmeticVShift8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorArithmeticVShift16(EmitContext& ctx, IR::Inst* inst) {
    if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) {
        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
        const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
        const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]);
        const Xbyak::Xmm right_shift = ctx.reg_alloc.ScratchXmm(code);
        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
        auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
        auto const left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]);
        auto const right_shift = ctx.reg_alloc.ScratchXmm(code);
        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
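        // Strategy: compute both possible results and blend. right_shift is the negated shift
        // amount; both amounts are masked to their low byte, the architectural shift count. The
        // sign of that byte, smeared across the element and extracted with vpmovb2m, marks in k1
        // the lanes whose amount is negative and therefore need the arithmetic-right-shift result.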
        code.vmovdqa32(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
        code.vpxord(right_shift, right_shift, right_shift);
        code.vpsubw(right_shift, right_shift, left_shift);
        code.vpsllw(xmm0, left_shift, 8);
        code.vpsraw(xmm0, xmm0, 15);
        const Xbyak::Opmask mask = k1;
        code.vpmovb2m(mask, xmm0);
        code.vpmovb2m(k1, xmm0);
        code.vpandd(right_shift, right_shift, tmp);
        code.vpandd(left_shift, left_shift, tmp);
        code.vpsravw(tmp, result, right_shift);
        code.vpsllvw(result, result, left_shift);
        code.vpblendmb(result | mask, result, tmp);
        code.vpblendmb(result | k1, result, tmp);
        ctx.reg_alloc.DefineValue(code, inst, result);
        return;
    } else {
        EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s16>& result, const VectorArray<s16>& a, const VectorArray<s16>& b) {
            std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<s16>);
        });
    }
    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s16>& result, const VectorArray<s16>& a, const VectorArray<s16>& b) {
        std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<s16>);
    });
}
void EmitX64::EmitVectorArithmeticVShift32(EmitContext& ctx, IR::Inst* inst) {
    if (code.HasHostFeature(HostFeature::AVX2)) {
        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
        const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
        const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
        // store sign bit of lowest byte of each element of b to select left/right shift later
        code.vpslld(xmm0, b, 24);
        // sse/avx shifts are only positive, with dedicated left/right forms - shift by lowest byte of abs(b)
        code.vpabsb(b, b);
        code.vpand(b, b, code.BConst<32>(xword, 0xFF));
        // calculate shifts
        code.vpsllvd(result, a, b);
        code.vpsravd(a, a, b);
        code.blendvps(result, a);  // implicit argument: xmm0 (sign of lowest byte of b)
        ctx.reg_alloc.DefineValue(code, inst, result);
        return;
        auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
        auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
        auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
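        // Same approach as the commented path above, with explicit temporaries: tmp1's low byte
        // per element is the signed shift amount; vpslld by 24 moves its sign bit into the element
        // sign bit so vblendvps can choose between the left-shift (tmp0) and arithmetic-right-shift
        // (tmp3) results. The shifts themselves use abs(amount) masked to the low byte, since
        // vpsllvd/vpsravd only accept non-negative counts.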
        code.vpabsb(tmp2, tmp1);
        code.vpslld(tmp1, tmp1, 24);
        code.vpand(tmp2, tmp2, code.Const(xword, 0x000000FF000000FF, 0x000000FF000000FF));
        code.vpsravd(tmp3, tmp0, tmp2);
        code.vpsllvd(tmp0, tmp0, tmp2);
        code.vblendvps(tmp0, tmp0, tmp3, tmp1);
        ctx.reg_alloc.DefineValue(code, inst, tmp0);
    } else {
        EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s32>& result, const VectorArray<s32>& a, const VectorArray<s32>& b) {
            std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<s32>);
        });
    }
    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s32>& result, const VectorArray<s32>& a, const VectorArray<s32>& b) {
        std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<s32>);
    });
}
void EmitX64::EmitVectorArithmeticVShift64(EmitContext& ctx, IR::Inst* inst) {
    if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
        const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
        const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]);
        const Xbyak::Xmm right_shift = ctx.reg_alloc.ScratchXmm(code);
        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
        auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
        auto const left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]);
        auto const right_shift = ctx.reg_alloc.ScratchXmm(code);
        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
        code.vmovdqa32(tmp, code.Const(xword, 0x00000000000000FF, 0x00000000000000FF));
        code.vpxorq(right_shift, right_shift, right_shift);
        code.vpsubq(right_shift, right_shift, left_shift);
        code.vpsllq(xmm0, left_shift, 56);
        const Xbyak::Opmask mask = k1;
        code.vpmovq2m(mask, xmm0);
        code.vpmovq2m(k1, xmm0);
        code.vpandq(right_shift, right_shift, tmp);
        code.vpandq(left_shift, left_shift, tmp);
        code.vpsravq(tmp, result, right_shift);
        code.vpsllvq(result, result, left_shift);
        code.vpblendmq(result | mask, result, tmp);
        code.vpblendmq(result | k1, result, tmp);
        ctx.reg_alloc.DefineValue(code, inst, result);
        return;
    }
    if (code.HasHostFeature(HostFeature::AVX2)) {
        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
        const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
        const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
        const Xbyak::Xmm negative_mask = ctx.reg_alloc.ScratchXmm(code);
        // negative_mask = a < 0 ? 1s : 0s
        code.vpxor(xmm0, xmm0, xmm0);
        code.vpcmpgtq(negative_mask, xmm0, a);
        // store sign bit of lowest byte of each element of b to select left/right shift later
        code.vpsllq(xmm0, b, 56);
        // sse/avx shifts are only positive, with dedicated left/right forms - shift by lowest byte of abs(b)
        code.vpabsb(b, b);
        code.vpand(b, b, code.BConst<64>(xword, 0xFF));
        // calculate shifts
        code.vpsllvq(result, a, b);
        // implement variable arithmetic shift in terms of logical shift
        // if a is negative, invert it, shift in leading 0s, then invert it again - noop if positive
        code.vpxor(a, a, negative_mask);
        code.vpsrlvq(a, a, b);
        code.vpxor(a, a, negative_mask);
        code.blendvpd(result, a);  // implicit argument: xmm0 (sign of lowest byte of b)
        ctx.reg_alloc.DefineValue(code, inst, result);
        return;
    } else {
        EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s64>& result, const VectorArray<s64>& a, const VectorArray<s64>& b) {
            std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<s64>);
        });
    }
    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s64>& result, const VectorArray<s64>& a, const VectorArray<s64>& b) {
        std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<s64>);
    });
}
void EmitX64::EmitVectorBroadcastLower8(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
    if (code.HasHostFeature(HostFeature::AVX2)) {
        code.vpbroadcastb(a, a);
        code.vmovq(a, a);
    } else if (code.HasHostFeature(HostFeature::SSSE3)) {
        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
        code.pxor(tmp, tmp);
        code.pshufb(a, tmp);
        code.movq(a, a);
@@ -752,7 +682,6 @@ void EmitX64::EmitVectorBroadcastLower8(EmitContext& ctx, IR::Inst* inst) {
        code.punpcklbw(a, a);
        code.pshuflw(a, a, 0);
    }
    ctx.reg_alloc.DefineValue(code, inst, a);
}
@@ -777,12 +706,10 @@ void EmitX64::EmitVectorBroadcastLower32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorBroadcast8(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
    if (code.HasHostFeature(HostFeature::AVX2)) {
        code.vpbroadcastb(a, a);
    } else if (code.HasHostFeature(HostFeature::SSSE3)) {
        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
        code.pxor(tmp, tmp);
        code.pshufb(a, tmp);
    } else {
@@ -790,47 +717,40 @@ void EmitX64::EmitVectorBroadcast8(EmitContext& ctx, IR::Inst* inst) {
        code.pshuflw(a, a, 0);
        code.punpcklqdq(a, a);
    }
    ctx.reg_alloc.DefineValue(code, inst, a);
}
void EmitX64::EmitVectorBroadcast16(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
    if (code.HasHostFeature(HostFeature::AVX2)) {
        code.vpbroadcastw(a, a);
    } else {
        code.pshuflw(a, a, 0);
        code.punpcklqdq(a, a);
    }
    ctx.reg_alloc.DefineValue(code, inst, a);
}
void EmitX64::EmitVectorBroadcast32(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
    if (code.HasHostFeature(HostFeature::AVX2)) {
        code.vpbroadcastd(a, a);
    } else {
        code.pshufd(a, a, 0);
    }
    ctx.reg_alloc.DefineValue(code, inst, a);
}
void EmitX64::EmitVectorBroadcast64(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
    if (code.HasHostFeature(HostFeature::AVX2)) {
        code.vpbroadcastq(a, a);
    } else {
        code.punpcklqdq(a, a);
    }
    ctx.reg_alloc.DefineValue(code, inst, a);
}
@@ -840,17 +760,14 @@ void EmitX64::EmitVectorBroadcastElementLower8(EmitContext& ctx, IR::Inst* inst)
    ASSERT(args[1].IsImmediate());
    const u8 index = args[1].GetImmediateU8();
    ASSERT(index < 16);
    if (index > 0) {
        code.psrldq(a, index);
    }
    if (code.HasHostFeature(HostFeature::AVX2)) {
        code.vpbroadcastb(a, a);
        code.vmovq(a, a);
    } else if (code.HasHostFeature(HostFeature::SSSE3)) {
        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
        code.pxor(tmp, tmp);
        code.pshufb(a, tmp);
        code.movq(a, a);
@@ -858,7 +775,6 @@ void EmitX64::EmitVectorBroadcastElementLower8(EmitContext& ctx, IR::Inst* inst)
        code.punpcklbw(a, a);
        code.pshuflw(a, a, 0);
    }
    ctx.reg_alloc.DefineValue(code, inst, a);
}
@@ -868,13 +784,10 @@ void EmitX64::EmitVectorBroadcastElementLower16(EmitContext& ctx, IR::Inst* inst
    ASSERT(args[1].IsImmediate());
    const u8 index = args[1].GetImmediateU8();
    ASSERT(index < 8);
    if (index > 0) {
        code.psrldq(a, u8(index * 2));
    }
    code.pshuflw(a, a, 0);
    ctx.reg_alloc.DefineValue(code, inst, a);
}
@@ -900,11 +813,9 @@ void EmitX64::EmitVectorBroadcastElement8(EmitContext& ctx, IR::Inst* inst) {
    ASSERT(args[1].IsImmediate());
    const u8 index = args[1].GetImmediateU8();
    ASSERT(index < 16);
    if (index > 0) {
        code.psrldq(a, index);
    }
    if (code.HasHostFeature(HostFeature::AVX2)) {
        code.vpbroadcastb(a, a);
    } else if (code.HasHostFeature(HostFeature::SSSE3)) {
@@ -926,22 +837,17 @@ void EmitX64::EmitVectorBroadcastElement16(EmitContext& ctx, IR::Inst* inst) {
    ASSERT(args[1].IsImmediate());
    const u8 index = args[1].GetImmediateU8();
    ASSERT(index < 8);
    if (index == 0 && code.HasHostFeature(HostFeature::AVX2)) {
        code.vpbroadcastw(a, a);
        ctx.reg_alloc.DefineValue(code, inst, a);
        return;
    }
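    // General case: replicate the selected halfword within its 64-bit half (pshuflw/pshufhw with a
    // replicated index), then copy that half to both qwords with the matching punpck*qdq.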
    if (index < 4) {
        code.pshuflw(a, a, mcl::bit::replicate_element<2, u8>(index));
        code.punpcklqdq(a, a);
    } else {
        code.pshufhw(a, a, mcl::bit::replicate_element<2, u8>(u8(index - 4)));
        code.punpckhqdq(a, a);
        if (index < 4) {
            code.pshuflw(a, a, mcl::bit::replicate_element<2, u8>(index));
            code.punpcklqdq(a, a);
        } else {
            code.pshufhw(a, a, mcl::bit::replicate_element<2, u8>(u8(index - 4)));
            code.punpckhqdq(a, a);
        }
    }
    ctx.reg_alloc.DefineValue(code, inst, a);
}
@@ -994,13 +900,10 @@ static void EmitVectorCountLeadingZeros(VectorArray<T>& result, const VectorArra
void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) {
    if (code.HasHostFeature(HostFeature::GFNI)) {
        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
        const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
        auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
        auto const result = ctx.reg_alloc.ScratchXmm(code);
        // Reverse bits:
        code.gf2p8affineqb(data, code.BConst<64>(xword, 0x8040201008040201), 0);
        // Perform a tzcnt:
        // Isolate lowest set bit
        code.pcmpeqb(result, result);
@@ -1008,29 +911,22 @@ void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) {
        code.pandn(result, data);
        // Convert lowest set bit into an index
        code.gf2p8affineqb(result, code.BConst<64>(xword, 0xaaccf0ff'00000000), 8);
        ctx.reg_alloc.DefineValue(code, inst, result);
    } else if (code.HasHostFeature(HostFeature::SSSE3)) {
        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
        const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
        const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code);
        const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(code);
        auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
        auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
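        // Nibble-wise CLZ via a pshufb lookup table: the constant loaded below maps a nibble n to
        // clz4(n), i.e. 4,3,2,2,1,1,1,1,0,...,0. Per byte, the count is clz4(high nibble), plus
        // clz4(low nibble) whenever the high nibble is zero (detected by the pcmpeqb against 4).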
        code.movdqa(tmp1, code.Const(xword, 0x0101010102020304, 0x0000000000000000));
        code.movdqa(tmp2, tmp1);
        code.pshufb(tmp2, data);
        code.psrlw(data, 4);
        code.pand(data, code.Const(xword, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F));
        code.pshufb(tmp1, data);
        code.movdqa(data, code.Const(xword, 0x0404040404040404, 0x0404040404040404));
        code.pcmpeqb(data, tmp1);
        code.pand(data, tmp2);
        code.paddb(data, tmp1);
        ctx.reg_alloc.DefineValue(code, inst, data);
    } else {
        EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros<u8>);
@@ -1040,12 +936,10 @@ void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) {
    if (code.HasHostFeature(HostFeature::AVX)) {
        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
        const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
        const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code);
        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
        auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
        auto const result = ctx.reg_alloc.ScratchXmm(code);
        auto const zeros = ctx.reg_alloc.ScratchXmm(code);
        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
        code.vpsrlw(tmp, data, 1);
        code.vpor(data, data, tmp);
        code.vpsrlw(tmp, data, 2);
@@ -1065,16 +959,13 @@ void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) {
        code.vpor(tmp, tmp, zeros);
        code.vpor(data, data, tmp);
        code.vpshufb(result, result, data);
        ctx.reg_alloc.DefineValue(code, inst, result);
    } else if (code.HasHostFeature(HostFeature::SSSE3)) {
        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
        const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
        const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code);
        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
        auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
        auto const result = ctx.reg_alloc.ScratchXmm(code);
        auto const zeros = ctx.reg_alloc.ScratchXmm(code);
        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
        code.movdqa(tmp, data);
        code.psrlw(tmp, 1);
        code.por(data, tmp);
@@ -1098,7 +989,6 @@ void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) {
        code.por(tmp, zeros);
        code.por(data, tmp);
        code.pshufb(result, data);
        ctx.reg_alloc.DefineValue(code, inst, result);
    } else {
        EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros<u16>);
@@ -1108,13 +998,13 @@ void EmitX64::EmitVectorCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512CD)) {
        const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
        auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
        code.vplzcntd(data, data);
        ctx.reg_alloc.DefineValue(code, inst, data);
        // See https://stackoverflow.com/questions/58823140/count-leading-zero-bits-for-each-element-in-avx2-vector-emulate-mm256-lzcnt-ep/58827596#58827596
    } else if (code.HasHostFeature(HostFeature::AVX2)) {
        const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
        const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(code);
        // See https://stackoverflow.com/questions/58823140/count-leading-zero-bits-for-each-element-in-avx2-vector-emulate-mm256-lzcnt-ep/58827596#58827596
        auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
        auto const temp = ctx.reg_alloc.ScratchXmm(code);
        code.vmovdqa(temp, data);
        code.vpsrld(data, data, 8);
        code.vpandn(data, data, temp);
@@ -1125,7 +1015,24 @@ void EmitX64::EmitVectorCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) {
        code.vpminsw(data, data, code.Const(xword, 0x0000002000000020, 0x0000002000000020));
        ctx.reg_alloc.DefineValue(code, inst, data);
    } else {
        EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros<u32>);
        // See https://stackoverflow.com/a/58829453
        auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
        auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
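        // Roughly (per the linked answer): precondition each element as (x >> 1) & ~(x >> 2) so the
        // conversion can't round up to the next power of two, convert to float, double it and add
        // 1.0f so the biased exponent field becomes 127 + floor(log2(x)); 0x9E (158) minus that
        // exponent is the leading-zero count, and the pcmpeqd/paddd pair bumps the x == 0 case to 32.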
        code.pxor(tmp1, tmp1);
        code.movdqa(tmp2, tmp0);
        code.pcmpeqd(tmp1, tmp0);
        code.psrld(tmp0, 1);
        code.psrld(tmp2, 2);
        code.pandn(tmp2, tmp0);
        code.cvtdq2ps(tmp0, tmp2);
        code.addps(tmp0, tmp0);
        code.addps(tmp0, code.Const(xword, 0x3f8000003f800000, 0x3f8000003f800000));
        code.psrld(tmp0, 23);
        code.paddd(tmp1, tmp0);
        code.movdqa(tmp0, code.Const(xword, 0x0000009E0000009E, 0x0000009E0000009E));
        code.psubd(tmp0, tmp1);
        ctx.reg_alloc.DefineValue(code, inst, tmp0);
    }
}
@@ -1892,14 +1799,12 @@ static void EmitVectorLogicalVShiftAVX2(BlockOfCode& code, EmitContext& ctx, IR:
    ICODE(vpsrlv)(a, a, b);
    // implicit argument: xmm0 (sign of lowest byte of b)
    if constexpr (esize == 32) {
    if (esize == 32) {
        code.blendvps(result, a);
    } else {
        code.blendvpd(result, a);
    }
    ctx.reg_alloc.DefineValue(code, inst, result);
    return;
}
void EmitX64::EmitVectorLogicalVShift8(EmitContext& ctx, IR::Inst* inst) {
@@ -1942,9 +1847,9 @@ void EmitX64::EmitVectorLogicalVShift8(EmitContext& ctx, IR::Inst* inst) {
        ctx.reg_alloc.DefineValue(code, inst, result);
    } else {
        EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u8>& a, const VectorArray<u8>& b) {
            std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<u8>);
        });
        EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u8>& a, const VectorArray<u8>& b) {
            std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<u8>);
        });
    }
}
@@ -1969,32 +1874,30 @@ void EmitX64::EmitVectorLogicalVShift16(EmitContext& ctx, IR::Inst* inst) {
        ctx.reg_alloc.DefineValue(code, inst, result);
    } else {
        EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u16>& a, const VectorArray<u16>& b) {
            std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<u16>);
        });
        EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u16>& a, const VectorArray<u16>& b) {
            std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<u16>);
        });
    }
}
void EmitX64::EmitVectorLogicalVShift32(EmitContext& ctx, IR::Inst* inst) {
    if (code.HasHostFeature(HostFeature::AVX2)) {
        EmitVectorLogicalVShiftAVX2<32>(code, ctx, inst);
        return;
    } else {
        EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u32>& a, const VectorArray<u32>& b) {
            std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<u32>);
        });
    }
    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u32>& a, const VectorArray<u32>& b) {
        std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<u32>);
    });
}
void EmitX64::EmitVectorLogicalVShift64(EmitContext& ctx, IR::Inst* inst) {
    if (code.HasHostFeature(HostFeature::AVX2)) {
        EmitVectorLogicalVShiftAVX2<64>(code, ctx, inst);
        return;
    } else {
        EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& a, const VectorArray<u64>& b) {
            std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<u64>);
        });
    }
    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& a, const VectorArray<u64>& b) {
        std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<u64>);
    });
}
namespace {
@@ -3731,25 +3634,21 @@ template<typename T, typename U>
static void RoundingShiftLeft(VectorArray<T>& out, const VectorArray<T>& lhs, const VectorArray<U>& rhs) {
    using signed_type = std::make_signed_t<T>;
    using unsigned_type = std::make_unsigned_t<T>;
    constexpr auto bit_size = static_cast<s64>(mcl::bitsizeof<T>);
    constexpr auto bit_size = s64(mcl::bitsizeof<T>);
    for (size_t i = 0; i < out.size(); i++) {
        const s64 extended_shift = static_cast<s64>(mcl::bit::sign_extend<8, u64>(rhs[i] & 0xFF));
        const s64 extended_shift = s64(mcl::bit::sign_extend<8, u64>(rhs[i] & 0xFF));
        if (extended_shift >= 0) {
            if (extended_shift >= bit_size) {
                out[i] = 0;
            } else {
                out[i] = static_cast<T>(static_cast<unsigned_type>(lhs[i]) << extended_shift);
                out[i] = T(unsigned_type(lhs[i]) << extended_shift);
            }
        } else {
            if ((std::is_unsigned_v<T> && extended_shift < -bit_size) || (std::is_signed_v<T> && extended_shift <= -bit_size)) {
                out[i] = 0;
            } else {
                const s64 shift_value = -extended_shift - 1;
                const T shifted = (lhs[i] & (static_cast<signed_type>(1) << shift_value)) >> shift_value;
                const T shifted = (lhs[i] & (signed_type(1) << shift_value)) >> shift_value;
                if (extended_shift == -bit_size) {
                    out[i] = shifted;
                } else {
@@ -3810,7 +3709,6 @@ static void EmitUnsignedRoundingShiftLeft(BlockOfCode& code, EmitContext& ctx, I
    }
    ctx.reg_alloc.DefineValue(code, inst, left_shift);
    return;
}
void EmitX64::EmitVectorRoundingShiftLeftS8(EmitContext& ctx, IR::Inst* inst) {
@@ -3852,23 +3750,21 @@ void EmitX64::EmitVectorRoundingShiftLeftU16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorRoundingShiftLeftU32(EmitContext& ctx, IR::Inst* inst) {
    if (code.HasHostFeature(HostFeature::AVX2)) {
        EmitUnsignedRoundingShiftLeft<32>(code, ctx, inst);
        return;
    } else {
        EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u32>& lhs, const VectorArray<s32>& rhs) {
            RoundingShiftLeft(result, lhs, rhs);
        });
    }
    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u32>& lhs, const VectorArray<s32>& rhs) {
        RoundingShiftLeft(result, lhs, rhs);
    });
}
void EmitX64::EmitVectorRoundingShiftLeftU64(EmitContext& ctx, IR::Inst* inst) {
    if (code.HasHostFeature(HostFeature::AVX2)) {
        EmitUnsignedRoundingShiftLeft<64>(code, ctx, inst);
        return;
    } else {
        EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& lhs, const VectorArray<s64>& rhs) {
            RoundingShiftLeft(result, lhs, rhs);
        });
    }
    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& lhs, const VectorArray<s64>& rhs) {
        RoundingShiftLeft(result, lhs, rhs);
    });
}
void EmitX64::EmitVectorSignExtend8(EmitContext& ctx, IR::Inst* inst) {
@@ -5270,10 +5166,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
            code.vmovdqu8(defaults | k2, indicies);
            ctx.reg_alloc.DefineValue(code, inst, defaults);
        }
        return;
    }
    if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 3) {
    } else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 3) {
        const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
        code.vpcmpub(k1, indicies, code.BConst<8>(xword, 2 * 16), CmpInt::LessThan);
@@ -5302,10 +5195,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
            code.vmovdqu8(defaults | k2, indicies);
            ctx.reg_alloc.DefineValue(code, inst, defaults);
        }
        return;
    }
    if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 2) {
    } else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 2) {
        const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
        const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
        const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(code, table[1]);
@@ -5321,15 +5211,10 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
            code.vmovdqu8(result | k1, indicies);
            ctx.reg_alloc.DefineValue(code, inst, result);
        }
        return;
    }
    if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 1) {
    } else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 1) {
        const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
        const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
        code.vpcmpub(k1, indicies, code.BConst<8>(xword, 1 * 16), CmpInt::LessThan);
        if (is_defaults_zero) {
            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
            code.vpermb(result | k1 | T_z, indicies, xmm_table0);
@@ -5339,10 +5224,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
            code.vpermb(result | k1, indicies, xmm_table0);
            ctx.reg_alloc.DefineValue(code, inst, result);
        }
        return;
    }
    if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 1) {
    } else if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 1) {
        const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
        const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
@@ -5350,10 +5232,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
        code.pshufb(xmm_table0, indicies);
        ctx.reg_alloc.DefineValue(code, inst, xmm_table0);
        return;
    }
    if (code.HasHostFeature(HostFeature::SSE41) && table_size == 1) {
    } else if (code.HasHostFeature(HostFeature::SSE41) && table_size == 1) {
        const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
        const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]);
        const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
@@ -5368,10 +5247,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
        code.pblendvb(xmm_table0, defaults);
        ctx.reg_alloc.DefineValue(code, inst, xmm_table0);
        return;
    }
    if (code.HasHostFeature(HostFeature::SSE41) && is_defaults_zero && table_size == 2) {
    } else if (code.HasHostFeature(HostFeature::SSE41) && is_defaults_zero && table_size == 2) {
        const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
        const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
        const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[1]);
@@ -5389,9 +5265,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
        ctx.reg_alloc.DefineValue(code, inst, xmm_table0);
        return;
    }
    if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) {
    } else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) {
        const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
        const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
        const Xbyak::Xmm masked = ctx.reg_alloc.ScratchXmm(code);
@@ -5415,10 +5289,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
        }
        ctx.reg_alloc.DefineValue(code, inst, result);
        return;
    }
    if (code.HasHostFeature(HostFeature::SSE41)) {
    } else if (code.HasHostFeature(HostFeature::SSE41)) {
        const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
        const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
        const Xbyak::Xmm masked = ctx.reg_alloc.ScratchXmm(code);
@@ -5447,31 +5318,26 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
        }
        ctx.reg_alloc.DefineValue(code, inst, result);
        return;
    }
    const u32 stack_space = static_cast<u32>((table_size + 2) * 16);
    ctx.reg_alloc.AllocStackSpace(code, stack_space + ABI_SHADOW_SPACE);
    for (size_t i = 0; i < table_size; ++i) {
        const Xbyak::Xmm table_value = ctx.reg_alloc.UseXmm(code, table[i]);
        code.movaps(xword[rsp + ABI_SHADOW_SPACE + i * 16], table_value);
        ctx.reg_alloc.Release(table_value);
    }
    const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]);
    const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
    ctx.reg_alloc.EndOfAllocScope();
    ctx.reg_alloc.HostCall(code, nullptr);
    code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE]);
    code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + (table_size + 0) * 16]);
    code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + (table_size + 1) * 16]);
    code.mov(code.ABI_PARAM4.cvt32(), table_size);
    code.movaps(xword[code.ABI_PARAM2], defaults);
    code.movaps(xword[code.ABI_PARAM3], indicies);
    code.CallLambda(
        [](const VectorArray<u8>* table, VectorArray<u8>& result, const VectorArray<u8>& indicies, size_t table_size) {
    } else {
        const u32 stack_space = static_cast<u32>((table_size + 2) * 16);
        ctx.reg_alloc.AllocStackSpace(code, stack_space + ABI_SHADOW_SPACE);
        for (size_t i = 0; i < table_size; ++i) {
            const Xbyak::Xmm table_value = ctx.reg_alloc.UseXmm(code, table[i]);
            code.movaps(xword[rsp + ABI_SHADOW_SPACE + i * 16], table_value);
            ctx.reg_alloc.Release(table_value);
        }
        const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]);
        const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
        ctx.reg_alloc.EndOfAllocScope();
        ctx.reg_alloc.HostCall(code, nullptr);
        code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE]);
        code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + (table_size + 0) * 16]);
        code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + (table_size + 1) * 16]);
        code.mov(code.ABI_PARAM4.cvt32(), table_size);
        code.movaps(xword[code.ABI_PARAM2], defaults);
        code.movaps(xword[code.ABI_PARAM3], indicies);
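        // Generic fallback: the tables, the defaults and the indices are spilled to the stack and
        // the lookup runs per byte in the host lambda below; since the defaults are written into
        // the result slot up front, an index that falls outside the provided tables is expected to
        // leave that default value in place.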
        code.CallLambda([](const VectorArray<u8>* table, VectorArray<u8>& result, const VectorArray<u8>& indicies, size_t table_size) {
            for (size_t i = 0; i < result.size(); ++i) {
                const size_t index = indicies[i] / table[0].size();
                const size_t elem = indicies[i] % table[0].size();
@@ -5480,11 +5346,10 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
                }
            }
        });
    code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + (table_size + 0) * 16]);
    ctx.reg_alloc.ReleaseStackSpace(code, stack_space + ABI_SHADOW_SPACE);
    ctx.reg_alloc.DefineValue(code, inst, result);
        code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + (table_size + 0) * 16]);
        ctx.reg_alloc.ReleaseStackSpace(code, stack_space + ABI_SHADOW_SPACE);
        ctx.reg_alloc.DefineValue(code, inst, result);
    }
}
void EmitX64::EmitVectorTranspose8(EmitContext& ctx, IR::Inst* inst) {