@@ -18,13 +18,14 @@
 #include <mcl/mp/typelist/list.hpp>
 #include <mcl/mp/typelist/lower_to_tuple.hpp>
 #include "dynarmic/common/common_types.h"
+#include <mcl/type_traits/integer_of_size.hpp>
 #include <xbyak/xbyak.h>

 #include "dynarmic/backend/x64/abi.h"
 #include "dynarmic/backend/x64/block_of_code.h"
 #include "dynarmic/backend/x64/constants.h"
 #include "dynarmic/backend/x64/emit_x64.h"
 #include "dynarmic/common/type_util.h"
 #include "dynarmic/common/cast_util.h"
 #include "dynarmic/common/fp/fpcr.h"
 #include "dynarmic/common/fp/fpsr.h"
 #include "dynarmic/common/fp/info.h"
@@ -35,8 +36,22 @@
 #include "dynarmic/ir/basic_block.h"
 #include "dynarmic/ir/microinstruction.h"

-#define FCODE(NAME) [&](auto... args) { if (fsize == 32) code.NAME##s(args...); else code.NAME##d(args...); }
-#define ICODE(NAME) [&](auto... args) { if (fsize == 32) code.NAME##d(args...); else code.NAME##q(args...); }
+#define FCODE(NAME)                          \
+    [&code](auto... args) {                  \
+        if constexpr (fsize == 32) {         \
+            code.NAME##s(args...);           \
+        } else {                             \
+            code.NAME##d(args...);           \
+        }                                    \
+    }
+#define ICODE(NAME)                          \
+    [&code](auto... args) {                  \
+        if constexpr (fsize == 32) {         \
+            code.NAME##d(args...);           \
+        } else {                             \
+            code.NAME##q(args...);           \
+        }                                    \
+    }
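 // FCODE(NAME) selects the f32 ("s"-suffixed) or f64 ("d"-suffixed) form of instruction NAME,
 // ICODE(NAME) the 32-bit ("d") or 64-bit ("q") integer form, based on the enclosing
 // template's fsize.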

 namespace Dynarmic::Backend::X64 {

@@ -90,7 +105,7 @@ void ForceDenormalsToZero(BlockOfCode& code, std::initializer_list<Xbyak::Xmm> t
     for (const Xbyak::Xmm& xmm : to_daz) {
         code.movaps(xmm0, code.Const(xword, fsize == 32 ? f32_non_sign_mask : f64_non_sign_mask));
         code.andps(xmm0, xmm);
-        if (fsize == 32) {
+        if constexpr (fsize == 32) {
            code.pcmpgtd(xmm0, code.Const(xword, f32_smallest_normal - 1));
        } else if (code.HasHostFeature(HostFeature::SSE42)) {
            code.pcmpgtq(xmm0, code.Const(xword, f64_smallest_normal - 1));
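             // xmm0 is now all-ones when |xmm| is at least the smallest normal, all-zeros for a denormal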
@@ -105,11 +120,13 @@ void ForceDenormalsToZero(BlockOfCode& code, std::initializer_list<Xbyak::Xmm> t

 template<size_t fsize>
 void DenormalsAreZero(BlockOfCode& code, EmitContext& ctx, std::initializer_list<Xbyak::Xmm> to_daz) {
-    if (ctx.FPCR().FZ())
+    if (ctx.FPCR().FZ()) {
         ForceDenormalsToZero<fsize>(code, to_daz);
+    }
 }

-void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm xmm_value, Xbyak::Xmm xmm_scratch, size_t fsize) {
+template<size_t fsize>
+void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm xmm_value, Xbyak::Xmm xmm_scratch) {
     if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
         constexpr u32 nan_to_zero = FixupLUT(FpFixup::PosZero,
                                              FpFixup::PosZero);
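         // vfixupimm lookup table: quiet and signalling NaN inputs are rewritten to +0.0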
@@ -124,7 +141,8 @@ void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm xmm_value, Xbyak::Xmm xmm_scratch,
     }
 }

-void ForceToDefaultNaN(BlockOfCode& code, Xbyak::Xmm result, size_t fsize) {
+template<size_t fsize>
+void ForceToDefaultNaN(BlockOfCode& code, Xbyak::Xmm result) {
     if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
         const Xbyak::Opmask nan_mask = k1;
         FCODE(vfpclasss)(nan_mask, result, u8(FpClass::QNaN | FpClass::SNaN));
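         // nan_mask is set only when result is a quiet or signalling NaN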
@@ -190,7 +208,7 @@ void PostProcessNaN(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm tmp) {
 // We allow for the case where op1 and result are the same register. We do not read from op1 once result is written to.
 template<size_t fsize>
 void EmitPostProcessNaNs(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm op1, Xbyak::Xmm op2, Xbyak::Reg64 tmp, Xbyak::Label end) {
-    using FPT = Common::UnsignedIntegerN<fsize>;
+    using FPT = mcl::unsigned_integer_of_size<fsize>;
     constexpr FPT exponent_mask = FP::FPInfo<FPT>::exponent_mask;
     constexpr FPT mantissa_msb = FP::FPInfo<FPT>::mantissa_msb;
     constexpr u8 mantissa_msb_bit = static_cast<u8>(FP::FPInfo<FPT>::explicit_mantissa_width - 1);
@@ -218,7 +236,7 @@ void EmitPostProcessNaNs(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm op1, X
     }

     constexpr size_t shift = fsize == 32 ? 0 : 48;
-    if (fsize == 32) {
+    if constexpr (fsize == 32) {
         code.movd(tmp.cvt32(), xmm0);
     } else {
         // We do this to avoid requiring 64-bit immediates
@@ -234,7 +252,7 @@ void EmitPostProcessNaNs(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm op1, X
     // op1 == QNaN && op2 == SNaN <<< The problematic case
     // op1 == QNaN && op2 == Inf

-    if (fsize == 32) {
+    if constexpr (fsize == 32) {
         code.movd(tmp.cvt32(), op2);
         code.shl(tmp.cvt32(), 32 - mantissa_msb_bit);
     } else {
@@ -273,7 +291,7 @@ void FPTwoOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
     if (ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
         // Do nothing
     } else if (ctx.FPCR().DN()) {
-        ForceToDefaultNaN(code, result, fsize);
+        ForceToDefaultNaN<fsize>(code, result);
     } else {
         PostProcessNaN<fsize>(code, result, xmm0);
     }
@@ -284,7 +302,7 @@ void FPTwoOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {

 template<size_t fsize, typename Function>
 void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
-    using FPT = Common::UnsignedIntegerN<fsize>;
+    using FPT = mcl::unsigned_integer_of_size<fsize>;

     auto args = ctx.reg_alloc.GetArgumentInfo(inst);

@@ -299,7 +317,7 @@ void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn)
     }

     if (!ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
-        ForceToDefaultNaN(code, result, fsize);
+        ForceToDefaultNaN<fsize>(code, result);
     }

     ctx.reg_alloc.DefineValue(inst, result);
@@ -343,7 +361,7 @@ void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn)

 template<size_t fsize>
 void FPAbs(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
-    using FPT = Common::UnsignedIntegerN<fsize>;
+    using FPT = mcl::unsigned_integer_of_size<fsize>;
     constexpr FPT non_sign_mask = FP::FPInfo<FPT>::sign_mask - FPT(1u);
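     // sign_mask - 1 sets every bit below the sign bit (exponent and mantissa)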

     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
@@ -369,7 +387,7 @@ void EmitX64::EmitFPAbs64(EmitContext& ctx, IR::Inst* inst) {

 template<size_t fsize>
 void FPNeg(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
-    using FPT = Common::UnsignedIntegerN<fsize>;
+    using FPT = mcl::unsigned_integer_of_size<fsize>;
     constexpr FPT sign_mask = FP::FPInfo<FPT>::sign_mask;

     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
@@ -424,7 +442,7 @@ static void EmitFPMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {

     FCODE(ucomis)(result, operand);
     code.jz(*equal, code.T_NEAR);
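     // ucomis sets ZF both for equal and for unordered operands; the jp at *equal separates the NaN case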
-    if (is_max) {
+    if constexpr (is_max) {
         FCODE(maxs)(result, operand);
     } else {
         FCODE(mins)(result, operand);
@@ -436,7 +454,7 @@ static void EmitFPMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {

     code.L(*equal);
     code.jp(nan);
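     // equal operands may still differ in sign (+0 vs -0): AND of the bit patterns picks +0 for max, OR picks -0 for min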
-    if (is_max) {
+    if constexpr (is_max) {
         code.andps(result, operand);
     } else {
         code.orps(result, operand);
@@ -459,7 +477,7 @@ static void EmitFPMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {

 template<size_t fsize, bool is_max>
 static inline void EmitFPMinMaxNumeric(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) noexcept {
-    using FPT = Common::UnsignedIntegerN<fsize>;
+    using FPT = mcl::unsigned_integer_of_size<fsize>;
     constexpr FPT default_nan = FP::FPInfo<FPT>::DefaultNaN();

     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
@@ -484,7 +502,7 @@ static inline void EmitFPMinMaxNumeric(BlockOfCode& code, EmitContext& ctx, IR::
     tmp.setBit(fsize);

     const auto move_to_tmp = [=, &code](const Xbyak::Xmm& xmm) {
-        if (fsize == 32) {
+        if constexpr (fsize == 32) {
             code.movd(tmp.cvt32(), xmm);
         } else {
             code.movq(tmp.cvt64(), xmm);
@@ -495,7 +513,7 @@ static inline void EmitFPMinMaxNumeric(BlockOfCode& code, EmitContext& ctx, IR::

     FCODE(ucomis)(op1, op2);
     code.jz(*z, code.T_NEAR);
-    if (is_max) {
+    if constexpr (is_max) {
         FCODE(maxs)(op2, op1);
     } else {
         FCODE(mins)(op2, op1);
@@ -509,7 +527,7 @@ static inline void EmitFPMinMaxNumeric(BlockOfCode& code, EmitContext& ctx, IR::

     code.L(*z);
     code.jp(nan);
-    if (is_max) {
+    if constexpr (is_max) {
         code.andps(op2, op1);
     } else {
         code.orps(op2, op1);
@@ -611,12 +629,12 @@ void EmitX64::EmitFPMul64(EmitContext& ctx, IR::Inst* inst) {

 template<size_t fsize, bool negate_product>
 static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
-    using FPT = Common::UnsignedIntegerN<fsize>;
+    using FPT = mcl::unsigned_integer_of_size<fsize>;
     const auto fallback_fn = negate_product ? &FP::FPMulSub<FPT> : &FP::FPMulAdd<FPT>;

     auto args = ctx.reg_alloc.GetArgumentInfo(inst);

-    if (fsize != 16) {
+    if constexpr (fsize != 16) {
         const bool needs_rounding_correction = ctx.FPCR().FZ();
         const bool needs_nan_correction = !ctx.FPCR().DN();
@@ -625,13 +643,13 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
             const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
             const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(args[2]);

-            if (negate_product) {
+            if constexpr (negate_product) {
                 FCODE(vfnmadd231s)(result, operand2, operand3);
             } else {
                 FCODE(vfmadd231s)(result, operand2, operand3);
             }
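             // the 231 form computes result := +/-(operand2 * operand3) + result with a single rounding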
             if (ctx.FPCR().DN()) {
-                ForceToDefaultNaN(code, result, fsize);
+                ForceToDefaultNaN<fsize>(code, result);
             }

             ctx.reg_alloc.DefineValue(inst, result);
@@ -647,7 +665,7 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
             const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();

             code.movaps(result, operand1);
-            if (negate_product) {
+            if constexpr (negate_product) {
                 FCODE(vfnmadd231s)(result, operand2, operand3);
             } else {
                 FCODE(vfmadd231s)(result, operand2, operand3);
@@ -668,8 +686,9 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
             } else {
                 UNREACHABLE();
             }
-            if (ctx.FPCR().DN())
-                ForceToDefaultNaN(code, result, fsize);
+            if (ctx.FPCR().DN()) {
+                ForceToDefaultNaN<fsize>(code, result);
+            }
             code.L(*end);

             ctx.deferred_emits.emplace_back([=, &code, &ctx] {
@@ -750,7 +769,7 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
                 code.ptest(operand2, xmm0);
                 code.jnz(op2_done);
                 code.vorps(result, operand2, xmm0);
-                if (negate_product) {
+                if constexpr (negate_product) {
                     code.xorps(result, code.Const(xword, FP::FPInfo<FPT>::sign_mask));
                 }
                 code.jmp(*end);
@@ -766,7 +785,7 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {

                 // at this point, all SNaNs have been handled
                 // if op1 was not a QNaN and op2 is, negate the result
-                if (negate_product) {
+                if constexpr (negate_product) {
                     FCODE(ucomis)(operand1, operand1);
                     code.jp(*end);
                     FCODE(ucomis)(operand2, operand2);
@@ -787,7 +806,7 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
         const Xbyak::Xmm operand2 = ctx.reg_alloc.UseScratchXmm(args[1]);
         const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(args[2]);

-        if (negate_product) {
+        if constexpr (negate_product) {
             code.xorps(operand2, code.Const(xword, FP::FPInfo<FPT>::sign_mask));
         }
         FCODE(muls)(operand2, operand3);
@@ -838,7 +857,7 @@ void EmitX64::EmitFPMulSub64(EmitContext& ctx, IR::Inst* inst) {

 template<size_t fsize>
 static void EmitFPMulX(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
-    using FPT = Common::UnsignedIntegerN<fsize>;
+    using FPT = mcl::unsigned_integer_of_size<fsize>;

     auto args = ctx.reg_alloc.GetArgumentInfo(inst);

@@ -898,9 +917,9 @@ void EmitX64::EmitFPMulX64(EmitContext& ctx, IR::Inst* inst) {

 template<size_t fsize>
 static void EmitFPRecipEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
-    using FPT = Common::UnsignedIntegerN<fsize>;
+    using FPT = mcl::unsigned_integer_of_size<fsize>;

-    if (fsize != 16) {
+    if constexpr (fsize != 16) {
         if (ctx.HasOptimization(OptimizationFlag::Unsafe_ReducedErrorFP)) {
             auto args = ctx.reg_alloc.GetArgumentInfo(inst);
             const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
@@ -909,7 +928,7 @@ static void EmitFPRecipEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
             if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
                 FCODE(vrcp14s)(result, operand, operand);
             } else {
-                if (fsize == 32) {
+                if constexpr (fsize == 32) {
                     code.rcpss(result, operand);
                 } else {
                     code.cvtsd2ss(result, operand);
@@ -944,7 +963,7 @@ void EmitX64::EmitFPRecipEstimate64(EmitContext& ctx, IR::Inst* inst) {

 template<size_t fsize>
 static void EmitFPRecipExponent(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
-    using FPT = Common::UnsignedIntegerN<fsize>;
+    using FPT = mcl::unsigned_integer_of_size<fsize>;

     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     ctx.reg_alloc.HostCall(inst, args[0]);
@@ -967,11 +986,11 @@ void EmitX64::EmitFPRecipExponent64(EmitContext& ctx, IR::Inst* inst) {

 template<size_t fsize>
 static void EmitFPRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
-    using FPT = Common::UnsignedIntegerN<fsize>;
+    using FPT = mcl::unsigned_integer_of_size<fsize>;

     auto args = ctx.reg_alloc.GetArgumentInfo(inst);

-    if (fsize != 16) {
+    if constexpr (fsize != 16) {
         if (code.HasHostFeature(HostFeature::FMA) && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
             Xbyak::Label end, fallback;

@@ -1104,9 +1123,9 @@ void EmitX64::EmitFPRoundInt64(EmitContext& ctx, IR::Inst* inst) {

 template<size_t fsize>
 static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
-    using FPT = Common::UnsignedIntegerN<fsize>;
+    using FPT = mcl::unsigned_integer_of_size<fsize>;

-    if (fsize != 16) {
+    if constexpr (fsize != 16) {
         if (ctx.HasOptimization(OptimizationFlag::Unsafe_ReducedErrorFP)) {
             auto args = ctx.reg_alloc.GetArgumentInfo(inst);
             const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
@@ -1115,7 +1134,7 @@ static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
             if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
                 FCODE(vrsqrt14s)(result, operand, operand);
             } else {
-                if (fsize == 32) {
+                if constexpr (fsize == 32) {
                     code.rsqrtss(result, operand);
                 } else {
                     code.cvtsd2ss(result, operand);
@@ -1161,7 +1180,7 @@ static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
             bool needs_fallback = false;

             code.L(*bad_values);
-            if (fsize == 32) {
+            if constexpr (fsize == 32) {
                 code.movd(tmp, operand);

                 if (!ctx.FPCR().FZ()) {
@@ -1283,11 +1302,11 @@ void EmitX64::EmitFPRSqrtEstimate64(EmitContext& ctx, IR::Inst* inst) {

 template<size_t fsize>
 static void EmitFPRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
-    using FPT = Common::UnsignedIntegerN<fsize>;
+    using FPT = mcl::unsigned_integer_of_size<fsize>;

     auto args = ctx.reg_alloc.GetArgumentInfo(inst);

-    if (fsize != 16) {
+    if constexpr (fsize != 16) {
         if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX) && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
             const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
             const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
@@ -1466,8 +1485,9 @@ void EmitX64::EmitFPHalfToDouble(EmitContext& ctx, IR::Inst* inst) {
         // Double-conversion here is acceptable as this is expanding precision.
         code.vcvtph2ps(result, value);
         code.vcvtps2pd(result, result);
-        if (ctx.FPCR().DN())
-            ForceToDefaultNaN(code, result, 64);
+        if (ctx.FPCR().DN()) {
+            ForceToDefaultNaN<64>(code, result);
+        }

         ctx.reg_alloc.DefineValue(inst, result);
         return;
@@ -1489,8 +1509,9 @@ void EmitX64::EmitFPHalfToSingle(EmitContext& ctx, IR::Inst* inst) {
         const Xbyak::Xmm value = ctx.reg_alloc.UseXmm(args[0]);

         code.vcvtph2ps(result, value);
-        if (ctx.FPCR().DN())
-            ForceToDefaultNaN(code, result, 32);
+        if (ctx.FPCR().DN()) {
+            ForceToDefaultNaN<32>(code, result);
+        }

         ctx.reg_alloc.DefineValue(inst, result);
         return;
@@ -1498,22 +1519,23 @@

     ctx.reg_alloc.HostCall(inst, args[0]);
     code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value());
-    code.mov(code.ABI_PARAM3.cvt32(), u32(rounding_mode));
+    code.mov(code.ABI_PARAM3.cvt32(), static_cast<u32>(rounding_mode));
     code.lea(code.ABI_PARAM4, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
     code.CallFunction(&FP::FPConvert<u32, u16>);
 }

 void EmitX64::EmitFPSingleToDouble(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const auto rounding_mode = FP::RoundingMode(args[1].GetImmediateU8());
+    const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8());

     // We special-case the non-IEEE-defined ToOdd rounding mode.
     if (rounding_mode == ctx.FPCR().RMode() && rounding_mode != FP::RoundingMode::ToOdd) {
         const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);

         code.cvtss2sd(result, result);
-        if (ctx.FPCR().DN())
-            ForceToDefaultNaN(code, result, 64);
+        if (ctx.FPCR().DN()) {
+            ForceToDefaultNaN<64>(code, result);
+        }
         ctx.reg_alloc.DefineValue(inst, result);
     } else {
         ctx.reg_alloc.HostCall(inst, args[0]);
@@ -1531,9 +1553,12 @@ void EmitX64::EmitFPSingleToHalf(EmitContext& ctx, IR::Inst* inst) {

     if (code.HasHostFeature(HostFeature::F16C) && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) {
         const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
-        if (ctx.FPCR().DN())
-            ForceToDefaultNaN(code, result, 32);
-        code.vcvtps2ph(result, result, u8(*round_imm));
+
+        if (ctx.FPCR().DN()) {
+            ForceToDefaultNaN<32>(code, result);
+        }
+        code.vcvtps2ph(result, result, static_cast<u8>(*round_imm));

         ctx.reg_alloc.DefineValue(inst, result);
         return;
     }
@@ -1561,18 +1586,21 @@ void EmitX64::EmitFPDoubleToHalf(EmitContext& ctx, IR::Inst* inst) {

 void EmitX64::EmitFPDoubleToSingle(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const auto rounding_mode = FP::RoundingMode(args[1].GetImmediateU8());
+    const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8());

     // We special-case the non-IEEE-defined ToOdd rounding mode.
     if (rounding_mode == ctx.FPCR().RMode() && rounding_mode != FP::RoundingMode::ToOdd) {
         const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);

         code.cvtsd2ss(result, result);
-        if (ctx.FPCR().DN())
-            ForceToDefaultNaN(code, result, 32);
+        if (ctx.FPCR().DN()) {
+            ForceToDefaultNaN<32>(code, result);
+        }
         ctx.reg_alloc.DefineValue(inst, result);
     } else {
         ctx.reg_alloc.HostCall(inst, args[0]);
         code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value());
-        code.mov(code.ABI_PARAM3.cvt32(), u32(rounding_mode));
+        code.mov(code.ABI_PARAM3.cvt32(), static_cast<u32>(rounding_mode));
         code.lea(code.ABI_PARAM4, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
         code.CallFunction(&FP::FPConvert<u32, u64>);
     }
@@ -1587,7 +1615,7 @@
 /// Better than spamming thousands of templates aye?
 template<size_t fsize>
 static u64 EmitFPToFixedThunk(u64 input, FP::FPSR& fpsr, FP::FPCR fpcr, u32 extra_args) {
-    using FPT = Common::UnsignedIntegerN<fsize>;
+    using FPT = mcl::unsigned_integer_of_size<fsize>;
     auto const unsigned_ = ((extra_args >> 24) & 0xff) != 0;
     auto const isize = ((extra_args >> 16) & 0xff);
     auto const rounding = FP::RoundingMode((extra_args >> 8) & 0xff);
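     // extra_args packs the parameters: bits 31:24 = unsigned flag, bits 23:16 = isize, bits 15:8 = rounding mode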
@@ -1602,7 +1630,7 @@ static void EmitFPToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     const size_t fbits = args[1].GetImmediateU8();
     const auto rounding_mode = FP::RoundingMode(args[2].GetImmediateU8());

-    if (fsize != 16) {
+    if constexpr (fsize != 16) {
         const auto round_imm = ConvertRoundingModeToX64Immediate(rounding_mode);

         // cvttsd2si truncates during operation so rounding (and thus SSE4.1) not required
@@ -1612,7 +1640,7 @@
         const Xbyak::Xmm src = ctx.reg_alloc.UseScratchXmm(args[0]);
         const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr().cvt64();

-        if (fsize == 64) {
+        if constexpr (fsize == 64) {
             if (fbits != 0) {
                 const u64 scale_factor = static_cast<u64>((fbits + 1023) << 52);
                 code.mulsd(src, code.Const(xword, scale_factor));
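                 // (fbits + 1023) << 52 is the f64 bit pattern of 2.0^fbits; the multiply scales into fixed-point range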
@@ -1634,13 +1662,13 @@
             code.cvtss2sd(src, src);
         }

-        if (isize == 64) {
+        if constexpr (isize == 64) {
             const Xbyak::Xmm scratch = ctx.reg_alloc.ScratchXmm();

             if (!unsigned_) {
                 SharedLabel saturate_max = GenSharedLabel(), end = GenSharedLabel();

-                ZeroIfNaN(code, src, scratch, 64);
+                ZeroIfNaN<64>(code, src, scratch);

                 code.movsd(scratch, code.Const(xword, f64_max_s64_lim));
                 code.comisd(scratch, src);
@@ -1678,11 +1706,11 @@
                 code.sar(result2, 63);
                 code.or_(result, result2);
             }
-        } else if (isize == 32) {
+        } else if constexpr (isize == 32) {
            if (!unsigned_) {
                const Xbyak::Xmm scratch = ctx.reg_alloc.ScratchXmm();

-                ZeroIfNaN(code, src, scratch, 64);
+                ZeroIfNaN<64>(code, src, scratch);
                code.minsd(src, code.Const(xword, f64_max_s32));
                // maxsd not required as cvttsd2si results in 0x8000'0000 when out of range
                code.cvttsd2si(result.cvt32(), src);  // 32 bit gpr
@@ -1695,7 +1723,7 @@
         } else {
             const Xbyak::Xmm scratch = ctx.reg_alloc.ScratchXmm();

-            ZeroIfNaN(code, src, scratch, 64);
+            ZeroIfNaN<64>(code, src, scratch);
             code.maxsd(src, code.Const(xword, unsigned_ ? f64_min_u16 : f64_min_s16));
             code.minsd(src, code.Const(xword, unsigned_ ? f64_max_u16 : f64_max_s16));
             code.cvttsd2si(result, src);  // 64 bit gpr