diff --git a/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp b/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp index f037919eb0..ee68fa4869 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp @@ -60,8 +60,10 @@ static Xbyak::Address MJitStateExtReg(A32::ExtReg reg) { UNREACHABLE(); } -A32EmitContext::A32EmitContext(const A32::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block) - : EmitContext(reg_alloc, block), conf(conf) {} +A32EmitContext::A32EmitContext(const A32::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block, std::vector& shared_labels) + : EmitContext(reg_alloc, block, shared_labels) + , conf(conf) +{} A32::LocationDescriptor A32EmitContext::Location() const { return A32::LocationDescriptor{block.Location()}; @@ -110,7 +112,11 @@ A32EmitX64::BlockDescriptor A32EmitX64::Emit(IR::Block& block) { gprs.reset(size_t(HostLoc::R14)); return gprs; }(), any_xmm); - A32EmitContext ctx{conf, reg_alloc, block}; + + // up to 2 labels per insn + if (auto const inst_count = block.instructions.size(); inst_count > shared_labels.capacity()) + shared_labels.reserve(inst_count * 8); + A32EmitContext ctx{conf, reg_alloc, block, shared_labels}; // Start emitting. code.align(); @@ -168,6 +174,7 @@ A32EmitX64::BlockDescriptor A32EmitX64::Emit(IR::Block& block) { auto const bdesc = RegisterBlock(descriptor, entrypoint, size); code.DisableWriting(); + shared_labels.clear(); return bdesc; } diff --git a/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h b/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h index 5ec78ff50e..8980d2b7e6 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h +++ b/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h @@ -29,7 +29,7 @@ namespace Dynarmic::Backend::X64 { class RegAlloc; struct A32EmitContext final : public EmitContext { - A32EmitContext(const A32::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block); + A32EmitContext(const A32::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block, std::vector& shared_labels); A32::LocationDescriptor Location() const; A32::LocationDescriptor EndLocation() const; @@ -130,6 +130,7 @@ public: ankerl::unordered_dense::map, void (*)()> write_fallbacks; ankerl::unordered_dense::map, void (*)()> exclusive_write_fallbacks; ankerl::unordered_dense::set do_not_fastmem; + std::vector shared_labels; void (*memory_read_128)() = nullptr; // Dummy void (*memory_write_128)() = nullptr; // Dummy const void* terminal_handler_pop_rsb_hint; diff --git a/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp b/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp index ff82d8b05c..9c40240e0a 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp @@ -38,8 +38,10 @@ namespace Dynarmic::Backend::X64 { using namespace Xbyak::util; -A64EmitContext::A64EmitContext(const A64::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block) - : EmitContext(reg_alloc, block), conf(conf) {} +A64EmitContext::A64EmitContext(const A64::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block, std::vector& shared_labels) + : EmitContext(reg_alloc, block, shared_labels) + , conf(conf) +{} A64::LocationDescriptor A64EmitContext::Location() const { return A64::LocationDescriptor{block.Location()}; @@ -84,7 +86,11 @@ A64EmitX64::BlockDescriptor A64EmitX64::Emit(IR::Block& block) noexcept { gprs.reset(size_t(HostLoc::R14)); return gprs; }(), any_xmm}; - A64EmitContext ctx{conf, reg_alloc, block}; + + // up to 2 labels per insn + if (auto const inst_count = block.instructions.size(); inst_count > shared_labels.capacity()) + shared_labels.reserve(inst_count * 8); + A64EmitContext ctx{conf, reg_alloc, block, shared_labels}; // Start emitting. code.align(); @@ -162,6 +168,7 @@ finish_this_inst: auto bdesc = RegisterBlock(descriptor, entrypoint, size); code.DisableWriting(); + shared_labels.clear(); return bdesc; } diff --git a/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h b/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h index dd556e36ce..4e632e6824 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h +++ b/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h @@ -27,7 +27,7 @@ namespace Dynarmic::Backend::X64 { struct A64EmitContext final : public EmitContext { - A64EmitContext(const A64::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block); + A64EmitContext(const A64::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block, std::vector& shared_labels); A64::LocationDescriptor Location() const; bool IsSingleStep() const; @@ -126,6 +126,7 @@ public: ankerl::unordered_dense::map, void (*)()> write_fallbacks; ankerl::unordered_dense::map, void (*)()> exclusive_write_fallbacks; ankerl::unordered_dense::set do_not_fastmem; + std::vector shared_labels; const void* terminal_handler_pop_rsb_hint = nullptr; const void* terminal_handler_fast_dispatch_hint = nullptr; FastDispatchEntry& (*fast_dispatch_table_lookup)(u64) = nullptr; diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp b/src/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp index 2b0540e4a7..e498c2ec9c 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp @@ -33,8 +33,11 @@ namespace Dynarmic::Backend::X64 { using namespace Xbyak::util; -EmitContext::EmitContext(RegAlloc& reg_alloc, IR::Block& block) - : reg_alloc(reg_alloc), block(block) {} +EmitContext::EmitContext(RegAlloc& reg_alloc, IR::Block& block, std::vector& shared_labels) + : reg_alloc(reg_alloc) + , block(block) + , shared_labels(shared_labels) +{} EmitContext::~EmitContext() = default; diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64.h b/src/dynarmic/src/dynarmic/backend/x64/emit_x64.h index 5de5f2dc7a..50fa550413 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64.h +++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64.h @@ -16,6 +16,7 @@ #include #include +#include #include #include #include "dynarmic/backend/x64/xbyak.h" @@ -52,24 +53,24 @@ using VectorArray = std::array> template using HalfVectorArray = std::array / 2>; +using SharedLabel = Xbyak::Label*; struct EmitContext { - EmitContext(RegAlloc& reg_alloc, IR::Block& block); + EmitContext(RegAlloc& reg_alloc, IR::Block& block, std::vector& shared_labels); virtual ~EmitContext(); virtual FP::FPCR FPCR(bool fpcr_controlled = true) const = 0; virtual bool HasOptimization(OptimizationFlag flag) const = 0; - RegAlloc& reg_alloc; - IR::Block& block; + [[nodiscard]] inline Xbyak::Label* GenSharedLabel() noexcept { + DEBUG_ASSERT(shared_labels.size() + 1 <= shared_labels.capacity()); + return &shared_labels.emplace_back(); + } std::vector> deferred_emits; + RegAlloc& reg_alloc; + IR::Block& block; + std::vector& shared_labels; }; -using SharedLabel = std::shared_ptr; - -inline SharedLabel GenSharedLabel() { - return std::make_shared(); -} - class EmitX64 { public: struct BlockDescriptor { diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp index 76c103ec6f..ddba34626e 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp @@ -144,7 +144,7 @@ void ForceToDefaultNaN(BlockOfCode& code, Xbyak::Xmm result) { template SharedLabel ProcessNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm a) { - SharedLabel nan = GenSharedLabel(), end = GenSharedLabel(); + SharedLabel nan = ctx.GenSharedLabel(), end = ctx.GenSharedLabel(); FCODE(ucomis)(a, a); code.jp(*nan, code.T_NEAR); @@ -259,7 +259,7 @@ template void FPTwoOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - SharedLabel end = GenSharedLabel(); + SharedLabel end = ctx.GenSharedLabel(); Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); @@ -312,7 +312,7 @@ void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(code); - SharedLabel end = GenSharedLabel(), nan = GenSharedLabel(); + SharedLabel end = ctx.GenSharedLabel(), nan = ctx.GenSharedLabel(); code.movaps(result, op1); if constexpr (std::is_member_function_pointer_v) { @@ -421,7 +421,7 @@ static void EmitFPMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, bo DenormalsAreZero(code, ctx, {result, operand}); - SharedLabel equal = GenSharedLabel(), end = GenSharedLabel(); + SharedLabel equal = ctx.GenSharedLabel(), end = ctx.GenSharedLabel(); FCODE(ucomis)(result, operand); code.jz(*equal, code.T_NEAR); @@ -492,7 +492,7 @@ static inline void EmitFPMinMaxNumeric(BlockOfCode& code, EmitContext& ctx, IR:: } }; - SharedLabel end = GenSharedLabel(), z = GenSharedLabel(); + SharedLabel end = ctx.GenSharedLabel(), z = ctx.GenSharedLabel(); FCODE(ucomis)(op1, op2); code.jz(*z, code.T_NEAR); @@ -640,7 +640,7 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, bo } if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX)) { - SharedLabel fallback = GenSharedLabel(), end = GenSharedLabel(); + SharedLabel fallback = ctx.GenSharedLabel(), end = ctx.GenSharedLabel(); const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(code, args[0]); const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]); @@ -851,7 +851,7 @@ static void EmitFPMulX(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); const Xbyak::Reg64 tmp = do_default_nan ? INVALID_REG : ctx.reg_alloc.ScratchGpr(code); - SharedLabel end = GenSharedLabel(), nan = GenSharedLabel(); + SharedLabel end = ctx.GenSharedLabel(), nan = ctx.GenSharedLabel(); if (code.HasHostFeature(HostFeature::AVX)) { FCODE(vmuls)(result, op1, op2); @@ -989,7 +989,7 @@ static void EmitFPRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* } if (code.HasHostFeature(HostFeature::FMA)) { - SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel(); + SharedLabel end = ctx.GenSharedLabel(), fallback = ctx.GenSharedLabel(); const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(code, args[0]); const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]); @@ -1137,7 +1137,7 @@ static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i const Xbyak::Xmm value = ctx.reg_alloc.ScratchXmm(code); [[maybe_unused]] const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr(code).cvt32(); - SharedLabel bad_values = GenSharedLabel(), end = GenSharedLabel(); + SharedLabel bad_values = ctx.GenSharedLabel(), end = ctx.GenSharedLabel(); code.movaps(value, operand); @@ -1304,7 +1304,7 @@ static void EmitFPRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* } if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX)) { - SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel(); + SharedLabel end = ctx.GenSharedLabel(), fallback = ctx.GenSharedLabel(); const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(code, args[0]); const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]); @@ -1649,7 +1649,7 @@ static void EmitFPToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { const Xbyak::Xmm scratch = ctx.reg_alloc.ScratchXmm(code); if (!unsigned_) { - SharedLabel saturate_max = GenSharedLabel(), end = GenSharedLabel(); + SharedLabel saturate_max = ctx.GenSharedLabel(), end = ctx.GenSharedLabel(); ZeroIfNaN<64>(code, src, scratch); diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc index aaed8b43f2..a95d9824c8 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc +++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc @@ -85,7 +85,7 @@ void AxxEmitX64::EmitMemoryRead(AxxEmitContext& ctx, IR::Inst* inst) { const auto wrapped_fn = read_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)]; - SharedLabel abort = GenSharedLabel(), end = GenSharedLabel(); + SharedLabel abort = ctx.GenSharedLabel(), end = ctx.GenSharedLabel(); if (fastmem_marker) { // Use fastmem @@ -107,7 +107,7 @@ void AxxEmitX64::EmitMemoryRead(AxxEmitContext& ctx, IR::Inst* inst) { conf.recompile_on_fastmem_failure, }); - EmitCheckMemoryAbort(ctx, inst, end.get()); + EmitCheckMemoryAbort(ctx, inst, end); code.jmp(*end, code.T_NEAR); }); } else { @@ -119,7 +119,7 @@ void AxxEmitX64::EmitMemoryRead(AxxEmitContext& ctx, IR::Inst* inst) { ctx.deferred_emits.emplace_back([=, this, &ctx] { code.L(*abort); code.call(wrapped_fn); - EmitCheckMemoryAbort(ctx, inst, end.get()); + EmitCheckMemoryAbort(ctx, inst, end); code.jmp(*end, code.T_NEAR); }); } @@ -172,7 +172,7 @@ void AxxEmitX64::EmitMemoryWrite(AxxEmitContext& ctx, IR::Inst* inst) { const auto wrapped_fn = write_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)]; - SharedLabel abort = GenSharedLabel(), end = GenSharedLabel(); + SharedLabel abort = ctx.GenSharedLabel(), end = ctx.GenSharedLabel(); if (fastmem_marker) { // Use fastmem @@ -194,7 +194,7 @@ void AxxEmitX64::EmitMemoryWrite(AxxEmitContext& ctx, IR::Inst* inst) { conf.recompile_on_fastmem_failure, }); - EmitCheckMemoryAbort(ctx, inst, end.get()); + EmitCheckMemoryAbort(ctx, inst, end); code.jmp(*end, code.T_NEAR); }); } else { @@ -206,7 +206,7 @@ void AxxEmitX64::EmitMemoryWrite(AxxEmitContext& ctx, IR::Inst* inst) { ctx.deferred_emits.emplace_back([=, this, &ctx] { code.L(*abort); code.call(wrapped_fn); - EmitCheckMemoryAbort(ctx, inst, end.get()); + EmitCheckMemoryAbort(ctx, inst, end); code.jmp(*end, code.T_NEAR); }); } @@ -351,7 +351,7 @@ void AxxEmitX64::EmitExclusiveReadMemoryInline(AxxEmitContext& ctx, IR::Inst* in const auto fastmem_marker = ShouldFastmem(ctx, inst); if (fastmem_marker) { - SharedLabel abort = GenSharedLabel(), end = GenSharedLabel(); + SharedLabel abort = ctx.GenSharedLabel(), end = ctx.GenSharedLabel(); bool require_abort_handling = false; const auto src_ptr = EmitFastmemVAddr(code, ctx, *abort, vaddr, require_abort_handling); @@ -426,7 +426,7 @@ void AxxEmitX64::EmitExclusiveWriteMemoryInline(AxxEmitContext& ctx, IR::Inst* i EmitExclusiveLock(code, conf, tmp, tmp2.cvt32()); - SharedLabel end = GenSharedLabel(); + SharedLabel end = ctx.GenSharedLabel(); code.mov(status, u32(1)); code.movzx(tmp.cvt32(), code.byte[code.ABI_JIT_PTR + offsetof(AxxJitState, exclusive_state)]); @@ -459,7 +459,7 @@ void AxxEmitX64::EmitExclusiveWriteMemoryInline(AxxEmitContext& ctx, IR::Inst* i const auto fastmem_marker = ShouldFastmem(ctx, inst); if (fastmem_marker) { - SharedLabel abort = GenSharedLabel(); + SharedLabel abort = ctx.GenSharedLabel(); bool require_abort_handling = false; const auto dest_ptr = EmitFastmemVAddr(code, ctx, *abort, vaddr, require_abort_handling, tmp); diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h index 211f620ceb..11c489d771 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h +++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h @@ -52,7 +52,7 @@ void EmitDetectMisalignedVAddr(BlockOfCode& code, EmitContext& ctx, size_t bitsi if (ctx.conf.only_detect_misalignment_via_page_table_on_page_boundary) { const u32 page_align_mask = static_cast(page_size - 1) & ~align_mask; - SharedLabel detect_boundary = GenSharedLabel(), resume = GenSharedLabel(); + SharedLabel detect_boundary = ctx.GenSharedLabel(), resume = ctx.GenSharedLabel(); code.jnz(*detect_boundary, code.T_NEAR); code.L(*resume); diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp index 2247b18fcd..3ecb3a7c38 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp @@ -30,6 +30,7 @@ #include "dynarmic/common/fp/fpcr.h" #include "dynarmic/common/fp/info.h" #include "dynarmic/common/fp/op.h" +#include "dynarmic/common/fp/rounding_mode.h" #include "dynarmic/common/fp/util.h" #include "dynarmic/common/lut_from_list.h" #include "dynarmic/interface/optimization_flags.h" @@ -101,7 +102,7 @@ void HandleNaNs(BlockOfCode& code, EmitContext& ctx, bool fpcr_controlled, std:: code.cmp(bitmask, 0); } - SharedLabel end = GenSharedLabel(), nan = GenSharedLabel(); + SharedLabel end = ctx.GenSharedLabel(), nan = ctx.GenSharedLabel(); code.jnz(*nan, code.T_NEAR); code.L(*end); @@ -196,23 +197,6 @@ void ForceToDefaultNaN(BlockOfCode& code, FP::FPCR fpcr, Xbyak::Xmm result) { } } -template -void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm result) { - const Xbyak::Xmm nan_mask = xmm0; - if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { - constexpr u32 nan_to_zero = FixupLUT(FpFixup::PosZero, - FpFixup::PosZero); - FCODE(vfixupimmp)(result, result, code.BConst<32>(ptr_b, nan_to_zero), u8(0)); - } else if (code.HasHostFeature(HostFeature::AVX)) { - FCODE(vcmpordp)(nan_mask, result, result); - FCODE(vandp)(result, result, nan_mask); - } else { - code.movaps(nan_mask, result); - FCODE(cmpordp)(nan_mask, nan_mask); - code.andps(result, nan_mask); - } -} - template void DenormalsAreZero(BlockOfCode& code, FP::FPCR fpcr, std::initializer_list to_daz, Xbyak::Xmm tmp) { if (fpcr.FZ()) { @@ -1338,7 +1322,7 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); - SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel(); + SharedLabel end = ctx.GenSharedLabel(), fallback = ctx.GenSharedLabel(); MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { code.movaps(result, xmm_a); @@ -1611,7 +1595,7 @@ static void EmitRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]); const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); - SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel(); + SharedLabel end = ctx.GenSharedLabel(), fallback = ctx.GenSharedLabel(); MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { code.movaps(result, GetVectorOf(code)); @@ -1784,7 +1768,7 @@ static void EmitRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); const Xbyak::Xmm value = ctx.reg_alloc.ScratchXmm(code); - SharedLabel bad_values = GenSharedLabel(), end = GenSharedLabel(); + SharedLabel bad_values = ctx.GenSharedLabel(), end = ctx.GenSharedLabel(); code.movaps(value, operand); @@ -1875,7 +1859,7 @@ static void EmitRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(code); - SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel(); + SharedLabel end = ctx.GenSharedLabel(), fallback = ctx.GenSharedLabel(); MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { code.vmovaps(result, GetVectorOf(code)); @@ -2011,120 +1995,123 @@ void EmitX64::EmitFPVectorToHalf32(EmitContext& ctx, IR::Inst* inst) { template void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { const size_t fbits = inst->GetArg(1).GetU8(); - const auto rounding = static_cast(inst->GetArg(2).GetU8()); + const auto rounding = FP::RoundingMode(inst->GetArg(2).GetU8()); [[maybe_unused]] const bool fpcr_controlled = inst->GetArg(3).GetU1(); - if constexpr (fsize != 16) { - if (code.HasHostFeature(HostFeature::SSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - const Xbyak::Xmm src = ctx.reg_alloc.UseScratchXmm(code, args[0]); + if (code.HasHostFeature(HostFeature::SSE41) && fsize != 16 && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm src = ctx.reg_alloc.UseScratchXmm(code, args[0]); + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + const int round_imm = [&] { + switch (rounding) { + case FP::RoundingMode::ToNearest_TieEven: + default: + return 0b00; + case FP::RoundingMode::TowardsPlusInfinity: + return 0b10; + case FP::RoundingMode::TowardsMinusInfinity: + return 0b01; + case FP::RoundingMode::TowardsZero: + return 0b11; + } + }(); + const auto perform_conversion = [&code, &ctx](const Xbyak::Xmm& src) { + // MSVC doesn't allow us to use a [&] capture, so we have to do this instead. + (void)ctx; - MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { - const int round_imm = [&] { - switch (rounding) { - case FP::RoundingMode::ToNearest_TieEven: - default: - return 0b00; - case FP::RoundingMode::TowardsPlusInfinity: - return 0b10; - case FP::RoundingMode::TowardsMinusInfinity: - return 0b01; - case FP::RoundingMode::TowardsZero: - return 0b11; - } - }(); + if constexpr (fsize == 32) { + code.cvttps2dq(src, src); + } else { + if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { + code.vcvttpd2qq(src, src); + } else { + const Xbyak::Reg64 hi = ctx.reg_alloc.ScratchGpr(code); + const Xbyak::Reg64 lo = ctx.reg_alloc.ScratchGpr(code); - const auto perform_conversion = [&code, &ctx](const Xbyak::Xmm& src) { - // MSVC doesn't allow us to use a [&] capture, so we have to do this instead. - (void)ctx; + code.cvttsd2si(lo, src); + code.punpckhqdq(src, src); + code.cvttsd2si(hi, src); + code.movq(src, lo); + code.pinsrq(src, hi, 1); - if constexpr (fsize == 32) { - code.cvttps2dq(src, src); - } else { - if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { - code.vcvttpd2qq(src, src); - } else { - const Xbyak::Reg64 hi = ctx.reg_alloc.ScratchGpr(code); - const Xbyak::Reg64 lo = ctx.reg_alloc.ScratchGpr(code); - - code.cvttsd2si(lo, src); - code.punpckhqdq(src, src); - code.cvttsd2si(hi, src); - code.movq(src, lo); - code.pinsrq(src, hi, 1); - - ctx.reg_alloc.Release(hi); - ctx.reg_alloc.Release(lo); - } + ctx.reg_alloc.Release(hi); + ctx.reg_alloc.Release(lo); } - }; - - if (fbits != 0) { - const u64 scale_factor = fsize == 32 - ? static_cast(fbits + 127) << 23 - : static_cast(fbits + 1023) << 52; - FCODE(mulp)(src, GetVectorOf(code, scale_factor)); } + }; + if (fbits != 0) { + const u64 scale_factor = fsize == 32 + ? u64(fbits + 127) << 23 + : u64(fbits + 1023) << 52; + FCODE(mulp)(src, GetVectorOf(code, scale_factor)); + } - FCODE(roundp)(src, src, static_cast(round_imm)); - ZeroIfNaN(code, src); + FCODE(roundp)(src, src, u8(round_imm)); + const Xbyak::Xmm nan_mask = xmm0; + if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { + static constexpr u32 nan_to_zero = FixupLUT(FpFixup::PosZero, FpFixup::PosZero); + FCODE(vfixupimmp)(src, src, code.BConst<32>(ptr_b, nan_to_zero), u8(0)); + } else if (code.HasHostFeature(HostFeature::AVX)) { + FCODE(vcmpordp)(nan_mask, src, src); + FCODE(vandp)(src, src, nan_mask); + } else { + code.movaps(nan_mask, src); + FCODE(cmpordp)(nan_mask, nan_mask); + code.andps(src, nan_mask); + } - constexpr u64 float_upper_limit_signed = fsize == 32 ? 0x4f000000 : 0x43e0000000000000; - [[maybe_unused]] constexpr u64 float_upper_limit_unsigned = fsize == 32 ? 0x4f800000 : 0x43f0000000000000; + constexpr u64 float_upper_limit_signed = fsize == 32 ? 0x4f000000 : 0x43e0000000000000; + [[maybe_unused]] constexpr u64 float_upper_limit_unsigned = fsize == 32 ? 0x4f800000 : 0x43f0000000000000; - if constexpr (unsigned_) { - if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { - // Mask positive values - code.xorps(xmm0, xmm0); - FCODE(vcmpp)(k1, src, xmm0, Cmp::GreaterEqual_OQ); - - // Convert positive values to unsigned integers, write 0 anywhere else - // vcvttp*2u*q already saturates out-of-range values to (0xFFFF...) - if constexpr (fsize == 32) { - code.vcvttps2udq(src | k1 | T_z, src); - } else { - code.vcvttpd2uqq(src | k1 | T_z, src); - } + if constexpr (unsigned_) { + if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { + // Mask positive values + code.xorps(xmm0, xmm0); + FCODE(vcmpp)(k1, src, xmm0, Cmp::GreaterEqual_OQ); + + // Convert positive values to unsigned integers, write 0 anywhere else + // vcvttp*2u*q already saturates out-of-range values to (0xFFFF...) + if (fsize == 32) { + code.vcvttps2udq(src | k1 | T_z, src); } else { - // Zero is minimum - code.xorps(xmm0, xmm0); - FCODE(cmplep)(xmm0, src); - FCODE(andp)(src, xmm0); - - // Will we exceed unsigned range? - const Xbyak::Xmm exceed_unsigned = ctx.reg_alloc.ScratchXmm(code); - code.movaps(exceed_unsigned, GetVectorOf(code)); - FCODE(cmplep)(exceed_unsigned, src); - - // Will be exceed signed range? - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); - code.movaps(tmp, GetVectorOf(code)); - code.movaps(xmm0, tmp); - FCODE(cmplep)(xmm0, src); - FCODE(andp)(tmp, xmm0); - FCODE(subp)(src, tmp); - perform_conversion(src); - ICODE(psll)(xmm0, u8(fsize - 1)); - FCODE(orp)(src, xmm0); - - // Saturate to max - FCODE(orp)(src, exceed_unsigned); + code.vcvttpd2uqq(src | k1 | T_z, src); } } else { - using FPT = mcl::unsigned_integer_of_size; // WORKAROUND: For issue 678 on MSVC - constexpr u64 integer_max = FPT((std::numeric_limits>>::max)()); + // Zero is minimum + code.xorps(xmm0, xmm0); + FCODE(cmplep)(xmm0, src); + FCODE(andp)(src, xmm0); - code.movaps(xmm0, GetVectorOf(code)); + // Will we exceed unsigned range? + const Xbyak::Xmm exceed_unsigned = ctx.reg_alloc.ScratchXmm(code); + code.movaps(exceed_unsigned, GetVectorOf(code)); + FCODE(cmplep)(exceed_unsigned, src); + + // Will be exceed signed range? + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + code.movaps(tmp, GetVectorOf(code)); + code.movaps(xmm0, tmp); FCODE(cmplep)(xmm0, src); + FCODE(andp)(tmp, xmm0); + FCODE(subp)(src, tmp); perform_conversion(src); - FCODE(blendvp)(src, GetVectorOf(code)); - } - }); + ICODE(psll)(xmm0, u8(fsize - 1)); + FCODE(orp)(src, xmm0); - ctx.reg_alloc.DefineValue(code, inst, src); - return; - } + // Saturate to max + FCODE(orp)(src, exceed_unsigned); + } + } else { + using FPT = mcl::unsigned_integer_of_size; // WORKAROUND: For issue 678 on MSVC + constexpr u64 integer_max = FPT((std::numeric_limits>>::max)()); + code.movaps(xmm0, GetVectorOf(code)); + FCODE(cmplep)(xmm0, src); + perform_conversion(src); + FCODE(blendvp)(src, GetVectorOf(code)); + } + }); + ctx.reg_alloc.DefineValue(code, inst, src); + return; } using fbits_list = mp::lift_sequence>;