Browse Source

[dynarmic] avoid stable_vector<> reallocations for shared labels

Signed-off-by: lizzie <lizzie@eden-emu.dev>
lizzie/dynarmic-shared-labels-better
lizzie 1 day ago
parent
commit
1e006962ea
  1. 13
      src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp
  2. 3
      src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h
  3. 13
      src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp
  4. 3
      src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h
  5. 7
      src/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp
  6. 19
      src/dynarmic/src/dynarmic/backend/x64/emit_x64.h
  7. 22
      src/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
  8. 18
      src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc
  9. 2
      src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h
  10. 61
      src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp

13
src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp

@ -60,8 +60,10 @@ static Xbyak::Address MJitStateExtReg(A32::ExtReg reg) {
UNREACHABLE(); UNREACHABLE();
} }
A32EmitContext::A32EmitContext(const A32::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block)
: EmitContext(reg_alloc, block), conf(conf) {}
// Constructs the A32 emission context: forwards the emitter-owned shared-label
// storage to the base EmitContext and captures the A32 user configuration.
A32EmitContext::A32EmitContext(const A32::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block, std::vector<Xbyak::Label>& shared_labels)
: EmitContext(reg_alloc, block, shared_labels)
, conf(conf)
{}
A32::LocationDescriptor A32EmitContext::Location() const { A32::LocationDescriptor A32EmitContext::Location() const {
return A32::LocationDescriptor{block.Location()}; return A32::LocationDescriptor{block.Location()};
@ -110,7 +112,11 @@ A32EmitX64::BlockDescriptor A32EmitX64::Emit(IR::Block& block) {
gprs.reset(size_t(HostLoc::R14)); gprs.reset(size_t(HostLoc::R14));
return gprs; return gprs;
}(), any_xmm); }(), any_xmm);
A32EmitContext ctx{conf, reg_alloc, block};
// Reserve shared-label storage up-front: GenSharedLabel() returns raw pointers
// into this vector, so it must never reallocate while a block is being emitted.
// NOTE(review): the code reserves inst_count * 8 (the old comment said 2), but
// the guard compares capacity() against inst_count only — confirm a stale
// capacity between inst_count and inst_count * 8 can never be exceeded mid-emit.
if (auto const inst_count = block.instructions.size(); inst_count > shared_labels.capacity())
shared_labels.reserve(inst_count * 8);
A32EmitContext ctx{conf, reg_alloc, block, shared_labels};
// Start emitting. // Start emitting.
code.align(); code.align();
@ -168,6 +174,7 @@ A32EmitX64::BlockDescriptor A32EmitX64::Emit(IR::Block& block) {
auto const bdesc = RegisterBlock(descriptor, entrypoint, size); auto const bdesc = RegisterBlock(descriptor, entrypoint, size);
code.DisableWriting(); code.DisableWriting();
shared_labels.clear();
return bdesc; return bdesc;
} }

3
src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h

@ -29,7 +29,7 @@ namespace Dynarmic::Backend::X64 {
class RegAlloc; class RegAlloc;
struct A32EmitContext final : public EmitContext { struct A32EmitContext final : public EmitContext {
A32EmitContext(const A32::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block);
A32EmitContext(const A32::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block, std::vector<Xbyak::Label>& shared_labels);
A32::LocationDescriptor Location() const; A32::LocationDescriptor Location() const;
A32::LocationDescriptor EndLocation() const; A32::LocationDescriptor EndLocation() const;
@ -130,6 +130,7 @@ public:
ankerl::unordered_dense::map<std::tuple<bool, size_t, int, int>, void (*)()> write_fallbacks; ankerl::unordered_dense::map<std::tuple<bool, size_t, int, int>, void (*)()> write_fallbacks;
ankerl::unordered_dense::map<std::tuple<bool, size_t, int, int>, void (*)()> exclusive_write_fallbacks; ankerl::unordered_dense::map<std::tuple<bool, size_t, int, int>, void (*)()> exclusive_write_fallbacks;
ankerl::unordered_dense::set<DoNotFastmemMarker> do_not_fastmem; ankerl::unordered_dense::set<DoNotFastmemMarker> do_not_fastmem;
std::vector<Xbyak::Label> shared_labels;
void (*memory_read_128)() = nullptr; // Dummy void (*memory_read_128)() = nullptr; // Dummy
void (*memory_write_128)() = nullptr; // Dummy void (*memory_write_128)() = nullptr; // Dummy
const void* terminal_handler_pop_rsb_hint; const void* terminal_handler_pop_rsb_hint;

13
src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp

@ -38,8 +38,10 @@ namespace Dynarmic::Backend::X64 {
using namespace Xbyak::util; using namespace Xbyak::util;
A64EmitContext::A64EmitContext(const A64::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block)
: EmitContext(reg_alloc, block), conf(conf) {}
// Constructs the A64 emission context: forwards the emitter-owned shared-label
// storage to the base EmitContext and captures the A64 user configuration.
A64EmitContext::A64EmitContext(const A64::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block, std::vector<Xbyak::Label>& shared_labels)
: EmitContext(reg_alloc, block, shared_labels)
, conf(conf)
{}
A64::LocationDescriptor A64EmitContext::Location() const { A64::LocationDescriptor A64EmitContext::Location() const {
return A64::LocationDescriptor{block.Location()}; return A64::LocationDescriptor{block.Location()};
@ -84,7 +86,11 @@ A64EmitX64::BlockDescriptor A64EmitX64::Emit(IR::Block& block) noexcept {
gprs.reset(size_t(HostLoc::R14)); gprs.reset(size_t(HostLoc::R14));
return gprs; return gprs;
}(), any_xmm}; }(), any_xmm};
A64EmitContext ctx{conf, reg_alloc, block};
// Reserve shared-label storage up-front: GenSharedLabel() returns raw pointers
// into this vector, so it must never reallocate while a block is being emitted.
// NOTE(review): the code reserves inst_count * 8 (the old comment said 2), but
// the guard compares capacity() against inst_count only — confirm a stale
// capacity between inst_count and inst_count * 8 can never be exceeded mid-emit.
if (auto const inst_count = block.instructions.size(); inst_count > shared_labels.capacity())
shared_labels.reserve(inst_count * 8);
A64EmitContext ctx{conf, reg_alloc, block, shared_labels};
// Start emitting. // Start emitting.
code.align(); code.align();
@ -162,6 +168,7 @@ finish_this_inst:
auto bdesc = RegisterBlock(descriptor, entrypoint, size); auto bdesc = RegisterBlock(descriptor, entrypoint, size);
code.DisableWriting(); code.DisableWriting();
shared_labels.clear();
return bdesc; return bdesc;
} }

3
src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h

@ -27,7 +27,7 @@
namespace Dynarmic::Backend::X64 { namespace Dynarmic::Backend::X64 {
struct A64EmitContext final : public EmitContext { struct A64EmitContext final : public EmitContext {
A64EmitContext(const A64::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block);
A64EmitContext(const A64::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block, std::vector<Xbyak::Label>& shared_labels);
A64::LocationDescriptor Location() const; A64::LocationDescriptor Location() const;
bool IsSingleStep() const; bool IsSingleStep() const;
@ -126,6 +126,7 @@ public:
ankerl::unordered_dense::map<std::tuple<bool, size_t, int, int>, void (*)()> write_fallbacks; ankerl::unordered_dense::map<std::tuple<bool, size_t, int, int>, void (*)()> write_fallbacks;
ankerl::unordered_dense::map<std::tuple<bool, size_t, int, int>, void (*)()> exclusive_write_fallbacks; ankerl::unordered_dense::map<std::tuple<bool, size_t, int, int>, void (*)()> exclusive_write_fallbacks;
ankerl::unordered_dense::set<DoNotFastmemMarker> do_not_fastmem; ankerl::unordered_dense::set<DoNotFastmemMarker> do_not_fastmem;
std::vector<Xbyak::Label> shared_labels;
const void* terminal_handler_pop_rsb_hint = nullptr; const void* terminal_handler_pop_rsb_hint = nullptr;
const void* terminal_handler_fast_dispatch_hint = nullptr; const void* terminal_handler_fast_dispatch_hint = nullptr;
FastDispatchEntry& (*fast_dispatch_table_lookup)(u64) = nullptr; FastDispatchEntry& (*fast_dispatch_table_lookup)(u64) = nullptr;

7
src/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp

@ -33,8 +33,11 @@ namespace Dynarmic::Backend::X64 {
using namespace Xbyak::util; using namespace Xbyak::util;
EmitContext::EmitContext(RegAlloc& reg_alloc, IR::Block& block)
: reg_alloc(reg_alloc), block(block) {}
// Constructs the base emission context. shared_labels is owned by the emitter
// (A32EmitX64/A64EmitX64) and reused across blocks; only a reference is stored
// here, so its lifetime must outlive this context.
EmitContext::EmitContext(RegAlloc& reg_alloc, IR::Block& block, std::vector<Xbyak::Label>& shared_labels)
: reg_alloc(reg_alloc)
, block(block)
, shared_labels(shared_labels)
{}
EmitContext::~EmitContext() = default; EmitContext::~EmitContext() = default;

19
src/dynarmic/src/dynarmic/backend/x64/emit_x64.h

@ -16,6 +16,7 @@
#include <type_traits> #include <type_traits>
#include <vector> #include <vector>
#include <boost/container/stable_vector.hpp>
#include <mcl/bitsizeof.hpp> #include <mcl/bitsizeof.hpp>
#include <ankerl/unordered_dense.h> #include <ankerl/unordered_dense.h>
#include "dynarmic/backend/x64/xbyak.h" #include "dynarmic/backend/x64/xbyak.h"
@ -52,24 +53,24 @@ using VectorArray = std::array<T, A64FullVectorWidth::value / mcl::bitsizeof<T>>
template<typename T> template<typename T>
using HalfVectorArray = std::array<T, A64FullVectorWidth::value / mcl::bitsizeof<T> / 2>; using HalfVectorArray = std::array<T, A64FullVectorWidth::value / mcl::bitsizeof<T> / 2>;
using SharedLabel = Xbyak::Label*;
struct EmitContext { struct EmitContext {
EmitContext(RegAlloc& reg_alloc, IR::Block& block);
EmitContext(RegAlloc& reg_alloc, IR::Block& block, std::vector<Xbyak::Label>& shared_labels);
virtual ~EmitContext(); virtual ~EmitContext();
virtual FP::FPCR FPCR(bool fpcr_controlled = true) const = 0; virtual FP::FPCR FPCR(bool fpcr_controlled = true) const = 0;
virtual bool HasOptimization(OptimizationFlag flag) const = 0; virtual bool HasOptimization(OptimizationFlag flag) const = 0;
RegAlloc& reg_alloc;
IR::Block& block;
// Returns a pointer to a freshly default-constructed label appended to the
// emitter-owned shared_labels vector. The pointer is stable only while the
// vector does not reallocate; callers (Emit) reserve capacity up-front.
[[nodiscard]] inline Xbyak::Label* GenSharedLabel() noexcept {
// Debug-only guard: the emplace_back below must not trigger a reallocation,
// because that would invalidate every previously returned Label pointer.
DEBUG_ASSERT(shared_labels.size() + 1 <= shared_labels.capacity());
return &shared_labels.emplace_back();
}
std::vector<std::function<void()>> deferred_emits; std::vector<std::function<void()>> deferred_emits;
RegAlloc& reg_alloc;
IR::Block& block;
std::vector<Xbyak::Label>& shared_labels;
}; };
using SharedLabel = std::shared_ptr<Xbyak::Label>;
inline SharedLabel GenSharedLabel() {
return std::make_shared<Xbyak::Label>();
}
class EmitX64 { class EmitX64 {
public: public:
struct BlockDescriptor { struct BlockDescriptor {

22
src/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp

@ -144,7 +144,7 @@ void ForceToDefaultNaN(BlockOfCode& code, Xbyak::Xmm result) {
template<size_t fsize> template<size_t fsize>
SharedLabel ProcessNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm a) { SharedLabel ProcessNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm a) {
SharedLabel nan = GenSharedLabel(), end = GenSharedLabel();
SharedLabel nan = ctx.GenSharedLabel(), end = ctx.GenSharedLabel();
FCODE(ucomis)(a, a); FCODE(ucomis)(a, a);
code.jp(*nan, code.T_NEAR); code.jp(*nan, code.T_NEAR);
@ -259,7 +259,7 @@ template<size_t fsize, typename Function>
void FPTwoOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { void FPTwoOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
SharedLabel end = GenSharedLabel();
SharedLabel end = ctx.GenSharedLabel();
Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
@ -312,7 +312,7 @@ void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn)
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(code); const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(code);
SharedLabel end = GenSharedLabel(), nan = GenSharedLabel();
SharedLabel end = ctx.GenSharedLabel(), nan = ctx.GenSharedLabel();
code.movaps(result, op1); code.movaps(result, op1);
if constexpr (std::is_member_function_pointer_v<Function>) { if constexpr (std::is_member_function_pointer_v<Function>) {
@ -421,7 +421,7 @@ static void EmitFPMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, bo
DenormalsAreZero<fsize>(code, ctx, {result, operand}); DenormalsAreZero<fsize>(code, ctx, {result, operand});
SharedLabel equal = GenSharedLabel(), end = GenSharedLabel();
SharedLabel equal = ctx.GenSharedLabel(), end = ctx.GenSharedLabel();
FCODE(ucomis)(result, operand); FCODE(ucomis)(result, operand);
code.jz(*equal, code.T_NEAR); code.jz(*equal, code.T_NEAR);
@ -492,7 +492,7 @@ static inline void EmitFPMinMaxNumeric(BlockOfCode& code, EmitContext& ctx, IR::
} }
}; };
SharedLabel end = GenSharedLabel(), z = GenSharedLabel();
SharedLabel end = ctx.GenSharedLabel(), z = ctx.GenSharedLabel();
FCODE(ucomis)(op1, op2); FCODE(ucomis)(op1, op2);
code.jz(*z, code.T_NEAR); code.jz(*z, code.T_NEAR);
@ -640,7 +640,7 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, bo
} }
if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX)) { if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX)) {
SharedLabel fallback = GenSharedLabel(), end = GenSharedLabel();
SharedLabel fallback = ctx.GenSharedLabel(), end = ctx.GenSharedLabel();
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(code, args[0]); const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]); const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]);
@ -851,7 +851,7 @@ static void EmitFPMulX(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Reg64 tmp = do_default_nan ? INVALID_REG : ctx.reg_alloc.ScratchGpr(code); const Xbyak::Reg64 tmp = do_default_nan ? INVALID_REG : ctx.reg_alloc.ScratchGpr(code);
SharedLabel end = GenSharedLabel(), nan = GenSharedLabel();
SharedLabel end = ctx.GenSharedLabel(), nan = ctx.GenSharedLabel();
if (code.HasHostFeature(HostFeature::AVX)) { if (code.HasHostFeature(HostFeature::AVX)) {
FCODE(vmuls)(result, op1, op2); FCODE(vmuls)(result, op1, op2);
@ -989,7 +989,7 @@ static void EmitFPRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
} }
if (code.HasHostFeature(HostFeature::FMA)) { if (code.HasHostFeature(HostFeature::FMA)) {
SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel();
SharedLabel end = ctx.GenSharedLabel(), fallback = ctx.GenSharedLabel();
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(code, args[0]); const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]); const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]);
@ -1137,7 +1137,7 @@ static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
const Xbyak::Xmm value = ctx.reg_alloc.ScratchXmm(code); const Xbyak::Xmm value = ctx.reg_alloc.ScratchXmm(code);
[[maybe_unused]] const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr(code).cvt32(); [[maybe_unused]] const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr(code).cvt32();
SharedLabel bad_values = GenSharedLabel(), end = GenSharedLabel();
SharedLabel bad_values = ctx.GenSharedLabel(), end = ctx.GenSharedLabel();
code.movaps(value, operand); code.movaps(value, operand);
@ -1304,7 +1304,7 @@ static void EmitFPRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
} }
if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX)) { if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX)) {
SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel();
SharedLabel end = ctx.GenSharedLabel(), fallback = ctx.GenSharedLabel();
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(code, args[0]); const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]); const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]);
@ -1649,7 +1649,7 @@ static void EmitFPToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Xmm scratch = ctx.reg_alloc.ScratchXmm(code); const Xbyak::Xmm scratch = ctx.reg_alloc.ScratchXmm(code);
if (!unsigned_) { if (!unsigned_) {
SharedLabel saturate_max = GenSharedLabel(), end = GenSharedLabel();
SharedLabel saturate_max = ctx.GenSharedLabel(), end = ctx.GenSharedLabel();
ZeroIfNaN<64>(code, src, scratch); ZeroIfNaN<64>(code, src, scratch);

18
src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc

@ -85,7 +85,7 @@ void AxxEmitX64::EmitMemoryRead(AxxEmitContext& ctx, IR::Inst* inst) {
const auto wrapped_fn = read_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)]; const auto wrapped_fn = read_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)];
SharedLabel abort = GenSharedLabel(), end = GenSharedLabel();
SharedLabel abort = ctx.GenSharedLabel(), end = ctx.GenSharedLabel();
if (fastmem_marker) { if (fastmem_marker) {
// Use fastmem // Use fastmem
@ -107,7 +107,7 @@ void AxxEmitX64::EmitMemoryRead(AxxEmitContext& ctx, IR::Inst* inst) {
conf.recompile_on_fastmem_failure, conf.recompile_on_fastmem_failure,
}); });
EmitCheckMemoryAbort(ctx, inst, end.get());
EmitCheckMemoryAbort(ctx, inst, end);
code.jmp(*end, code.T_NEAR); code.jmp(*end, code.T_NEAR);
}); });
} else { } else {
@ -119,7 +119,7 @@ void AxxEmitX64::EmitMemoryRead(AxxEmitContext& ctx, IR::Inst* inst) {
ctx.deferred_emits.emplace_back([=, this, &ctx] { ctx.deferred_emits.emplace_back([=, this, &ctx] {
code.L(*abort); code.L(*abort);
code.call(wrapped_fn); code.call(wrapped_fn);
EmitCheckMemoryAbort(ctx, inst, end.get());
EmitCheckMemoryAbort(ctx, inst, end);
code.jmp(*end, code.T_NEAR); code.jmp(*end, code.T_NEAR);
}); });
} }
@ -172,7 +172,7 @@ void AxxEmitX64::EmitMemoryWrite(AxxEmitContext& ctx, IR::Inst* inst) {
const auto wrapped_fn = write_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)]; const auto wrapped_fn = write_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)];
SharedLabel abort = GenSharedLabel(), end = GenSharedLabel();
SharedLabel abort = ctx.GenSharedLabel(), end = ctx.GenSharedLabel();
if (fastmem_marker) { if (fastmem_marker) {
// Use fastmem // Use fastmem
@ -194,7 +194,7 @@ void AxxEmitX64::EmitMemoryWrite(AxxEmitContext& ctx, IR::Inst* inst) {
conf.recompile_on_fastmem_failure, conf.recompile_on_fastmem_failure,
}); });
EmitCheckMemoryAbort(ctx, inst, end.get());
EmitCheckMemoryAbort(ctx, inst, end);
code.jmp(*end, code.T_NEAR); code.jmp(*end, code.T_NEAR);
}); });
} else { } else {
@ -206,7 +206,7 @@ void AxxEmitX64::EmitMemoryWrite(AxxEmitContext& ctx, IR::Inst* inst) {
ctx.deferred_emits.emplace_back([=, this, &ctx] { ctx.deferred_emits.emplace_back([=, this, &ctx] {
code.L(*abort); code.L(*abort);
code.call(wrapped_fn); code.call(wrapped_fn);
EmitCheckMemoryAbort(ctx, inst, end.get());
EmitCheckMemoryAbort(ctx, inst, end);
code.jmp(*end, code.T_NEAR); code.jmp(*end, code.T_NEAR);
}); });
} }
@ -351,7 +351,7 @@ void AxxEmitX64::EmitExclusiveReadMemoryInline(AxxEmitContext& ctx, IR::Inst* in
const auto fastmem_marker = ShouldFastmem(ctx, inst); const auto fastmem_marker = ShouldFastmem(ctx, inst);
if (fastmem_marker) { if (fastmem_marker) {
SharedLabel abort = GenSharedLabel(), end = GenSharedLabel();
SharedLabel abort = ctx.GenSharedLabel(), end = ctx.GenSharedLabel();
bool require_abort_handling = false; bool require_abort_handling = false;
const auto src_ptr = EmitFastmemVAddr(code, ctx, *abort, vaddr, require_abort_handling); const auto src_ptr = EmitFastmemVAddr(code, ctx, *abort, vaddr, require_abort_handling);
@ -426,7 +426,7 @@ void AxxEmitX64::EmitExclusiveWriteMemoryInline(AxxEmitContext& ctx, IR::Inst* i
EmitExclusiveLock(code, conf, tmp, tmp2.cvt32()); EmitExclusiveLock(code, conf, tmp, tmp2.cvt32());
SharedLabel end = GenSharedLabel();
SharedLabel end = ctx.GenSharedLabel();
code.mov(status, u32(1)); code.mov(status, u32(1));
code.movzx(tmp.cvt32(), code.byte[code.ABI_JIT_PTR + offsetof(AxxJitState, exclusive_state)]); code.movzx(tmp.cvt32(), code.byte[code.ABI_JIT_PTR + offsetof(AxxJitState, exclusive_state)]);
@ -459,7 +459,7 @@ void AxxEmitX64::EmitExclusiveWriteMemoryInline(AxxEmitContext& ctx, IR::Inst* i
const auto fastmem_marker = ShouldFastmem(ctx, inst); const auto fastmem_marker = ShouldFastmem(ctx, inst);
if (fastmem_marker) { if (fastmem_marker) {
SharedLabel abort = GenSharedLabel();
SharedLabel abort = ctx.GenSharedLabel();
bool require_abort_handling = false; bool require_abort_handling = false;
const auto dest_ptr = EmitFastmemVAddr(code, ctx, *abort, vaddr, require_abort_handling, tmp); const auto dest_ptr = EmitFastmemVAddr(code, ctx, *abort, vaddr, require_abort_handling, tmp);

2
src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h

@ -52,7 +52,7 @@ void EmitDetectMisalignedVAddr(BlockOfCode& code, EmitContext& ctx, size_t bitsi
if (ctx.conf.only_detect_misalignment_via_page_table_on_page_boundary) { if (ctx.conf.only_detect_misalignment_via_page_table_on_page_boundary) {
const u32 page_align_mask = static_cast<u32>(page_size - 1) & ~align_mask; const u32 page_align_mask = static_cast<u32>(page_size - 1) & ~align_mask;
SharedLabel detect_boundary = GenSharedLabel(), resume = GenSharedLabel();
SharedLabel detect_boundary = ctx.GenSharedLabel(), resume = ctx.GenSharedLabel();
code.jnz(*detect_boundary, code.T_NEAR); code.jnz(*detect_boundary, code.T_NEAR);
code.L(*resume); code.L(*resume);

61
src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp

@ -30,6 +30,7 @@
#include "dynarmic/common/fp/fpcr.h" #include "dynarmic/common/fp/fpcr.h"
#include "dynarmic/common/fp/info.h" #include "dynarmic/common/fp/info.h"
#include "dynarmic/common/fp/op.h" #include "dynarmic/common/fp/op.h"
#include "dynarmic/common/fp/rounding_mode.h"
#include "dynarmic/common/fp/util.h" #include "dynarmic/common/fp/util.h"
#include "dynarmic/common/lut_from_list.h" #include "dynarmic/common/lut_from_list.h"
#include "dynarmic/interface/optimization_flags.h" #include "dynarmic/interface/optimization_flags.h"
@ -101,7 +102,7 @@ void HandleNaNs(BlockOfCode& code, EmitContext& ctx, bool fpcr_controlled, std::
code.cmp(bitmask, 0); code.cmp(bitmask, 0);
} }
SharedLabel end = GenSharedLabel(), nan = GenSharedLabel();
SharedLabel end = ctx.GenSharedLabel(), nan = ctx.GenSharedLabel();
code.jnz(*nan, code.T_NEAR); code.jnz(*nan, code.T_NEAR);
code.L(*end); code.L(*end);
@ -196,23 +197,6 @@ void ForceToDefaultNaN(BlockOfCode& code, FP::FPCR fpcr, Xbyak::Xmm result) {
} }
} }
template<size_t fsize>
void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm result) {
const Xbyak::Xmm nan_mask = xmm0;
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
constexpr u32 nan_to_zero = FixupLUT(FpFixup::PosZero,
FpFixup::PosZero);
FCODE(vfixupimmp)(result, result, code.BConst<32>(ptr_b, nan_to_zero), u8(0));
} else if (code.HasHostFeature(HostFeature::AVX)) {
FCODE(vcmpordp)(nan_mask, result, result);
FCODE(vandp)(result, result, nan_mask);
} else {
code.movaps(nan_mask, result);
FCODE(cmpordp)(nan_mask, nan_mask);
code.andps(result, nan_mask);
}
}
template<size_t fsize> template<size_t fsize>
void DenormalsAreZero(BlockOfCode& code, FP::FPCR fpcr, std::initializer_list<Xbyak::Xmm> to_daz, Xbyak::Xmm tmp) { void DenormalsAreZero(BlockOfCode& code, FP::FPCR fpcr, std::initializer_list<Xbyak::Xmm> to_daz, Xbyak::Xmm tmp) {
if (fpcr.FZ()) { if (fpcr.FZ()) {
@ -1338,7 +1322,7 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel();
SharedLabel end = ctx.GenSharedLabel(), fallback = ctx.GenSharedLabel();
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
code.movaps(result, xmm_a); code.movaps(result, xmm_a);
@ -1611,7 +1595,7 @@ static void EmitRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]); const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel();
SharedLabel end = ctx.GenSharedLabel(), fallback = ctx.GenSharedLabel();
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
code.movaps(result, GetVectorOf<fsize, false, 0, 2>(code)); code.movaps(result, GetVectorOf<fsize, false, 0, 2>(code));
@ -1784,7 +1768,7 @@ static void EmitRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm value = ctx.reg_alloc.ScratchXmm(code); const Xbyak::Xmm value = ctx.reg_alloc.ScratchXmm(code);
SharedLabel bad_values = GenSharedLabel(), end = GenSharedLabel();
SharedLabel bad_values = ctx.GenSharedLabel(), end = ctx.GenSharedLabel();
code.movaps(value, operand); code.movaps(value, operand);
@ -1875,7 +1859,7 @@ static void EmitRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(code); const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(code);
SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel();
SharedLabel end = ctx.GenSharedLabel(), fallback = ctx.GenSharedLabel();
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
code.vmovaps(result, GetVectorOf<fsize, false, 0, 3>(code)); code.vmovaps(result, GetVectorOf<fsize, false, 0, 3>(code));
@ -2011,15 +1995,12 @@ void EmitX64::EmitFPVectorToHalf32(EmitContext& ctx, IR::Inst* inst) {
template<size_t fsize, bool unsigned_> template<size_t fsize, bool unsigned_>
void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
const size_t fbits = inst->GetArg(1).GetU8(); const size_t fbits = inst->GetArg(1).GetU8();
const auto rounding = static_cast<FP::RoundingMode>(inst->GetArg(2).GetU8());
const auto rounding = FP::RoundingMode(inst->GetArg(2).GetU8());
[[maybe_unused]] const bool fpcr_controlled = inst->GetArg(3).GetU1(); [[maybe_unused]] const bool fpcr_controlled = inst->GetArg(3).GetU1();
if constexpr (fsize != 16) {
if (code.HasHostFeature(HostFeature::SSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero) {
if (code.HasHostFeature(HostFeature::SSE41) && fsize != 16 && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm src = ctx.reg_alloc.UseScratchXmm(code, args[0]); const Xbyak::Xmm src = ctx.reg_alloc.UseScratchXmm(code, args[0]);
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
const int round_imm = [&] { const int round_imm = [&] {
switch (rounding) { switch (rounding) {
@ -2034,7 +2015,6 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
return 0b11; return 0b11;
} }
}(); }();
const auto perform_conversion = [&code, &ctx](const Xbyak::Xmm& src) { const auto perform_conversion = [&code, &ctx](const Xbyak::Xmm& src) {
// MSVC doesn't allow us to use a [&] capture, so we have to do this instead. // MSVC doesn't allow us to use a [&] capture, so we have to do this instead.
(void)ctx; (void)ctx;
@ -2059,16 +2039,26 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
} }
} }
}; };
if (fbits != 0) { if (fbits != 0) {
const u64 scale_factor = fsize == 32 const u64 scale_factor = fsize == 32
? static_cast<u64>(fbits + 127) << 23
: static_cast<u64>(fbits + 1023) << 52;
? u64(fbits + 127) << 23
: u64(fbits + 1023) << 52;
FCODE(mulp)(src, GetVectorOf<fsize>(code, scale_factor)); FCODE(mulp)(src, GetVectorOf<fsize>(code, scale_factor));
} }
FCODE(roundp)(src, src, static_cast<u8>(round_imm));
ZeroIfNaN<fsize>(code, src);
FCODE(roundp)(src, src, u8(round_imm));
const Xbyak::Xmm nan_mask = xmm0;
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
static constexpr u32 nan_to_zero = FixupLUT(FpFixup::PosZero, FpFixup::PosZero);
FCODE(vfixupimmp)(src, src, code.BConst<32>(ptr_b, nan_to_zero), u8(0));
} else if (code.HasHostFeature(HostFeature::AVX)) {
FCODE(vcmpordp)(nan_mask, src, src);
FCODE(vandp)(src, src, nan_mask);
} else {
code.movaps(nan_mask, src);
FCODE(cmpordp)(nan_mask, nan_mask);
code.andps(src, nan_mask);
}
constexpr u64 float_upper_limit_signed = fsize == 32 ? 0x4f000000 : 0x43e0000000000000; constexpr u64 float_upper_limit_signed = fsize == 32 ? 0x4f000000 : 0x43e0000000000000;
[[maybe_unused]] constexpr u64 float_upper_limit_unsigned = fsize == 32 ? 0x4f800000 : 0x43f0000000000000; [[maybe_unused]] constexpr u64 float_upper_limit_unsigned = fsize == 32 ? 0x4f800000 : 0x43f0000000000000;
@ -2081,7 +2071,7 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
// Convert positive values to unsigned integers, write 0 anywhere else // Convert positive values to unsigned integers, write 0 anywhere else
// vcvttp*2u*q already saturates out-of-range values to (0xFFFF...) // vcvttp*2u*q already saturates out-of-range values to (0xFFFF...)
if constexpr (fsize == 32) {
if (fsize == 32) {
code.vcvttps2udq(src | k1 | T_z, src); code.vcvttps2udq(src | k1 | T_z, src);
} else { } else {
code.vcvttpd2uqq(src | k1 | T_z, src); code.vcvttpd2uqq(src | k1 | T_z, src);
@ -2114,18 +2104,15 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
} else { } else {
using FPT = mcl::unsigned_integer_of_size<fsize>; // WORKAROUND: For issue 678 on MSVC using FPT = mcl::unsigned_integer_of_size<fsize>; // WORKAROUND: For issue 678 on MSVC
constexpr u64 integer_max = FPT((std::numeric_limits<std::conditional_t<unsigned_, FPT, std::make_signed_t<FPT>>>::max)()); constexpr u64 integer_max = FPT((std::numeric_limits<std::conditional_t<unsigned_, FPT, std::make_signed_t<FPT>>>::max)());
code.movaps(xmm0, GetVectorOf<fsize, float_upper_limit_signed>(code)); code.movaps(xmm0, GetVectorOf<fsize, float_upper_limit_signed>(code));
FCODE(cmplep)(xmm0, src); FCODE(cmplep)(xmm0, src);
perform_conversion(src); perform_conversion(src);
FCODE(blendvp)(src, GetVectorOf<fsize, integer_max>(code)); FCODE(blendvp)(src, GetVectorOf<fsize, integer_max>(code));
} }
}); });
ctx.reg_alloc.DefineValue(code, inst, src); ctx.reg_alloc.DefineValue(code, inst, src);
return; return;
} }
}
using fbits_list = mp::lift_sequence<std::make_index_sequence<fsize + 1>>; using fbits_list = mp::lift_sequence<std::make_index_sequence<fsize + 1>>;
using rounding_list = mp::list< using rounding_list = mp::list<

Loading…
Cancel
Save