diff --git a/src/common/x64/xbyak.h b/src/common/x64/xbyak.h index 5ee12576e9..e541215994 100644 --- a/src/common/x64/xbyak.h +++ b/src/common/x64/xbyak.h @@ -262,20 +262,18 @@ enum { CMP_ORD = 7, }; -constexpr bool IsWithin2G(uintptr_t ref, uintptr_t target) { - const u64 distance = target - (ref + 5); - return !(distance >= 0x8000'0000ULL && distance <= ~0x8000'0000ULL); -} - -inline bool IsWithin2G(const Xbyak::CodeGenerator& code, uintptr_t target) { - return IsWithin2G(reinterpret_cast(code.getCurr()), target); +/// @brief Checks that target lies within a signed 32-bit displacement of ref + 5 (the end of a 5-byte instruction placed at ref). +/// Biasing the distance by 2^31 maps the signed range [-2^31, 2^31) onto [0, 2^32), so backward targets are accepted too. +constexpr bool IsWithin2G(uintptr_t ref, uintptr_t target) noexcept { + u64 const distance = target - (ref + 5); + return distance + 0x8000'0000ULL <= 0xffff'ffffULL; } template inline void CallFarFunction(Xbyak::CodeGenerator& code, const T f) { static_assert(std::is_pointer_v, "Argument must be a (function) pointer."); - size_t addr = reinterpret_cast(f); - if (IsWithin2G(code, addr)) { + uintptr_t addr = uintptr_t(f); + if (IsWithin2G(uintptr_t(code.getCurr()), addr)) { code.call(f); } else { // ABI_RETURN is a safe temp register to use before a call diff --git a/src/dynarmic/src/dynarmic/backend/x64/block_of_code.h b/src/dynarmic/src/dynarmic/backend/x64/block_of_code.h index 857b1a4484..d7502d2b83 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/block_of_code.h +++ b/src/dynarmic/src/dynarmic/backend/x64/block_of_code.h @@ -13,8 +13,9 @@ #include #include -#include "dynarmic/mcl/bit.hpp" #include "common/common_types.h" +#include "common/x64/xbyak.h" +#include "dynarmic/mcl/bit.hpp" #include "dynarmic/backend/x64/xbyak.h" #include "dynarmic/backend/x64/abi.h" #include "dynarmic/backend/x64/callback.h" @@ -82,28 +83,17 @@ public: /// Code emitter: Load required flags for conditional cond from rax into host rflags void LoadRequiredFlagsForCondFromRax(IR::Cond cond); - /// Code emitter: Calls the function - template - void CallFunction(FunctionPointer fn) { - static_assert(std::is_pointer_v && std::is_function_v>, - "Supplied type must be a pointer to a 
function"); - - const u64 address = reinterpret_cast(fn); - const u64 distance = address - (getCurr() + 5); - - if (distance >= 0x0000000080000000ULL && distance < 0xFFFFFFFF80000000ULL) { - // Far call - mov(rax, address); - call(rax); - } else { - call(fn); - } + /// @brief Code emitter: Calls the function + template + void CallFunction(F fn) { + static_assert(std::is_pointer_v && std::is_function_v>, "Supplied type must be a pointer to a function"); + ::Common::X64::CallFarFunction(*this, fn); } - /// Code emitter: Calls the lambda. Lambda must not have any captures. + /// @brief Code emitter: Calls the lambda. Lambda must not have any captures. template void CallLambda(Lambda l) { - CallFunction(Common::FptrCast(l)); + ::Common::X64::CallFarFunction(*this, Common::FptrCast(l)); } void ZeroExtendFrom(size_t bitsize, Xbyak::Reg64 reg) { diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp index d2edea780c..4df09e4797 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp @@ -2103,36 +2103,57 @@ void EmitX64::EmitVectorMaxU64(EmitContext& ctx, IR::Inst* inst) { } else if (code.HasHostFeature(HostFeature::AVX)) { auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); auto const y = ctx.reg_alloc.UseXmm(code, args[1]); - auto const tmp = ctx.reg_alloc.ScratchXmm(code); - code.vmovdqa(xmm0, code.Const(xword, 0x8000000000000000, 0x8000000000000000)); - code.vpsubq(tmp, y, xmm0); - code.vpsubq(xmm0, x, xmm0); - code.vpcmpgtq(xmm0, tmp, xmm0); - code.pblendvb(x, y); + auto const tmp0 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp1 = ctx.reg_alloc.ScratchXmm(code, HostLoc::XMM0); + code.vmovdqa(tmp1, code.Const(xword, 0x8000000000000000, 0x8000000000000000)); + code.vpsubq(tmp0, y, tmp1); + code.vpsubq(tmp1, x, tmp1); + code.vpcmpgtq(tmp1, tmp0, tmp1); + code.pblendvb(x, y); // XMM0 is implicit 
ctx.reg_alloc.DefineValue(code, inst, x); + } else if (code.HasHostFeature(HostFeature::SSE41)) { + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp3 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp4 = ctx.reg_alloc.ScratchXmm(code); + auto const tax = ctx.reg_alloc.ScratchGpr(code); + auto const tdx = ctx.reg_alloc.ScratchGpr(code); + code.movq(tdx, tmp1); + code.movq(tax, tmp0); + code.movhlps(tmp3, tmp0); + code.cmp(tdx, tax); + code.movhlps(tmp2, tmp1); + code.cmovnb(tax, tdx); + code.movq(tdx, tmp2); + code.pinsrq(tmp4, tax, 0); + code.movq(tax, tmp3); + code.cmp(tdx, tax); + code.cmovnb(tax, tdx); + code.pinsrq(tmp4, tax, 1); + ctx.reg_alloc.DefineValue(code, inst, tmp4); } else { auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]); auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); auto const tmp3 = ctx.reg_alloc.ScratchXmm(code); auto const tmp4 = ctx.reg_alloc.ScratchXmm(code); - auto const tmp5 = ctx.reg_alloc.ScratchXmm(code); - code.movdqa(tmp2, code.Const(xword, 0x8000000080000000, 0x8000000080000000)); - code.movdqa(tmp3, tmp1); - code.pxor(tmp3, tmp2); - code.pxor(tmp2, tmp0); - code.movdqa(tmp4, tmp2); - code.pcmpeqd(tmp2, tmp3); - code.pcmpgtd(tmp4, tmp3); - code.pshufd(tmp2, tmp2, 245); - code.pshufd(tmp5, tmp4, 160); - code.pshufd(tmp3, tmp4, 245); - code.pand(tmp2, tmp5); - code.por(tmp3, tmp2); - code.pand(tmp0, tmp3); - code.pandn(tmp3, tmp1); - code.por(tmp0, tmp3); - ctx.reg_alloc.DefineValue(code, inst, tmp0); + auto const tax = ctx.reg_alloc.ScratchGpr(code); + auto const tdx = ctx.reg_alloc.ScratchGpr(code); + code.movq(tdx, tmp1); + code.movq(tax, tmp0); + code.movhlps(tmp3, tmp0); + code.cmp(tdx, tax); + code.movhlps(tmp2, tmp1); + code.cmovnb(tax, tdx); + code.movq(tdx, tmp2); + code.movq(tmp4, tax); + code.movq(tax, tmp3); + 
code.cmp(tdx, tax); + code.cmovnb(tax, tdx); + code.movq(tmp3, tax); + code.punpcklqdq(tmp4, tmp3); + ctx.reg_alloc.DefineValue(code, inst, tmp4); } } @@ -2247,37 +2268,57 @@ void EmitX64::EmitVectorMinU64(EmitContext& ctx, IR::Inst* inst) { } else if (code.HasHostFeature(HostFeature::AVX)) { auto const x = ctx.reg_alloc.UseXmm(code, args[0]); auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]); - auto const tmp = ctx.reg_alloc.ScratchXmm(code); - code.vmovdqa(xmm0, code.Const(xword, 0x8000000000000000, 0x8000000000000000)); - code.vpsubq(tmp, y, xmm0); - code.vpsubq(xmm0, x, xmm0); - code.vpcmpgtq(xmm0, tmp, xmm0); + auto const tmp0 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp1 = ctx.reg_alloc.ScratchXmm(code, HostLoc::XMM0); + code.vmovdqa(tmp1, code.Const(xword, 0x8000000000000000, 0x8000000000000000)); + code.vpsubq(tmp0, y, tmp1); + code.vpsubq(tmp1, x, tmp1); + code.vpcmpgtq(tmp1, tmp0, tmp1); code.pblendvb(y, x); ctx.reg_alloc.DefineValue(code, inst, y); + } else if (code.HasHostFeature(HostFeature::SSE41)) { + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp3 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp4 = ctx.reg_alloc.ScratchXmm(code); + auto const tax = ctx.reg_alloc.ScratchGpr(code); + auto const tdx = ctx.reg_alloc.ScratchGpr(code); + code.movq(tdx, tmp1); + code.movq(tax, tmp0); + code.movhlps(tmp3, tmp0); + code.cmp(tdx, tax); + code.movhlps(tmp2, tmp1); + code.cmovbe(tax, tdx); + code.movq(tdx, tmp2); + code.pinsrq(tmp4, tax, 0); + code.movq(tax, tmp3); + code.cmp(tdx, tax); + code.cmovbe(tax, tdx); + code.pinsrq(tmp4, tax, 1); + ctx.reg_alloc.DefineValue(code, inst, tmp4); } else { auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]); auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); auto const tmp3 = ctx.reg_alloc.ScratchXmm(code); 
auto const tmp4 = ctx.reg_alloc.ScratchXmm(code); - auto const tmp5 = ctx.reg_alloc.ScratchXmm(code); - code.movdqa(tmp2, code.Const(xword, 0x8000000080000000, 0x8000000080000000)); - code.movdqa(tmp3, tmp1); - code.pxor(tmp3, tmp2); - code.pxor(tmp2, tmp0); - code.movdqa(tmp4, tmp2); - code.pcmpeqd(tmp2, tmp3); - code.pcmpgtd(tmp4, tmp3); - code.pshufd(tmp3, tmp2, 245); - code.pshufd(tmp5, tmp4, 160); - code.pshufd(tmp2, tmp4, 245); - code.pand(tmp3, tmp5); - code.por(tmp2, tmp3); - code.pand(tmp1, tmp2); - code.pandn(tmp2, tmp0); - code.por(tmp2, tmp1); - //code.movdqa(tmp0, tmp2); - ctx.reg_alloc.DefineValue(code, inst, tmp2); + auto const tax = ctx.reg_alloc.ScratchGpr(code); + auto const tdx = ctx.reg_alloc.ScratchGpr(code); + code.movq(tdx, tmp1); + code.movq(tax, tmp0); + code.movhlps(tmp3, tmp0); + code.cmp(tdx, tax); + code.movhlps(tmp2, tmp1); + code.cmovbe(tax, tdx); + code.movq(tdx, tmp2); + code.movq(tmp4, tax); + code.movq(tax, tmp3); + code.cmp(tdx, tax); + code.cmovbe(tax, tdx); + code.movq(tmp3, tax); + code.punpcklqdq(tmp4, tmp3); + ctx.reg_alloc.DefineValue(code, inst, tmp4); } } @@ -2471,13 +2512,11 @@ void EmitX64::EmitVectorNarrow64(EmitContext& ctx, IR::Inst* inst) { ctx.reg_alloc.DefineValue(code, inst, result); } else { - auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - auto const zeros = ctx.reg_alloc.ScratchXmm(code); - - code.pxor(zeros, zeros); - code.shufps(a, zeros, 0b00001000); - - ctx.reg_alloc.DefineValue(code, inst, a); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const zeros = ctx.reg_alloc.ScratchXmm(code); + code.pxor(zeros, zeros); + code.shufps(a, zeros, 0b00001000); + ctx.reg_alloc.DefineValue(code, inst, a); } } @@ -2490,11 +2529,11 @@ void EmitX64::EmitVectorNot(EmitContext& ctx, IR::Inst* inst) { code.vpternlogq(result, operand, operand, u8(~Tern::c)); ctx.reg_alloc.DefineValue(code, inst, result); } else { - auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - auto const 
xmm_b = ctx.reg_alloc.ScratchXmm(code); - code.pcmpeqw(xmm_b, xmm_b); - code.pxor(xmm_a, xmm_b); - ctx.reg_alloc.DefineValue(code, inst, xmm_a); + auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const xmm_b = ctx.reg_alloc.ScratchXmm(code); + code.pcmpeqw(xmm_b, xmm_b); + code.pxor(xmm_a, xmm_b); + ctx.reg_alloc.DefineValue(code, inst, xmm_a); } } @@ -2504,11 +2543,9 @@ void EmitX64::EmitVectorOr(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorPairedAddLower8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]); auto const tmp = ctx.reg_alloc.ScratchXmm(code); - code.punpcklqdq(xmm_a, xmm_b); code.movdqa(tmp, xmm_a); code.psllw(xmm_a, 8); @@ -2516,7 +2553,6 @@ void EmitX64::EmitVectorPairedAddLower8(EmitContext& ctx, IR::Inst* inst) { code.pxor(tmp, tmp); code.psrlw(xmm_a, 8); code.packuswb(xmm_a, tmp); - ctx.reg_alloc.DefineValue(code, inst, xmm_a); } diff --git a/src/dynarmic/src/dynarmic/common/llvm_disassemble.cpp b/src/dynarmic/src/dynarmic/common/llvm_disassemble.cpp index 6aff6d6cc7..6896b43d62 100644 --- a/src/dynarmic/src/dynarmic/common/llvm_disassemble.cpp +++ b/src/dynarmic/src/dynarmic/common/llvm_disassemble.cpp @@ -7,7 +7,6 @@ */ #include - #include #ifdef DYNARMIC_USE_LLVM @@ -16,7 +15,6 @@ #endif #include "common/assert.h" -#include #include "common/common_types.h" #include "dynarmic/common/llvm_disassemble.h" diff --git a/src/dynarmic/tests/A64/a64.cpp b/src/dynarmic/tests/A64/a64.cpp index 9bd9c36a86..99cc9d3d4d 100644 --- a/src/dynarmic/tests/A64/a64.cpp +++ b/src/dynarmic/tests/A64/a64.cpp @@ -2587,11 +2587,11 @@ TEST_CASE("A64: Manual Vector Min/Max U64 (Optimizer Test)", "[a64]") { // MaxU64 pattern: (a > b) ? 
a : b code.CMHI(V2.D2(), V0.D2(), V1.D2()); // V2 = Mask (A > B) - code.BSL(V2.B16(), V0.B16(), V1.B16()); // V2 = Resul + code.BSL(V2.B16(), V0.B16(), V1.B16()); // V2 = Result // MinU64 pattern: (a > b) ? b : a code.CMHI(V3.D2(), V0.D2(), V1.D2()); // V3 = Mask (A > B) - code.BSL(V3.B16(), V1.B16(), V0.B16()); // V3 = Resul + code.BSL(V3.B16(), V1.B16(), V0.B16()); // V3 = Result jit.SetPC(0); jit.SetVector(0, {100, 20}); @@ -2603,3 +2603,32 @@ TEST_CASE("A64: Manual Vector Min/Max U64 (Optimizer Test)", "[a64]") { CHECK(jit.GetVector(2) == Vector{100, 200}); CHECK(jit.GetVector(3) == Vector{50, 20}); } + +TEST_CASE("A64: Rounding", "[a64]") { + A64TestEnv env; + A64::UserConfig jit_user_config{}; + jit_user_config.callbacks = &env; + A64::Jit jit{jit_user_config}; + + oaknut::VectorCodeGenerator code{env.code_mem, nullptr}; + + code.FRINTN(V1.S4(), V0.S4()); // ToNearest_TieEven + code.FRINTM(V2.S4(), V0.S4()); // TowardsMinusInfinity + code.FRINTP(V3.S4(), V0.S4()); // TowardsPlusInfinity + code.FRINTZ(V4.S4(), V0.S4()); // TowardsZero + code.FRINTA(V5.S4(), V0.S4()); // ToNearest_TieAwayFromZero + code.FRINTX(V6.S4(), V0.S4()); // Current FPCR rounding mode, exact (TieEven assuming default FPCR) + + jit.SetPC(0); + jit.SetVector(0, {0x4001e17c4001e17c, 0x4001e17c4001e17c}); + env.ticks_left = env.code_mem.size(); + CheckedRun([&]() { jit.Run(); }); + + CHECK(jit.GetVector(0) == Vector{0x4001e17c4001e17c, 0x4001e17c4001e17c}); + CHECK(jit.GetVector(1) == Vector{0x4000000040000000, 0x4000000040000000}); + CHECK(jit.GetVector(2) == Vector{0x4000000040000000, 0x4000000040000000}); + CHECK(jit.GetVector(3) == Vector{0x4040000040400000, 0x4040000040400000}); + CHECK(jit.GetVector(4) == Vector{0x4000000040000000, 0x4000000040000000}); + CHECK(jit.GetVector(5) == Vector{0x4000000040000000, 0x4000000040000000}); + CHECK(jit.GetVector(6) == Vector{0x4000000040000000, 0x4000000040000000}); +} diff --git a/src/dynarmic/tests/A64/testenv.h b/src/dynarmic/tests/A64/testenv.h index c25790e1c9..1d260f5460 100644 
--- a/src/dynarmic/tests/A64/testenv.h +++ b/src/dynarmic/tests/A64/testenv.h @@ -17,24 +17,19 @@ using Vector = Dynarmic::A64::Vector; class A64TestEnv : public Dynarmic::A64::UserCallbacks { public: + ankerl::unordered_dense::map modified_memory; + std::vector code_mem; u64 ticks_left = 0; - - bool code_mem_modified_by_guest = false; u64 code_mem_start_address = 0; - std::vector code_mem; - - ankerl::unordered_dense::map modified_memory; - std::vector interrupts; + bool code_mem_modified_by_guest = false; bool IsInCodeMem(u64 vaddr) const { return vaddr >= code_mem_start_address && vaddr < code_mem_start_address + code_mem.size() * 4; } std::optional MemoryReadCode(u64 vaddr) override { - if (!IsInCodeMem(vaddr)) { + if (!IsInCodeMem(vaddr)) return 0x14000000; // B . - } - const size_t index = (vaddr - code_mem_start_address) / 4; return code_mem[index]; } @@ -43,10 +38,9 @@ public: if (IsInCodeMem(vaddr)) { return reinterpret_cast(code_mem.data())[vaddr - code_mem_start_address]; } - if (auto iter = modified_memory.find(vaddr); iter != modified_memory.end()) { - return iter->second; - } - return static_cast(vaddr); + if (auto const it = modified_memory.find(vaddr); it != modified_memory.end()) + return it->second; + return u8(vaddr); } std::uint16_t MemoryRead16(u64 vaddr) override { return u16(MemoryRead8(vaddr)) | u16(MemoryRead8(vaddr + 1)) << 8; @@ -68,16 +62,16 @@ public: modified_memory[vaddr] = value; } void MemoryWrite16(u64 vaddr, std::uint16_t value) override { - MemoryWrite8(vaddr, static_cast(value)); - MemoryWrite8(vaddr + 1, static_cast(value >> 8)); + MemoryWrite8(vaddr, u8(value)); + MemoryWrite8(vaddr + 1, u8(value >> 8)); } void MemoryWrite32(u64 vaddr, std::uint32_t value) override { - MemoryWrite16(vaddr, static_cast(value)); - MemoryWrite16(vaddr + 2, static_cast(value >> 16)); + MemoryWrite16(vaddr, u16(value)); + MemoryWrite16(vaddr + 2, u16(value >> 16)); } void MemoryWrite64(u64 vaddr, std::uint64_t value) override { - 
MemoryWrite32(vaddr, static_cast(value)); - MemoryWrite32(vaddr + 4, static_cast(value >> 32)); + MemoryWrite32(vaddr, u32(value)); + MemoryWrite32(vaddr + 4, u32(value >> 32)); } void MemoryWrite128(u64 vaddr, Vector value) override { MemoryWrite64(vaddr, value[0]); @@ -139,12 +133,12 @@ public: template T read(u64 vaddr) { T value; - memcpy(&value, backing_memory + vaddr, sizeof(T)); + std::memcpy(&value, backing_memory + vaddr, sizeof(T)); return value; } template void write(u64 vaddr, const T& value) { - memcpy(backing_memory + vaddr, &value, sizeof(T)); + std::memcpy(backing_memory + vaddr, &value, sizeof(T)); } std::optional MemoryReadCode(u64 vaddr) override {