Browse Source

[dynarmic] fix pre-SSE4.1 having errors on CMHI/CMLO, fix extra nuisances and add INTERP testcase (#4025)

Deduplicates some code (the x64 call emitters now share one helper).
Fixes incorrect CMHI/CMLO results on hosts without SSE4.1.

Signed-off-by: lizzie <lizzie@eden-emu.dev>

Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/4025
Reviewed-by: crueter <crueter@eden-emu.dev>
Reviewed-by: MaranBr <maranbr@eden-emu.dev>
lizzie/openssl-fixup-cmake53453
lizzie 16 hours ago
committed by crueter
parent
commit
ad9af25027
No known key found for this signature in database GPG Key ID: 425ACD2D4830EBC6
  1. 14
      src/common/x64/xbyak.h
  2. 28
      src/dynarmic/src/dynarmic/backend/x64/block_of_code.h
  3. 158
      src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp
  4. 2
      src/dynarmic/src/dynarmic/common/llvm_disassemble.cpp
  5. 33
      src/dynarmic/tests/A64/a64.cpp
  6. 36
      src/dynarmic/tests/A64/testenv.h

14
src/common/x64/xbyak.h

@ -262,20 +262,16 @@ enum {
CMP_ORD = 7,
};
constexpr bool IsWithin2G(uintptr_t ref, uintptr_t target) {
const u64 distance = target - (ref + 5);
return !(distance >= 0x8000'0000ULL && distance <= ~0x8000'0000ULL);
}
inline bool IsWithin2G(const Xbyak::CodeGenerator& code, uintptr_t target) {
return IsWithin2G(reinterpret_cast<uintptr_t>(code.getCurr()), target);
/// @brief Checks whether `target` is reachable from `ref` with a signed 32-bit
///        displacement, as used by a 5-byte relative call/jmp.
/// @param ref    Address at which the instruction would be emitted.
/// @param target Absolute address the instruction has to reach.
/// @return true when the displacement fits in a signed 32-bit immediate.
constexpr bool IsWithin2G(uintptr_t ref, uintptr_t target) noexcept {
    // The displacement is measured from the end of the 5-byte instruction.
    const uintptr_t distance = target - (ref + 5);
    // The displacement is signed: forward jumps occupy [0, 2^31 - 1], while
    // backward jumps wrap around to the top 2^31 values of the unsigned range.
    // Merely testing "fits in 32 unsigned bits" would accept forward distances
    // in [2^31, 2^32), which cannot be encoded, and reject every backward jump.
    constexpr uintptr_t max_forward = 0x7fff'ffff;
    return distance <= max_forward || distance >= ~max_forward;
}
template <typename T>
inline void CallFarFunction(Xbyak::CodeGenerator& code, const T f) {
static_assert(std::is_pointer_v<T>, "Argument must be a (function) pointer.");
size_t addr = reinterpret_cast<size_t>(f);
if (IsWithin2G(code, addr)) {
uintptr_t addr = uintptr_t(f);
if (IsWithin2G(uintptr_t(code.getCurr()), addr)) {
code.call(f);
} else {
// ABI_RETURN is a safe temp register to use before a call

28
src/dynarmic/src/dynarmic/backend/x64/block_of_code.h

@ -13,8 +13,9 @@
#include <memory>
#include <type_traits>
#include "dynarmic/mcl/bit.hpp"
#include "common/common_types.h"
#include "common/x64/xbyak.h"
#include "dynarmic/mcl/bit.hpp"
#include "dynarmic/backend/x64/xbyak.h"
#include "dynarmic/backend/x64/abi.h"
#include "dynarmic/backend/x64/callback.h"
@ -82,28 +83,17 @@ public:
/// Code emitter: Load required flags for conditional cond from rax into host rflags
void LoadRequiredFlagsForCondFromRax(IR::Cond cond);
/// Code emitter: Calls the function
template<typename FunctionPointer>
void CallFunction(FunctionPointer fn) {
static_assert(std::is_pointer_v<FunctionPointer> && std::is_function_v<std::remove_pointer_t<FunctionPointer>>,
"Supplied type must be a pointer to a function");
const u64 address = reinterpret_cast<u64>(fn);
const u64 distance = address - (getCurr<u64>() + 5);
if (distance >= 0x0000000080000000ULL && distance < 0xFFFFFFFF80000000ULL) {
// Far call
mov(rax, address);
call(rax);
} else {
call(fn);
}
/// @brief Code emitter: Calls the function
/// Emission is delegated to Common::X64::CallFarFunction with this emitter as the
/// code generator, so the choice between call encodings is made there, not here.
/// @tparam F Must be a pointer to a function (enforced at compile time below).
/// @param fn Function the emitted code should call.
template<typename F>
void CallFunction(F fn) {
// Reject anything that is not a plain function pointer (e.g. lambdas, member pointers).
static_assert(std::is_pointer_v<F> && std::is_function_v<std::remove_pointer_t<F>>, "Supplied type must be a pointer to a function");
::Common::X64::CallFarFunction(*this, fn);
}
/// Code emitter: Calls the lambda. Lambda must not have any captures.
/// @brief Code emitter: Calls the lambda. Lambda must not have any captures.
template<typename Lambda>
void CallLambda(Lambda l) {
CallFunction(Common::FptrCast(l));
::Common::X64::CallFarFunction(*this, Common::FptrCast(l));
}
void ZeroExtendFrom(size_t bitsize, Xbyak::Reg64 reg) {

158
src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp

@ -2103,36 +2103,57 @@ void EmitX64::EmitVectorMaxU64(EmitContext& ctx, IR::Inst* inst) {
} else if (code.HasHostFeature(HostFeature::AVX)) {
auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.vmovdqa(xmm0, code.Const(xword, 0x8000000000000000, 0x8000000000000000));
code.vpsubq(tmp, y, xmm0);
code.vpsubq(xmm0, x, xmm0);
code.vpcmpgtq(xmm0, tmp, xmm0);
code.pblendvb(x, y);
auto const tmp0 = ctx.reg_alloc.ScratchXmm(code);
auto const tmp1 = ctx.reg_alloc.ScratchXmm(code, HostLoc::XMM0);
code.vmovdqa(tmp1, code.Const(xword, 0x8000000000000000, 0x8000000000000000));
code.vpsubq(tmp0, y, tmp1);
code.vpsubq(tmp1, x, tmp1);
code.vpcmpgtq(tmp1, tmp0, tmp1);
code.pblendvb(x, y); // XMM0 is implicit
ctx.reg_alloc.DefineValue(code, inst, x);
} else if (code.HasHostFeature(HostFeature::SSE41)) {
auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]);
auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
auto const tax = ctx.reg_alloc.ScratchGpr(code);
auto const tdx = ctx.reg_alloc.ScratchGpr(code);
code.movq(tdx, tmp1);
code.movq(tax, tmp0);
code.movhlps(tmp3, tmp0);
code.cmp(tdx, tax);
code.movhlps(tmp2, tmp1);
code.cmovnb(tax, tdx);
code.movq(tdx, tmp2);
code.pinsrq(tmp4, tax, 0);
code.movq(tax, tmp3);
code.cmp(tdx, tax);
code.cmovnb(tax, tdx);
code.pinsrq(tmp4, tax, 1);
ctx.reg_alloc.DefineValue(code, inst, tmp4);
} else {
auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]);
auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
auto const tmp5 = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp2, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
code.movdqa(tmp3, tmp1);
code.pxor(tmp3, tmp2);
code.pxor(tmp2, tmp0);
code.movdqa(tmp4, tmp2);
code.pcmpeqd(tmp2, tmp3);
code.pcmpgtd(tmp4, tmp3);
code.pshufd(tmp2, tmp2, 245);
code.pshufd(tmp5, tmp4, 160);
code.pshufd(tmp3, tmp4, 245);
code.pand(tmp2, tmp5);
code.por(tmp3, tmp2);
code.pand(tmp0, tmp3);
code.pandn(tmp3, tmp1);
code.por(tmp0, tmp3);
ctx.reg_alloc.DefineValue(code, inst, tmp0);
auto const tax = ctx.reg_alloc.ScratchGpr(code);
auto const tdx = ctx.reg_alloc.ScratchGpr(code);
code.movq(tdx, tmp1);
code.movq(tax, tmp0);
code.movhlps(tmp3, tmp0);
code.cmp(tdx, tax);
code.movhlps(tmp2, tmp1);
code.cmovnb(tax, tdx);
code.movq(tdx, tmp2);
code.movq(tmp4, tax);
code.movq(tax, tmp3);
code.cmp(tdx, tax);
code.cmovnb(tax, tdx);
code.movq(tmp3, tax);
code.punpcklqdq(tmp4, tmp3);
ctx.reg_alloc.DefineValue(code, inst, tmp4);
}
}
@ -2247,37 +2268,57 @@ void EmitX64::EmitVectorMinU64(EmitContext& ctx, IR::Inst* inst) {
} else if (code.HasHostFeature(HostFeature::AVX)) {
auto const x = ctx.reg_alloc.UseXmm(code, args[0]);
auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.vmovdqa(xmm0, code.Const(xword, 0x8000000000000000, 0x8000000000000000));
code.vpsubq(tmp, y, xmm0);
code.vpsubq(xmm0, x, xmm0);
code.vpcmpgtq(xmm0, tmp, xmm0);
auto const tmp0 = ctx.reg_alloc.ScratchXmm(code);
auto const tmp1 = ctx.reg_alloc.ScratchXmm(code, HostLoc::XMM0);
code.vmovdqa(tmp1, code.Const(xword, 0x8000000000000000, 0x8000000000000000));
code.vpsubq(tmp0, y, tmp1);
code.vpsubq(tmp1, x, tmp1);
code.vpcmpgtq(tmp1, tmp0, tmp1);
code.pblendvb(y, x);
ctx.reg_alloc.DefineValue(code, inst, y);
} else if (code.HasHostFeature(HostFeature::SSE41)) {
auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]);
auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
auto const tax = ctx.reg_alloc.ScratchGpr(code);
auto const tdx = ctx.reg_alloc.ScratchGpr(code);
code.movq(tdx, tmp1);
code.movq(tax, tmp0);
code.movhlps(tmp3, tmp0);
code.cmp(tdx, tax);
code.movhlps(tmp2, tmp1);
code.cmovbe(tax, tdx);
code.movq(tdx, tmp2);
code.pinsrq(tmp4, tax, 0);
code.movq(tax, tmp3);
code.cmp(tdx, tax);
code.cmovbe(tax, tdx);
code.pinsrq(tmp4, tax, 1);
ctx.reg_alloc.DefineValue(code, inst, tmp4);
} else {
auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]);
auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
auto const tmp5 = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp2, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
code.movdqa(tmp3, tmp1);
code.pxor(tmp3, tmp2);
code.pxor(tmp2, tmp0);
code.movdqa(tmp4, tmp2);
code.pcmpeqd(tmp2, tmp3);
code.pcmpgtd(tmp4, tmp3);
code.pshufd(tmp3, tmp2, 245);
code.pshufd(tmp5, tmp4, 160);
code.pshufd(tmp2, tmp4, 245);
code.pand(tmp3, tmp5);
code.por(tmp2, tmp3);
code.pand(tmp1, tmp2);
code.pandn(tmp2, tmp0);
code.por(tmp2, tmp1);
//code.movdqa(tmp0, tmp2);
ctx.reg_alloc.DefineValue(code, inst, tmp2);
auto const tax = ctx.reg_alloc.ScratchGpr(code);
auto const tdx = ctx.reg_alloc.ScratchGpr(code);
code.movq(tdx, tmp1);
code.movq(tax, tmp0);
code.movhlps(tmp3, tmp0);
code.cmp(tdx, tax);
code.movhlps(tmp2, tmp1);
code.cmovbe(tax, tdx);
code.movq(tdx, tmp2);
code.movq(tmp4, tax);
code.movq(tax, tmp3);
code.cmp(tdx, tax);
code.cmovbe(tax, tdx);
code.movq(tmp3, tax);
code.punpcklqdq(tmp4, tmp3);
ctx.reg_alloc.DefineValue(code, inst, tmp4);
}
}
@ -2471,13 +2512,11 @@ void EmitX64::EmitVectorNarrow64(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(code, inst, result);
} else {
auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
auto const zeros = ctx.reg_alloc.ScratchXmm(code);
code.pxor(zeros, zeros);
code.shufps(a, zeros, 0b00001000);
ctx.reg_alloc.DefineValue(code, inst, a);
auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
auto const zeros = ctx.reg_alloc.ScratchXmm(code);
code.pxor(zeros, zeros);
code.shufps(a, zeros, 0b00001000);
ctx.reg_alloc.DefineValue(code, inst, a);
}
}
@ -2490,11 +2529,11 @@ void EmitX64::EmitVectorNot(EmitContext& ctx, IR::Inst* inst) {
code.vpternlogq(result, operand, operand, u8(~Tern::c));
ctx.reg_alloc.DefineValue(code, inst, result);
} else {
auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
auto const xmm_b = ctx.reg_alloc.ScratchXmm(code);
code.pcmpeqw(xmm_b, xmm_b);
code.pxor(xmm_a, xmm_b);
ctx.reg_alloc.DefineValue(code, inst, xmm_a);
auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
auto const xmm_b = ctx.reg_alloc.ScratchXmm(code);
code.pcmpeqw(xmm_b, xmm_b);
code.pxor(xmm_a, xmm_b);
ctx.reg_alloc.DefineValue(code, inst, xmm_a);
}
}
@ -2504,11 +2543,9 @@ void EmitX64::EmitVectorOr(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedAddLower8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.punpcklqdq(xmm_a, xmm_b);
code.movdqa(tmp, xmm_a);
code.psllw(xmm_a, 8);
@ -2516,7 +2553,6 @@ void EmitX64::EmitVectorPairedAddLower8(EmitContext& ctx, IR::Inst* inst) {
code.pxor(tmp, tmp);
code.psrlw(xmm_a, 8);
code.packuswb(xmm_a, tmp);
ctx.reg_alloc.DefineValue(code, inst, xmm_a);
}

2
src/dynarmic/src/dynarmic/common/llvm_disassemble.cpp

@ -7,7 +7,6 @@
*/
#include <string>
#include <fmt/format.h>
#ifdef DYNARMIC_USE_LLVM
@ -16,7 +15,6 @@
#endif
#include "common/assert.h"
#include <bit>
#include "common/common_types.h"
#include "dynarmic/common/llvm_disassemble.h"

33
src/dynarmic/tests/A64/a64.cpp

@ -2587,11 +2587,11 @@ TEST_CASE("A64: Manual Vector Min/Max U64 (Optimizer Test)", "[a64]") {
// MaxU64 pattern: (a > b) ? a : b
code.CMHI(V2.D2(), V0.D2(), V1.D2()); // V2 = Mask (A > B)
code.BSL(V2.B16(), V0.B16(), V1.B16()); // V2 = Resul
code.BSL(V2.B16(), V0.B16(), V1.B16()); // V2 = Result
// MinU64 pattern: (a > b) ? b : a
code.CMHI(V3.D2(), V0.D2(), V1.D2()); // V3 = Mask (A > B)
code.BSL(V3.B16(), V1.B16(), V0.B16()); // V3 = Resul
code.BSL(V3.B16(), V1.B16(), V0.B16()); // V3 = Result
jit.SetPC(0);
jit.SetVector(0, {100, 20});
@ -2603,3 +2603,32 @@ TEST_CASE("A64: Manual Vector Min/Max U64 (Optimizer Test)", "[a64]") {
CHECK(jit.GetVector(2) == Vector{100, 200});
CHECK(jit.GetVector(3) == Vector{50, 20});
}
// Runs each vector FRINT* variant on the same input (0x4001e17c, i.e. ~2.0294f, in
// every 32-bit lane) and checks the rounded result written to V1..V6.
TEST_CASE("A64: Rounding", "[a64]") {
A64TestEnv env;
A64::UserConfig jit_user_config{};
jit_user_config.callbacks = &env;
A64::Jit jit{jit_user_config};
oaknut::VectorCodeGenerator code{env.code_mem, nullptr};
code.FRINTN(V1.S4(), V0.S4()); // ToNearest_TieEven
code.FRINTM(V2.S4(), V0.S4()); // TowardsMinusInfinity
code.FRINTP(V3.S4(), V0.S4()); // TowardsPlusInfinity
code.FRINTZ(V4.S4(), V0.S4()); // TowardsZero
code.FRINTA(V5.S4(), V0.S4()); // ToNearest_TieAwayFromZero
code.FRINTX(V6.S4(), V0.S4()); // Uses the current FPCR rounding mode (not tie-away); NOTE(review): the 2.0f expectation below presumes that mode is not TowardsPlusInfinity - confirm the JIT default
jit.SetPC(0);
jit.SetVector(0, {0x4001e17c4001e17c, 0x4001e17c4001e17c});
// One tick per emitted instruction word.
env.ticks_left = env.code_mem.size();
CheckedRun([&]() { jit.Run(); });
CHECK(jit.GetVector(0) == Vector{0x4001e17c4001e17c, 0x4001e17c4001e17c}); // source register left unchanged
CHECK(jit.GetVector(1) == Vector{0x4000000040000000, 0x4000000040000000}); // 2.0f
CHECK(jit.GetVector(2) == Vector{0x4000000040000000, 0x4000000040000000}); // 2.0f
CHECK(jit.GetVector(3) == Vector{0x4040000040400000, 0x4040000040400000}); // 3.0f (only the round-up variant differs)
CHECK(jit.GetVector(4) == Vector{0x4000000040000000, 0x4000000040000000}); // 2.0f
CHECK(jit.GetVector(5) == Vector{0x4000000040000000, 0x4000000040000000}); // 2.0f
CHECK(jit.GetVector(6) == Vector{0x4000000040000000, 0x4000000040000000}); // 2.0f
}

36
src/dynarmic/tests/A64/testenv.h

@ -17,24 +17,19 @@ using Vector = Dynarmic::A64::Vector;
class A64TestEnv : public Dynarmic::A64::UserCallbacks {
public:
ankerl::unordered_dense::map<u64, u8> modified_memory;
std::vector<u32> code_mem;
u64 ticks_left = 0;
bool code_mem_modified_by_guest = false;
u64 code_mem_start_address = 0;
std::vector<u32> code_mem;
ankerl::unordered_dense::map<u64, u8> modified_memory;
std::vector<std::string> interrupts;
bool code_mem_modified_by_guest = false;
/// @brief Tells whether `vaddr` falls inside the region backed by `code_mem`.
/// The region starts at `code_mem_start_address` and spans four bytes per stored word.
bool IsInCodeMem(u64 vaddr) const {
    const u64 code_mem_end = code_mem_start_address + code_mem.size() * 4;
    return code_mem_start_address <= vaddr && vaddr < code_mem_end;
}
std::optional<std::uint32_t> MemoryReadCode(u64 vaddr) override {
if (!IsInCodeMem(vaddr)) {
if (!IsInCodeMem(vaddr))
return 0x14000000; // B .
}
const size_t index = (vaddr - code_mem_start_address) / 4;
return code_mem[index];
}
@ -43,10 +38,9 @@ public:
if (IsInCodeMem(vaddr)) {
return reinterpret_cast<u8*>(code_mem.data())[vaddr - code_mem_start_address];
}
if (auto iter = modified_memory.find(vaddr); iter != modified_memory.end()) {
return iter->second;
}
return static_cast<u8>(vaddr);
if (auto const it = modified_memory.find(vaddr); it != modified_memory.end())
return it->second;
return u8(vaddr);
}
std::uint16_t MemoryRead16(u64 vaddr) override {
return u16(MemoryRead8(vaddr)) | u16(MemoryRead8(vaddr + 1)) << 8;
@ -68,16 +62,16 @@ public:
modified_memory[vaddr] = value;
}
void MemoryWrite16(u64 vaddr, std::uint16_t value) override {
MemoryWrite8(vaddr, static_cast<u8>(value));
MemoryWrite8(vaddr + 1, static_cast<u8>(value >> 8));
MemoryWrite8(vaddr, u8(value));
MemoryWrite8(vaddr + 1, u8(value >> 8));
}
void MemoryWrite32(u64 vaddr, std::uint32_t value) override {
MemoryWrite16(vaddr, static_cast<u16>(value));
MemoryWrite16(vaddr + 2, static_cast<u16>(value >> 16));
MemoryWrite16(vaddr, u16(value));
MemoryWrite16(vaddr + 2, u16(value >> 16));
}
void MemoryWrite64(u64 vaddr, std::uint64_t value) override {
MemoryWrite32(vaddr, static_cast<u32>(value));
MemoryWrite32(vaddr + 4, static_cast<u32>(value >> 32));
MemoryWrite32(vaddr, u32(value));
MemoryWrite32(vaddr + 4, u32(value >> 32));
}
void MemoryWrite128(u64 vaddr, Vector value) override {
MemoryWrite64(vaddr, value[0]);
@ -139,12 +133,12 @@ public:
template<typename T>
T read(u64 vaddr) {
T value;
memcpy(&value, backing_memory + vaddr, sizeof(T));
std::memcpy(&value, backing_memory + vaddr, sizeof(T));
return value;
}
template<typename T>
void write(u64 vaddr, const T& value) {
memcpy(backing_memory + vaddr, &value, sizeof(T));
std::memcpy(backing_memory + vaddr, &value, sizeof(T));
}
std::optional<std::uint32_t> MemoryReadCode(u64 vaddr) override {

Loading…
Cancel
Save