[dynarmic] regalloc use scratchimpl that uses all instead of iterating

Signed-off-by: lizzie <lizzie@eden-emu.dev>
5 months ago · 00e8af04b4
2 changed files with 18 additions and 16 deletions
--- a/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp
@ -122,9 +122,9 @@ A64EmitX64::BlockDescriptor A64EmitX64::Emit(IR::Block& block) noexcept {
        auto const opcode = inst.GetOpcode();
        // Call the relevant Emit* member function.
        switch (opcode) {
-#define OPCODE(name, type, ...) [[likely]] case IR::Opcode::name: goto opcode_branch;
+#define OPCODE(name, type, ...) case IR::Opcode::name: goto opcode_branch;
 #define A32OPC(name, type, ...)
-#define A64OPC(name, type, ...) [[likely]] case IR::Opcode::A64##name: goto a64_branch;
+#define A64OPC(name, type, ...) case IR::Opcode::A64##name: goto a64_branch;
 #include "dynarmic/ir/opcodes.inc"
 #undef OPCODE
 #undef A32OPC
--- a/src/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp
@ -367,10 +367,20 @@ void RegAlloc::HostCall(IR::Inst* result_def,
    if (result_def) {
        DefineValueImpl(result_def, ABI_RETURN);
    }
-
    for (size_t i = 0; i < args.size(); i++) {
-        if (args[i] && !args[i]->get().IsVoid()) {
+        if (args[i]) {
            UseScratch(*args[i], args_hostloc[i]);
+        } else {
+            ScratchGpr(args_hostloc[i]); // TODO: Force spill
+        }
+    }
+    // Must match with ScratchImpl
+    for (auto const gpr : other_caller_save) {
+        MoveOutOfTheWay(gpr);
+        LocInfo(gpr).WriteLock();
+    }
+    for (size_t i = 0; i < args.size(); i++) {
+        if (args[i] && !args[i]->get().IsVoid()) {
            // LLVM puts the burden of zero-extension of 8 and 16 bit values on the caller instead of the callee
            const Xbyak::Reg64 reg = HostLocToReg64(args_hostloc[i]);
            switch (args[i]->get().GetType()) {
@ -390,14 +400,6 @@ void RegAlloc::HostCall(IR::Inst* result_def,
            }
        }
    }
-
-    for (size_t i = 0; i < args.size(); i++)
-        if (!args[i]) {
-            // TODO: Force spill
-            ScratchGpr(args_hostloc[i]);
-        }
-    for (auto const caller_saved : other_caller_save)
-        ScratchImpl({caller_saved});
 }

 void RegAlloc::AllocStackSpace(const size_t stack_space) noexcept {
@ -560,13 +562,12 @@ void RegAlloc::SpillRegister(HostLoc loc) noexcept {
 }

 HostLoc RegAlloc::FindFreeSpill(bool is_xmm) const noexcept {
-#if 0
    // TODO(lizzie): Ok, Windows hates XMM spills, this means less perf for windows
    // but it's fine anyways. We can find other ways to cheat it later - but which?!?!
    // we should NOT save xmm each block entering... MAYBE xbyak has a bug on start/end?
    // TODO(lizzie): This needs to be investigated further later.
    // Do not spill XMM into other XMM silly
-    if (!is_xmm) {
+    /*if (!is_xmm) {
        // TODO(lizzie): Using lower (xmm0 and such) registers results in issues/crashes - INVESTIGATE WHY
        // Intel recommends to spill GPR onto XMM registers IF POSSIBLE
        // TODO(lizzie): Issues on DBZ, theory: Scratch XMM not properly restored after a function call?
@ -574,8 +575,9 @@ HostLoc RegAlloc::FindFreeSpill(bool is_xmm) const noexcept {
        for (size_t i = size_t(HostLoc::XMM15); i >= size_t(HostLoc::XMM3); --i)
            if (const auto loc = HostLoc(i); LocInfo(loc).IsEmpty())
                return loc;
-    }
-#endif
+    }*/
+    // TODO: Doing this would mean saving XMM on each call... need to benchmark the benefits
+    // of spilling on XMM versus the potential cost of using XMM registers.....
    // Otherwise go to stack spilling
    for (size_t i = size_t(HostLoc::FirstSpill); i < hostloc_info.size(); ++i)
        if (const auto loc = HostLoc(i); LocInfo(loc).IsEmpty())