From 0f835e5c6e8023e29a5228152e7473186cb2454c Mon Sep 17 00:00:00 2001 From: lizzie Date: Tue, 13 Jan 2026 23:21:35 +0000 Subject: [PATCH] inline macro defs --- src/video_core/CMakeLists.txt | 12 +- src/video_core/engines/maxwell_3d.cpp | 7 +- src/video_core/engines/maxwell_3d.h | 4 +- src/video_core/macro.cpp | 1670 ++++++++++++++++++++ src/video_core/{macro => }/macro.h | 39 +- src/video_core/macro/macro.cpp | 140 -- src/video_core/macro/macro_hle.cpp | 606 ------- src/video_core/macro/macro_hle.h | 33 - src/video_core/macro/macro_interpreter.cpp | 362 ----- src/video_core/macro/macro_interpreter.h | 27 - src/video_core/macro/macro_jit_x64.cpp | 678 -------- src/video_core/macro/macro_jit_x64.h | 26 - 12 files changed, 1701 insertions(+), 1903 deletions(-) create mode 100644 src/video_core/macro.cpp rename src/video_core/{macro => }/macro.h (76%) delete mode 100644 src/video_core/macro/macro.cpp delete mode 100644 src/video_core/macro/macro_hle.cpp delete mode 100644 src/video_core/macro/macro_hle.h delete mode 100644 src/video_core/macro/macro_interpreter.cpp delete mode 100644 src/video_core/macro/macro_interpreter.h delete mode 100644 src/video_core/macro/macro_jit_x64.cpp delete mode 100644 src/video_core/macro/macro_jit_x64.h diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 94599532b3..c94b66e6bc 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -87,12 +87,8 @@ add_library(video_core STATIC host1x/syncpoint_manager.h host1x/vic.cpp host1x/vic.h - macro/macro.cpp - macro/macro.h - macro/macro_hle.cpp - macro/macro_hle.h - macro/macro_interpreter.cpp - macro/macro_interpreter.h + macro.cpp + macro.h fence_manager.h gpu.cpp gpu.h @@ -375,10 +371,6 @@ else() endif() if (ARCHITECTURE_x86_64) - target_sources(video_core PRIVATE - macro/macro_jit_x64.cpp - macro/macro_jit_x64.h - ) target_link_libraries(video_core PUBLIC xbyak::xbyak) endif() diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 77729fd5b6..4ca21338c9 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -27,9 +27,7 @@ constexpr u32 MacroRegistersStart = 0xE00; Maxwell3D::Maxwell3D(Core::System& system_, MemoryManager& memory_manager_) : draw_manager{std::make_unique(this)}, system{system_}, - memory_manager{memory_manager_}, macro_engine{GetMacroEngine(*this)}, upload_state{ - memory_manager, - regs.upload} { + memory_manager{memory_manager_}, macro_engine{GetMacroEngine(*this)}, upload_state{memory_manager, regs.upload} { dirty.flags.flip(); InitializeRegisterDefaults(); execution_mask.reset(); @@ -329,8 +327,7 @@ void Maxwell3D::ProcessDirtyRegisters(u32 method, u32 argument) { } } -void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argument, - bool is_last_call) { +void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argument, bool is_last_call) { switch (method) { case MAXWELL3D_REG_INDEX(wait_for_idle): return rasterizer->WaitForIdle(); diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index ae2e7a84c4..8c50a4ea2f 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -23,7 +23,7 @@ #include "video_core/engines/engine_interface.h" #include "video_core/engines/engine_upload.h" #include "video_core/gpu.h" -#include "video_core/macro/macro.h" +#include "video_core/macro.h" #include "video_core/textures/texture.h" namespace Core { @@ -3203,7 +3203,7 @@ private: std::vector macro_params; /// Interpreter for the macro codes uploaded to the GPU. - std::unique_ptr macro_engine; + std::optional macro_engine; Upload::State upload_state; diff --git a/src/video_core/macro.cpp b/src/video_core/macro.cpp new file mode 100644 index 0000000000..a94eddaef9 --- /dev/null +++ b/src/video_core/macro.cpp @@ -0,0 +1,1670 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include + +#include +#ifdef ARCHITECTURE_x86_64 +#include +#endif + +#include "common/assert.h" +#include "common/scope_exit.h" +#include "common/fs/fs.h" +#include "common/fs/path_util.h" +#include "common/settings.h" +#include "common/container_hash.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/engines/draw_manager.h" +#include "video_core/dirty_flags.h" +#include "video_core/rasterizer_interface.h" +#include "video_core/macro.h" + +#include "common/assert.h" +#include "common/bit_field.h" +#include "common/logging/log.h" +#ifdef ARCHITECTURE_x86_64 +#include "common/x64/xbyak_abi.h" +#include "common/x64/xbyak_util.h" +#endif +#include "video_core/engines/maxwell_3d.h" + +namespace Tegra { + +using Maxwell3D = Engines::Maxwell3D; + +namespace { + +bool IsTopologySafe(Maxwell3D::Regs::PrimitiveTopology topology) { + switch (topology) { + case Maxwell3D::Regs::PrimitiveTopology::Points: + case Maxwell3D::Regs::PrimitiveTopology::Lines: + case Maxwell3D::Regs::PrimitiveTopology::LineLoop: + case Maxwell3D::Regs::PrimitiveTopology::LineStrip: + case Maxwell3D::Regs::PrimitiveTopology::Triangles: + case Maxwell3D::Regs::PrimitiveTopology::TriangleStrip: + case Maxwell3D::Regs::PrimitiveTopology::TriangleFan: + case Maxwell3D::Regs::PrimitiveTopology::LinesAdjacency: + case Maxwell3D::Regs::PrimitiveTopology::LineStripAdjacency: + case Maxwell3D::Regs::PrimitiveTopology::TrianglesAdjacency: + case Maxwell3D::Regs::PrimitiveTopology::TriangleStripAdjacency: + case Maxwell3D::Regs::PrimitiveTopology::Patches: + return true; + case Maxwell3D::Regs::PrimitiveTopology::Quads: + case Maxwell3D::Regs::PrimitiveTopology::QuadStrip: + case Maxwell3D::Regs::PrimitiveTopology::Polygon: + default: + return false; + } +} + +class HLEMacroImpl : public CachedMacro { +public: + explicit HLEMacroImpl(Maxwell3D& maxwell3d_) + : CachedMacro(maxwell3d_) + {} +}; + +/// @note: these macros have two versions, a normal and extended version, with the extended version +/// also assigning the base vertex/instance. +template +class HLE_DrawArraysIndirect final : public HLEMacroImpl { +public: + explicit HLE_DrawArraysIndirect(Maxwell3D& maxwell3d_) + : HLEMacroImpl(maxwell3d_) + {} + + void Execute(const std::vector& parameters, [[maybe_unused]] u32 method) override { + auto topology = static_cast(parameters[0]); + if (!maxwell3d.AnyParametersDirty() || !IsTopologySafe(topology)) { + Fallback(parameters); + return; + } + + auto& params = maxwell3d.draw_manager->GetIndirectParams(); + params.is_byte_count = false; + params.is_indexed = false; + params.include_count = false; + params.count_start_address = 0; + params.indirect_start_address = maxwell3d.GetMacroAddress(1); + params.buffer_size = 4 * sizeof(u32); + params.max_draw_counts = 1; + params.stride = 0; + + if (extended) { + maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro; + maxwell3d.SetHLEReplacementAttributeType(0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseInstance); + } + + maxwell3d.draw_manager->DrawArrayIndirect(topology); + + if (extended) { + maxwell3d.engine_state = Maxwell3D::EngineHint::None; + maxwell3d.replace_table.clear(); + } + } + +private: + void Fallback(const std::vector& parameters) { + SCOPE_EXIT { + if (extended) { + maxwell3d.engine_state = Maxwell3D::EngineHint::None; + maxwell3d.replace_table.clear(); + } + }; + maxwell3d.RefreshParameters(); + const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]); + + auto topology = static_cast(parameters[0]); + const u32 vertex_first = parameters[3]; + const u32 vertex_count = parameters[1]; + + if (!IsTopologySafe(topology) && size_t(maxwell3d.GetMaxCurrentVertices()) < size_t(vertex_first) + size_t(vertex_count)) { + ASSERT(false && "Faulty draw!"); + return; + } + + const u32 base_instance = parameters[4]; + if (extended) { + maxwell3d.regs.global_base_instance_index = base_instance; + maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro; + maxwell3d.SetHLEReplacementAttributeType( + 0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseInstance); + } + + maxwell3d.draw_manager->DrawArray(topology, vertex_first, vertex_count, base_instance, + instance_count); + + if (extended) { + maxwell3d.regs.global_base_instance_index = 0; + maxwell3d.engine_state = Maxwell3D::EngineHint::None; + maxwell3d.replace_table.clear(); + } + } +}; + +/* + * @note: these macros have two versions, a normal and extended version, with the extended version + * also assigning the base vertex/instance. + */ +template +class HLE_DrawIndexedIndirect final : public HLEMacroImpl { +public: + explicit HLE_DrawIndexedIndirect(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} + + void Execute(const std::vector& parameters, [[maybe_unused]] u32 method) override { + auto topology = static_cast(parameters[0]); + if (!maxwell3d.AnyParametersDirty() || !IsTopologySafe(topology)) { + Fallback(parameters); + return; + } + + const u32 estimate = static_cast(maxwell3d.EstimateIndexBufferSize()); + const u32 element_base = parameters[4]; + const u32 base_instance = parameters[5]; + maxwell3d.regs.vertex_id_base = element_base; + maxwell3d.regs.global_base_vertex_index = element_base; + maxwell3d.regs.global_base_instance_index = base_instance; + maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; + if (extended) { + maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro; + maxwell3d.SetHLEReplacementAttributeType(0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseVertex); + maxwell3d.SetHLEReplacementAttributeType(0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance); + } + auto& params = maxwell3d.draw_manager->GetIndirectParams(); + params.is_byte_count = false; + params.is_indexed = true; + params.include_count = false; + params.count_start_address = 0; + params.indirect_start_address = maxwell3d.GetMacroAddress(1); + params.buffer_size = 5 * sizeof(u32); + params.max_draw_counts = 1; + params.stride = 0; + maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; + maxwell3d.draw_manager->DrawIndexedIndirect(topology, 0, estimate); + maxwell3d.regs.vertex_id_base = 0x0; + maxwell3d.regs.global_base_vertex_index = 0x0; + maxwell3d.regs.global_base_instance_index = 0x0; + if (extended) { + maxwell3d.engine_state = Maxwell3D::EngineHint::None; + maxwell3d.replace_table.clear(); + } + } + +private: + void Fallback(const std::vector& parameters) { + maxwell3d.RefreshParameters(); + const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]); + const u32 element_base = parameters[4]; + const u32 base_instance = parameters[5]; + maxwell3d.regs.vertex_id_base = element_base; + maxwell3d.regs.global_base_vertex_index = element_base; + maxwell3d.regs.global_base_instance_index = base_instance; + maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; + if (extended) { + maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro; + maxwell3d.SetHLEReplacementAttributeType(0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseVertex); + maxwell3d.SetHLEReplacementAttributeType(0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance); + } + + maxwell3d.draw_manager->DrawIndex(Tegra::Maxwell3D::Regs::PrimitiveTopology(parameters[0]), parameters[3], parameters[1], element_base, base_instance, instance_count); + + maxwell3d.regs.vertex_id_base = 0x0; + maxwell3d.regs.global_base_vertex_index = 0x0; + maxwell3d.regs.global_base_instance_index = 0x0; + if (extended) { + maxwell3d.engine_state = Maxwell3D::EngineHint::None; + maxwell3d.replace_table.clear(); + } + } +}; + +class HLE_MultiLayerClear final : public HLEMacroImpl { +public: + explicit HLE_MultiLayerClear(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} + + void Execute(const std::vector& parameters, [[maybe_unused]] u32 method) override { + maxwell3d.RefreshParameters(); + ASSERT(parameters.size() == 1); + + const Maxwell3D::Regs::ClearSurface clear_params{parameters[0]}; + const u32 rt_index = clear_params.RT; + const u32 num_layers = maxwell3d.regs.rt[rt_index].depth; + ASSERT(clear_params.layer == 0); + + maxwell3d.regs.clear_surface.raw = clear_params.raw; + maxwell3d.draw_manager->Clear(num_layers); + } +}; + +class HLE_MultiDrawIndexedIndirectCount final : public HLEMacroImpl { +public: + explicit HLE_MultiDrawIndexedIndirectCount(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} + + void Execute(const std::vector& parameters, [[maybe_unused]] u32 method) override { + const auto topology = Maxwell3D::Regs::PrimitiveTopology(parameters[2]); + if (!IsTopologySafe(topology)) { + Fallback(parameters); + return; + } + + const u32 start_indirect = parameters[0]; + const u32 end_indirect = parameters[1]; + if (start_indirect >= end_indirect) { + // Nothing to do. + return; + } + + const u32 padding = parameters[3]; // padding is in words + + // size of each indirect segment + const u32 indirect_words = 5 + padding; + const u32 stride = indirect_words * sizeof(u32); + const std::size_t draw_count = end_indirect - start_indirect; + const u32 estimate = static_cast(maxwell3d.EstimateIndexBufferSize()); + maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; + auto& params = maxwell3d.draw_manager->GetIndirectParams(); + params.is_byte_count = false; + params.is_indexed = true; + params.include_count = true; + params.count_start_address = maxwell3d.GetMacroAddress(4); + params.indirect_start_address = maxwell3d.GetMacroAddress(5); + params.buffer_size = stride * draw_count; + params.max_draw_counts = draw_count; + params.stride = stride; + maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; + maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro; + maxwell3d.SetHLEReplacementAttributeType( + 0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseVertex); + maxwell3d.SetHLEReplacementAttributeType( + 0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance); + maxwell3d.SetHLEReplacementAttributeType(0, 0x648, + Maxwell3D::HLEReplacementAttributeType::DrawID); + maxwell3d.draw_manager->DrawIndexedIndirect(topology, 0, estimate); + maxwell3d.engine_state = Maxwell3D::EngineHint::None; + maxwell3d.replace_table.clear(); + } + +private: + void Fallback(const std::vector& parameters) { + SCOPE_EXIT { + // Clean everything. + maxwell3d.regs.vertex_id_base = 0x0; + maxwell3d.engine_state = Maxwell3D::EngineHint::None; + maxwell3d.replace_table.clear(); + }; + maxwell3d.RefreshParameters(); + const u32 start_indirect = parameters[0]; + const u32 end_indirect = parameters[1]; + if (start_indirect >= end_indirect) { + // Nothing to do. + return; + } + const auto topology = static_cast(parameters[2]); + const u32 padding = parameters[3]; + const std::size_t max_draws = parameters[4]; + + const u32 indirect_words = 5 + padding; + const std::size_t first_draw = start_indirect; + const std::size_t effective_draws = end_indirect - start_indirect; + const std::size_t last_draw = start_indirect + (std::min)(effective_draws, max_draws); + + for (std::size_t index = first_draw; index < last_draw; index++) { + const std::size_t base = index * indirect_words + 5; + const u32 base_vertex = parameters[base + 3]; + const u32 base_instance = parameters[base + 4]; + maxwell3d.regs.vertex_id_base = base_vertex; + maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro; + maxwell3d.SetHLEReplacementAttributeType( + 0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseVertex); + maxwell3d.SetHLEReplacementAttributeType( + 0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance); + maxwell3d.CallMethod(0x8e3, 0x648, true); + maxwell3d.CallMethod(0x8e4, static_cast(index), true); + maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; + maxwell3d.draw_manager->DrawIndex(topology, parameters[base + 2], parameters[base], + base_vertex, base_instance, parameters[base + 1]); + } + } +}; + +class HLE_DrawIndirectByteCount final : public HLEMacroImpl { +public: + explicit HLE_DrawIndirectByteCount(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} + + void Execute(const std::vector& parameters, [[maybe_unused]] u32 method) override { + const bool force = maxwell3d.Rasterizer().HasDrawTransformFeedback(); + + auto topology = static_cast(parameters[0] & 0xFFFFU); + if (!force && (!maxwell3d.AnyParametersDirty() || !IsTopologySafe(topology))) { + Fallback(parameters); + return; + } + auto& params = maxwell3d.draw_manager->GetIndirectParams(); + params.is_byte_count = true; + params.is_indexed = false; + params.include_count = false; + params.count_start_address = 0; + params.indirect_start_address = maxwell3d.GetMacroAddress(2); + params.buffer_size = 4; + params.max_draw_counts = 1; + params.stride = parameters[1]; + maxwell3d.regs.draw.begin = parameters[0]; + maxwell3d.regs.draw_auto_stride = parameters[1]; + maxwell3d.regs.draw_auto_byte_count = parameters[2]; + + maxwell3d.draw_manager->DrawArrayIndirect(topology); + } + +private: + void Fallback(const std::vector& parameters) { + maxwell3d.RefreshParameters(); + + maxwell3d.regs.draw.begin = parameters[0]; + maxwell3d.regs.draw_auto_stride = parameters[1]; + maxwell3d.regs.draw_auto_byte_count = parameters[2]; + + maxwell3d.draw_manager->DrawArray( + maxwell3d.regs.draw.topology, 0, + maxwell3d.regs.draw_auto_byte_count / maxwell3d.regs.draw_auto_stride, 0, 1); + } +}; + +class HLE_C713C83D8F63CCF3 final : public HLEMacroImpl { +public: + explicit HLE_C713C83D8F63CCF3(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} + + void Execute(const std::vector& parameters, [[maybe_unused]] u32 method) override { + maxwell3d.RefreshParameters(); + const u32 offset = (parameters[0] & 0x3FFFFFFF) << 2; + const u32 address = maxwell3d.regs.shadow_scratch[24]; + auto& const_buffer = maxwell3d.regs.const_buffer; + const_buffer.size = 0x7000; + const_buffer.address_high = (address >> 24) & 0xFF; + const_buffer.address_low = address << 8; + const_buffer.offset = offset; + } +}; + +class HLE_D7333D26E0A93EDE final : public HLEMacroImpl { +public: + explicit HLE_D7333D26E0A93EDE(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} + + void Execute(const std::vector& parameters, [[maybe_unused]] u32 method) override { + maxwell3d.RefreshParameters(); + const size_t index = parameters[0]; + const u32 address = maxwell3d.regs.shadow_scratch[42 + index]; + const u32 size = maxwell3d.regs.shadow_scratch[47 + index]; + auto& const_buffer = maxwell3d.regs.const_buffer; + const_buffer.size = size; + const_buffer.address_high = (address >> 24) & 0xFF; + const_buffer.address_low = address << 8; + } +}; + +class HLE_BindShader final : public HLEMacroImpl { +public: + explicit HLE_BindShader(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} + + void Execute(const std::vector& parameters, [[maybe_unused]] u32 method) override { + maxwell3d.RefreshParameters(); + auto& regs = maxwell3d.regs; + const u32 index = parameters[0]; + if ((parameters[1] - regs.shadow_scratch[28 + index]) == 0) { + return; + } + + regs.pipelines[index & 0xF].offset = parameters[2]; + maxwell3d.dirty.flags[VideoCommon::Dirty::Shaders] = true; + regs.shadow_scratch[28 + index] = parameters[1]; + regs.shadow_scratch[34 + index] = parameters[2]; + + const u32 address = parameters[4]; + auto& const_buffer = regs.const_buffer; + const_buffer.size = 0x10000; + const_buffer.address_high = (address >> 24) & 0xFF; + const_buffer.address_low = address << 8; + + const size_t bind_group_id = parameters[3] & 0x7F; + auto& bind_group = regs.bind_groups[bind_group_id]; + bind_group.raw_config = 0x11; + maxwell3d.ProcessCBBind(bind_group_id); + } +}; + +class HLE_SetRasterBoundingBox final : public HLEMacroImpl { +public: + explicit HLE_SetRasterBoundingBox(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} + + void Execute(const std::vector& parameters, [[maybe_unused]] u32 method) override { + maxwell3d.RefreshParameters(); + const u32 raster_mode = parameters[0]; + auto& regs = maxwell3d.regs; + const u32 raster_enabled = maxwell3d.regs.conservative_raster_enable; + const u32 scratch_data = maxwell3d.regs.shadow_scratch[52]; + regs.raster_bounding_box.raw = raster_mode & 0xFFFFF00F; + regs.raster_bounding_box.pad.Assign(scratch_data & raster_enabled); + } +}; + +template +class HLE_ClearConstBuffer final : public HLEMacroImpl { +public: + explicit HLE_ClearConstBuffer(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} + + void Execute(const std::vector& parameters, [[maybe_unused]] u32 method) override { + maxwell3d.RefreshParameters(); + static constexpr std::array zeroes{}; + auto& regs = maxwell3d.regs; + regs.const_buffer.size = u32(base_size); + regs.const_buffer.address_high = parameters[0]; + regs.const_buffer.address_low = parameters[1]; + regs.const_buffer.offset = 0; + maxwell3d.ProcessCBMultiData(zeroes.data(), parameters[2] * 4); + } +}; + +class HLE_ClearMemory final : public HLEMacroImpl { +public: + explicit HLE_ClearMemory(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} + + void Execute(const std::vector& parameters, [[maybe_unused]] u32 method) override { + maxwell3d.RefreshParameters(); + + const u32 needed_memory = parameters[2] / sizeof(u32); + if (needed_memory > zero_memory.size()) { + zero_memory.resize(needed_memory, 0); + } + auto& regs = maxwell3d.regs; + regs.upload.line_length_in = parameters[2]; + regs.upload.line_count = 1; + regs.upload.dest.address_high = parameters[0]; + regs.upload.dest.address_low = parameters[1]; + maxwell3d.CallMethod(size_t(MAXWELL3D_REG_INDEX(launch_dma)), 0x1011, true); + maxwell3d.CallMultiMethod(size_t(MAXWELL3D_REG_INDEX(inline_data)), zero_memory.data(), needed_memory, needed_memory); + } + +private: + std::vector zero_memory; +}; + +class HLE_TransformFeedbackSetup final : public HLEMacroImpl { +public: + explicit HLE_TransformFeedbackSetup(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} + + void Execute(const std::vector& parameters, [[maybe_unused]] u32 method) override { + maxwell3d.RefreshParameters(); + + auto& regs = maxwell3d.regs; + regs.transform_feedback_enabled = 1; + regs.transform_feedback.buffers[0].start_offset = 0; + regs.transform_feedback.buffers[1].start_offset = 0; + regs.transform_feedback.buffers[2].start_offset = 0; + regs.transform_feedback.buffers[3].start_offset = 0; + + regs.upload.line_length_in = 4; + regs.upload.line_count = 1; + regs.upload.dest.address_high = parameters[0]; + regs.upload.dest.address_low = parameters[1]; + maxwell3d.CallMethod(size_t(MAXWELL3D_REG_INDEX(launch_dma)), 0x1011, true); + maxwell3d.CallMethod(size_t(MAXWELL3D_REG_INDEX(inline_data)), regs.transform_feedback.controls[0].stride, true); + + maxwell3d.Rasterizer().RegisterTransformFeedback(regs.upload.dest.Address()); + } +}; + +} // Anonymous namespace + +HLEMacro::HLEMacro(Maxwell3D& maxwell3d_) : maxwell3d{maxwell3d_} {} + +HLEMacro::~HLEMacro() = default; + +std::unique_ptr HLEMacro::GetHLEProgram(u64 hash) const { + switch (hash) { + case 0x0D61FC9FAAC9FCADULL: + return std::make_unique>(maxwell3d); + case 0x8A4D173EB99A8603ULL: + return std::make_unique>(maxwell3d); + case 0x771BB18C62444DA0ULL: + return std::make_unique>(maxwell3d); + case 0x0217920100488FF7ULL: + return std::make_unique>(maxwell3d); + case 0x3F5E74B9C9A50164ULL: + return std::make_unique(maxwell3d); + case 0xEAD26C3E2109B06BULL: + return std::make_unique(maxwell3d); + case 0xC713C83D8F63CCF3ULL: + return std::make_unique(maxwell3d); + case 0xD7333D26E0A93EDEULL: + return std::make_unique(maxwell3d); + case 0xEB29B2A09AA06D38ULL: + return std::make_unique(maxwell3d); + case 0xDB1341DBEB4C8AF7ULL: + return std::make_unique(maxwell3d); + case 0x6C97861D891EDf7EULL: + return std::make_unique>(maxwell3d); + case 0xD246FDDF3A6173D7ULL: + return std::make_unique>(maxwell3d); + case 0xEE4D0004BEC8ECF4ULL: + return std::make_unique(maxwell3d); + case 0xFC0CF27F5FFAA661ULL: + return std::make_unique(maxwell3d); + case 0xB5F74EDB717278ECULL: + return std::make_unique(maxwell3d); + default: + return nullptr; + } +} + +namespace { +class MacroInterpreterImpl final : public CachedMacro { +public: + explicit MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d_, const std::vector& code_) + : CachedMacro(maxwell3d_) + , code{code_} + {} + + void Execute(const std::vector& params, u32 method) override; + +private: + /// Resets the execution engine state, zeroing registers, etc. + void Reset(); + + /** + * Executes a single macro instruction located at the current program counter. Returns whether + * the interpreter should keep running. + * + * @param is_delay_slot Whether the current step is being executed due to a delay slot in a + * previous instruction. + */ + bool Step(bool is_delay_slot); + + /// Calculates the result of an ALU operation. src_a OP src_b; + u32 GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b); + + /// Performs the result operation on the input result and stores it in the specified register + /// (if necessary). + void ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result); + + /// Evaluates the branch condition and returns whether the branch should be taken or not. + bool EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const; + + /// Reads an opcode at the current program counter location. + Macro::Opcode GetOpcode() const; + + /// Returns the specified register's value. Register 0 is hardcoded to always return 0. + u32 GetRegister(u32 register_id) const; + + /// Sets the register to the input value. + void SetRegister(u32 register_id, u32 value); + + /// Sets the method address to use for the next Send instruction. + void SetMethodAddress(u32 address); + + /// Calls a GPU Engine method with the input parameter. + void Send(u32 value); + + /// Reads a GPU register located at the method address. + u32 Read(u32 method) const; + + /// Returns the next parameter in the parameter queue. + u32 FetchParameter(); + + /// Current program counter + u32 pc{}; + /// Program counter to execute at after the delay slot is executed. + std::optional delayed_pc; + + /// General purpose macro registers. + std::array registers = {}; + + /// Method address to use for the next Send instruction. + Macro::MethodAddress method_address = {}; + + /// Input parameters of the current macro. + std::unique_ptr parameters; + std::size_t num_parameters = 0; + std::size_t parameters_capacity = 0; + /// Index of the next parameter that will be fetched by the 'parm' instruction. + u32 next_parameter_index = 0; + + bool carry_flag = false; + const std::vector& code; +}; + +void MacroInterpreterImpl::Execute(const std::vector& params, u32 method) { + Reset(); + + registers[1] = params[0]; + num_parameters = params.size(); + + if (num_parameters > parameters_capacity) { + parameters_capacity = num_parameters; + parameters = std::make_unique(num_parameters); + } + std::memcpy(parameters.get(), params.data(), num_parameters * sizeof(u32)); + + // Execute the code until we hit an exit condition. + bool keep_executing = true; + while (keep_executing) { + keep_executing = Step(false); + } + + // Assert the the macro used all the input parameters + ASSERT(next_parameter_index == num_parameters); +} + +void MacroInterpreterImpl::Reset() { + registers = {}; + pc = 0; + delayed_pc = {}; + method_address.raw = 0; + num_parameters = 0; + // The next parameter index starts at 1, because $r1 already has the value of the first + // parameter. + next_parameter_index = 1; + carry_flag = false; +} + +bool MacroInterpreterImpl::Step(bool is_delay_slot) { + u32 base_address = pc; + + Macro::Opcode opcode = GetOpcode(); + pc += 4; + + // Update the program counter if we were delayed + if (delayed_pc) { + ASSERT(is_delay_slot); + pc = *delayed_pc; + delayed_pc = {}; + } + + switch (opcode.operation) { + case Macro::Operation::ALU: { + u32 result = GetALUResult(opcode.alu_operation, GetRegister(opcode.src_a), + GetRegister(opcode.src_b)); + ProcessResult(opcode.result_operation, opcode.dst, result); + break; + } + case Macro::Operation::AddImmediate: { + ProcessResult(opcode.result_operation, opcode.dst, + GetRegister(opcode.src_a) + opcode.immediate); + break; + } + case Macro::Operation::ExtractInsert: { + u32 dst = GetRegister(opcode.src_a); + u32 src = GetRegister(opcode.src_b); + + src = (src >> opcode.bf_src_bit) & opcode.GetBitfieldMask(); + dst &= ~(opcode.GetBitfieldMask() << opcode.bf_dst_bit); + dst |= src << opcode.bf_dst_bit; + ProcessResult(opcode.result_operation, opcode.dst, dst); + break; + } + case Macro::Operation::ExtractShiftLeftImmediate: { + u32 dst = GetRegister(opcode.src_a); + u32 src = GetRegister(opcode.src_b); + + u32 result = ((src >> dst) & opcode.GetBitfieldMask()) << opcode.bf_dst_bit; + + ProcessResult(opcode.result_operation, opcode.dst, result); + break; + } + case Macro::Operation::ExtractShiftLeftRegister: { + u32 dst = GetRegister(opcode.src_a); + u32 src = GetRegister(opcode.src_b); + + u32 result = ((src >> opcode.bf_src_bit) & opcode.GetBitfieldMask()) << dst; + + ProcessResult(opcode.result_operation, opcode.dst, result); + break; + } + case Macro::Operation::Read: { + u32 result = Read(GetRegister(opcode.src_a) + opcode.immediate); + ProcessResult(opcode.result_operation, opcode.dst, result); + break; + } + case Macro::Operation::Branch: { + ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid"); + u32 value = GetRegister(opcode.src_a); + bool taken = EvaluateBranchCondition(opcode.branch_condition, value); + if (taken) { + // Ignore the delay slot if the branch has the annul bit. + if (opcode.branch_annul) { + pc = base_address + opcode.GetBranchTarget(); + return true; + } + + delayed_pc = base_address + opcode.GetBranchTarget(); + // Execute one more instruction due to the delay slot. + return Step(true); + } + break; + } + default: + UNIMPLEMENTED_MSG("Unimplemented macro operation {}", opcode.operation.Value()); + break; + } + + // An instruction with the Exit flag will not actually + // cause an exit if it's executed inside a delay slot. + if (opcode.is_exit && !is_delay_slot) { + // Exit has a delay slot, execute the next instruction + Step(true); + return false; + } + + return true; +} + +u32 MacroInterpreterImpl::GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b) { + switch (operation) { + case Macro::ALUOperation::Add: { + const u64 result{static_cast(src_a) + src_b}; + carry_flag = result > 0xffffffff; + return static_cast(result); + } + case Macro::ALUOperation::AddWithCarry: { + const u64 result{static_cast(src_a) + src_b + (carry_flag ? 1ULL : 0ULL)}; + carry_flag = result > 0xffffffff; + return static_cast(result); + } + case Macro::ALUOperation::Subtract: { + const u64 result{static_cast(src_a) - src_b}; + carry_flag = result < 0x100000000; + return static_cast(result); + } + case Macro::ALUOperation::SubtractWithBorrow: { + const u64 result{static_cast(src_a) - src_b - (carry_flag ? 0ULL : 1ULL)}; + carry_flag = result < 0x100000000; + return static_cast(result); + } + case Macro::ALUOperation::Xor: + return src_a ^ src_b; + case Macro::ALUOperation::Or: + return src_a | src_b; + case Macro::ALUOperation::And: + return src_a & src_b; + case Macro::ALUOperation::AndNot: + return src_a & ~src_b; + case Macro::ALUOperation::Nand: + return ~(src_a & src_b); + + default: + UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", operation); + return 0; + } +} + +void MacroInterpreterImpl::ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result) { + switch (operation) { + case Macro::ResultOperation::IgnoreAndFetch: + // Fetch parameter and ignore result. + SetRegister(reg, FetchParameter()); + break; + case Macro::ResultOperation::Move: + // Move result. + SetRegister(reg, result); + break; + case Macro::ResultOperation::MoveAndSetMethod: + // Move result and use as Method Address. + SetRegister(reg, result); + SetMethodAddress(result); + break; + case Macro::ResultOperation::FetchAndSend: + // Fetch parameter and send result. + SetRegister(reg, FetchParameter()); + Send(result); + break; + case Macro::ResultOperation::MoveAndSend: + // Move and send result. + SetRegister(reg, result); + Send(result); + break; + case Macro::ResultOperation::FetchAndSetMethod: + // Fetch parameter and use result as Method Address. + SetRegister(reg, FetchParameter()); + SetMethodAddress(result); + break; + case Macro::ResultOperation::MoveAndSetMethodFetchAndSend: + // Move result and use as Method Address, then fetch and send parameter. + SetRegister(reg, result); + SetMethodAddress(result); + Send(FetchParameter()); + break; + case Macro::ResultOperation::MoveAndSetMethodSend: + // Move result and use as Method Address, then send bits 12:17 of result. + SetRegister(reg, result); + SetMethodAddress(result); + Send((result >> 12) & 0b111111); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented result operation {}", operation); + break; + } +} + +bool MacroInterpreterImpl::EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const { + switch (cond) { + case Macro::BranchCondition::Zero: + return value == 0; + case Macro::BranchCondition::NotZero: + return value != 0; + } + UNREACHABLE(); +} + +Macro::Opcode MacroInterpreterImpl::GetOpcode() const { + ASSERT((pc % sizeof(u32)) == 0); + ASSERT(pc < code.size() * sizeof(u32)); + return {code[pc / sizeof(u32)]}; +} + +u32 MacroInterpreterImpl::GetRegister(u32 register_id) const { + return registers.at(register_id); +} + +void MacroInterpreterImpl::SetRegister(u32 register_id, u32 value) { + // Register 0 is hardwired as the zero register. + // Ensure no writes to it actually occur. + if (register_id == 0) { + return; + } + + registers.at(register_id) = value; +} + +void MacroInterpreterImpl::SetMethodAddress(u32 address) { + method_address.raw = address; +} + +void MacroInterpreterImpl::Send(u32 value) { + maxwell3d.CallMethod(method_address.address, value, true); + // Increment the method address by the method increment. + method_address.address.Assign(method_address.address.Value() + + method_address.increment.Value()); +} + +u32 MacroInterpreterImpl::Read(u32 method) const { + return maxwell3d.GetRegisterValue(method); +} + +u32 MacroInterpreterImpl::FetchParameter() { + ASSERT(next_parameter_index < num_parameters); + return parameters[next_parameter_index++]; +} +} // Anonymous namespace + +#ifdef ARCHITECTURE_x86_64 +namespace { +constexpr Xbyak::Reg64 STATE = Xbyak::util::rbx; +constexpr Xbyak::Reg32 RESULT = Xbyak::util::r10d; +constexpr Xbyak::Reg64 MAX_PARAMETER = Xbyak::util::r11; +constexpr Xbyak::Reg64 PARAMETERS = Xbyak::util::r12; +constexpr Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d; +constexpr Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15; + +constexpr std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({ + STATE, + RESULT, + MAX_PARAMETER, + PARAMETERS, + METHOD_ADDRESS, + BRANCH_HOLDER, +}); + +// Arbitrarily chosen based on current booting games. +constexpr size_t MAX_CODE_SIZE = 0x10000; + +std::bitset<32> PersistentCallerSavedRegs() { + return PERSISTENT_REGISTERS & Common::X64::ABI_ALL_CALLER_SAVED; +} + +/// @brief Must enforce W^X constraints, as we yet don't havea global "NO_EXECUTE" support flag +/// the speed loss is minimal, and in fact may be negligible, however for your peace of mind +/// I simply included known OSes whom had W^X issues +#if defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) +static const auto default_cg_mode = Xbyak::DontSetProtectRWE; +#else +static const auto default_cg_mode = nullptr; //Allow RWE +#endif + +class MacroJITx64Impl final : public Xbyak::CodeGenerator, public CachedMacro { +public: + explicit MacroJITx64Impl(Engines::Maxwell3D& maxwell3d_, const std::vector& code_) + : Xbyak::CodeGenerator(MAX_CODE_SIZE, default_cg_mode) + , CachedMacro(maxwell3d_) + , code{code_} + { + Compile(); + } + + void Execute(const std::vector& parameters, u32 method) override; + + void Compile_ALU(Macro::Opcode opcode); + void Compile_AddImmediate(Macro::Opcode opcode); + void Compile_ExtractInsert(Macro::Opcode opcode); + void Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode); + void Compile_ExtractShiftLeftRegister(Macro::Opcode opcode); + void Compile_Read(Macro::Opcode opcode); + void Compile_Branch(Macro::Opcode opcode); + +private: + void Optimizer_ScanFlags(); + + void Compile(); + bool Compile_NextInstruction(); + + Xbyak::Reg32 Compile_FetchParameter(); + Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst); + + void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg); + void Compile_Send(Xbyak::Reg32 value); + + Macro::Opcode GetOpCode() const; + + struct JITState { + Engines::Maxwell3D* maxwell3d{}; + std::array registers{}; + u32 carry_flag{}; + }; + static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0"); + using ProgramType = void (*)(JITState*, const u32*, const u32*); + + struct OptimizerState { + bool can_skip_carry{}; + bool has_delayed_pc{}; + bool zero_reg_skip{}; + bool skip_dummy_addimmediate{}; + bool optimize_for_method_move{}; + bool enable_asserts{}; + }; + OptimizerState optimizer{}; + + std::optional next_opcode{}; + ProgramType program{nullptr}; + + std::array labels; + std::array delay_skip; + Xbyak::Label end_of_code{}; + + bool is_delay_slot{}; + u32 pc{}; + + const std::vector& code; +}; + +void MacroJITx64Impl::Execute(const std::vector& parameters, u32 method) { + ASSERT_OR_EXECUTE(program != nullptr, { return; }); + JITState state{}; + state.maxwell3d = &maxwell3d; + state.registers = {}; + program(&state, parameters.data(), parameters.data() + parameters.size()); +} + +void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) { + const bool is_a_zero = opcode.src_a == 0; + const bool is_b_zero = opcode.src_b == 0; + const bool valid_operation = !is_a_zero && !is_b_zero; + [[maybe_unused]] const bool is_move_operation = !is_a_zero && is_b_zero; + const bool has_zero_register = is_a_zero || is_b_zero; + const bool no_zero_reg_skip = opcode.alu_operation == Macro::ALUOperation::AddWithCarry || + opcode.alu_operation == Macro::ALUOperation::SubtractWithBorrow; + + Xbyak::Reg32 src_a; + Xbyak::Reg32 src_b; + + if (!optimizer.zero_reg_skip || no_zero_reg_skip) { + src_a = Compile_GetRegister(opcode.src_a, RESULT); + src_b = Compile_GetRegister(opcode.src_b, eax); + } else { + if (!is_a_zero) { + src_a = Compile_GetRegister(opcode.src_a, RESULT); + } + if (!is_b_zero) { + src_b = Compile_GetRegister(opcode.src_b, eax); + } + } + + bool has_emitted = false; + + switch (opcode.alu_operation) { + case Macro::ALUOperation::Add: + if (optimizer.zero_reg_skip) { + if (valid_operation) { + add(src_a, src_b); + } + } else { + add(src_a, src_b); + } + + if (!optimizer.can_skip_carry) { + setc(byte[STATE + offsetof(JITState, carry_flag)]); + } + break; + case Macro::ALUOperation::AddWithCarry: + bt(dword[STATE + offsetof(JITState, carry_flag)], 0); + adc(src_a, src_b); + setc(byte[STATE + offsetof(JITState, carry_flag)]); + break; + case Macro::ALUOperation::Subtract: + if (optimizer.zero_reg_skip) { + if (valid_operation) { + sub(src_a, src_b); + has_emitted = true; + } + } else { + sub(src_a, src_b); + has_emitted = true; + } + if (!optimizer.can_skip_carry && has_emitted) { + setc(byte[STATE + offsetof(JITState, carry_flag)]); + } + break; + case Macro::ALUOperation::SubtractWithBorrow: + bt(dword[STATE + offsetof(JITState, carry_flag)], 0); + sbb(src_a, src_b); + setc(byte[STATE + offsetof(JITState, carry_flag)]); + break; + case Macro::ALUOperation::Xor: + if (optimizer.zero_reg_skip) { + if (valid_operation) { + xor_(src_a, src_b); + } + } else { + xor_(src_a, src_b); + } + break; + case Macro::ALUOperation::Or: + if (optimizer.zero_reg_skip) { + if (valid_operation) { + or_(src_a, src_b); + } + } else { + or_(src_a, src_b); + } + break; + case Macro::ALUOperation::And: + if (optimizer.zero_reg_skip) { + if (!has_zero_register) { + and_(src_a, src_b); + } + } else { + and_(src_a, src_b); + } + break; + case Macro::ALUOperation::AndNot: + if (optimizer.zero_reg_skip) { + if (!is_a_zero) { + not_(src_b); + and_(src_a, src_b); + } + } else { + not_(src_b); + and_(src_a, src_b); + } + break; + case Macro::ALUOperation::Nand: + if (optimizer.zero_reg_skip) { + if (!is_a_zero) { + and_(src_a, src_b); + not_(src_a); + } + } else { + and_(src_a, src_b); + not_(src_a); + } + break; + default: + UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", opcode.alu_operation.Value()); + break; + } + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_AddImmediate(Macro::Opcode opcode) { + if (optimizer.skip_dummy_addimmediate) { + // Games tend to use this as an exit instruction placeholder. It's to encode an instruction + // without doing anything. In our case we can just not emit anything. + if (opcode.result_operation == Macro::ResultOperation::Move && opcode.dst == 0) { + return; + } + } + // Check for redundant moves + if (optimizer.optimize_for_method_move && + opcode.result_operation == Macro::ResultOperation::MoveAndSetMethod) { + if (next_opcode.has_value()) { + const auto next = *next_opcode; + if (next.result_operation == Macro::ResultOperation::MoveAndSetMethod && + opcode.dst == next.dst) { + return; + } + } + } + if (optimizer.zero_reg_skip && opcode.src_a == 0) { + if (opcode.immediate == 0) { + xor_(RESULT, RESULT); + } else { + mov(RESULT, opcode.immediate); + } + } else { + auto result = Compile_GetRegister(opcode.src_a, RESULT); + if (opcode.immediate > 2) { + add(result, opcode.immediate); + } else if (opcode.immediate == 1) { + inc(result); + } else if (opcode.immediate < 0) { + sub(result, opcode.immediate * -1); + } + } + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_ExtractInsert(Macro::Opcode opcode) { + auto dst = Compile_GetRegister(opcode.src_a, RESULT); + auto src = Compile_GetRegister(opcode.src_b, eax); + + const u32 mask = ~(opcode.GetBitfieldMask() << opcode.bf_dst_bit); + and_(dst, mask); + shr(src, opcode.bf_src_bit); + and_(src, opcode.GetBitfieldMask()); + shl(src, opcode.bf_dst_bit); + or_(dst, src); + + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) { + const auto dst = Compile_GetRegister(opcode.src_a, ecx); + const auto src = Compile_GetRegister(opcode.src_b, RESULT); + + shr(src, dst.cvt8()); + and_(src, opcode.GetBitfieldMask()); + shl(src, opcode.bf_dst_bit); + + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) { + const auto dst = Compile_GetRegister(opcode.src_a, ecx); + const auto src = Compile_GetRegister(opcode.src_b, RESULT); + + shr(src, opcode.bf_src_bit); + and_(src, opcode.GetBitfieldMask()); + shl(src, dst.cvt8()); + + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) { + if (optimizer.zero_reg_skip && opcode.src_a == 0) { + if (opcode.immediate == 0) { + xor_(RESULT, RESULT); + } else { + mov(RESULT, opcode.immediate); + } + } else { + auto result = Compile_GetRegister(opcode.src_a, RESULT); + if (opcode.immediate > 2) { + add(result, opcode.immediate); + } else if (opcode.immediate == 1) { + inc(result); + } else if (opcode.immediate < 0) { + sub(result, opcode.immediate * -1); + } + } + + // Equivalent to Engines::Maxwell3D::GetRegisterValue: + if (optimizer.enable_asserts) { + Xbyak::Label pass_range_check; + cmp(RESULT, static_cast(Engines::Maxwell3D::Regs::NUM_REGS)); + jb(pass_range_check); + int3(); + L(pass_range_check); + } + mov(rax, qword[STATE]); + mov(RESULT, + dword[rax + offsetof(Engines::Maxwell3D, regs) + + offsetof(Engines::Maxwell3D::Regs, reg_array) + RESULT.cvt64() * sizeof(u32)]); + + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) { + maxwell3d->CallMethod(method_address.address, value, true); +} + +void MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) { + Common::X64::ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); + mov(Common::X64::ABI_PARAM1, qword[STATE]); + mov(Common::X64::ABI_PARAM2.cvt32(), METHOD_ADDRESS); + mov(Common::X64::ABI_PARAM3.cvt32(), value); + Common::X64::CallFarFunction(*this, &Send); + Common::X64::ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); + + Xbyak::Label dont_process{}; + // Get increment + test(METHOD_ADDRESS, 0x3f000); + // If zero, method address doesn't update + je(dont_process); + + mov(ecx, METHOD_ADDRESS); + and_(METHOD_ADDRESS, 0xfff); + shr(ecx, 12); + and_(ecx, 0x3f); + lea(eax, ptr[rcx + METHOD_ADDRESS.cvt64()]); + sal(ecx, 12); + or_(eax, ecx); + + mov(METHOD_ADDRESS, eax); + + L(dont_process); +} + +void MacroJITx64Impl::Compile_Branch(Macro::Opcode opcode) { + ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid"); + const s32 jump_address = + static_cast(pc) + static_cast(opcode.GetBranchTarget() / sizeof(s32)); + + Xbyak::Label end; + auto value = Compile_GetRegister(opcode.src_a, eax); + cmp(value, 0); // test(value, value); + if (optimizer.has_delayed_pc) { + switch (opcode.branch_condition) { + case Macro::BranchCondition::Zero: + jne(end, T_NEAR); + break; + case Macro::BranchCondition::NotZero: + je(end, T_NEAR); + break; + } + + if (opcode.branch_annul) { + xor_(BRANCH_HOLDER, BRANCH_HOLDER); + jmp(labels[jump_address], T_NEAR); + } else { + Xbyak::Label handle_post_exit{}; + Xbyak::Label skip{}; + jmp(skip, T_NEAR); + + L(handle_post_exit); + xor_(BRANCH_HOLDER, BRANCH_HOLDER); + jmp(labels[jump_address], T_NEAR); + + L(skip); + mov(BRANCH_HOLDER, handle_post_exit); + jmp(delay_skip[pc], T_NEAR); + } + } else { + switch (opcode.branch_condition) { + case Macro::BranchCondition::Zero: + je(labels[jump_address], T_NEAR); + break; + case Macro::BranchCondition::NotZero: + jne(labels[jump_address], T_NEAR); + break; + } + } + + L(end); +} + +void MacroJITx64Impl::Optimizer_ScanFlags() { + optimizer.can_skip_carry = true; + optimizer.has_delayed_pc = false; + for (auto raw_op : code) { + Macro::Opcode op{}; + op.raw = raw_op; + + if (op.operation == Macro::Operation::ALU) { + // Scan for any ALU operations which actually use the carry flag, if they don't exist in + // our current code we can skip emitting the carry flag handling operations + if (op.alu_operation == Macro::ALUOperation::AddWithCarry || + op.alu_operation == Macro::ALUOperation::SubtractWithBorrow) { + optimizer.can_skip_carry = false; + } + } + + if (op.operation == Macro::Operation::Branch) { + if (!op.branch_annul) { + optimizer.has_delayed_pc = true; + } + } + } +} + +void MacroJITx64Impl::Compile() { + labels.fill(Xbyak::Label()); + + Common::X64::ABI_PushRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8); + // JIT state + mov(STATE, Common::X64::ABI_PARAM1); + mov(PARAMETERS, Common::X64::ABI_PARAM2); + mov(MAX_PARAMETER, Common::X64::ABI_PARAM3); + xor_(RESULT, RESULT); + xor_(METHOD_ADDRESS, METHOD_ADDRESS); + xor_(BRANCH_HOLDER, BRANCH_HOLDER); + + mov(dword[STATE + offsetof(JITState, registers) + 4], Compile_FetchParameter()); + + // Track get register for zero registers and mark it as no-op + optimizer.zero_reg_skip = true; + + // AddImmediate tends to be used as a NOP instruction, if we detect this we can + // completely skip the entire code path and no emit anything + optimizer.skip_dummy_addimmediate = true; + + // SMO tends to emit a lot of unnecessary method moves, we can mitigate this by only emitting + // one if our register isn't "dirty" + optimizer.optimize_for_method_move = true; + + // Enable run-time assertions in JITted code + optimizer.enable_asserts = false; + + // Check to see if we can skip emitting certain instructions + Optimizer_ScanFlags(); + + const u32 op_count = static_cast(code.size()); + for (u32 i = 0; i < op_count; i++) { + if (i < op_count - 1) { + pc = i + 1; + next_opcode = GetOpCode(); + } else { + next_opcode = {}; + } + pc = i; + Compile_NextInstruction(); + } + + L(end_of_code); + + Common::X64::ABI_PopRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8); + ret(); + ready(); + program = getCode(); +} + +bool MacroJITx64Impl::Compile_NextInstruction() { + const auto opcode = GetOpCode(); + if (labels[pc].getAddress()) { + return false; + } + + L(labels[pc]); + + switch (opcode.operation) { + case Macro::Operation::ALU: + Compile_ALU(opcode); + break; + case Macro::Operation::AddImmediate: + Compile_AddImmediate(opcode); + break; + case Macro::Operation::ExtractInsert: + Compile_ExtractInsert(opcode); + break; + case Macro::Operation::ExtractShiftLeftImmediate: + Compile_ExtractShiftLeftImmediate(opcode); + break; + case Macro::Operation::ExtractShiftLeftRegister: + Compile_ExtractShiftLeftRegister(opcode); + break; + case Macro::Operation::Read: + Compile_Read(opcode); + break; + case Macro::Operation::Branch: + Compile_Branch(opcode); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented opcode {}", opcode.operation.Value()); + break; + } + + if (optimizer.has_delayed_pc) { + if (opcode.is_exit) { + mov(rax, end_of_code); + test(BRANCH_HOLDER, BRANCH_HOLDER); + cmove(BRANCH_HOLDER, rax); + // Jump to next instruction to skip delay slot check + je(labels[pc + 1], T_NEAR); + } else { + // TODO(ogniK): Optimize delay slot branching + Xbyak::Label no_delay_slot{}; + test(BRANCH_HOLDER, BRANCH_HOLDER); + je(no_delay_slot, T_NEAR); + mov(rax, BRANCH_HOLDER); + xor_(BRANCH_HOLDER, BRANCH_HOLDER); + jmp(rax); + L(no_delay_slot); + } + L(delay_skip[pc]); + if (opcode.is_exit) { + return false; + } + } else { + test(BRANCH_HOLDER, BRANCH_HOLDER); + jne(end_of_code, T_NEAR); + if (opcode.is_exit) { + inc(BRANCH_HOLDER); + return false; + } + } + return true; +} + +static void WarnInvalidParameter(uintptr_t parameter, uintptr_t max_parameter) { + LOG_CRITICAL(HW_GPU, + "Macro JIT: invalid parameter access 0x{:x} (0x{:x} is the last parameter)", + parameter, max_parameter - sizeof(u32)); +} + +Xbyak::Reg32 MacroJITx64Impl::Compile_FetchParameter() { + Xbyak::Label parameter_ok{}; + cmp(PARAMETERS, MAX_PARAMETER); + jb(parameter_ok, T_NEAR); + Common::X64::ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); + mov(Common::X64::ABI_PARAM1, PARAMETERS); + mov(Common::X64::ABI_PARAM2, MAX_PARAMETER); + Common::X64::CallFarFunction(*this, &WarnInvalidParameter); + Common::X64::ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); + L(parameter_ok); + mov(eax, dword[PARAMETERS]); + add(PARAMETERS, sizeof(u32)); + return eax; +} + +Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) { + if (index == 0) { + // Register 0 is always zero + xor_(dst, dst); + } else { + mov(dst, dword[STATE + offsetof(JITState, registers) + index * sizeof(u32)]); + } + + return dst; +} + +void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) { + const auto SetRegister = [this](u32 reg_index, const Xbyak::Reg32& result) { + // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero + // register. + if (reg_index == 0) { + return; + } + mov(dword[STATE + offsetof(JITState, registers) + reg_index * sizeof(u32)], result); + }; + const auto SetMethodAddress = [this](const Xbyak::Reg32& reg32) { mov(METHOD_ADDRESS, reg32); }; + + switch (operation) { + case Macro::ResultOperation::IgnoreAndFetch: + SetRegister(reg, Compile_FetchParameter()); + break; + case Macro::ResultOperation::Move: + SetRegister(reg, RESULT); + break; + case Macro::ResultOperation::MoveAndSetMethod: + SetRegister(reg, RESULT); + SetMethodAddress(RESULT); + break; + case Macro::ResultOperation::FetchAndSend: + // Fetch parameter and send result. + SetRegister(reg, Compile_FetchParameter()); + Compile_Send(RESULT); + break; + case Macro::ResultOperation::MoveAndSend: + // Move and send result. + SetRegister(reg, RESULT); + Compile_Send(RESULT); + break; + case Macro::ResultOperation::FetchAndSetMethod: + // Fetch parameter and use result as Method Address. + SetRegister(reg, Compile_FetchParameter()); + SetMethodAddress(RESULT); + break; + case Macro::ResultOperation::MoveAndSetMethodFetchAndSend: + // Move result and use as Method Address, then fetch and send parameter. + SetRegister(reg, RESULT); + SetMethodAddress(RESULT); + Compile_Send(Compile_FetchParameter()); + break; + case Macro::ResultOperation::MoveAndSetMethodSend: + // Move result and use as Method Address, then send bits 12:17 of result. + SetRegister(reg, RESULT); + SetMethodAddress(RESULT); + shr(RESULT, 12); + and_(RESULT, 0b111111); + Compile_Send(RESULT); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented macro operation {}", operation); + break; + } +} + +Macro::Opcode MacroJITx64Impl::GetOpCode() const { + ASSERT(pc < code.size()); + return {code[pc]}; +} +} // Anonymous namespace +#endif + +static void Dump(u64 hash, std::span code, bool decompiled = false) { + const auto base_dir{Common::FS::GetEdenPath(Common::FS::EdenPath::DumpDir)}; + const auto macro_dir{base_dir / "macros"}; + if (!Common::FS::CreateDir(base_dir) || !Common::FS::CreateDir(macro_dir)) { + LOG_ERROR(Common_Filesystem, "Failed to create macro dump directories"); + return; + } + auto name{macro_dir / fmt::format("{:016x}.macro", hash)}; + + if (decompiled) { + auto new_name{macro_dir / fmt::format("decompiled_{:016x}.macro", hash)}; + if (Common::FS::Exists(name)) { + (void)Common::FS::RenameFile(name, new_name); + return; + } + name = new_name; + } + + std::fstream macro_file(name, std::ios::out | std::ios::binary); + if (!macro_file) { + LOG_ERROR(Common_Filesystem, "Unable to open or create file at {}", Common::FS::PathToUTF8String(name)); + return; + } + macro_file.write(reinterpret_cast(code.data()), code.size_bytes()); +} + +MacroEngine::MacroEngine(Engines::Maxwell3D& maxwell3d_, bool is_interpreted_) + : hle_macros{std::make_optional(maxwell3d_)} + , maxwell3d{maxwell3d_} + , is_interpreted{is_interpreted_} +{} + +MacroEngine::~MacroEngine() = default; + +void MacroEngine::AddCode(u32 method, u32 data) { + uploaded_macro_code[method].push_back(data); +} + +void MacroEngine::ClearCode(u32 method) { + macro_cache.erase(method); + uploaded_macro_code.erase(method); +} + +void MacroEngine::Execute(u32 method, const std::vector& parameters) { + auto compiled_macro = macro_cache.find(method); + if (compiled_macro != macro_cache.end()) { + const auto& cache_info = compiled_macro->second; + if (cache_info.has_hle_program) { + cache_info.hle_program->Execute(parameters, method); + } else { + maxwell3d.RefreshParameters(); + cache_info.lle_program->Execute(parameters, method); + } + } else { + // Macro not compiled, check if it's uploaded and if so, compile it + std::optional mid_method; + const auto macro_code = uploaded_macro_code.find(method); + if (macro_code == uploaded_macro_code.end()) { + for (const auto& [method_base, code] : uploaded_macro_code) { + if (method >= method_base && (method - method_base) < code.size()) { + mid_method = method_base; + break; + } + } + if (!mid_method.has_value()) { + ASSERT_MSG(false, "Macro 0x{0:x} was not uploaded", method); + return; + } + } + auto& cache_info = macro_cache[method]; + + if (!mid_method.has_value()) { + cache_info.lle_program = Compile(macro_code->second); + cache_info.hash = Common::HashValue(macro_code->second); + } else { + const auto& macro_cached = uploaded_macro_code[mid_method.value()]; + const auto rebased_method = method - mid_method.value(); + auto& code = uploaded_macro_code[method]; + code.resize(macro_cached.size() - rebased_method); + std::memcpy(code.data(), macro_cached.data() + rebased_method, code.size() * sizeof(u32)); + cache_info.hash = Common::HashValue(code); + cache_info.lle_program = Compile(code); + } + + auto hle_program = hle_macros->GetHLEProgram(cache_info.hash); + if (!hle_program || Settings::values.disable_macro_hle) { + maxwell3d.RefreshParameters(); + cache_info.lle_program->Execute(parameters, method); + } else { + cache_info.has_hle_program = true; + cache_info.hle_program = std::move(hle_program); + cache_info.hle_program->Execute(parameters, method); + } + + if (Settings::values.dump_macros) { + Dump(cache_info.hash, macro_code->second, cache_info.has_hle_program); + } + } +} + +std::unique_ptr MacroEngine::Compile(const std::vector& code) { + if (is_interpreted) + return std::make_unique(maxwell3d, code); + return std::make_unique(maxwell3d, code); +} + +std::optional GetMacroEngine(Engines::Maxwell3D& maxwell3d) { +#ifdef ARCHITECTURE_x86_64 + return std::make_optional(maxwell3d, bool(Settings::values.disable_macro_jit)); +#else + return std::make_optional(maxwell3d, true); +#endif +} + +} // namespace Tegra diff --git a/src/video_core/macro/macro.h b/src/video_core/macro.h similarity index 76% rename from src/video_core/macro/macro.h rename to src/video_core/macro.h index 737ced9a45..b580d6f5ca 100644 --- a/src/video_core/macro/macro.h +++ b/src/video_core/macro.h @@ -95,24 +95,34 @@ union MethodAddress { } // namespace Macro -class HLEMacro; - class CachedMacro { public: - virtual ~CachedMacro() = default; - /** - * Executes the macro code with the specified input parameters. - * - * @param parameters The parameters of the macro - * @param method The method to execute - */ + CachedMacro(Engines::Maxwell3D& maxwell3d) + : maxwell3d{maxwell3d} + {} + virtual ~CachedMacro() = 0; + /// Executes the macro code with the specified input parameters. + /// @param parameters The parameters of the macro + /// @param method The method to execute virtual void Execute(const std::vector& parameters, u32 method) = 0; + Engines::Maxwell3D& maxwell3d; +}; + +class HLEMacro { +public: + explicit HLEMacro(Engines::Maxwell3D& maxwell3d_); + ~HLEMacro(); + // Allocates and returns a cached macro if the hash matches a known function. + // Returns nullptr otherwise. + [[nodiscard]] std::unique_ptr GetHLEProgram(u64 hash) const; +private: + Engines::Maxwell3D& maxwell3d; }; class MacroEngine { public: - explicit MacroEngine(Engines::Maxwell3D& maxwell3d); - virtual ~MacroEngine(); + explicit MacroEngine(Engines::Maxwell3D& maxwell3d, bool is_interpreted); + ~MacroEngine(); // Store the uploaded macro code to compile them when they're called. void AddCode(u32 method, u32 data); @@ -124,7 +134,7 @@ public: void Execute(u32 method, const std::vector& parameters); protected: - virtual std::unique_ptr Compile(const std::vector& code) = 0; + std::unique_ptr Compile(const std::vector& code); private: struct CacheInfo { @@ -136,10 +146,11 @@ private: std::unordered_map macro_cache; std::unordered_map> uploaded_macro_code; - std::unique_ptr hle_macros; + std::optional hle_macros; Engines::Maxwell3D& maxwell3d; + bool is_interpreted; }; -std::unique_ptr GetMacroEngine(Engines::Maxwell3D& maxwell3d); +std::optional GetMacroEngine(Engines::Maxwell3D& maxwell3d); } // namespace Tegra diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp deleted file mode 100644 index 2ff5e21c5e..0000000000 --- a/src/video_core/macro/macro.cpp +++ /dev/null @@ -1,140 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project -// SPDX-License-Identifier: GPL-3.0-or-later - -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#include -#include -#include -#include - -#include "common/container_hash.h" - -#include -#include "common/assert.h" -#include "common/fs/fs.h" -#include "common/fs/path_util.h" -#include "common/settings.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/macro/macro.h" -#include "video_core/macro/macro_hle.h" -#include "video_core/macro/macro_interpreter.h" - -#ifdef ARCHITECTURE_x86_64 -#include "video_core/macro/macro_jit_x64.h" -#endif - -namespace Tegra { - -static void Dump(u64 hash, std::span code, bool decompiled = false) { - const auto base_dir{Common::FS::GetEdenPath(Common::FS::EdenPath::DumpDir)}; - const auto macro_dir{base_dir / "macros"}; - if (!Common::FS::CreateDir(base_dir) || !Common::FS::CreateDir(macro_dir)) { - LOG_ERROR(Common_Filesystem, "Failed to create macro dump directories"); - return; - } - auto name{macro_dir / fmt::format("{:016x}.macro", hash)}; - - if (decompiled) { - auto new_name{macro_dir / fmt::format("decompiled_{:016x}.macro", hash)}; - if (Common::FS::Exists(name)) { - (void)Common::FS::RenameFile(name, new_name); - return; - } - name = new_name; - } - - std::fstream macro_file(name, std::ios::out | std::ios::binary); - if (!macro_file) { - LOG_ERROR(Common_Filesystem, "Unable to open or create file at {}", - Common::FS::PathToUTF8String(name)); - return; - } - macro_file.write(reinterpret_cast(code.data()), code.size_bytes()); -} - -MacroEngine::MacroEngine(Engines::Maxwell3D& maxwell3d_) - : hle_macros{std::make_unique(maxwell3d_)}, maxwell3d{maxwell3d_} {} - -MacroEngine::~MacroEngine() = default; - -void MacroEngine::AddCode(u32 method, u32 data) { - uploaded_macro_code[method].push_back(data); -} - -void MacroEngine::ClearCode(u32 method) { - macro_cache.erase(method); - uploaded_macro_code.erase(method); -} - -void MacroEngine::Execute(u32 method, const std::vector& parameters) { - auto compiled_macro = macro_cache.find(method); - if (compiled_macro != macro_cache.end()) { - const auto& cache_info = compiled_macro->second; - if (cache_info.has_hle_program) { - cache_info.hle_program->Execute(parameters, method); - } else { - maxwell3d.RefreshParameters(); - cache_info.lle_program->Execute(parameters, method); - } - } else { - // Macro not compiled, check if it's uploaded and if so, compile it - std::optional mid_method; - const auto macro_code = uploaded_macro_code.find(method); - if (macro_code == uploaded_macro_code.end()) { - for (const auto& [method_base, code] : uploaded_macro_code) { - if (method >= method_base && (method - method_base) < code.size()) { - mid_method = method_base; - break; - } - } - if (!mid_method.has_value()) { - ASSERT_MSG(false, "Macro 0x{0:x} was not uploaded", method); - return; - } - } - auto& cache_info = macro_cache[method]; - - if (!mid_method.has_value()) { - cache_info.lle_program = Compile(macro_code->second); - cache_info.hash = Common::HashValue(macro_code->second); - } else { - const auto& macro_cached = uploaded_macro_code[mid_method.value()]; - const auto rebased_method = method - mid_method.value(); - auto& code = uploaded_macro_code[method]; - code.resize(macro_cached.size() - rebased_method); - std::memcpy(code.data(), macro_cached.data() + rebased_method, - code.size() * sizeof(u32)); - cache_info.hash = Common::HashValue(code); - cache_info.lle_program = Compile(code); - } - - auto hle_program = hle_macros->GetHLEProgram(cache_info.hash); - if (!hle_program || Settings::values.disable_macro_hle) { - maxwell3d.RefreshParameters(); - cache_info.lle_program->Execute(parameters, method); - } else { - cache_info.has_hle_program = true; - cache_info.hle_program = std::move(hle_program); - cache_info.hle_program->Execute(parameters, method); - } - - if (Settings::values.dump_macros) { - Dump(cache_info.hash, macro_code->second, cache_info.has_hle_program); - } - } -} - -std::unique_ptr GetMacroEngine(Engines::Maxwell3D& maxwell3d) { - if (Settings::values.disable_macro_jit) { - return std::make_unique(maxwell3d); - } -#ifdef ARCHITECTURE_x86_64 - return std::make_unique(maxwell3d); -#else - return std::make_unique(maxwell3d); -#endif -} - -} // namespace Tegra diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp deleted file mode 100644 index 2f41e806c2..0000000000 --- a/src/video_core/macro/macro_hle.cpp +++ /dev/null @@ -1,606 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project -// SPDX-License-Identifier: GPL-3.0-or-later - -// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project -// SPDX-License-Identifier: GPL-3.0-or-later - -#include -#include -#include "common/assert.h" -#include "common/scope_exit.h" -#include "video_core/dirty_flags.h" -#include "video_core/engines/draw_manager.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/macro/macro.h" -#include "video_core/macro/macro_hle.h" -#include "video_core/memory_manager.h" -#include "video_core/rasterizer_interface.h" - -namespace Tegra { - -using Maxwell3D = Engines::Maxwell3D; - -namespace { - -bool IsTopologySafe(Maxwell3D::Regs::PrimitiveTopology topology) { - switch (topology) { - case Maxwell3D::Regs::PrimitiveTopology::Points: - case Maxwell3D::Regs::PrimitiveTopology::Lines: - case Maxwell3D::Regs::PrimitiveTopology::LineLoop: - case Maxwell3D::Regs::PrimitiveTopology::LineStrip: - case Maxwell3D::Regs::PrimitiveTopology::Triangles: - case Maxwell3D::Regs::PrimitiveTopology::TriangleStrip: - case Maxwell3D::Regs::PrimitiveTopology::TriangleFan: - case Maxwell3D::Regs::PrimitiveTopology::LinesAdjacency: - case Maxwell3D::Regs::PrimitiveTopology::LineStripAdjacency: - case Maxwell3D::Regs::PrimitiveTopology::TrianglesAdjacency: - case Maxwell3D::Regs::PrimitiveTopology::TriangleStripAdjacency: - case Maxwell3D::Regs::PrimitiveTopology::Patches: - return true; - case Maxwell3D::Regs::PrimitiveTopology::Quads: - case Maxwell3D::Regs::PrimitiveTopology::QuadStrip: - case Maxwell3D::Regs::PrimitiveTopology::Polygon: - default: - return false; - } -} - -class HLEMacroImpl : public CachedMacro { -public: - explicit HLEMacroImpl(Maxwell3D& maxwell3d_) : maxwell3d{maxwell3d_} {} - -protected: - Maxwell3D& maxwell3d; -}; - -/* - * @note: these macros have two versions, a normal and extended version, with the extended version - * also assigning the base vertex/instance. - */ -template -class HLE_DrawArraysIndirect final : public HLEMacroImpl { -public: - explicit HLE_DrawArraysIndirect(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} - - void Execute(const std::vector& parameters, [[maybe_unused]] u32 method) override { - auto topology = static_cast(parameters[0]); - if (!maxwell3d.AnyParametersDirty() || !IsTopologySafe(topology)) { - Fallback(parameters); - return; - } - - auto& params = maxwell3d.draw_manager->GetIndirectParams(); - params.is_byte_count = false; - params.is_indexed = false; - params.include_count = false; - params.count_start_address = 0; - params.indirect_start_address = maxwell3d.GetMacroAddress(1); - params.buffer_size = 4 * sizeof(u32); - params.max_draw_counts = 1; - params.stride = 0; - - if constexpr (extended) { - maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro; - maxwell3d.SetHLEReplacementAttributeType( - 0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseInstance); - } - - maxwell3d.draw_manager->DrawArrayIndirect(topology); - - if constexpr (extended) { - maxwell3d.engine_state = Maxwell3D::EngineHint::None; - maxwell3d.replace_table.clear(); - } - } - -private: - void Fallback(const std::vector& parameters) { - SCOPE_EXIT { - if (extended) { - maxwell3d.engine_state = Maxwell3D::EngineHint::None; - maxwell3d.replace_table.clear(); - } - }; - maxwell3d.RefreshParameters(); - const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]); - - auto topology = static_cast(parameters[0]); - const u32 vertex_first = parameters[3]; - const u32 vertex_count = parameters[1]; - - if (!IsTopologySafe(topology) && - static_cast(maxwell3d.GetMaxCurrentVertices()) < - static_cast(vertex_first) + static_cast(vertex_count)) { - ASSERT_MSG(false, "Faulty draw!"); - return; - } - - const u32 base_instance = parameters[4]; - if constexpr (extended) { - maxwell3d.regs.global_base_instance_index = base_instance; - maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro; - maxwell3d.SetHLEReplacementAttributeType( - 0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseInstance); - } - - maxwell3d.draw_manager->DrawArray(topology, vertex_first, vertex_count, base_instance, - instance_count); - - if constexpr (extended) { - maxwell3d.regs.global_base_instance_index = 0; - maxwell3d.engine_state = Maxwell3D::EngineHint::None; - maxwell3d.replace_table.clear(); - } - } -}; - -/* - * @note: these macros have two versions, a normal and extended version, with the extended version - * also assigning the base vertex/instance. - */ -template -class HLE_DrawIndexedIndirect final : public HLEMacroImpl { -public: - explicit HLE_DrawIndexedIndirect(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} - - void Execute(const std::vector& parameters, [[maybe_unused]] u32 method) override { - auto topology = static_cast(parameters[0]); - if (!maxwell3d.AnyParametersDirty() || !IsTopologySafe(topology)) { - Fallback(parameters); - return; - } - - const u32 estimate = static_cast(maxwell3d.EstimateIndexBufferSize()); - const u32 element_base = parameters[4]; - const u32 base_instance = parameters[5]; - maxwell3d.regs.vertex_id_base = element_base; - maxwell3d.regs.global_base_vertex_index = element_base; - maxwell3d.regs.global_base_instance_index = base_instance; - maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; - if constexpr (extended) { - maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro; - maxwell3d.SetHLEReplacementAttributeType( - 0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseVertex); - maxwell3d.SetHLEReplacementAttributeType( - 0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance); - } - auto& params = maxwell3d.draw_manager->GetIndirectParams(); - params.is_byte_count = false; - params.is_indexed = true; - params.include_count = false; - params.count_start_address = 0; - params.indirect_start_address = maxwell3d.GetMacroAddress(1); - params.buffer_size = 5 * sizeof(u32); - params.max_draw_counts = 1; - params.stride = 0; - maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; - maxwell3d.draw_manager->DrawIndexedIndirect(topology, 0, estimate); - maxwell3d.regs.vertex_id_base = 0x0; - maxwell3d.regs.global_base_vertex_index = 0x0; - maxwell3d.regs.global_base_instance_index = 0x0; - if constexpr (extended) { - maxwell3d.engine_state = Maxwell3D::EngineHint::None; - maxwell3d.replace_table.clear(); - } - } - -private: - void Fallback(const std::vector& parameters) { - maxwell3d.RefreshParameters(); - const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]); - const u32 element_base = parameters[4]; - const u32 base_instance = parameters[5]; - maxwell3d.regs.vertex_id_base = element_base; - maxwell3d.regs.global_base_vertex_index = element_base; - maxwell3d.regs.global_base_instance_index = base_instance; - maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; - if constexpr (extended) { - maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro; - maxwell3d.SetHLEReplacementAttributeType( - 0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseVertex); - maxwell3d.SetHLEReplacementAttributeType( - 0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance); - } - - maxwell3d.draw_manager->DrawIndex( - static_cast(parameters[0]), parameters[3], - parameters[1], element_base, base_instance, instance_count); - - maxwell3d.regs.vertex_id_base = 0x0; - maxwell3d.regs.global_base_vertex_index = 0x0; - maxwell3d.regs.global_base_instance_index = 0x0; - if constexpr (extended) { - maxwell3d.engine_state = Maxwell3D::EngineHint::None; - maxwell3d.replace_table.clear(); - } - } -}; - -class HLE_MultiLayerClear final : public HLEMacroImpl { -public: - explicit HLE_MultiLayerClear(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} - - void Execute(const std::vector& parameters, [[maybe_unused]] u32 method) override { - maxwell3d.RefreshParameters(); - ASSERT(parameters.size() == 1); - - const Maxwell3D::Regs::ClearSurface clear_params{parameters[0]}; - const u32 rt_index = clear_params.RT; - const u32 num_layers = maxwell3d.regs.rt[rt_index].depth; - ASSERT(clear_params.layer == 0); - - maxwell3d.regs.clear_surface.raw = clear_params.raw; - maxwell3d.draw_manager->Clear(num_layers); - } -}; - -class HLE_MultiDrawIndexedIndirectCount final : public HLEMacroImpl { -public: - explicit HLE_MultiDrawIndexedIndirectCount(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} - - void Execute(const std::vector& parameters, [[maybe_unused]] u32 method) override { - const auto topology = static_cast(parameters[2]); - if (!IsTopologySafe(topology)) { - Fallback(parameters); - return; - } - - const u32 start_indirect = parameters[0]; - const u32 end_indirect = parameters[1]; - if (start_indirect >= end_indirect) { - // Nothing to do. - return; - } - - const u32 padding = parameters[3]; // padding is in words - - // size of each indirect segment - const u32 indirect_words = 5 + padding; - const u32 stride = indirect_words * sizeof(u32); - const std::size_t draw_count = end_indirect - start_indirect; - const u32 estimate = static_cast(maxwell3d.EstimateIndexBufferSize()); - maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; - auto& params = maxwell3d.draw_manager->GetIndirectParams(); - params.is_byte_count = false; - params.is_indexed = true; - params.include_count = true; - params.count_start_address = maxwell3d.GetMacroAddress(4); - params.indirect_start_address = maxwell3d.GetMacroAddress(5); - params.buffer_size = stride * draw_count; - params.max_draw_counts = draw_count; - params.stride = stride; - maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; - maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro; - maxwell3d.SetHLEReplacementAttributeType( - 0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseVertex); - maxwell3d.SetHLEReplacementAttributeType( - 0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance); - maxwell3d.SetHLEReplacementAttributeType(0, 0x648, - Maxwell3D::HLEReplacementAttributeType::DrawID); - maxwell3d.draw_manager->DrawIndexedIndirect(topology, 0, estimate); - maxwell3d.engine_state = Maxwell3D::EngineHint::None; - maxwell3d.replace_table.clear(); - } - -private: - void Fallback(const std::vector& parameters) { - SCOPE_EXIT { - // Clean everything. - maxwell3d.regs.vertex_id_base = 0x0; - maxwell3d.engine_state = Maxwell3D::EngineHint::None; - maxwell3d.replace_table.clear(); - }; - maxwell3d.RefreshParameters(); - const u32 start_indirect = parameters[0]; - const u32 end_indirect = parameters[1]; - if (start_indirect >= end_indirect) { - // Nothing to do. - return; - } - const auto topology = static_cast(parameters[2]); - const u32 padding = parameters[3]; - const std::size_t max_draws = parameters[4]; - - const u32 indirect_words = 5 + padding; - const std::size_t first_draw = start_indirect; - const std::size_t effective_draws = end_indirect - start_indirect; - const std::size_t last_draw = start_indirect + (std::min)(effective_draws, max_draws); - - for (std::size_t index = first_draw; index < last_draw; index++) { - const std::size_t base = index * indirect_words + 5; - const u32 base_vertex = parameters[base + 3]; - const u32 base_instance = parameters[base + 4]; - maxwell3d.regs.vertex_id_base = base_vertex; - maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro; - maxwell3d.SetHLEReplacementAttributeType( - 0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseVertex); - maxwell3d.SetHLEReplacementAttributeType( - 0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance); - maxwell3d.CallMethod(0x8e3, 0x648, true); - maxwell3d.CallMethod(0x8e4, static_cast(index), true); - maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; - maxwell3d.draw_manager->DrawIndex(topology, parameters[base + 2], parameters[base], - base_vertex, base_instance, parameters[base + 1]); - } - } -}; - -class HLE_DrawIndirectByteCount final : public HLEMacroImpl { -public: - explicit HLE_DrawIndirectByteCount(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} - - void Execute(const std::vector& parameters, [[maybe_unused]] u32 method) override { - const bool force = maxwell3d.Rasterizer().HasDrawTransformFeedback(); - - auto topology = static_cast(parameters[0] & 0xFFFFU); - if (!force && (!maxwell3d.AnyParametersDirty() || !IsTopologySafe(topology))) { - Fallback(parameters); - return; - } - auto& params = maxwell3d.draw_manager->GetIndirectParams(); - params.is_byte_count = true; - params.is_indexed = false; - params.include_count = false; - params.count_start_address = 0; - params.indirect_start_address = maxwell3d.GetMacroAddress(2); - params.buffer_size = 4; - params.max_draw_counts = 1; - params.stride = parameters[1]; - maxwell3d.regs.draw.begin = parameters[0]; - maxwell3d.regs.draw_auto_stride = parameters[1]; - maxwell3d.regs.draw_auto_byte_count = parameters[2]; - - maxwell3d.draw_manager->DrawArrayIndirect(topology); - } - -private: - void Fallback(const std::vector& parameters) { - maxwell3d.RefreshParameters(); - - maxwell3d.regs.draw.begin = parameters[0]; - maxwell3d.regs.draw_auto_stride = parameters[1]; - maxwell3d.regs.draw_auto_byte_count = parameters[2]; - - maxwell3d.draw_manager->DrawArray( - maxwell3d.regs.draw.topology, 0, - maxwell3d.regs.draw_auto_byte_count / maxwell3d.regs.draw_auto_stride, 0, 1); - } -}; - -class HLE_C713C83D8F63CCF3 final : public HLEMacroImpl { -public: - explicit HLE_C713C83D8F63CCF3(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} - - void Execute(const std::vector& parameters, [[maybe_unused]] u32 method) override { - maxwell3d.RefreshParameters(); - const u32 offset = (parameters[0] & 0x3FFFFFFF) << 2; - const u32 address = maxwell3d.regs.shadow_scratch[24]; - auto& const_buffer = maxwell3d.regs.const_buffer; - const_buffer.size = 0x7000; - const_buffer.address_high = (address >> 24) & 0xFF; - const_buffer.address_low = address << 8; - const_buffer.offset = offset; - } -}; - -class HLE_D7333D26E0A93EDE final : public HLEMacroImpl { -public: - explicit HLE_D7333D26E0A93EDE(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} - - void Execute(const std::vector& parameters, [[maybe_unused]] u32 method) override { - maxwell3d.RefreshParameters(); - const size_t index = parameters[0]; - const u32 address = maxwell3d.regs.shadow_scratch[42 + index]; - const u32 size = maxwell3d.regs.shadow_scratch[47 + index]; - auto& const_buffer = maxwell3d.regs.const_buffer; - const_buffer.size = size; - const_buffer.address_high = (address >> 24) & 0xFF; - const_buffer.address_low = address << 8; - } -}; - -class HLE_BindShader final : public HLEMacroImpl { -public: - explicit HLE_BindShader(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} - - void Execute(const std::vector& parameters, [[maybe_unused]] u32 method) override { - maxwell3d.RefreshParameters(); - auto& regs = maxwell3d.regs; - const u32 index = parameters[0]; - if ((parameters[1] - regs.shadow_scratch[28 + index]) == 0) { - return; - } - - regs.pipelines[index & 0xF].offset = parameters[2]; - maxwell3d.dirty.flags[VideoCommon::Dirty::Shaders] = true; - regs.shadow_scratch[28 + index] = parameters[1]; - regs.shadow_scratch[34 + index] = parameters[2]; - - const u32 address = parameters[4]; - auto& const_buffer = regs.const_buffer; - const_buffer.size = 0x10000; - const_buffer.address_high = (address >> 24) & 0xFF; - const_buffer.address_low = address << 8; - - const size_t bind_group_id = parameters[3] & 0x7F; - auto& bind_group = regs.bind_groups[bind_group_id]; - bind_group.raw_config = 0x11; - maxwell3d.ProcessCBBind(bind_group_id); - } -}; - -class HLE_SetRasterBoundingBox final : public HLEMacroImpl { -public: - explicit HLE_SetRasterBoundingBox(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} - - void Execute(const std::vector& parameters, [[maybe_unused]] u32 method) override { - maxwell3d.RefreshParameters(); - const u32 raster_mode = parameters[0]; - auto& regs = maxwell3d.regs; - const u32 raster_enabled = maxwell3d.regs.conservative_raster_enable; - const u32 scratch_data = maxwell3d.regs.shadow_scratch[52]; - regs.raster_bounding_box.raw = raster_mode & 0xFFFFF00F; - regs.raster_bounding_box.pad.Assign(scratch_data & raster_enabled); - } -}; - -template -class HLE_ClearConstBuffer final : public HLEMacroImpl { -public: - explicit HLE_ClearConstBuffer(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} - - void Execute(const std::vector& parameters, [[maybe_unused]] u32 method) override { - maxwell3d.RefreshParameters(); - static constexpr std::array zeroes{}; - auto& regs = maxwell3d.regs; - regs.const_buffer.size = static_cast(base_size); - regs.const_buffer.address_high = parameters[0]; - regs.const_buffer.address_low = parameters[1]; - regs.const_buffer.offset = 0; - maxwell3d.ProcessCBMultiData(zeroes.data(), parameters[2] * 4); - } -}; - -class HLE_ClearMemory final : public HLEMacroImpl { -public: - explicit HLE_ClearMemory(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} - - void Execute(const std::vector& parameters, [[maybe_unused]] u32 method) override { - maxwell3d.RefreshParameters(); - - const u32 needed_memory = parameters[2] / sizeof(u32); - if (needed_memory > zero_memory.size()) { - zero_memory.resize(needed_memory, 0); - } - auto& regs = maxwell3d.regs; - regs.upload.line_length_in = parameters[2]; - regs.upload.line_count = 1; - regs.upload.dest.address_high = parameters[0]; - regs.upload.dest.address_low = parameters[1]; - maxwell3d.CallMethod(static_cast(MAXWELL3D_REG_INDEX(launch_dma)), 0x1011, true); - maxwell3d.CallMultiMethod(static_cast(MAXWELL3D_REG_INDEX(inline_data)), - zero_memory.data(), needed_memory, needed_memory); - } - -private: - std::vector zero_memory; -}; - -class HLE_TransformFeedbackSetup final : public HLEMacroImpl { -public: - explicit HLE_TransformFeedbackSetup(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} - - void Execute(const std::vector& parameters, [[maybe_unused]] u32 method) override { - maxwell3d.RefreshParameters(); - - auto& regs = maxwell3d.regs; - regs.transform_feedback_enabled = 1; - regs.transform_feedback.buffers[0].start_offset = 0; - regs.transform_feedback.buffers[1].start_offset = 0; - regs.transform_feedback.buffers[2].start_offset = 0; - regs.transform_feedback.buffers[3].start_offset = 0; - - regs.upload.line_length_in = 4; - regs.upload.line_count = 1; - regs.upload.dest.address_high = parameters[0]; - regs.upload.dest.address_low = parameters[1]; - maxwell3d.CallMethod(static_cast(MAXWELL3D_REG_INDEX(launch_dma)), 0x1011, true); - maxwell3d.CallMethod(static_cast(MAXWELL3D_REG_INDEX(inline_data)), - regs.transform_feedback.controls[0].stride, true); - - maxwell3d.Rasterizer().RegisterTransformFeedback(regs.upload.dest.Address()); - } -}; - -} // Anonymous namespace - -HLEMacro::HLEMacro(Maxwell3D& maxwell3d_) : maxwell3d{maxwell3d_} { - builders.emplace(0x0D61FC9FAAC9FCADULL, - std::function(Maxwell3D&)>( - [](Maxwell3D& maxwell3d__) -> std::unique_ptr { - return std::make_unique>(maxwell3d__); - })); - builders.emplace(0x8A4D173EB99A8603ULL, - std::function(Maxwell3D&)>( - [](Maxwell3D& maxwell3d__) -> std::unique_ptr { - return std::make_unique>(maxwell3d__); - })); - builders.emplace(0x771BB18C62444DA0ULL, - std::function(Maxwell3D&)>( - [](Maxwell3D& maxwell3d__) -> std::unique_ptr { - return std::make_unique>(maxwell3d__); - })); - builders.emplace(0x0217920100488FF7ULL, - std::function(Maxwell3D&)>( - [](Maxwell3D& maxwell3d__) -> std::unique_ptr { - return std::make_unique>(maxwell3d__); - })); - builders.emplace(0x3F5E74B9C9A50164ULL, - std::function(Maxwell3D&)>( - [](Maxwell3D& maxwell3d__) -> std::unique_ptr { - return std::make_unique( - maxwell3d__); - })); - builders.emplace(0xEAD26C3E2109B06BULL, - std::function(Maxwell3D&)>( - [](Maxwell3D& maxwell3d__) -> std::unique_ptr { - return std::make_unique(maxwell3d__); - })); - builders.emplace(0xC713C83D8F63CCF3ULL, - std::function(Maxwell3D&)>( - [](Maxwell3D& maxwell3d__) -> std::unique_ptr { - return std::make_unique(maxwell3d__); - })); - builders.emplace(0xD7333D26E0A93EDEULL, - std::function(Maxwell3D&)>( - [](Maxwell3D& maxwell3d__) -> std::unique_ptr { - return std::make_unique(maxwell3d__); - })); - builders.emplace(0xEB29B2A09AA06D38ULL, - std::function(Maxwell3D&)>( - [](Maxwell3D& maxwell3d__) -> std::unique_ptr { - return std::make_unique(maxwell3d__); - })); - builders.emplace(0xDB1341DBEB4C8AF7ULL, - std::function(Maxwell3D&)>( - [](Maxwell3D& maxwell3d__) -> std::unique_ptr { - return std::make_unique(maxwell3d__); - })); - builders.emplace(0x6C97861D891EDf7EULL, - std::function(Maxwell3D&)>( - [](Maxwell3D& maxwell3d__) -> std::unique_ptr { - return std::make_unique>(maxwell3d__); - })); - builders.emplace(0xD246FDDF3A6173D7ULL, - std::function(Maxwell3D&)>( - [](Maxwell3D& maxwell3d__) -> std::unique_ptr { - return std::make_unique>(maxwell3d__); - })); - builders.emplace(0xEE4D0004BEC8ECF4ULL, - std::function(Maxwell3D&)>( - [](Maxwell3D& maxwell3d__) -> std::unique_ptr { - return std::make_unique(maxwell3d__); - })); - builders.emplace(0xFC0CF27F5FFAA661ULL, - std::function(Maxwell3D&)>( - [](Maxwell3D& maxwell3d__) -> std::unique_ptr { - return std::make_unique(maxwell3d__); - })); - builders.emplace(0xB5F74EDB717278ECULL, - std::function(Maxwell3D&)>( - [](Maxwell3D& maxwell3d__) -> std::unique_ptr { - return std::make_unique(maxwell3d__); - })); -} - -HLEMacro::~HLEMacro() = default; - -std::unique_ptr HLEMacro::GetHLEProgram(u64 hash) const { - const auto it = builders.find(hash); - if (it == builders.end()) { - return nullptr; - } - return it->second(maxwell3d); -} - -} // namespace Tegra diff --git a/src/video_core/macro/macro_hle.h b/src/video_core/macro/macro_hle.h deleted file mode 100644 index 33f92fab16..0000000000 --- a/src/video_core/macro/macro_hle.h +++ /dev/null @@ -1,33 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#pragma once - -#include -#include -#include - -#include "common/common_types.h" - -namespace Tegra { - -namespace Engines { -class Maxwell3D; -} - -class HLEMacro { -public: - explicit HLEMacro(Engines::Maxwell3D& maxwell3d_); - ~HLEMacro(); - - // Allocates and returns a cached macro if the hash matches a known function. - // Returns nullptr otherwise. - [[nodiscard]] std::unique_ptr GetHLEProgram(u64 hash) const; - -private: - Engines::Maxwell3D& maxwell3d; - std::unordered_map(Engines::Maxwell3D&)>> - builders; -}; - -} // namespace Tegra diff --git a/src/video_core/macro/macro_interpreter.cpp b/src/video_core/macro/macro_interpreter.cpp deleted file mode 100644 index f9befce676..0000000000 --- a/src/video_core/macro/macro_interpreter.cpp +++ /dev/null @@ -1,362 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project -// SPDX-License-Identifier: GPL-3.0-or-later - -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#include -#include - -#include "common/assert.h" -#include "common/logging/log.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/macro/macro_interpreter.h" - -namespace Tegra { -namespace { -class MacroInterpreterImpl final : public CachedMacro { -public: - explicit MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d_, const std::vector& code_) - : maxwell3d{maxwell3d_}, code{code_} {} - - void Execute(const std::vector& params, u32 method) override; - -private: - /// Resets the execution engine state, zeroing registers, etc. - void Reset(); - - /** - * Executes a single macro instruction located at the current program counter. Returns whether - * the interpreter should keep running. - * - * @param is_delay_slot Whether the current step is being executed due to a delay slot in a - * previous instruction. - */ - bool Step(bool is_delay_slot); - - /// Calculates the result of an ALU operation. src_a OP src_b; - u32 GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b); - - /// Performs the result operation on the input result and stores it in the specified register - /// (if necessary). - void ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result); - - /// Evaluates the branch condition and returns whether the branch should be taken or not. - bool EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const; - - /// Reads an opcode at the current program counter location. - Macro::Opcode GetOpcode() const; - - /// Returns the specified register's value. Register 0 is hardcoded to always return 0. - u32 GetRegister(u32 register_id) const; - - /// Sets the register to the input value. - void SetRegister(u32 register_id, u32 value); - - /// Sets the method address to use for the next Send instruction. - void SetMethodAddress(u32 address); - - /// Calls a GPU Engine method with the input parameter. - void Send(u32 value); - - /// Reads a GPU register located at the method address. - u32 Read(u32 method) const; - - /// Returns the next parameter in the parameter queue. - u32 FetchParameter(); - - Engines::Maxwell3D& maxwell3d; - - /// Current program counter - u32 pc{}; - /// Program counter to execute at after the delay slot is executed. - std::optional delayed_pc; - - /// General purpose macro registers. - std::array registers = {}; - - /// Method address to use for the next Send instruction. - Macro::MethodAddress method_address = {}; - - /// Input parameters of the current macro. - std::unique_ptr parameters; - std::size_t num_parameters = 0; - std::size_t parameters_capacity = 0; - /// Index of the next parameter that will be fetched by the 'parm' instruction. - u32 next_parameter_index = 0; - - bool carry_flag = false; - const std::vector& code; -}; - -void MacroInterpreterImpl::Execute(const std::vector& params, u32 method) { - Reset(); - - registers[1] = params[0]; - num_parameters = params.size(); - - if (num_parameters > parameters_capacity) { - parameters_capacity = num_parameters; - parameters = std::make_unique(num_parameters); - } - std::memcpy(parameters.get(), params.data(), num_parameters * sizeof(u32)); - - // Execute the code until we hit an exit condition. - bool keep_executing = true; - while (keep_executing) { - keep_executing = Step(false); - } - - // Assert the the macro used all the input parameters - ASSERT(next_parameter_index == num_parameters); -} - -void MacroInterpreterImpl::Reset() { - registers = {}; - pc = 0; - delayed_pc = {}; - method_address.raw = 0; - num_parameters = 0; - // The next parameter index starts at 1, because $r1 already has the value of the first - // parameter. - next_parameter_index = 1; - carry_flag = false; -} - -bool MacroInterpreterImpl::Step(bool is_delay_slot) { - u32 base_address = pc; - - Macro::Opcode opcode = GetOpcode(); - pc += 4; - - // Update the program counter if we were delayed - if (delayed_pc) { - ASSERT(is_delay_slot); - pc = *delayed_pc; - delayed_pc = {}; - } - - switch (opcode.operation) { - case Macro::Operation::ALU: { - u32 result = GetALUResult(opcode.alu_operation, GetRegister(opcode.src_a), - GetRegister(opcode.src_b)); - ProcessResult(opcode.result_operation, opcode.dst, result); - break; - } - case Macro::Operation::AddImmediate: { - ProcessResult(opcode.result_operation, opcode.dst, - GetRegister(opcode.src_a) + opcode.immediate); - break; - } - case Macro::Operation::ExtractInsert: { - u32 dst = GetRegister(opcode.src_a); - u32 src = GetRegister(opcode.src_b); - - src = (src >> opcode.bf_src_bit) & opcode.GetBitfieldMask(); - dst &= ~(opcode.GetBitfieldMask() << opcode.bf_dst_bit); - dst |= src << opcode.bf_dst_bit; - ProcessResult(opcode.result_operation, opcode.dst, dst); - break; - } - case Macro::Operation::ExtractShiftLeftImmediate: { - u32 dst = GetRegister(opcode.src_a); - u32 src = GetRegister(opcode.src_b); - - u32 result = ((src >> dst) & opcode.GetBitfieldMask()) << opcode.bf_dst_bit; - - ProcessResult(opcode.result_operation, opcode.dst, result); - break; - } - case Macro::Operation::ExtractShiftLeftRegister: { - u32 dst = GetRegister(opcode.src_a); - u32 src = GetRegister(opcode.src_b); - - u32 result = ((src >> opcode.bf_src_bit) & opcode.GetBitfieldMask()) << dst; - - ProcessResult(opcode.result_operation, opcode.dst, result); - break; - } - case Macro::Operation::Read: { - u32 result = Read(GetRegister(opcode.src_a) + opcode.immediate); - ProcessResult(opcode.result_operation, opcode.dst, result); - break; - } - case Macro::Operation::Branch: { - ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid"); - u32 value = GetRegister(opcode.src_a); - bool taken = EvaluateBranchCondition(opcode.branch_condition, value); - if (taken) { - // Ignore the delay slot if the branch has the annul bit. - if (opcode.branch_annul) { - pc = base_address + opcode.GetBranchTarget(); - return true; - } - - delayed_pc = base_address + opcode.GetBranchTarget(); - // Execute one more instruction due to the delay slot. - return Step(true); - } - break; - } - default: - UNIMPLEMENTED_MSG("Unimplemented macro operation {}", opcode.operation.Value()); - break; - } - - // An instruction with the Exit flag will not actually - // cause an exit if it's executed inside a delay slot. - if (opcode.is_exit && !is_delay_slot) { - // Exit has a delay slot, execute the next instruction - Step(true); - return false; - } - - return true; -} - -u32 MacroInterpreterImpl::GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b) { - switch (operation) { - case Macro::ALUOperation::Add: { - const u64 result{static_cast(src_a) + src_b}; - carry_flag = result > 0xffffffff; - return static_cast(result); - } - case Macro::ALUOperation::AddWithCarry: { - const u64 result{static_cast(src_a) + src_b + (carry_flag ? 1ULL : 0ULL)}; - carry_flag = result > 0xffffffff; - return static_cast(result); - } - case Macro::ALUOperation::Subtract: { - const u64 result{static_cast(src_a) - src_b}; - carry_flag = result < 0x100000000; - return static_cast(result); - } - case Macro::ALUOperation::SubtractWithBorrow: { - const u64 result{static_cast(src_a) - src_b - (carry_flag ? 0ULL : 1ULL)}; - carry_flag = result < 0x100000000; - return static_cast(result); - } - case Macro::ALUOperation::Xor: - return src_a ^ src_b; - case Macro::ALUOperation::Or: - return src_a | src_b; - case Macro::ALUOperation::And: - return src_a & src_b; - case Macro::ALUOperation::AndNot: - return src_a & ~src_b; - case Macro::ALUOperation::Nand: - return ~(src_a & src_b); - - default: - UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", operation); - return 0; - } -} - -void MacroInterpreterImpl::ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result) { - switch (operation) { - case Macro::ResultOperation::IgnoreAndFetch: - // Fetch parameter and ignore result. - SetRegister(reg, FetchParameter()); - break; - case Macro::ResultOperation::Move: - // Move result. - SetRegister(reg, result); - break; - case Macro::ResultOperation::MoveAndSetMethod: - // Move result and use as Method Address. - SetRegister(reg, result); - SetMethodAddress(result); - break; - case Macro::ResultOperation::FetchAndSend: - // Fetch parameter and send result. - SetRegister(reg, FetchParameter()); - Send(result); - break; - case Macro::ResultOperation::MoveAndSend: - // Move and send result. - SetRegister(reg, result); - Send(result); - break; - case Macro::ResultOperation::FetchAndSetMethod: - // Fetch parameter and use result as Method Address. - SetRegister(reg, FetchParameter()); - SetMethodAddress(result); - break; - case Macro::ResultOperation::MoveAndSetMethodFetchAndSend: - // Move result and use as Method Address, then fetch and send parameter. - SetRegister(reg, result); - SetMethodAddress(result); - Send(FetchParameter()); - break; - case Macro::ResultOperation::MoveAndSetMethodSend: - // Move result and use as Method Address, then send bits 12:17 of result. - SetRegister(reg, result); - SetMethodAddress(result); - Send((result >> 12) & 0b111111); - break; - default: - UNIMPLEMENTED_MSG("Unimplemented result operation {}", operation); - break; - } -} - -bool MacroInterpreterImpl::EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const { - switch (cond) { - case Macro::BranchCondition::Zero: - return value == 0; - case Macro::BranchCondition::NotZero: - return value != 0; - } - UNREACHABLE(); -} - -Macro::Opcode MacroInterpreterImpl::GetOpcode() const { - ASSERT((pc % sizeof(u32)) == 0); - ASSERT(pc < code.size() * sizeof(u32)); - return {code[pc / sizeof(u32)]}; -} - -u32 MacroInterpreterImpl::GetRegister(u32 register_id) const { - return registers.at(register_id); -} - -void MacroInterpreterImpl::SetRegister(u32 register_id, u32 value) { - // Register 0 is hardwired as the zero register. - // Ensure no writes to it actually occur. - if (register_id == 0) { - return; - } - - registers.at(register_id) = value; -} - -void MacroInterpreterImpl::SetMethodAddress(u32 address) { - method_address.raw = address; -} - -void MacroInterpreterImpl::Send(u32 value) { - maxwell3d.CallMethod(method_address.address, value, true); - // Increment the method address by the method increment. - method_address.address.Assign(method_address.address.Value() + - method_address.increment.Value()); -} - -u32 MacroInterpreterImpl::Read(u32 method) const { - return maxwell3d.GetRegisterValue(method); -} - -u32 MacroInterpreterImpl::FetchParameter() { - ASSERT(next_parameter_index < num_parameters); - return parameters[next_parameter_index++]; -} -} // Anonymous namespace - -MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d_) - : MacroEngine{maxwell3d_}, maxwell3d{maxwell3d_} {} - -std::unique_ptr MacroInterpreter::Compile(const std::vector& code) { - return std::make_unique(maxwell3d, code); -} - -} // namespace Tegra diff --git a/src/video_core/macro/macro_interpreter.h b/src/video_core/macro/macro_interpreter.h deleted file mode 100644 index f5eeb0b76f..0000000000 --- a/src/video_core/macro/macro_interpreter.h +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#pragma once - -#include - -#include "common/common_types.h" -#include "video_core/macro/macro.h" - -namespace Tegra { -namespace Engines { -class Maxwell3D; -} - -class MacroInterpreter final : public MacroEngine { -public: - explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d_); - -protected: - std::unique_ptr Compile(const std::vector& code) override; - -private: - Engines::Maxwell3D& maxwell3d; -}; - -} // namespace Tegra diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp deleted file mode 100644 index 65935f6c62..0000000000 --- a/src/video_core/macro/macro_jit_x64.cpp +++ /dev/null @@ -1,678 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project -// SPDX-License-Identifier: GPL-3.0-or-later - -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#include -#include -#include - -#include - -#include "common/assert.h" -#include "common/bit_field.h" -#include "common/logging/log.h" -#include "common/x64/xbyak_abi.h" -#include "common/x64/xbyak_util.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/macro/macro_interpreter.h" -#include "video_core/macro/macro_jit_x64.h" - -namespace Tegra { -namespace { -constexpr Xbyak::Reg64 STATE = Xbyak::util::rbx; -constexpr Xbyak::Reg32 RESULT = Xbyak::util::r10d; -constexpr Xbyak::Reg64 MAX_PARAMETER = Xbyak::util::r11; -constexpr Xbyak::Reg64 PARAMETERS = Xbyak::util::r12; -constexpr Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d; -constexpr Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15; - -constexpr std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({ - STATE, - RESULT, - MAX_PARAMETER, - PARAMETERS, - METHOD_ADDRESS, - BRANCH_HOLDER, -}); - -// Arbitrarily chosen based on current booting games. -constexpr size_t MAX_CODE_SIZE = 0x10000; - -std::bitset<32> PersistentCallerSavedRegs() { - return PERSISTENT_REGISTERS & Common::X64::ABI_ALL_CALLER_SAVED; -} - -/// @brief Must enforce W^X constraints, as we yet don't havea global "NO_EXECUTE" support flag -/// the speed loss is minimal, and in fact may be negligible, however for your peace of mind -/// I simply included known OSes whom had W^X issues -#if defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) -static const auto default_cg_mode = Xbyak::DontSetProtectRWE; -#else -static const auto default_cg_mode = nullptr; //Allow RWE -#endif - -class MacroJITx64Impl final : public Xbyak::CodeGenerator, public CachedMacro { -public: - explicit MacroJITx64Impl(Engines::Maxwell3D& maxwell3d_, const std::vector& code_) - : Xbyak::CodeGenerator(MAX_CODE_SIZE, default_cg_mode) - , code{code_}, maxwell3d{maxwell3d_} { - Compile(); - } - - void Execute(const std::vector& parameters, u32 method) override; - - void Compile_ALU(Macro::Opcode opcode); - void Compile_AddImmediate(Macro::Opcode opcode); - void Compile_ExtractInsert(Macro::Opcode opcode); - void Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode); - void Compile_ExtractShiftLeftRegister(Macro::Opcode opcode); - void Compile_Read(Macro::Opcode opcode); - void Compile_Branch(Macro::Opcode opcode); - -private: - void Optimizer_ScanFlags(); - - void Compile(); - bool Compile_NextInstruction(); - - Xbyak::Reg32 Compile_FetchParameter(); - Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst); - - void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg); - void Compile_Send(Xbyak::Reg32 value); - - Macro::Opcode GetOpCode() const; - - struct JITState { - Engines::Maxwell3D* maxwell3d{}; - std::array registers{}; - u32 carry_flag{}; - }; - static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0"); - using ProgramType = void (*)(JITState*, const u32*, const u32*); - - struct OptimizerState { - bool can_skip_carry{}; - bool has_delayed_pc{}; - bool zero_reg_skip{}; - bool skip_dummy_addimmediate{}; - bool optimize_for_method_move{}; - bool enable_asserts{}; - }; - OptimizerState optimizer{}; - - std::optional next_opcode{}; - ProgramType program{nullptr}; - - std::array labels; - std::array delay_skip; - Xbyak::Label end_of_code{}; - - bool is_delay_slot{}; - u32 pc{}; - - const std::vector& code; - Engines::Maxwell3D& maxwell3d; -}; - -void MacroJITx64Impl::Execute(const std::vector& parameters, u32 method) { - ASSERT_OR_EXECUTE(program != nullptr, { return; }); - JITState state{}; - state.maxwell3d = &maxwell3d; - state.registers = {}; - program(&state, parameters.data(), parameters.data() + parameters.size()); -} - -void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) { - const bool is_a_zero = opcode.src_a == 0; - const bool is_b_zero = opcode.src_b == 0; - const bool valid_operation = !is_a_zero && !is_b_zero; - [[maybe_unused]] const bool is_move_operation = !is_a_zero && is_b_zero; - const bool has_zero_register = is_a_zero || is_b_zero; - const bool no_zero_reg_skip = opcode.alu_operation == Macro::ALUOperation::AddWithCarry || - opcode.alu_operation == Macro::ALUOperation::SubtractWithBorrow; - - Xbyak::Reg32 src_a; - Xbyak::Reg32 src_b; - - if (!optimizer.zero_reg_skip || no_zero_reg_skip) { - src_a = Compile_GetRegister(opcode.src_a, RESULT); - src_b = Compile_GetRegister(opcode.src_b, eax); - } else { - if (!is_a_zero) { - src_a = Compile_GetRegister(opcode.src_a, RESULT); - } - if (!is_b_zero) { - src_b = Compile_GetRegister(opcode.src_b, eax); - } - } - - bool has_emitted = false; - - switch (opcode.alu_operation) { - case Macro::ALUOperation::Add: - if (optimizer.zero_reg_skip) { - if (valid_operation) { - add(src_a, src_b); - } - } else { - add(src_a, src_b); - } - - if (!optimizer.can_skip_carry) { - setc(byte[STATE + offsetof(JITState, carry_flag)]); - } - break; - case Macro::ALUOperation::AddWithCarry: - bt(dword[STATE + offsetof(JITState, carry_flag)], 0); - adc(src_a, src_b); - setc(byte[STATE + offsetof(JITState, carry_flag)]); - break; - case Macro::ALUOperation::Subtract: - if (optimizer.zero_reg_skip) { - if (valid_operation) { - sub(src_a, src_b); - has_emitted = true; - } - } else { - sub(src_a, src_b); - has_emitted = true; - } - if (!optimizer.can_skip_carry && has_emitted) { - setc(byte[STATE + offsetof(JITState, carry_flag)]); - } - break; - case Macro::ALUOperation::SubtractWithBorrow: - bt(dword[STATE + offsetof(JITState, carry_flag)], 0); - sbb(src_a, src_b); - setc(byte[STATE + offsetof(JITState, carry_flag)]); - break; - case Macro::ALUOperation::Xor: - if (optimizer.zero_reg_skip) { - if (valid_operation) { - xor_(src_a, src_b); - } - } else { - xor_(src_a, src_b); - } - break; - case Macro::ALUOperation::Or: - if (optimizer.zero_reg_skip) { - if (valid_operation) { - or_(src_a, src_b); - } - } else { - or_(src_a, src_b); - } - break; - case Macro::ALUOperation::And: - if (optimizer.zero_reg_skip) { - if (!has_zero_register) { - and_(src_a, src_b); - } - } else { - and_(src_a, src_b); - } - break; - case Macro::ALUOperation::AndNot: - if (optimizer.zero_reg_skip) { - if (!is_a_zero) { - not_(src_b); - and_(src_a, src_b); - } - } else { - not_(src_b); - and_(src_a, src_b); - } - break; - case Macro::ALUOperation::Nand: - if (optimizer.zero_reg_skip) { - if (!is_a_zero) { - and_(src_a, src_b); - not_(src_a); - } - } else { - and_(src_a, src_b); - not_(src_a); - } - break; - default: - UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", opcode.alu_operation.Value()); - break; - } - Compile_ProcessResult(opcode.result_operation, opcode.dst); -} - -void MacroJITx64Impl::Compile_AddImmediate(Macro::Opcode opcode) { - if (optimizer.skip_dummy_addimmediate) { - // Games tend to use this as an exit instruction placeholder. It's to encode an instruction - // without doing anything. In our case we can just not emit anything. - if (opcode.result_operation == Macro::ResultOperation::Move && opcode.dst == 0) { - return; - } - } - // Check for redundant moves - if (optimizer.optimize_for_method_move && - opcode.result_operation == Macro::ResultOperation::MoveAndSetMethod) { - if (next_opcode.has_value()) { - const auto next = *next_opcode; - if (next.result_operation == Macro::ResultOperation::MoveAndSetMethod && - opcode.dst == next.dst) { - return; - } - } - } - if (optimizer.zero_reg_skip && opcode.src_a == 0) { - if (opcode.immediate == 0) { - xor_(RESULT, RESULT); - } else { - mov(RESULT, opcode.immediate); - } - } else { - auto result = Compile_GetRegister(opcode.src_a, RESULT); - if (opcode.immediate > 2) { - add(result, opcode.immediate); - } else if (opcode.immediate == 1) { - inc(result); - } else if (opcode.immediate < 0) { - sub(result, opcode.immediate * -1); - } - } - Compile_ProcessResult(opcode.result_operation, opcode.dst); -} - -void MacroJITx64Impl::Compile_ExtractInsert(Macro::Opcode opcode) { - auto dst = Compile_GetRegister(opcode.src_a, RESULT); - auto src = Compile_GetRegister(opcode.src_b, eax); - - const u32 mask = ~(opcode.GetBitfieldMask() << opcode.bf_dst_bit); - and_(dst, mask); - shr(src, opcode.bf_src_bit); - and_(src, opcode.GetBitfieldMask()); - shl(src, opcode.bf_dst_bit); - or_(dst, src); - - Compile_ProcessResult(opcode.result_operation, opcode.dst); -} - -void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) { - const auto dst = Compile_GetRegister(opcode.src_a, ecx); - const auto src = Compile_GetRegister(opcode.src_b, RESULT); - - shr(src, dst.cvt8()); - and_(src, opcode.GetBitfieldMask()); - shl(src, opcode.bf_dst_bit); - - Compile_ProcessResult(opcode.result_operation, opcode.dst); -} - -void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) { - const auto dst = Compile_GetRegister(opcode.src_a, ecx); - const auto src = Compile_GetRegister(opcode.src_b, RESULT); - - shr(src, opcode.bf_src_bit); - and_(src, opcode.GetBitfieldMask()); - shl(src, dst.cvt8()); - - Compile_ProcessResult(opcode.result_operation, opcode.dst); -} - -void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) { - if (optimizer.zero_reg_skip && opcode.src_a == 0) { - if (opcode.immediate == 0) { - xor_(RESULT, RESULT); - } else { - mov(RESULT, opcode.immediate); - } - } else { - auto result = Compile_GetRegister(opcode.src_a, RESULT); - if (opcode.immediate > 2) { - add(result, opcode.immediate); - } else if (opcode.immediate == 1) { - inc(result); - } else if (opcode.immediate < 0) { - sub(result, opcode.immediate * -1); - } - } - - // Equivalent to Engines::Maxwell3D::GetRegisterValue: - if (optimizer.enable_asserts) { - Xbyak::Label pass_range_check; - cmp(RESULT, static_cast(Engines::Maxwell3D::Regs::NUM_REGS)); - jb(pass_range_check); - int3(); - L(pass_range_check); - } - mov(rax, qword[STATE]); - mov(RESULT, - dword[rax + offsetof(Engines::Maxwell3D, regs) + - offsetof(Engines::Maxwell3D::Regs, reg_array) + RESULT.cvt64() * sizeof(u32)]); - - Compile_ProcessResult(opcode.result_operation, opcode.dst); -} - -void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) { - maxwell3d->CallMethod(method_address.address, value, true); -} - -void MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) { - Common::X64::ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); - mov(Common::X64::ABI_PARAM1, qword[STATE]); - mov(Common::X64::ABI_PARAM2.cvt32(), METHOD_ADDRESS); - mov(Common::X64::ABI_PARAM3.cvt32(), value); - Common::X64::CallFarFunction(*this, &Send); - Common::X64::ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); - - Xbyak::Label dont_process{}; - // Get increment - test(METHOD_ADDRESS, 0x3f000); - // If zero, method address doesn't update - je(dont_process); - - mov(ecx, METHOD_ADDRESS); - and_(METHOD_ADDRESS, 0xfff); - shr(ecx, 12); - and_(ecx, 0x3f); - lea(eax, ptr[rcx + METHOD_ADDRESS.cvt64()]); - sal(ecx, 12); - or_(eax, ecx); - - mov(METHOD_ADDRESS, eax); - - L(dont_process); -} - -void MacroJITx64Impl::Compile_Branch(Macro::Opcode opcode) { - ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid"); - const s32 jump_address = - static_cast(pc) + static_cast(opcode.GetBranchTarget() / sizeof(s32)); - - Xbyak::Label end; - auto value = Compile_GetRegister(opcode.src_a, eax); - cmp(value, 0); // test(value, value); - if (optimizer.has_delayed_pc) { - switch (opcode.branch_condition) { - case Macro::BranchCondition::Zero: - jne(end, T_NEAR); - break; - case Macro::BranchCondition::NotZero: - je(end, T_NEAR); - break; - } - - if (opcode.branch_annul) { - xor_(BRANCH_HOLDER, BRANCH_HOLDER); - jmp(labels[jump_address], T_NEAR); - } else { - Xbyak::Label handle_post_exit{}; - Xbyak::Label skip{}; - jmp(skip, T_NEAR); - - L(handle_post_exit); - xor_(BRANCH_HOLDER, BRANCH_HOLDER); - jmp(labels[jump_address], T_NEAR); - - L(skip); - mov(BRANCH_HOLDER, handle_post_exit); - jmp(delay_skip[pc], T_NEAR); - } - } else { - switch (opcode.branch_condition) { - case Macro::BranchCondition::Zero: - je(labels[jump_address], T_NEAR); - break; - case Macro::BranchCondition::NotZero: - jne(labels[jump_address], T_NEAR); - break; - } - } - - L(end); -} - -void MacroJITx64Impl::Optimizer_ScanFlags() { - optimizer.can_skip_carry = true; - optimizer.has_delayed_pc = false; - for (auto raw_op : code) { - Macro::Opcode op{}; - op.raw = raw_op; - - if (op.operation == Macro::Operation::ALU) { - // Scan for any ALU operations which actually use the carry flag, if they don't exist in - // our current code we can skip emitting the carry flag handling operations - if (op.alu_operation == Macro::ALUOperation::AddWithCarry || - op.alu_operation == Macro::ALUOperation::SubtractWithBorrow) { - optimizer.can_skip_carry = false; - } - } - - if (op.operation == Macro::Operation::Branch) { - if (!op.branch_annul) { - optimizer.has_delayed_pc = true; - } - } - } -} - -void MacroJITx64Impl::Compile() { - labels.fill(Xbyak::Label()); - - Common::X64::ABI_PushRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8); - // JIT state - mov(STATE, Common::X64::ABI_PARAM1); - mov(PARAMETERS, Common::X64::ABI_PARAM2); - mov(MAX_PARAMETER, Common::X64::ABI_PARAM3); - xor_(RESULT, RESULT); - xor_(METHOD_ADDRESS, METHOD_ADDRESS); - xor_(BRANCH_HOLDER, BRANCH_HOLDER); - - mov(dword[STATE + offsetof(JITState, registers) + 4], Compile_FetchParameter()); - - // Track get register for zero registers and mark it as no-op - optimizer.zero_reg_skip = true; - - // AddImmediate tends to be used as a NOP instruction, if we detect this we can - // completely skip the entire code path and no emit anything - optimizer.skip_dummy_addimmediate = true; - - // SMO tends to emit a lot of unnecessary method moves, we can mitigate this by only emitting - // one if our register isn't "dirty" - optimizer.optimize_for_method_move = true; - - // Enable run-time assertions in JITted code - optimizer.enable_asserts = false; - - // Check to see if we can skip emitting certain instructions - Optimizer_ScanFlags(); - - const u32 op_count = static_cast(code.size()); - for (u32 i = 0; i < op_count; i++) { - if (i < op_count - 1) { - pc = i + 1; - next_opcode = GetOpCode(); - } else { - next_opcode = {}; - } - pc = i; - Compile_NextInstruction(); - } - - L(end_of_code); - - Common::X64::ABI_PopRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8); - ret(); - ready(); - program = getCode(); -} - -bool MacroJITx64Impl::Compile_NextInstruction() { - const auto opcode = GetOpCode(); - if (labels[pc].getAddress()) { - return false; - } - - L(labels[pc]); - - switch (opcode.operation) { - case Macro::Operation::ALU: - Compile_ALU(opcode); - break; - case Macro::Operation::AddImmediate: - Compile_AddImmediate(opcode); - break; - case Macro::Operation::ExtractInsert: - Compile_ExtractInsert(opcode); - break; - case Macro::Operation::ExtractShiftLeftImmediate: - Compile_ExtractShiftLeftImmediate(opcode); - break; - case Macro::Operation::ExtractShiftLeftRegister: - Compile_ExtractShiftLeftRegister(opcode); - break; - case Macro::Operation::Read: - Compile_Read(opcode); - break; - case Macro::Operation::Branch: - Compile_Branch(opcode); - break; - default: - UNIMPLEMENTED_MSG("Unimplemented opcode {}", opcode.operation.Value()); - break; - } - - if (optimizer.has_delayed_pc) { - if (opcode.is_exit) { - mov(rax, end_of_code); - test(BRANCH_HOLDER, BRANCH_HOLDER); - cmove(BRANCH_HOLDER, rax); - // Jump to next instruction to skip delay slot check - je(labels[pc + 1], T_NEAR); - } else { - // TODO(ogniK): Optimize delay slot branching - Xbyak::Label no_delay_slot{}; - test(BRANCH_HOLDER, BRANCH_HOLDER); - je(no_delay_slot, T_NEAR); - mov(rax, BRANCH_HOLDER); - xor_(BRANCH_HOLDER, BRANCH_HOLDER); - jmp(rax); - L(no_delay_slot); - } - L(delay_skip[pc]); - if (opcode.is_exit) { - return false; - } - } else { - test(BRANCH_HOLDER, BRANCH_HOLDER); - jne(end_of_code, T_NEAR); - if (opcode.is_exit) { - inc(BRANCH_HOLDER); - return false; - } - } - return true; -} - -static void WarnInvalidParameter(uintptr_t parameter, uintptr_t max_parameter) { - LOG_CRITICAL(HW_GPU, - "Macro JIT: invalid parameter access 0x{:x} (0x{:x} is the last parameter)", - parameter, max_parameter - sizeof(u32)); -} - -Xbyak::Reg32 MacroJITx64Impl::Compile_FetchParameter() { - Xbyak::Label parameter_ok{}; - cmp(PARAMETERS, MAX_PARAMETER); - jb(parameter_ok, T_NEAR); - Common::X64::ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); - mov(Common::X64::ABI_PARAM1, PARAMETERS); - mov(Common::X64::ABI_PARAM2, MAX_PARAMETER); - Common::X64::CallFarFunction(*this, &WarnInvalidParameter); - Common::X64::ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); - L(parameter_ok); - mov(eax, dword[PARAMETERS]); - add(PARAMETERS, sizeof(u32)); - return eax; -} - -Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) { - if (index == 0) { - // Register 0 is always zero - xor_(dst, dst); - } else { - mov(dst, dword[STATE + offsetof(JITState, registers) + index * sizeof(u32)]); - } - - return dst; -} - -void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) { - const auto SetRegister = [this](u32 reg_index, const Xbyak::Reg32& result) { - // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero - // register. - if (reg_index == 0) { - return; - } - mov(dword[STATE + offsetof(JITState, registers) + reg_index * sizeof(u32)], result); - }; - const auto SetMethodAddress = [this](const Xbyak::Reg32& reg32) { mov(METHOD_ADDRESS, reg32); }; - - switch (operation) { - case Macro::ResultOperation::IgnoreAndFetch: - SetRegister(reg, Compile_FetchParameter()); - break; - case Macro::ResultOperation::Move: - SetRegister(reg, RESULT); - break; - case Macro::ResultOperation::MoveAndSetMethod: - SetRegister(reg, RESULT); - SetMethodAddress(RESULT); - break; - case Macro::ResultOperation::FetchAndSend: - // Fetch parameter and send result. - SetRegister(reg, Compile_FetchParameter()); - Compile_Send(RESULT); - break; - case Macro::ResultOperation::MoveAndSend: - // Move and send result. - SetRegister(reg, RESULT); - Compile_Send(RESULT); - break; - case Macro::ResultOperation::FetchAndSetMethod: - // Fetch parameter and use result as Method Address. - SetRegister(reg, Compile_FetchParameter()); - SetMethodAddress(RESULT); - break; - case Macro::ResultOperation::MoveAndSetMethodFetchAndSend: - // Move result and use as Method Address, then fetch and send parameter. - SetRegister(reg, RESULT); - SetMethodAddress(RESULT); - Compile_Send(Compile_FetchParameter()); - break; - case Macro::ResultOperation::MoveAndSetMethodSend: - // Move result and use as Method Address, then send bits 12:17 of result. - SetRegister(reg, RESULT); - SetMethodAddress(RESULT); - shr(RESULT, 12); - and_(RESULT, 0b111111); - Compile_Send(RESULT); - break; - default: - UNIMPLEMENTED_MSG("Unimplemented macro operation {}", operation); - break; - } -} - -Macro::Opcode MacroJITx64Impl::GetOpCode() const { - ASSERT(pc < code.size()); - return {code[pc]}; -} -} // Anonymous namespace - -MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d_) - : MacroEngine{maxwell3d_}, maxwell3d{maxwell3d_} {} - -std::unique_ptr MacroJITx64::Compile(const std::vector& code) { - return std::make_unique(maxwell3d, code); -} -} // namespace Tegra diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h deleted file mode 100644 index 99ee1b9e68..0000000000 --- a/src/video_core/macro/macro_jit_x64.h +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#pragma once - -#include "common/common_types.h" -#include "video_core/macro/macro.h" - -namespace Tegra { - -namespace Engines { -class Maxwell3D; -} - -class MacroJITx64 final : public MacroEngine { -public: - explicit MacroJITx64(Engines::Maxwell3D& maxwell3d_); - -protected: - std::unique_ptr Compile(const std::vector& code) override; - -private: - Engines::Maxwell3D& maxwell3d; -}; - -} // namespace Tegra