From 47e3533bd95f8d036d3bca0d38012b5495a0044f Mon Sep 17 00:00:00 2001
From: lizzie
Date: Sun, 22 Feb 2026 21:58:31 +0000
Subject: [PATCH] [vk] batched draws

Signed-off-by: lizzie
---
Review notes (this section is not part of the commit message):

* RasterizerVulkan::Draw() only queues a DrawParams entry; FlushBatchedDraws()
  records the queued entries through the scheduler and then empties the queue.
* Indexed runs store vertexOffset in each VkMultiDrawIndexedInfoEXT and pass a
  null pVertexOffset. A non-null pVertexOffset is a single value applied to
  every draw of the call, so passing an array would give every draw the first
  entry's base vertex.
* Clear(), DispatchCompute() and DrawTexture() flush unconditionally so a
  queued draw can never be recorded after the operation that follows it.
  FlushBatchedDraws() returns immediately when the queue is empty.
* Per-draw logging (previously defined but discarded) is invoked again.
* NOTE(review): in this copy of the patch the template argument lists on
  several context lines (`template`, std::array, std::span, std::optional,
  boost::container::static_vector) are missing, and blank context lines were
  re-derived from the hunk counts. Check the context against the target tree
  before applying.

 src/video_core/engines/draw_manager.cpp       |   4 +
 src/video_core/rasterizer_interface.h         |   3 +
 .../renderer_null/null_rasterizer.cpp         |   1 +
 .../renderer_null/null_rasterizer.h           |   1 +
 .../renderer_opengl/gl_rasterizer.cpp         |   4 +
 .../renderer_opengl/gl_rasterizer.h           |   1 +
 .../renderer_vulkan/vk_graphics_pipeline.cpp  |   5 +-
 .../renderer_vulkan/vk_rasterizer.cpp         | 169 ++++++++++--------
 .../renderer_vulkan/vk_rasterizer.h           |  57 +++---
 .../renderer_vulkan/vk_state_tracker.h        |   1 +
 10 files changed, 132 insertions(+), 114 deletions(-)

diff --git a/src/video_core/engines/draw_manager.cpp b/src/video_core/engines/draw_manager.cpp
index 971025cb55..dddd886b91 100644
--- a/src/video_core/engines/draw_manager.cpp
+++ b/src/video_core/engines/draw_manager.cpp
@@ -69,6 +69,7 @@ void DrawManager::ProcessMethodCall(u32 method, u32 argument) {
 
 void DrawManager::Clear(u32 layer_count) {
     if (maxwell3d->ShouldExecute()) {
+        maxwell3d->rasterizer->FlushBatchedDraws();
         maxwell3d->rasterizer->Clear(layer_count);
     }
 }
@@ -144,6 +145,8 @@ void DrawManager::SetInlineIndexBuffer(u32 index) {
 }
 
 void DrawManager::DrawBegin() {
+    maxwell3d->rasterizer->FlushBatchedDraws();
+
     const auto& regs{maxwell3d->regs};
     auto reset_instance_count = regs.draw.instance_id == Maxwell3D::Regs::Draw::InstanceId::First;
     auto increment_instance_count =
@@ -192,6 +195,7 @@ void DrawManager::DrawEnd(u32 instance_count, bool force_draw) {
         draw_state.inline_index_draw_indexes.clear();
         break;
     }
+    maxwell3d->rasterizer->FlushBatchedDraws();
 }
 
 void DrawManager::DrawIndexSmall(u32 argument) {
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index 481efbf53b..d70afbf447 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -39,6 +39,9 @@ class RasterizerInterface {
 public:
     virtual ~RasterizerInterface() = default;
 
+    /// Flushes draws batched by Draw(); backends that do not batch implement this as a no-op
+    virtual void FlushBatchedDraws() = 0;
+
     /// Dispatches a draw invocation
     virtual void Draw(bool is_indexed, u32 instance_count) = 0;
 
diff --git a/src/video_core/renderer_null/null_rasterizer.cpp b/src/video_core/renderer_null/null_rasterizer.cpp
index a5cda0f389..09d3805f19 100644
--- a/src/video_core/renderer_null/null_rasterizer.cpp
+++ b/src/video_core/renderer_null/null_rasterizer.cpp
@@ -22,6 +22,7 @@ RasterizerNull::RasterizerNull(Tegra::GPU& gpu) : m_gpu{gpu} {}
 RasterizerNull::~RasterizerNull() = default;
 
 void RasterizerNull::Draw(bool is_indexed, u32 instance_count) {}
+void RasterizerNull::FlushBatchedDraws() {}
 void RasterizerNull::DrawTexture() {}
 void RasterizerNull::Clear(u32 layer_count) {}
 void RasterizerNull::DispatchCompute() {}
diff --git a/src/video_core/renderer_null/null_rasterizer.h b/src/video_core/renderer_null/null_rasterizer.h
index c7f5849c75..b7274f22cf 100644
--- a/src/video_core/renderer_null/null_rasterizer.h
+++ b/src/video_core/renderer_null/null_rasterizer.h
@@ -37,6 +37,7 @@ public:
     explicit RasterizerNull(Tegra::GPU& gpu);
     ~RasterizerNull() override;
 
+    void FlushBatchedDraws() override;
     void Draw(bool is_indexed, u32 instance_count) override;
     void DrawTexture() override;
     void Clear(u32 layer_count) override;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 602509bfdb..867fb157b9 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -258,6 +258,10 @@ void RasterizerOpenGL::PrepareDraw(bool is_indexed, Func&& draw_func) {
     has_written_global_memory |= pipeline->WritesGlobalMemory();
 }
 
+void RasterizerOpenGL::FlushBatchedDraws() {
+    // Nothing to do: this patch leaves RasterizerOpenGL::Draw() unbatched.
+}
+
 void RasterizerOpenGL::Draw(bool is_indexed, u32 instance_count) {
     PrepareDraw(is_indexed, [this, is_indexed, instance_count](GLenum primitive_mode) {
         const auto& draw_state = maxwell3d->draw_manager->GetDrawState();
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 6eae51ff7d..85de9ea4e8 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -81,6 +81,7 @@ public:
                               StateTracker& state_tracker_);
     ~RasterizerOpenGL() override;
 
+    void FlushBatchedDraws() override;
     void Draw(bool is_indexed, u32 instance_count) override;
     void DrawIndirect() override;
     void DrawTexture() override;
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
index d156baa77b..59654f0a0d 100644
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
@@ -310,6 +310,8 @@ void GraphicsPipeline::AddTransition(GraphicsPipeline* transition) {
 
 template
 bool GraphicsPipeline::ConfigureImpl(bool is_indexed) {
+    maxwell3d->Rasterizer().FlushBatchedDraws(); // Record draws queued under the previous pipeline first
+
     std::array views;
     std::array samplers;
     size_t sampler_index{};
@@ -507,8 +509,7 @@ bool GraphicsPipeline::ConfigureImpl(bool is_indexed) {
     return true;
 }
 
-void GraphicsPipeline::ConfigureDraw(const RescalingPushConstant& rescaling,
-                                     const RenderAreaPushConstant& render_area) {
+void GraphicsPipeline::ConfigureDraw(const RescalingPushConstant& rescaling, const RenderAreaPushConstant& render_area) {
     scheduler.RequestRenderpass(texture_cache.GetFramebuffer());
     if (!is_built.load(std::memory_order::relaxed)) {
         // Wait for the pipeline to be built
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 60b899a811..f8aad58dc7 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -48,19 +48,8 @@ namespace Vulkan {
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 using MaxwellDrawState = Tegra::Engines::DrawManager::State;
 using VideoCommon::ImageViewId;
-using VideoCommon::ImageViewType;
-
 namespace {
 
-struct DrawParams {
-    u32 base_instance;
-    u32 num_instances;
-    u32 base_vertex;
-    u32 num_vertices;
-    u32 first_index;
-    bool is_indexed;
-};
-
 VkViewport GetViewportState(const Device& device, const Maxwell& regs, size_t index, float scale) {
     const auto& src = regs.viewport_transform[index];
     const auto conv = [scale](float value) {
@@ -151,8 +140,8 @@ VkRect2D GetScissorState(const Maxwell& regs, size_t index, u32 up_scale = 1, u3
     return scissor;
 }
 
-DrawParams MakeDrawParams(const MaxwellDrawState& draw_state, u32 num_instances, bool is_indexed) {
-    DrawParams params{
+RasterizerVulkan::DrawParams MakeDrawParams(const MaxwellDrawState& draw_state, u32 num_instances, bool is_indexed) {
+    RasterizerVulkan::DrawParams params{
         .base_instance = draw_state.base_instance,
         .num_instances = num_instances,
         .base_vertex = is_indexed ? draw_state.base_index : draw_state.vertex_buffer.first,
@@ -215,6 +204,8 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) {
     SCOPE_EXIT {
         gpu.TickWork();
     };
+    if (state_tracker.flags->count() > 0)
+        FlushBatchedDraws(); // Dirty state changes the way Vulkan would behave
     FlushWork();
     gpu_memory->FlushCaching();
 
@@ -225,75 +216,94 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) {
     std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
     // update engine as channel may be different.
     pipeline->SetEngine(maxwell3d, gpu_memory);
-    if (!pipeline->Configure(is_indexed))
-        return;
-
-    UpdateDynamicStates();
-
-    HandleTransformFeedback();
-    query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64,
-                              maxwell3d->regs.zpass_pixel_count_enable);
-
-    draw_func();
+    if (pipeline->Configure(is_indexed)) {
+        UpdateDynamicStates();
+        HandleTransformFeedback();
+        query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64, maxwell3d->regs.zpass_pixel_count_enable);
+        draw_func();
+    }
 }
 
-void RasterizerVulkan::Draw(bool is_indexed, u32 instance_count) {
-    PrepareDraw(is_indexed, [this, is_indexed, instance_count] {
-        const auto& draw_state = maxwell3d->draw_manager->GetDrawState();
-        const u32 num_instances{instance_count};
-        const DrawParams draw_params{MakeDrawParams(draw_state, num_instances, is_indexed)};
-
-        // Use VK_EXT_multi_draw if available (single draw becomes multi-draw with count=1)
-        if (device.IsExtMultiDrawSupported()) {
-            scheduler.Record([draw_params](vk::CommandBuffer cmdbuf) {
-                if (draw_params.is_indexed) {
-                    // Use multi-draw indexed with single draw
-                    const VkMultiDrawIndexedInfoEXT multi_draw_info{
-                        .firstIndex = draw_params.first_index,
-                        .indexCount = draw_params.num_vertices,
-                    };
-                    const int32_t vertex_offset = static_cast<int32_t>(draw_params.base_vertex);
-                    cmdbuf.DrawMultiIndexedEXT(1, &multi_draw_info, draw_params.num_instances,
-                                               draw_params.base_instance,
-                                               sizeof(VkMultiDrawIndexedInfoEXT), &vertex_offset);
-                } else {
-                    // Use multi-draw with single draw
-                    const VkMultiDrawInfoEXT multi_draw_info{
-                        .firstVertex = draw_params.base_vertex,
-                        .vertexCount = draw_params.num_vertices,
-                    };
-                    cmdbuf.DrawMultiEXT(1, &multi_draw_info, draw_params.num_instances,
-                                        draw_params.base_instance,
-                                        sizeof(VkMultiDrawInfoEXT));
+void RasterizerVulkan::FlushBatchedDraws() {
+    auto const log_draw = [](RasterizerVulkan::DrawParams draw_params) {
+        // Log draw call
+        if (Settings::values.gpu_logging_enabled.GetValue() && Settings::values.gpu_log_vulkan_calls.GetValue()) {
+            const std::string params = draw_params.is_indexed ?
+                fmt::format("vertices={}, instances={}, firstIndex={}, baseVertex={}, baseInstance={}", draw_params.num_vertices, draw_params.num_instances, draw_params.first_index, draw_params.base_vertex, draw_params.base_instance) :
+                fmt::format("vertices={}, instances={}, firstVertex={}, firstInstance={}", draw_params.num_vertices, draw_params.num_instances, draw_params.base_vertex, draw_params.base_instance);
+            GPU::Logging::GPULogger::GetInstance().LogVulkanCall(draw_params.is_indexed ? "vkCmdDrawIndexed" : "vkCmdDraw", params, VK_SUCCESS);
+        }
+    };
+    for (auto const& queued : batched_draw_params) log_draw(queued); // one log entry per queued draw
+    auto const& list = batched_draw_params;
+    if (list.empty())
+        return;
+    // Consecutive draws sharing indexed-ness, instance count and base instance form a run; runs of 2+ use VK_EXT_multi_draw when available
+    for (size_t i = 0; i < list.size(); ) {
+        auto const first_i = i;
+        bool last_is_indexed = list[i].is_indexed;
+        u32 last_num_instances = list[i].num_instances;
+        u32 last_base_instance = list[i].base_instance;
+        // Gather the run that starts at i, then record it
+        u32 count = 0;
+        if (last_is_indexed) {
+            std::vector<VkMultiDrawIndexedInfoEXT> multi_draw_info;
+            // Every entry carries its own vertexOffset: a non-null pVertexOffset would override all of them with one value
+            for (; i < list.size()
+                && list[i].is_indexed == last_is_indexed
+                && list[i].num_instances == last_num_instances
+                && list[i].base_instance == last_base_instance; ++i) {
+                multi_draw_info.push_back({
+                    .firstIndex = list[i].first_index,
+                    .indexCount = list[i].num_vertices,
+                    .vertexOffset = static_cast<int32_t>(list[i].base_vertex),
+                });
+                ++count;
+            }
+            if (device.IsExtMultiDrawSupported() && multi_draw_info.size() > 1) {
+                scheduler.Record([last_num_instances, last_base_instance, count, multi_draw_info](vk::CommandBuffer cmdbuf) {
+                    cmdbuf.DrawMultiIndexedEXT(count, multi_draw_info.data(), last_num_instances, last_base_instance, sizeof(VkMultiDrawIndexedInfoEXT), nullptr);
+                });
+            } else {
+                for (size_t j = first_i; j < i; ++j) {
+                    scheduler.Record([draw_params = list[j]](vk::CommandBuffer cmdbuf) {
+                        cmdbuf.DrawIndexed(draw_params.num_vertices, draw_params.num_instances, draw_params.first_index, draw_params.base_vertex, draw_params.base_instance);
+                    });
                 }
-            });
+            }
         } else {
-            // Fallback to standard draw calls
-            scheduler.Record([draw_params](vk::CommandBuffer cmdbuf) {
-                if (draw_params.is_indexed) {
-                    cmdbuf.DrawIndexed(draw_params.num_vertices, draw_params.num_instances,
-                                       draw_params.first_index, draw_params.base_vertex,
-                                       draw_params.base_instance);
-                } else {
-                    cmdbuf.Draw(draw_params.num_vertices, draw_params.num_instances,
-                                draw_params.base_vertex, draw_params.base_instance);
+            std::vector<VkMultiDrawInfoEXT> multi_draw_info;
+            for (; i < list.size()
+                && list[i].is_indexed == last_is_indexed
+                && list[i].num_instances == last_num_instances
+                && list[i].base_instance == last_base_instance; ++i) {
+                multi_draw_info.push_back({
+                    .firstVertex = list[i].base_vertex,
+                    .vertexCount = list[i].num_vertices
+                });
+                ++count;
+            }
+            if (device.IsExtMultiDrawSupported() && multi_draw_info.size() > 1) {
+                scheduler.Record([last_num_instances, last_base_instance, count, multi_draw_info](vk::CommandBuffer cmdbuf) {
+                    cmdbuf.DrawMultiEXT(count, multi_draw_info.data(), last_num_instances, last_base_instance, sizeof(VkMultiDrawInfoEXT));
+                });
+            } else {
+                for (size_t j = first_i; j < i; ++j) {
+                    scheduler.Record([draw_params = list[j]](vk::CommandBuffer cmdbuf) {
+                        cmdbuf.Draw(draw_params.num_vertices, draw_params.num_instances, draw_params.base_vertex, draw_params.base_instance);
+                    });
                 }
-            });
+            }
         }
+    }
+    batched_draw_params.clear();
+}
 
-        // Log draw call
-        if (Settings::values.gpu_logging_enabled.GetValue() &&
-            Settings::values.gpu_log_vulkan_calls.GetValue()) {
-            const std::string params = is_indexed ?
-                fmt::format("vertices={}, instances={}, firstIndex={}, baseVertex={}, baseInstance={}",
-                            draw_params.num_vertices, draw_params.num_instances,
-                            draw_params.first_index, draw_params.base_vertex, draw_params.base_instance) :
-                fmt::format("vertices={}, instances={}, firstVertex={}, firstInstance={}",
-                            draw_params.num_vertices, draw_params.num_instances,
-                            draw_params.base_vertex, draw_params.base_instance);
-            GPU::Logging::GPULogger::GetInstance().LogVulkanCall(
-                is_indexed ? "vkCmdDrawIndexed" : "vkCmdDraw", params, VK_SUCCESS);
-        }
+void RasterizerVulkan::Draw(bool is_indexed, u32 instance_count) {
+    PrepareDraw(is_indexed, [this, is_indexed, instance_count] {
+        auto const& draw_state = maxwell3d->draw_manager->GetDrawState();
+        auto const num_instances = instance_count;
+        batched_draw_params.push_back(MakeDrawParams(draw_state, num_instances, is_indexed));
     });
 }
 
@@ -356,16 +366,16 @@ void RasterizerVulkan::DrawIndirect() {
 }
 
 void RasterizerVulkan::DrawTexture() {
-
     SCOPE_EXIT {
         gpu.TickWork();
     };
+    // Queued draws must be recorded before the texture draw, whether or not any state is dirty
+    FlushBatchedDraws();
     FlushWork();
 
     std::scoped_lock l{texture_cache.mutex};
     texture_cache.SynchronizeGraphicsDescriptors();
     texture_cache.UpdateRenderTargets(false);
-
     UpdateDynamicStates();
 
     query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64,
@@ -403,6 +413,8 @@ void RasterizerVulkan::DrawTexture() {
 }
 
 void RasterizerVulkan::Clear(u32 layer_count) {
+    // Queued draws must be recorded before the clear, whether or not any state is dirty
+    FlushBatchedDraws();
     FlushWork();
     gpu_memory->FlushCaching();
 
@@ -559,6 +571,8 @@ void RasterizerVulkan::Clear(u32 layer_count) {
 }
 
 void RasterizerVulkan::DispatchCompute() {
+    // Queued draws must be recorded before the dispatch, whether or not any state is dirty
+    FlushBatchedDraws();
     FlushWork();
     gpu_memory->FlushCaching();
 
@@ -934,7 +948,6 @@ void RasterizerVulkan::FlushWork() {
     static constexpr u32 DRAWS_TO_DISPATCH = 4096;
     static constexpr u32 CHECK_MASK = 7;
 #endif // ANDROID
-
     static_assert(DRAWS_TO_DISPATCH % (CHECK_MASK + 1) == 0);
     if ((++draw_counter & CHECK_MASK) != CHECK_MASK) {
         return;
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index b689c6b660..9d04a817e6 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -52,57 +52,48 @@ class StateTracker;
 
 class AccelerateDMA : public Tegra::Engines::AccelerateDMAInterface {
 public:
-    explicit AccelerateDMA(BufferCache& buffer_cache, TextureCache& texture_cache,
-                           Scheduler& scheduler);
-
+    explicit AccelerateDMA(BufferCache& buffer_cache, TextureCache& texture_cache, Scheduler& scheduler);
     bool BufferCopy(GPUVAddr start_address, GPUVAddr end_address, u64 amount) override;
-
     bool BufferClear(GPUVAddr src_address, u64 amount, u32 value) override;
-
-    bool ImageToBuffer(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::ImageOperand& src,
-                       const Tegra::DMA::BufferOperand& dst) override;
-
-    bool BufferToImage(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::BufferOperand& src,
-                       const Tegra::DMA::ImageOperand& dst) override;
-
+    bool ImageToBuffer(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::ImageOperand& src, const Tegra::DMA::BufferOperand& dst) override;
+    bool BufferToImage(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::BufferOperand& src, const Tegra::DMA::ImageOperand& dst) override;
 private:
     template
-    bool DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info,
-                            const Tegra::DMA::BufferOperand& src,
-                            const Tegra::DMA::ImageOperand& dst);
-
+    bool DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::BufferOperand& src, const Tegra::DMA::ImageOperand& dst);
     BufferCache& buffer_cache;
     TextureCache& texture_cache;
     Scheduler& scheduler;
 };
 
-class RasterizerVulkan final : public VideoCore::RasterizerInterface,
-                               protected VideoCommon::ChannelSetupCaches {
+class RasterizerVulkan final : public VideoCore::RasterizerInterface, protected VideoCommon::ChannelSetupCaches {
 public:
-    explicit RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_,
-                              Tegra::MaxwellDeviceMemoryManager& device_memory_,
-                              const Device& device_, MemoryAllocator& memory_allocator_,
-                              StateTracker& state_tracker_, Scheduler& scheduler_);
+    struct DrawParams { // One queued draw, consumed by FlushBatchedDraws()
+        u32 base_instance; // firstInstance of the draw call
+        u32 num_instances; // instanceCount of the draw call
+        u32 base_vertex; // firstVertex for non-indexed draws, vertexOffset for indexed draws
+        u32 num_vertices; // vertexCount for non-indexed draws, indexCount for indexed draws
+        u32 first_index; // firstIndex, passed only to indexed draw calls
+        bool is_indexed; // selects the indexed or the non-indexed draw call
+    };
+
+    explicit RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_, Tegra::MaxwellDeviceMemoryManager& device_memory_, const Device& device_, MemoryAllocator& memory_allocator_, StateTracker& state_tracker_, Scheduler& scheduler_);
     ~RasterizerVulkan() override;
 
+    void FlushBatchedDraws() override;
     void Draw(bool is_indexed, u32 instance_count) override;
     void DrawIndirect() override;
     void DrawTexture() override;
     void Clear(u32 layer_count) override;
     void DispatchCompute() override;
     void ResetCounter(VideoCommon::QueryType type) override;
-    void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type,
-               VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override;
+    void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override;
     void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override;
     void DisableGraphicsUniformBuffer(size_t stage, u32 index) override;
     void FlushAll() override;
-    void FlushRegion(DAddr addr, u64 size,
-                     VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
-    bool MustFlushRegion(DAddr addr, u64 size,
-                         VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
+    void FlushRegion(DAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
+    bool MustFlushRegion(DAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
     VideoCore::RasterizerDownloadArea GetFlushArea(DAddr addr, u64 size) override;
-    void InvalidateRegion(DAddr addr, u64 size,
-                          VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
+    void InvalidateRegion(DAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
     void InnerInvalidation(std::span> sequences) override;
     void OnCacheInvalidation(DAddr addr, u64 size) override;
     bool OnCPUWrite(DAddr addr, u64 size) override;
@@ -114,8 +105,7 @@ public:
     void SignalSyncPoint(u32 value) override;
     void SignalReference() override;
     void ReleaseFences(bool force = true) override;
-    void FlushAndInvalidateRegion(
-        DAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
+    void FlushAndInvalidateRegion(DAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
     void WaitForIdle() override;
     void FragmentBarrier() override;
     void TiledCacheBarrier() override;
@@ -136,9 +126,7 @@ public:
     void BindChannel(Tegra::Control::ChannelState& channel) override;
 
     void ReleaseChannel(s32 channel_id) override;
-    std::optional AccelerateDisplay(const Tegra::FramebufferConfig& config,
-                                    VAddr framebuffer_addr,
-                                    u32 pixel_stride);
+    std::optional AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride);
 
 private:
     static constexpr const u64 NEEDS_D24[] = {
@@ -223,6 +211,7 @@ private:
     boost::container::static_vector image_view_indices;
     std::array image_view_ids;
     boost::container::static_vector sampler_handles;
+    std::vector<DrawParams> batched_draw_params; // draws queued by Draw(), emptied by FlushBatchedDraws()
 
     u32 draw_counter = 0;
 };
diff --git a/src/video_core/renderer_vulkan/vk_state_tracker.h b/src/video_core/renderer_vulkan/vk_state_tracker.h
index 74bae9e181..601a9a68de 100644
--- a/src/video_core/renderer_vulkan/vk_state_tracker.h
+++ b/src/video_core/renderer_vulkan/vk_state_tracker.h
@@ -301,6 +301,7 @@ private:
         return is_dirty;
     }
 
+public: // NOTE(review): exposes every member declared below; needed for the state_tracker.flags reads in vk_rasterizer.cpp
     struct StencilProperties {
         u32 ref = 0;
         u32 write_mask = 0;