
[vk] Remove UniformRing and vkCmdResetQueryPool (#3270)

Fixes a performance regression in Xenoblade Chronicles DE and Pokemon Scarlet (among other games).
The cause of such a performance loss (more than ~10% in some cases) should still be investigated.

At its core, this partially reverts the following commits:

Reverts "[vk] Introduce Ring Buffers for Uniform Buffer (#2698)"
  This reverts commit 776958c79d.

Revert "[vk] Bring Vulkan closer to Spec (#180)"
  This reverts commit c8d6f23129.

Revert "[VK] PR 180 extension (#257)"
  This reverts commit 444b9f361e.

Revert "[vk] Fixes regression of PR #180 vk_scheduler.cpp for AMD GPU and Windows OS (#3071)"
  This reverts commit be218cc020.

Signed-off-by: Caio Oliveira <caiooliveirafarias0@gmail.com>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3270
Reviewed-by: Lizzie <lizzie@eden-emu.dev>
Reviewed-by: CamilleLaVey <camillelavey99@gmail.com>
Co-authored-by: Caio Oliveira <caiooliveirafarias0@gmail.com>
Co-committed-by: Caio Oliveira <caiooliveirafarias0@gmail.com>
commit 8440c2074d (committed by crueter)
6 changed files:

   41  src/video_core/renderer_vulkan/vk_buffer_cache.cpp
   28  src/video_core/renderer_vulkan/vk_buffer_cache.h
   13  src/video_core/renderer_vulkan/vk_query_cache.cpp
   29  src/video_core/renderer_vulkan/vk_scheduler.cpp
    1  src/video_core/vulkan_common/vulkan_wrapper.cpp
    4  src/video_core/vulkan_common/vulkan_wrapper.h

src/video_core/renderer_vulkan/vk_buffer_cache.cpp (41 changed lines)

@@ -338,11 +338,6 @@ BufferCacheRuntime::BufferCacheRuntime(const Device& device_, MemoryAllocator& m
         uint8_pass = std::make_unique<Uint8Pass>(device, scheduler, descriptor_pool, staging_pool,
                                                  compute_pass_descriptor_queue);
     }
-    const u32 ubo_align = static_cast<u32>(
-        device.GetUniformBufferAlignment() // check if the device has it
-    );
-    // add the ability to change the size in settings in future
-    uniform_ring.Init(memory_allocator, 8 * 1024 * 1024 /* 8 MiB */, ubo_align ? ubo_align : 256);
     quad_array_index_buffer = std::make_shared<QuadArrayIndexBuffer>(device_, memory_allocator_,
                                                                      scheduler_, staging_pool_);
     quad_strip_index_buffer = std::make_shared<QuadStripIndexBuffer>(device_, memory_allocator_,
@@ -361,41 +356,6 @@ void BufferCacheRuntime::FreeDeferredStagingBuffer(StagingBufferRef& ref) {
     staging_pool.FreeDeferred(ref);
 }
 
-void BufferCacheRuntime::UniformRing::Init(MemoryAllocator& alloc, u64 bytes, u32 alignment)
-{
-    for (size_t i = 0; i < NUM_FRAMES; ++i) {
-        VkBufferCreateInfo ci{
-            .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
-            .pNext = nullptr,
-            .flags = 0,
-            .size = bytes,
-            .usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
-            .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
-            .queueFamilyIndexCount = 0,
-            .pQueueFamilyIndices = nullptr,
-        };
-        buffers[i] = alloc.CreateBuffer(ci, MemoryUsage::Upload);
-        mapped[i] = buffers[i].Mapped().data();
-    }
-    size = bytes;
-    align = alignment ? alignment : 256;
-    head = 0;
-    current_frame = 0;
-}
-
-std::span<u8> BufferCacheRuntime::UniformRing::Alloc(u32 bytes, u32& out_offset) {
-    const u64 aligned = Common::AlignUp(head, static_cast<u64>(align));
-    u64 end = aligned + bytes;
-    if (end > size) {
-        return {}; // Fallback to staging pool
-    }
-    out_offset = static_cast<u32>(aligned);
-    head = end;
-    return {mapped[current_frame] + out_offset, bytes};
-}
-
 u64 BufferCacheRuntime::GetDeviceLocalMemory() const {
     return device.GetDeviceLocalMemory();
 }
@@ -416,7 +376,6 @@ void BufferCacheRuntime::TickFrame(Common::SlotVector<Buffer>& slot_buffers) noexcept {
     for (auto it = slot_buffers.begin(); it != slot_buffers.end(); it++) {
         it->ResetUsageTracking();
     }
-    uniform_ring.BeginFrame();
 }
 
 void BufferCacheRuntime::Finish() {
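For context, the removed UniformRing was a simple per-frame bump allocator. A minimal standalone sketch of its Alloc() arithmetic (assuming power-of-two alignment; AlignUp here is a stand-in for the Common:: helper, not eden code):

```cpp
#include <cstdint>

// Standalone sketch of the removed bump-allocation arithmetic (not eden code).
constexpr uint64_t AlignUp(uint64_t value, uint64_t align) {
    return (value + align - 1) & ~(align - 1); // assumes power-of-two align
}

// Worked example: head = 100, align = 256, request = 64 bytes
// -> allocation at offset 256, new head = 320. Once the aligned end would
//    exceed the 8 MiB ring, Alloc() returned an empty span and the caller
//    fell back to the staging pool.
static_assert(AlignUp(100, 256) == 256);
```

Each TickFrame() rotated to the next of three buffers and reset head to zero, so ring allocations were only valid for a single frame.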

src/video_core/renderer_vulkan/vk_buffer_cache.h (28 changed lines)

@@ -127,15 +127,9 @@ public:
     void BindTransformFeedbackBuffers(VideoCommon::HostBindings<Buffer>& bindings);
 
-    std::span<u8> BindMappedUniformBuffer([[maybe_unused]] size_t /*stage*/,
-                                          [[maybe_unused]] u32 /*binding_index*/,
+    std::span<u8> BindMappedUniformBuffer([[maybe_unused]] size_t stage,
+                                          [[maybe_unused]] u32 binding_index,
                                           u32 size) {
-        u32 offset = 0;
-        if (auto span = uniform_ring.Alloc(size, offset); !span.empty()) {
-            BindBuffer(*uniform_ring.buffers[uniform_ring.current_frame], offset, size);
-            return span;
-        }
-        // Fallback for giant requests
         const StagingBufferRef ref = staging_pool.Request(size, MemoryUsage::Upload);
         BindBuffer(ref.buffer, static_cast<u32>(ref.offset), size);
         return ref.mapped_span;
@@ -163,24 +157,6 @@ private:
     void ReserveNullBuffer();
     vk::Buffer CreateNullBuffer();
 
-    struct UniformRing {
-        static constexpr size_t NUM_FRAMES = 3;
-        std::array<vk::Buffer, NUM_FRAMES> buffers{};
-        std::array<u8*, NUM_FRAMES> mapped{};
-        u64 size = 0;
-        u64 head = 0;
-        u32 align = 256;
-        size_t current_frame = 0;
-
-        void Init(MemoryAllocator& alloc, u64 bytes, u32 alignment);
-
-        void BeginFrame() {
-            current_frame = (current_frame + 1) % NUM_FRAMES;
-            head = 0;
-        }
-
-        std::span<u8> Alloc(u32 bytes, u32& out_offset);
-    };
-    UniformRing uniform_ring;
-
     const Device& device;
     MemoryAllocator& memory_allocator;
     Scheduler& scheduler;
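With the ring gone, every mapped uniform bind takes the staging-pool path shown above. The contract a caller sees is unchanged either way: writable, already-bound host memory. A hypothetical call site (names are placeholders, not from this diff):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <span>

// Hypothetical call site (sketch): BindMappedUniformBuffer returns host-visible
// memory already bound at the right offset, so an upload is a single memcpy.
template <typename Runtime>
void UploadUniform(Runtime& runtime, std::size_t stage, std::uint32_t binding_index,
                   const void* data, std::uint32_t size) {
    const std::span<std::uint8_t> dst =
        runtime.BindMappedUniformBuffer(stage, binding_index, size);
    std::memcpy(dst.data(), data, size);
}
```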

src/video_core/renderer_vulkan/vk_query_cache.cpp (13 changed lines)

@@ -156,19 +156,6 @@ public:
         ReserveHostQuery();
-        // Ensure outside render pass
-        scheduler.RequestOutsideRenderPassOperationContext();
-        // Reset query pool outside render pass
-        scheduler.Record([query_pool = current_query_pool,
-                          query_index = current_bank_slot](vk::CommandBuffer cmdbuf) {
-            cmdbuf.ResetQueryPool(query_pool, static_cast<u32>(query_index), 1);
-        });
-        // Manually restart the render pass (required for vkCmdClearAttachments, etc.)
-        scheduler.RequestRenderpass(texture_cache.GetFramebuffer());
-        // Begin query inside the newly started render pass
         scheduler.Record([query_pool = current_query_pool,
                           query_index = current_bank_slot](vk::CommandBuffer cmdbuf) {
             const bool use_precise = Settings::IsGPULevelHigh();
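The removed block existed because Vulkan requires each query to be reset before it is begun again, and vkCmdResetQueryPool may only be recorded outside a render pass instance — hence the scheduler leaving and restarting the render pass. Stripped of the scheduler plumbing, the deleted sequence is roughly this raw-Vulkan sketch (handles and the begin info are assumed to exist):

```cpp
#include <vulkan/vulkan.h>

// Raw-Vulkan sketch of what the removed block recorded (not eden code).
void ResetAndBeginQuery(VkCommandBuffer cmdbuf, VkQueryPool query_pool,
                        uint32_t query_index,
                        const VkRenderPassBeginInfo& render_pass_begin,
                        bool use_precise) {
    // Valid only outside a render pass instance.
    vkCmdResetQueryPool(cmdbuf, query_pool, query_index, 1);
    // Restart the render pass, then begin the (now reset) query inside it.
    vkCmdBeginRenderPass(cmdbuf, &render_pass_begin, VK_SUBPASS_CONTENTS_INLINE);
    vkCmdBeginQuery(cmdbuf, query_pool, query_index,
                    use_precise ? VK_QUERY_CONTROL_PRECISE_BIT : 0);
}
```

The revert drops this per-use reset along with the wrapper method and dispatch entry below; where query resets happen after the revert is outside this diff.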

src/video_core/renderer_vulkan/vk_scheduler.cpp (29 changed lines)

@@ -302,8 +302,6 @@ void Scheduler::EndRenderPass()
                       images = renderpass_images,
                       ranges = renderpass_image_ranges](vk::CommandBuffer cmdbuf) {
         std::array<VkImageMemoryBarrier, 9> barriers;
-        VkPipelineStageFlags src_stages = 0;
         for (size_t i = 0; i < num_images; ++i) {
             const VkImageSubresourceRange& range = ranges[i];
             const bool is_color = (range.aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) != 0;
@@ -312,20 +310,14 @@ void Scheduler::EndRenderPass()
                                                                | VK_IMAGE_ASPECT_STENCIL_BIT)) != 0;
             VkAccessFlags src_access = 0;
-            VkPipelineStageFlags this_stage = 0;
-            if (is_color) {
-                src_access |= VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
-                this_stage |= VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
-            }
-            if (is_depth_stencil) {
-                src_access |= VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
-                this_stage |= VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT
-                              | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT;
-            }
-            src_stages |= this_stage;
+            if (is_color)
+                src_access |= VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
+            else if (is_depth_stencil)
+                src_access |= VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
+            else
+                src_access |= VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT
+                              | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
             barriers[i] = VkImageMemoryBarrier{
                 .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
@@ -344,15 +336,10 @@ void Scheduler::EndRenderPass()
                 .subresourceRange = range,
             };
         }
-        // Graft: ensure explicit fragment tests + color output stages are always synchronized (AMD/Windows fix)
-        src_stages |= VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
-                      VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT |
-                      VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
         cmdbuf.EndRenderPass();
-        cmdbuf.PipelineBarrier(src_stages,
+        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
+                                   VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT |
+                                   VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
                                VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
                               0,
                               nullptr,
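The net effect is that the end-of-render-pass barrier returns to a fixed, conservative source-stage mask instead of one accumulated per attachment aspect (the per-aspect mask came from PR #180 and its AMD/Windows follow-up #3071). In raw Vulkan, the restored call is roughly the following sketch (not eden code; the wrapper call's trailing arguments are truncated in the diff above):

```cpp
#include <vulkan/vulkan.h>

// Raw-Vulkan sketch of the restored barrier: conservative fragment-test +
// color-output source stages synchronized against all later commands.
void EndRenderPassBarrier(VkCommandBuffer cmdbuf, uint32_t num_images,
                          const VkImageMemoryBarrier* barriers) {
    vkCmdEndRenderPass(cmdbuf);
    vkCmdPipelineBarrier(cmdbuf,
                         VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
                             VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT |
                             VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
                         VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
                         0,          // dependency flags
                         0, nullptr, // no global memory barriers
                         0, nullptr, // no buffer memory barriers
                         num_images, barriers);
}
```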

src/video_core/vulkan_common/vulkan_wrapper.cpp (1 changed line)

@@ -121,7 +121,6 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept {
     X(vkCmdEndConditionalRenderingEXT);
     X(vkCmdEndQuery);
     X(vkCmdEndRenderPass);
-    X(vkCmdResetQueryPool);
     X(vkCmdEndTransformFeedbackEXT);
     X(vkCmdEndDebugUtilsLabelEXT);
     X(vkCmdFillBuffer);

src/video_core/vulkan_common/vulkan_wrapper.h (4 changed lines)

@@ -221,7 +221,6 @@ struct DeviceDispatch : InstanceDispatch {
     PFN_vkCmdEndConditionalRenderingEXT vkCmdEndConditionalRenderingEXT{};
     PFN_vkCmdEndDebugUtilsLabelEXT vkCmdEndDebugUtilsLabelEXT{};
     PFN_vkCmdEndQuery vkCmdEndQuery{};
-    PFN_vkCmdResetQueryPool vkCmdResetQueryPool{};
     PFN_vkCmdEndRenderPass vkCmdEndRenderPass{};
     PFN_vkCmdEndTransformFeedbackEXT vkCmdEndTransformFeedbackEXT{};
     PFN_vkCmdFillBuffer vkCmdFillBuffer{};
@@ -1144,9 +1143,6 @@ public:
     VkCommandBuffer operator*() const noexcept {
         return handle;
     }
-    void ResetQueryPool(VkQueryPool query_pool, uint32_t first, uint32_t count) const noexcept {
-        dld->vkCmdResetQueryPool(handle, query_pool, first, count);
-    }
     void Begin(const VkCommandBufferBeginInfo& begin_info) const {
         Check(dld->vkBeginCommandBuffer(handle, &begin_info));
     }
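Removing both the PFN_vkCmdResetQueryPool dispatch member and the CommandBuffer::ResetQueryPool wrapper means any leftover call site fails to compile, which is why the vk_query_cache.cpp call site is removed in the same commit. For reference, the loader's X macro seen above expands along these lines (simplified sketch, assumed rather than the exact eden macro):

```cpp
// Simplified sketch of the dispatch-loading pattern (assumed, not verbatim):
// each X(name) entry fetches the function pointer into the matching member.
#define X(name) dld.name = reinterpret_cast<PFN_##name>(dld.vkGetDeviceProcAddr(device, #name))
X(vkCmdEndQuery); // after this commit there is simply no X(vkCmdResetQueryPool) entry
#undef X
```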
