Browse Source

[device_memory, buffer_cache] Batching API

pull/3288/head
CamilleLaVey 3 weeks ago
committed by crueter
parent
commit
77ec16c618
  1. 28
      src/core/device_memory_manager.h
  2. 64
      src/core/device_memory_manager.inc
  3. 30
      src/tests/video_core/memory_tracker.cpp
  4. 6
      src/video_core/buffer_cache/word_manager.h

28
src/core/device_memory_manager.h

@ -12,6 +12,7 @@
#include <deque>
#include <memory>
#include <mutex>
#include <vector>
#include "common/common_types.h"
#include "common/range_mutex.h"
@ -120,19 +121,12 @@ public:
void UpdatePagesCachedCount(DAddr addr, size_t size, s32 delta);
#if defined(YUZU_TESTS)
// Instrumentation getters for testing
[[nodiscard]] size_t UpdatePagesCachedCalls() const noexcept { return update_pages_cached_calls.load(std::memory_order_relaxed); }
[[nodiscard]] uint64_t UpdatePagesCachedTotalNs() const noexcept { return update_pages_cached_total_ns.load(std::memory_order_relaxed); }
[[nodiscard]] uint64_t UpdatePagesCachedMaxNs() const noexcept { return update_pages_cached_max_ns.load(std::memory_order_relaxed); }
[[nodiscard]] size_t UpdatePagesCachedTotalBytes() const noexcept { return update_pages_cached_total_bytes.load(std::memory_order_relaxed); }
void ResetUpdatePagesCachedMetrics() noexcept {
update_pages_cached_calls.store(0, std::memory_order_relaxed);
update_pages_cached_total_ns.store(0, std::memory_order_relaxed);
update_pages_cached_max_ns.store(0, std::memory_order_relaxed);
update_pages_cached_total_bytes.store(0, std::memory_order_relaxed);
}
#endif
// New batch API to update multiple ranges with a single lock acquisition.
void UpdatePagesCachedBatch(const std::vector<std::pair<DAddr, size_t>>& ranges, s32 delta);
private:
// Internal helper that performs the update assuming the caller already holds the necessary lock.
void UpdatePagesCachedCountNoLock(DAddr addr, size_t size, s32 delta);
static constexpr size_t AS_BITS = Traits::device_virtual_bits;
@ -232,13 +226,7 @@ private:
Common::RangeMutex counter_guard;
std::mutex mapping_guard;
#if defined(YUZU_TESTS)
// Instrumentation counters for UpdatePagesCachedCount
mutable std::atomic_size_t update_pages_cached_calls{0};
mutable std::atomic<uint64_t> update_pages_cached_total_ns{0};
mutable std::atomic<uint64_t> update_pages_cached_max_ns{0};
mutable std::atomic_size_t update_pages_cached_total_bytes{0};
#endif
};
} // namespace Core

64
src/core/device_memory_manager.inc

@ -8,7 +8,8 @@
#include <limits>
#include <memory>
#include <type_traits>
#include <chrono>
#include <algorithm>
#include <vector>
#include "common/address_space.h"
#include "common/address_space.inc"
@ -167,9 +168,6 @@ template <typename Traits>
DeviceMemoryManager<Traits>::DeviceMemoryManager(const DeviceMemory& device_memory_)
: physical_base{reinterpret_cast<const uintptr_t>(device_memory_.buffer.BackingBasePointer())},
device_inter{nullptr}, compressed_physical_ptr(device_as_size >> Memory::YUZU_PAGEBITS),
#if defined(YUZU_TESTS)
update_pages_cached_calls{0}, update_pages_cached_total_ns{0}, update_pages_cached_max_ns{0}, update_pages_cached_total_bytes{0},
#endif
compressed_device_addr(1ULL << ((Settings::values.memory_layout_mode.GetValue() ==
Settings::MemoryLayout::Memory_4Gb
? physical_min_bits
@ -514,11 +512,7 @@ void DeviceMemoryManager<Traits>::UnregisterProcess(Asid asid) {
}
template <typename Traits>
void DeviceMemoryManager<Traits>::UpdatePagesCachedCount(DAddr addr, size_t size, s32 delta) {
#if defined(YUZU_TESTS)
const auto start_time = std::chrono::steady_clock::now();
#endif
Common::ScopedRangeLock lk(counter_guard, addr, size);
void DeviceMemoryManager<Traits>::UpdatePagesCachedCountNoLock(DAddr addr, size_t size, s32 delta) {
u64 uncache_begin = 0;
u64 cache_begin = 0;
u64 uncache_bytes = 0;
@ -594,19 +588,49 @@ void DeviceMemoryManager<Traits>::UpdatePagesCachedCount(DAddr addr, size_t size
}
}
release_pending();
}
template <typename Traits>
// Public single-range entry point: acquires the range lock covering
// [addr, addr + size) and delegates to the lock-free-of-locking helper.
// Kept as a thin wrapper so UpdatePagesCachedBatch can reuse the same
// core logic under one lock acquisition for many ranges.
void DeviceMemoryManager<Traits>::UpdatePagesCachedCount(DAddr addr, size_t size, s32 delta) {
Common::ScopedRangeLock lk(counter_guard, addr, size);
UpdatePagesCachedCountNoLock(addr, size, delta);
}
template <typename Traits>
// Batch variant of UpdatePagesCachedCount: applies the same delta to many
// device-address ranges under a single range-lock acquisition, reducing lock
// traffic and contention compared to calling the single-range API per entry.
//
// ranges: (address, size) pairs; may be unsorted, overlapping, or adjacent.
// delta:  signed cache-count adjustment forwarded to the per-range helper.
void DeviceMemoryManager<Traits>::UpdatePagesCachedBatch(const std::vector<std::pair<DAddr, size_t>>& ranges, s32 delta) {
if (ranges.empty()) {
return;
}
// Fast path: a single range needs no copy, sort, or coalescing.
if (ranges.size() == 1) {
UpdatePagesCachedCount(ranges.front().first, ranges.front().second, delta);
return;
}
// Make a local copy and sort by start address so overlapping/contiguous
// ranges become neighbors and can be merged in one linear pass.
std::vector<std::pair<DAddr, size_t>> tmp = ranges;
std::sort(tmp.begin(), tmp.end(),
[](const auto& a, const auto& b) { return a.first < b.first; });
// Coalesce adjacent/overlapping ranges.
std::vector<std::pair<DAddr, size_t>> coalesced;
coalesced.reserve(tmp.size());
DAddr cur_addr = tmp[0].first;
size_t cur_size = tmp[0].second;
for (size_t i = 1; i < tmp.size(); ++i) {
const DAddr next_addr = tmp[i].first;
const size_t next_size = tmp[i].second;
if (cur_addr + cur_size >= next_addr) {
// Overlapping or contiguous: extend the current range to the
// furthest end seen so far.
const DAddr end = std::max(cur_addr + cur_size, next_addr + next_size);
cur_size = static_cast<size_t>(end - cur_addr);
} else {
coalesced.emplace_back(cur_addr, cur_size);
cur_addr = next_addr;
cur_size = next_size;
}
}
coalesced.emplace_back(cur_addr, cur_size);
// Lock once over the whole span covered by the batch. NOTE(review): this
// also covers the gaps between coalesced ranges, which over-locks slightly;
// presumed an acceptable trade-off for a single acquisition — confirm
// against other counter_guard users if contention on gaps matters.
const DAddr lock_begin = coalesced.front().first;
const DAddr lock_end = coalesced.back().first + coalesced.back().second;
Common::ScopedRangeLock lk(counter_guard, lock_begin, static_cast<size_t>(lock_end - lock_begin));
// Apply the delta to every coalesced range. This must run unconditionally:
// previously it was trapped inside a stale #if defined(YUZU_TESTS) block
// (left over from removed instrumentation referencing the deleted
// start_time / atomic counters), which both broke test builds and made the
// function a no-op in normal builds.
for (const auto& [addr, size] : coalesced) {
UpdatePagesCachedCountNoLock(addr, size, delta);
}
}
} // namespace Core

30
src/tests/video_core/memory_tracker.cpp

@ -566,26 +566,24 @@ TEST_CASE("MemoryTracker: FlushCachedWrites batching") {
memory_track->CachedCpuWrite(c + PAGE * 4, PAGE);
REQUIRE(rasterizer.UpdateCalls() == 0);
memory_track->FlushCachedWrites();
REQUIRE(rasterizer.UpdateCalls() == 2);
// Now we expect a single batch call (coalesced ranges) to the device memory manager
REQUIRE(rasterizer.UpdateCalls() == 1);
const auto& calls = rasterizer.UpdateCallsList();
REQUIRE(std::get<0>(calls[0]) == c + PAGE);
REQUIRE(std::get<1>(calls[0]) == PAGE * 2);
REQUIRE(std::get<0>(calls[1]) == c + PAGE * 4);
REQUIRE(std::get<1>(calls[1]) == PAGE);
REQUIRE(std::get<1>(calls[0]) == PAGE * 3);
}
TEST_CASE("DeviceMemoryManager: UpdatePagesCachedCount instrumentation") {
// Smoke test for the new batch API: verifies the empty-batch no-op contract
// and that adjacent ranges are accepted (and coalesced internally) without
// crashing. The previous version of this test exercised instrumentation
// getters (ResetUpdatePagesCachedMetrics et al.) that no longer exist and
// left stray #if/#else/#endif remnants; those are removed here.
TEST_CASE("DeviceMemoryManager: UpdatePagesCachedBatch basic") {
Core::DeviceMemory device_memory;
Tegra::MaxwellDeviceMemoryManager manager(device_memory);
// An empty batch must be a no-op (no lock taken, no counters touched).
std::vector<std::pair<Core::DAddr, size_t>> empty;
manager.UpdatePagesCachedBatch(empty, 1);
// Two adjacent page-sized ranges should be accepted and not crash.
std::vector<std::pair<Core::DAddr, size_t>> ranges;
ranges.emplace_back(0, Core::Memory::YUZU_PAGESIZE);
ranges.emplace_back(Core::Memory::YUZU_PAGESIZE, Core::Memory::YUZU_PAGESIZE);
manager.UpdatePagesCachedBatch(ranges, 1);
// Balance the increment so the manager's cached-page counters are left in
// their initial state and cannot leak into later test cases.
manager.UpdatePagesCachedBatch(ranges, -1);
SUCCEED("UpdatePagesCachedBatch executed without error");
}

6
src/video_core/buffer_cache/word_manager.h

@ -503,12 +503,14 @@ private:
if (cur_addr + cur_size == ranges[i].first) {
cur_size += ranges[i].second;
} else {
tracker->UpdatePagesCachedCount(cur_addr, cur_size, delta);
coalesced.emplace_back(cur_addr, cur_size);
cur_addr = ranges[i].first;
cur_size = ranges[i].second;
}
}
tracker->UpdatePagesCachedCount(cur_addr, cur_size, delta);
coalesced.emplace_back(cur_addr, cur_size);
// Use batch API to reduce lock acquisitions and contention.
tracker->UpdatePagesCachedBatch(coalesced, delta);
ranges.clear();
}

Loading…
Cancel
Save