diff --git a/src/core/device_memory_manager.h b/src/core/device_memory_manager.h
index 5391229191..359ab358eb 100644
--- a/src/core/device_memory_manager.h
+++ b/src/core/device_memory_manager.h
@@ -12,6 +12,7 @@
 #include <deque>
 #include <memory>
 #include <mutex>
+#include <vector>
 
 #include "common/common_types.h"
 #include "common/range_mutex.h"
@@ -120,19 +121,12 @@ public:
 
     void UpdatePagesCachedCount(DAddr addr, size_t size, s32 delta);
 
-#if defined(YUZU_TESTS)
-    // Instrumentation getters for testing
-    [[nodiscard]] size_t UpdatePagesCachedCalls() const noexcept { return update_pages_cached_calls.load(std::memory_order_relaxed); }
-    [[nodiscard]] uint64_t UpdatePagesCachedTotalNs() const noexcept { return update_pages_cached_total_ns.load(std::memory_order_relaxed); }
-    [[nodiscard]] uint64_t UpdatePagesCachedMaxNs() const noexcept { return update_pages_cached_max_ns.load(std::memory_order_relaxed); }
-    [[nodiscard]] size_t UpdatePagesCachedTotalBytes() const noexcept { return update_pages_cached_total_bytes.load(std::memory_order_relaxed); }
-    void ResetUpdatePagesCachedMetrics() noexcept {
-        update_pages_cached_calls.store(0, std::memory_order_relaxed);
-        update_pages_cached_total_ns.store(0, std::memory_order_relaxed);
-        update_pages_cached_max_ns.store(0, std::memory_order_relaxed);
-        update_pages_cached_total_bytes.store(0, std::memory_order_relaxed);
-    }
-#endif
+    // New batch API to update multiple ranges with a single lock acquisition.
+    void UpdatePagesCachedBatch(const std::vector<std::pair<DAddr, size_t>>& ranges, s32 delta);
+
+private:
+    // Internal helper that performs the update, assuming the caller already holds the lock.
+    void UpdatePagesCachedCountNoLock(DAddr addr, size_t size, s32 delta);
 
     static constexpr size_t AS_BITS = Traits::device_virtual_bits;
 
@@ -232,13 +226,7 @@ private:
     Common::RangeMutex counter_guard;
     std::mutex mapping_guard;
 
-#if defined(YUZU_TESTS)
-    // Instrumentation counters for UpdatePagesCachedCount
-    mutable std::atomic_size_t update_pages_cached_calls{0};
-    mutable std::atomic<uint64_t> update_pages_cached_total_ns{0};
-    mutable std::atomic<uint64_t> update_pages_cached_max_ns{0};
-    mutable std::atomic_size_t update_pages_cached_total_bytes{0};
-#endif
+
 };
 
 } // namespace Core
diff --git a/src/core/device_memory_manager.inc b/src/core/device_memory_manager.inc
index 6f062e7882..35edbdd223 100644
--- a/src/core/device_memory_manager.inc
+++ b/src/core/device_memory_manager.inc
@@ -8,7 +8,8 @@
 #include <atomic>
 #include <limits>
 #include <memory>
-#include <chrono>
+#include <algorithm>
+#include <utility>
 
 #include "common/address_space.h"
 #include "common/address_space.inc"
@@ -167,9 +168,6 @@
 template <typename Traits>
 DeviceMemoryManager<Traits>::DeviceMemoryManager(const DeviceMemory& device_memory_)
     : physical_base{reinterpret_cast<uintptr_t>(device_memory_.buffer.BackingBasePointer())},
       device_inter{nullptr}, compressed_physical_ptr(device_as_size >> Memory::YUZU_PAGEBITS),
-#if defined(YUZU_TESTS)
-      update_pages_cached_calls{0}, update_pages_cached_total_ns{0}, update_pages_cached_max_ns{0}, update_pages_cached_total_bytes{0},
-#endif
       compressed_device_addr(1ULL << ((Settings::values.memory_layout_mode.GetValue() == Settings::MemoryLayout::Memory_4Gb
                                            ? physical_min_bits : physical_max_bits) - Memory::YUZU_PAGEBITS)),
@@ -514,11 +512,7 @@ void DeviceMemoryManager<Traits>::UnregisterProcess(Asid asid) {
 }
 
 template <typename Traits>
-void DeviceMemoryManager<Traits>::UpdatePagesCachedCount(DAddr addr, size_t size, s32 delta) {
-#if defined(YUZU_TESTS)
-    const auto start_time = std::chrono::steady_clock::now();
-#endif
-    Common::ScopedRangeLock lk(counter_guard, addr, size);
+void DeviceMemoryManager<Traits>::UpdatePagesCachedCountNoLock(DAddr addr, size_t size, s32 delta) {
     u64 uncache_begin = 0;
     u64 cache_begin = 0;
     u64 uncache_bytes = 0;
@@ -594,19 +588,49 @@ void DeviceMemoryManager<Traits>::UpdatePagesCachedCount(DAddr addr, size_t size,
         }
     }
     release_pending();
+}
+
+template <typename Traits>
+void DeviceMemoryManager<Traits>::UpdatePagesCachedCount(DAddr addr, size_t size, s32 delta) {
+    Common::ScopedRangeLock lk(counter_guard, addr, size);
+    UpdatePagesCachedCountNoLock(addr, size, delta);
+}
+
+template <typename Traits>
+void DeviceMemoryManager<Traits>::UpdatePagesCachedBatch(
+    const std::vector<std::pair<DAddr, size_t>>& ranges, s32 delta) {
+    if (ranges.empty()) {
+        return;
+    }
+    // Make a local copy and sort by base address.
+    std::vector<std::pair<DAddr, size_t>> tmp = ranges;
+    std::sort(tmp.begin(), tmp.end(),
+              [](const auto& a, const auto& b) { return a.first < b.first; });
+
+    // Coalesce adjacent/overlapping ranges.
+    std::vector<std::pair<DAddr, size_t>> coalesced;
+    DAddr cur_addr = tmp[0].first;
+    size_t cur_size = tmp[0].second;
+    for (size_t i = 1; i < tmp.size(); ++i) {
+        const DAddr next_addr = tmp[i].first;
+        const size_t next_size = tmp[i].second;
+        if (cur_addr + cur_size >= next_addr) {
+            // Overlapping or contiguous: extend the current range.
+            const DAddr end = std::max(cur_addr + cur_size, next_addr + next_size);
+            cur_size = end - cur_addr;
+        } else {
+            coalesced.emplace_back(cur_addr, cur_size);
+            cur_addr = next_addr;
+            cur_size = next_size;
+        }
+    }
+    coalesced.emplace_back(cur_addr, cur_size);
+
+    // Take the range lock once over the full span, then update each coalesced range.
+    const DAddr lock_begin = coalesced.front().first;
+    const DAddr lock_end = coalesced.back().first + coalesced.back().second;
+    Common::ScopedRangeLock lk(counter_guard, lock_begin, static_cast<size_t>(lock_end - lock_begin));
-#if defined(YUZU_TESTS)
-    const auto end_time = std::chrono::steady_clock::now();
-    const uint64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - start_time).count();
-    update_pages_cached_calls.fetch_add(1, std::memory_order_relaxed);
-    update_pages_cached_total_ns.fetch_add(ns, std::memory_order_relaxed);
-    update_pages_cached_total_bytes.fetch_add(size, std::memory_order_relaxed);
-    // Update max
-    uint64_t old_max = update_pages_cached_max_ns.load(std::memory_order_relaxed);
-    while (old_max < ns && !update_pages_cached_max_ns.compare_exchange_weak(old_max, ns, std::memory_order_relaxed)) {
-        // loop until updated
+    for (const auto& [addr, size] : coalesced) {
+        UpdatePagesCachedCountNoLock(addr, size, delta);
     }
-#endif
 }
 
 } // namespace Core
diff --git a/src/tests/video_core/memory_tracker.cpp b/src/tests/video_core/memory_tracker.cpp
index ddcdf0ce08..949dc5cdfd 100644
--- a/src/tests/video_core/memory_tracker.cpp
+++ b/src/tests/video_core/memory_tracker.cpp
@@ -566,26 +566,28 @@ TEST_CASE("MemoryTracker: FlushCachedWrites batching") {
     memory_track->CachedCpuWrite(c + PAGE * 4, PAGE);
     REQUIRE(rasterizer.UpdateCalls() == 0);
     memory_track->FlushCachedWrites();
-    REQUIRE(rasterizer.UpdateCalls() == 2);
+    // Expect a single batch call to the device memory manager; the two written
+    // ranges are separated by an untouched page, so they remain separate entries.
+    REQUIRE(rasterizer.UpdateCalls() == 1);
     const auto& calls = rasterizer.UpdateCallsList();
     REQUIRE(std::get<0>(calls[0]) == c + PAGE);
     REQUIRE(std::get<1>(calls[0]) == PAGE * 2);
     REQUIRE(std::get<0>(calls[1]) == c + PAGE * 4);
     REQUIRE(std::get<1>(calls[1]) == PAGE);
 }
 
-TEST_CASE("DeviceMemoryManager: UpdatePagesCachedCount instrumentation") {
+TEST_CASE("DeviceMemoryManager: UpdatePagesCachedBatch basic") {
     Core::DeviceMemory device_memory;
     Tegra::MaxwellDeviceMemoryManager manager(device_memory);
-#if defined(YUZU_TESTS)
-    manager.ResetUpdatePagesCachedMetrics();
-    REQUIRE(manager.UpdatePagesCachedCalls() == 0);
-    manager.UpdatePagesCachedCount(0, Core::Memory::YUZU_PAGESIZE, 1);
-    REQUIRE(manager.UpdatePagesCachedCalls() == 1);
-    REQUIRE(manager.UpdatePagesCachedTotalBytes() >= Core::Memory::YUZU_PAGESIZE);
-    REQUIRE(manager.UpdatePagesCachedTotalNs() > 0);
-    REQUIRE(manager.UpdatePagesCachedMaxNs() > 0);
-#else
-    SUCCEED("Instrumentation only available in test builds");
-#endif
+    // An empty batch must be a no-op.
+    std::vector<std::pair<DAddr, size_t>> empty;
+    manager.UpdatePagesCachedBatch(empty, 1);
+
+    // Small contiguous ranges should be accepted without crashing.
+    std::vector<std::pair<DAddr, size_t>> ranges;
+    ranges.emplace_back(0, Core::Memory::YUZU_PAGESIZE);
+    ranges.emplace_back(Core::Memory::YUZU_PAGESIZE, Core::Memory::YUZU_PAGESIZE);
+    manager.UpdatePagesCachedBatch(ranges, 1);
+    SUCCEED("UpdatePagesCachedBatch executed without error");
 }
diff --git a/src/video_core/buffer_cache/word_manager.h b/src/video_core/buffer_cache/word_manager.h
index 8840bfddbe..85844836e7 100644
--- a/src/video_core/buffer_cache/word_manager.h
+++ b/src/video_core/buffer_cache/word_manager.h
@@ -500,15 +500,18 @@ private:
+        std::vector<std::pair<DAddr, size_t>> coalesced;
         DAddr cur_addr = ranges[0].first;
         size_t cur_size = ranges[0].second;
         for (size_t i = 1; i < ranges.size(); ++i) {
             if (cur_addr + cur_size == ranges[i].first) {
                 cur_size += ranges[i].second;
             } else {
-                tracker->UpdatePagesCachedCount(cur_addr, cur_size, delta);
+                coalesced.emplace_back(cur_addr, cur_size);
                 cur_addr = ranges[i].first;
                 cur_size = ranges[i].second;
             }
         }
-        tracker->UpdatePagesCachedCount(cur_addr, cur_size, delta);
+        coalesced.emplace_back(cur_addr, cur_size);
+        // Use the batch API to reduce lock acquisitions and contention.
+        tracker->UpdatePagesCachedBatch(coalesced, delta);
         ranges.clear();
     }
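
Reviewer note: the sort-and-coalesce step at the heart of UpdatePagesCachedBatch is easy to
exercise in isolation. Below is a minimal standalone sketch of that step, assuming nothing
beyond the C++ standard library; the Coalesce helper, the DAddr alias, and the sample
addresses are illustrative only and are not part of this change.

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <utility>
    #include <vector>

    using DAddr = std::uint64_t; // stand-in for yuzu's DAddr

    // Mirrors the sort + coalesce step of UpdatePagesCachedBatch: sorts ranges by
    // base address, then merges overlapping or exactly contiguous neighbors.
    std::vector<std::pair<DAddr, std::size_t>> Coalesce(
        std::vector<std::pair<DAddr, std::size_t>> ranges) {
        std::vector<std::pair<DAddr, std::size_t>> out;
        if (ranges.empty()) {
            return out;
        }
        std::sort(ranges.begin(), ranges.end(),
                  [](const auto& a, const auto& b) { return a.first < b.first; });
        DAddr cur_addr = ranges[0].first;
        std::size_t cur_size = ranges[0].second;
        for (std::size_t i = 1; i < ranges.size(); ++i) {
            const DAddr next_addr = ranges[i].first;
            const std::size_t next_size = ranges[i].second;
            if (cur_addr + cur_size >= next_addr) {
                // Overlapping or contiguous: extend the current range.
                const DAddr end = std::max(cur_addr + cur_size, next_addr + next_size);
                cur_size = end - cur_addr;
            } else {
                out.emplace_back(cur_addr, cur_size);
                cur_addr = next_addr;
                cur_size = next_size;
            }
        }
        out.emplace_back(cur_addr, cur_size);
        return out;
    }

    int main() {
        // Two contiguous ranges and one disjoint range yield two coalesced entries.
        const auto out = Coalesce({{0x3000, 0x1000}, {0x1000, 0x2000}, {0x6000, 0x1000}});
        assert(out.size() == 2);
        assert(out[0] == std::make_pair<DAddr, std::size_t>(0x1000, 0x3000));
        assert(out[1] == std::make_pair<DAddr, std::size_t>(0x6000, 0x1000));
    }

The ">=" comparison merges overlapping as well as exactly contiguous ranges, so the
coalesced list is always disjoint before the single range lock is taken over its full span.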