From 1a9b4b37e1f84822d8d05f1e5037d957e9cbfe84 Mon Sep 17 00:00:00 2001
From: CamilleLaVey
Date: Sun, 18 Jan 2026 03:48:09 +0100
Subject: [PATCH] [buffer_cache] Add batching support for memory tracker updates (#3288)

Add batching/coalescing of ranges in WordManager to reduce per-page calls
to UpdatePagesCachedCount. The collected ranges are sorted, merged, and
forwarded to a new DeviceMemoryManager::UpdatePagesCachedBatch that takes
the range lock once. Also add a test verifying that FlushCachedWrites
coalesces its ranges (reducing the number of UpdatePagesCachedCount calls)
and records each call so it can be inspected. A short usage sketch of the
new batch API follows the diff.

Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3288
Reviewed-by: Maufeat
Reviewed-by: DraVee
Co-authored-by: CamilleLaVey
Co-committed-by: CamilleLaVey
---
 src/core/device_memory_manager.h           | 14 +++++-
 src/core/device_memory_manager.inc         | 51 ++++++++++++++++++++-
 src/tests/video_core/memory_tracker.cpp    | 43 ++++++++++++++++++
 src/video_core/buffer_cache/word_manager.h | 53 ++++++++++++++++++++--
 4 files changed, 155 insertions(+), 6 deletions(-)

diff --git a/src/core/device_memory_manager.h b/src/core/device_memory_manager.h
index 6dcf7bb228..249dc08f5f 100644
--- a/src/core/device_memory_manager.h
+++ b/src/core/device_memory_manager.h
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -9,6 +12,7 @@
 #include <deque>
 #include <memory>
 #include <mutex>
+#include <vector>
 
 #include "common/common_types.h"
 #include "common/range_mutex.h"
@@ -44,6 +48,7 @@ public:
     ~DeviceMemoryManager();
 
     static constexpr bool HAS_FLUSH_INVALIDATION = true;
+    static constexpr size_t AS_BITS = Traits::device_virtual_bits;
 
     void BindInterface(DeviceInterface* device_inter);
 
@@ -117,7 +122,12 @@ public:
 
     void UpdatePagesCachedCount(DAddr addr, size_t size, s32 delta);
 
-    static constexpr size_t AS_BITS = Traits::device_virtual_bits;
+    // New batch API to update multiple ranges with a single lock acquisition.
+    void UpdatePagesCachedBatch(const std::vector<std::pair<DAddr, size_t>>& ranges, s32 delta);
+
+private:
+    // Internal helper that performs the update; assumes the caller already holds the necessary lock.
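+    // Both UpdatePagesCachedCount and UpdatePagesCachedBatch acquire counter_guard
+    // before delegating here, so this helper itself takes no lock.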
+    void UpdatePagesCachedCountNoLock(DAddr addr, size_t size, s32 delta);
 
 private:
     static constexpr size_t device_virtual_bits = Traits::device_virtual_bits;

diff --git a/src/core/device_memory_manager.inc b/src/core/device_memory_manager.inc
index 52dff5df9a..35edbdd223 100644
--- a/src/core/device_memory_manager.inc
+++ b/src/core/device_memory_manager.inc
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -5,6 +8,8 @@
 #include <atomic>
 #include <limits>
 #include <memory>
+#include <algorithm>
+#include <vector>
 
 #include "common/address_space.h"
 #include "common/address_space.inc"
@@ -507,8 +512,7 @@ void DeviceMemoryManager<Traits>::UnregisterProcess(Asid asid) {
 }
 
 template <typename Traits>
-void DeviceMemoryManager<Traits>::UpdatePagesCachedCount(DAddr addr, size_t size, s32 delta) {
-    Common::ScopedRangeLock lk(counter_guard, addr, size);
+void DeviceMemoryManager<Traits>::UpdatePagesCachedCountNoLock(DAddr addr, size_t size, s32 delta) {
     u64 uncache_begin = 0;
     u64 cache_begin = 0;
     u64 uncache_bytes = 0;
@@ -586,4 +590,47 @@ void DeviceMemoryManager<Traits>::UpdatePagesCachedCount(DAddr addr, size_t size
     release_pending();
 }
 
+template <typename Traits>
+void DeviceMemoryManager<Traits>::UpdatePagesCachedCount(DAddr addr, size_t size, s32 delta) {
+    Common::ScopedRangeLock lk(counter_guard, addr, size);
+    UpdatePagesCachedCountNoLock(addr, size, delta);
+}
+
+template <typename Traits>
+void DeviceMemoryManager<Traits>::UpdatePagesCachedBatch(
+    const std::vector<std::pair<DAddr, size_t>>& ranges, s32 delta) {
+    if (ranges.empty()) {
+        return;
+    }
+    // Make a local copy and sort it by base address.
+    std::vector<std::pair<DAddr, size_t>> tmp = ranges;
+    std::sort(tmp.begin(), tmp.end(),
+              [](const auto& a, const auto& b) { return a.first < b.first; });
+
+    // Coalesce adjacent/overlapping ranges.
+    std::vector<std::pair<DAddr, size_t>> coalesced;
+    DAddr cur_addr = tmp[0].first;
+    size_t cur_size = tmp[0].second;
+    for (size_t i = 1; i < tmp.size(); ++i) {
+        const DAddr next_addr = tmp[i].first;
+        const size_t next_size = tmp[i].second;
+        if (cur_addr + cur_size >= next_addr) {
+            // Overlapping or contiguous: extend the current range.
+            const DAddr end = std::max(cur_addr + cur_size, next_addr + next_size);
+            cur_size = end - cur_addr;
+        } else {
+            coalesced.emplace_back(cur_addr, cur_size);
+            cur_addr = next_addr;
+            cur_size = next_size;
+        }
+    }
+    coalesced.emplace_back(cur_addr, cur_size);
+
+    // Take the range lock once, spanning every coalesced range.
+    const DAddr lock_begin = coalesced.front().first;
+    const DAddr lock_end = coalesced.back().first + coalesced.back().second;
+    Common::ScopedRangeLock lk(counter_guard, lock_begin, static_cast<size_t>(lock_end - lock_begin));
+
+    for (const auto& [addr, size] : coalesced) {
+        UpdatePagesCachedCountNoLock(addr, size, delta);
+    }
+}
+
 } // namespace Core

diff --git a/src/tests/video_core/memory_tracker.cpp b/src/tests/video_core/memory_tracker.cpp
index b6fdefe0fc..b5b2a98216 100644
--- a/src/tests/video_core/memory_tracker.cpp
+++ b/src/tests/video_core/memory_tracker.cpp
@@ -4,11 +4,15 @@
 #include <memory>
 #include <stdexcept>
 #include <unordered_map>
+#include <tuple>
+#include <vector>
 
 #include <catch2/catch_test_macros.hpp>
 
 #include "common/common_types.h"
 
 #include "video_core/buffer_cache/memory_tracker_base.h"
+#include "core/device_memory.h"
+#include "video_core/host1x/gpu_device_memory_manager.h"
 
 namespace {
 using Range = std::pair<u64, u64>;
@@ -23,6 +27,8 @@
 constexpr VAddr c = 16 * HIGH_PAGE_SIZE;
 
 class RasterizerInterface {
 public:
     void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) {
+        ++update_calls;
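+        // Record every invocation so tests can assert on call counts and coalescing.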
+        calls.emplace_back(addr, size, delta);
         const u64 page_start{addr >> Core::DEVICE_PAGEBITS};
         const u64 page_end{(addr + size + Core::DEVICE_PAGESIZE - 1) >> Core::DEVICE_PAGEBITS};
         for (u64 page = page_start; page < page_end; ++page) {
@@ -36,6 +42,9 @@ public:
         }
     }
 
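+    // Mirrors the new DeviceMemoryManager batch API so WordManager's batched path
+    // compiles against this mock; one batch counts as a single rasterizer call.
+    void UpdatePagesCachedBatch(const std::vector<std::pair<VAddr, size_t>>& ranges, int delta) {
+        ++update_calls;
+        for (const auto& [addr, size] : ranges) {
+            calls.emplace_back(addr, size, delta);
+            const u64 page_start{addr >> Core::DEVICE_PAGEBITS};
+            const u64 page_end{(addr + size + Core::DEVICE_PAGESIZE - 1) >> Core::DEVICE_PAGEBITS};
+            for (u64 page = page_start; page < page_end; ++page) {
+                page_table[page] += delta;
+            }
+        }
+    }
+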
+    [[nodiscard]] size_t UpdateCalls() const noexcept { return update_calls; }
+    [[nodiscard]] const std::vector<std::tuple<VAddr, u64, int>>& UpdateCallsList() const noexcept {
+        return calls;
+    }
+
     [[nodiscard]] int Count(VAddr addr) const noexcept {
         const auto it = page_table.find(addr >> Core::DEVICE_PAGEBITS);
         return it == page_table.end() ? 0 : it->second;
     }
@@ -51,7 +60,10 @@ public:
 
 private:
     std::unordered_map<u64, int> page_table;
+    std::vector<std::tuple<VAddr, u64, int>> calls;
+    size_t update_calls = 0;
 };
+
 } // Anonymous namespace
 
 using MemoryTracker = VideoCommon::MemoryTrackerBase<RasterizerInterface>;
@@ -544,3 +556,34 @@ TEST_CASE("MemoryTracker: Cached write downloads") {
     memory_track->MarkRegionAsCpuModified(c, WORD);
     REQUIRE(rasterizer.Count() == 0);
 }
+
+TEST_CASE("MemoryTracker: FlushCachedWrites batching") {
+    RasterizerInterface rasterizer;
+    std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer));
+    memory_track->UnmarkRegionAsCpuModified(c, WORD * 2);
+    memory_track->CachedCpuWrite(c + PAGE, PAGE);
+    memory_track->CachedCpuWrite(c + PAGE * 2, PAGE);
+    memory_track->CachedCpuWrite(c + PAGE * 3, PAGE);
+    // The setup calls above already notify the rasterizer, so measure from here on.
+    const size_t base_calls = rasterizer.UpdateCalls();
+    memory_track->FlushCachedWrites();
+    // The three contiguous dirty pages should be flushed as one coalesced batch call.
+    REQUIRE(rasterizer.UpdateCalls() == base_calls + 1);
+    const auto& calls = rasterizer.UpdateCallsList();
+    REQUIRE(std::get<0>(calls.back()) == c + PAGE);
+    REQUIRE(std::get<1>(calls.back()) == PAGE * 3);
+}
+
+TEST_CASE("DeviceMemoryManager: UpdatePagesCachedBatch basic") {
+    Core::DeviceMemory device_memory;
+    Tegra::MaxwellDeviceMemoryManager manager(device_memory);
+    // An empty batch must be a no-op.
+    std::vector<std::pair<DAddr, size_t>> empty;
+    manager.UpdatePagesCachedBatch(empty, 1);
+
+    // Small ranges should be accepted and must not crash.
+    std::vector<std::pair<DAddr, size_t>> ranges;
+    ranges.emplace_back(0, Core::Memory::YUZU_PAGESIZE);
+    ranges.emplace_back(Core::Memory::YUZU_PAGESIZE, Core::Memory::YUZU_PAGESIZE);
+    manager.UpdatePagesCachedBatch(ranges, 1);
+    SUCCEED("UpdatePagesCachedBatch executed without error");
+}

diff --git a/src/video_core/buffer_cache/word_manager.h b/src/video_core/buffer_cache/word_manager.h
index b93bd57089..51f38a2eb9 100644
--- a/src/video_core/buffer_cache/word_manager.h
+++ b/src/video_core/buffer_cache/word_manager.h
@@ -11,6 +11,7 @@
 #include <limits>
 #include <span>
 #include <utility>
+#include <vector>
 
 #include "common/alignment.h"
 #include "common/common_funcs.h"
@@ -256,9 +257,10 @@ public:
         std::span<u64> state_words = words.template Span<type>();
         [[maybe_unused]] std::span<u64> untracked_words = words.template Span<Type::Untracked>();
         [[maybe_unused]] std::span<u64> cached_words = words.template Span<Type::CachedCPU>();
+        std::vector<std::pair<VAddr, u64>> ranges;
         IterateWords(dirty_addr - cpu_addr, size, [&](size_t index, u64 mask) {
             if constexpr (type == Type::CPU || type == Type::CachedCPU) {
-                NotifyRasterizer<!enable>(index, untracked_words[index], mask);
+                CollectChangedRanges<!enable>(index, untracked_words[index], mask, ranges);
             }
             if constexpr (enable) {
                 state_words[index] |= mask;
@@ -279,6 +281,9 @@ public:
                 }
             }
         });
+        if (!ranges.empty()) {
+            ApplyCollectedRanges(ranges, (!enable) ? 1 : -1);
+        }
     }
 
     /**
@@ -304,6 +309,7 @@ public:
             func(cpu_addr + pending_offset * BYTES_PER_PAGE,
                  (pending_pointer - pending_offset) * BYTES_PER_PAGE);
         };
+        std::vector<std::pair<VAddr, u64>> ranges;
         IterateWords(offset, size, [&](size_t index, u64 mask) {
             if constexpr (type == Type::GPU) {
                 mask &= ~untracked_words[index];
             }
             const u64 word = state_words[index] & mask;
             if constexpr (clear) {
                 if constexpr (type == Type::CPU || type == Type::CachedCPU) {
-                    NotifyRasterizer<true>(index, untracked_words[index], mask);
+                    CollectChangedRanges<true>(index, untracked_words[index], mask, ranges);
                 }
                 state_words[index] &= ~mask;
                 if constexpr (type == Type::CPU || type == Type::CachedCPU) {
@@ -343,6 +349,9 @@ public:
         if (pending) {
             release();
         }
+        if (!ranges.empty()) {
+            ApplyCollectedRanges(ranges, 1);
+        }
     }
 
     /**
@@ -425,13 +434,17 @@ public:
         u64* const cached_words = Array<Type::CachedCPU>();
         u64* const untracked_words = Array<Type::Untracked>();
        u64* const cpu_words = Array<Type::CPU>();
+        std::vector<std::pair<VAddr, u64>> ranges;
         for (u64 word_index = 0; word_index < num_words; ++word_index) {
             const u64 cached_bits = cached_words[word_index];
-            NotifyRasterizer<false>(word_index, untracked_words[word_index], cached_bits);
+            CollectChangedRanges<false>(word_index, untracked_words[word_index], cached_bits, ranges);
             untracked_words[word_index] |= cached_bits;
             cpu_words[word_index] |= cached_bits;
             cached_words[word_index] = 0;
         }
+        if (!ranges.empty()) {
+            ApplyCollectedRanges(ranges, -1);
+        }
     }
 
 private:
@@ -470,6 +483,40 @@ private:
      *
     * @tparam add_to_tracker True when the tracker should start tracking the new pages
     */
+    template <bool add_to_tracker>
+    void CollectChangedRanges(u64 word_index, u64 current_bits, u64 new_bits,
+                              std::vector<std::pair<VAddr, u64>>& out_ranges) const {
+        const u64 changed_bits = (add_to_tracker ? current_bits : ~current_bits) & new_bits;
+        const VAddr addr = cpu_addr + word_index * BYTES_PER_WORD;
+        IteratePages(changed_bits, [&](size_t offset, size_t size) {
+            out_ranges.emplace_back(addr + offset * BYTES_PER_PAGE, size * BYTES_PER_PAGE);
+        });
+    }
+
+    void ApplyCollectedRanges(std::vector<std::pair<VAddr, u64>>& ranges, int delta) const {
+        if (ranges.empty()) {
+            return;
+        }
+        std::sort(ranges.begin(), ranges.end(),
+                  [](const auto& a, const auto& b) { return a.first < b.first; });
+        // Coalesce adjacent/contiguous ranges.
+        std::vector<std::pair<VAddr, size_t>> coalesced;
+        coalesced.reserve(ranges.size());
+        VAddr cur_addr = ranges[0].first;
+        size_t cur_size = static_cast<size_t>(ranges[0].second);
+        for (size_t i = 1; i < ranges.size(); ++i) {
+            if (cur_addr + cur_size == ranges[i].first) {
+                cur_size += static_cast<size_t>(ranges[i].second);
+            } else {
+                coalesced.emplace_back(cur_addr, cur_size);
+                cur_addr = ranges[i].first;
+                cur_size = static_cast<size_t>(ranges[i].second);
+            }
+        }
+        coalesced.emplace_back(cur_addr, cur_size);
+        // Use the batch API to reduce lock acquisitions and contention.
+        tracker->UpdatePagesCachedBatch(coalesced, delta);
+        ranges.clear();
+    }
+
     template <bool add_to_tracker>
     void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const {
         u64 changed_bits = (add_to_tracker ? current_bits : ~current_bits) & new_bits;
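
Usage sketch (illustration only, not part of the patch): how a caller might drive the
new batch API. The names `manager`, `base`, and `page_size` are assumed here; `manager`
stands for any DeviceMemoryManager instance and `page_size` for the device page size.

    std::vector<std::pair<DAddr, size_t>> ranges;
    ranges.emplace_back(base, page_size);                  // first dirty page
    ranges.emplace_back(base + page_size, page_size);      // contiguous: will be merged
    ranges.emplace_back(base + 4 * page_size, page_size);  // disjoint: stays separate
    // One sort + coalesce + single range-lock acquisition instead of three locked calls:
    manager.UpdatePagesCachedBatch(ranges, 1);             // delta +1 caches, -1 uncaches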