// Copyright (c) 2022- PPSSPP Project. // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, version 2.0 or later versions. // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License 2.0 for more details. // A copy of the GPL 2.0 should have been included with the program. // If not, see http://www.gnu.org/licenses/ // Official git repository and contact information can be found at // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. #include #include #include #include "Common/Profiler/Profiler.h" #include "Common/Thread/ThreadManager.h" #include "Common/TimeUtil.h" #include "Core/System.h" #include "GPU/Common/TextureDecoder.h" #include "GPU/Software/BinManager.h" #include "GPU/Software/Rasterizer.h" #include "GPU/Software/RasterizerRectangle.h" using namespace Rasterizer; struct BinWaitable : public Waitable { public: BinWaitable() { count_ = 0; } void Fill() { count_++; } bool Empty() { return count_ == 0; } void Drain() { int result = --count_; if (result == 0) { // We were the last one to increment. std::unique_lock lock(mutex_); cond_.notify_all(); } } void Wait() override { std::unique_lock lock(mutex_); while (count_ != 0) { cond_.wait(lock); } } std::atomic count_; std::mutex mutex_; std::condition_variable cond_; }; static inline void DrawBinItem(const BinItem &item, const RasterizerState &state) { switch (item.type) { case BinItemType::TRIANGLE: DrawTriangle(item.v0, item.v1, item.v2, item.range, state); break; case BinItemType::CLEAR_RECT: ClearRectangle(item.v0, item.v1, item.range, state); break; case BinItemType::RECT: DrawRectangle(item.v0, item.v1, item.range, state); break; case BinItemType::SPRITE: DrawSprite(item.v0, item.v1, item.range, state); break; case BinItemType::LINE: DrawLine(item.v0, item.v1, item.range, state); break; case BinItemType::POINT: DrawPoint(item.v0, item.range, state); break; } } class DrawBinItemsTask : public Task { public: DrawBinItemsTask(BinWaitable *notify, BinManager::BinItemQueue &items, std::atomic &status, const BinManager::BinStateQueue &states) : notify_(notify), items_(items), status_(status), states_(states) { } TaskType Type() const override { return TaskType::CPU_COMPUTE; } void Run() override { ProcessItems(); status_ = false; // In case of any atomic issues, do another pass. ProcessItems(); notify_->Drain(); } void Release() override { // Don't delete, this is statically allocated. } private: void ProcessItems() { while (!items_.Empty()) { const BinItem &item = items_.PeekNext(); DrawBinItem(item, states_[item.stateIndex]); items_.SkipNext(); } } BinWaitable *notify_; BinManager::BinItemQueue &items_; std::atomic &status_; const BinManager::BinStateQueue &states_; }; constexpr int BinManager::MAX_POSSIBLE_TASKS; BinManager::BinManager() { queueRange_.x1 = 0x7FFFFFFF; queueRange_.y1 = 0x7FFFFFFF; queueRange_.x2 = 0; queueRange_.y2 = 0; waitable_ = new BinWaitable(); for (auto &s : taskStatus_) s = false; int maxInitTasks = std::min(g_threadManager.GetNumLooperThreads(), MAX_POSSIBLE_TASKS); for (int i = 0; i < maxInitTasks; ++i) { taskQueues_[i].Setup(); for (DrawBinItemsTask *&task : taskLists_[i].tasks) task = new DrawBinItemsTask(waitable_, taskQueues_[i], taskStatus_[i], states_); } states_.Setup(); cluts_.Setup(); queue_.Setup(); } BinManager::~BinManager() { delete waitable_; for (int i = 0; i < MAX_POSSIBLE_TASKS; ++i) { for (DrawBinItemsTask *task : taskLists_[i].tasks) delete task; } } void BinManager::UpdateState() { PROFILE_THIS_SCOPE("bin_state"); if (HasDirty(SoftDirty::PIXEL_ALL | SoftDirty::SAMPLER_ALL | SoftDirty::RAST_ALL)) { if (states_.Full()) Flush("states"); stateIndex_ = (int)states_.Push(RasterizerState()); ComputeRasterizerState(&states_[stateIndex_]); states_[stateIndex_].samplerID.cached.clut = cluts_[clutIndex_].readable; ClearDirty(SoftDirty::PIXEL_ALL | SoftDirty::SAMPLER_ALL | SoftDirty::RAST_ALL); } if (lastFlipstats_ != gpuStats.numFlips) { lastFlipstats_ = gpuStats.numFlips; ResetStats(); } const auto &state = State(); const bool hadDepth = pendingWrites_[1].base != 0; if (HasDirty(SoftDirty::BINNER_RANGE)) { DrawingCoords scissorTL(gstate.getScissorX1(), gstate.getScissorY1()); DrawingCoords scissorBR(std::min(gstate.getScissorX2(), gstate.getRegionX2()), std::min(gstate.getScissorY2(), gstate.getRegionY2())); ScreenCoords screenScissorTL = TransformUnit::DrawingToScreen(scissorTL, 0); ScreenCoords screenScissorBR = TransformUnit::DrawingToScreen(scissorBR, 0); scissor_.x1 = screenScissorTL.x; scissor_.y1 = screenScissorTL.y; scissor_.x2 = screenScissorBR.x + SCREEN_SCALE_FACTOR - 1; scissor_.y2 = screenScissorBR.y + SCREEN_SCALE_FACTOR - 1; // If we're about to texture from something still pending (i.e. depth), flush. if (HasTextureWrite(state)) Flush("tex"); // Okay, now update what's pending. constexpr uint32_t mirrorMask = 0x0FFFFFFF & ~0x00600000; const uint32_t bpp = state.pixelID.FBFormat() == GE_FORMAT_8888 ? 4 : 2; pendingWrites_[0].Expand(gstate.getFrameBufAddress() & mirrorMask, bpp, gstate.FrameBufStride(), scissorTL, scissorBR); if (state.pixelID.depthWrite) pendingWrites_[1].Expand(gstate.getDepthBufAddress() & mirrorMask, 2, gstate.DepthBufStride(), scissorTL, scissorBR); ClearDirty(SoftDirty::BINNER_RANGE); } else if (pendingOverlap_) { if (HasTextureWrite(state)) Flush("tex"); } if (HasDirty(SoftDirty::BINNER_OVERLAP)) { // Disallow threads when rendering to the target, even offset. bool selfRender = HasTextureWrite(state); int newMaxTasks = selfRender ? 1 : g_threadManager.GetNumLooperThreads(); if (newMaxTasks > MAX_POSSIBLE_TASKS) newMaxTasks = MAX_POSSIBLE_TASKS; // We don't want to overlap wrong, so flush any pending. if (maxTasks_ != newMaxTasks) { maxTasks_ = newMaxTasks; Flush("selfrender"); } pendingOverlap_ = pendingOverlap_ || selfRender; // Lastly, we have to check if we're newly writing depth we were texturing before. // This happens in Call of Duty (depth clear after depth texture), for example. if (!hadDepth && state.pixelID.depthWrite) { for (size_t i = 0; i < states_.Size(); ++i) { if (HasTextureWrite(states_.Peek(i))) { Flush("selfdepth"); } } } ClearDirty(SoftDirty::BINNER_OVERLAP); } } bool BinManager::HasTextureWrite(const RasterizerState &state) { if (!state.enableTextures) return false; const int textureBits = textureBitsPerPixel[state.samplerID.texfmt]; for (int i = 0; i <= state.maxTexLevel; ++i) { int byteStride = (state.texbufw[i] * textureBits) / 8; int byteWidth = (state.samplerID.cached.sizes[i].w * textureBits) / 8; int h = state.samplerID.cached.sizes[i].h; if (HasPendingWrite(state.texaddr[i], byteStride, byteWidth, h)) return true; } return false; } inline void BinDirtyRange::Expand(uint32_t newBase, uint32_t bpp, uint32_t stride, DrawingCoords &tl, DrawingCoords &br) { const uint32_t w = br.x - tl.x + 1; const uint32_t h = br.y - tl.y + 1; newBase += tl.y * stride * bpp + tl.x * bpp; if (base == 0) { base = newBase; strideBytes = stride * bpp; widthBytes = w * bpp; height = h; return; } height = std::max(height, h); if (base == newBase && strideBytes == stride * bpp) { widthBytes = std::max(widthBytes, w * bpp); return; } if (stride != 0) height += ((int)base - (int)newBase) / (stride * bpp); base = std::min(base, newBase); strideBytes = std::max(strideBytes, stride * bpp); widthBytes = strideBytes; } void BinManager::UpdateClut(const void *src) { PROFILE_THIS_SCOPE("bin_clut"); if (cluts_.Full()) Flush("cluts"); clutIndex_ = (int)cluts_.Push(BinClut()); memcpy(cluts_[clutIndex_].readable, src, sizeof(BinClut)); } void BinManager::AddTriangle(const VertexData &v0, const VertexData &v1, const VertexData &v2) { Vec2 d01((int)v0.screenpos.x - (int)v1.screenpos.x, (int)v0.screenpos.y - (int)v1.screenpos.y); Vec2 d02((int)v0.screenpos.x - (int)v2.screenpos.x, (int)v0.screenpos.y - (int)v2.screenpos.y); Vec2 d12((int)v1.screenpos.x - (int)v2.screenpos.x, (int)v1.screenpos.y - (int)v2.screenpos.y); // Drop primitives which are not in CCW order by checking the cross product. static_assert(SCREEN_SCALE_FACTOR <= 16, "Fails if scale factor is too high"); if (d01.x * d02.y - d01.y * d02.x < 0) return; // If all points have identical coords, we'll have 0 weights and not skip properly, so skip here. if (d01.x == 0 && d01.y == 0 && d02.x == 0 && d02.y == 0) return; // Was it fully outside the scissor? const BinCoords range = Range(v0, v1, v2); if (range.Invalid()) return; if (queue_.Full()) Drain(); queue_.Push(BinItem{ BinItemType::TRIANGLE, stateIndex_, range, v0, v1, v2 }); Expand(range); } void BinManager::AddClearRect(const VertexData &v0, const VertexData &v1) { const BinCoords range = Range(v0, v1); if (range.Invalid()) return; if (queue_.Full()) Drain(); queue_.Push(BinItem{ BinItemType::CLEAR_RECT, stateIndex_, range, v0, v1 }); Expand(range); } void BinManager::AddRect(const VertexData &v0, const VertexData &v1) { const BinCoords range = Range(v0, v1); if (range.Invalid()) return; if (queue_.Full()) Drain(); queue_.Push(BinItem{ BinItemType::RECT, stateIndex_, range, v0, v1 }); Expand(range); } void BinManager::AddSprite(const VertexData &v0, const VertexData &v1) { const BinCoords range = Range(v0, v1); if (range.Invalid()) return; if (queue_.Full()) Drain(); queue_.Push(BinItem{ BinItemType::SPRITE, stateIndex_, range, v0, v1 }); Expand(range); } void BinManager::AddLine(const VertexData &v0, const VertexData &v1) { const BinCoords range = Range(v0, v1); if (range.Invalid()) return; if (queue_.Full()) Drain(); queue_.Push(BinItem{ BinItemType::LINE, stateIndex_, range, v0, v1 }); Expand(range); } void BinManager::AddPoint(const VertexData &v0) { const BinCoords range = Range(v0); if (range.Invalid()) return; if (queue_.Full()) Drain(); queue_.Push(BinItem{ BinItemType::POINT, stateIndex_, range, v0 }); Expand(range); } void BinManager::Drain() { PROFILE_THIS_SCOPE("bin_drain"); // If the waitable has fully drained, we can update our binning decisions. if (!tasksSplit_ || waitable_->Empty()) { int w2 = (queueRange_.x2 - queueRange_.x1 + (SCREEN_SCALE_FACTOR * 2 - 1)) / (SCREEN_SCALE_FACTOR * 2); int h2 = (queueRange_.y2 - queueRange_.y1 + (SCREEN_SCALE_FACTOR * 2 - 1)) / (SCREEN_SCALE_FACTOR * 2); // Always bin the entire possible range, but focus on the drawn area. ScreenCoords tl(0, 0, 0); ScreenCoords br(1024 * SCREEN_SCALE_FACTOR, 1024 * SCREEN_SCALE_FACTOR, 0); taskRanges_.clear(); if (h2 >= 18 && w2 >= h2 * 4) { int bin_w = std::max(4, (w2 + maxTasks_ - 1) / maxTasks_) * SCREEN_SCALE_FACTOR * 2; taskRanges_.push_back(BinCoords{ tl.x, tl.y, queueRange_.x1 + bin_w - 1, br.y - 1 }); for (int x = queueRange_.x1 + bin_w; x <= queueRange_.x2; x += bin_w) { int x2 = x + bin_w > queueRange_.x2 ? br.x : x + bin_w; taskRanges_.push_back(BinCoords{ x, tl.y, x2 - 1, br.y - 1 }); } } else if (h2 >= 18 && w2 >= 18) { int bin_h = std::max(4, (h2 + maxTasks_ - 1) / maxTasks_) * SCREEN_SCALE_FACTOR * 2; taskRanges_.push_back(BinCoords{ tl.x, tl.y, br.x - 1, queueRange_.y1 + bin_h - 1 }); for (int y = queueRange_.y1 + bin_h; y <= queueRange_.y2; y += bin_h) { int y2 = y + bin_h > queueRange_.y2 ? br.y : y + bin_h; taskRanges_.push_back(BinCoords{ tl.x, y, br.x - 1, y2 - 1 }); } } tasksSplit_ = true; } if (taskRanges_.size() <= 1) { PROFILE_THIS_SCOPE("bin_drain_single"); while (!queue_.Empty()) { const BinItem &item = queue_.PeekNext(); DrawBinItem(item, states_[item.stateIndex]); queue_.SkipNext(); } } else { while (!queue_.Empty()) { const BinItem &item = queue_.PeekNext(); for (int i = 0; i < (int)taskRanges_.size(); ++i) { const BinCoords range = taskRanges_[i].Intersect(item.range); if (range.Invalid()) continue; // This shouldn't often happen, but if it does, wait for space. if (taskQueues_[i].Full()) waitable_->Wait(); BinItem &taskItem = taskQueues_[i].PeekPush(); taskItem = item; taskItem.range = range; taskQueues_[i].PushPeeked(); } queue_.SkipNext(); } int threads = 0; for (int i = 0; i < (int)taskRanges_.size(); ++i) { if (taskQueues_[i].Empty()) continue; threads++; if (taskStatus_[i]) continue; waitable_->Fill(); taskStatus_[i] = true; g_threadManager.EnqueueTaskOnThread(i, taskLists_[i].Next(), true); enqueues_++; } mostThreads_ = std::max(mostThreads_, threads); } } void BinManager::Flush(const char *reason) { double st; if (coreCollectDebugStats) st = time_now_d(); Drain(); waitable_->Wait(); taskRanges_.clear(); tasksSplit_ = false; queue_.Reset(); while (states_.Size() > 1) states_.SkipNext(); while (cluts_.Size() > 1) cluts_.SkipNext(); queueRange_.x1 = 0x7FFFFFFF; queueRange_.y1 = 0x7FFFFFFF; queueRange_.x2 = 0; queueRange_.y2 = 0; for (auto &pending : pendingWrites_) pending.base = 0; pendingOverlap_ = false; // We'll need to set the pending writes again, since we just flushed it. dirty_ |= SoftDirty::BINNER_RANGE; if (coreCollectDebugStats) { double et = time_now_d(); flushReasonTimes_[reason] += et - st; if (et - st > slowestFlushTime_) { slowestFlushTime_ = et - st; slowestFlushReason_ = reason; } } } bool BinManager::HasPendingWrite(uint32_t start, uint32_t stride, uint32_t w, uint32_t h) { // We can only write to VRAM. if (!Memory::IsVRAMAddress(start)) return false; // Ignore mirrors for overlap detection. start &= 0x0FFFFFFF & ~0x00600000; uint32_t size = stride * h; for (const auto &range : pendingWrites_) { if (range.base == 0 || range.strideBytes == 0) continue; if (start >= range.base + range.height * range.strideBytes || start + size <= range.base) continue; // Let's simply go through each line. Might be in the stride gap. uint32_t row = start; for (uint32_t y = 0; y < h; ++y) { int32_t offset = row - range.base; int32_t rangeY = offset / (int32_t)range.strideBytes; uint32_t rangeX = offset % (int32_t)range.strideBytes; if (rangeY >= 0 && (uint32_t)rangeY < range.height) { // If this row is either within width, or extends beyond stride, overlap. if (rangeX < range.widthBytes || rangeX + w >= range.strideBytes) return true; } row += stride; } } return false; } void BinManager::GetStats(char *buffer, size_t bufsize) { double allTotal = 0.0; double slowestTotalTime = 0.0; const char *slowestTotalReason = nullptr; for (auto &it : flushReasonTimes_) { if (it.second > slowestTotalTime) { slowestTotalTime = it.second; slowestTotalReason = it.first; } allTotal += it.second; } // Many games are 30 FPS, so check last frame too for better stats. double recentTotal = allTotal; double slowestRecentTime = slowestTotalTime; const char *slowestRecentReason = slowestTotalReason; for (auto &it : lastFlushReasonTimes_) { if (it.second > slowestRecentTime) { slowestRecentTime = it.second; slowestRecentReason = it.first; } recentTotal += it.second; } snprintf(buffer, bufsize, "Slowest individual flush: %s (%0.4f)\n" "Slowest frame flush: %s (%0.4f)\n" "Slowest recent flush: %s (%0.4f)\n" "Total flush time: %0.4f (%05.2f%%, last 2: %05.2f%%)\n" "Thread enqueues: %d, count %d", slowestFlushReason_, slowestFlushTime_, slowestTotalReason, slowestTotalTime, slowestRecentReason, slowestRecentTime, allTotal, allTotal * (6000.0 / 1.001), recentTotal * (3000.0 / 1.001), enqueues_, mostThreads_); } void BinManager::ResetStats() { lastFlushReasonTimes_ = std::move(flushReasonTimes_); flushReasonTimes_.clear(); slowestFlushReason_ = nullptr; slowestFlushTime_ = 0.0; enqueues_ = 0; mostThreads_ = 0; } inline BinCoords BinCoords::Intersect(const BinCoords &range) const { BinCoords sub; sub.x1 = std::max(x1, range.x1); sub.y1 = std::max(y1, range.y1); sub.x2 = std::min(x2, range.x2); sub.y2 = std::min(y2, range.y2); return sub; } BinCoords BinManager::Scissor(BinCoords range) { return range.Intersect(scissor_); } BinCoords BinManager::Range(const VertexData &v0, const VertexData &v1, const VertexData &v2) { BinCoords range; range.x1 = std::min(std::min(v0.screenpos.x, v1.screenpos.x), v2.screenpos.x) & ~(SCREEN_SCALE_FACTOR - 1); range.y1 = std::min(std::min(v0.screenpos.y, v1.screenpos.y), v2.screenpos.y) & ~(SCREEN_SCALE_FACTOR - 1); range.x2 = std::max(std::max(v0.screenpos.x, v1.screenpos.x), v2.screenpos.x) | (SCREEN_SCALE_FACTOR - 1); range.y2 = std::max(std::max(v0.screenpos.y, v1.screenpos.y), v2.screenpos.y) | (SCREEN_SCALE_FACTOR - 1); return Scissor(range); } BinCoords BinManager::Range(const VertexData &v0, const VertexData &v1) { BinCoords range; range.x1 = std::min(v0.screenpos.x, v1.screenpos.x) & ~(SCREEN_SCALE_FACTOR - 1); range.y1 = std::min(v0.screenpos.y, v1.screenpos.y) & ~(SCREEN_SCALE_FACTOR - 1); range.x2 = std::max(v0.screenpos.x, v1.screenpos.x) | (SCREEN_SCALE_FACTOR - 1); range.y2 = std::max(v0.screenpos.y, v1.screenpos.y) | (SCREEN_SCALE_FACTOR - 1); return Scissor(range); } BinCoords BinManager::Range(const VertexData &v0) { BinCoords range; range.x1 = v0.screenpos.x & ~(SCREEN_SCALE_FACTOR - 1); range.y1 = v0.screenpos.y & ~(SCREEN_SCALE_FACTOR - 1); range.x2 = v0.screenpos.x | (SCREEN_SCALE_FACTOR - 1); range.y2 = v0.screenpos.y | (SCREEN_SCALE_FACTOR - 1); return Scissor(range); } void BinManager::Expand(const BinCoords &range) { queueRange_.x1 = std::min(queueRange_.x1, range.x1); queueRange_.y1 = std::min(queueRange_.y1, range.y1); queueRange_.x2 = std::max(queueRange_.x2, range.x2); queueRange_.y2 = std::max(queueRange_.y2, range.y2); if (maxTasks_ == 1 || (queueRange_.y2 - queueRange_.y1 >= 224 * SCREEN_SCALE_FACTOR && enqueues_ < 36 * maxTasks_)) { Drain(); } }