diff --git a/Common/Thread/ParallelLoop.cpp b/Common/Thread/ParallelLoop.cpp index 2b5f8b0606..fe3c698957 100644 --- a/Common/Thread/ParallelLoop.cpp +++ b/Common/Thread/ParallelLoop.cpp @@ -30,12 +30,15 @@ public: const TaskPriority priority_; }; -WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::function &loop, int lower, int upper, int minSize, TaskPriority priority) { +WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::function &loop, int lower, int upper, int minSize, int maxThreads, TaskPriority priority) { if (minSize == -1) { minSize = 1; } int numTasks = threadMan->GetNumLooperThreads(); + if (maxThreads > 0) { + numTasks = std::min(maxThreads, numTasks); + } int range = upper - lower; if (range <= 0) { // Nothing to do. A finished counter allocated to keep the API. @@ -88,7 +91,7 @@ WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std:: } } -void ParallelRangeLoop(ThreadManager *threadMan, const std::function &loop, int lower, int upper, int minSize, TaskPriority priority) { +void ParallelRangeLoop(ThreadManager *threadMan, const std::function &loop, int lower, int upper, int minSize, int maxThreads, TaskPriority priority) { if (cpu_info.num_cores == 1 || (minSize >= (upper - lower) && upper > lower)) { // "Optimization" for single-core devices, or minSize larger than the range. // No point in adding threading overhead, let's just do it inline (since this is the blocking variant). @@ -101,7 +104,7 @@ void ParallelRangeLoop(ThreadManager *threadMan, const std::function &loop, int lower, int upper, int minSize, TaskPriority priority); +// maxThreads <= 0 means no cap: all of the thread manager's looper threads may be used. 
+WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::function &loop, int lower, int upper, int minSize, int maxThreads, TaskPriority priority); // Note that upper bounds are non-inclusive: range is [lower, upper) -void ParallelRangeLoop(ThreadManager *threadMan, const std::function &loop, int lower, int upper, int minSize, TaskPriority priority = TaskPriority::NORMAL); +// maxThreads <= 0 means no cap: all of the thread manager's looper threads may be used. +void ParallelRangeLoop(ThreadManager *threadMan, const std::function &loop, int lower, int upper, int minSize, int maxThreads, TaskPriority priority = TaskPriority::NORMAL); // Common utilities for large (!) memory copies. // Will only fall back to threads if it seems to make sense. diff --git a/Common/TimeUtil.cpp b/Common/TimeUtil.cpp index fdb6817723..f49a2b5de8 100644 --- a/Common/TimeUtil.cpp +++ b/Common/TimeUtil.cpp @@ -23,6 +23,8 @@ #include #endif +#include "Common/Log.h" + // for _mm_pause #if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64) #include @@ -270,3 +272,11 @@ void GetCurrentTimeFormatted(char formattedTime[13]) { // Now tack on the milliseconds snprintf(formattedTime, 11, "%s:%03u", tmp, milliseconds % 1000); } + +LogScopeIfSlowMs::~LogScopeIfSlowMs() { + double now = time_now_d(); + if (now > endTime_) { + // Report the total elapsed time, not just the overrun past the limit. + WARN_LOG(Log::System, "SLOW: %s took %0.2f ms", title_, (now - startTime_) * 1000.0); + } +} diff --git a/Common/TimeUtil.h b/Common/TimeUtil.h index 578886f3f7..05b1db342f 100644 --- a/Common/TimeUtil.h +++ b/Common/TimeUtil.h @@ -49,3 +49,19 @@ private: int64_t nsecs_; #endif }; + +// Logs a warning on destruction if more than limitMs milliseconds passed since construction. The title pointer is stored, not copied. +class LogScopeIfSlowMs { +public: + LogScopeIfSlowMs(const char *title, int limitMs) { + title_ = title; + startTime_ = time_now_d(); + endTime_ = startTime_ + 0.001 * limitMs; + } + ~LogScopeIfSlowMs(); + +private: + const char *title_; + double startTime_; + double endTime_; +}; diff --git a/Core/ELF/ElfReader.cpp b/Core/ELF/ElfReader.cpp index 7adceca25d..bff1227c74 100644 --- a/Core/ELF/ElfReader.cpp +++ b/Core/ELF/ElfReader.cpp @@ -100,7 +100,7 @@ bool 
ElfReader::LoadRelocations(const Elf32_Rel *rels, int numRelocs) { relocOps[r] = Memory::ReadUnchecked_Instruction(addr, true).encoding; } - }, 0, numRelocs, 128, TaskPriority::HIGH); + }, 0, numRelocs, 128, 0, TaskPriority::HIGH); ParallelRangeLoop(&g_threadManager, [&](int l, int h) { for (int r = l; r < h; r++) { @@ -232,7 +232,7 @@ bool ElfReader::LoadRelocations(const Elf32_Rel *rels, int numRelocs) { Memory::WriteUnchecked_U32(op, addr); NotifyMemInfo(MemBlockFlags::WRITE, addr, 4, "Relocation"); } - }, 0, numRelocs, 128, TaskPriority::HIGH); + }, 0, numRelocs, 128, 0, TaskPriority::HIGH); if (numErrors) { WARN_LOG(Log::Loader, "%i bad relocations found!!!", numErrors.load()); diff --git a/Core/MemMap.cpp b/Core/MemMap.cpp index 67bbe6f9f3..9919734ef8 100644 --- a/Core/MemMap.cpp +++ b/Core/MemMap.cpp @@ -346,7 +346,7 @@ static void DoMemoryVoid(PointerWrap &p, uint32_t start, uint32_t size) { ParallelRangeLoop(&g_threadManager, [&](int l, int h) { for (int i = l; i < h; i++) _dbg_assert_msg_(d[i] == storage[i], "Savestate verification failure: %d (0x%X) (at %p) != %d (0x%X) (at %p).\n", d[i], d[i], &d[i], storage[i], storage[i], &storage[i]); - }, 0, size, 128); + }, 0, size, 128, 8); break; case PointerWrap::MODE_NOOP: break; diff --git a/GPU/Common/ReplacedTexture.cpp b/GPU/Common/ReplacedTexture.cpp index fb01001474..394bfc2eb8 100644 --- a/GPU/Common/ReplacedTexture.cpp +++ b/GPU/Common/ReplacedTexture.cpp @@ -730,10 +730,12 @@ bool ReplacedTexture::CopyLevelTo(int level, uint8_t *out, size_t outDataSize, i return false; } + // Dubious if this is worth it, sometimes seems to help, sometimes not. 
#define PARALLEL_COPY int blockSize; if (!Draw::DataFormatIsBlockCompressed(fmt, &blockSize)) { + LogScopeIfSlowMs log("memcpy", 10); if (fmt != Draw::DataFormat::R8G8B8A8_UNORM) { ERROR_LOG(Log::G3D, "Unexpected linear data format"); return false; @@ -755,6 +757,7 @@ bool ReplacedTexture::CopyLevelTo(int level, uint8_t *out, size_t outDataSize, i } else { #ifdef PARALLEL_COPY const int MIN_LINES_PER_THREAD = 4; + const int MAX_THREADS = 6; ParallelRangeLoop(&g_threadManager, [&](int l, int h) { int extraPixels = outW - info.w; for (int y = l; y < h; ++y) { @@ -762,7 +765,7 @@ bool ReplacedTexture::CopyLevelTo(int level, uint8_t *out, size_t outDataSize, i // Fill the rest of the line with black. memset((uint8_t *)out + rowPitch * y + info.w * 4, 0, extraPixels * 4); } - }, 0, info.h, MIN_LINES_PER_THREAD); + }, 0, info.h, MIN_LINES_PER_THREAD, MAX_THREADS); #else int extraPixels = outW - info.w; for (int y = 0; y < info.h; ++y) { diff --git a/GPU/Common/TextureScalerCommon.cpp b/GPU/Common/TextureScalerCommon.cpp index bbcb497b8f..5a7b77c791 100644 --- a/GPU/Common/TextureScalerCommon.cpp +++ b/GPU/Common/TextureScalerCommon.cpp @@ -698,22 +698,22 @@ const int MIN_LINES_PER_THREAD = 4; void TextureScalerCommon::ScaleXBRZ(int factor, u32* source, u32* dest, int width, int height) { xbrz::ScalerCfg cfg; - ParallelRangeLoop(&g_threadManager, std::bind(&xbrz::scale, factor, source, dest, width, height, xbrz::ColorFormat::ARGB, cfg, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD); + ParallelRangeLoop(&g_threadManager, std::bind(&xbrz::scale, factor, source, dest, width, height, xbrz::ColorFormat::ARGB, cfg, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD, 0); } void TextureScalerCommon::ScaleBilinear(int factor, u32* source, u32* dest, int width, int height) { bufTmp1.resize(width * height * factor); u32 *tmpBuf = bufTmp1.data(); - ParallelRangeLoop(&g_threadManager, std::bind(&bilinearH, factor, source, 
tmpBuf, width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD); - ParallelRangeLoop(&g_threadManager, std::bind(&bilinearV, factor, tmpBuf, dest, width, 0, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD); + ParallelRangeLoop(&g_threadManager, std::bind(&bilinearH, factor, source, tmpBuf, width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD, 0); + ParallelRangeLoop(&g_threadManager, std::bind(&bilinearV, factor, tmpBuf, dest, width, 0, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD, 0); } void TextureScalerCommon::ScaleBicubicBSpline(int factor, u32* source, u32* dest, int width, int height) { - ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicBSpline, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD); + ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicBSpline, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD, 0); } void TextureScalerCommon::ScaleBicubicMitchell(int factor, u32* source, u32* dest, int width, int height) { - ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicMitchell, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD); + ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicMitchell, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD, 0); } void TextureScalerCommon::ScaleHybrid(int factor, u32* source, u32* dest, int width, int height, bool bicubic) { @@ -730,8 +730,8 @@ void TextureScalerCommon::ScaleHybrid(int factor, u32* source, u32* dest, int wi bufTmp2.resize(width*height*factor*factor); bufTmp3.resize(width*height*factor*factor); - ParallelRangeLoop(&g_threadManager,std::bind(&generateDistanceMask, source, 
bufTmp1.data(), width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD); - ParallelRangeLoop(&g_threadManager,std::bind(&convolve3x3, bufTmp1.data(), bufTmp2.data(), KERNEL_SPLAT, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD); + ParallelRangeLoop(&g_threadManager,std::bind(&generateDistanceMask, source, bufTmp1.data(), width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD, 0); + ParallelRangeLoop(&g_threadManager,std::bind(&convolve3x3, bufTmp1.data(), bufTmp2.data(), KERNEL_SPLAT, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD, 0); ScaleBilinear(factor, bufTmp2.data(), bufTmp3.data(), width, height); // mask C is now in bufTmp3 @@ -744,13 +744,13 @@ void TextureScalerCommon::ScaleHybrid(int factor, u32* source, u32* dest, int wi // Now we can mix it all together // The factor 8192 was found through practical testing on a variety of textures - ParallelRangeLoop(&g_threadManager,std::bind(&mix, dest, bufTmp2.data(), bufTmp3.data(), 8192, width*factor, std::placeholders::_1, std::placeholders::_2), 0, height*factor, MIN_LINES_PER_THREAD); + ParallelRangeLoop(&g_threadManager,std::bind(&mix, dest, bufTmp2.data(), bufTmp3.data(), 8192, width*factor, std::placeholders::_1, std::placeholders::_2), 0, height*factor, MIN_LINES_PER_THREAD, 32); } void TextureScalerCommon::DePosterize(u32* source, u32* dest, int width, int height) { bufTmp3.resize(width*height); - ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeH, source, bufTmp3.data(), width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD); - ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD); - ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeH, dest, bufTmp3.data(), width, 
std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD); - ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD); + ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeH, source, bufTmp3.data(), width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD, 0); + ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD, 0); + ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeH, dest, bufTmp3.data(), width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD, 0); + ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD, 0); } diff --git a/GPU/Debugger/Record.cpp b/GPU/Debugger/Record.cpp index 88ee8353ea..3eda04e7d3 100644 --- a/GPU/Debugger/Record.cpp +++ b/GPU/Debugger/Record.cpp @@ -287,7 +287,7 @@ static const u8 *mymemmem(const u8 *haystack, size_t off, size_t hlen, const u8 p++; alignp(); } - }, 0, range, 128 * 1024, TaskPriority::LOW); + }, 0, range, 128 * 1024, 8, TaskPriority::LOW); return result; } diff --git a/unittest/TestThreadManager.cpp b/unittest/TestThreadManager.cpp index ff15044ffb..abad1615c7 100644 --- a/unittest/TestThreadManager.cpp +++ b/unittest/TestThreadManager.cpp @@ -46,20 +46,20 @@ bool TestParallelLoop(ThreadManager *threadMan) { printf("tester thread ID: %d\n", GetCurrentThreadIdForDebug()); printf("waitable test\n"); - WaitableCounter *waitable = ParallelRangeLoopWaitable(threadMan, rangeFunc, 0, 7, 1, TaskPriority::HIGH); + WaitableCounter *waitable = ParallelRangeLoopWaitable(threadMan, rangeFunc, 0, 7, 1, 0, TaskPriority::HIGH); // Can do stuff here if we like. 
waitable->WaitAndRelease(); // Now it's done. // Try a loop with stragglers. printf("blocking test #1 [0-65)\n"); - ParallelRangeLoop(threadMan, rangeFunc, 0, 65, 1); + ParallelRangeLoop(threadMan, rangeFunc, 0, 65, 1, 0); // Try a loop with a relatively large minimum size. printf("blocking test #2 [0-100)\n"); - ParallelRangeLoop(threadMan, rangeFunc, 0, 100, 40); + ParallelRangeLoop(threadMan, rangeFunc, 0, 100, 40, 0); // Try a loop with minimum size larger than range. printf("waitable test [10-30)\n"); - WaitableCounter *waitable2 = ParallelRangeLoopWaitable(threadMan, rangeFunc, 10, 30, 40, TaskPriority::LOW); + WaitableCounter *waitable2 = ParallelRangeLoopWaitable(threadMan, rangeFunc, 10, 30, 40, 0, TaskPriority::LOW); waitable2->WaitAndRelease(); return true; }