Allow specifying a cap on the number of threads used in ParallelLoop

This commit is contained in:
Henrik Rydgård 2023-12-21 11:04:28 +01:00
parent e9e16210e8
commit 862177497a
10 changed files with 61 additions and 28 deletions

View File

@ -30,12 +30,15 @@ public:
const TaskPriority priority_; const TaskPriority priority_;
}; };
WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, TaskPriority priority) { WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, int maxThreads, TaskPriority priority) {
if (minSize == -1) { if (minSize == -1) {
minSize = 1; minSize = 1;
} }
int numTasks = threadMan->GetNumLooperThreads(); int numTasks = threadMan->GetNumLooperThreads();
if (maxThreads > 0) {
numTasks = std::min(maxThreads, numTasks);
}
int range = upper - lower; int range = upper - lower;
if (range <= 0) { if (range <= 0) {
// Nothing to do. A finished counter allocated to keep the API. // Nothing to do. A finished counter allocated to keep the API.
@ -88,7 +91,7 @@ WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::
} }
} }
void ParallelRangeLoop(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, TaskPriority priority) { void ParallelRangeLoop(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, int maxThreads, TaskPriority priority) {
if (cpu_info.num_cores == 1 || (minSize >= (upper - lower) && upper > lower)) { if (cpu_info.num_cores == 1 || (minSize >= (upper - lower) && upper > lower)) {
// "Optimization" for single-core devices, or minSize larger than the range. // "Optimization" for single-core devices, or minSize larger than the range.
// No point in adding threading overhead, let's just do it inline (since this is the blocking variant). // No point in adding threading overhead, let's just do it inline (since this is the blocking variant).
@ -101,7 +104,7 @@ void ParallelRangeLoop(ThreadManager *threadMan, const std::function<void(int, i
minSize = 1; minSize = 1;
} }
WaitableCounter *counter = ParallelRangeLoopWaitable(threadMan, loop, lower, upper, minSize, priority); WaitableCounter *counter = ParallelRangeLoopWaitable(threadMan, loop, lower, upper, minSize, maxThreads, priority);
// TODO: Optimize using minSize. We'll just compute whether there's a remainer, remove it from the call to ParallelRangeLoopWaitable, // TODO: Optimize using minSize. We'll just compute whether there's a remainer, remove it from the call to ParallelRangeLoopWaitable,
// and process the remainder right here. If there's no remainer, we'll steal a whole chunk. // and process the remainder right here. If there's no remainer, we'll steal a whole chunk.
if (counter) { if (counter) {
@ -118,12 +121,14 @@ void ParallelMemcpy(ThreadManager *threadMan, void *dst, const void *src, size_t
} }
// unknown's testing showed that 128kB is an appropriate minimum size. // unknown's testing showed that 128kB is an appropriate minimum size.
// Though, it probably depends on the number of CPU cores too.
// I'm capping the number of threads at 6.
char *d = (char *)dst; char *d = (char *)dst;
const char *s = (const char *)src; const char *s = (const char *)src;
ParallelRangeLoop(threadMan, [&](int l, int h) { ParallelRangeLoop(threadMan, [&](int l, int h) {
memmove(d + l, s + l, h - l); memmove(d + l, s + l, h - l);
}, 0, (int)bytes, 128 * 1024, priority); }, 0, (int)bytes, 128 * 1024, 6, priority);
} }
// NOTE: Supports a max of 2GB. // NOTE: Supports a max of 2GB.
@ -135,9 +140,10 @@ void ParallelMemset(ThreadManager *threadMan, void *dst, uint8_t value, size_t b
} }
// unknown's testing showed that 128kB is an appropriate minimum size. // unknown's testing showed that 128kB is an appropriate minimum size.
// See above though for number of threads.
char *d = (char *)dst; char *d = (char *)dst;
ParallelRangeLoop(threadMan, [&](int l, int h) { ParallelRangeLoop(threadMan, [&](int l, int h) {
memset(d + l, value, h - l); memset(d + l, value, h - l);
}, 0, (int)bytes, 128 * 1024, priority); }, 0, (int)bytes, 128 * 1024, 6, priority);
} }

View File

@ -36,10 +36,12 @@ public:
}; };
// Note that upper bounds are non-inclusive: range is [lower, upper) // Note that upper bounds are non-inclusive: range is [lower, upper)
WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, TaskPriority priority); // maxThreads can be set to 0 to use the number of cores.
WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, int maxThreads, TaskPriority priority);
// Note that upper bounds are non-inclusive: range is [lower, upper) // Note that upper bounds are non-inclusive: range is [lower, upper)
void ParallelRangeLoop(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, TaskPriority priority = TaskPriority::NORMAL); // maxThreads can be set to 0 to use the number of cores.
void ParallelRangeLoop(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, int maxThreads, TaskPriority priority = TaskPriority::NORMAL);
// Common utilities for large (!) memory copies. // Common utilities for large (!) memory copies.
// Will only fall back to threads if it seems to make sense. // Will only fall back to threads if it seems to make sense.

View File

@ -23,6 +23,8 @@
#include <unistd.h> #include <unistd.h>
#endif #endif
#include "Common/Log.h"
// for _mm_pause // for _mm_pause
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64) #if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
#include <immintrin.h> #include <immintrin.h>
@ -270,3 +272,10 @@ void GetCurrentTimeFormatted(char formattedTime[13]) {
// Now tack on the milliseconds // Now tack on the milliseconds
snprintf(formattedTime, 11, "%s:%03u", tmp, milliseconds % 1000); snprintf(formattedTime, 11, "%s:%03u", tmp, milliseconds % 1000);
} }
LogScopeIfSlowMs::~LogScopeIfSlowMs() {
double now = time_now_d();
if (now > endTime_) {
WARN_LOG(SYSTEM, "SLOW: %s took %0.2f ms", title_, (now - endTime_) * 1000.0);
}
}

View File

@ -49,3 +49,16 @@ private:
int64_t nsecs_; int64_t nsecs_;
#endif #endif
}; };
class LogScopeIfSlowMs {
public:
LogScopeIfSlowMs(const char *title, int limitMs) {
title_ = title;
endTime_ = time_now_d() + 0.001 * limitMs;
}
~LogScopeIfSlowMs();
private:
const char *title_;
double endTime_;
};

View File

@ -100,7 +100,7 @@ bool ElfReader::LoadRelocations(const Elf32_Rel *rels, int numRelocs) {
relocOps[r] = Memory::ReadUnchecked_Instruction(addr, true).encoding; relocOps[r] = Memory::ReadUnchecked_Instruction(addr, true).encoding;
} }
}, 0, numRelocs, 128, TaskPriority::HIGH); }, 0, numRelocs, 128, 0, TaskPriority::HIGH);
ParallelRangeLoop(&g_threadManager, [&](int l, int h) { ParallelRangeLoop(&g_threadManager, [&](int l, int h) {
for (int r = l; r < h; r++) { for (int r = l; r < h; r++) {
@ -232,7 +232,7 @@ bool ElfReader::LoadRelocations(const Elf32_Rel *rels, int numRelocs) {
Memory::WriteUnchecked_U32(op, addr); Memory::WriteUnchecked_U32(op, addr);
NotifyMemInfo(MemBlockFlags::WRITE, addr, 4, "Relocation"); NotifyMemInfo(MemBlockFlags::WRITE, addr, 4, "Relocation");
} }
}, 0, numRelocs, 128, TaskPriority::HIGH); }, 0, numRelocs, 128, 0, TaskPriority::HIGH);
if (numErrors) { if (numErrors) {
WARN_LOG(Log::Loader, "%i bad relocations found!!!", numErrors.load()); WARN_LOG(Log::Loader, "%i bad relocations found!!!", numErrors.load());

View File

@ -346,7 +346,7 @@ static void DoMemoryVoid(PointerWrap &p, uint32_t start, uint32_t size) {
ParallelRangeLoop(&g_threadManager, [&](int l, int h) { ParallelRangeLoop(&g_threadManager, [&](int l, int h) {
for (int i = l; i < h; i++) for (int i = l; i < h; i++)
_dbg_assert_msg_(d[i] == storage[i], "Savestate verification failure: %d (0x%X) (at %p) != %d (0x%X) (at %p).\n", d[i], d[i], &d[i], storage[i], storage[i], &storage[i]); _dbg_assert_msg_(d[i] == storage[i], "Savestate verification failure: %d (0x%X) (at %p) != %d (0x%X) (at %p).\n", d[i], d[i], &d[i], storage[i], storage[i], &storage[i]);
}, 0, size, 128); }, 0, size, 128, 8);
break; break;
case PointerWrap::MODE_NOOP: case PointerWrap::MODE_NOOP:
break; break;

View File

@ -730,10 +730,12 @@ bool ReplacedTexture::CopyLevelTo(int level, uint8_t *out, size_t outDataSize, i
return false; return false;
} }
// Dubious if this is worth it, sometimes seems to help, sometimes not.
#define PARALLEL_COPY #define PARALLEL_COPY
int blockSize; int blockSize;
if (!Draw::DataFormatIsBlockCompressed(fmt, &blockSize)) { if (!Draw::DataFormatIsBlockCompressed(fmt, &blockSize)) {
LogScopeIfSlowMs log("memcpy", 10);
if (fmt != Draw::DataFormat::R8G8B8A8_UNORM) { if (fmt != Draw::DataFormat::R8G8B8A8_UNORM) {
ERROR_LOG(Log::G3D, "Unexpected linear data format"); ERROR_LOG(Log::G3D, "Unexpected linear data format");
return false; return false;
@ -755,6 +757,7 @@ bool ReplacedTexture::CopyLevelTo(int level, uint8_t *out, size_t outDataSize, i
} else { } else {
#ifdef PARALLEL_COPY #ifdef PARALLEL_COPY
const int MIN_LINES_PER_THREAD = 4; const int MIN_LINES_PER_THREAD = 4;
const int MAX_THREADS = 6;
ParallelRangeLoop(&g_threadManager, [&](int l, int h) { ParallelRangeLoop(&g_threadManager, [&](int l, int h) {
int extraPixels = outW - info.w; int extraPixels = outW - info.w;
for (int y = l; y < h; ++y) { for (int y = l; y < h; ++y) {
@ -762,7 +765,7 @@ bool ReplacedTexture::CopyLevelTo(int level, uint8_t *out, size_t outDataSize, i
// Fill the rest of the line with black. // Fill the rest of the line with black.
memset((uint8_t *)out + rowPitch * y + info.w * 4, 0, extraPixels * 4); memset((uint8_t *)out + rowPitch * y + info.w * 4, 0, extraPixels * 4);
} }
}, 0, info.h, MIN_LINES_PER_THREAD); }, 0, info.h, MAX_THREADS, MIN_LINES_PER_THREAD);
#else #else
int extraPixels = outW - info.w; int extraPixels = outW - info.w;
for (int y = 0; y < info.h; ++y) { for (int y = 0; y < info.h; ++y) {

View File

@ -698,22 +698,22 @@ const int MIN_LINES_PER_THREAD = 4;
void TextureScalerCommon::ScaleXBRZ(int factor, u32* source, u32* dest, int width, int height) { void TextureScalerCommon::ScaleXBRZ(int factor, u32* source, u32* dest, int width, int height) {
xbrz::ScalerCfg cfg; xbrz::ScalerCfg cfg;
ParallelRangeLoop(&g_threadManager, std::bind(&xbrz::scale, factor, source, dest, width, height, xbrz::ColorFormat::ARGB, cfg, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD); ParallelRangeLoop(&g_threadManager, std::bind(&xbrz::scale, factor, source, dest, width, height, xbrz::ColorFormat::ARGB, cfg, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
} }
void TextureScalerCommon::ScaleBilinear(int factor, u32* source, u32* dest, int width, int height) { void TextureScalerCommon::ScaleBilinear(int factor, u32* source, u32* dest, int width, int height) {
bufTmp1.resize(width * height * factor); bufTmp1.resize(width * height * factor);
u32 *tmpBuf = bufTmp1.data(); u32 *tmpBuf = bufTmp1.data();
ParallelRangeLoop(&g_threadManager, std::bind(&bilinearH, factor, source, tmpBuf, width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD); ParallelRangeLoop(&g_threadManager, std::bind(&bilinearH, factor, source, tmpBuf, width, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
ParallelRangeLoop(&g_threadManager, std::bind(&bilinearV, factor, tmpBuf, dest, width, 0, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD); ParallelRangeLoop(&g_threadManager, std::bind(&bilinearV, factor, tmpBuf, dest, width, 0, height, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
} }
void TextureScalerCommon::ScaleBicubicBSpline(int factor, u32* source, u32* dest, int width, int height) { void TextureScalerCommon::ScaleBicubicBSpline(int factor, u32* source, u32* dest, int width, int height) {
ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicBSpline, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD); ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicBSpline, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
} }
void TextureScalerCommon::ScaleBicubicMitchell(int factor, u32* source, u32* dest, int width, int height) { void TextureScalerCommon::ScaleBicubicMitchell(int factor, u32* source, u32* dest, int width, int height) {
ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicMitchell, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD); ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicMitchell, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
} }
void TextureScalerCommon::ScaleHybrid(int factor, u32* source, u32* dest, int width, int height, bool bicubic) { void TextureScalerCommon::ScaleHybrid(int factor, u32* source, u32* dest, int width, int height, bool bicubic) {
@ -730,8 +730,8 @@ void TextureScalerCommon::ScaleHybrid(int factor, u32* source, u32* dest, int wi
bufTmp2.resize(width*height*factor*factor); bufTmp2.resize(width*height*factor*factor);
bufTmp3.resize(width*height*factor*factor); bufTmp3.resize(width*height*factor*factor);
ParallelRangeLoop(&g_threadManager,std::bind(&generateDistanceMask, source, bufTmp1.data(), width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD); ParallelRangeLoop(&g_threadManager,std::bind(&generateDistanceMask, source, bufTmp1.data(), width, height, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
ParallelRangeLoop(&g_threadManager,std::bind(&convolve3x3, bufTmp1.data(), bufTmp2.data(), KERNEL_SPLAT, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD); ParallelRangeLoop(&g_threadManager,std::bind(&convolve3x3, bufTmp1.data(), bufTmp2.data(), KERNEL_SPLAT, width, height, std::placeholders::_1, std::placeholders::_2), 0, 0, height, MIN_LINES_PER_THREAD);
ScaleBilinear(factor, bufTmp2.data(), bufTmp3.data(), width, height); ScaleBilinear(factor, bufTmp2.data(), bufTmp3.data(), width, height);
// mask C is now in bufTmp3 // mask C is now in bufTmp3
@ -744,13 +744,13 @@ void TextureScalerCommon::ScaleHybrid(int factor, u32* source, u32* dest, int wi
// Now we can mix it all together // Now we can mix it all together
// The factor 8192 was found through practical testing on a variety of textures // The factor 8192 was found through practical testing on a variety of textures
ParallelRangeLoop(&g_threadManager,std::bind(&mix, dest, bufTmp2.data(), bufTmp3.data(), 8192, width*factor, std::placeholders::_1, std::placeholders::_2), 0, height*factor, MIN_LINES_PER_THREAD); ParallelRangeLoop(&g_threadManager,std::bind(&mix, dest, bufTmp2.data(), bufTmp3.data(), 8192, width*factor, std::placeholders::_1, std::placeholders::_2), 0, height*factor, 32, MIN_LINES_PER_THREAD);
} }
void TextureScalerCommon::DePosterize(u32* source, u32* dest, int width, int height) { void TextureScalerCommon::DePosterize(u32* source, u32* dest, int width, int height) {
bufTmp3.resize(width*height); bufTmp3.resize(width*height);
ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeH, source, bufTmp3.data(), width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD); ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeH, source, bufTmp3.data(), width, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD); ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeH, dest, bufTmp3.data(), width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD); ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeH, dest, bufTmp3.data(), width, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD); ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
} }

View File

@ -287,7 +287,7 @@ static const u8 *mymemmem(const u8 *haystack, size_t off, size_t hlen, const u8
p++; p++;
alignp(); alignp();
} }
}, 0, range, 128 * 1024, TaskPriority::LOW); }, 0, range, 128 * 1024, 8, TaskPriority::LOW);
return result; return result;
} }

View File

@ -46,20 +46,20 @@ bool TestParallelLoop(ThreadManager *threadMan) {
printf("tester thread ID: %d\n", GetCurrentThreadIdForDebug()); printf("tester thread ID: %d\n", GetCurrentThreadIdForDebug());
printf("waitable test\n"); printf("waitable test\n");
WaitableCounter *waitable = ParallelRangeLoopWaitable(threadMan, rangeFunc, 0, 7, 1, TaskPriority::HIGH); WaitableCounter *waitable = ParallelRangeLoopWaitable(threadMan, rangeFunc, 0, 7, 1, 0, TaskPriority::HIGH);
// Can do stuff here if we like. // Can do stuff here if we like.
waitable->WaitAndRelease(); waitable->WaitAndRelease();
// Now it's done. // Now it's done.
// Try a loop with stragglers. // Try a loop with stragglers.
printf("blocking test #1 [0-65)\n"); printf("blocking test #1 [0-65)\n");
ParallelRangeLoop(threadMan, rangeFunc, 0, 65, 1); ParallelRangeLoop(threadMan, rangeFunc, 0, 65, 1, 0);
// Try a loop with a relatively large minimum size. // Try a loop with a relatively large minimum size.
printf("blocking test #2 [0-100)\n"); printf("blocking test #2 [0-100)\n");
ParallelRangeLoop(threadMan, rangeFunc, 0, 100, 40); ParallelRangeLoop(threadMan, rangeFunc, 0, 100, 40, 0);
// Try a loop with minimum size larger than range. // Try a loop with minimum size larger than range.
printf("waitable test [10-30)\n"); printf("waitable test [10-30)\n");
WaitableCounter *waitable2 = ParallelRangeLoopWaitable(threadMan, rangeFunc, 10, 30, 40, TaskPriority::LOW); WaitableCounter *waitable2 = ParallelRangeLoopWaitable(threadMan, rangeFunc, 10, 30, 40, 0, TaskPriority::LOW);
waitable2->WaitAndRelease(); waitable2->WaitAndRelease();
return true; return true;
} }