Allow specifying a cap on the number of threads used in ParallelLoop

This commit is contained in:
Henrik Rydgård 2023-12-21 11:04:28 +01:00
parent e9e16210e8
commit 862177497a
10 changed files with 61 additions and 28 deletions

View File

@ -30,12 +30,15 @@ public:
const TaskPriority priority_;
};
WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, TaskPriority priority) {
WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, int maxThreads, TaskPriority priority) {
if (minSize == -1) {
minSize = 1;
}
int numTasks = threadMan->GetNumLooperThreads();
if (maxThreads > 0) {
numTasks = std::min(maxThreads, numTasks);
}
int range = upper - lower;
if (range <= 0) {
// Nothing to do. A finished counter allocated to keep the API.
@ -88,7 +91,7 @@ WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::
}
}
void ParallelRangeLoop(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, TaskPriority priority) {
void ParallelRangeLoop(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, int maxThreads, TaskPriority priority) {
if (cpu_info.num_cores == 1 || (minSize >= (upper - lower) && upper > lower)) {
// "Optimization" for single-core devices, or minSize larger than the range.
// No point in adding threading overhead, let's just do it inline (since this is the blocking variant).
@ -101,7 +104,7 @@ void ParallelRangeLoop(ThreadManager *threadMan, const std::function<void(int, i
minSize = 1;
}
WaitableCounter *counter = ParallelRangeLoopWaitable(threadMan, loop, lower, upper, minSize, priority);
WaitableCounter *counter = ParallelRangeLoopWaitable(threadMan, loop, lower, upper, minSize, maxThreads, priority);
// TODO: Optimize using minSize. We'll just compute whether there's a remainer, remove it from the call to ParallelRangeLoopWaitable,
// and process the remainder right here. If there's no remainer, we'll steal a whole chunk.
if (counter) {
@ -118,12 +121,14 @@ void ParallelMemcpy(ThreadManager *threadMan, void *dst, const void *src, size_t
}
// unknown's testing showed that 128kB is an appropriate minimum size.
// Though, it probably depends on the number of CPU cores too.
// I'm capping the number of threads at 6.
char *d = (char *)dst;
const char *s = (const char *)src;
ParallelRangeLoop(threadMan, [&](int l, int h) {
memmove(d + l, s + l, h - l);
}, 0, (int)bytes, 128 * 1024, priority);
}, 0, (int)bytes, 128 * 1024, 6, priority);
}
// NOTE: Supports a max of 2GB.
@ -135,9 +140,10 @@ void ParallelMemset(ThreadManager *threadMan, void *dst, uint8_t value, size_t b
}
// unknown's testing showed that 128kB is an appropriate minimum size.
// See above though for number of threads.
char *d = (char *)dst;
ParallelRangeLoop(threadMan, [&](int l, int h) {
memset(d + l, value, h - l);
}, 0, (int)bytes, 128 * 1024, priority);
}, 0, (int)bytes, 128 * 1024, 6, priority);
}

View File

@ -36,10 +36,12 @@ public:
};
// Note that upper bounds are non-inclusive: range is [lower, upper)
WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, TaskPriority priority);
// maxThreads can be set to 0 to use the number of cores.
WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, int maxThreads, TaskPriority priority);
// Note that upper bounds are non-inclusive: range is [lower, upper)
void ParallelRangeLoop(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, TaskPriority priority = TaskPriority::NORMAL);
// maxThreads can be set to 0 to use the number of cores.
void ParallelRangeLoop(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, int maxThreads, TaskPriority priority = TaskPriority::NORMAL);
// Common utilities for large (!) memory copies.
// Will only fall back to threads if it seems to make sense.

View File

@ -23,6 +23,8 @@
#include <unistd.h>
#endif
#include "Common/Log.h"
// for _mm_pause
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
#include <immintrin.h>
@ -270,3 +272,10 @@ void GetCurrentTimeFormatted(char formattedTime[13]) {
// Now tack on the milliseconds
snprintf(formattedTime, 11, "%s:%03u", tmp, milliseconds % 1000);
}
LogScopeIfSlowMs::~LogScopeIfSlowMs() {
double now = time_now_d();
if (now > endTime_) {
WARN_LOG(SYSTEM, "SLOW: %s took %0.2f ms", title_, (now - endTime_) * 1000.0);
}
}

View File

@ -49,3 +49,16 @@ private:
int64_t nsecs_;
#endif
};
class LogScopeIfSlowMs {
public:
LogScopeIfSlowMs(const char *title, int limitMs) {
title_ = title;
endTime_ = time_now_d() + 0.001 * limitMs;
}
~LogScopeIfSlowMs();
private:
const char *title_;
double endTime_;
};

View File

@ -100,7 +100,7 @@ bool ElfReader::LoadRelocations(const Elf32_Rel *rels, int numRelocs) {
relocOps[r] = Memory::ReadUnchecked_Instruction(addr, true).encoding;
}
}, 0, numRelocs, 128, TaskPriority::HIGH);
}, 0, numRelocs, 128, 0, TaskPriority::HIGH);
ParallelRangeLoop(&g_threadManager, [&](int l, int h) {
for (int r = l; r < h; r++) {
@ -232,7 +232,7 @@ bool ElfReader::LoadRelocations(const Elf32_Rel *rels, int numRelocs) {
Memory::WriteUnchecked_U32(op, addr);
NotifyMemInfo(MemBlockFlags::WRITE, addr, 4, "Relocation");
}
}, 0, numRelocs, 128, TaskPriority::HIGH);
}, 0, numRelocs, 128, 0, TaskPriority::HIGH);
if (numErrors) {
WARN_LOG(Log::Loader, "%i bad relocations found!!!", numErrors.load());

View File

@ -346,7 +346,7 @@ static void DoMemoryVoid(PointerWrap &p, uint32_t start, uint32_t size) {
ParallelRangeLoop(&g_threadManager, [&](int l, int h) {
for (int i = l; i < h; i++)
_dbg_assert_msg_(d[i] == storage[i], "Savestate verification failure: %d (0x%X) (at %p) != %d (0x%X) (at %p).\n", d[i], d[i], &d[i], storage[i], storage[i], &storage[i]);
}, 0, size, 128);
}, 0, size, 128, 8);
break;
case PointerWrap::MODE_NOOP:
break;

View File

@ -730,10 +730,12 @@ bool ReplacedTexture::CopyLevelTo(int level, uint8_t *out, size_t outDataSize, i
return false;
}
// Dubious if this is worth it, sometimes seems to help, sometimes not.
#define PARALLEL_COPY
int blockSize;
if (!Draw::DataFormatIsBlockCompressed(fmt, &blockSize)) {
LogScopeIfSlowMs log("memcpy", 10);
if (fmt != Draw::DataFormat::R8G8B8A8_UNORM) {
ERROR_LOG(Log::G3D, "Unexpected linear data format");
return false;
@ -755,6 +757,7 @@ bool ReplacedTexture::CopyLevelTo(int level, uint8_t *out, size_t outDataSize, i
} else {
#ifdef PARALLEL_COPY
const int MIN_LINES_PER_THREAD = 4;
const int MAX_THREADS = 6;
ParallelRangeLoop(&g_threadManager, [&](int l, int h) {
int extraPixels = outW - info.w;
for (int y = l; y < h; ++y) {
@ -762,7 +765,7 @@ bool ReplacedTexture::CopyLevelTo(int level, uint8_t *out, size_t outDataSize, i
// Fill the rest of the line with black.
memset((uint8_t *)out + rowPitch * y + info.w * 4, 0, extraPixels * 4);
}
}, 0, info.h, MIN_LINES_PER_THREAD);
}, 0, info.h, MAX_THREADS, MIN_LINES_PER_THREAD);
#else
int extraPixels = outW - info.w;
for (int y = 0; y < info.h; ++y) {

View File

@ -698,22 +698,22 @@ const int MIN_LINES_PER_THREAD = 4;
void TextureScalerCommon::ScaleXBRZ(int factor, u32* source, u32* dest, int width, int height) {
xbrz::ScalerCfg cfg;
ParallelRangeLoop(&g_threadManager, std::bind(&xbrz::scale, factor, source, dest, width, height, xbrz::ColorFormat::ARGB, cfg, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
ParallelRangeLoop(&g_threadManager, std::bind(&xbrz::scale, factor, source, dest, width, height, xbrz::ColorFormat::ARGB, cfg, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
}
void TextureScalerCommon::ScaleBilinear(int factor, u32* source, u32* dest, int width, int height) {
bufTmp1.resize(width * height * factor);
u32 *tmpBuf = bufTmp1.data();
ParallelRangeLoop(&g_threadManager, std::bind(&bilinearH, factor, source, tmpBuf, width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
ParallelRangeLoop(&g_threadManager, std::bind(&bilinearV, factor, tmpBuf, dest, width, 0, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
ParallelRangeLoop(&g_threadManager, std::bind(&bilinearH, factor, source, tmpBuf, width, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
ParallelRangeLoop(&g_threadManager, std::bind(&bilinearV, factor, tmpBuf, dest, width, 0, height, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
}
void TextureScalerCommon::ScaleBicubicBSpline(int factor, u32* source, u32* dest, int width, int height) {
ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicBSpline, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicBSpline, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
}
void TextureScalerCommon::ScaleBicubicMitchell(int factor, u32* source, u32* dest, int width, int height) {
ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicMitchell, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicMitchell, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
}
void TextureScalerCommon::ScaleHybrid(int factor, u32* source, u32* dest, int width, int height, bool bicubic) {
@ -730,8 +730,8 @@ void TextureScalerCommon::ScaleHybrid(int factor, u32* source, u32* dest, int wi
bufTmp2.resize(width*height*factor*factor);
bufTmp3.resize(width*height*factor*factor);
ParallelRangeLoop(&g_threadManager,std::bind(&generateDistanceMask, source, bufTmp1.data(), width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
ParallelRangeLoop(&g_threadManager,std::bind(&convolve3x3, bufTmp1.data(), bufTmp2.data(), KERNEL_SPLAT, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
ParallelRangeLoop(&g_threadManager,std::bind(&generateDistanceMask, source, bufTmp1.data(), width, height, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
ParallelRangeLoop(&g_threadManager,std::bind(&convolve3x3, bufTmp1.data(), bufTmp2.data(), KERNEL_SPLAT, width, height, std::placeholders::_1, std::placeholders::_2), 0, 0, height, MIN_LINES_PER_THREAD);
ScaleBilinear(factor, bufTmp2.data(), bufTmp3.data(), width, height);
// mask C is now in bufTmp3
@ -744,13 +744,13 @@ void TextureScalerCommon::ScaleHybrid(int factor, u32* source, u32* dest, int wi
// Now we can mix it all together
// The factor 8192 was found through practical testing on a variety of textures
ParallelRangeLoop(&g_threadManager,std::bind(&mix, dest, bufTmp2.data(), bufTmp3.data(), 8192, width*factor, std::placeholders::_1, std::placeholders::_2), 0, height*factor, MIN_LINES_PER_THREAD);
ParallelRangeLoop(&g_threadManager,std::bind(&mix, dest, bufTmp2.data(), bufTmp3.data(), 8192, width*factor, std::placeholders::_1, std::placeholders::_2), 0, height*factor, 32, MIN_LINES_PER_THREAD);
}
void TextureScalerCommon::DePosterize(u32* source, u32* dest, int width, int height) {
bufTmp3.resize(width*height);
ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeH, source, bufTmp3.data(), width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeH, dest, bufTmp3.data(), width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeH, source, bufTmp3.data(), width, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeH, dest, bufTmp3.data(), width, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
}

View File

@ -287,7 +287,7 @@ static const u8 *mymemmem(const u8 *haystack, size_t off, size_t hlen, const u8
p++;
alignp();
}
}, 0, range, 128 * 1024, TaskPriority::LOW);
}, 0, range, 128 * 1024, 8, TaskPriority::LOW);
return result;
}

View File

@ -46,20 +46,20 @@ bool TestParallelLoop(ThreadManager *threadMan) {
printf("tester thread ID: %d\n", GetCurrentThreadIdForDebug());
printf("waitable test\n");
WaitableCounter *waitable = ParallelRangeLoopWaitable(threadMan, rangeFunc, 0, 7, 1, TaskPriority::HIGH);
WaitableCounter *waitable = ParallelRangeLoopWaitable(threadMan, rangeFunc, 0, 7, 1, 0, TaskPriority::HIGH);
// Can do stuff here if we like.
waitable->WaitAndRelease();
// Now it's done.
// Try a loop with stragglers.
printf("blocking test #1 [0-65)\n");
ParallelRangeLoop(threadMan, rangeFunc, 0, 65, 1);
ParallelRangeLoop(threadMan, rangeFunc, 0, 65, 1, 0);
// Try a loop with a relatively large minimum size.
printf("blocking test #2 [0-100)\n");
ParallelRangeLoop(threadMan, rangeFunc, 0, 100, 40);
ParallelRangeLoop(threadMan, rangeFunc, 0, 100, 40, 0);
// Try a loop with minimum size larger than range.
printf("waitable test [10-30)\n");
WaitableCounter *waitable2 = ParallelRangeLoopWaitable(threadMan, rangeFunc, 10, 30, 40, TaskPriority::LOW);
WaitableCounter *waitable2 = ParallelRangeLoopWaitable(threadMan, rangeFunc, 10, 30, 40, 0, TaskPriority::LOW);
waitable2->WaitAndRelease();
return true;
}