mirror of
https://github.com/hrydgard/ppsspp.git
synced 2024-11-22 21:09:52 +00:00
Allow specifying a cap on the number of threads used in ParallelLoop
This commit is contained in:
parent
e9e16210e8
commit
862177497a
@ -30,12 +30,15 @@ public:
|
||||
const TaskPriority priority_;
|
||||
};
|
||||
|
||||
WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, TaskPriority priority) {
|
||||
WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, int maxThreads, TaskPriority priority) {
|
||||
if (minSize == -1) {
|
||||
minSize = 1;
|
||||
}
|
||||
|
||||
int numTasks = threadMan->GetNumLooperThreads();
|
||||
if (maxThreads > 0) {
|
||||
numTasks = std::min(maxThreads, numTasks);
|
||||
}
|
||||
int range = upper - lower;
|
||||
if (range <= 0) {
|
||||
// Nothing to do. A finished counter allocated to keep the API.
|
||||
@ -88,7 +91,7 @@ WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::
|
||||
}
|
||||
}
|
||||
|
||||
void ParallelRangeLoop(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, TaskPriority priority) {
|
||||
void ParallelRangeLoop(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, int maxThreads, TaskPriority priority) {
|
||||
if (cpu_info.num_cores == 1 || (minSize >= (upper - lower) && upper > lower)) {
|
||||
// "Optimization" for single-core devices, or minSize larger than the range.
|
||||
// No point in adding threading overhead, let's just do it inline (since this is the blocking variant).
|
||||
@ -101,7 +104,7 @@ void ParallelRangeLoop(ThreadManager *threadMan, const std::function<void(int, i
|
||||
minSize = 1;
|
||||
}
|
||||
|
||||
WaitableCounter *counter = ParallelRangeLoopWaitable(threadMan, loop, lower, upper, minSize, priority);
|
||||
WaitableCounter *counter = ParallelRangeLoopWaitable(threadMan, loop, lower, upper, minSize, maxThreads, priority);
|
||||
// TODO: Optimize using minSize. We'll just compute whether there's a remainer, remove it from the call to ParallelRangeLoopWaitable,
|
||||
// and process the remainder right here. If there's no remainer, we'll steal a whole chunk.
|
||||
if (counter) {
|
||||
@ -118,12 +121,14 @@ void ParallelMemcpy(ThreadManager *threadMan, void *dst, const void *src, size_t
|
||||
}
|
||||
|
||||
// unknown's testing showed that 128kB is an appropriate minimum size.
|
||||
// Though, it probably depends on the number of CPU cores too.
|
||||
// I'm capping the number of threads at 6.
|
||||
|
||||
char *d = (char *)dst;
|
||||
const char *s = (const char *)src;
|
||||
ParallelRangeLoop(threadMan, [&](int l, int h) {
|
||||
memmove(d + l, s + l, h - l);
|
||||
}, 0, (int)bytes, 128 * 1024, priority);
|
||||
}, 0, (int)bytes, 128 * 1024, 6, priority);
|
||||
}
|
||||
|
||||
// NOTE: Supports a max of 2GB.
|
||||
@ -135,9 +140,10 @@ void ParallelMemset(ThreadManager *threadMan, void *dst, uint8_t value, size_t b
|
||||
}
|
||||
|
||||
// unknown's testing showed that 128kB is an appropriate minimum size.
|
||||
// See above though for number of threads.
|
||||
|
||||
char *d = (char *)dst;
|
||||
ParallelRangeLoop(threadMan, [&](int l, int h) {
|
||||
memset(d + l, value, h - l);
|
||||
}, 0, (int)bytes, 128 * 1024, priority);
|
||||
}, 0, (int)bytes, 128 * 1024, 6, priority);
|
||||
}
|
||||
|
@ -36,10 +36,12 @@ public:
|
||||
};
|
||||
|
||||
// Note that upper bounds are non-inclusive: range is [lower, upper)
|
||||
WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, TaskPriority priority);
|
||||
// maxThreads can be set to 0 to use the number of cores.
|
||||
WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, int maxThreads, TaskPriority priority);
|
||||
|
||||
// Note that upper bounds are non-inclusive: range is [lower, upper)
|
||||
void ParallelRangeLoop(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, TaskPriority priority = TaskPriority::NORMAL);
|
||||
// maxThreads can be set to 0 to use the number of cores.
|
||||
void ParallelRangeLoop(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, int maxThreads, TaskPriority priority = TaskPriority::NORMAL);
|
||||
|
||||
// Common utilities for large (!) memory copies.
|
||||
// Will only fall back to threads if it seems to make sense.
|
||||
|
@ -23,6 +23,8 @@
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#include "Common/Log.h"
|
||||
|
||||
// for _mm_pause
|
||||
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
|
||||
#include <immintrin.h>
|
||||
@ -270,3 +272,10 @@ void GetCurrentTimeFormatted(char formattedTime[13]) {
|
||||
// Now tack on the milliseconds
|
||||
snprintf(formattedTime, 11, "%s:%03u", tmp, milliseconds % 1000);
|
||||
}
|
||||
|
||||
LogScopeIfSlowMs::~LogScopeIfSlowMs() {
|
||||
double now = time_now_d();
|
||||
if (now > endTime_) {
|
||||
WARN_LOG(SYSTEM, "SLOW: %s took %0.2f ms", title_, (now - endTime_) * 1000.0);
|
||||
}
|
||||
}
|
||||
|
@ -49,3 +49,16 @@ private:
|
||||
int64_t nsecs_;
|
||||
#endif
|
||||
};
|
||||
|
||||
class LogScopeIfSlowMs {
|
||||
public:
|
||||
LogScopeIfSlowMs(const char *title, int limitMs) {
|
||||
title_ = title;
|
||||
endTime_ = time_now_d() + 0.001 * limitMs;
|
||||
}
|
||||
~LogScopeIfSlowMs();
|
||||
|
||||
private:
|
||||
const char *title_;
|
||||
double endTime_;
|
||||
};
|
||||
|
@ -100,7 +100,7 @@ bool ElfReader::LoadRelocations(const Elf32_Rel *rels, int numRelocs) {
|
||||
|
||||
relocOps[r] = Memory::ReadUnchecked_Instruction(addr, true).encoding;
|
||||
}
|
||||
}, 0, numRelocs, 128, TaskPriority::HIGH);
|
||||
}, 0, numRelocs, 128, 0, TaskPriority::HIGH);
|
||||
|
||||
ParallelRangeLoop(&g_threadManager, [&](int l, int h) {
|
||||
for (int r = l; r < h; r++) {
|
||||
@ -232,7 +232,7 @@ bool ElfReader::LoadRelocations(const Elf32_Rel *rels, int numRelocs) {
|
||||
Memory::WriteUnchecked_U32(op, addr);
|
||||
NotifyMemInfo(MemBlockFlags::WRITE, addr, 4, "Relocation");
|
||||
}
|
||||
}, 0, numRelocs, 128, TaskPriority::HIGH);
|
||||
}, 0, numRelocs, 128, 0, TaskPriority::HIGH);
|
||||
|
||||
if (numErrors) {
|
||||
WARN_LOG(Log::Loader, "%i bad relocations found!!!", numErrors.load());
|
||||
|
@ -346,7 +346,7 @@ static void DoMemoryVoid(PointerWrap &p, uint32_t start, uint32_t size) {
|
||||
ParallelRangeLoop(&g_threadManager, [&](int l, int h) {
|
||||
for (int i = l; i < h; i++)
|
||||
_dbg_assert_msg_(d[i] == storage[i], "Savestate verification failure: %d (0x%X) (at %p) != %d (0x%X) (at %p).\n", d[i], d[i], &d[i], storage[i], storage[i], &storage[i]);
|
||||
}, 0, size, 128);
|
||||
}, 0, size, 128, 8);
|
||||
break;
|
||||
case PointerWrap::MODE_NOOP:
|
||||
break;
|
||||
|
@ -730,10 +730,12 @@ bool ReplacedTexture::CopyLevelTo(int level, uint8_t *out, size_t outDataSize, i
|
||||
return false;
|
||||
}
|
||||
|
||||
// Dubious if this is worth it, sometimes seems to help, sometimes not.
|
||||
#define PARALLEL_COPY
|
||||
|
||||
int blockSize;
|
||||
if (!Draw::DataFormatIsBlockCompressed(fmt, &blockSize)) {
|
||||
LogScopeIfSlowMs log("memcpy", 10);
|
||||
if (fmt != Draw::DataFormat::R8G8B8A8_UNORM) {
|
||||
ERROR_LOG(Log::G3D, "Unexpected linear data format");
|
||||
return false;
|
||||
@ -755,6 +757,7 @@ bool ReplacedTexture::CopyLevelTo(int level, uint8_t *out, size_t outDataSize, i
|
||||
} else {
|
||||
#ifdef PARALLEL_COPY
|
||||
const int MIN_LINES_PER_THREAD = 4;
|
||||
const int MAX_THREADS = 6;
|
||||
ParallelRangeLoop(&g_threadManager, [&](int l, int h) {
|
||||
int extraPixels = outW - info.w;
|
||||
for (int y = l; y < h; ++y) {
|
||||
@ -762,7 +765,7 @@ bool ReplacedTexture::CopyLevelTo(int level, uint8_t *out, size_t outDataSize, i
|
||||
// Fill the rest of the line with black.
|
||||
memset((uint8_t *)out + rowPitch * y + info.w * 4, 0, extraPixels * 4);
|
||||
}
|
||||
}, 0, info.h, MIN_LINES_PER_THREAD);
|
||||
}, 0, info.h, MAX_THREADS, MIN_LINES_PER_THREAD);
|
||||
#else
|
||||
int extraPixels = outW - info.w;
|
||||
for (int y = 0; y < info.h; ++y) {
|
||||
|
@ -698,22 +698,22 @@ const int MIN_LINES_PER_THREAD = 4;
|
||||
|
||||
void TextureScalerCommon::ScaleXBRZ(int factor, u32* source, u32* dest, int width, int height) {
|
||||
xbrz::ScalerCfg cfg;
|
||||
ParallelRangeLoop(&g_threadManager, std::bind(&xbrz::scale, factor, source, dest, width, height, xbrz::ColorFormat::ARGB, cfg, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
|
||||
ParallelRangeLoop(&g_threadManager, std::bind(&xbrz::scale, factor, source, dest, width, height, xbrz::ColorFormat::ARGB, cfg, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
|
||||
}
|
||||
|
||||
void TextureScalerCommon::ScaleBilinear(int factor, u32* source, u32* dest, int width, int height) {
|
||||
bufTmp1.resize(width * height * factor);
|
||||
u32 *tmpBuf = bufTmp1.data();
|
||||
ParallelRangeLoop(&g_threadManager, std::bind(&bilinearH, factor, source, tmpBuf, width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
|
||||
ParallelRangeLoop(&g_threadManager, std::bind(&bilinearV, factor, tmpBuf, dest, width, 0, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
|
||||
ParallelRangeLoop(&g_threadManager, std::bind(&bilinearH, factor, source, tmpBuf, width, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
|
||||
ParallelRangeLoop(&g_threadManager, std::bind(&bilinearV, factor, tmpBuf, dest, width, 0, height, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
|
||||
}
|
||||
|
||||
void TextureScalerCommon::ScaleBicubicBSpline(int factor, u32* source, u32* dest, int width, int height) {
|
||||
ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicBSpline, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
|
||||
ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicBSpline, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
|
||||
}
|
||||
|
||||
void TextureScalerCommon::ScaleBicubicMitchell(int factor, u32* source, u32* dest, int width, int height) {
|
||||
ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicMitchell, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
|
||||
ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicMitchell, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
|
||||
}
|
||||
|
||||
void TextureScalerCommon::ScaleHybrid(int factor, u32* source, u32* dest, int width, int height, bool bicubic) {
|
||||
@ -730,8 +730,8 @@ void TextureScalerCommon::ScaleHybrid(int factor, u32* source, u32* dest, int wi
|
||||
bufTmp2.resize(width*height*factor*factor);
|
||||
bufTmp3.resize(width*height*factor*factor);
|
||||
|
||||
ParallelRangeLoop(&g_threadManager,std::bind(&generateDistanceMask, source, bufTmp1.data(), width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
|
||||
ParallelRangeLoop(&g_threadManager,std::bind(&convolve3x3, bufTmp1.data(), bufTmp2.data(), KERNEL_SPLAT, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
|
||||
ParallelRangeLoop(&g_threadManager,std::bind(&generateDistanceMask, source, bufTmp1.data(), width, height, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
|
||||
ParallelRangeLoop(&g_threadManager,std::bind(&convolve3x3, bufTmp1.data(), bufTmp2.data(), KERNEL_SPLAT, width, height, std::placeholders::_1, std::placeholders::_2), 0, 0, height, MIN_LINES_PER_THREAD);
|
||||
ScaleBilinear(factor, bufTmp2.data(), bufTmp3.data(), width, height);
|
||||
// mask C is now in bufTmp3
|
||||
|
||||
@ -744,13 +744,13 @@ void TextureScalerCommon::ScaleHybrid(int factor, u32* source, u32* dest, int wi
|
||||
|
||||
// Now we can mix it all together
|
||||
// The factor 8192 was found through practical testing on a variety of textures
|
||||
ParallelRangeLoop(&g_threadManager,std::bind(&mix, dest, bufTmp2.data(), bufTmp3.data(), 8192, width*factor, std::placeholders::_1, std::placeholders::_2), 0, height*factor, MIN_LINES_PER_THREAD);
|
||||
ParallelRangeLoop(&g_threadManager,std::bind(&mix, dest, bufTmp2.data(), bufTmp3.data(), 8192, width*factor, std::placeholders::_1, std::placeholders::_2), 0, height*factor, 32, MIN_LINES_PER_THREAD);
|
||||
}
|
||||
|
||||
void TextureScalerCommon::DePosterize(u32* source, u32* dest, int width, int height) {
|
||||
bufTmp3.resize(width*height);
|
||||
ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeH, source, bufTmp3.data(), width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
|
||||
ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
|
||||
ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeH, dest, bufTmp3.data(), width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
|
||||
ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
|
||||
ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeH, source, bufTmp3.data(), width, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
|
||||
ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
|
||||
ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeH, dest, bufTmp3.data(), width, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
|
||||
ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, 0, MIN_LINES_PER_THREAD);
|
||||
}
|
||||
|
@ -287,7 +287,7 @@ static const u8 *mymemmem(const u8 *haystack, size_t off, size_t hlen, const u8
|
||||
p++;
|
||||
alignp();
|
||||
}
|
||||
}, 0, range, 128 * 1024, TaskPriority::LOW);
|
||||
}, 0, range, 128 * 1024, 8, TaskPriority::LOW);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
@ -46,20 +46,20 @@ bool TestParallelLoop(ThreadManager *threadMan) {
|
||||
printf("tester thread ID: %d\n", GetCurrentThreadIdForDebug());
|
||||
|
||||
printf("waitable test\n");
|
||||
WaitableCounter *waitable = ParallelRangeLoopWaitable(threadMan, rangeFunc, 0, 7, 1, TaskPriority::HIGH);
|
||||
WaitableCounter *waitable = ParallelRangeLoopWaitable(threadMan, rangeFunc, 0, 7, 1, 0, TaskPriority::HIGH);
|
||||
// Can do stuff here if we like.
|
||||
waitable->WaitAndRelease();
|
||||
// Now it's done.
|
||||
|
||||
// Try a loop with stragglers.
|
||||
printf("blocking test #1 [0-65)\n");
|
||||
ParallelRangeLoop(threadMan, rangeFunc, 0, 65, 1);
|
||||
ParallelRangeLoop(threadMan, rangeFunc, 0, 65, 1, 0);
|
||||
// Try a loop with a relatively large minimum size.
|
||||
printf("blocking test #2 [0-100)\n");
|
||||
ParallelRangeLoop(threadMan, rangeFunc, 0, 100, 40);
|
||||
ParallelRangeLoop(threadMan, rangeFunc, 0, 100, 40, 0);
|
||||
// Try a loop with minimum size larger than range.
|
||||
printf("waitable test [10-30)\n");
|
||||
WaitableCounter *waitable2 = ParallelRangeLoopWaitable(threadMan, rangeFunc, 10, 30, 40, TaskPriority::LOW);
|
||||
WaitableCounter *waitable2 = ParallelRangeLoopWaitable(threadMan, rangeFunc, 10, 30, 40, 0, TaskPriority::LOW);
|
||||
waitable2->WaitAndRelease();
|
||||
return true;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user