Allow specifying a cap on the number of threads used in ParallelLoop

2024-11-22 21:09:52 +00:00 · 2023-12-21 11:04:28 +01:00 · 2023-12-21 11:04:28 +01:00 · 862177497a
commit 862177497a
parent e9e16210e8
10 changed files with 61 additions and 28 deletions
--- a/Common/Thread/ParallelLoop.cpp
+++ b/Common/Thread/ParallelLoop.cpp
@ -30,12 +30,15 @@ public:
 	const TaskPriority priority_;
 };

-WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, TaskPriority priority) {
+WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, int maxThreads, TaskPriority priority) {
 	if (minSize == -1) {
 		minSize = 1;
 	}

 	int numTasks = threadMan->GetNumLooperThreads();
+	if (maxThreads > 0) {
+		numTasks = std::min(maxThreads, numTasks);
+	}
 	int range = upper - lower;
 	if (range <= 0) {
 		// Nothing to do. A finished counter allocated to keep the API.
@ -88,7 +91,7 @@ WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::
 	}
 }

-void ParallelRangeLoop(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, TaskPriority priority) {
+void ParallelRangeLoop(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, int maxThreads, TaskPriority priority) {
 	if (cpu_info.num_cores == 1 || (minSize >= (upper - lower) && upper > lower)) {
 		// "Optimization" for single-core devices, or minSize larger than the range.
 		// No point in adding threading overhead, let's just do it inline (since this is the blocking variant).
@ -101,7 +104,7 @@ void ParallelRangeLoop(ThreadManager *threadMan, const std::function<void(int, i
 		minSize = 1;
 	}

-	WaitableCounter *counter = ParallelRangeLoopWaitable(threadMan, loop, lower, upper, minSize, priority);
+	WaitableCounter *counter = ParallelRangeLoopWaitable(threadMan, loop, lower, upper, minSize, maxThreads, priority);
 	// TODO: Optimize using minSize. We'll just compute whether there's a remainer, remove it from the call to ParallelRangeLoopWaitable,
 	// and process the remainder right here. If there's no remainer, we'll steal a whole chunk.
 	if (counter) {
@ -118,12 +121,14 @@ void ParallelMemcpy(ThreadManager *threadMan, void *dst, const void *src, size_t
 	}

 	// unknown's testing showed that 128kB is an appropriate minimum size.
+	// Though, it probably depends on the number of CPU cores too.
+	// I'm capping the number of threads at 6.

 	char *d = (char *)dst;
 	const char *s = (const char *)src;
 	ParallelRangeLoop(threadMan, [&](int l, int h) {
 		memmove(d + l, s + l, h - l);
-	}, 0, (int)bytes, 128 * 1024, priority);
+	}, 0, (int)bytes, 128 * 1024, 6, priority);
 }

 // NOTE: Supports a max of 2GB.
@ -135,9 +140,10 @@ void ParallelMemset(ThreadManager *threadMan, void *dst, uint8_t value, size_t b
 	}

 	// unknown's testing showed that 128kB is an appropriate minimum size.
+	// See above though for number of threads.

 	char *d = (char *)dst;
 	ParallelRangeLoop(threadMan, [&](int l, int h) {
 		memset(d + l, value, h - l);
-	}, 0, (int)bytes, 128 * 1024, priority);
+	}, 0, (int)bytes, 128 * 1024, 6, priority);
 }
--- a/Common/Thread/ParallelLoop.h
+++ b/Common/Thread/ParallelLoop.h
@ -36,10 +36,12 @@ public:
 };

 // Note that upper bounds are non-inclusive: range is [lower, upper)
-WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, TaskPriority priority);
+// maxThreads caps the number of tasks; 0 (or any non-positive value) applies no cap beyond the thread manager's looper thread count.
+WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, int maxThreads, TaskPriority priority);

 // Note that upper bounds are non-inclusive: range is [lower, upper)
-void ParallelRangeLoop(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, TaskPriority priority = TaskPriority::NORMAL);
+// maxThreads is forwarded to ParallelRangeLoopWaitable: a positive value caps the task count, 0 leaves it uncapped.
+void ParallelRangeLoop(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize, int maxThreads, TaskPriority priority = TaskPriority::NORMAL);

 // Common utilities for large (!) memory copies.
 // Will only fall back to threads if it seems to make sense.
--- a/Common/TimeUtil.cpp
+++ b/Common/TimeUtil.cpp
@ -23,6 +23,8 @@
 #include <unistd.h>
 #endif

+#include "Common/Log.h"
+
 // for _mm_pause
 #if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
 #include <immintrin.h>
@ -270,3 +272,10 @@ void GetCurrentTimeFormatted(char formattedTime[13]) {
 	// Now tack on the milliseconds
 	snprintf(formattedTime, 11, "%s:%03u", tmp, milliseconds % 1000);
 }
+
+LogScopeIfSlowMs::~LogScopeIfSlowMs() {
+	const double now = time_now_d();
+	if (now > endTime_) {  // Past the deadline set in the constructor; (now - endTime_) is only the overshoot, not the total time.
+		WARN_LOG(Log::System, "SLOW: %s exceeded its time limit by %0.2f ms", title_, (now - endTime_) * 1000.0);
+	}
+}
--- a/Common/TimeUtil.h
+++ b/Common/TimeUtil.h
@ -49,3 +49,16 @@ private:
 	int64_t nsecs_;
 #endif
 };
+
+// Scope timer: records a deadline when constructed; the out-of-line destructor compares the current time against it.
+class LogScopeIfSlowMs {
+public:
+	// limitMs is scaled by 0.001 and added to the current time_now_d() reading to form the deadline.
+	LogScopeIfSlowMs(const char *title, int limitMs)
+		: title_(title), endTime_(time_now_d() + 0.001 * limitMs) {}
+	~LogScopeIfSlowMs();
+
+private:
+	const char *title_;  // Stored as a raw pointer; the string itself is not copied.
+	double endTime_;  // Deadline, on the same clock as time_now_d().
+};
--- a/Core/ELF/ElfReader.cpp
+++ b/Core/ELF/ElfReader.cpp
@ -100,7 +100,7 @@ bool ElfReader::LoadRelocations(const Elf32_Rel *rels, int numRelocs) {

 			relocOps[r] = Memory::ReadUnchecked_Instruction(addr, true).encoding;
 		}
-	}, 0, numRelocs, 128, TaskPriority::HIGH);
+	}, 0, numRelocs, 128, 0, TaskPriority::HIGH);

 	ParallelRangeLoop(&g_threadManager, [&](int l, int h) {
 		for (int r = l; r < h; r++) {
@ -232,7 +232,7 @@ bool ElfReader::LoadRelocations(const Elf32_Rel *rels, int numRelocs) {
 			Memory::WriteUnchecked_U32(op, addr);
 			NotifyMemInfo(MemBlockFlags::WRITE, addr, 4, "Relocation");
 		}
-	}, 0, numRelocs, 128, TaskPriority::HIGH);
+	}, 0, numRelocs, 128, 0, TaskPriority::HIGH);

 	if (numErrors) {
 		WARN_LOG(Log::Loader, "%i bad relocations found!!!", numErrors.load());
--- a/Core/MemMap.cpp
+++ b/Core/MemMap.cpp
@ -346,7 +346,7 @@ static void DoMemoryVoid(PointerWrap &p, uint32_t start, uint32_t size) {
 		ParallelRangeLoop(&g_threadManager, [&](int l, int h) {
 			for (int i = l; i < h; i++)
 				_dbg_assert_msg_(d[i] == storage[i], "Savestate verification failure: %d (0x%X) (at %p) != %d (0x%X) (at %p).\n", d[i], d[i], &d[i], storage[i], storage[i], &storage[i]);
-		}, 0, size, 128);
+		}, 0, size, 128, 8);
 		break;
 	case PointerWrap::MODE_NOOP:
 		break;
--- a/GPU/Common/ReplacedTexture.cpp
+++ b/GPU/Common/ReplacedTexture.cpp
@ -730,10 +730,12 @@ bool ReplacedTexture::CopyLevelTo(int level, uint8_t *out, size_t outDataSize, i
 		return false;
 	}

+	// Dubious if this is worth it, sometimes seems to help, sometimes not.
 #define PARALLEL_COPY

 	int blockSize;
 	if (!Draw::DataFormatIsBlockCompressed(fmt, &blockSize)) {
+		LogScopeIfSlowMs log("memcpy", 10);
 		if (fmt != Draw::DataFormat::R8G8B8A8_UNORM) {
 			ERROR_LOG(Log::G3D, "Unexpected linear data format");
 			return false;
@ -755,6 +757,7 @@ bool ReplacedTexture::CopyLevelTo(int level, uint8_t *out, size_t outDataSize, i
 		} else {
 #ifdef PARALLEL_COPY
 			const int MIN_LINES_PER_THREAD = 4;
+			const int MAX_THREADS = 6;
 			ParallelRangeLoop(&g_threadManager, [&](int l, int h) {
 				int extraPixels = outW - info.w;
 				for (int y = l; y < h; ++y) {
@ -762,7 +765,7 @@ bool ReplacedTexture::CopyLevelTo(int level, uint8_t *out, size_t outDataSize, i
 					// Fill the rest of the line with black.
 					memset((uint8_t *)out + rowPitch * y + info.w * 4, 0, extraPixels * 4);
 				}
-				}, 0, info.h, MIN_LINES_PER_THREAD);
+				}, 0, info.h, MIN_LINES_PER_THREAD, MAX_THREADS);  // minSize first, then maxThreads
 #else
 			int extraPixels = outW - info.w;
 			for (int y = 0; y < info.h; ++y) {
--- a/GPU/Common/TextureScalerCommon.cpp
+++ b/GPU/Common/TextureScalerCommon.cpp
@ -698,22 +698,22 @@ const int MIN_LINES_PER_THREAD = 4;

 void TextureScalerCommon::ScaleXBRZ(int factor, u32* source, u32* dest, int width, int height) {
 	xbrz::ScalerCfg cfg;
-	ParallelRangeLoop(&g_threadManager, std::bind(&xbrz::scale, factor, source, dest, width, height, xbrz::ColorFormat::ARGB, cfg, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
+	ParallelRangeLoop(&g_threadManager, std::bind(&xbrz::scale, factor, source, dest, width, height, xbrz::ColorFormat::ARGB, cfg, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD, 0);  // minSize, then maxThreads (0 = no cap)
 }

 void TextureScalerCommon::ScaleBilinear(int factor, u32* source, u32* dest, int width, int height) {
 	bufTmp1.resize(width * height * factor);
 	u32 *tmpBuf = bufTmp1.data();
-	ParallelRangeLoop(&g_threadManager, std::bind(&bilinearH, factor, source, tmpBuf, width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
-	ParallelRangeLoop(&g_threadManager, std::bind(&bilinearV, factor, tmpBuf, dest, width, 0, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
+	ParallelRangeLoop(&g_threadManager, std::bind(&bilinearH, factor, source, tmpBuf, width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD, 0);  // minSize, then maxThreads (0 = no cap)
+	ParallelRangeLoop(&g_threadManager, std::bind(&bilinearV, factor, tmpBuf, dest, width, 0, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD, 0);  // minSize, then maxThreads (0 = no cap)
 }

 void TextureScalerCommon::ScaleBicubicBSpline(int factor, u32* source, u32* dest, int width, int height) {
-	ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicBSpline, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
+	ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicBSpline, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD, 0);  // minSize, then maxThreads (0 = no cap)
 }

 void TextureScalerCommon::ScaleBicubicMitchell(int factor, u32* source, u32* dest, int width, int height) {
-	ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicMitchell, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
+	ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicMitchell, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD, 0);  // minSize, then maxThreads (0 = no cap)
 }

 void TextureScalerCommon::ScaleHybrid(int factor, u32* source, u32* dest, int width, int height, bool bicubic) {
@ -730,8 +730,8 @@ void TextureScalerCommon::ScaleHybrid(int factor, u32* source, u32* dest, int wi
 	bufTmp2.resize(width*height*factor*factor);
 	bufTmp3.resize(width*height*factor*factor);

-	ParallelRangeLoop(&g_threadManager,std::bind(&generateDistanceMask, source, bufTmp1.data(), width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
-	ParallelRangeLoop(&g_threadManager,std::bind(&convolve3x3, bufTmp1.data(), bufTmp2.data(), KERNEL_SPLAT, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
+	ParallelRangeLoop(&g_threadManager,std::bind(&generateDistanceMask, source, bufTmp1.data(), width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD, 0);  // minSize, then maxThreads (0 = no cap)
+	ParallelRangeLoop(&g_threadManager,std::bind(&convolve3x3, bufTmp1.data(), bufTmp2.data(), KERNEL_SPLAT, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD, 0);  // range is [0, height); minSize, then maxThreads (0 = no cap)
 	ScaleBilinear(factor, bufTmp2.data(), bufTmp3.data(), width, height);
 	// mask C is now in bufTmp3

@ -744,13 +744,13 @@ void TextureScalerCommon::ScaleHybrid(int factor, u32* source, u32* dest, int wi

 	// Now we can mix it all together
 	// The factor 8192 was found through practical testing on a variety of textures
-	ParallelRangeLoop(&g_threadManager,std::bind(&mix, dest, bufTmp2.data(), bufTmp3.data(), 8192, width*factor, std::placeholders::_1, std::placeholders::_2), 0, height*factor, MIN_LINES_PER_THREAD);
+	ParallelRangeLoop(&g_threadManager,std::bind(&mix, dest, bufTmp2.data(), bufTmp3.data(), 8192, width*factor, std::placeholders::_1, std::placeholders::_2), 0, height*factor, MIN_LINES_PER_THREAD, 32);  // minSize, then maxThreads. NOTE(review): 32 kept as the thread cap - confirm intended value.
 }

 void TextureScalerCommon::DePosterize(u32* source, u32* dest, int width, int height) {
 	bufTmp3.resize(width*height);
-	ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeH, source, bufTmp3.data(), width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
-	ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
-	ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeH, dest, bufTmp3.data(), width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
-	ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
+	ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeH, source, bufTmp3.data(), width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD, 0);  // minSize, then maxThreads (0 = no cap)
+	ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD, 0);  // minSize, then maxThreads (0 = no cap)
+	ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeH, dest, bufTmp3.data(), width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD, 0);  // minSize, then maxThreads (0 = no cap)
+	ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD, 0);  // minSize, then maxThreads (0 = no cap)
 }
--- a/GPU/Debugger/Record.cpp
+++ b/GPU/Debugger/Record.cpp
@ -287,7 +287,7 @@ static const u8 *mymemmem(const u8 *haystack, size_t off, size_t hlen, const u8
 			p++;
 			alignp();
 		}
-	}, 0, range, 128 * 1024, TaskPriority::LOW);
+	}, 0, range, 128 * 1024, 8, TaskPriority::LOW);

 	return result;
 }
--- a/unittest/TestThreadManager.cpp
+++ b/unittest/TestThreadManager.cpp
@ -46,20 +46,20 @@ bool TestParallelLoop(ThreadManager *threadMan) {
 	printf("tester thread ID: %d\n", GetCurrentThreadIdForDebug());

 	printf("waitable test\n");
-	WaitableCounter *waitable = ParallelRangeLoopWaitable(threadMan, rangeFunc, 0, 7, 1, TaskPriority::HIGH);
+	WaitableCounter *waitable = ParallelRangeLoopWaitable(threadMan, rangeFunc, 0, 7, 1, 0, TaskPriority::HIGH);
 	// Can do stuff here if we like.
 	waitable->WaitAndRelease();
 	// Now it's done.

 	// Try a loop with stragglers.
 	printf("blocking test #1 [0-65)\n");
-	ParallelRangeLoop(threadMan, rangeFunc, 0, 65, 1);
+	ParallelRangeLoop(threadMan, rangeFunc, 0, 65, 1, 0);
 	// Try a loop with a relatively large minimum size.
 	printf("blocking test #2 [0-100)\n");
-	ParallelRangeLoop(threadMan, rangeFunc, 0, 100, 40);
+	ParallelRangeLoop(threadMan, rangeFunc, 0, 100, 40, 0);
 	// Try a loop with minimum size larger than range.
 	printf("waitable test [10-30)\n");
-	WaitableCounter *waitable2 = ParallelRangeLoopWaitable(threadMan, rangeFunc, 10, 30, 40, TaskPriority::LOW);
+	WaitableCounter *waitable2 = ParallelRangeLoopWaitable(threadMan, rangeFunc, 10, 30, 40, 0, TaskPriority::LOW);
 	waitable2->WaitAndRelease();
 	return true;
 }