Address feedback (except the mailbox refcount)

This commit is contained in:
Henrik Rydgård 2021-06-13 10:16:53 +02:00
parent 1d59560409
commit 81f0c3a8e4
9 changed files with 37 additions and 49 deletions

View File

@ -26,19 +26,14 @@ WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::
}
int numTasks = threadMan->GetNumLooperThreads();
int range = upper - lower;
if (range <= 0) {
// Bad range. A finished counter allocated.
// Nothing to do. A finished counter is allocated to keep the API consistent.
return new WaitableCounter(0);
}
if (range <= numTasks) {
// Just assign one task per thread, as many as we have.
WaitableCounter *waitableCounter = new WaitableCounter(range);
for (int i = 0; i < range; i++) {
threadMan->EnqueueTaskOnThread(i, new LoopRangeTask(waitableCounter, loop, i, i + 1), TaskType::CPU_COMPUTE);
}
} else if (range <= minSize) {
// Single background task.
WaitableCounter *waitableCounter = new WaitableCounter(1);
threadMan->EnqueueTaskOnThread(0, new LoopRangeTask(waitableCounter, loop, lower, upper), TaskType::CPU_COMPUTE);
return waitableCounter;
} else {
// Split the range between threads. Allow for some fractional bits.
@ -68,7 +63,7 @@ WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::
}
threadMan->EnqueueTaskOnThread(i, new LoopRangeTask(waitableCounter, loop, start, end), TaskType::CPU_COMPUTE);
counter += delta;
if ((counter >> fractionalBits) > upper) {
if ((counter >> fractionalBits) >= upper) {
break;
}
}
@ -78,7 +73,6 @@ WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::
int stragglerStart = (int)(counter >> fractionalBits);
int stragglerEnd = upper;
if (stragglerStart < stragglerEnd) {
// printf("doing stragglers: %d-%d\n", start, upper);
loop(stragglerStart, stragglerEnd);
}
return waitableCounter;
@ -114,14 +108,13 @@ void ParallelMemcpy(ThreadManager *threadMan, void *dst, const void *src, size_t
return;
}
// 128 is the largest cacheline size on common CPUs.
// Still I suspect that the optimal minSize is a lot higher.
// unknown's testing showed that 128kB is an appropriate minimum size.
char *d = (char *)dst;
char *s = (char *)src;
const char *s = (const char *)src;
ParallelRangeLoop(threadMan, [&](int l, int h) {
memmove(d + l, s + l, h - l);
}, 0, (int)bytes, 128);
}, 0, (int)bytes, 128 * 1024);
}
// NOTE: Supports a max of 2GB.
@ -132,11 +125,10 @@ void ParallelMemset(ThreadManager *threadMan, void *dst, uint8_t value, size_t b
return;
}
// 128 is the largest cacheline size on common CPUs.
// Still I suspect that the optimal minSize is a lot higher.
// unknown's testing showed that 128kB is an appropriate minimum size.
char *d = (char *)dst;
ParallelRangeLoop(threadMan, [&](int l, int h) {
memset(d + l, value, h - l);
}, 0, (int)bytes, 128);
}, 0, (int)bytes, 128 * 1024);
}

View File

@ -6,7 +6,7 @@
#include "Common/Thread/ThreadManager.h"
// Same as the latch from C++20, just counting upwards for no particular reason.
// Same as the latch from C++20.
struct WaitableCounter : public Waitable {
public:
WaitableCounter(int count) : count_(count) {}
@ -25,10 +25,9 @@ public:
void Wait() override {
std::unique_lock<std::mutex> lock(mutex_);
if (count_ == 0) {
return;
while (count_ != 0) {
cond_.wait(lock);
}
cond_.wait(lock);
}
int count_;
@ -44,6 +43,6 @@ void ParallelRangeLoop(ThreadManager *threadMan, const std::function<void(int, i
// Common utilities for large (!) memory copies.
// Will only fall back to threads if it seems to make sense.
// NOTE: These support a max of 2GB.
void ParallelMemcpy(ThreadManager *threadMan, void *dst, const void *src, size_t bytes);
void ParallelMemset(ThreadManager *threadMan, void *dst, uint8_t value, size_t bytes);

View File

@ -28,7 +28,7 @@ struct GlobalThreadContext {
std::deque<Task *> queue;
std::vector<ThreadContext *> threads_;
int roundRobin;
int roundRobin = 0;
};
struct ThreadContext {

View File

@ -17,7 +17,6 @@ public:
virtual void Run() = 0;
virtual bool Cancellable() { return false; }
virtual void Cancel() {}
virtual float Priority() { return 1.0f; }
virtual uint64_t id() { return 0; }
};
@ -53,8 +52,8 @@ public:
// something meaningful yourself.
void TryCancelTask(uint64_t id);
// Parallel loops get to use half the threads,
// so we still have some worker threads for other tasks.
// Parallel loops (assumed compute-limited) get one thread per logical core. We have a few extra threads too
// for I/O-bound tasks, which can be run concurrently with those.
int GetNumLooperThreads() const;
private:

View File

@ -16,16 +16,17 @@
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
#include <algorithm>
#include "Common/GPU/OpenGL/GLCommon.h"
#include "GPU/Common/TextureScalerCommon.h"
#include "GPU/GLES/TextureScalerGLES.h"
#include "Common/Data/Convert/ColorConv.h"
#include "Common/Log.h"
#include "Common/Thread/ParallelLoop.h"
#include "Core/ThreadPools.h"
#include "Common/GPU/OpenGL/GLCommon.h"
#include "Common/GPU/DataFormat.h"
#include "Core/ThreadPools.h"
#include "GPU/Common/TextureScalerCommon.h"
#include "GPU/GLES/TextureScalerGLES.h"
// Returns the number of bytes per pixel for the given format value:
// 4 when it equals Draw::DataFormat::R8G8B8A8_UNORM, 2 for any other value.
int TextureScalerGLES::BytesPerPixel(u32 format) {
	const bool is8888 = (Draw::DataFormat)format == Draw::DataFormat::R8G8B8A8_UNORM;
	if (is8888) {
		return 4;
	}
	return 2;
}
@ -42,15 +43,15 @@ void TextureScalerGLES::ConvertTo8888(u32 format, u32* source, u32* &dest, int w
break;
case Draw::DataFormat::R4G4B4A4_UNORM_PACK16:
ParallelRangeLoop(&g_threadManager, std::bind(&convert4444_gl, (u16*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height, 1);
ParallelRangeLoop(&g_threadManager, std::bind(&convert4444_gl, (u16*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_TEXSCALE_LINES_PER_THREAD);
break;
case Draw::DataFormat::R5G6B5_UNORM_PACK16:
ParallelRangeLoop(&g_threadManager, std::bind(&convert565_gl, (u16*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height, 1);
ParallelRangeLoop(&g_threadManager, std::bind(&convert565_gl, (u16*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_TEXSCALE_LINES_PER_THREAD);
break;
case Draw::DataFormat::R5G5B5A1_UNORM_PACK16:
ParallelRangeLoop(&g_threadManager, std::bind(&convert5551_gl, (u16*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height, 1);
ParallelRangeLoop(&g_threadManager, std::bind(&convert5551_gl, (u16*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_TEXSCALE_LINES_PER_THREAD);
break;
default:

View File

@ -42,23 +42,21 @@ u32 TextureScalerVulkan::Get8888Format() {
}
void TextureScalerVulkan::ConvertTo8888(u32 format, u32* source, u32* &dest, int width, int height) {
const int MIN_LINES_PER_THREAD = 4;
switch (format) {
case VULKAN_8888_FORMAT:
dest = source; // already fine
break;
case VULKAN_4444_FORMAT:
ParallelRangeLoop(&g_threadManager, std::bind(&convert4444_dx9, (u16*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
ParallelRangeLoop(&g_threadManager, std::bind(&convert4444_dx9, (u16*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_TEXSCALE_LINES_PER_THREAD);
break;
case VULKAN_565_FORMAT:
ParallelRangeLoop(&g_threadManager, std::bind(&convert565_dx9, (u16*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
ParallelRangeLoop(&g_threadManager, std::bind(&convert565_dx9, (u16*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_TEXSCALE_LINES_PER_THREAD);
break;
case VULKAN_1555_FORMAT:
ParallelRangeLoop(&g_threadManager, std::bind(&convert5551_dx9, (u16*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
ParallelRangeLoop(&g_threadManager, std::bind(&convert5551_dx9, (u16*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_TEXSCALE_LINES_PER_THREAD);
break;
default:

View File

@ -630,15 +630,6 @@ handleELF:
// INFO_LOG(SYSTEM, "Completed writing info for %s", info_->GetTitle().c_str());
}
// Priority value for this task, derived from the info's lastAccessedTime.
// When the info has a file loader that reports itself as remote, 1000 is
// added so that remote info loads after non-remote.
float Priority() override {
	auto loader = info_->GetFileLoader();
	const bool isRemote = loader && loader->IsRemote();
	return isRemote ? info_->lastAccessedTime + 1000.0f : info_->lastAccessedTime;
}
private:
Path gamePath_;
std::shared_ptr<GameInfo> info_;

View File

@ -14,11 +14,13 @@
#include "Common/System/NativeApp.h"
#include "Common/System/System.h"
#include "Common/CPUDetect.h"
#include "Common/File/VFS/VFS.h"
#include "Common/File/VFS/AssetReader.h"
#include "Common/File/FileUtil.h"
#include "Common/GraphicsContext.h"
#include "Common/TimeUtil.h"
#include "Common/Thread/ThreadManager.h"
#include "Core/Config.h"
#include "Core/ConfigValues.h"
#include "Core/Core.h"
@ -330,6 +332,8 @@ int main(int argc, const char* argv[])
if (testFilenames.empty())
return printUsage(argv[0], argc <= 1 ? NULL : "No executables specified");
g_threadManager.Init(cpu_info.num_cores, cpu_info.logical_cpu_count);
LogManager::Init(&g_Config.bEnableLogging);
LogManager *logman = LogManager::GetInstance();

View File

@ -49,6 +49,10 @@ bool TestParallelLoop(ThreadManager *threadMan) {
// Try a loop with a relatively large minimum size.
printf("blocking test #2 [0-100)\n");
ParallelRangeLoop(threadMan, rangeFunc, 0, 100, 40);
// Try a loop with minimum size larger than range.
printf("waitable test [10-30)\n");
WaitableCounter *waitable2 = ParallelRangeLoopWaitable(threadMan, rangeFunc, 10, 30, 40);
waitable2->WaitAndRelease();
return true;
}