Parallelized texture scaling

2025-02-22 07:03:38 +00:00 · 2013-05-01 02:18:35 +02:00 · 2013-05-01 02:18:35 +02:00 · af68180319
commit af68180319
parent 5e918a644f
4 changed files with 216 additions and 66 deletions
--- a/Common/MemoryUtil.h
+++ b/Common/MemoryUtil.h
@ -34,4 +34,48 @@ std::string MemUsage();

 inline int GetPageSize() { return 4096; }

+template <typename T>
+class SimpleBuf {
+public:
+	SimpleBuf() : buf_(NULL), size_(0) {
+	}
+
+	SimpleBuf(size_t size) : buf_(NULL) {
+		resize(size);
+	}
+
+	~SimpleBuf() {
+		if (buf_ != NULL) {
+			FreeMemoryPages(buf_, size_ * sizeof(T));
+		}
+	}
+
+	inline T &operator[](size_t index) {
+		return buf_[index];
+	}
+
+	// Doesn't preserve contents.
+	void resize(size_t size) {
+		if (size_ < size) {
+			if (buf_ != NULL) {
+				FreeMemoryPages(buf_, size_ * sizeof(T));
+			}
+			buf_ = (T *)AllocateMemoryPages(size * sizeof(T));
+			size_ = size;
+		}
+	}
+
+	T *data() {
+		return buf_;
+	}
+
+	size_t size() {
+		return size_;
+	}
+
+private:
+	T *buf_;
+	size_t size_;
+};
+
 #endif
--- a/GPU/GLES/TextureScaler.cpp
+++ b/GPU/GLES/TextureScaler.cpp
@ -18,10 +18,92 @@
 #include "TextureScaler.h"

 #include "Core/Config.h"
+#include "Common/Log.h"
+#include "Common/MsgHandler.h"
+#include "Common/CommonFuncs.h"
 #include "ext/xbrz/xbrz.h"

+WorkerThread::WorkerThread() : active(true), started(false) {
+	thread = new std::thread([&]() { WorkFunc(); });
+	doneMutex.lock();
+	while(!started) { };
+}
+
+WorkerThread::~WorkerThread() {
+	mutex.lock();
+	active = false;
+	signal.notify_one();
+	mutex.unlock();
+	thread->join();
+	delete thread;
+}
+
+void WorkerThread::Process(const std::function<void()>& work) {
+	mutex.lock();
+	work_ = work;
+	signal.notify_one();
+	mutex.unlock();
+}
+
+void WorkerThread::WaitForCompletion() {
+	done.wait(doneMutex);
+}
+
+void WorkerThread::WorkFunc() {
+	mutex.lock();
+	started = true;
+	while(active) {
+		signal.wait(mutex);
+		if(active) work_();
+		doneMutex.lock();
+		done.notify_one();
+		doneMutex.unlock();
+	}
+}
+
+
+TextureScaler::TextureScaler() : numThreads(4), workersStarted(false) {
+}
+
+void TextureScaler::StartWorkers() {
+	if(!workersStarted) {
+		for(int i=0; i<numThreads; ++i) {
+			workers.push_back(std::make_shared<WorkerThread>());
+		}
+		workersStarted = true;
+	}
+}
+
+void TextureScaler::ParallelLoop(std::function<void(int,int)> loop, int lower, int upper) {
+	StartWorkers();
+	int range = upper-lower;
+	if(range >= numThreads*2) { // don't parallelize tiny loops
+		// could do slightly better load balancing for the generic case, 
+		// but doesn't matter since all our loops are power of 2
+		int chunk = range/numThreads; 
+		for(int s=lower, i=0; i<numThreads; s+=chunk, ++i) {
+			workers[i]->Process(std::bind(loop, s, std::min(s+chunk,upper)));
+		}
+		for(int i=0; i<numThreads; ++i) {
+			workers[i]->WaitForCompletion();
+		}
+	} else {
+		loop(lower, upper);
+	}
+}
+
+//#define SCALING_MEASURE_TIME
+
+#ifdef SCALING_MEASURE_TIME
+#include "native/base/timeutil.h"
+#endif
+
 void TextureScaler::Scale(u32* &data, GLenum &dstFmt, int &width, int &height) {
 	if(g_Config.iXBRZTexScalingLevel > 1) {
+		#ifdef SCALING_MEASURE_TIME
+		double t_start = real_time_now();
+		#endif
+
 		int factor = g_Config.iXBRZTexScalingLevel;

 		// depending on the factor and texture sizes, these can be pretty large (25 MB for a 512 by 512 texture with scaling factor 5)
@ -37,52 +119,71 @@ void TextureScaler::Scale(u32* &data, GLenum &dstFmt, int &width, int &height) {
 			break;

 		case GL_UNSIGNED_SHORT_4_4_4_4:
-			for(int y = 0; y < height; ++y) {
-				for(int x = 0; x < width; ++x) {
-					u32 val = ((u16*)data)[y*width + x];
-					u32 r = ((val>>12) & 0xF) * 17;
-					u32 g = ((val>> 8) & 0xF) * 17;
-					u32 b = ((val>> 4) & 0xF) * 17;
-					u32 a = ((val>> 0) & 0xF) * 17;
-					xbrzInputBuf[y*width + x] = (a << 24) | (b << 16) | (g << 8) | r;
+			ParallelLoop([&](int l, int u){
+				for(int y = l; y < u; ++y) {
+					for(int x = 0; x < width; ++x) {
+						u32 val = ((u16*)data)[y*width + x];
+						u32 r = ((val>>12) & 0xF) * 17;
+						u32 g = ((val>> 8) & 0xF) * 17;
+						u32 b = ((val>> 4) & 0xF) * 17;
+						u32 a = ((val>> 0) & 0xF) * 17;
+						xbrzInputBuf[y*width + x] = (a << 24) | (b << 16) | (g << 8) | r;
+					}
 				}
-			}
+			}, 0, height);
 			break;

 		case GL_UNSIGNED_SHORT_5_6_5:
-			for(int y = 0; y < height; ++y) {
-				for(int x = 0; x < width; ++x) {
-					u32 val = ((u16*)data)[y*width + x];
-					u32 r = ((val>>11) & 0x1F) * 8;
-					u32 g = ((val>> 5) & 0x3F) * 4;
-					u32 b = ((val    ) & 0x1F) * 8;
-					xbrzInputBuf[y*width + x] = (0xFF << 24) | (b << 16) | (g << 8) | r;
+			ParallelLoop([&](int l, int u){
+				for(int y = l; y < u; ++y) {
+					for(int x = 0; x < width; ++x) {
+						u32 val = ((u16*)data)[y*width + x];
+						u32 r = ((val>>11) & 0x1F) * 8;
+						u32 g = ((val>> 5) & 0x3F) * 4;
+						u32 b = ((val    ) & 0x1F) * 8;
+						xbrzInputBuf[y*width + x] = (0xFF << 24) | (b << 16) | (g << 8) | r;
+					}
 				}
-			}
+			}, 0, height);
 			break;

 		case GL_UNSIGNED_SHORT_5_5_5_1:
-			for(int y = 0; y < height; ++y) {
-				for(int x = 0; x < width; ++x) {
-					u32 val = ((u16*)data)[y*width + x];
-					u32 r = ((val>>11) & 0x1F) * 8;
-					u32 g = ((val>> 6) & 0x1F) * 8;
-					u32 b = ((val>> 1) & 0x1F) * 8;
-					u32 a = (val & 0x1) * 255;
-					xbrzInputBuf[y*width + x] = (a << 24) | (b << 16) | (g << 8) | r;
+			ParallelLoop([&](int l, int u) {
+				for(int y = l; y < u; ++y) {
+					for(int x = 0; x < width; ++x) {
+						u32 val = ((u16*)data)[y*width + x];
+						u32 r = ((val>>11) & 0x1F) * 8;
+						u32 g = ((val>> 6) & 0x1F) * 8;
+						u32 b = ((val>> 1) & 0x1F) * 8;
+						u32 a = (val & 0x1) * 255;
+						xbrzInputBuf[y*width + x] = (a << 24) | (b << 16) | (g << 8) | r;
+					}
 				}
-			}
+			}, 0, height);
 			break;

 		default:
 			ERROR_LOG(G3D, "iXBRZTexScaling: unsupported texture format");
 		}

-		// scale and update values accordingly
-		xbrz::scale(factor, xbrzInputBuf, xbrzBuf, width, height);
+		// scale 
+		xbrz::ScalerCfg cfg;
+		ParallelLoop([&](int l, int u) {
+			xbrz::scale(factor, xbrzInputBuf, xbrzBuf, width, height, cfg, l, u);
+		}, 0, height);
+
+		// update values accordingly
 		data = xbrzBuf;
 		dstFmt = GL_UNSIGNED_BYTE;
 		width *= factor;
 		height *= factor;
+
+		#ifdef SCALING_MEASURE_TIME
+		if(width*height > 64*64*factor*factor) {
+			double t = real_time_now() - t_start;
+			NOTICE_LOG(MASTER_LOG, "TextureScaler: processed %9d pixels in %6.5lf seconds. (%9.0lf Mpixels/second)", 
+				width*height, t, (width*height)/(t*1000*1000));
+		}
+		#endif
 	}
 }
--- a/GPU/GLES/TextureScaler.h
+++ b/GPU/GLES/TextureScaler.h
@ -21,55 +21,53 @@
 #include "../Globals.h"
 #include "../native/ext/glew/GL/glew.h"

-template <typename T>
-class SimpleBuf {
+#include <functional>
+#include <vector>
+
+#include "native/thread/thread.h"
+#include "base/mutex.h"
+
+// This is the simplest possible worker implementation I can think of
+// but entirely sufficient for the given purpose.
+// Only handles a single item of work at a time.
+class WorkerThread {
 public:
-	SimpleBuf() : buf_(NULL), size_(0) {
-	}
+	WorkerThread();
+	~WorkerThread();

-	SimpleBuf(size_t size) : buf_(NULL) {
-		resize(size);
-	}
-
-	~SimpleBuf() {
-		if (buf_ != NULL) {
-			FreeMemoryPages(buf_, size_ * sizeof(T));
-		}
-	}
-
-	inline T &operator[](size_t index) {
-		return buf_[index];
-	}
-
-	// Doesn't preserve contents.
-	void resize(size_t size) {
-		if (size_ < size) {
-			if (buf_ != NULL) {
-				FreeMemoryPages(buf_, size_ * sizeof(T));
-			}
-			buf_ = (T *)AllocateMemoryPages(size * sizeof(T));
-			size_ = size;
-		}
-	}
-
-	T *data() {
-		return buf_;
-	}
-
-	size_t size() {
-		return size_;
-	}
+	// submit a new work item
+	void Process(const std::function<void()>& work);
+	// wait for a submitted work item to be completed
+	void WaitForCompletion();

 private:
-	T *buf_;
-	size_t size_;
+	std::thread *thread; // the worker thread
+	condition_variable signal; // used to signal new work
+	condition_variable done; // used to signal work completion
+	recursive_mutex mutex, doneMutex; // associated with each respective condition variable
+	volatile bool active, started;
+	std::function<void()> work_; // the work to be done by this thread
+
+	void WorkFunc();
+
+	WorkerThread(const WorkerThread& other) { } // prevent copies
 };

 class TextureScaler {
 public:
+	TextureScaler();
+
 	void Scale(u32* &data, GLenum &dstfmt, int &width, int &height);

 private:
+	const int numThreads;
+	std::vector<std::shared_ptr<WorkerThread>> workers;
+
+	bool workersStarted;
+	void StartWorkers();
+
+	void ParallelLoop(std::function<void(int,int)> loop, int lower, int upper);
+
 	SimpleBuf<u32> bufInput;
 	SimpleBuf<u32> bufOutput;
 };
--- a/ext/xbrz/xbrz.h
+++ b/ext/xbrz/xbrz.h
@ -25,6 +25,13 @@
 #include <limits>
 #include "config.h"

+#ifdef max
+#undef max
+#endif
+#ifdef min
+#undef min
+#endif
+
 namespace xbrz
 {
 /*