Parallelized texture scaling

This commit is contained in:
Peter Thoman 2013-05-01 02:18:35 +02:00 committed by Peter Thoman
parent 5e918a644f
commit af68180319
4 changed files with 216 additions and 66 deletions

View File

@ -34,4 +34,48 @@ std::string MemUsage();
// Returns the memory page size, in bytes, that this code assumes (a fixed value).
inline int GetPageSize() {
	const int kPageSizeBytes = 4096;
	return kPageSizeBytes;
}
// Minimal grow-only buffer of T whose storage comes from AllocateMemoryPages()
// and is returned with FreeMemoryPages().
//
// The buffer never shrinks: resize() only reallocates when asked for more
// elements than are currently held, and it does not preserve the old contents.
//
// NOTE(review): no copy constructor/assignment is declared, so copying a
// SimpleBuf would leave two objects freeing the same buf_. Avoid copying.
template <typename T>
class SimpleBuf {
public:
	// Creates an empty buffer; nothing is allocated until resize() is called.
	SimpleBuf() : buf_(NULL), size_(0) {
	}

	// Creates a buffer with room for `size` elements.
	// size_ has to be zeroed before resize() runs because resize() compares the
	// requested size against it; leaving it uninitialized made that comparison
	// read an indeterminate value.
	SimpleBuf(size_t size) : buf_(NULL), size_(0) {
		resize(size);
	}

	// Releases the storage, if any was allocated.
	~SimpleBuf() {
		if (buf_ != NULL) {
			FreeMemoryPages(buf_, size_ * sizeof(T));
		}
	}

	// Unchecked element access.
	inline T &operator[](size_t index) {
		return buf_[index];
	}

	// Ensures capacity for at least `size` elements.
	// Doesn't preserve contents. Requests that fit in the current allocation
	// are a no-op, so size() may stay larger than the value passed here.
	void resize(size_t size) {
		if (size_ < size) {
			if (buf_ != NULL) {
				FreeMemoryPages(buf_, size_ * sizeof(T));
			}
			buf_ = (T *)AllocateMemoryPages(size * sizeof(T));
			size_ = size;
		}
	}

	// Pointer to the first element (NULL while nothing is allocated).
	T *data() {
		return buf_;
	}

	// Number of elements the current allocation was sized for.
	size_t size() const {
		return size_;
	}

private:
	T *buf_;       // start of the allocation, or NULL
	size_t size_;  // element count of the current allocation
};
#endif

View File

@ -18,10 +18,92 @@
#include "TextureScaler.h"
#include "Core/Config.h"
#include "Common/Log.h"
#include "Common/MsgHandler.h"
#include "Common/CommonFuncs.h"
#include "ext/xbrz/xbrz.h"
// Starts the worker thread and returns only after that thread has set `started`.
// doneMutex is locked here on the constructing thread and is not unlocked in this
// function; WaitForCompletion() later hands it to done.wait().
WorkerThread::WorkerThread() : active(true), started(false) {
// the thread body is a lambda that captures this object by reference and runs WorkFunc()
thread = new std::thread([&]() { WorkFunc(); });
doneMutex.lock();
// busy-wait until WorkFunc() has set `started`
// NOTE(review): this spins without sleeping or yielding — confirm that is acceptable here
while(!started) { };
}
// Asks the worker to stop, waits for its thread to finish, then frees the thread object.
WorkerThread::~WorkerThread() {
// clear the flag WorkFunc() loops on and wake the worker, both while holding `mutex`
mutex.lock();
active = false;
signal.notify_one();
mutex.unlock();
// blocks until the thread function has returned
thread->join();
delete thread;
}
// Submits a new work item: copies `work` into work_ and signals the `signal`
// condition variable, both while holding `mutex`.
// Returns without waiting for the work to run; use WaitForCompletion() for that.
// Only one work item is stored, so a new call overwrites the previous work_.
void WorkerThread::Process(const std::function<void()>& work) {
mutex.lock();
work_ = work;
signal.notify_one();
mutex.unlock();
}
// Blocks on the `done` condition variable, passing doneMutex as the associated mutex.
// NOTE(review): this waits exactly once with no predicate, so it presumably has to be
// paired one-to-one with a preceding Process() call and called from the thread that
// locked doneMutex in the constructor — confirm against callers.
void WorkerThread::WaitForCompletion() {
done.wait(doneMutex);
}
// Body of the worker thread.
//
// Holds `mutex` for its whole lifetime except while blocked in signal.wait(),
// which is what lets Process() and the destructor only get through once the
// worker is actually waiting. Each wake-up with `active` still set runs the
// stored work item and then reports completion through `done`.
//
// NOTE(review): the wait has no predicate, so a spurious wake-up would re-run
// whatever is stored in work_. Fixing that needs an extra flag in the class.
void WorkerThread::WorkFunc() {
	mutex.lock();
	started = true;  // releases the constructor's start-up spin
	while(active) {
		signal.wait(mutex);
		if(active) {
			work_();
			// Completion is only reported after real work. On shutdown the
			// constructing thread still owns doneMutex (locked in the
			// constructor), so taking it here would block forever and the
			// destructor's join() would never return.
			doneMutex.lock();
			done.notify_one();
			doneMutex.unlock();
		}
	}
	// Balance the lock taken on entry so the mutex is not destroyed while held.
	mutex.unlock();
}
// Sets up a scaler configured for four worker threads. The workers themselves
// are not created here; workersStarted begins as false.
TextureScaler::TextureScaler()
	: numThreads(4)
	, workersStarted(false)
{
}
void TextureScaler::StartWorkers() {
if(!workersStarted) {
for(int i=0; i<numThreads; ++i) {
workers.push_back(std::make_shared<WorkerThread>());
}
workersStarted = true;
}
}
// Runs loop(begin, end) over the half-open range [lower, upper), split across
// the worker threads, and returns once every part has finished.
//
//   loop   callable invoked as loop(begin, end) for a sub-range; it may run on
//          several threads at once, each with a disjoint sub-range
//   lower  first index to process
//   upper  one past the last index to process
//
// Ranges shorter than 2*numThreads are executed directly on the calling thread.
void TextureScaler::ParallelLoop(std::function<void(int,int)> loop, int lower, int upper) {
	StartWorkers();

	const int range = upper - lower;
	if(range >= numThreads*2) { // don't parallelize tiny loops
		// Every worker gets range/numThreads iterations; the last one also
		// takes the remainder, so nothing is dropped when the range is not a
		// multiple of numThreads. For evenly divisible ranges the split is
		// the same as a plain fixed-size chunking.
		const int chunk = range / numThreads;
		int begin = lower;
		for(int i = 0; i < numThreads; ++i) {
			const int end = (i == numThreads - 1) ? upper : begin + chunk;
			workers[i]->Process(std::bind(loop, begin, end));
			begin = end;
		}
		// every worker was handed exactly one item, so wait once per worker
		for(int i = 0; i < numThreads; ++i) {
			workers[i]->WaitForCompletion();
		}
	} else {
		loop(lower, upper);
	}
}
//#define SCALING_MEASURE_TIME
#ifdef SCALING_MEASURE_TIME
#include "native/base/timeutil.h"
#endif
void TextureScaler::Scale(u32* &data, GLenum &dstFmt, int &width, int &height) {
if(g_Config.iXBRZTexScalingLevel > 1) {
#ifdef SCALING_MEASURE_TIME
double t_start = real_time_now();
#endif
int factor = g_Config.iXBRZTexScalingLevel;
// depending on the factor and texture sizes, these can be pretty large (25 MB for a 512 by 512 texture with scaling factor 5)
@ -37,52 +119,71 @@ void TextureScaler::Scale(u32* &data, GLenum &dstFmt, int &width, int &height) {
break;
case GL_UNSIGNED_SHORT_4_4_4_4:
for(int y = 0; y < height; ++y) {
for(int x = 0; x < width; ++x) {
u32 val = ((u16*)data)[y*width + x];
u32 r = ((val>>12) & 0xF) * 17;
u32 g = ((val>> 8) & 0xF) * 17;
u32 b = ((val>> 4) & 0xF) * 17;
u32 a = ((val>> 0) & 0xF) * 17;
xbrzInputBuf[y*width + x] = (a << 24) | (b << 16) | (g << 8) | r;
ParallelLoop([&](int l, int u){
for(int y = l; y < u; ++y) {
for(int x = 0; x < width; ++x) {
u32 val = ((u16*)data)[y*width + x];
u32 r = ((val>>12) & 0xF) * 17;
u32 g = ((val>> 8) & 0xF) * 17;
u32 b = ((val>> 4) & 0xF) * 17;
u32 a = ((val>> 0) & 0xF) * 17;
xbrzInputBuf[y*width + x] = (a << 24) | (b << 16) | (g << 8) | r;
}
}
}
}, 0, height);
break;
case GL_UNSIGNED_SHORT_5_6_5:
for(int y = 0; y < height; ++y) {
for(int x = 0; x < width; ++x) {
u32 val = ((u16*)data)[y*width + x];
u32 r = ((val>>11) & 0x1F) * 8;
u32 g = ((val>> 5) & 0x3F) * 4;
u32 b = ((val ) & 0x1F) * 8;
xbrzInputBuf[y*width + x] = (0xFF << 24) | (b << 16) | (g << 8) | r;
ParallelLoop([&](int l, int u){
for(int y = l; y < u; ++y) {
for(int x = 0; x < width; ++x) {
u32 val = ((u16*)data)[y*width + x];
u32 r = ((val>>11) & 0x1F) * 8;
u32 g = ((val>> 5) & 0x3F) * 4;
u32 b = ((val ) & 0x1F) * 8;
xbrzInputBuf[y*width + x] = (0xFF << 24) | (b << 16) | (g << 8) | r;
}
}
}
}, 0, height);
break;
case GL_UNSIGNED_SHORT_5_5_5_1:
for(int y = 0; y < height; ++y) {
for(int x = 0; x < width; ++x) {
u32 val = ((u16*)data)[y*width + x];
u32 r = ((val>>11) & 0x1F) * 8;
u32 g = ((val>> 6) & 0x1F) * 8;
u32 b = ((val>> 1) & 0x1F) * 8;
u32 a = (val & 0x1) * 255;
xbrzInputBuf[y*width + x] = (a << 24) | (b << 16) | (g << 8) | r;
ParallelLoop([&](int l, int u) {
for(int y = l; y < u; ++y) {
for(int x = 0; x < width; ++x) {
u32 val = ((u16*)data)[y*width + x];
u32 r = ((val>>11) & 0x1F) * 8;
u32 g = ((val>> 6) & 0x1F) * 8;
u32 b = ((val>> 1) & 0x1F) * 8;
u32 a = (val & 0x1) * 255;
xbrzInputBuf[y*width + x] = (a << 24) | (b << 16) | (g << 8) | r;
}
}
}
}, 0, height);
break;
default:
ERROR_LOG(G3D, "iXBRZTexScaling: unsupported texture format");
}
// scale and update values accordingly
xbrz::scale(factor, xbrzInputBuf, xbrzBuf, width, height);
// scale
xbrz::ScalerCfg cfg;
ParallelLoop([&](int l, int u) {
xbrz::scale(factor, xbrzInputBuf, xbrzBuf, width, height, cfg, l, u);
}, 0, height);
// update values accordingly
data = xbrzBuf;
dstFmt = GL_UNSIGNED_BYTE;
width *= factor;
height *= factor;
#ifdef SCALING_MEASURE_TIME
if(width*height > 64*64*factor*factor) {
double t = real_time_now() - t_start;
NOTICE_LOG(MASTER_LOG, "TextureScaler: processed %9d pixels in %6.5lf seconds. (%9.0lf Mpixels/second)",
width*height, t, (width*height)/(t*1000*1000));
}
#endif
}
}

View File

@ -21,55 +21,53 @@
#include "../Globals.h"
#include "../native/ext/glew/GL/glew.h"
template <typename T>
class SimpleBuf {
#include <functional>
#include <vector>
#include "native/thread/thread.h"
#include "base/mutex.h"
// This is the simplest possible worker implementation I can think of
// but entirely sufficient for the given purpose.
// Only handles a single item of work at a time.
class WorkerThread {
public:
SimpleBuf() : buf_(NULL), size_(0) {
}
WorkerThread();
~WorkerThread();
SimpleBuf(size_t size) : buf_(NULL) {
resize(size);
}
~SimpleBuf() {
if (buf_ != NULL) {
FreeMemoryPages(buf_, size_ * sizeof(T));
}
}
inline T &operator[](size_t index) {
return buf_[index];
}
// Doesn't preserve contents.
void resize(size_t size) {
if (size_ < size) {
if (buf_ != NULL) {
FreeMemoryPages(buf_, size_ * sizeof(T));
}
buf_ = (T *)AllocateMemoryPages(size * sizeof(T));
size_ = size;
}
}
T *data() {
return buf_;
}
size_t size() {
return size_;
}
// submit a new work item
void Process(const std::function<void()>& work);
// wait for a submitted work item to be completed
void WaitForCompletion();
private:
T *buf_;
size_t size_;
std::thread *thread; // the worker thread
condition_variable signal; // used to signal new work
condition_variable done; // used to signal work completion
recursive_mutex mutex, doneMutex; // associated with each respective condition variable
volatile bool active, started;
std::function<void()> work_; // the work to be done by this thread
void WorkFunc();
WorkerThread(const WorkerThread& other) { } // prevent copies
};
// Upscales textures with xBRZ, splitting the per-row work across a small pool
// of WorkerThread objects.
class TextureScaler {
public:
	TextureScaler();

	// Scales a texture when the configured xBRZ scaling level is above 1.
	// All four arguments are in/out: on scaling, `data` is redirected to the
	// scaled pixels, `dstfmt` becomes GL_UNSIGNED_BYTE, and `width`/`height`
	// are multiplied by the scaling factor.
	void Scale(u32* &data, GLenum &dstfmt, int &width, int &height);

private:
	// Creates the worker threads on first use.
	void StartWorkers();

	// Runs loop(begin, end) over [lower, upper), distributed across the workers.
	void ParallelLoop(std::function<void(int,int)> loop, int lower, int upper);

	const int numThreads;                               // number of workers in the pool
	std::vector<std::shared_ptr<WorkerThread>> workers; // filled by StartWorkers()
	bool workersStarted;                                // true once the pool exists

	// NOTE(review): presumably scratch storage for the scaler's input and
	// output pixels; their use is not visible here — confirm.
	SimpleBuf<u32> bufInput;
	SimpleBuf<u32> bufOutput;
};

View File

@ -25,6 +25,13 @@
#include <limits>
#include "config.h"
#ifdef max
#undef max
#endif
#ifdef min
#undef min
#endif
namespace xbrz
{
/*