Convert the split blocks in parallel

lltcggie 2015-06-06 21:40:55 +09:00
parent 0cb997f2b7
commit b4815227b7
7 changed files with 388 additions and 58 deletions

common/tinypl.hpp (new file, 235 lines)

@@ -0,0 +1,235 @@
/**
* Tiny Parallel Library
*
* The library implements the following parallel algorithms, whose
* interfaces are compatible with Intel TBB and Microsoft PPL.
* - parallel_for_each(first,last,func)
* - parallel_for(first,last,func)
* - parallel_invoke(f1,f2,...) (up to 4 args)
*/
#ifndef TINYPL_HPP
#define TINYPL_HPP
#include <algorithm>
#include <iterator>
#include <memory>
#include <thread>
#include <atomic>
#include <functional>
#include <vector>
#include <cstdint>
//#define BOOST_ASIO_ENABLE_HANDLER_TRACKING
#include <boost/asio.hpp>
#include <boost/bind.hpp>
#ifndef TINYPL_WORKERNUM
#define TINYPL_WORKERNUM 0
#endif
#ifndef TINYPL_MIN_ITERATE
#define TINYPL_MIN_ITERATE 1
#endif
namespace tinypl
{
namespace impl {
// task scheduler
class scheduler {
friend struct waiter;
public:
explicit scheduler(std::size_t thnum = 0)
: worker_(new boost::asio::io_service::work(iosrv_))
{
if (thnum == 0)
thnum = std::max(1u, std::thread::hardware_concurrency());
thnum_ = thnum;
// start worker threads
for (std::size_t i = 0; i < thnum - 1; ++i)
thpool_.emplace_back(std::bind(static_cast<std::size_t(boost::asio::io_service::*)(void)>(&boost::asio::io_service::run), &iosrv_));
}
~scheduler()
{
// stop all worker threads
worker_.reset();
for (auto &th : thpool_)
th.join();
}
// # of worker threads
std::size_t worker_num() const { return thnum_; }
// enqueue task
template <class F>
void enqueue(F f) { iosrv_.post(f); }
std::vector<std::thread::id> get_thread_pool_id_list() const
{
std::vector<std::thread::id> list;
list.reserve(thpool_.size());
for (const auto &th : thpool_)
list.push_back(th.get_id());
return list;
}
public:
// get scheduler object
static scheduler& instance()
{
static scheduler sched(TINYPL_WORKERNUM);
return sched;
}
private:
boost::asio::io_service iosrv_;
std::unique_ptr<boost::asio::io_service::work> worker_;
std::vector<std::thread> thpool_;
std::size_t thnum_;
};
// task waiter
struct waiter {
scheduler& sched_;
volatile std::atomic<uint32_t> count_;
waiter(scheduler& sched, unsigned int count)
: sched_(sched), count_(count) {}
~waiter()
{
while (0 < count_) {
sched_.iosrv_.poll_one();
// FIXME: It may cause heavy busyloop in worst-case scenario.
}
}
struct holder {
explicit holder(waiter& w) : w_(w) {}
~holder() { w_.count_--; }
waiter& w_;
};
};
// task of parallel_for_each algorithm
template <class Itr, class Func>
void parallel_foreach_task(waiter* w, Itr first, Itr last, const Func& func)
{
waiter::holder h(*w);
while (first != last)
func(*first++);
}
// task of parallel_for algorithm
template <class IdxType, class Func>
void parallel_for_task(waiter* w, IdxType first, IdxType last, const Func& func)
{
waiter::holder h(*w);
while (first < last)
func(first++);
}
// task of parallel_invoke algorithm
template <class Func>
void parallel_invoke_task(waiter* w, const Func& func)
{
waiter::holder h(*w);
func();
}
} // namespace impl
/**
* parallel_for_each algorithm
*/
template <class Itr, class Func>
void parallel_for_each(Itr first, Itr last, const Func& func)
{
impl::scheduler& sched = impl::scheduler::instance();
std::size_t range = std::distance(first, last);
std::size_t block = std::max(range / sched.worker_num(), std::size_t(TINYPL_MIN_ITERATE));
impl::waiter w(sched, (range + block - 1) / block);
for (Itr next = first; first != last; first = next) {
std::advance(next, std::min(range, block));
range -= std::min(range, block);
if (next != last) {
sched.enqueue(boost::bind(&impl::parallel_foreach_task<Itr, Func>, &w, first, next, func));
}
else {
impl::parallel_foreach_task<Itr, Func>(&w, first, next, func);
}
}
}
/**
* parallel_for algorithm
*/
template <class IdxType, class Func>
void parallel_for(impl::scheduler& sched, IdxType first, IdxType last, const Func& func)
{
IdxType range = last - first;
IdxType block = static_cast<IdxType>(std::max(range / sched.worker_num(), std::size_t(TINYPL_MIN_ITERATE)));
impl::waiter w(sched, (range + block - 1) / block);
for (IdxType next = first; first < last; first = next) {
next = std::min(last, next + block);
if (next < last) {
sched.enqueue(std::bind(&impl::parallel_for_task<IdxType, Func>, &w, first, next, func));
}
else {
impl::parallel_for_task<IdxType, Func>(&w, first, next, func);
}
}
}
/**
* parallel_for algorithm
*/
template <class IdxType, class Func>
void parallel_for(IdxType first, IdxType last, const Func& func)
{
impl::scheduler& sched = impl::scheduler::instance();
parallel_for(sched, first, last, func);
}
/**
* parallel_invoke algorithm (2 args)
*/
template <class F1, class F2>
void parallel_invoke(const F1& f1, const F2& f2)
{
impl::scheduler& sched = impl::scheduler::instance();
impl::waiter w(sched, 1);
sched.enqueue(boost::bind(&impl::parallel_invoke_task<F1>, &w, f1));
f2();
}
/**
* parallel_invoke algorithm (3 args)
*/
template <class F1, class F2, class F3>
void parallel_invoke(const F1& f1, const F2& f2, const F3& f3)
{
impl::scheduler& sched = impl::scheduler::instance();
impl::waiter w(sched, 2);
sched.enqueue(boost::bind(&impl::parallel_invoke_task<F1>, &w, f1));
sched.enqueue(boost::bind(&impl::parallel_invoke_task<F2>, &w, f2));
f3();
}
/**
* parallel_invoke algorithm (4 args)
*/
template <class F1, class F2, class F3, class F4>
void parallel_invoke(const F1& f1, const F2& f2, const F3& f3, const F4& f4)
{
impl::scheduler& sched = impl::scheduler::instance();
impl::waiter w(sched, 3);
sched.enqueue(boost::bind(&impl::parallel_invoke_task<F1>, &w, f1));
sched.enqueue(boost::bind(&impl::parallel_invoke_task<F2>, &w, f2));
sched.enqueue(boost::bind(&impl::parallel_invoke_task<F3>, &w, f3));
f4();
}
} // namespace tinypl
#endif
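
For reference, a minimal usage sketch of the three algorithms (not part of the commit; main(), the vector contents, and the task strings are illustrative assumptions, and only the tinypl:: calls come from the header above):

#include "tinypl.hpp"
#include <atomic>
#include <cstdio>
#include <vector>

int main()
{
    std::vector<int> v(1000, 1);

    // parallel_for splits [0, v.size()) into contiguous blocks, one task per
    // block; the call returns only after every block has finished, because
    // waiter's destructor polls the io_service until its count reaches zero.
    tinypl::parallel_for(std::size_t(0), v.size(), [&](std::size_t i) {
        v[i] *= 2;
    });

    // parallel_for_each applies the functor to each element; the atomic
    // accumulator avoids a data race between worker threads.
    std::atomic<int> sum(0);
    tinypl::parallel_for_each(v.begin(), v.end(), [&](int x) { sum += x; });

    // parallel_invoke runs the functors concurrently; the last functor is
    // executed on the calling thread.
    tinypl::parallel_invoke([] { std::puts("task A"); },
                            [] { std::puts("task B"); });

    std::printf("sum = %d\n", sum.load()); // prints 2000
}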

common/waifu2x.cpp

@@ -9,6 +9,7 @@
#include <boost/algorithm/string.hpp>
#include <chrono>
#include <cuda_runtime.h>
#include "tinypl.hpp"
#if defined(WIN32) || defined(WIN64)
#include <Windows.h>
@@ -62,7 +63,7 @@ static std::once_flag waifu2x_cudnn_once_flag;
} \
} while (0)
Waifu2x::Waifu2x() : is_inited(false), isCuda(false), block(nullptr), dummy_data(nullptr), out_block(nullptr)
Waifu2x::Waifu2x() : job(1), is_inited(false), isCuda(false)
{
}
@@ -375,7 +376,7 @@ Waifu2x::eWaifu2xError Waifu2x::ConstractNet(boost::shared_ptr<caffe::Net<float>
}
// Reconstruct the image using the network
Waifu2x::eWaifu2xError Waifu2x::ReconstructImage(boost::shared_ptr<caffe::Net<float>> net, cv::Mat &im)
Waifu2x::eWaifu2xError Waifu2x::ReconstructImage(std::vector<boost::shared_ptr<caffe::Net<float>>> nets, cv::Mat &im)
{
const auto Height = im.size().height;
const auto Width = im.size().width;
@@ -390,18 +391,6 @@ Waifu2x::eWaifu2xError Waifu2x::ReconstructImage(boost::shared_ptr<caffe::Net<fl
try
{
const auto input_layer =
boost::dynamic_pointer_cast<caffe::MemoryDataLayer<float>>(
net->layer_by_name("image_input_layer"));
assert(input_layer);
const auto conv7_layer =
boost::dynamic_pointer_cast<caffe::ConvolutionLayer<float>>(
net->layer_by_name("conv7_layer"));
assert(conv7_layer);
input_layer->set_batch_size(batch_size);
const int WidthNum = Width / output_size;
const int HeightNum = Height / output_size;
@@ -410,9 +399,41 @@ Waifu2x::eWaifu2xError Waifu2x::ReconstructImage(boost::shared_ptr<caffe::Net<fl
const int input_block_plane_size = block_size * block_size;
const int output_block_plane_size = crop_size * crop_size;
// The image is reconstructed in output_size*output_size chunks (to keep memory consumption down)
for (int num = 0; num < BlockNum; num += batch_size)
const int BatchNum = BlockNum / batch_size + (BlockNum % batch_size != 0 ? 1 : 0);
for (auto net : nets)
{
const auto input_layer =
boost::dynamic_pointer_cast<caffe::MemoryDataLayer<float>>(
net->layer_by_name("image_input_layer"));
assert(input_layer);
input_layer->set_batch_size(batch_size);
}
// The image is reconstructed in output_size*output_size chunks (to keep memory consumption down)
tinypl::parallel_for(*net_scheduler, 0, BatchNum, [&](const int batch_n)
{
const auto id = std::this_thread::get_id();
const auto net_scheduler_id_map_it = net_scheduler_id_map.find(id);
assert(net_scheduler_id_map_it != net_scheduler_id_map.end());
const int index = net_scheduler_id_map_it->second;
auto net = nets[index];
float *block = blocks[index];
float *dummy_data = dummy_datas[index];
float *out_block = out_blocks[index];
const auto input_layer =
boost::dynamic_pointer_cast<caffe::MemoryDataLayer<float>>(
net->layer_by_name("image_input_layer"));
assert(input_layer);
const int num = batch_n * batch_size;
const int processNum = (BlockNum - num) >= batch_size ? batch_size : BlockNum - num;
if (processNum < batch_size)
@@ -455,7 +476,7 @@ Waifu2x::eWaifu2xError Waifu2x::ReconstructImage(boost::shared_ptr<caffe::Net<fl
}
// Feed the image into the network
input_layer->Reset(block, dummy_data, input_block_plane_size * batch_size);
input_layer->Reset(block, dummy_data, input_block_plane_size * processNum);
// Run the forward pass
auto out = net->ForwardPrefilled(nullptr);
@@ -487,7 +508,7 @@ Waifu2x::eWaifu2xError Waifu2x::ReconstructImage(boost::shared_ptr<caffe::Net<fl
for (int i = 0; i < crop_size; i++)
memcpy(imptr + (h + i) * Line + w, fptr + i * crop_size, crop_size * sizeof(float));
}
}
});
}
catch (...)
{
@@ -519,6 +540,8 @@ Waifu2x::eWaifu2xError Waifu2x::init(int argc, char** argv, const std::string &M
crop_size = CropSize;
batch_size = BatchSize;
job = 2;
output_size = crop_size - offset * 2;
block_size = crop_size + layer_num * 2;
original_width_height = 128 + layer_num * 2;
@@ -577,13 +600,18 @@ Waifu2x::eWaifu2xError Waifu2x::init(int argc, char** argv, const std::string &M
const std::string model_path = (mode_dir_path / "srcnn.prototxt").string();
const std::string param_path = (mode_dir_path / ("noise" + std::to_string(noise_level) + "_model.json")).string();
ret = ConstractNet(net_noise, model_path, process);
if (ret != eWaifu2xError_OK)
return ret;
net_noises.resize(job);
ret = LoadParameter(net_noise, param_path);
if (ret != eWaifu2xError_OK)
return ret;
for (auto &net_noise : net_noises)
{
ret = ConstractNet(net_noise, model_path, process);
if (ret != eWaifu2xError_OK)
return ret;
ret = LoadParameter(net_noise, param_path);
if (ret != eWaifu2xError_OK)
return ret;
}
}
if (mode == "scale" || mode == "noise_scale" || mode == "auto_scale")
@@ -591,33 +619,62 @@ Waifu2x::eWaifu2xError Waifu2x::init(int argc, char** argv, const std::string &M
const std::string model_path = (mode_dir_path / "srcnn.prototxt").string();
const std::string param_path = (mode_dir_path / "scale2.0x_model.json").string();
ret = ConstractNet(net_scale, model_path, process);
if (ret != eWaifu2xError_OK)
return ret;
net_scales.resize(job);
ret = LoadParameter(net_scale, param_path);
if (ret != eWaifu2xError_OK)
return ret;
for (auto &net_scale : net_scales)
{
ret = ConstractNet(net_scale, model_path, process);
if (ret != eWaifu2xError_OK)
return ret;
ret = LoadParameter(net_scale, param_path);
if (ret != eWaifu2xError_OK)
return ret;
}
}
const int input_block_plane_size = block_size * block_size;
const int output_block_plane_size = crop_size * crop_size;
blocks.resize(job);
dummy_datas.resize(job);
out_blocks.resize(job);
if (isCuda)
{
CUDA_CHECK_WAIFU2X(cudaHostAlloc(&block, sizeof(float) * input_block_plane_size * batch_size, cudaHostAllocWriteCombined));
CUDA_CHECK_WAIFU2X(cudaHostAlloc(&dummy_data, sizeof(float) * input_block_plane_size * batch_size, cudaHostAllocWriteCombined));
CUDA_CHECK_WAIFU2X(cudaHostAlloc(&out_block, sizeof(float) * output_block_plane_size * batch_size, cudaHostAllocDefault));
for (auto &block : blocks) {
CUDA_CHECK_WAIFU2X(cudaHostAlloc(&block, sizeof(float) * input_block_plane_size * batch_size, cudaHostAllocWriteCombined));
}
for (auto &dummy_data : dummy_datas) {
CUDA_CHECK_WAIFU2X(cudaHostAlloc(&dummy_data, sizeof(float) * input_block_plane_size * batch_size, cudaHostAllocWriteCombined));
}
for (auto &out_block : out_blocks) {
CUDA_CHECK_WAIFU2X(cudaHostAlloc(&out_block, sizeof(float) * output_block_plane_size * batch_size, cudaHostAllocDefault));
}
}
else
{
block = new float[input_block_plane_size * batch_size];
dummy_data = new float[input_block_plane_size * batch_size];
out_block = new float[output_block_plane_size * batch_size];
for (auto &block : blocks)
block = new float[input_block_plane_size * batch_size];
for (auto &dummy_data : dummy_datas)
dummy_data = new float[input_block_plane_size * batch_size];
for (auto &out_block : out_blocks)
out_block = new float[output_block_plane_size * batch_size];
}
for (size_t i = 0; i < input_block_plane_size * batch_size; i++)
dummy_data[i] = 0.0f;
for (auto dummy_data : dummy_datas)
{
for (size_t i = 0; i < input_block_plane_size * batch_size; i++)
dummy_data[i] = 0.0f;
}
net_scheduler.reset(new tinypl::impl::scheduler(job));
const auto list = net_scheduler->get_thread_pool_id_list();
for (size_t i = 0; i < list.size(); i++)
net_scheduler_id_map.emplace(list[i], i);
net_scheduler_id_map.emplace(std::this_thread::get_id(), list.size());
is_inited = true;
}
@@ -631,20 +688,34 @@ Waifu2x::eWaifu2xError Waifu2x::init(int argc, char** argv, const std::string &M
void Waifu2x::destroy()
{
net_noise.reset();
net_scale.reset();
net_scheduler.reset();
net_noises.clear();
net_scales.clear();
if (isCuda)
{
CUDA_HOST_SAFE_FREE(block);
CUDA_HOST_SAFE_FREE(dummy_data);
CUDA_HOST_SAFE_FREE(out_block);
for (auto &block : blocks) {
CUDA_HOST_SAFE_FREE(block);
}
for (auto &dummy_data : dummy_datas) {
CUDA_HOST_SAFE_FREE(dummy_data);
}
for (auto &out_block : out_blocks) {
CUDA_HOST_SAFE_FREE(out_block);
}
}
else
{
SAFE_DELETE_WAIFU2X(block);
SAFE_DELETE_WAIFU2X(dummy_data);
SAFE_DELETE_WAIFU2X(out_block);
for (auto &block : blocks) {
SAFE_DELETE_WAIFU2X(block);
}
for (auto &dummy_data : dummy_datas) {
SAFE_DELETE_WAIFU2X(dummy_data);
}
for (auto &out_block : out_blocks) {
SAFE_DELETE_WAIFU2X(out_block);
}
}
is_inited = false;
@@ -680,7 +751,7 @@ Waifu2x::eWaifu2xError Waifu2x::waifu2x(const std::string &input_file, const std
{
PaddingImage(im, im);
ret = ReconstructImage(net_noise, im);
ret = ReconstructImage(net_noises, im);
if (ret != eWaifu2xError_OK)
return ret;
@@ -701,7 +772,7 @@ Waifu2x::eWaifu2xError Waifu2x::waifu2x(const std::string &input_file, const std
{
Zoom2xAndPaddingImage(im, im, image_size);
ret = ReconstructImage(net_scale, im);
ret = ReconstructImage(net_scales, im);
if (ret != eWaifu2xError_OK)
return ret;
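
The core of the change above is that every worker thread owns a private net and staging buffers, selected through net_scheduler_id_map. A condensed sketch of that pattern, using the scheduler from tinypl.hpp (Resource, the buffer size, and the iteration count are hypothetical stand-ins, not the waifu2x code):

#include "tinypl.hpp"
#include <cassert>
#include <thread>
#include <unordered_map>
#include <vector>

struct Resource { std::vector<float> buf; }; // stand-in for a net + its blocks

int main()
{
    const std::size_t job = 2; // number of parallel jobs, like Waifu2x::job
    tinypl::impl::scheduler sched(job);

    // Give every thread that can run tasks its own slot: the job-1 pool
    // threads, plus the calling thread (it also executes tasks while the
    // waiter polls the io_service). This mirrors net_scheduler_id_map.
    std::unordered_map<std::thread::id, std::size_t> id_map;
    const auto ids = sched.get_thread_pool_id_list();
    for (std::size_t i = 0; i < ids.size(); i++)
        id_map.emplace(ids[i], i);
    id_map.emplace(std::this_thread::get_id(), ids.size());

    std::vector<Resource> resources(job);
    for (auto &r : resources)
        r.buf.assign(1024, 0.0f);

    // Each task picks its slot by thread id, so tasks that run concurrently
    // never touch the same buffer and no locking is needed.
    tinypl::parallel_for(sched, 0, 8, [&](int) {
        const auto it = id_map.find(std::this_thread::get_id());
        assert(it != id_map.end());
        resources[it->second].buf[0] += 1.0f; // stand-in for one block's work
    });
}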

common/waifu2x.h

@@ -5,8 +5,11 @@
#include <vector>
#include <utility>
#include <functional>
#include <memory>
#include <thread>
#include <boost/shared_ptr.hpp>
#include <opencv2/opencv.hpp>
#include <unordered_map>
namespace caffe
@@ -15,6 +18,15 @@ namespace caffe
class Net;
};
namespace tinypl
{
namespace impl
{
// task scheduler
class scheduler;
}
}
class Waifu2x
{
public:
@@ -64,15 +76,19 @@
double scale_ratio;
std::string model_dir;
std::string process;
int job;
bool isCuda;
boost::shared_ptr<caffe::Net<float>> net_noise;
boost::shared_ptr<caffe::Net<float>> net_scale;
std::vector<boost::shared_ptr<caffe::Net<float>>> net_noises;
std::vector<boost::shared_ptr<caffe::Net<float>>> net_scales;
float *block;
float *dummy_data;
float *out_block;
std::vector<float *> blocks;
std::vector<float *> dummy_datas;
std::vector<float *> out_blocks;
std::unique_ptr<tinypl::impl::scheduler> net_scheduler;
std::unordered_map<std::thread::id, size_t> net_scheduler_id_map;
private:
eWaifu2xError LoadImage(cv::Mat &float_image, const std::string &input_file);
@@ -82,7 +98,7 @@
eWaifu2xError CreateZoomColorImage(const cv::Mat &float_image, const cv::Size_<int> &zoom_size, std::vector<cv::Mat> &cubic_planes);
eWaifu2xError LoadParameter(boost::shared_ptr<caffe::Net<float>> net, const std::string &param_path);
eWaifu2xError ConstractNet(boost::shared_ptr<caffe::Net<float>> &net, const std::string &model_path, const std::string &process);
eWaifu2xError ReconstructImage(boost::shared_ptr<caffe::Net<float>> net, cv::Mat &im);
eWaifu2xError ReconstructImage(std::vector<boost::shared_ptr<caffe::Net<float>>> nets, cv::Mat &im);
public:
Waifu2x();

waifu2x-caffe-gui/waifu2x-caffe-gui.vcxproj

@@ -55,7 +55,7 @@
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions>WIN32;_WIN32_WINNT=0x0600;_DEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
@@ -74,7 +74,7 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions>WIN32;_WIN32_WINNT=0x0600;NDEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
@@ -95,6 +95,7 @@
<ClCompile Include="Source.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\common\tinypl.hpp" />
<ClInclude Include="..\common\waifu2x.h" />
<ClInclude Include="CControl.h" />
<ClInclude Include="CDialog.h" />

waifu2x-caffe-gui/waifu2x-caffe-gui.vcxproj.filters

@@ -56,6 +56,9 @@
<ClInclude Include="resource.h">
<Filter>ヘッダー ファイル</Filter>
</ClInclude>
<ClInclude Include="..\common\tinypl.hpp">
<Filter>ヘッダー ファイル</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ResourceCompile Include="Resource.rc">

waifu2x-caffe/waifu2x-caffe.vcxproj

@@ -55,7 +55,7 @@
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions>WIN32;_WIN32_WINNT=0x0600;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
@@ -71,7 +71,7 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions>WIN32;_WIN32_WINNT=0x0600;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
@@ -86,6 +86,7 @@
<ClCompile Include="Source.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\common\tinypl.hpp" />
<ClInclude Include="..\common\waifu2x.h" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />

waifu2x-caffe/waifu2x-caffe.vcxproj.filters

@@ -26,5 +26,8 @@
<ClInclude Include="..\common\waifu2x.h">
<Filter>ヘッダー ファイル</Filter>
</ClInclude>
<ClInclude Include="..\common\tinypl.hpp">
<Filter>ヘッダー ファイル</Filter>
</ClInclude>
</ItemGroup>
</Project>