[pstl] Support Threading Building Blocks 2020 (oneTBB) for "tbb" parallel backend.

After the changes the "tbb" parallel backend will work with old TBB versions(TBB_INTERFACE_VERSION <= 12000) and new ones (TBB 2020 and greater)

More about oneTBB:
https://github.com/oneapi-src/oneTBB

Phabricator Review:
https://reviews.llvm.org/D87380
This commit is contained in:
Dvorskiy, Mikhail 2020-09-14 14:20:32 +03:00
parent 5cac85c931
commit 0b2e0e80d9

View File

@ -25,6 +25,7 @@
#include <tbb/parallel_invoke.h>
#include <tbb/task_arena.h>
#include <tbb/tbb_allocator.h>
#include <tbb/task.h>
#if TBB_INTERFACE_VERSION < 10000
# error Intel(R) Threading Building Blocks 2018 is required; older versions are not supported.
@ -71,7 +72,11 @@ class __buffer
inline void
__cancel_execution()
{
#if TBB_INTERFACE_VERSION <= 12000
tbb::task::self().group()->cancel_group_execution();
#else
tbb::task::current_context()->cancel_group_execution();
#endif
}
//------------------------------------------------------------------------
@ -413,17 +418,308 @@ __parallel_transform_scan(_ExecutionPolicy&&, _Index __n, _Up __u, _Tp __init, _
//------------------------------------------------------------------------
#define _PSTL_MERGE_CUT_OFF 2000
template <typename _Func>
class __func_task;
template <typename _Func>
class __root_task;
#if TBB_INTERFACE_VERSION <= 12000
class __task : public tbb::task
{
public:
template <typename _Fn>
__task*
make_continuation(_Fn&& __f)
{
return new (allocate_continuation()) __func_task<typename std::decay<_Fn>::type>(std::forward<_Fn>(__f));
}
template <typename _Fn>
__task*
make_child_of(__task* parent, _Fn&& __f)
{
return new (parent->allocate_child()) __func_task<typename std::decay<_Fn>::type>(std::forward<_Fn>(__f));
}
template <typename _Fn>
__task*
make_additional_child_of(tbb::task* parent, _Fn&& __f)
{
return new (tbb::task::allocate_additional_child_of(*parent))
__func_task<typename std::decay<_Fn>::type>(std::forward<_Fn>(__f));
}
inline void
recycle_as_continuation()
{
tbb::task::recycle_as_continuation();
}
inline void
recycle_as_child_of(__task* parent)
{
tbb::task::recycle_as_child_of(*parent);
}
inline void
spawn(__task* __t)
{
tbb::task::spawn(*__t);
}
template <typename _Fn>
static inline void
spawn_root_and_wait(__root_task<_Fn>& __root)
{
tbb::task::spawn_root_and_wait(*__root._M_task);
}
};
template <typename _Func>
class __func_task : public __task
{
_Func _M_func;
tbb::task*
execute()
{
return _M_func(this);
};
public:
template <typename _Fn>
__func_task(_Fn&& __f) : _M_func{std::forward<_Fn>(__f)}
{
}
_Func&
body()
{
return _M_func;
}
};
template <typename _Func>
class __root_task
{
tbb::task* _M_task;
public:
template <typename... Args>
__root_task(Args&&... args)
: _M_task{new (tbb::task::allocate_root()) __func_task<_Func>{_Func(std::forward<Args>(args)...)}}
{
}
friend class __task;
friend class __func_task<_Func>;
};
#else // TBB_INTERFACE_VERSION <= 12000
class __task : public tbb::detail::d1::task
{
protected:
tbb::detail::d1::small_object_allocator _M_allocator{};
tbb::detail::d1::execution_data* _M_execute_data{};
__task* _M_parent{};
std::atomic<int> _M_refcount{};
bool _M_recycle{};
template <typename _Fn>
__task*
allocate_func_task(_Fn&& __f)
{
assert(_M_execute_data != nullptr);
tbb::detail::d1::small_object_allocator __alloc{};
auto __t =
__alloc.new_object<__func_task<typename std::decay<_Fn>::type>>(*_M_execute_data, std::forward<_Fn>(__f));
__t->_M_allocator = __alloc;
return __t;
}
public:
__task*
parent()
{
return _M_parent;
}
void
set_ref_count(int __n)
{
_M_refcount.store(__n, std::memory_order_release);
}
template <typename _Fn>
__task*
make_continuation(_Fn&& __f)
{
auto __t = allocate_func_task(std::forward<_Fn&&>(__f));
__t->_M_parent = _M_parent;
_M_parent = nullptr;
return __t;
}
template <typename _Fn>
__task*
make_child_of(__task* __parent, _Fn&& __f)
{
auto __t = allocate_func_task(std::forward<_Fn&&>(__f));
__t->_M_parent = __parent;
return __t;
}
template <typename _Fn>
__task*
make_additional_child_of(__task* __parent, _Fn&& __f)
{
auto __t = make_child_of(__parent, std::forward<_Fn>(__f));
assert(__parent->_M_refcount.load(std::memory_order_relaxed) > 0);
++__parent->_M_refcount;
return __t;
}
inline void
recycle_as_continuation()
{
_M_recycle = true;
}
inline void
recycle_as_child_of(__task* parent)
{
_M_recycle = true;
_M_parent = parent;
}
inline void
spawn(__task* __t)
{
assert(_M_execute_data != nullptr);
tbb::detail::d1::spawn(*__t, *_M_execute_data->context);
}
template <typename _Fn>
static inline void
spawn_root_and_wait(__root_task<_Fn>& __root)
{
tbb::detail::d1::execute_and_wait(*__root._M_func_task, __root._M_context, __root._M_wait_object,
__root._M_context);
}
template <typename _Func>
friend class __func_task;
};
template <typename _Func>
class __func_task : public __task
{
_Func _M_func;
__task*
execute(tbb::detail::d1::execution_data& __ed) override
{
_M_execute_data = &__ed;
_M_recycle = false;
__task* __next = _M_func(this);
return finalize(__next);
};
__task*
cancel(tbb::detail::d1::execution_data& __ed) override
{
return finalize(nullptr);
}
__task*
finalize(__task* __next)
{
bool __recycle = _M_recycle;
_M_recycle = false;
if (__recycle)
{
return __next;
}
auto __parent = _M_parent;
auto __alloc = _M_allocator;
auto __ed = _M_execute_data;
this->~__func_task();
assert(__parent != nullptr);
assert(__parent->_M_refcount.load(std::memory_order_relaxed) > 0);
if (--__parent->_M_refcount == 0)
{
assert(__next == nullptr);
__alloc.deallocate(this, *__ed);
return __parent;
}
return __next;
}
friend class __root_task<_Func>;
public:
template <typename _Fn>
__func_task(_Fn&& __f) : _M_func(std::forward<_Fn>(__f))
{
}
_Func&
body()
{
return _M_func;
}
};
template <typename _Func>
class __root_task : public __task
{
__task*
execute(tbb::detail::d1::execution_data& __ed) override
{
_M_wait_object.release();
return nullptr;
};
__task*
cancel(tbb::detail::d1::execution_data& __ed) override
{
_M_wait_object.release();
return nullptr;
}
__func_task<_Func>* _M_func_task{};
tbb::detail::d1::wait_context _M_wait_object{0};
tbb::task_group_context _M_context{};
public:
template <typename... Args>
__root_task(Args&&... args) : _M_wait_object{1}
{
tbb::detail::d1::small_object_allocator __alloc{};
_M_func_task = __alloc.new_object<__func_task<_Func>>(_Func(std::forward<Args>(args)...));
_M_func_task->_M_allocator = __alloc;
_M_func_task->_M_parent = this;
_M_refcount.store(1, std::memory_order_relaxed);
}
friend class __task;
};
#endif // TBB_INTERFACE_VERSION <= 12000
template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Compare, typename _Cleanup,
typename _LeafMerge>
class __merge_task : public tbb::task
class __merge_func
{
typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2;
typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType;
typedef typename std::iterator_traits<_RandomAccessIterator1>::value_type _ValueType;
/*override*/ tbb::task*
execute();
_RandomAccessIterator1 _M_x_beg;
_RandomAccessIterator2 _M_z_beg;
@ -529,7 +825,7 @@ class __merge_task : public tbb::task
};
public:
__merge_task(_SizeType __xs, _SizeType __xe, _SizeType __ys, _SizeType __ye, _SizeType __zs, _Compare __comp,
__merge_func(_SizeType __xs, _SizeType __xe, _SizeType __ys, _SizeType __ye, _SizeType __zs, _Compare __comp,
_Cleanup, _LeafMerge __leaf_merge, _SizeType __nsort, _RandomAccessIterator1 __x_beg,
_RandomAccessIterator2 __z_beg, bool __x_orig, bool __y_orig, bool __root)
: _M_xs(__xs), _M_xe(__xe), _M_ys(__ys), _M_ye(__ye), _M_zs(__zs), _M_x_beg(__x_beg), _M_z_beg(__z_beg),
@ -554,12 +850,14 @@ class __merge_task : public tbb::task
_y_orig = __on_off;
}
__task*
operator()(__task* __self);
private:
__merge_task*
parent_merge() const
__merge_func*
parent_merge(__task* __self) const
{
tbb::task* p = (_root ? nullptr : parent());
return static_cast<__merge_task*>(p);
return _root ? nullptr : &static_cast<__func_task<__merge_func>*>(__self->parent())->body();
}
bool
x_less_y()
@ -615,8 +913,8 @@ class __merge_task : public tbb::task
_y_orig = !_y_orig;
}
tbb::task*
merge_ranges()
__task*
merge_ranges(__task* __self)
{
assert(_x_orig == _y_orig); //two merged subrange must be lie into the same buffer
@ -626,7 +924,7 @@ class __merge_task : public tbb::task
// need to merge {x} and {y}
if (__n > __merge_cut_off)
return split_merging();
return split_merging(__self);
//merge to buffer
if (_x_orig)
@ -634,7 +932,7 @@ class __merge_task : public tbb::task
_M_leaf_merge(_M_x_beg + _M_xs, _M_x_beg + _M_xe, _M_x_beg + _M_ys, _M_x_beg + _M_ye, _M_z_beg + _M_zs,
_M_comp, __move_value_construct(), __move_value_construct(), __move_range_construct(),
__move_range_construct());
assert(parent_merge()); //not root merging task
assert(parent_merge(__self)); //not root merging task
}
//merge to "origin"
else
@ -656,13 +954,13 @@ class __merge_task : public tbb::task
return nullptr;
}
tbb::task*
process_ranges()
__task*
process_ranges(__task* __self)
{
assert(_x_orig == _y_orig);
assert(!_split);
auto p = parent_merge();
auto p = parent_merge(__self);
if (!p)
{ //root merging task
@ -685,7 +983,7 @@ class __merge_task : public tbb::task
move_y_range(); //parallel moving
}
// need to merge {x} and {y}.
return merge_ranges();
return merge_ranges(__self);
}
//else: not root merging task (parent_merge() == NULL)
//optimization, just for sort algorithm, //{x} <= {y}
@ -699,12 +997,12 @@ class __merge_task : public tbb::task
const auto id_range = _M_zs;
p->set_odd(id_range, !_x_orig);
return merge_ranges();
return merge_ranges(__self);
}
//splitting as merge task into 2 of the same level
tbb::task*
split_merging()
__task*
split_merging(__task* __self)
{
assert(_x_orig == _y_orig);
const auto __nx = (_M_xe - _M_xs);
@ -732,43 +1030,42 @@ class __merge_task : public tbb::task
}
auto __zm = _M_zs + ((__xm - _M_xs) + (__ym - _M_ys));
__merge_func __right_func(__xm, _M_xe, __ym, _M_ye, __zm, _M_comp, _Cleanup(), _M_leaf_merge, _M_nsort,
_M_x_beg, _M_z_beg, _x_orig, _y_orig, _root);
__right_func._split = true;
auto __merge_task = __self->make_additional_child_of(__self->parent(), std::move(__right_func));
__self->spawn(__merge_task);
__self->recycle_as_continuation();
__merge_task* __right = new (tbb::task::allocate_additional_child_of(*parent()))
__merge_task(__xm, _M_xe, __ym, _M_ye, __zm, _M_comp, _Cleanup(), _M_leaf_merge, _M_nsort, _M_x_beg,
_M_z_beg, _x_orig, _y_orig, _root);
__right->_split = true;
tbb::task::spawn(*__right);
tbb::task::recycle_as_continuation();
_M_xe = __xm;
_M_ye = __ym;
_split = true;
return this;
return __self;
}
};
template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename __M_Compare, typename _Cleanup,
typename _LeafMerge>
tbb::task*
__merge_task<_RandomAccessIterator1, _RandomAccessIterator2, __M_Compare, _Cleanup, _LeafMerge>::execute()
__task*
__merge_func<_RandomAccessIterator1, _RandomAccessIterator2, __M_Compare, _Cleanup, _LeafMerge>::
operator()(__task* __self)
{
//a. split merge task into 2 of the same level; the special logic,
//without processing(process_ranges) adjacent sub-ranges x and y
if (_split)
return merge_ranges();
return merge_ranges(__self);
//b. General merging of adjacent sub-ranges x and y (with optimization in case of {x} <= {y} )
//1. x and y are in the even buffer
//2. x and y are in the odd buffer
if (_x_orig == _y_orig)
return process_ranges();
return process_ranges(__self);
//3. x is in even buffer, y is in the odd buffer
//4. x is in odd buffer, y is in the even buffer
if (!parent_merge())
if (!parent_merge(__self))
{ //root merge task
if (_x_orig)
move_x_range();
@ -788,11 +1085,11 @@ __merge_task<_RandomAccessIterator1, _RandomAccessIterator2, __M_Compare, _Clean
move_y_range();
}
return process_ranges();
return process_ranges(__self);
}
template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Compare, typename _LeafSort>
class __stable_sort_task : public tbb::task
class __stable_sort_func
{
public:
typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
@ -800,8 +1097,6 @@ class __stable_sort_task : public tbb::task
typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType;
private:
/*override*/ tbb::task*
execute();
_RandomAccessIterator1 _M_xs, _M_xe, _M_x_beg;
_RandomAccessIterator2 _M_zs, _M_z_beg;
_Compare _M_comp;
@ -810,22 +1105,25 @@ class __stable_sort_task : public tbb::task
_SizeType _M_nsort; //zero or number of elements to be sorted for partial_sort alforithm
public:
__stable_sort_task(_RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, _RandomAccessIterator2 __zs,
__stable_sort_func(_RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, _RandomAccessIterator2 __zs,
bool __root, _Compare __comp, _LeafSort __leaf_sort, _SizeType __nsort,
_RandomAccessIterator1 __x_beg, _RandomAccessIterator2 __z_beg)
: _M_xs(__xs), _M_xe(__xe), _M_x_beg(__x_beg), _M_zs(__zs), _M_z_beg(__z_beg), _M_comp(__comp),
_M_leaf_sort(__leaf_sort), _M_root(__root), _M_nsort(__nsort)
{
}
__task*
operator()(__task* __self);
};
#define _PSTL_STABLE_SORT_CUT_OFF 500
template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Compare, typename _LeafSort>
tbb::task*
__stable_sort_task<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, _LeafSort>::execute()
__task*
__stable_sort_func<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, _LeafSort>::operator()(__task* __self)
{
typedef __merge_task<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, __utils::__serial_destroy,
typedef __merge_func<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, __utils::__serial_destroy,
__utils::__serial_move_merge>
_MergeTaskType;
@ -835,34 +1133,27 @@ __stable_sort_task<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, _Le
if (__n <= __sort_cut_off)
{
_M_leaf_sort(_M_xs, _M_xe, _M_comp);
assert(!_M_root);
tbb::task* p = parent();
const auto id_range = _M_xs - _M_x_beg;
return nullptr;
}
const _RandomAccessIterator1 __xm = _M_xs + __n / 2;
const _RandomAccessIterator2 __zm = _M_zs + (__xm - _M_xs);
const _RandomAccessIterator2 __ze = _M_zs + __n;
_MergeTaskType* __m = new (allocate_continuation()) _MergeTaskType(
_M_xs - _M_x_beg, __xm - _M_x_beg, __xm - _M_x_beg, _M_xe - _M_x_beg, _M_zs - _M_z_beg, _M_comp,
__utils::__serial_destroy(), __utils::__serial_move_merge(__nmerge), _M_nsort, _M_x_beg, _M_z_beg,
/*x_orig*/ true, /*y_orig*/ true, /*root*/ _M_root);
_MergeTaskType __m(_MergeTaskType(_M_xs - _M_x_beg, __xm - _M_x_beg, __xm - _M_x_beg, _M_xe - _M_x_beg,
_M_zs - _M_z_beg, _M_comp, __utils::__serial_destroy(),
__utils::__serial_move_merge(__nmerge), _M_nsort, _M_x_beg, _M_z_beg,
/*x_orig*/ true, /*y_orig*/ true, /*root*/ _M_root));
auto __parent = __self->make_continuation(std::move(__m));
__parent->set_ref_count(2);
auto __right = __self->make_child_of(
__parent, __stable_sort_func(__xm, _M_xe, __zm, false, _M_comp, _M_leaf_sort, _M_nsort, _M_x_beg, _M_z_beg));
__self->spawn(__right);
__self->recycle_as_child_of(__parent);
_M_root = false;
__m->set_ref_count(2);
auto __right = new (__m->allocate_child())
__stable_sort_task(__xm, _M_xe, __zm, _M_root, _M_comp, _M_leaf_sort, _M_nsort, _M_x_beg, _M_z_beg);
spawn(*__right);
recycle_as_child_of(*__m);
_M_xe = __xm;
return this;
return __self;
}
template <class _ExecutionPolicy, typename _RandomAccessIterator, typename _Compare, typename _LeafSort>
@ -882,11 +1173,9 @@ __parallel_stable_sort(_ExecutionPolicy&&, _RandomAccessIterator __xs, _RandomAc
if (__n > __sort_cut_off)
{
__buffer<_ValueType> __buf(__n);
tbb::task* root = new (tbb::task::allocate_root())
__stable_sort_task<_RandomAccessIterator, _ValueType*, _Compare, _LeafSort>(
__xs, __xe, __buf.get(), true, __comp, __leaf_sort, __nsort, __xs, __buf.get());
tbb::task::spawn_root_and_wait(*root);
__root_task<__stable_sort_func<_RandomAccessIterator, _ValueType*, _Compare, _LeafSort>> __root{
__xs, __xe, __buf.get(), true, __comp, __leaf_sort, __nsort, __xs, __buf.get()};
__task::spawn_root_and_wait(__root);
return;
}
//serial sort
@ -899,10 +1188,8 @@ __parallel_stable_sort(_ExecutionPolicy&&, _RandomAccessIterator __xs, _RandomAc
//------------------------------------------------------------------------
template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _RandomAccessIterator3,
typename _Compare, typename _LeafMerge>
class __merge_task_static : public tbb::task
class __merge_func_static
{
/*override*/ tbb::task*
execute();
_RandomAccessIterator1 _M_xs, _M_xe;
_RandomAccessIterator2 _M_ys, _M_ye;
_RandomAccessIterator3 _M_zs;
@ -910,20 +1197,23 @@ class __merge_task_static : public tbb::task
_LeafMerge _M_leaf_merge;
public:
__merge_task_static(_RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, _RandomAccessIterator2 __ys,
__merge_func_static(_RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, _RandomAccessIterator2 __ys,
_RandomAccessIterator2 __ye, _RandomAccessIterator3 __zs, _Compare __comp,
_LeafMerge __leaf_merge)
: _M_xs(__xs), _M_xe(__xe), _M_ys(__ys), _M_ye(__ye), _M_zs(__zs), _M_comp(__comp), _M_leaf_merge(__leaf_merge)
{
}
__task*
operator()(__task* __self);
};
//TODO: consider usage of parallel_for with a custom blocked_range
template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _RandomAccessIterator3,
typename __M_Compare, typename _LeafMerge>
tbb::task*
__merge_task_static<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3, __M_Compare,
_LeafMerge>::execute()
__task*
__merge_func_static<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3, __M_Compare, _LeafMerge>::
operator()(__task* __self)
{
typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2;
@ -949,14 +1239,14 @@ __merge_task_static<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAcces
__ym = std::lower_bound(_M_ys, _M_ye, *__xm, _M_comp);
}
const _RandomAccessIterator3 __zm = _M_zs + ((__xm - _M_xs) + (__ym - _M_ys));
tbb::task* __right = new (tbb::task::allocate_additional_child_of(*parent()))
__merge_task_static(__xm, _M_xe, __ym, _M_ye, __zm, _M_comp, _M_leaf_merge);
tbb::task::spawn(*__right);
tbb::task::recycle_as_continuation();
auto __right = __self->make_additional_child_of(
__self->parent(), __merge_func_static(__xm, _M_xe, __ym, _M_ye, __zm, _M_comp, _M_leaf_merge));
__self->spawn(__right);
__self->recycle_as_continuation();
_M_xe = __xm;
_M_ye = __ym;
return this;
return __self;
}
template <class _ExecutionPolicy, typename _RandomAccessIterator1, typename _RandomAccessIterator2,
@ -979,11 +1269,11 @@ __parallel_merge(_ExecutionPolicy&&, _RandomAccessIterator1 __xs, _RandomAccessI
else
{
tbb::this_task_arena::isolate([=]() {
typedef __merge_task_static<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3,
typedef __merge_func_static<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3,
_Compare, _LeafMerge>
_TaskType;
tbb::task::spawn_root_and_wait(*new (tbb::task::allocate_root())
_TaskType(__xs, __xe, __ys, __ye, __zs, __comp, __leaf_merge));
__root_task<_TaskType> __root{__xs, __xe, __ys, __ye, __zs, __comp, __leaf_merge};
__task::spawn_root_and_wait(__root);
});
}
}