57 #include <type_traits>
62 inline namespace _V1 {
76 template <
typename T,
class BinaryOperation,
int Dims,
size_t Extent,
77 typename IdentityContainerT,
bool View =
false,
typename Subst =
void>
89 template <
typename T,
class BinaryOperation>
91 #ifdef SYCL_REDUCTION_DETERMINISTIC
92 std::bool_constant<false>;
94 std::bool_constant<((is_sgenfloat_v<T> &&
sizeof(T) == 4) ||
95 is_sgeninteger_v<T>) &&
113 template <
typename T,
class BinaryOperation>
115 #ifdef SYCL_REDUCTION_DETERMINISTIC
116 std::bool_constant<false>;
121 is_sgenfloat_v<T> &&
sizeof(T) == 8>;
129 template <
typename T,
class BinaryOperation>
131 #ifdef SYCL_REDUCTION_DETERMINISTIC
132 std::bool_constant<false>;
134 std::bool_constant<((is_sgeninteger_v<T> &&
135 (
sizeof(T) == 4 ||
sizeof(T) == 8)) ||
136 is_sgenfloat_v<T>) &&
146 template <
typename... Ts>
using ReduTupleT = sycl::detail::tuple<Ts...>;
152 size_t LocalMemBytesPerWorkItem);
154 size_t &NWorkGroups);
156 size_t LocalMemBytesPerWorkItem);
158 template <
typename T,
class BinaryOperation,
bool IsOptional>
165 template <
typename T,
class BinaryOperation,
int Dims, std::size_t Extent,
166 typename IdentityContainerT,
bool View,
typename Subst>
168 IdentityContainerT, View, Subst>> {
170 using op = BinaryOperation;
171 static constexpr
int dims = Dims;
172 static constexpr
size_t extent = Extent;
173 static constexpr
bool has_identity = IdentityContainerT::has_identity;
182 template <
typename ReducerRelayT = ReducerT>
auto &
getElement(
size_t E) {
183 return MReducerRef.getElement(E);
186 template <
typename ReducerRelayT = ReducerT> constexpr
auto getIdentity() {
188 "Identity unavailable.");
189 return MReducerRef.getIdentity();
195 template <
typename ReducerRelayT = ReducerT>
200 "Static identity unavailable.");
201 return ReducerT::getIdentity();
205 ReducerT &MReducerRef;
232 template <
typename _T = Ty,
int _Dims = Dims>
237 return static_cast<Reducer *
>(
this)->combine(
static_cast<_T
>(1));
240 template <
typename _T = Ty,
int _Dims = Dims>
245 return static_cast<Reducer *
>(
this)->combine(
static_cast<_T
>(1));
248 template <
typename _T = Ty,
int _Dims = Dims>
250 operator+=(
const _T &Partial) {
251 return static_cast<Reducer *
>(
this)->combine(Partial);
254 template <
typename _T = Ty,
int _Dims = Dims>
256 operator*=(
const _T &Partial) {
257 return static_cast<Reducer *
>(
this)->combine(Partial);
260 template <
typename _T = Ty,
int _Dims = Dims>
262 operator|=(
const _T &Partial) {
263 return static_cast<Reducer *
>(
this)->combine(Partial);
266 template <
typename _T = Ty,
int _Dims = Dims>
268 operator^=(
const _T &Partial) {
269 return static_cast<Reducer *
>(
this)->combine(Partial);
272 template <
typename _T = Ty,
int _Dims = Dims>
274 operator&=(
const _T &Partial) {
275 return static_cast<Reducer *
>(
this)->combine(Partial);
279 template <access::address_space Space>
286 template <access::address_space Space,
class T,
class AtomicFunctor>
287 void atomic_combine_impl(T *ReduVarPtr, AtomicFunctor Functor)
const {
288 auto reducer =
static_cast<const Reducer *
>(
this);
289 for (
size_t E = 0; E < Extent; ++E) {
297 getMemoryScope<Space>(), Space>(
298 address_space_cast<Space, access::decorated::no>(ReduVarPtr)[E]);
299 Functor(std::move(AtomicRef), *ReducerElem);
303 template <
class _T, access::address_space Space,
class BinaryOp>
304 static constexpr
bool BasicCheck =
305 std::is_same_v<remove_decoration_t<_T>, Ty> &&
312 typename _T = Ty,
class _BinaryOperation = BinaryOp>
313 std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
314 (IsReduOptForFastAtomicFetch<_T, _BinaryOperation>::value ||
315 IsReduOptForAtomic64Op<_T, _BinaryOperation>::value) &&
316 IsPlus<_T, _BinaryOperation>::value>
318 atomic_combine_impl<Space>(
319 ReduVarPtr, [](
auto &&Ref,
auto Val) {
return Ref.fetch_add(Val); });
324 typename _T = Ty,
class _BinaryOperation = BinaryOp>
325 std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
329 atomic_combine_impl<Space>(
330 ReduVarPtr, [](
auto &&Ref,
auto Val) {
return Ref.fetch_or(Val); });
335 typename _T = Ty,
class _BinaryOperation = BinaryOp>
336 std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
340 atomic_combine_impl<Space>(
341 ReduVarPtr, [](
auto &&Ref,
auto Val) {
return Ref.fetch_xor(Val); });
346 typename _T = Ty,
class _BinaryOperation = BinaryOp>
347 std::enable_if_t<std::is_same_v<remove_decoration_t<_T>, _T> &&
353 atomic_combine_impl<Space>(
354 ReduVarPtr, [](
auto &&Ref,
auto Val) {
return Ref.fetch_and(Val); });
359 typename _T = Ty,
class _BinaryOperation = BinaryOp>
360 std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
365 atomic_combine_impl<Space>(
366 ReduVarPtr, [](
auto &&Ref,
auto Val) {
return Ref.fetch_min(Val); });
371 typename _T = Ty,
class _BinaryOperation = BinaryOp>
372 std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
377 atomic_combine_impl<Space>(
378 ReduVarPtr, [](
auto &&Ref,
auto Val) {
return Ref.fetch_max(Val); });
384 template <
typename T,
class BinaryOperation,
bool ExplicitIdentity,
385 typename CondT =
void>
387 static_assert(!std::is_same_v<T, T>,
388 "Partial specializations don't cover all possible options!");
392 template <
typename T,
class BinaryOperation,
bool ExplicitIdentity>
394 T, BinaryOperation, ExplicitIdentity,
397 static constexpr
bool has_identity =
true;
409 template <
typename T,
class BinaryOperation>
411 T, BinaryOperation, true,
414 static constexpr
bool has_identity =
true;
428 template <
typename T,
class BinaryOperation>
430 T, BinaryOperation, false,
431 std::enable_if_t<!IsKnownIdentityOp<T, BinaryOperation>::value>> {
433 static constexpr
bool has_identity =
false;
438 template <
typename T,
class BinaryOperation,
bool IsOptional>
440 using value_type = std::conditional_t<IsOptional, std::optional<T>, T>;
442 template <
bool ExplicitIdentity>
443 constexpr value_type GetInitValue(
445 &IdentityContainer) {
446 constexpr
bool ContainerHasIdentity =
448 ExplicitIdentity>::has_identity;
449 static_assert(IsOptional || ContainerHasIdentity);
450 if constexpr (!ContainerHasIdentity)
453 return IdentityContainer.getIdentity();
460 template <
bool ExplicitIdentity>
464 : MValue(GetInitValue(IdentityContainer)) {}
467 if constexpr (IsOptional)
468 MValue = MValue ? BinOp(*MValue, OtherValue) : OtherValue;
470 MValue = BinOp(MValue, OtherValue);
475 if constexpr (IsOptional) {
477 return combine(BinOp, *Other.MValue);
481 return combine(BinOp, Other.MValue);
486 if constexpr (IsOptional)
492 if constexpr (IsOptional)
498 constexpr
explicit operator bool()
const {
499 if constexpr (IsOptional)
500 return MValue.has_value();
514 template <
typename T,
class BinaryOperation,
bool IsOptional>
516 detail::ReducerElement<T, BinaryOperation, IsOptional>>
529 template <
class BinaryOperation,
typename IdentityContainerT>
532 const BinaryOperation
BOp;
543 template <
typename T,
class BinaryOperation,
int Dims,
size_t Extent,
544 typename IdentityContainerT,
bool View>
546 T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
547 std::enable_if_t<Dims == 0 && Extent == 1 && View == false &&
548 !detail::IsKnownIdentityOp<T, BinaryOperation>::value>>
550 reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
552 Dims == 0 && Extent == 1 && View == false &&
553 !detail::IsKnownIdentityOp<T, BinaryOperation>::value>>>,
555 static constexpr
bool has_identity = IdentityContainerT::has_identity;
560 reducer(
const IdentityContainerT &IdentityContainer, BinaryOperation BOp)
561 : MValue(IdentityContainer), MIdentity(IdentityContainer),
565 :
reducer(Token.IdentityContainer, Token.BOp) {}
573 MValue.combine(MBinaryOp, Partial);
577 template <
bool HasIdentityRelay = has_
identity>
578 std::enable_if_t<HasIdentityRelay && (HasIdentityRelay == has_identity), T>
580 return MIdentity.getIdentity();
587 const element_type &getElement(
size_t)
const {
return MValue; }
589 detail::ReducerElement<T, BinaryOperation, !has_identity> MValue;
590 const IdentityContainerT MIdentity;
591 BinaryOperation MBinaryOp;
599 template <
typename T,
class BinaryOperation,
int Dims,
size_t Extent,
600 typename IdentityContainerT,
bool View>
602 T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
603 std::enable_if_t<Dims == 0 && Extent == 1 && View == false &&
604 detail::IsKnownIdentityOp<T, BinaryOperation>::value>>
606 reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
608 Dims == 0 && Extent == 1 && View == false &&
609 detail::IsKnownIdentityOp<T, BinaryOperation>::value>>>,
611 static constexpr
bool has_identity = IdentityContainerT::has_identity;
617 reducer(
const IdentityContainerT & , BinaryOperation)
618 : MValue(getIdentity()) {}
621 :
reducer(Token.IdentityContainer, Token.BOp) {}
630 MValue.combine(BOp, Partial);
639 static constexpr T getIdentity() {
643 element_type &getElement(
size_t) {
return MValue; }
644 const element_type &getElement(
size_t)
const {
return MValue; }
645 detail::ReducerElement<T, BinaryOperation, !has_identity> MValue;
650 template <
typename T,
class BinaryOperation,
int Dims,
size_t Extent,
651 typename IdentityContainerT,
bool View>
652 class reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
653 std::enable_if_t<Dims == 0 && View == true>>
655 reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
656 std::enable_if_t<Dims == 0 && View == true>>>,
658 static constexpr
bool has_identity = IdentityContainerT::has_identity;
664 : MElement(Ref), MBinaryOp(BOp) {}
667 :
reducer(Token.IdentityContainer, Token.BOp) {}
675 MElement.combine(MBinaryOp, Partial);
683 const element_type &getElement(
size_t)
const {
return MElement; }
685 element_type &MElement;
686 BinaryOperation MBinaryOp;
691 template <
typename T,
class BinaryOperation,
int Dims,
size_t Extent,
692 typename IdentityContainerT,
bool View>
694 T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
695 std::enable_if_t<Dims == 1 && View == false &&
696 !detail::IsKnownIdentityOp<T, BinaryOperation>::value>>
698 reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
700 Dims == 1 && View == false &&
701 !detail::IsKnownIdentityOp<T, BinaryOperation>::value>>>,
703 static constexpr
bool has_identity = IdentityContainerT::has_identity;
708 reducer(
const IdentityContainerT &IdentityContainer, BinaryOperation BOp)
709 : MValue(IdentityContainer), MIdentity(IdentityContainer),
713 :
reducer(Token.IdentityContainer, Token.BOp) {}
720 reducer<T, BinaryOperation, Dims - 1, Extent, IdentityContainerT,
true>
722 return {MValue[Index], MBinaryOp};
725 template <
bool HasIdentityRelay = has_
identity>
726 std::enable_if_t<HasIdentityRelay && (HasIdentityRelay == has_identity), T>
728 return MIdentity.getIdentity();
734 element_type &getElement(
size_t E) {
return MValue[E]; }
735 const element_type &getElement(
size_t E)
const {
return MValue[E]; }
738 const IdentityContainerT MIdentity;
739 BinaryOperation MBinaryOp;
744 template <
typename T,
class BinaryOperation,
int Dims,
size_t Extent,
745 typename IdentityContainerT,
bool View>
747 T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
748 std::enable_if_t<Dims == 1 && View == false &&
749 detail::IsKnownIdentityOp<T, BinaryOperation>::value>>
751 reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
753 Dims == 1 && View == false &&
754 detail::IsKnownIdentityOp<T, BinaryOperation>::value>>>,
756 static constexpr
bool has_identity = IdentityContainerT::has_identity;
762 reducer(
const IdentityContainerT & , BinaryOperation)
763 : MValue(getIdentity()) {}
766 :
reducer(Token.IdentityContainer, Token.BOp) {}
775 reducer<T, BinaryOperation, Dims - 1, Extent, IdentityContainerT,
true>
777 return {MValue[Index], BinaryOperation()};
785 static constexpr T getIdentity() {
789 element_type &getElement(
size_t E) {
return MValue[E]; }
790 const element_type &getElement(
size_t E)
const {
return MValue[E]; }
792 marray<element_type, Extent> MValue;
805 accessor<T, AccessorDims,
Mode, access::target::device, IsPH, PropList>> {
806 static constexpr
int value = AccessorDims;
814 template <
class T,
int Dims,
typename AllocatorT>
827 template <
class KernelName>
829 std::conditional_t<std::is_same_v<KernelName, auto_name>,
auto_name,
834 std::shared_ptr<int> &Counter);
836 template <
typename T,
class BinaryOperation,
int Dims,
size_t Extent,
837 bool ExplicitIdentity,
typename RedOutVar>
840 ExplicitIdentity, RedOutVar>;
843 static constexpr T chooseIdentity(
const T &Identity) {
879 static constexpr
size_t dims = Dims;
887 static constexpr
bool is_usm = std::is_same_v<RedOutVar, T *>;
893 : MIdentityContainer(chooseIdentity(Identity)), MBinaryOp(BinaryOp),
894 InitializeToIdentity(Init), MRedOut(
std::move(RedOut)) {}
896 template <
typename RelayT = T,
897 typename RelayBinaryOperation = BinaryOperation>
899 BinaryOperation BinaryOp,
bool Init, RedOutVar RedOut,
903 MBinaryOp(BinaryOp), InitializeToIdentity(Init),
904 MRedOut(
std::move(RedOut)) {}
906 template <
typename RelayT = T,
907 typename RelayBinaryOperation = BinaryOperation>
909 BinaryOperation BinaryOp,
bool Init, RedOutVar RedOut,
912 : MIdentityContainer(), MBinaryOp(BinaryOp), InitializeToIdentity(Init),
913 MRedOut(
std::move(RedOut)) {}
916 CGH.addReduction(MOutBufPtr);
920 template <
bool IsOneWG>
924 if constexpr (IsOneWG) {
928 std::make_shared<buffer<reducer_element_type, 1>>(
range<1>(Size));
929 CGH.addReduction(MOutBufPtr);
935 auto Buffer = std::make_shared<buffer<_T, 1>>(
range<1>(Size));
936 CGH.addReduction(Buffer);
948 "Unexpected size of reducer element type.");
954 auto ReinterpretRedOut =
955 MRedOut.template reinterpret<reducer_element_type>();
956 return accessor{ReinterpretRedOut, CGH};
964 std::make_shared<buffer<reducer_element_type, 1>>(
range<1>(Size));
965 CGH.addReduction(MOutBufPtr);
977 template <
typename KernelName,
typename FuncTy,
982 auto DoIt = [&](
auto &Out) {
983 auto RWReduVal = std::make_shared<std::array<T, num_elements>>();
985 (*RWReduVal)[i] = decltype(MIdentityContainer)::getIdentity();
987 auto Buf = std::make_shared<buffer<T, 1>>(RWReduVal.get()->data(),
989 Buf->set_final_data();
999 Buf->template get_access<access::mode::read_write>(CopyHandler);
1002 CopyHandler.addReduction(RWReduVal);
1003 CopyHandler.addReduction(Buf);
1015 for (
size_t i = 0; i < NElements; ++i) {
1016 if (IsUpdateOfUserVar)
1017 Out[i] = BOp(Out[i], Mem[i]);
1024 CopyHandler.
copy(Mem, OutAcc);
1042 template <
typename KernelName,
typename FuncTy,
1047 "Initialize to identity not allowed for identity-less reductions.");
1052 return MIdentityContainer;
1058 auto CounterMem = std::make_shared<int>(0);
1059 CGH.addReduction(CounterMem);
1060 auto CounterBuf = std::make_shared<buffer<int, 1>>(CounterMem.get(), 1);
1061 CounterBuf->set_final_data();
1062 CGH.addReduction(CounterBuf);
1063 return {*CounterBuf, CGH};
1069 queue q = createSyclObjFromImpl<queue>(CGH.MQueue);
1071 auto Deleter = [=](
auto *Ptr) {
free(Ptr, q); };
1073 std::shared_ptr<int> Counter(malloc_device<int>(1, q), Deleter);
1074 CGH.addReduction(Counter);
1078 return Counter.get();
1099 std::shared_ptr<buffer<reducer_element_type, 1>> MOutBufPtr;
1101 BinaryOperation MBinaryOp;
1102 bool InitializeToIdentity;
1110 template <
typename T,
class BinaryOperation,
int Dims,
size_t Extent,
1111 bool ExplicitIdentity,
typename RedOutVar>
1115 ExplicitIdentity, RedOutVar> {
1118 ExplicitIdentity, RedOutVar>;
1120 ExplicitIdentity, RedOutVar>;
1127 static_assert(Dims <= 1,
"Multi-dimensional reductions are not supported.");
1131 template <
bool ExplicitIdentityRelay = ExplicitIdentity,
1132 typename = std::enable_if_t<!ExplicitIdentityRelay>>
1134 bool InitializeToIdentity =
false)
1137 if (Var.size() != 1)
1139 "Reduction variable must be a scalar.");
1143 "initialize_to_identity property cannot be "
1144 "used with identityless reductions.");
1149 template <
bool ExplicitIdentityRelay = ExplicitIdentity,
1150 typename = std::enable_if_t<ExplicitIdentityRelay>>
1152 bool InitializeToIdentity)
1155 if (Var.size() != 1)
1157 "Reduction variable must be a scalar.");
1161 template <
class BinaryOp,
int Dims,
size_t Extent,
bool ExplicitIdentity,
1162 typename RedOutVar,
typename... RestTy>
1165 Extent, ExplicitIdentity, RedOutVar>{
1166 RedVar, std::forward<RestTy>(Rest)...};
1172 event E = CGH.finalize();
1173 handler AuxHandler(CGH.MQueue, CGH.eventNeeded());
1174 if (!createSyclObjFromImpl<queue>(CGH.MQueue).is_in_order())
1176 AuxHandler.saveCodeLoc(CGH.MCodeLoc);
1178 CGH.MLastEvent = AuxHandler.finalize();
1188 template <
typename KernelName,
class Reduction>
1190 static_assert(Reduction::is_usm,
1191 "All implementations using this helper are expected to have "
1192 "USM reduction, not a buffer-based one.");
1193 size_t NElements = Reduction::num_elements;
1194 auto InAcc = Redu.getReadAccToPreviousPartialReds(CGH);
1195 auto UserVarPtr = Redu.getUserRedVarAccess(CGH);
1196 bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
1197 auto BOp = Redu.getBinaryOperation();
1199 for (
size_t i = 0; i < NElements; ++i) {
1200 auto Elem = InAcc[i];
1201 if (IsUpdateOfUserVar)
1202 UserVarPtr[i] = BOp(UserVarPtr[i], *Elem);
1204 UserVarPtr[i] = *Elem;
1223 std::conditional_t<std::is_same_v<KernelName, auto_name>,
auto_name,
1224 MainOrAux<KernelName, Strategy, Ts...>>;
1232 template <
typename KernelName,
int Dims,
typename PropertiesT,
1233 typename KernelType,
typename Reduction>
1234 static void run(
handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1236 Reduction &Redu, KernelType &KernelFunc) {
1237 static_assert(Reduction::has_identity,
1238 "Identityless reductions are not supported by the "
1239 "local_atomic_and_atomic_cross_wg strategy.");
1241 std::ignore = Queue;
1245 Redu.template withInitializedMem<Name>(CGH, [&](
auto Out) {
1246 size_t NElements = Reduction::num_elements;
1252 typename Reduction::reducer_type Reducer;
1253 KernelFunc(NDId, Reducer);
1256 auto LID = NDId.get_local_id(0);
1257 for (
size_t E = LID; E < NElements; E += NDId.get_local_range(0)) {
1263 Reducer.template atomic_combine<access::address_space::local_space>(
1270 for (
size_t E = 0; E < NElements; ++E) {
1273 Reducer.atomic_combine(&Out[0]);
1282 reduction::strategy::group_reduce_and_last_wg_detection> {
1283 template <
typename KernelName,
int Dims,
typename PropertiesT,
1284 typename KernelType,
typename Reduction>
1285 static void run(
handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1287 Reduction &Redu, KernelType &KernelFunc) {
1288 static_assert(Reduction::has_identity,
1289 "Identityless reductions are not supported by the "
1290 "group_reduce_and_last_wg_detection strategy.");
1292 std::ignore = Queue;
1293 size_t NElements = Reduction::num_elements;
1297 auto Out = Redu.getUserRedVarAccess(CGH);
1299 auto &PartialSumsBuf = Redu.getTempBuffer(NWorkGroups * NElements, CGH);
1302 bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
1303 auto Rest = [&](
auto NWorkGroupsFinished) {
1309 decltype(NWorkGroupsFinished)>;
1313 typename Reduction::reducer_type Reducer;
1314 KernelFunc(NDId, Reducer);
1316 typename Reduction::binary_operation BOp;
1317 auto Group = NDId.get_group();
1321 size_t LID = NDId.get_local_id(0);
1322 for (
size_t E = 0; E < NElements; ++E) {
1326 if (NWorkGroups == 1) {
1329 if (IsUpdateOfUserVar)
1330 RedElem = BOp(RedElem, Out[E]);
1333 PartialSums[NDId.get_group_linear_id() * NElements + E] =
1339 if (NWorkGroups == 1)
1351 NWorkGroupsFinished[0]);
1352 DoReducePartialSumsInLastWG[0] =
1353 ++NFinished ==
static_cast<int>(NWorkGroups);
1357 if (DoReducePartialSumsInLastWG[0]) {
1360 for (
size_t E = 0; E < NElements; ++E) {
1362 for (
size_t I = LID; I < NWorkGroups; I += WGSize)
1363 LocalSum = BOp(LocalSum, PartialSums[I * NElements + E]);
1367 if (IsUpdateOfUserVar)
1368 Result = BOp(Result, Out[E]);
1381 !
device.
has(aspect::usm_device_allocations))
1382 Rest(Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH));
1384 Rest(Redu.getGroupsCounterAccDiscrete(CGH));
1394 while ((N >>= 1) != 0)
1399 template <
typename FuncTy>
1415 if (Pivot != WorkSize) {
1416 if (Pivot + LID < WorkSize)
1417 Func(LID, Pivot + LID);
1422 for (
size_t CurPivot = Pivot >> 1; CurPivot > 0; CurPivot >>= 1) {
1424 Func(LID, CurPivot + LID);
1433 typename BinOpTy,
typename AccessFuncTy>
1435 BinOpTy &BOp, AccessFuncTy AccessFunc) {
1437 size_t AdjustedWorkSize;
1444 LocalReds[LID] = AccessFunc(LID);
1445 AdjustedWorkSize = WorkSize;
1450 AdjustedWorkSize = std::min(WorkSize, WGSize);
1451 if (LID < AdjustedWorkSize) {
1452 auto LocalSum = AccessFunc(LID);
1453 for (
size_t I = LID + WGSize; I < WorkSize; I += WGSize)
1454 LocalSum = BOp(LocalSum, AccessFunc(I));
1456 LocalReds[LID] = LocalSum;
1460 LocalReds[I] = BOp(LocalReds[I], LocalReds[J]);
1468 template <
typename... LocalAccT,
typename... BOPsT,
size_t... Is>
1472 std::index_sequence<Is...>) {
1474 auto ProcessOne = [=](
auto &LocalAcc,
auto &BOp) {
1475 LocalAcc[I] = BOp(LocalAcc[I], LocalAcc[J]);
1477 (ProcessOne(std::get<Is>(LocalAccs), std::get<Is>(BOPs)), ...);
1482 template <
typename KernelName,
int Dims,
typename PropertiesT,
1483 typename KernelType,
typename Reduction>
1484 static void run(
handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1486 Reduction &Redu, KernelType &KernelFunc) {
1487 using reducer_type =
typename Reduction::reducer_type;
1494 constexpr
bool UsePartialSumForOutput =
1495 !Reduction::is_usm && Reduction::has_identity;
1497 std::ignore = Queue;
1498 size_t NElements = Reduction::num_elements;
1502 bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
1504 Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);
1506 if constexpr (UsePartialSumForOutput)
1507 return (NWorkGroups == 1)
1509 : Redu.getWriteAccForPartialReds(NElements, CGH);
1511 return Redu.getUserRedVarAccess(CGH);
1514 auto NWorkGroupsFinished =
1515 Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH);
1518 auto IdentityContainer = Redu.getIdentityContainer();
1519 auto BOp = Redu.getBinaryOperation();
1526 reducer_type Reducer = reducer_type(IdentityContainer, BOp);
1527 KernelFunc(NDId, Reducer);
1529 auto ElementCombiner = [&](element_type &LHS,
const element_type &RHS) {
1530 return LHS.combine(BOp, RHS);
1535 size_t LID = NDId.get_local_linear_id();
1536 for (
size_t E = 0; E < NElements; ++E) {
1538 doTreeReduction<WorkSizeGuarantees::Equal>(
1539 WGSize, NDId, LocalReds, ElementCombiner,
1543 auto V = LocalReds[0];
1545 bool IsOneWG = NWorkGroups == 1;
1546 if (IsOneWG && IsUpdateOfUserVar)
1547 V.combine(BOp, Out[E]);
1551 if (UsePartialSumForOutput || !IsOneWG)
1552 PartialSums[NDId.get_group_linear_id() * NElements + E] = V;
1566 NWorkGroupsFinished[0]);
1567 DoReducePartialSumsInLastWG[0] =
1568 ++NFinished == NWorkGroups && NWorkGroups > 1;
1572 if (DoReducePartialSumsInLastWG[0]) {
1575 for (
size_t E = 0; E < NElements; ++E) {
1576 doTreeReduction<WorkSizeGuarantees::None>(
1577 NWorkGroups, NDId, LocalReds, ElementCombiner,
1578 [&](
size_t I) {
return PartialSums[I * NElements + E]; });
1580 auto V = LocalReds[0];
1581 if (IsUpdateOfUserVar)
1582 V.combine(BOp, Out[E]);
1593 template <
typename KernelName,
int Dims,
typename PropertiesT,
1594 typename KernelType,
typename Reduction>
1595 static void run(
handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1597 Reduction &Redu, KernelType &KernelFunc) {
1598 static_assert(Reduction::has_identity,
1599 "Identityless reductions are not supported by the "
1600 "group_reduce_and_atomic_cross_wg strategy.");
1602 std::ignore = Queue;
1606 Redu.template withInitializedMem<Name>(CGH, [&](
auto Out) {
1607 size_t NElements = Reduction::num_elements;
1611 typename Reduction::reducer_type Reducer;
1612 KernelFunc(NDIt, Reducer);
1614 typename Reduction::binary_operation BOp;
1615 for (
size_t E = 0; E < NElements; ++E) {
1619 if (NDIt.get_local_linear_id() == 0)
1620 Reducer.atomic_combine(&Out[0]);
1628 reduction::strategy::local_mem_tree_and_atomic_cross_wg> {
1629 template <
typename KernelName,
int Dims,
typename PropertiesT,
1630 typename KernelType,
typename Reduction>
1631 static void run(
handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1633 Reduction &Redu, KernelType &KernelFunc) {
1634 using reducer_type =
typename Reduction::reducer_type;
1637 std::ignore = Queue;
1641 Redu.template withInitializedMem<Name>(CGH, [&](
auto Out) {
1642 size_t NElements = Reduction::num_elements;
1651 reducer_type Reducer;
1652 KernelFunc(NDIt, Reducer);
1654 size_t WGSize = NDIt.get_local_range().size();
1655 size_t LID = NDIt.get_local_linear_id();
1657 typename Reduction::binary_operation BOp;
1658 auto ElementCombiner = [&](element_type &LHS,
const element_type &RHS) {
1659 return LHS.combine(BOp, RHS);
1664 for (
size_t E = 0; E < NElements; ++E) {
1666 doTreeReduction<WorkSizeGuarantees::Equal>(
1667 WGSize, NDIt, LocalReds, ElementCombiner,
1674 if (E != NElements - 1) {
1680 Reducer.atomic_combine(&Out[0]);
1689 reduction::strategy::group_reduce_and_multiple_kernels> {
1690 template <
typename KernelName,
int Dims,
typename PropertiesT,
1691 typename KernelType,
typename Reduction>
1692 static void run(
handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1694 Reduction &Redu, KernelType &KernelFunc) {
1695 static_assert(Reduction::has_identity,
1696 "Identityless reductions are not supported by the "
1697 "group_reduce_and_multiple_kernels strategy.");
1707 constexpr
bool HFR = Reduction::has_fast_reduce;
1708 size_t OneElemSize = HFR ? 0 :
sizeof(
typename Reduction::result_type);
1715 "The implementation handling parallel_for with"
1716 " reduction requires work group size not bigger"
1718 std::to_string(MaxWGSize));
1720 size_t NElements = Reduction::num_elements;
1722 auto Out = Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);
1724 bool IsUpdateOfUserVar =
1725 !Reduction::is_usm && !Redu.initializeToIdentity() && NWorkGroups == 1;
1733 typename Reduction::reducer_type Reducer;
1734 KernelFunc(NDIt, Reducer);
1737 size_t WGID = NDIt.get_group_linear_id();
1738 typename Reduction::binary_operation BOp;
1739 for (
size_t E = 0; E < NElements; ++E) {
1740 typename Reduction::result_type PSum;
1743 if (NDIt.get_local_linear_id() == 0) {
1744 if (IsUpdateOfUserVar)
1745 PSum = BOp(*Out[E], PSum);
1746 Out[WGID * NElements + E] = PSum;
1760 "The implementation handling parallel_for with "
1761 "reduction requires the maximal work group "
1762 "size to be greater than 1 to converge. "
1763 "The maximal work group size depends on the "
1764 "device and the size of the objects passed to "
1767 while (NWorkItems > 1) {
1769 size_t NElements = Reduction::num_elements;
1776 bool HasUniformWG = NWorkGroups * WGSize == NWorkItems;
1777 if (!Reduction::has_fast_reduce)
1778 HasUniformWG = HasUniformWG && (WGSize & (WGSize - 1)) == 0;
1782 auto In = Redu.getReadAccToPreviousPartialReds(AuxHandler);
1784 Redu.getWriteAccForPartialReds(NWorkGroups * NElements, AuxHandler);
1790 bool IsUpdateOfUserVar = !Reduction::is_usm &&
1791 !Redu.initializeToIdentity() &&
1793 range<1> GlobalRange = {HasUniformWG ? NWorkItems
1794 : NWorkGroups * WGSize};
1797 typename Reduction::binary_operation BOp;
1798 size_t WGID = NDIt.get_group_linear_id();
1799 size_t GID = NDIt.get_global_linear_id();
1801 for (
size_t E = 0; E < NElements; ++E) {
1802 typename Reduction::result_type PSum =
1803 (HasUniformWG || (GID < NWorkItems))
1804 ? *In[GID * NElements + E]
1808 if (NDIt.get_local_linear_id() == 0) {
1809 if (IsUpdateOfUserVar)
1810 PSum = BOp(*Out[E], PSum);
1811 Out[WGID * NElements + E] = PSum;
1815 NWorkItems = NWorkGroups;
1819 if constexpr (Reduction::is_usm) {
1821 reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
1828 template <
typename KernelName,
int Dims,
typename PropertiesT,
1829 typename KernelType,
typename Reduction>
1830 static void run(
handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1832 Reduction &Redu, KernelType &KernelFunc) {
1833 using element_type =
typename Reduction::reducer_element_type;
1835 constexpr
bool HFR = Reduction::has_fast_reduce;
1836 size_t OneElemSize = HFR ? 0 :
sizeof(element_type);
1843 "The implementation handling parallel_for with"
1844 " reduction requires work group size not bigger"
1846 std::to_string(MaxWGSize));
1850 bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
1851 std::ignore = IsUpdateOfUserVar;
1856 auto First = [&](
auto KernelTag) {
1858 constexpr
bool IsOneWG =
1859 std::is_same_v<std::remove_reference_t<decltype(KernelTag)>,
1862 constexpr
size_t NElements = Reduction::num_elements;
1867 if constexpr (IsOneWG)
1868 return Redu.getUserRedVarAccess(CGH);
1870 return Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);
1876 auto BOp = Redu.getBinaryOperation();
1877 auto IdentityContainer = Redu.getIdentityContainer();
1881 decltype(KernelTag)>;
1885 typename Reduction::reducer_type Reducer =
1886 typename Reduction::reducer_type(IdentityContainer, BOp);
1887 KernelFunc(NDIt, Reducer);
1889 size_t WGSize = NDIt.get_local_range().size();
1890 size_t LID = NDIt.get_local_linear_id();
1892 auto ElementCombiner = [&](element_type &LHS,
const element_type &RHS) {
1893 return LHS.combine(BOp, RHS);
1898 for (
size_t E = 0; E < NElements; ++E) {
1900 doTreeReduction<WorkSizeGuarantees::Equal>(
1901 WGSize, NDIt, LocalReds, ElementCombiner,
1906 element_type PSum = LocalReds[0];
1907 if constexpr (IsOneWG) {
1908 if (IsUpdateOfUserVar)
1909 PSum.combine(BOp, Out[E]);
1912 size_t GrID = NDIt.get_group_linear_id();
1913 Out[GrID * NElements + E] = PSum;
1918 if (E != NElements - 1) {
1925 if (NWorkGroups == 1)
1939 "The implementation handling parallel_for with "
1940 "reduction requires the maximal work group "
1941 "size to be greater than 1 to converge. "
1942 "The maximal work group size depends on the "
1943 "device and the size of the objects passed to "
1946 while (NWorkItems > 1) {
1950 auto Rest = [&](
auto KernelTag) {
1953 constexpr
bool IsOneWG =
1954 std::is_same_v<std::remove_reference_t<decltype(KernelTag)>,
1957 constexpr
size_t NElements = Reduction::num_elements;
1963 bool HasUniformWG = NWorkGroups * WGSize == NWorkItems;
1967 auto In = Redu.getReadAccToPreviousPartialReds(AuxHandler);
1970 if constexpr (IsOneWG)
1971 return Redu.getUserRedVarAccess(AuxHandler);
1973 return Redu.getWriteAccForPartialReds(NWorkGroups * NElements,
1977 bool UniformPow2WG = HasUniformWG && (WGSize & (WGSize - 1)) == 0;
1982 auto BOp = Redu.getBinaryOperation();
1985 decltype(KernelTag)>;
1987 range<1> GlobalRange = {UniformPow2WG ? NWorkItems
1988 : NWorkGroups * WGSize};
1991 size_t WGSize = NDIt.get_local_range().size();
1992 size_t LID = NDIt.get_local_linear_id();
1993 size_t GID = NDIt.get_global_linear_id();
1994 size_t GrID = NDIt.get_group_linear_id();
1997 size_t RemainingWorkSize =
1998 sycl::min(WGSize, NWorkItems - GrID * WGSize);
2000 auto ElementCombiner = [&](element_type &LHS,
2001 const element_type &RHS) {
2002 return LHS.combine(BOp, RHS);
2005 for (
size_t E = 0; E < NElements; ++E) {
2007 doTreeReduction<WorkSizeGuarantees::LessOrEqual>(
2008 RemainingWorkSize, NDIt, LocalReds, ElementCombiner,
2009 [&](
size_t) {
return In[GID * NElements + E]; });
2013 element_type PSum = LocalReds[0];
2014 if constexpr (IsOneWG) {
2015 if (IsUpdateOfUserVar)
2016 PSum.combine(BOp, Out[E]);
2019 Out[GrID * NElements + E] = PSum;
2024 if (E != NElements - 1) {
2029 NWorkItems = NWorkGroups;
2033 if (NWorkGroups == 1)
2044 template <
bool IsOneWG,
typename... Reductions,
size_t... Is>
2046 std::tuple<Reductions...> &ReduTuple,
2047 std::index_sequence<Is...>) {
2049 std::get<Is>(ReduTuple).
template getWriteMemForPartialReds<IsOneWG>(
2055 template <
typename OutAccT,
typename LocalAccT,
typename BOPT,
2056 typename IdentityContainerT>
2058 IdentityContainerT IdentityContainer,
2059 bool IsInitializeToIdentity) {
2060 if constexpr (!IdentityContainerT::has_identity) {
2061 return BOP(LocalAcc[0], OutAcc[0]);
2063 return BOP(LocalAcc[0], IsInitializeToIdentity
2064 ? IdentityContainer.getIdentity()
2069 template <
bool IsOneWG,
typename... Reductions,
typename... OutAccT,
2070 typename... LocalAccT,
typename... BOPsT,
typename... Ts,
2076 std::array<
bool,
sizeof...(Reductions)> IsInitializeToIdentity,
2077 std::index_sequence<Is...>) {
2078 if constexpr (IsOneWG) {
2082 std::get<Is>(OutAccs), std::get<Is>(LocalAccs), std::get<Is>(BOPs),
2083 std::get<Is>(IdentityVals), IsInitializeToIdentity[Is])),
2085 ((std::get<Is>(OutAccs)[OutAccIndex] = *std::get<Is>(LocalAccs)[0]), ...);
2089 ((std::get<Is>(OutAccs)[OutAccIndex] = std::get<Is>(LocalAccs)[0]), ...);
2105 template <
size_t... Is,
size_t... Js>
2106 constexpr std::index_sequence<Is..., Js...>
2112 template <
size_t... Is,
size_t... Js,
class... Rs>
2114 std::index_sequence<Js...>, Rs...) {
2119 template <
typename T>
struct Func {
2120 static constexpr
bool value = !std::remove_pointer_t<T>::is_usm;
2125 template <
typename T>
struct Func {
2132 std::conditional_t<Cond, std::index_sequence<I>, std::index_sequence<>>;
2140 template <
typename... T,
typename FunctorT,
size_t... Is,
2141 std::enable_if_t<(
sizeof...(Is) > 0),
int> Z = 0>
2145 Is, std::tuple<T...>>>::value,
2148 template <
typename... T,
typename FunctorT,
size_t... Is,
2149 std::enable_if_t<(
sizeof...(Is) == 0),
int> Z = 0>
2151 return std::index_sequence<>{};
2157 template <
typename... T,
typename FunctorT,
size_t... Is>
2163 template <
typename Reduction>
struct Func {
2165 (Reduction::dims == 0 && Reduction::num_elements == 1);
2170 template <
typename Reduction>
struct Func {
2172 (Reduction::dims == 1 && Reduction::num_elements >= 1);
2176 template <
typename ElementType,
typename BOPT>
2178 return [&](ElementType &LHS,
const ElementType &RHS) {
2179 return LHS.combine(BOP, RHS);
2183 template <
typename... Reductions,
typename... BOPsT,
size_t... Is>
2185 std::index_sequence<Is...>) {
2188 Is, std::tuple<Reductions...>>::reducer_element_type>(
2189 std::get<Is>(BOPsTuple))...);
2192 template <
typename... Reductions,
typename... BOPsT>
2195 BOPsTuple, std::make_index_sequence<
sizeof...(Reductions)>{});
2200 template <
bool IsOneWG,
typename... Reductions,
int Dims,
typename... LocalAccT,
2201 typename... OutAccT,
typename... ReducerT,
typename... Ts,
2202 typename... BOPsT,
size_t... Is>
2207 std::array<
bool,
sizeof...(Reductions)> InitToIdentityProps,
2208 std::index_sequence<Is...> ReduIndices) {
2212 ((std::get<Is>(LocalAccsTuple)[LID] =
2227 GrID, OutAccsTuple, LocalAccsTuple, AdjustedBOPsTuple, IdentitiesTuple,
2228 InitToIdentityProps, ReduIndices);
2233 template <
bool IsOneWG,
typename Reduction,
int Dims,
typename LocalAccT,
2234 typename OutAccT,
typename ReducerT,
typename BOPT>
2236 OutAccT Out, ReducerT &Reducer, BOPT BOp,
2237 bool IsInitializeToIdentity) {
2238 using element_type =
typename Reduction::reducer_element_type;
2243 auto ElementCombiner = [&](element_type &LHS,
const element_type &RHS) {
2244 return LHS.combine(BOp, RHS);
2249 auto NElements = Reduction::num_elements;
2250 for (
size_t E = 0; E < NElements; ++E) {
2251 doTreeReduction<WorkSizeGuarantees::Equal>(
2252 WGSize, NDIt, LocalReds, ElementCombiner,
2258 size_t OutIdx = GrID * NElements + E;
2259 if constexpr (IsOneWG) {
2262 if constexpr (Reduction::has_identity) {
2263 Out[OutIdx] = *ElementCombiner(LocalReds[0], IsInitializeToIdentity
2264 ? Reducer.identity()
2267 Out[OutIdx] = *LocalReds[0];
2271 Out[OutIdx] = LocalReds[0];
2276 if (E != NElements - 1) {
2282 template <
bool IsOneWG,
typename... Reductions,
int Dims,
typename... LocalAccT,
2283 typename... OutAccT,
typename... ReducerT,
typename... BOPsT,
2289 std::array<
bool,
sizeof...(Reductions)> InitToIdentityProps,
2290 std::index_sequence<Is...>) {
2291 using ReductionPack = std::tuple<Reductions...>;
2292 (reduCGFuncImplArrayHelper<IsOneWG, std::tuple_element_t<Is, ReductionPack>>(
2293 NDIt, std::get<Is>(LocalAccsTuple), std::get<Is>(OutAccsTuple),
2294 std::get<Is>(ReducersTuple), std::get<Is>(BOPsTuple),
2295 InitToIdentityProps[Is]),
2299 namespace reduction::main_krn {
2302 template <
typename KernelName,
typename KernelType,
int Dims,
2303 typename PropertiesT,
typename... Reductions,
size_t... Is>
2306 std::tuple<Reductions...> &ReduTuple,
2307 std::index_sequence<Is...> ReduIndices) {
2319 auto ScalarIs =
filterSequence<Reductions...>(ScalarPredicate, ReduIndices);
2322 auto ArrayIs =
filterSequence<Reductions...>(ArrayPredicate, ReduIndices);
2331 auto Rest = [&](
auto KernelTag,
auto OutAccsTuple) {
2332 auto IdentitiesTuple =
2333 makeReduTupleT(std::get<Is>(ReduTuple).getIdentityContainer()...);
2336 std::array InitToIdentityProps{
2337 std::get<Is>(ReduTuple).initializeToIdentity()...};
2341 decltype(KernelTag)>;
2345 constexpr
bool IsOneWG =
2346 std::is_same_v<std::remove_reference_t<decltype(KernelTag)>,
2351 auto ReduIndices = std::index_sequence_for<Reductions...>();
2352 auto ReducerTokensTuple =
2353 std::tuple{
typename Reductions::reducer_token_type{
2354 std::get<Is>(IdentitiesTuple), std::get<Is>(BOPsTuple)}...};
2355 auto ReducersTuple = std::tuple<
typename Reductions::reducer_type...>{
2356 std::get<Is>(ReducerTokensTuple)...};
2357 std::apply([&](
auto &...Reducers) { KernelFunc(NDIt, Reducers...); },
2365 NDIt, LocalAccsTuple, OutAccsTuple, ReducersTuple, IdentitiesTuple,
2366 BOPsTuple, InitToIdentityProps, ScalarIs);
2373 NDIt, LocalAccsTuple, OutAccsTuple, ReducersTuple, BOPsTuple,
2374 InitToIdentityProps, ArrayIs);
2379 if (NWorkGroups == 1)
2381 createReduOutAccs<true>(NWorkGroups, CGH, ReduTuple, ReduIndices));
2384 createReduOutAccs<false>(NWorkGroups, CGH, ReduTuple, ReduIndices));
2388 template <
typename... Reductions,
size_t... Is>
2390 std::tuple<Reductions...> &ReduTuple,
2391 std::index_sequence<Is...>) {
2392 auto ProcessOne = [&CGH](
auto Redu) {
2393 if constexpr (!decltype(Redu)::is_usm)
2394 Redu.getUserRedVarAccess(CGH);
2396 (ProcessOne(std::get<Is>(ReduTuple)), ...);
2401 template <
bool IsOneWG,
typename... Reductions,
int Dims,
typename... LocalAccT,
2402 typename... InAccT,
typename... OutAccT,
typename... Ts,
2403 typename... BOPsT,
size_t... Is>
2405 nd_item<Dims> NDIt,
size_t LID,
size_t GID,
size_t RemainingWorkSize,
2409 std::array<
bool,
sizeof...(Reductions)> InitToIdentityProps,
2410 std::index_sequence<Is...> ReduIndices) {
2413 if (LID < RemainingWorkSize)
2414 ((std::get<Is>(LocalAccsTuple)[LID] = std::get<Is>(InAccsTuple)[GID]), ...);
2421 AdjustedBOPsTuple, ReduIndices);
2427 GrID, OutAccsTuple, LocalAccsTuple, AdjustedBOPsTuple, IdentitiesTuple,
2428 InitToIdentityProps, ReduIndices);
2432 template <
bool IsOneWG,
typename Reduction,
int Dims,
typename LocalAccT,
2433 typename InAccT,
typename OutAccT,
typename T,
typename BOPT>
2435 size_t RemainingWorkSize, LocalAccT LocalReds,
2436 InAccT In, OutAccT Out, T IdentityContainer,
2437 BOPT BOp,
bool IsInitializeToIdentity) {
2438 using element_type =
typename Reduction::reducer_element_type;
2439 auto ElementCombiner = [&](element_type &LHS,
const element_type &RHS) {
2440 return LHS.combine(BOp, RHS);
2445 auto NElements = Reduction::num_elements;
2446 for (
size_t E = 0; E < NElements; ++E) {
2447 doTreeReduction<WorkSizeGuarantees::LessOrEqual>(
2448 RemainingWorkSize, NDIt, LocalReds, ElementCombiner,
2449 [&](
size_t) {
return In[GID * NElements + E]; });
2454 size_t OutIdx = GrID * NElements + E;
2455 if constexpr (IsOneWG) {
2458 if constexpr (Reduction::has_identity) {
2459 Out[OutIdx] = *ElementCombiner(LocalReds[0],
2460 IsInitializeToIdentity
2461 ? IdentityContainer.getIdentity()
2464 Out[OutIdx] = *LocalReds[0];
2468 Out[OutIdx] = LocalReds[0];
2473 if (E != NElements - 1) {
2479 template <
bool IsOneWG,
typename... Reductions,
int Dims,
typename... LocalAccT,
2480 typename... InAccT,
typename... OutAccT,
typename... Ts,
2481 typename... BOPsT,
size_t... Is>
2483 nd_item<Dims> NDIt,
size_t LID,
size_t GID,
size_t RemainingWorkSize,
2487 std::array<
bool,
sizeof...(Reductions)> InitToIdentityProps,
2488 std::index_sequence<Is...>) {
2489 using ReductionPack = std::tuple<Reductions...>;
2491 std::tuple_element_t<Is, ReductionPack>>(
2492 NDIt, LID, GID, RemainingWorkSize, std::get<Is>(LocalAccsTuple),
2493 std::get<Is>(InAccsTuple), std::get<Is>(OutAccsTuple),
2494 std::get<Is>(IdentitiesTuple), std::get<Is>(BOPsTuple),
2495 InitToIdentityProps[Is]),
2499 namespace reduction::aux_krn {
2500 template <
class KernelName,
class Predicate>
struct Multi;
2502 template <
typename KernelName,
typename KernelType,
typename... Reductions,
2505 std::tuple<Reductions...> &ReduTuple,
2506 std::index_sequence<Is...> ReduIndices) {
2510 bool Pow2WG = (WGSize & (WGSize - 1)) == 0;
2511 bool HasUniformWG = Pow2WG && (NWorkGroups * WGSize == NWorkItems);
2515 auto ScalarIs =
filterSequence<Reductions...>(ScalarPredicate, ReduIndices);
2518 auto ArrayIs =
filterSequence<Reductions...>(ArrayPredicate, ReduIndices);
2524 std::get<Is>(ReduTuple).getReadAccToPreviousPartialReds(CGH)...);
2526 auto IdentitiesTuple =
2527 makeReduTupleT(std::get<Is>(ReduTuple).getIdentityContainer()...);
2530 std::array InitToIdentityProps{
2531 std::get<Is>(ReduTuple).initializeToIdentity()...};
2536 auto Rest = [&](
auto Predicate,
auto OutAccsTuple) {
2537 auto AccReduIndices =
filterSequence<Reductions...>(Predicate, ReduIndices);
2541 decltype(Predicate)>;
2543 range<1> GlobalRange = {HasUniformWG ? NWorkItems : NWorkGroups * WGSize};
2547 constexpr
bool IsOneWG =
2548 std::is_same_v<std::remove_reference_t<decltype(Predicate)>,
2552 size_t RemainingWorkSize =
2559 NDIt, LID, GID, RemainingWorkSize, LocalAccsTuple, InAccsTuple,
2560 OutAccsTuple, IdentitiesTuple, BOPsTuple, InitToIdentityProps,
2563 NDIt, LID, GID, RemainingWorkSize, LocalAccsTuple, InAccsTuple,
2564 OutAccsTuple, IdentitiesTuple, BOPsTuple, InitToIdentityProps,
2568 if (NWorkGroups == 1)
2570 createReduOutAccs<true>(NWorkGroups, CGH, ReduTuple, ReduIndices));
2573 createReduOutAccs<false>(NWorkGroups, CGH, ReduTuple, ReduIndices));
2579 return sizeof(
typename Reduction::result_type);
2582 template <
typename Reduction,
typename... RestT>
2584 return sizeof(
typename Reduction::result_type) +
2588 template <
typename... ReductionT,
size_t... Is>
2590 std::index_sequence<Is...>) {
2596 template <
typename TupleT, std::size_t... Is>
2597 std::tuple<std::tuple_element_t<Is, TupleT>...>
2599 return {std::get<Is>(std::move(Tuple))...};
2603 template <
typename KernelName,
int Dims,
typename PropertiesT,
2605 static void run(
handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
2608 std::tuple<RestT...> ArgsTuple(Rest...);
2609 constexpr
size_t NumArgs =
sizeof...(RestT);
2610 auto KernelFunc =
std::get<NumArgs - 1>(ArgsTuple);
2611 auto ReduIndices = std::make_index_sequence<NumArgs - 1>();
2621 "The implementation handling parallel_for with"
2622 " reduction requires work group size not bigger"
2624 std::to_string(MaxWGSize));
2626 reduCGFuncMulti<KernelName>(CGH, KernelFunc, NDRange, Properties, ReduTuple,
2631 while (NWorkItems > 1) {
2633 NWorkItems = reduAuxCGFunc<KernelName, decltype(KernelFunc)>(
2634 AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
2643 template <reduction::strategy Strategy>
2647 template <
typename KernelName,
int Dims,
typename PropertiesT,
2648 typename KernelType,
typename Reduction>
2649 static void run(
handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
2651 Reduction &Redu, KernelType &KernelFunc) {
2652 auto Delegate = [&](
auto Impl) {
2653 Impl.template run<KernelName>(CGH, Queue, NDRange, Properties, Redu,
2657 if constexpr (Reduction::has_float64_atomics) {
2661 if constexpr (Reduction::has_fast_reduce)
2665 }
else if constexpr (Reduction::has_fast_atomics) {
2666 if constexpr (
sizeof(
typename Reduction::result_type) == 8) {
2673 if constexpr (Reduction::has_fast_reduce)
2680 if constexpr (Reduction::has_fast_reduce) {
2686 if constexpr (Reduction::has_fast_reduce)
2692 assert(
false &&
"Must be unreachable!");
2694 template <
typename KernelName,
int Dims,
typename PropertiesT,
2696 static void run(
handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
2705 typename PropertiesT,
typename... RestT>
2707 PropertiesT Properties, RestT... Rest) {
2709 Properties, Rest...);
2712 __SYCL_EXPORT uint32_t
2716 typename PropertiesT,
typename... RestT>
2718 PropertiesT Properties, RestT... Rest) {
2719 std::tuple<RestT...> ArgsTuple(Rest...);
2720 constexpr
size_t NumArgs =
sizeof...(RestT);
2721 static_assert(NumArgs > 1,
"No reduction!");
2722 auto KernelFunc =
std::get<NumArgs - 1>(ArgsTuple);
2723 auto ReduIndices = std::make_index_sequence<NumArgs - 1>();
2728 size_t OneElemSize = [&]() {
2730 if constexpr (
sizeof...(RestT) == 2) {
2732 constexpr
bool IsTreeReduction =
2733 !Reduction::has_fast_reduce && !Reduction::has_fast_atomics;
2734 return IsTreeReduction ?
sizeof(
typename Reduction::result_type) : 0;
2740 uint32_t NumConcurrentWorkGroups =
2741 #ifdef __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS
2742 __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS;
2752 size_t NWorkItems = Range.
size();
2753 size_t WGSize = std::min(NWorkItems, PrefWGSize);
2754 size_t NWorkGroups = NWorkItems / WGSize;
2755 if (NWorkItems % WGSize)
2757 size_t MaxNWorkGroups = NumConcurrentWorkGroups;
2758 NWorkGroups = std::min(NWorkGroups, MaxNWorkGroups);
2759 size_t NDRItems = NWorkGroups * WGSize;
2762 size_t PerGroup = Range.
size() / NWorkGroups;
2769 auto UpdatedKernelFunc = [=](
auto NDId,
auto &...Reducers) {
2773 auto Group = NDId.get_group();
2774 size_t GroupId = Group.get_group_linear_id();
2775 size_t NumGroups = Group.get_group_linear_range();
2776 bool LastGroup = (GroupId == NumGroups - 1);
2777 size_t GroupStart = GroupId * PerGroup;
2778 size_t GroupEnd = LastGroup ? Range.
size() : (GroupStart + PerGroup);
2781 size_t Start = GroupStart + NDId.get_local_id(0);
2782 size_t End = GroupEnd;
2783 size_t Stride = NDId.get_local_range(0);
2784 auto GetDelinearized = [&](
size_t I) {
2786 if constexpr (std::is_invocable_v<decltype(KernelFunc),
id<Dims>,
2787 decltype(Reducers)...>)
2794 for (
size_t I = Start; I < End; I += Stride)
2795 KernelFunc(GetDelinearized(I), Reducers...);
2797 if constexpr (NumArgs == 2) {
2799 auto &Redu = std::get<0>(ReduTuple);
2801 constexpr
auto StrategyToUse = [&]() {
2816 if constexpr (Reduction::has_fast_reduce && Reduction::has_identity)
2818 else if constexpr (Reduction::has_fast_atomics &&
2819 sizeof(
typename Reduction::result_type) != 8)
2825 reduction_parallel_for<KernelName, StrategyToUse>(CGH, NDRange, Properties,
2826 Redu, UpdatedKernelFunc);
2829 [&](
auto &...Reds) {
2830 return reduction_parallel_for<KernelName, Strategy>(
2831 CGH, NDRange, Properties, Reds..., UpdatedKernelFunc);
2842 template <
typename T,
typename AllocatorT,
typename BinaryOperation>
2844 BinaryOperation Combiner,
const property_list &PropList = {}) {
2847 PropList.has_property<property::reduction::initialize_to_identity>();
2848 return detail::make_reduction<BinaryOperation, 0, 1, false>(
2857 template <
typename T,
typename BinaryOperation>
2861 PropList.has_property<property::reduction::initialize_to_identity>();
2862 return detail::make_reduction<BinaryOperation, 0, 1, false>(
2869 template <
typename T,
typename AllocatorT,
typename BinaryOperation>
2871 BinaryOperation Combiner,
const property_list &PropList = {}) {
2874 PropList.has_property<property::reduction::initialize_to_identity>();
2875 return detail::make_reduction<BinaryOperation, 0, 1, true>(
2882 template <
typename T,
typename BinaryOperation>
2883 auto reduction(T *Var,
const T &Identity, BinaryOperation Combiner,
2886 PropList.has_property<property::reduction::initialize_to_identity>();
2887 return detail::make_reduction<BinaryOperation, 0, 1, true>(
2896 template <
typename T,
size_t Extent,
typename BinaryOperation,
2897 typename = std::enable_if_t<Extent != dynamic_extent>>
2901 PropList.has_property<property::reduction::initialize_to_identity>();
2902 return detail::make_reduction<BinaryOperation, 1, Extent, false>(
2909 template <
typename T,
size_t Extent,
typename BinaryOperation,
2910 typename = std::enable_if_t<Extent != dynamic_extent>>
2912 BinaryOperation Combiner,
const property_list &PropList = {}) {
2914 PropList.has_property<property::reduction::initialize_to_identity>();
2915 return detail::make_reduction<BinaryOperation, 1, Extent, true>(
The file contains implementations of accessor class.
Defines a shared array that can be used by kernels in queues.
Helper class for accessing internal reducer member functions.
constexpr auto getIdentity()
auto & getElement(size_t E)
static constexpr auto getIdentityStatic()
ReducerAccess(ReducerT &ReducerRef)
constexpr T & operator*() noexcept
ReducerElement & combine(BinaryOperation BinOp, const T &OtherValue)
ReducerElement & combine(BinaryOperation BinOp, const ReducerElement &Other)
ReducerElement(const ReductionIdentityContainer< T, BinaryOperation, ExplicitIdentity > &IdentityContainer)
constexpr const T & operator*() const noexcept
static constexpr T getIdentity()
Returns the statically known identity value.
ReductionIdentityContainer(const T &)
ReductionIdentityContainer()
T getIdentity() const
Returns the identity value given by user.
ReductionIdentityContainer(const T &Identity)
Templated class for common functionality of all reduction implementation classes.
This class is the default KernelName template parameter type for kernel invocation APIs such as singl...
Use CRTP to avoid redefining shorthand operators in terms of combine.
std::enable_if_t< BasicCheck< _T, Space, _BinaryOperation > &&(IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value||IsReduOptForAtomic64Op< _T, _BinaryOperation >::value) &&IsMaximum< _T, _BinaryOperation >::value > atomic_combine(_T *ReduVarPtr) const
Atomic MAX operation: *ReduVarPtr = sycl::maximum(*ReduVarPtr, MValue);.
std::enable_if_t< BasicCheck< _T, Space, _BinaryOperation > &&(IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value||IsReduOptForAtomic64Op< _T, _BinaryOperation >::value) &&IsPlus< _T, _BinaryOperation >::value > atomic_combine(_T *ReduVarPtr) const
Atomic ADD operation: *ReduVarPtr += MValue;.
std::enable_if_t< BasicCheck< _T, Space, _BinaryOperation > &&IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value &&IsBitOR< _T, _BinaryOperation >::value > atomic_combine(_T *ReduVarPtr) const
Atomic BITWISE OR operation: *ReduVarPtr |= MValue;.
std::enable_if_t< std::is_same_v< remove_decoration_t< _T >, _T > &&IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value &&IsBitAND< _T, _BinaryOperation >::value &&(Space==access::address_space::global_space||Space==access::address_space::local_space)> atomic_combine(_T *ReduVarPtr) const
Atomic BITWISE AND operation: *ReduVarPtr &= MValue;.
std::enable_if_t< BasicCheck< _T, Space, _BinaryOperation > &&(IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value||IsReduOptForAtomic64Op< _T, _BinaryOperation >::value) &&IsMinimum< _T, _BinaryOperation >::value > atomic_combine(_T *ReduVarPtr) const
Atomic MIN operation: *ReduVarPtr = sycl::minimum(*ReduVarPtr, MValue);.
std::enable_if_t< BasicCheck< _T, Space, _BinaryOperation > &&IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value &&IsBitXOR< _T, _BinaryOperation >::value > atomic_combine(_T *ReduVarPtr) const
Atomic BITWISE XOR operation: *ReduVarPtr ^= MValue;.
BinaryOperation binary_operation
static constexpr int dimensions
const identity_container_type & getIdentityContainer()
auto & getTempBuffer(size_t Size, handler &CGH)
BinaryOperation getBinaryOperation() const
Returns the binary operation associated with the reduction.
static constexpr bool has_fast_reduce
static constexpr size_t dims
BinaryOperation binary_operation
typename ReducerTraits< reducer_type >::element_type reducer_element_type
std::enable_if_t<!HasIdentity > withInitializedMem(handler &CGH, FuncTy Func)
static constexpr bool has_fast_atomics
auto getWriteMemForPartialReds(size_t Size, handler &CGH)
static constexpr bool has_float64_atomics
static constexpr size_t num_elements
std::enable_if_t< HasIdentity > withInitializedMem(handler &CGH, FuncTy Func)
Provide Func with a properly initialized memory to write the reduction result to.
static constexpr bool is_known_identity
bool initializeToIdentity() const
auto getUserRedVarAccess(handler &CGH)
reduction_impl_algo(BinaryOperation BinaryOp, bool Init, RedOutVar RedOut, std::enable_if_t<!IsKnownIdentityOp< RelayT, RelayBinaryOperation >::value, int >=0)
static constexpr bool is_usm
reduction_impl_algo(const T &Identity, BinaryOperation BinaryOp, bool Init, RedOutVar RedOut)
reduction_impl_algo(BinaryOperation BinaryOp, bool Init, RedOutVar RedOut, std::enable_if_t< IsKnownIdentityOp< RelayT, RelayBinaryOperation >::value, int >=0)
auto getWriteAccForPartialReds(size_t Size, handler &CGH)
Returns an accessor accessing the memory that will hold the reduction partial sums.
auto getReadAccToPreviousPartialReds(handler &CGH) const
static constexpr bool has_identity
accessor< int, 1, access::mode::read_write, access::target::device, access::placeholder::false_t > getReadWriteAccessorToInitializedGroupsCounter(handler &CGH)
ReductionIdentityContainer< T, BinaryOperation, ExplicitIdentity > identity_container_type
auto getGroupsCounterAccDiscrete(handler &CGH)
Base non-template class which is a base class for all reduction implementation classes.
This class encapsulates the reduction variable/accessor, the reduction operator and an optional opera...
reduction_impl(RedOutVar Var, BinaryOperation BOp, bool InitializeToIdentity=false)
Constructs reduction_impl when no identity is specified.
static constexpr bool is_known_identity
static constexpr bool is_usm
reduction_impl(RedOutVar &Var, const T &Identity, BinaryOperation BOp, bool InitializeToIdentity)
Constructs reduction_impl with an explicit identity value.
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
detail::is_device_info_desc< Param >::return_type get_info() const
Queries this SYCL device for information requested by the template parameter param.
bool has(aspect Aspect) const __SYCL_WARN_IMAGE_ASPECT(Aspect)
Indicates if the SYCL device has the given feature.
Command group handler class.
void depends_on(event Event)
Registers event dependencies on this command group.
void single_task(_KERNELFUNCPARAM(KernelFunc))
Defines and invokes a SYCL kernel function as a function object type.
void parallel_for(range< 1 > NumWorkItems, _KERNELFUNCPARAM(KernelFunc))
void copy(accessor< T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder > Src, std::shared_ptr< T_Dst > Dst)
Copies the content of memory object accessed by Src into the memory pointed by Dst.
Provides a cross-platform math array class template that works on SYCL devices as well as in host C++...
Identifies an instance of the function object executing at each point in an nd_range.
size_t get_local_linear_id() const
size_t __SYCL_ALWAYS_INLINE get_group_linear_id() const
range< Dimensions > get_local_range() const
size_t __SYCL_ALWAYS_INLINE get_global_linear_id() const
void barrier(access::fence_space accessSpace=access::fence_space::global_and_local) const
Defines the iteration domain of both the work-groups and the overall dispatch.
range< Dimensions > get_local_range() const
range< Dimensions > get_group_range() const
Objects of the property_list class are containers for the SYCL properties.
Encapsulates a single SYCL queue which schedules kernels on a SYCL device.
device get_device() const
reducer(const reducer &)=delete
reducer & operator=(const reducer &)=delete
reducer(const IdentityContainerT &IdentityContainer, BinaryOperation BOp)
std::enable_if_t< HasIdentityRelay &&(HasIdentityRelay==has_identity), T > identity() const
reducer(reducer &&)=delete
reducer & operator=(reducer &&)=delete
reducer(const detail::ReducerToken< BinaryOperation, IdentityContainerT > &Token)
reducer & combine(const T &Partial)
reducer(reducer &&)=delete
reducer(const IdentityContainerT &, BinaryOperation)
reducer(const detail::ReducerToken< BinaryOperation, IdentityContainerT > &Token)
reducer< T, BinaryOperation, Dims - 1, Extent, IdentityContainerT, true > operator[](size_t Index)
reducer & operator=(reducer &&)=delete
reducer(const reducer &)=delete
reducer & operator=(const reducer &)=delete
reducer(const reducer &)=delete
reducer & operator=(reducer &&)=delete
reducer(const detail::ReducerToken< BinaryOperation, IdentityContainerT > &Token)
reducer(const IdentityContainerT &IdentityContainer, BinaryOperation BOp)
reducer & operator=(const reducer &)=delete
reducer< T, BinaryOperation, Dims - 1, Extent, IdentityContainerT, true > operator[](size_t Index)
reducer(reducer &&)=delete
std::enable_if_t< HasIdentityRelay &&(HasIdentityRelay==has_identity), T > identity() const
reducer & combine(const T &Partial)
reducer(const reducer &)=delete
reducer & operator=(reducer &&)=delete
reducer & operator=(const reducer &)=delete
reducer(reducer &&)=delete
reducer(const IdentityContainerT &, BinaryOperation)
reducer(const detail::ReducerToken< BinaryOperation, IdentityContainerT > &Token)
reducer(element_type &Ref, BinaryOperation BOp)
reducer(const reducer &)=delete
reducer & operator=(const reducer &)=delete
reducer & operator=(reducer &&)=delete
reducer(const detail::ReducerToken< BinaryOperation, IdentityContainerT > &Token)
reducer & combine(const T &Partial)
reducer(reducer &&)=delete
Class that is used to represent objects that are passed to user's lambda functions and representing u...
constexpr _SYCL_SPAN_INLINE_VISIBILITY pointer data() const noexcept
constexpr tuple_element< I, tuple< Types... > >::type & get(sycl::detail::tuple< Types... > &Arg) noexcept
item< Dims, false > getDelinearizedItem(range< Dims > Range, id< Dims > Id)
void withAuxHandler(handler &CGH, FunctorTy Func)
@ group_reduce_and_last_wg_detection
@ group_reduce_and_atomic_cross_wg
@ local_atomic_and_atomic_cross_wg
@ local_mem_tree_and_atomic_cross_wg
@ group_reduce_and_multiple_kernels
void finalizeHandler(handler &CGH)
void writeReduSumsToOutAccs(size_t OutAccIndex, ReduTupleT< OutAccT... > OutAccs, ReduTupleT< LocalAccT... > LocalAccs, ReduTupleT< BOPsT... > BOPs, ReduTupleT< Ts... > IdentityVals, std::array< bool, sizeof...(Reductions)> IsInitializeToIdentity, std::index_sequence< Is... >)
size_t reduGetMemPerWorkItem(std::tuple< ReductionT... > &ReduTuple, std::index_sequence< Is... >)
device getDeviceFromHandler(handler &CommandGroupHandlerRef)
void doTreeReductionHelper(size_t WorkSize, size_t LID, FuncTy Func)
void reduCGFuncImplScalar(nd_item< Dims > NDIt, ReduTupleT< LocalAccT... > LocalAccsTuple, ReduTupleT< OutAccT... > OutAccsTuple, std::tuple< ReducerT... > &ReducersTuple, ReduTupleT< Ts... > IdentitiesTuple, ReduTupleT< BOPsT... > BOPsTuple, std::array< bool, sizeof...(Reductions)> InitToIdentityProps, std::index_sequence< Is... > ReduIndices)
All scalar reductions are processed together; there is one loop of log2(N) steps, and each reduction ...
uint32_t reduGetMaxNumConcurrentWorkGroups(std::shared_ptr< queue_impl > Queue)
size_t reduGetMaxWGSize(std::shared_ptr< queue_impl > Queue, size_t LocalMemBytesPerWorkItem)
void reduAuxCGFuncImplArray(nd_item< Dims > NDIt, size_t LID, size_t GID, size_t RemainingWorkSize, ReduTupleT< LocalAccT... > LocalAccsTuple, ReduTupleT< InAccT... > InAccsTuple, ReduTupleT< OutAccT... > OutAccsTuple, ReduTupleT< Ts... > IdentitiesTuple, ReduTupleT< BOPsT... > BOPsTuple, std::array< bool, sizeof...(Reductions)> InitToIdentityProps, std::index_sequence< Is... >)
std::bool_constant< std::is_same_v< BinaryOperation, sycl::maximum< T > >||std::is_same_v< BinaryOperation, sycl::maximum< void > >> IsMaximum
void reduCGFuncImplArrayHelper(nd_item< Dims > NDIt, LocalAccT LocalReds, OutAccT Out, ReducerT &Reducer, BOPT BOp, bool IsInitializeToIdentity)
Each array reduction is processed separately.
std::bool_constant< std::is_same_v< BinaryOperation, sycl::bit_or< T > >||std::is_same_v< BinaryOperation, sycl::bit_or< void > >> IsBitOR
std::bool_constant< IsZeroIdentityOp< T, BinaryOperation >::value||IsOneIdentityOp< T, BinaryOperation >::value||IsOnesIdentityOp< T, BinaryOperation >::value||IsMinimumIdentityOp< T, BinaryOperation >::value||IsMaximumIdentityOp< T, BinaryOperation >::value||IsFalseIdentityOp< T, BinaryOperation >::value||IsTrueIdentityOp< T, BinaryOperation >::value > IsKnownIdentityOp
id< 1 > getDelinearizedId(const range< 1 > &, size_t Index)
void reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu)
Copies the final reduction result kept in read-write accessor to user's USM memory.
void reduAuxCGFuncImplArrayHelper(nd_item< Dims > NDIt, size_t LID, size_t GID, size_t RemainingWorkSize, LocalAccT LocalReds, InAccT In, OutAccT Out, T IdentityContainer, BOPT BOp, bool IsInitializeToIdentity)
void doTreeReductionOnTuple(size_t WorkSize, size_t LID, ReduTupleT< LocalAccT... > &LocalAccs, ReduTupleT< BOPsT... > &BOPs, std::index_sequence< Is... >)
void associateReduAccsWithHandler(handler &CGH, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
size_t GreatestPowerOfTwo(size_t N)
Computes the greatest power-of-two less than or equal to N.
void doTreeReduction(size_t WorkSize, nd_item< Dim > NDIt, LocalRedsTy &LocalReds, BinOpTy &BOp, AccessFuncTy AccessFunc)
std::conditional_t< std::is_same_v< KernelName, auto_name >, auto_name, reduction::InitMemKrn< KernelName > > __sycl_init_mem_for
A helper to pass undefined (sycl::detail::auto_name) names unmodified.
std::bool_constant< std::is_same_v< BinaryOperation, sycl::multiplies< T > >||std::is_same_v< BinaryOperation, sycl::multiplies< void > >> IsMultiplies
size_t reduGetMemPerWorkItemHelper(Reduction &)
auto getReducerAccess(ReducerT &Reducer)
std::bool_constant<(IsPlus< T, BinaryOperation >::value||IsMinimum< T, BinaryOperation >::value||IsMaximum< T, BinaryOperation >::value) &&is_sgenfloat_v< T > &&sizeof(T)==8 > IsReduOptForAtomic64Op
constexpr auto filterSequenceHelper(FunctorT, std::index_sequence< Is... >)
For each index 'I' from the given indices pack 'Is' this function initially creates a number of short...
void reduAuxCGFuncImplScalar(nd_item< Dims > NDIt, size_t LID, size_t GID, size_t RemainingWorkSize, ReduTupleT< LocalAccT... > LocalAccsTuple, ReduTupleT< InAccT... > InAccsTuple, ReduTupleT< OutAccT... > OutAccsTuple, ReduTupleT< Ts... > IdentitiesTuple, ReduTupleT< BOPsT... > BOPsTuple, std::array< bool, sizeof...(Reductions)> InitToIdentityProps, std::index_sequence< Is... > ReduIndices)
All scalar reductions are processed together; there is one loop of log2(N) steps, and each reduction ...
std::bool_constant<((is_sgenfloat_v< T > &&sizeof(T)==4)||is_sgeninteger_v< T >) &&IsValidAtomicType< T >::value &&(IsPlus< T, BinaryOperation >::value||IsMinimum< T, BinaryOperation >::value||IsMaximum< T, BinaryOperation >::value||IsBitOR< T, BinaryOperation >::value||IsBitXOR< T, BinaryOperation >::value||IsBitAND< T, BinaryOperation >::value)> IsReduOptForFastAtomicFetch
std::tuple< std::tuple_element_t< Is, TupleT >... > tuple_select_elements(TupleT Tuple, std::index_sequence< Is... >)
Utility function: for the given tuple.
constexpr tuple< Ts... > make_tuple(Ts... Args)
size_t reduGetPreferredWGSize(std::shared_ptr< queue_impl > &Queue, size_t LocalMemBytesPerWorkItem)
std::bool_constant< std::is_same_v< BinaryOperation, sycl::plus< T > >||std::is_same_v< BinaryOperation, sycl::plus< void > >> IsPlus
size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... > ReduIndices)
constexpr auto filterSequence(FunctorT F, std::index_sequence< Is... > Indices)
For each index 'I' from the given indices pack 'Is' this function returns an index sequence consistin...
std::bool_constant<((is_sgeninteger_v< T > &&(sizeof(T)==4||sizeof(T)==8))||is_sgenfloat_v< T >) &&(IsPlus< T, BinaryOperation >::value||IsMinimum< T, BinaryOperation >::value||IsMaximum< T, BinaryOperation >::value)> IsReduOptForFastReduce
void reduCGFuncImplArray(nd_item< Dims > NDIt, ReduTupleT< LocalAccT... > LocalAccsTuple, ReduTupleT< OutAccT... > OutAccsTuple, std::tuple< ReducerT... > &ReducersTuple, ReduTupleT< BOPsT... > BOPsTuple, std::array< bool, sizeof...(Reductions)> InitToIdentityProps, std::index_sequence< Is... >)
ReduTupleT< Ts... > makeReduTupleT(Ts... Elements)
std::conditional_t< std::is_same_v< KernelName, auto_name >, auto_name, MainOrAux< KernelName, Strategy, Ts... > > __sycl_reduction_kernel
A helper to pass undefined (sycl::detail::auto_name) names unmodified.
std::bool_constant< std::is_same_v< BinaryOperation, sycl::minimum< T > >||std::is_same_v< BinaryOperation, sycl::minimum< void > >> IsMinimum
void reduction_parallel_for(handler &CGH, range< Dims > NDRange, PropertiesT Properties, RestT... Rest)
void addCounterInit(handler &CGH, std::shared_ptr< sycl::detail::queue_impl > &Queue, std::shared_ptr< int > &Counter)
size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize, size_t &NWorkGroups)
constexpr std::index_sequence concat_sequences(std::index_sequence<>)
std::bool_constant< std::is_same_v< BinaryOperation, sycl::bit_and< T > >||std::is_same_v< BinaryOperation, sycl::bit_and< void > >> IsBitAND
auto createReduOutAccs(size_t NWorkGroups, handler &CGH, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
For the given 'Reductions' types pack and indices enumerating them this function either creates new t...
auto getLastCombine(OutAccT OutAcc, LocalAccT LocalAcc, BOPT BOP, IdentityContainerT IdentityContainer, bool IsInitializeToIdentity)
std::bool_constant< std::is_same_v< BinaryOperation, sycl::bit_xor< T > >||std::is_same_v< BinaryOperation, sycl::bit_xor< void > >> IsBitXOR
typename tuple_element< I, T >::type tuple_element_t
constexpr auto makeAdjustedBOPs(ReduTupleT< BOPsT... > &BOPsTuple, std::index_sequence< Is... >)
void reduCGFuncMulti(handler &CGH, KernelType KernelFunc, const nd_range< Dims > &Range, PropertiesT Properties, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... > ReduIndices)
constexpr auto makeAdjustedBOP(BOPT &BOP)
auto make_reduction(RedOutVar RedVar, RestTy &&...Rest)
sycl::detail::tuple< Ts... > ReduTupleT
constexpr mode_tag_t< access_mode::read > read_only
constexpr property::no_init no_init
constexpr mode_tag_t< access_mode::read_write > read_write
std::enable_if_t<(is_group_v< std::decay_t< Group >> &&(detail::is_scalar_arithmetic< T >::value||(detail::is_complex< T >::value &&detail::is_multiplies< T, BinaryOperation >::value)) &&detail::is_native_op< T, BinaryOperation >::value), T > reduce_over_group(Group g, T x, BinaryOperation binary_op)
auto reduction(buffer< T, 1, AllocatorT > Var, handler &CGH, BinaryOperation Combiner, const property_list &PropList={})
Constructs a reduction object using the given buffer Var, handler CGH, reduction operation Combiner,...
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using e and sycl_category()
void free(void *ptr, const context &ctxt, const detail::code_location &CodeLoc=detail::code_location::current())
_Abi const simd< _Tp, _Abi > & noexcept
static constexpr bool value
std::conditional_t< Cond, std::index_sequence< I >, std::index_sequence<> > type
static constexpr bool value
static constexpr bool value
static constexpr bool value
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, RestT... Rest)
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, RestT... Rest)
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
const IdentityContainerT & IdentityContainer
const BinaryOperation BOp
Helper class for accessing reducer-defined types in CRTP May prove to be useful for other things late...
static constexpr int value
is_device_copyable is a user specializable class template to indicate that a type T is device copyabl...