static constexpr bool value =
    std::is_base_of_v<reduction_impl_base, std::remove_reference_t<T>>;

static constexpr bool value =

template <typename T, class BinaryOperation, int Dims, size_t Extent,
          typename IdentityContainerT, bool View = false, typename Subst = void>

template <typename T, class BinaryOperation>
#ifdef SYCL_REDUCTION_DETERMINISTIC
    std::bool_constant<false>;

    std::bool_constant<((is_sgenfloat<T>::value && sizeof(T) == 4) ||
                        is_sgeninteger<T>::value) &&
                       IsValidAtomicType<T>::value &&
                       (IsPlus<T, BinaryOperation>::value ||
                        IsMinimum<T, BinaryOperation>::value ||
                        IsMaximum<T, BinaryOperation>::value ||
                        IsBitOR<T, BinaryOperation>::value ||
                        IsBitXOR<T, BinaryOperation>::value ||

template <typename T, class BinaryOperation>
#ifdef SYCL_REDUCTION_DETERMINISTIC
    std::bool_constant<false>;

template <typename T, class BinaryOperation>
#ifdef SYCL_REDUCTION_DETERMINISTIC
    std::bool_constant<false>;

    (sizeof(T) == 4 || sizeof(T) == 8)) ||

template <typename... Ts> using ReduTupleT = sycl::detail::tuple<Ts...>;

    size_t LocalMemBytesPerWorkItem);

    size_t &NWorkGroups);

    size_t LocalMemBytesPerWorkItem);
template <typename T, class BinaryOperation, bool IsOptional>

template <typename T, class BinaryOperation, int Dims, std::size_t Extent,
          typename IdentityContainerT, bool View, typename Subst>
                     IdentityContainerT, View, Subst>> {
  using op = BinaryOperation;
  static constexpr int dims = Dims;
  static constexpr size_t extent = Extent;
  static constexpr bool has_identity = IdentityContainerT::has_identity;

  template <typename ReducerRelayT = ReducerT> auto &getElement(size_t E) {
    return MReducerRef.getElement(E);

  template <typename ReducerRelayT = ReducerT> constexpr auto getIdentity() {
                  "Identity unavailable.");
    return MReducerRef.getIdentity();

  template <typename ReducerRelayT = ReducerT>
                  "Static identity unavailable.");
    return ReducerT::getIdentity();

  ReducerT &MReducerRef;
  template <typename _T = Ty, int _Dims = Dims>
    return static_cast<Reducer *>(this)->combine(static_cast<_T>(1));

  template <typename _T = Ty, int _Dims = Dims>
    return static_cast<Reducer *>(this)->combine(static_cast<_T>(1));

  template <typename _T = Ty, int _Dims = Dims>
  operator+=(const _T &Partial) {
    return static_cast<Reducer *>(this)->combine(Partial);

  template <typename _T = Ty, int _Dims = Dims>
  operator*=(const _T &Partial) {
    return static_cast<Reducer *>(this)->combine(Partial);

  template <typename _T = Ty, int _Dims = Dims>
  operator|=(const _T &Partial) {
    return static_cast<Reducer *>(this)->combine(Partial);

  template <typename _T = Ty, int _Dims = Dims>
  operator^=(const _T &Partial) {
    return static_cast<Reducer *>(this)->combine(Partial);

  template <typename _T = Ty, int _Dims = Dims>
  operator&=(const _T &Partial) {
    return static_cast<Reducer *>(this)->combine(Partial);

  template <access::address_space Space>
    return Space == access::address_space::local_space
               ? memory_scope::work_group
               : memory_scope::device;

  template <access::address_space Space, class T, class AtomicFunctor>
  void atomic_combine_impl(T *ReduVarPtr, AtomicFunctor Functor) const {
    auto reducer = static_cast<const Reducer *>(this);
    for (size_t E = 0; E < Extent; ++E) {
      auto AtomicRef = sycl::atomic_ref<T, memory_order::relaxed,
                                        getMemoryScope<Space>(), Space>(
          address_space_cast<Space, access::decorated::no>(ReduVarPtr)[E]);
      Functor(std::move(AtomicRef), *ReducerElem);
  template <class _T, access::address_space Space, class BinaryOp>
  static constexpr bool BasicCheck =
      std::is_same_v<remove_decoration_t<_T>, Ty> &&
      (Space == access::address_space::global_space ||
       Space == access::address_space::local_space);

            typename _T = Ty, class _BinaryOperation = BinaryOp>
  std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
                   (IsReduOptForFastAtomicFetch<_T, _BinaryOperation>::value ||
                    IsReduOptForAtomic64Op<_T, _BinaryOperation>::value) &&
                   IsPlus<_T, _BinaryOperation>::value>
    atomic_combine_impl<Space>(
        ReduVarPtr, [](auto &&Ref, auto Val) { return Ref.fetch_add(Val); });

            typename _T = Ty, class _BinaryOperation = BinaryOp>
  std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
    atomic_combine_impl<Space>(
        ReduVarPtr, [](auto &&Ref, auto Val) { return Ref.fetch_or(Val); });

            typename _T = Ty, class _BinaryOperation = BinaryOp>
  std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
    atomic_combine_impl<Space>(
        ReduVarPtr, [](auto &&Ref, auto Val) { return Ref.fetch_xor(Val); });

            typename _T = Ty, class _BinaryOperation = BinaryOp>
  std::enable_if_t<std::is_same_v<remove_decoration_t<_T>, _T> &&
                   (Space == access::address_space::global_space ||
                    Space == access::address_space::local_space)>
    atomic_combine_impl<Space>(
        ReduVarPtr, [](auto &&Ref, auto Val) { return Ref.fetch_and(Val); });

            typename _T = Ty, class _BinaryOperation = BinaryOp>
  std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
    atomic_combine_impl<Space>(
        ReduVarPtr, [](auto &&Ref, auto Val) { return Ref.fetch_min(Val); });

            typename _T = Ty, class _BinaryOperation = BinaryOp>
  std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
    atomic_combine_impl<Space>(
        ReduVarPtr, [](auto &&Ref, auto Val) { return Ref.fetch_max(Val); });
template <typename T, class BinaryOperation, bool ExplicitIdentity,
          typename CondT = void>
  static_assert(!std::is_same_v<T, T>,
                "Partial specializations don't cover all possible options!");

template <typename T, class BinaryOperation, bool ExplicitIdentity>
    T, BinaryOperation, ExplicitIdentity,
  static constexpr bool has_identity = true;

template <typename T, class BinaryOperation>
    T, BinaryOperation, true,
  static constexpr bool has_identity = true;

template <typename T, class BinaryOperation>
    T, BinaryOperation, false,
    std::enable_if_t<!IsKnownIdentityOp<T, BinaryOperation>::value>> {
  static constexpr bool has_identity = false;

template <typename T, class BinaryOperation, bool IsOptional>
  using value_type = std::conditional_t<IsOptional, std::optional<T>, T>;

  template <bool ExplicitIdentity>
  constexpr value_type GetInitValue(
      &IdentityContainer) {
    constexpr bool ContainerHasIdentity =
        ExplicitIdentity>::has_identity;
    static_assert(IsOptional || ContainerHasIdentity);
    if constexpr (!ContainerHasIdentity)

    return IdentityContainer.getIdentity();

  ReducerElement() = default;

  template <bool ExplicitIdentity>
      : MValue(GetInitValue(IdentityContainer)) {}

    if constexpr (IsOptional)
      MValue = MValue ? BinOp(*MValue, OtherValue) : OtherValue;

      MValue = BinOp(MValue, OtherValue);

    if constexpr (IsOptional) {
      return combine(BinOp, *Other.MValue);

    return combine(BinOp, Other.MValue);

    if constexpr (IsOptional)

    if constexpr (IsOptional)

  constexpr explicit operator bool() const {
    if constexpr (IsOptional)
      return MValue.has_value();

  static constexpr int dimensions = Dims;

template <class BinaryOperation, typename IdentityContainerT>
  const BinaryOperation BOp;
template <typename T, class BinaryOperation, int Dims, size_t Extent,
          typename IdentityContainerT, bool View>
    T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
    std::enable_if_t<Dims == 0 && Extent == 1 && View == false &&
                     !detail::IsKnownIdentityOp<T, BinaryOperation>::value>>
          reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
                      Dims == 0 && Extent == 1 && View == false &&
                      !detail::IsKnownIdentityOp<T, BinaryOperation>::value>>>,
  static constexpr bool has_identity = IdentityContainerT::has_identity;

  reducer(const IdentityContainerT &IdentityContainer, BinaryOperation BOp)
      : MValue(IdentityContainer), MIdentity(IdentityContainer),

      : reducer(Token.IdentityContainer, Token.BOp) {}

    MValue.combine(MBinaryOp, Partial);

  template <bool HasIdentityRelay = has_identity>
  std::enable_if_t<HasIdentityRelay && (HasIdentityRelay == has_identity), T>
    return MIdentity.getIdentity();

  const element_type &getElement(size_t) const { return MValue; }

  detail::ReducerElement<T, BinaryOperation, !has_identity> MValue;
  const IdentityContainerT MIdentity;
  BinaryOperation MBinaryOp;

template <typename T, class BinaryOperation, int Dims, size_t Extent,
          typename IdentityContainerT, bool View>
    T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
    std::enable_if_t<Dims == 0 && Extent == 1 && View == false &&
                     detail::IsKnownIdentityOp<T, BinaryOperation>::value>>
          reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
                      Dims == 0 && Extent == 1 && View == false &&
                      detail::IsKnownIdentityOp<T, BinaryOperation>::value>>>,
  static constexpr bool has_identity = IdentityContainerT::has_identity;

  reducer(const IdentityContainerT &, BinaryOperation)
      : MValue(getIdentity()) {}

      : reducer(Token.IdentityContainer, Token.BOp) {}

    MValue.combine(BOp, Partial);

  static constexpr T getIdentity() {

  element_type &getElement(size_t) { return MValue; }
  const element_type &getElement(size_t) const { return MValue; }
  detail::ReducerElement<T, BinaryOperation, !has_identity> MValue;

template <typename T, class BinaryOperation, int Dims, size_t Extent,
          typename IdentityContainerT, bool View>
class reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
              std::enable_if_t<Dims == 0 && View == true>>
          reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
                  std::enable_if_t<Dims == 0 && View == true>>>,
  static constexpr bool has_identity = IdentityContainerT::has_identity;

      : MElement(Ref), MBinaryOp(BOp) {}

      : reducer(Token.IdentityContainer, Token.BOp) {}

    MElement.combine(MBinaryOp, Partial);

  const element_type &getElement(size_t) const { return MElement; }

  element_type &MElement;
  BinaryOperation MBinaryOp;
template <typename T, class BinaryOperation, int Dims, size_t Extent,
          typename IdentityContainerT, bool View>
    T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
    std::enable_if_t<Dims == 1 && View == false &&
                     !detail::IsKnownIdentityOp<T, BinaryOperation>::value>>
          reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
                      Dims == 1 && View == false &&
                      !detail::IsKnownIdentityOp<T, BinaryOperation>::value>>>,
  static constexpr bool has_identity = IdentityContainerT::has_identity;

  reducer(const IdentityContainerT &IdentityContainer, BinaryOperation BOp)
      : MValue(IdentityContainer), MIdentity(IdentityContainer),

      : reducer(Token.IdentityContainer, Token.BOp) {}

  reducer<T, BinaryOperation, Dims - 1, Extent, IdentityContainerT, true>
    return {MValue[Index], MBinaryOp};

  template <bool HasIdentityRelay = has_identity>
  std::enable_if_t<HasIdentityRelay && (HasIdentityRelay == has_identity), T>
    return MIdentity.getIdentity();

  element_type &getElement(size_t E) { return MValue[E]; }
  const element_type &getElement(size_t E) const { return MValue[E]; }

  marray<element_type, Extent> MValue;
  const IdentityContainerT MIdentity;
  BinaryOperation MBinaryOp;

template <typename T, class BinaryOperation, int Dims, size_t Extent,
          typename IdentityContainerT, bool View>
    T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
    std::enable_if_t<Dims == 1 && View == false &&
                     detail::IsKnownIdentityOp<T, BinaryOperation>::value>>
          reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
                      Dims == 1 && View == false &&
                      detail::IsKnownIdentityOp<T, BinaryOperation>::value>>>,
  static constexpr bool has_identity = IdentityContainerT::has_identity;

  reducer(const IdentityContainerT &, BinaryOperation)
      : MValue(getIdentity()) {}

      : reducer(Token.IdentityContainer, Token.BOp) {}

  reducer<T, BinaryOperation, Dims - 1, Extent, IdentityContainerT, true>
    return {MValue[Index], BinaryOperation()};

  static constexpr T getIdentity() {

  element_type &getElement(size_t E) { return MValue[E]; }
  const element_type &getElement(size_t E) const { return MValue[E]; }

  marray<element_type, Extent> MValue;
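// Usage sketch (illustrative): for an array reduction (Dims == 1) the kernel
// indexes the reducer; operator[] returns the Dims == 0 "view" specialization
// bound to one element of MValue. The reduction `Sums` and the `Bins`
// accessor below are assumptions for the example:
//
//   CGH.parallel_for(sycl::range<1>{N}, Sums,
//                    [=](sycl::id<1> I, auto &Sum) {
//                      Sum[Bins[I]] += 1; // per-element view reducer
//                    });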
  static constexpr int value = 1;

    accessor<T, AccessorDims, Mode, access::target::device, IsPH, PropList>> {
  static constexpr int value = AccessorDims;

template <class T, int Dims, typename AllocatorT>

template <class KernelName>
    std::conditional_t<std::is_same_v<KernelName, auto_name>, auto_name,

template <typename T, class BinaryOperation, int Dims, size_t Extent,
          bool ExplicitIdentity, typename RedOutVar>
                                 ExplicitIdentity, RedOutVar>;

  static constexpr T chooseIdentity(const T &Identity) {
    if constexpr (is_known_identity) {

  static constexpr bool is_known_identity =
  static constexpr bool has_identity = is_known_identity || ExplicitIdentity;

  static constexpr size_t dims = Dims;
  static constexpr bool has_float64_atomics =
  static constexpr bool has_fast_atomics =
  static constexpr bool has_fast_reduce =

  static constexpr bool is_usm = std::is_same_v<RedOutVar, T *>;

  static constexpr size_t num_elements = Extent;

      : MIdentityContainer(chooseIdentity(Identity)), MBinaryOp(BinaryOp),

  template <typename RelayT = T,
            typename RelayBinaryOperation = BinaryOperation>
                      BinaryOperation BinaryOp, bool Init, RedOutVar RedOut,
        MRedOut(std::move(RedOut)) {}

  template <typename RelayT = T,
            typename RelayBinaryOperation = BinaryOperation>
                      BinaryOperation BinaryOp, bool Init, RedOutVar RedOut,
        MRedOut(std::move(RedOut)) {}

    CGH.addReduction(MOutBufPtr);
    return accessor{*MOutBufPtr, CGH, sycl::read_only};

  template <bool IsOneWG>
    if constexpr (IsOneWG) {
      return getUserRedVarAccess(CGH);

        std::make_shared<buffer<reducer_element_type, 1>>(range<1>(Size));
    CGH.addReduction(MOutBufPtr);

    auto Buffer = std::make_shared<buffer<_T, 1>>(range<1>(Size));
    CGH.addReduction(Buffer);

                  "Unexpected size of reducer element type.");
    if constexpr (!is_usm && has_identity) {
      auto ReinterpretRedOut =
          MRedOut.template reinterpret<reducer_element_type>();
      return accessor{ReinterpretRedOut, CGH};

        std::make_shared<buffer<reducer_element_type, 1>>(range<1>(Size));
    CGH.addReduction(MOutBufPtr);
  template <typename KernelName, typename FuncTy,
            bool HasIdentity = has_identity>
    auto DoIt = [&](auto &Out) {
      auto RWReduVal = std::make_shared<std::array<T, num_elements>>();
      for (int i = 0; i < num_elements; ++i) {
        (*RWReduVal)[i] = decltype(MIdentityContainer)::getIdentity();

      CGH.addReduction(RWReduVal);
      auto Buf = std::make_shared<buffer<T, 1>>(RWReduVal.get()->data(),
      Buf->set_final_data();
      CGH.addReduction(Buf);

          Buf->template get_access<access::mode::read_write>(CopyHandler);
      if constexpr (is_usm) {
        bool IsUpdateOfUserVar = !initializeToIdentity();
        auto BOp = getBinaryOperation();

        size_t NElements = num_elements;

        for (int i = 0; i < NElements; ++i) {
          if (IsUpdateOfUserVar)
            Out[i] = BOp(Out[i], Mem[i]);

        CopyHandler.copy(Mem, OutAcc);

    if constexpr (is_usm) {

      if (initializeToIdentity())

  template <typename KernelName, typename FuncTy,
            bool HasIdentity = has_identity>
    assert(!initializeToIdentity() &&
           "Initialize to identity not allowed for identity-less reductions.");

    return MIdentityContainer;

           access::placeholder::false_t>
    auto CounterMem = std::make_shared<int>(0);
    CGH.addReduction(CounterMem);
    auto CounterBuf = std::make_shared<buffer<int, 1>>(CounterMem.get(), 1);
    CounterBuf->set_final_data();
    CGH.addReduction(CounterBuf);
    return {*CounterBuf, CGH};

    queue q = createSyclObjFromImpl<queue>(CGH.MQueue);
    auto Deleter = [=](auto *Ptr) { free(Ptr, q); };
    std::shared_ptr<int> Counter(malloc_device<int>(1, q), Deleter);
    CGH.addReduction(Counter);
    auto Event = q.memset(Counter.get(), 0, sizeof(int));
    return Counter.get();

    if constexpr (is_usm)

  identity_container_type MIdentityContainer;
  std::shared_ptr<buffer<reducer_element_type, 1>> MOutBufPtr;
  BinaryOperation MBinaryOp;
template <typename T, class BinaryOperation, int Dims, size_t Extent,
          bool ExplicitIdentity, typename RedOutVar>
                            ExplicitIdentity, RedOutVar> {
                                 ExplicitIdentity, RedOutVar>;
                                 ExplicitIdentity, RedOutVar>;
  using algo::is_known_identity;

  static_assert(Dims <= 1, "Multi-dimensional reductions are not supported.");

  template <bool ExplicitIdentityRelay = ExplicitIdentity,
            typename = std::enable_if_t<!ExplicitIdentityRelay>>
    if constexpr (!is_usm)
      if (Var.size() != 1)
        throw sycl::runtime_error(errc::invalid,
                                  "Reduction variable must be a scalar.",
                                  PI_ERROR_INVALID_VALUE);
    if constexpr (!is_known_identity)

        throw sycl::runtime_error(errc::invalid,
                                  "initialize_to_identity property cannot be "
                                  "used with identityless reductions.",
                                  PI_ERROR_INVALID_VALUE);

  template <bool ExplicitIdentityRelay = ExplicitIdentity,
            typename = std::enable_if_t<ExplicitIdentityRelay>>
    if constexpr (!is_usm)
      if (Var.size() != 1)
        throw sycl::runtime_error(errc::invalid,
                                  "Reduction variable must be a scalar.",
                                  PI_ERROR_INVALID_VALUE);

template <class BinaryOp, int Dims, size_t Extent, bool ExplicitIdentity,
          typename RedOutVar, typename... RestTy>
                            Extent, ExplicitIdentity, RedOutVar>{
      RedVar, std::forward<RestTy>(Rest)...};

  event E = CGH.finalize();
  handler AuxHandler(CGH.MQueue, CGH.MIsHost);
  AuxHandler.saveCodeLoc(CGH.MCodeLoc);
  CGH.MLastEvent = AuxHandler.finalize();

template <typename KernelName, class Reduction>
  static_assert(Reduction::is_usm,
                "All implementations using this helper are expected to have "
                "USM reduction, not a buffer-based one.");
  size_t NElements = Reduction::num_elements;
  auto InAcc = Redu.getReadAccToPreviousPartialReds(CGH);
  auto UserVarPtr = Redu.getUserRedVarAccess(CGH);
  bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
  auto BOp = Redu.getBinaryOperation();

    for (int i = 0; i < NElements; ++i) {
      auto Elem = InAcc[i];
      if (IsUpdateOfUserVar)
        UserVarPtr[i] = BOp(UserVarPtr[i], *Elem);

        UserVarPtr[i] = *Elem;

    std::conditional_t<std::is_same_v<KernelName, auto_name>, auto_name,
                       MainOrAux<KernelName, Strategy, Ts...>>;
  template <typename KernelName, int Dims, typename PropertiesT,
            typename KernelType, typename Reduction>
  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
    static_assert(Reduction::has_identity,
                  "Identityless reductions are not supported by the "
                  "local_atomic_and_atomic_cross_wg strategy.");

    std::ignore = Queue;
        reduction::strategy::local_atomic_and_atomic_cross_wg>;
    Redu.template withInitializedMem<Name>(CGH, [&](auto Out) {
      size_t NElements = Reduction::num_elements;

        typename Reduction::reducer_type Reducer;

        auto LID = NDId.get_local_id(0);
        for (size_t E = LID; E < NElements; E += NDId.get_local_range(0)) {

        Reducer.template atomic_combine<access::address_space::local_space>(

        for (size_t E = 0; E < NElements; ++E) {

          Reducer.template atomic_combine(&Out[0]);

                          reduction::strategy::group_reduce_and_last_wg_detection> {
  template <typename KernelName, int Dims, typename PropertiesT,
            typename KernelType, typename Reduction>
  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
    static_assert(Reduction::has_identity,
                  "Identityless reductions are not supported by the "
                  "group_reduce_and_last_wg_detection strategy.");

    std::ignore = Queue;
    size_t NElements = Reduction::num_elements;

    auto Out = Redu.getUserRedVarAccess(CGH);

    auto &PartialSumsBuf = Redu.getTempBuffer(NWorkGroups * NElements, CGH);

    bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
    auto Rest = [&](auto NWorkGroupsFinished) {
          reduction::strategy::group_reduce_and_last_wg_detection,
          decltype(NWorkGroupsFinished)>;

        typename Reduction::reducer_type Reducer;

        typename Reduction::binary_operation BOp;
        auto Group = NDId.get_group();

        size_t LID = NDId.get_local_id(0);
        for (int E = 0; E < NElements; ++E) {

          if (NWorkGroups == 1) {
            if (IsUpdateOfUserVar)
              RedElem = BOp(RedElem, Out[E]);

            PartialSums[NDId.get_group_linear_id() * NElements + E] =

        if (NWorkGroups == 1)

            sycl::atomic_ref<int, memory_order::acq_rel, memory_scope::device,
                             access::address_space::global_space>(
                NWorkGroupsFinished[0]);
        DoReducePartialSumsInLastWG[0] = ++NFinished == NWorkGroups;

        if (DoReducePartialSumsInLastWG[0]) {

          for (int E = 0; E < NElements; ++E) {

            for (size_t I = LID; I < NWorkGroups; I += WGSize)
              LocalSum = BOp(LocalSum, PartialSums[I * NElements + E]);

            if (IsUpdateOfUserVar)
              Result = BOp(Result, Out[E]);

        !device.has(aspect::usm_device_allocations))
      Rest(Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH));

      Rest(Redu.getGroupsCounterAccDiscrete(CGH));
  while ((N >>= 1) != 0)

template <typename FuncTy>

  if (Pivot != WorkSize) {
    if (Pivot + LID < WorkSize)
      Func(LID, Pivot + LID);

  for (size_t CurPivot = Pivot >> 1; CurPivot > 0; CurPivot >>= 1) {
      Func(LID, CurPivot + LID);

          typename BinOpTy, typename AccessFuncTy>
                     BinOpTy &BOp, AccessFuncTy AccessFunc) {
  size_t AdjustedWorkSize;
  if constexpr (WSGuarantee == WorkSizeGuarantees::LessOrEqual ||
                WSGuarantee == WorkSizeGuarantees::Equal) {

    if (WSGuarantee == WorkSizeGuarantees::Equal || LID < WorkSize)
      LocalReds[LID] = AccessFunc(LID);
    AdjustedWorkSize = WorkSize;

    AdjustedWorkSize = std::min(WorkSize, WGSize);
    if (LID < AdjustedWorkSize) {
      auto LocalSum = AccessFunc(LID);
      for (size_t I = LID + WGSize; I < WorkSize; I += WGSize)
        LocalSum = BOp(LocalSum, AccessFunc(I));

      LocalReds[LID] = LocalSum;

    LocalReds[I] = BOp(LocalReds[I], LocalReds[J]);
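// Conceptual sketch (illustrative): the tree step above halves the number of
// active work-items each iteration. Ignoring the non-power-of-two prologue,
// it is equivalent to the classic local-memory reduction loop, assuming an
// nd_item `NDIt` and `WorkSize` values already staged in `LocalReds`:
//
//   for (size_t Step = WorkSize / 2; Step > 0; Step /= 2) {
//     if (LID < Step)
//       LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + Step]);
//     sycl::group_barrier(NDIt.get_group());
//   }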
template <typename... LocalAccT, typename... BOPsT, size_t... Is>
                   std::index_sequence<Is...>) {
  auto ProcessOne = [=](auto &LocalAcc, auto &BOp) {
    LocalAcc[I] = BOp(LocalAcc[I], LocalAcc[J]);

  (ProcessOne(std::get<Is>(LocalAccs), std::get<Is>(BOPs)), ...);

  template <typename KernelName, int Dims, typename PropertiesT,
            typename KernelType, typename Reduction>
  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
    using reducer_type = typename Reduction::reducer_type;

    constexpr bool UsePartialSumForOutput =
        !Reduction::is_usm && Reduction::has_identity;

    std::ignore = Queue;
    size_t NElements = Reduction::num_elements;

    bool IsUpdateOfUserVar = !Redu.initializeToIdentity();

        Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);

      if constexpr (UsePartialSumForOutput)
        return (NWorkGroups == 1)
                   : Redu.getWriteAccForPartialReds(NElements, CGH);

        return Redu.getUserRedVarAccess(CGH);

    auto NWorkGroupsFinished =
        Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH);

    auto IdentityContainer = Redu.getIdentityContainer();
    auto BOp = Redu.getBinaryOperation();
        reduction::strategy::range_basic>;

      reducer_type Reducer = reducer_type(IdentityContainer, BOp);

      auto ElementCombiner = [&](element_type &LHS, const element_type &RHS) {
        return LHS.combine(BOp, RHS);

      size_t LID = NDId.get_local_linear_id();
      for (int E = 0; E < NElements; ++E) {

        doTreeReduction<WorkSizeGuarantees::Equal>(
            WGSize, NDId, LocalReds, ElementCombiner,

        auto V = LocalReds[0];

        bool IsOneWG = NWorkGroups == 1;
        if (IsOneWG && IsUpdateOfUserVar)
          V.combine(BOp, Out[E]);

        if (UsePartialSumForOutput || !IsOneWG)
          PartialSums[NDId.get_group_linear_id() * NElements + E] = V;

          sycl::atomic_ref<int, memory_order::acq_rel, memory_scope::device,
                           access::address_space::global_space>(
              NWorkGroupsFinished[0]);
      DoReducePartialSumsInLastWG[0] =
          ++NFinished == NWorkGroups && NWorkGroups > 1;

      if (DoReducePartialSumsInLastWG[0]) {

        for (int E = 0; E < NElements; ++E) {
          doTreeReduction<WorkSizeGuarantees::None>(
              NWorkGroups, NDId, LocalReds, ElementCombiner,
              [&](size_t I) { return PartialSums[I * NElements + E]; });

          auto V = LocalReds[0];
          if (IsUpdateOfUserVar)
            V.combine(BOp, Out[E]);
  template <typename KernelName, int Dims, typename PropertiesT,
            typename KernelType, typename Reduction>
  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
    static_assert(Reduction::has_identity,
                  "Identityless reductions are not supported by the "
                  "group_reduce_and_atomic_cross_wg strategy.");

    std::ignore = Queue;
        reduction::strategy::group_reduce_and_atomic_cross_wg>;
    Redu.template withInitializedMem<Name>(CGH, [&](auto Out) {
      size_t NElements = Reduction::num_elements;

        typename Reduction::reducer_type Reducer;

        typename Reduction::binary_operation BOp;
        for (int E = 0; E < NElements; ++E) {

        if (NDIt.get_local_linear_id() == 0)
          Reducer.atomic_combine(&Out[0]);
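        // Conceptual sketch (illustrative): in this strategy each work-group
        // first combines its work-items' values with sycl::reduce_over_group,
        // then only the group leader publishes the group result with an
        // atomic combine. `LocalVal` and `OutRef` are assumed names:
        //
        //   auto PSum = sycl::reduce_over_group(NDIt.get_group(), LocalVal, BOp);
        //   if (NDIt.get_local_linear_id() == 0)
        //     OutRef.fetch_add(PSum); // OutRef: atomic_ref over the output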
                          reduction::strategy::local_mem_tree_and_atomic_cross_wg> {
  template <typename KernelName, int Dims, typename PropertiesT,
            typename KernelType, typename Reduction>
  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
    using reducer_type = typename Reduction::reducer_type;

    std::ignore = Queue;
        reduction::strategy::local_mem_tree_and_atomic_cross_wg>;
    Redu.template withInitializedMem<Name>(CGH, [&](auto Out) {
      size_t NElements = Reduction::num_elements;

        reducer_type Reducer;

        size_t WGSize = NDIt.get_local_range().size();
        size_t LID = NDIt.get_local_linear_id();

        typename Reduction::binary_operation BOp;
        auto ElementCombiner = [&](element_type &LHS, const element_type &RHS) {
          return LHS.combine(BOp, RHS);

        for (int E = 0; E < NElements; ++E) {

          doTreeReduction<WorkSizeGuarantees::Equal>(
              WGSize, NDIt, LocalReds, ElementCombiner,

          if (E != NElements - 1) {

        Reducer.atomic_combine(&Out[0]);

                          reduction::strategy::group_reduce_and_multiple_kernels> {
  template <typename KernelName, int Dims, typename PropertiesT,
            typename KernelType, typename Reduction>
  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
    static_assert(Reduction::has_identity,
                  "Identityless reductions are not supported by the "
                  "group_reduce_and_multiple_kernels strategy.");

    constexpr bool HFR = Reduction::has_fast_reduce;
    size_t OneElemSize = HFR ? 0 : sizeof(typename Reduction::result_type);

      throw sycl::runtime_error("The implementation handling parallel_for with"
                                " reduction requires work group size not bigger"
                                    std::to_string(MaxWGSize),
                                PI_ERROR_INVALID_WORK_GROUP_SIZE);

    size_t NElements = Reduction::num_elements;

    auto Out = Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);

    bool IsUpdateOfUserVar =
        !Reduction::is_usm && !Redu.initializeToIdentity() && NWorkGroups == 1;

        reduction::strategy::group_reduce_and_multiple_kernels>;

      typename Reduction::reducer_type Reducer;

      size_t WGID = NDIt.get_group_linear_id();
      typename Reduction::binary_operation BOp;
      for (int E = 0; E < NElements; ++E) {
        typename Reduction::result_type PSum;

        if (NDIt.get_local_linear_id() == 0) {
          if (IsUpdateOfUserVar)
            PSum = BOp(*Out[E], PSum);
          Out[WGID * NElements + E] = PSum;

      throw sycl::runtime_error("The implementation handling parallel_for with "
                                "reduction requires the maximal work group "
                                "size to be greater than 1 to converge. "
                                "The maximal work group size depends on the "
                                "device and the size of the objects passed to "
                                PI_ERROR_INVALID_WORK_GROUP_SIZE);

    size_t NWorkItems = NDRange.get_group_range().size();
    while (NWorkItems > 1) {

      size_t NElements = Reduction::num_elements;

      bool HasUniformWG = NWorkGroups * WGSize == NWorkItems;
      if (!Reduction::has_fast_reduce)
        HasUniformWG = HasUniformWG && (WGSize & (WGSize - 1)) == 0;

      auto In = Redu.getReadAccToPreviousPartialReds(AuxHandler);
          Redu.getWriteAccForPartialReds(NWorkGroups * NElements, AuxHandler);

          reduction::strategy::group_reduce_and_multiple_kernels>;

      bool IsUpdateOfUserVar = !Reduction::is_usm &&
                               !Redu.initializeToIdentity() &&

      range<1> GlobalRange = {HasUniformWG ? NWorkItems
                                           : NWorkGroups * WGSize};

        typename Reduction::binary_operation BOp;
        size_t WGID = NDIt.get_group_linear_id();
        size_t GID = NDIt.get_global_linear_id();

        for (int E = 0; E < NElements; ++E) {
          typename Reduction::result_type PSum =
              (HasUniformWG || (GID < NWorkItems))
                  ? *In[GID * NElements + E]

          if (NDIt.get_local_linear_id() == 0) {
            if (IsUpdateOfUserVar)
              PSum = BOp(*Out[E], PSum);
            Out[WGID * NElements + E] = PSum;

      NWorkItems = NWorkGroups;

    if constexpr (Reduction::is_usm) {
        reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
  template <typename KernelName, int Dims, typename PropertiesT,
            typename KernelType, typename Reduction>
  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
    using element_type = typename Reduction::reducer_element_type;

    constexpr bool HFR = Reduction::has_fast_reduce;
    size_t OneElemSize = HFR ? 0 : sizeof(element_type);

      throw sycl::runtime_error("The implementation handling parallel_for with"
                                " reduction requires work group size not bigger"
                                    std::to_string(MaxWGSize),
                                PI_ERROR_INVALID_WORK_GROUP_SIZE);

    bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
    std::ignore = IsUpdateOfUserVar;

    auto First = [&](auto KernelTag) {

      constexpr bool IsOneWG =
          std::is_same_v<std::remove_reference_t<decltype(KernelTag)>,

      constexpr size_t NElements = Reduction::num_elements;

        if constexpr (IsOneWG)
          return Redu.getUserRedVarAccess(CGH);

          return Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);

      auto BOp = Redu.getBinaryOperation();
      auto IdentityContainer = Redu.getIdentityContainer();

          reduction::strategy::basic,
          decltype(KernelTag)>;

        typename Reduction::reducer_type Reducer =
            typename Reduction::reducer_type(IdentityContainer, BOp);

        size_t WGSize = NDIt.get_local_range().size();
        size_t LID = NDIt.get_local_linear_id();

        auto ElementCombiner = [&](element_type &LHS, const element_type &RHS) {
          return LHS.combine(BOp, RHS);

        for (int E = 0; E < NElements; ++E) {

          doTreeReduction<WorkSizeGuarantees::Equal>(
              WGSize, NDIt, LocalReds, ElementCombiner,

          element_type PSum = LocalReds[0];
          if constexpr (IsOneWG) {
            if (IsUpdateOfUserVar)
              PSum.combine(BOp, Out[E]);

            size_t GrID = NDIt.get_group_linear_id();
            Out[GrID * NElements + E] = PSum;

          if (E != NElements - 1) {

    if (NWorkGroups == 1)

      throw sycl::runtime_error("The implementation handling parallel_for with "
                                "reduction requires the maximal work group "
                                "size to be greater than 1 to converge. "
                                "The maximal work group size depends on the "
                                "device and the size of the objects passed to "
                                PI_ERROR_INVALID_WORK_GROUP_SIZE);

    while (NWorkItems > 1) {

      auto Rest = [&](auto KernelTag) {

        constexpr bool IsOneWG =
            std::is_same_v<std::remove_reference_t<decltype(KernelTag)>,

        constexpr size_t NElements = Reduction::num_elements;

        bool HasUniformWG = NWorkGroups * WGSize == NWorkItems;

        auto In = Redu.getReadAccToPreviousPartialReds(AuxHandler);

          if constexpr (IsOneWG)
            return Redu.getUserRedVarAccess(AuxHandler);

            return Redu.getWriteAccForPartialReds(NWorkGroups * NElements,

        bool UniformPow2WG = HasUniformWG && (WGSize & (WGSize - 1)) == 0;

        auto BOp = Redu.getBinaryOperation();
            reduction::strategy::basic,
            decltype(KernelTag)>;

        range<1> GlobalRange = {UniformPow2WG ? NWorkItems
                                              : NWorkGroups * WGSize};

          size_t WGSize = NDIt.get_local_range().size();
          size_t LID = NDIt.get_local_linear_id();
          size_t GID = NDIt.get_global_linear_id();
          size_t GrID = NDIt.get_group_linear_id();

          size_t RemainingWorkSize =
              sycl::min(WGSize, NWorkItems - GrID * WGSize);

          auto ElementCombiner = [&](element_type &LHS,
                                     const element_type &RHS) {
            return LHS.combine(BOp, RHS);

          for (int E = 0; E < NElements; ++E) {

            doTreeReduction<WorkSizeGuarantees::LessOrEqual>(
                RemainingWorkSize, NDIt, LocalReds, ElementCombiner,
                [&](size_t) { return In[GID * NElements + E]; });

            element_type PSum = LocalReds[0];
            if constexpr (IsOneWG) {
              if (IsUpdateOfUserVar)
                PSum.combine(BOp, Out[E]);

              Out[GrID * NElements + E] = PSum;

            if (E != NElements - 1) {

        NWorkItems = NWorkGroups;

    if (NWorkGroups == 1)
template <bool IsOneWG, typename... Reductions, size_t... Is>
                  std::tuple<Reductions...> &ReduTuple,
                  std::index_sequence<Is...>) {
      std::get<Is>(ReduTuple).template getWriteMemForPartialReds<IsOneWG>(

template <typename OutAccT, typename LocalAccT, typename BOPT,
          typename IdentityContainerT>
                            IdentityContainerT IdentityContainer,
                            bool IsInitializeToIdentity) {
  if constexpr (!IdentityContainerT::has_identity) {
    return BOP(LocalAcc[0], OutAcc[0]);

    return BOP(LocalAcc[0], IsInitializeToIdentity
                                ? IdentityContainer.getIdentity()

template <bool IsOneWG, typename... Reductions, typename... OutAccT,
          typename... LocalAccT, typename... BOPsT, typename... Ts,
    std::array<bool, sizeof...(Reductions)> IsInitializeToIdentity,
    std::index_sequence<Is...>) {
  if constexpr (IsOneWG) {
          std::get<Is>(OutAccs), std::get<Is>(LocalAccs), std::get<Is>(BOPs),
          std::get<Is>(IdentityVals), IsInitializeToIdentity[Is])),
    ((std::get<Is>(OutAccs)[OutAccIndex] = *std::get<Is>(LocalAccs)[0]), ...);

    ((std::get<Is>(OutAccs)[OutAccIndex] = std::get<Is>(LocalAccs)[0]), ...);

template <size_t... Is, size_t... Js>
constexpr std::index_sequence<Is..., Js...>

template <size_t... Is, size_t... Js, class... Rs>
                        std::index_sequence<Js...>, Rs...) {

  template <typename T> struct Func {
    static constexpr bool value = !std::remove_pointer_t<T>::is_usm;

  template <typename T> struct Func {
    static constexpr bool value = false;

    std::conditional_t<Cond, std::index_sequence<I>, std::index_sequence<>>;

template <typename... T, typename FunctorT, size_t... Is,
          std::enable_if_t<(sizeof...(Is) > 0), int> Z = 0>
                       Is, std::tuple<T...>>>::value,

template <typename... T, typename FunctorT, size_t... Is,
          std::enable_if_t<(sizeof...(Is) == 0), int> Z = 0>
  return std::index_sequence<>{};

template <typename... T, typename FunctorT, size_t... Is>

  template <typename Reduction> struct Func {
    static constexpr bool value =
        (Reduction::dims == 0 && Reduction::num_elements == 1);

  template <typename Reduction> struct Func {
    static constexpr bool value =
        (Reduction::dims == 1 && Reduction::num_elements >= 1);

template <typename ElementType, typename BOPT>
  return [&](ElementType &LHS, const ElementType &RHS) {
    return LHS.combine(BOP, RHS);

template <typename... Reductions, typename... BOPsT, size_t... Is>
                std::index_sequence<Is...>) {
                     Is, std::tuple<Reductions...>>::reducer_element_type>(
      std::get<Is>(BOPsTuple))...);

template <typename... Reductions, typename... BOPsT>
      BOPsTuple, std::make_index_sequence<sizeof...(Reductions)>{});
template <bool IsOneWG, typename... Reductions, int Dims, typename... LocalAccT,
          typename... OutAccT, typename... ReducerT, typename... Ts,
          typename... BOPsT, size_t... Is>
    std::array<bool, sizeof...(Reductions)> InitToIdentityProps,
    std::index_sequence<Is...> ReduIndices) {

  ((std::get<Is>(LocalAccsTuple)[LID] =

      GrID, OutAccsTuple, LocalAccsTuple, AdjustedBOPsTuple, IdentitiesTuple,
      InitToIdentityProps, ReduIndices);

template <bool IsOneWG, typename Reduction, int Dims, typename LocalAccT,
          typename OutAccT, typename ReducerT, typename BOPT>
                             OutAccT Out, ReducerT &Reducer, BOPT BOp,
                             bool IsInitializeToIdentity) {
  using element_type = typename Reduction::reducer_element_type;

  auto ElementCombiner = [&](element_type &LHS, const element_type &RHS) {
    return LHS.combine(BOp, RHS);

  auto NElements = Reduction::num_elements;
  for (size_t E = 0; E < NElements; ++E) {
    doTreeReduction<WorkSizeGuarantees::Equal>(
        WGSize, NDIt, LocalReds, ElementCombiner,

      size_t OutIdx = GrID * NElements + E;
      if constexpr (IsOneWG) {

        if constexpr (Reduction::has_identity) {
          Out[OutIdx] = *ElementCombiner(LocalReds[0], IsInitializeToIdentity
                                                           ? Reducer.identity()

          Out[OutIdx] = *LocalReds[0];

        Out[OutIdx] = LocalReds[0];

    if (E != NElements - 1) {

template <bool IsOneWG, typename... Reductions, int Dims, typename... LocalAccT,
          typename... OutAccT, typename... ReducerT, typename... BOPsT,
    std::array<bool, sizeof...(Reductions)> InitToIdentityProps,
    std::index_sequence<Is...>) {
  using ReductionPack = std::tuple<Reductions...>;
  (reduCGFuncImplArrayHelper<IsOneWG, std::tuple_element_t<Is, ReductionPack>>(
       NDIt, std::get<Is>(LocalAccsTuple), std::get<Is>(OutAccsTuple),
       std::get<Is>(ReducersTuple), std::get<Is>(BOPsTuple),
       InitToIdentityProps[Is]),

namespace reduction::main_krn {

template <typename KernelName, typename KernelType, int Dims,
          typename PropertiesT, typename... Reductions, size_t... Is>
                     std::tuple<Reductions...> &ReduTuple,
                     std::index_sequence<Is...> ReduIndices) {

  auto ScalarIs = filterSequence<Reductions...>(ScalarPredicate, ReduIndices);

  auto ArrayIs = filterSequence<Reductions...>(ArrayPredicate, ReduIndices);

  auto Rest = [&](auto KernelTag, auto OutAccsTuple) {
    auto IdentitiesTuple =
        makeReduTupleT(std::get<Is>(ReduTuple).getIdentityContainer()...);

    std::array InitToIdentityProps{
        std::get<Is>(ReduTuple).initializeToIdentity()...};

        reduction::strategy::multi,
        decltype(KernelTag)>;

      constexpr bool IsOneWG =
          std::is_same_v<std::remove_reference_t<decltype(KernelTag)>,

      auto ReduIndices = std::index_sequence_for<Reductions...>();
      auto ReducerTokensTuple =
          std::tuple{typename Reductions::reducer_token_type{
              std::get<Is>(IdentitiesTuple), std::get<Is>(BOPsTuple)}...};
      auto ReducersTuple = std::tuple<typename Reductions::reducer_type...>{
          std::get<Is>(ReducerTokensTuple)...};
      std::apply([&](auto &...Reducers) { KernelFunc(NDIt, Reducers...); },

          NDIt, LocalAccsTuple, OutAccsTuple, ReducersTuple, IdentitiesTuple,
          BOPsTuple, InitToIdentityProps, ScalarIs);

          NDIt, LocalAccsTuple, OutAccsTuple, ReducersTuple, BOPsTuple,
          InitToIdentityProps, ArrayIs);

  if (NWorkGroups == 1)
         createReduOutAccs<true>(NWorkGroups, CGH, ReduTuple, ReduIndices));

         createReduOutAccs<false>(NWorkGroups, CGH, ReduTuple, ReduIndices));
template <typename... Reductions, size_t... Is>
                                  std::tuple<Reductions...> &ReduTuple,
                                  std::index_sequence<Is...>) {
  auto ProcessOne = [&CGH](auto Redu) {
    if constexpr (!decltype(Redu)::is_usm)
      Redu.getUserRedVarAccess(CGH);

  (ProcessOne(std::get<Is>(ReduTuple)), ...);

template <bool IsOneWG, typename... Reductions, int Dims, typename... LocalAccT,
          typename... InAccT, typename... OutAccT, typename... Ts,
          typename... BOPsT, size_t... Is>
    nd_item<Dims> NDIt, size_t LID, size_t GID, size_t RemainingWorkSize,
    std::array<bool, sizeof...(Reductions)> InitToIdentityProps,
    std::index_sequence<Is...> ReduIndices) {

  if (LID < RemainingWorkSize)
    ((std::get<Is>(LocalAccsTuple)[LID] = std::get<Is>(InAccsTuple)[GID]), ...);

                           AdjustedBOPsTuple, ReduIndices);

      GrID, OutAccsTuple, LocalAccsTuple, AdjustedBOPsTuple, IdentitiesTuple,
      InitToIdentityProps, ReduIndices);

template <bool IsOneWG, typename Reduction, int Dims, typename LocalAccT,
          typename InAccT, typename OutAccT, typename T, typename BOPT>
                                size_t RemainingWorkSize, LocalAccT LocalReds,
                                InAccT In, OutAccT Out, T IdentityContainer,
                                BOPT BOp, bool IsInitializeToIdentity) {
  using element_type = typename Reduction::reducer_element_type;
  auto ElementCombiner = [&](element_type &LHS, const element_type &RHS) {
    return LHS.combine(BOp, RHS);

  auto NElements = Reduction::num_elements;
  for (size_t E = 0; E < NElements; ++E) {
    doTreeReduction<WorkSizeGuarantees::LessOrEqual>(
        RemainingWorkSize, NDIt, LocalReds, ElementCombiner,
        [&](size_t) { return In[GID * NElements + E]; });

      size_t OutIdx = GrID * NElements + E;
      if constexpr (IsOneWG) {

        if constexpr (Reduction::has_identity) {
          Out[OutIdx] = *ElementCombiner(LocalReds[0],
                                         IsInitializeToIdentity
                                             ? IdentityContainer.getIdentity()

          Out[OutIdx] = *LocalReds[0];

        Out[OutIdx] = LocalReds[0];

    if (E != NElements - 1) {

template <bool IsOneWG, typename... Reductions, int Dims, typename... LocalAccT,
          typename... InAccT, typename... OutAccT, typename... Ts,
          typename... BOPsT, size_t... Is>
    nd_item<Dims> NDIt, size_t LID, size_t GID, size_t RemainingWorkSize,
    std::array<bool, sizeof...(Reductions)> InitToIdentityProps,
    std::index_sequence<Is...>) {
  using ReductionPack = std::tuple<Reductions...>;
                             std::tuple_element_t<Is, ReductionPack>>(
       NDIt, LID, GID, RemainingWorkSize, std::get<Is>(LocalAccsTuple),
       std::get<Is>(InAccsTuple), std::get<Is>(OutAccsTuple),
       std::get<Is>(IdentitiesTuple), std::get<Is>(BOPsTuple),
       InitToIdentityProps[Is]),
namespace reduction::aux_krn {
template <class KernelName, class Predicate> struct Multi;

template <typename KernelName, typename KernelType, typename... Reductions,
                     std::tuple<Reductions...> &ReduTuple,
                     std::index_sequence<Is...> ReduIndices) {

  bool Pow2WG = (WGSize & (WGSize - 1)) == 0;
  bool HasUniformWG = Pow2WG && (NWorkGroups * WGSize == NWorkItems);

  auto ScalarIs = filterSequence<Reductions...>(ScalarPredicate, ReduIndices);

  auto ArrayIs = filterSequence<Reductions...>(ArrayPredicate, ReduIndices);

      std::get<Is>(ReduTuple).getReadAccToPreviousPartialReds(CGH)...);

  auto IdentitiesTuple =
      makeReduTupleT(std::get<Is>(ReduTuple).getIdentityContainer()...);

  std::array InitToIdentityProps{
      std::get<Is>(ReduTuple).initializeToIdentity()...};

  auto Rest = [&](auto Predicate, auto OutAccsTuple) {
    auto AccReduIndices = filterSequence<Reductions...>(Predicate, ReduIndices);

        reduction::strategy::multi,
        decltype(Predicate)>;

    range<1> GlobalRange = {HasUniformWG ? NWorkItems : NWorkGroups * WGSize};

      constexpr bool IsOneWG =
          std::is_same_v<std::remove_reference_t<decltype(Predicate)>,

      size_t RemainingWorkSize =

          NDIt, LID, GID, RemainingWorkSize, LocalAccsTuple, InAccsTuple,
          OutAccsTuple, IdentitiesTuple, BOPsTuple, InitToIdentityProps,

          NDIt, LID, GID, RemainingWorkSize, LocalAccsTuple, InAccsTuple,
          OutAccsTuple, IdentitiesTuple, BOPsTuple, InitToIdentityProps,

  if (NWorkGroups == 1)
         createReduOutAccs<true>(NWorkGroups, CGH, ReduTuple, ReduIndices));

         createReduOutAccs<false>(NWorkGroups, CGH, ReduTuple, ReduIndices));

  return sizeof(typename Reduction::result_type);

template <typename Reduction, typename... RestT>
  return sizeof(typename Reduction::result_type) +

template <typename... ReductionT, size_t... Is>
                                std::index_sequence<Is...>) {

template <typename TupleT, std::size_t... Is>
std::tuple<std::tuple_element_t<Is, TupleT>...>
  return {std::get<Is>(std::move(Tuple))...};

template <typename KernelName, int Dims, typename PropertiesT,
  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
    std::tuple<RestT...> ArgsTuple(Rest...);
    constexpr size_t NumArgs = sizeof...(RestT);
    auto ReduIndices = std::make_index_sequence<NumArgs - 1>();

      throw sycl::runtime_error("The implementation handling parallel_for with"
                                " reduction requires work group size not bigger"
                                    std::to_string(MaxWGSize),
                                PI_ERROR_INVALID_WORK_GROUP_SIZE);

    reduCGFuncMulti<KernelName>(CGH, KernelFunc, NDRange, Properties, ReduTuple,

    while (NWorkItems > 1) {
      NWorkItems = reduAuxCGFunc<KernelName, decltype(KernelFunc)>(
          AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
template <reduction::strategy Strategy>

  template <typename KernelName, int Dims, typename PropertiesT,
            typename KernelType, typename Reduction>
  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
    auto Delegate = [&](auto Impl) {
      Impl.template run<KernelName>(CGH, Queue, NDRange, Properties, Redu,

    if constexpr (Reduction::has_float64_atomics) {
      if constexpr (Reduction::has_fast_reduce)

    } else if constexpr (Reduction::has_fast_atomics) {
      if constexpr (sizeof(typename Reduction::result_type) == 8) {
        if constexpr (Reduction::has_fast_reduce)

      if constexpr (Reduction::has_fast_reduce) {

      if constexpr (Reduction::has_fast_reduce)

    assert(false && "Must be unreachable!");
  template <typename KernelName, int Dims, typename PropertiesT,
  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,

          typename PropertiesT, typename... RestT>
                               PropertiesT Properties, RestT... Rest) {
                                                    Properties, Rest...);

__SYCL_EXPORT uint32_t

          typename PropertiesT, typename... RestT>
                               PropertiesT Properties, RestT... Rest) {
  std::tuple<RestT...> ArgsTuple(Rest...);
  constexpr size_t NumArgs = sizeof...(RestT);
  static_assert(NumArgs > 1, "No reduction!");
  auto ReduIndices = std::make_index_sequence<NumArgs - 1>();

  size_t OneElemSize = [&]() {
    if constexpr (sizeof...(RestT) == 2) {
      constexpr bool IsTreeReduction =
          !Reduction::has_fast_reduce && !Reduction::has_fast_atomics;
      return IsTreeReduction ? sizeof(typename Reduction::result_type) : 0;

  uint32_t NumConcurrentWorkGroups =
#ifdef __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS
      __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS;

  size_t NWorkItems = Range.size();
  size_t WGSize = std::min(NWorkItems, PrefWGSize);
  size_t NWorkGroups = NWorkItems / WGSize;
  if (NWorkItems % WGSize)

  size_t MaxNWorkGroups = NumConcurrentWorkGroups;
  NWorkGroups = std::min(NWorkGroups, MaxNWorkGroups);
  size_t NDRItems = NWorkGroups * WGSize;

  size_t PerGroup = Range.size() / NWorkGroups;

  auto UpdatedKernelFunc = [=](auto NDId, auto &...Reducers) {
    auto Group = NDId.get_group();
    size_t GroupId = Group.get_group_linear_id();
    size_t NumGroups = Group.get_group_linear_range();
    bool LastGroup = (GroupId == NumGroups - 1);
    size_t GroupStart = GroupId * PerGroup;
    size_t GroupEnd = LastGroup ? Range.size() : (GroupStart + PerGroup);

    size_t Start = GroupStart + NDId.get_local_id(0);
    size_t End = GroupEnd;
    size_t Stride = NDId.get_local_range(0);
    auto GetDelinearized = [&](size_t I) {
                               decltype(Reducers)...>)

    for (size_t I = Start; I < End; I += Stride)

  if constexpr (NumArgs == 2) {
    auto &Redu = std::get<0>(ReduTuple);

    constexpr auto StrategyToUse = [&]() {
      if constexpr (Strategy != reduction::strategy::auto_select)

      if constexpr (Reduction::has_fast_reduce && Reduction::has_identity)
        return reduction::strategy::group_reduce_and_last_wg_detection;
      else if constexpr (Reduction::has_fast_atomics &&
                         sizeof(typename Reduction::result_type) != 8)
        return reduction::strategy::local_atomic_and_atomic_cross_wg;

      return reduction::strategy::range_basic;

    reduction_parallel_for<KernelName, StrategyToUse>(CGH, NDRange, Properties,
                                                      Redu, UpdatedKernelFunc);

        [&](auto &...Reds) {
          return reduction_parallel_for<KernelName, Strategy>(
              CGH, NDRange, Properties, Reds..., UpdatedKernelFunc);
template <typename T, typename AllocatorT, typename BinaryOperation>
          BinaryOperation Combiner, const property_list &PropList = {}) {
      PropList.has_property<property::reduction::initialize_to_identity>();
  return detail::make_reduction<BinaryOperation, 0, 1, false>(

template <typename T, typename BinaryOperation>
      PropList.has_property<property::reduction::initialize_to_identity>();
  return detail::make_reduction<BinaryOperation, 0, 1, false>(

template <typename T, typename AllocatorT, typename BinaryOperation>
          BinaryOperation Combiner, const property_list &PropList = {}) {
      PropList.has_property<property::reduction::initialize_to_identity>();
  return detail::make_reduction<BinaryOperation, 0, 1, true>(

template <typename T, typename BinaryOperation>
      PropList.has_property<property::reduction::initialize_to_identity>();
  return detail::make_reduction<BinaryOperation, 0, 1, true>(

template <typename T, size_t Extent, typename BinaryOperation,
          typename = std::enable_if_t<Extent != dynamic_extent>>
      PropList.has_property<property::reduction::initialize_to_identity>();
  return detail::make_reduction<BinaryOperation, 1, Extent, false>(

template <typename T, size_t Extent, typename BinaryOperation,
          typename = std::enable_if_t<Extent != dynamic_extent>>
          BinaryOperation Combiner, const property_list &PropList = {}) {
      PropList.has_property<property::reduction::initialize_to_identity>();
  return detail::make_reduction<BinaryOperation, 1, Extent, true>(
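// Usage sketch (illustrative, not part of this header): a minimal sum
// reduction driven through the factory overloads above. The queue `Q`, the
// data vector and the buffer names are assumptions for the example.
//
//   #include <sycl/sycl.hpp>
//
//   sycl::queue Q;
//   std::vector<int> Data(1024, 1);
//   int Sum = 0;
//   {
//     sycl::buffer<int> DataBuf(Data);
//     sycl::buffer<int> SumBuf(&Sum, 1);
//     Q.submit([&](sycl::handler &CGH) {
//       sycl::accessor In(DataBuf, CGH, sycl::read_only);
//       auto Red = sycl::reduction(
//           SumBuf, CGH, sycl::plus<int>(),
//           sycl::property_list{
//               sycl::property::reduction::initialize_to_identity{}});
//       CGH.parallel_for(sycl::range<1>{Data.size()}, Red,
//                        [=](sycl::id<1> I, auto &Sum) { Sum += In[I]; });
//     });
//   } // SumBuf's destruction copies the result back into `Sum`.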