57 #include <type_traits>
62 inline namespace _V1 {
76 template <
typename T,
class BinaryOperation,
int Dims,
size_t Extent,
77 typename IdentityContainerT,
bool View =
false,
typename Subst =
void>
89 template <
typename T,
class BinaryOperation>
91 #ifdef SYCL_REDUCTION_DETERMINISTIC
92 std::bool_constant<false>;
94 std::bool_constant<((is_sgenfloat_v<T> &&
sizeof(T) == 4) ||
112 template <
typename T,
class BinaryOperation>
114 #ifdef SYCL_REDUCTION_DETERMINISTIC
115 std::bool_constant<false>;
120 is_sgenfloat_v<T> &&
sizeof(T) == 8>;
128 template <
typename T,
class BinaryOperation>
130 #ifdef SYCL_REDUCTION_DETERMINISTIC
131 std::bool_constant<false>;
134 (is_sgeninteger_v<T> && (
sizeof(T) == 4 ||
sizeof(T) == 8)) ||
144 template <
typename... Ts>
using ReduTupleT = sycl::detail::tuple<Ts...>;
150 size_t LocalMemBytesPerWorkItem);
152 size_t &NWorkGroups);
154 size_t LocalMemBytesPerWorkItem);
156 template <
typename T,
class BinaryOperation,
bool IsOptional>
163 template <
typename T,
class BinaryOperation,
int Dims, std::size_t Extent,
164 typename IdentityContainerT,
bool View,
typename Subst>
166 IdentityContainerT, View, Subst>> {
168 using op = BinaryOperation;
169 static constexpr
int dims = Dims;
170 static constexpr
size_t extent = Extent;
171 static constexpr
bool has_identity = IdentityContainerT::has_identity;
180 template <
typename ReducerRelayT = ReducerT>
auto &
getElement(
size_t E) {
181 return MReducerRef.getElement(E);
184 template <
typename ReducerRelayT = ReducerT> constexpr
auto getIdentity() {
186 "Identity unavailable.");
187 return MReducerRef.getIdentity();
193 template <
typename ReducerRelayT = ReducerT>
198 "Static identity unavailable.");
199 return ReducerT::getIdentity();
203 ReducerT &MReducerRef;
230 template <
typename _T = Ty,
int _Dims = Dims>
235 return static_cast<Reducer *
>(
this)->combine(
static_cast<_T
>(1));
238 template <
typename _T = Ty,
int _Dims = Dims>
243 return static_cast<Reducer *
>(
this)->combine(
static_cast<_T
>(1));
246 template <
typename _T = Ty,
int _Dims = Dims>
248 operator+=(
const _T &Partial) {
249 return static_cast<Reducer *
>(
this)->combine(Partial);
252 template <
typename _T = Ty,
int _Dims = Dims>
254 operator*=(
const _T &Partial) {
255 return static_cast<Reducer *
>(
this)->combine(Partial);
258 template <
typename _T = Ty,
int _Dims = Dims>
260 operator|=(
const _T &Partial) {
261 return static_cast<Reducer *
>(
this)->combine(Partial);
264 template <
typename _T = Ty,
int _Dims = Dims>
266 operator^=(
const _T &Partial) {
267 return static_cast<Reducer *
>(
this)->combine(Partial);
270 template <
typename _T = Ty,
int _Dims = Dims>
272 operator&=(
const _T &Partial) {
273 return static_cast<Reducer *
>(
this)->combine(Partial);
277 template <access::address_space Space>
284 template <access::address_space Space,
class T,
class AtomicFunctor>
285 void atomic_combine_impl(T *ReduVarPtr, AtomicFunctor Functor)
const {
286 auto reducer =
static_cast<const Reducer *
>(
this);
287 for (
size_t E = 0; E < Extent; ++E) {
295 getMemoryScope<Space>(), Space>(
296 address_space_cast<Space, access::decorated::no>(ReduVarPtr)[E]);
297 Functor(std::move(AtomicRef), *ReducerElem);
301 template <
class _T, access::address_space Space,
class BinaryOp>
302 static constexpr
bool BasicCheck =
303 std::is_same_v<remove_decoration_t<_T>, Ty> &&
310 typename _T = Ty,
class _BinaryOperation = BinaryOp>
311 std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
312 (IsReduOptForFastAtomicFetch<_T, _BinaryOperation>::value ||
313 IsReduOptForAtomic64Op<_T, _BinaryOperation>::value) &&
314 IsPlus<_T, _BinaryOperation>::value>
316 atomic_combine_impl<Space>(
317 ReduVarPtr, [](
auto &&Ref,
auto Val) {
return Ref.fetch_add(Val); });
322 typename _T = Ty,
class _BinaryOperation = BinaryOp>
323 std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
327 atomic_combine_impl<Space>(
328 ReduVarPtr, [](
auto &&Ref,
auto Val) {
return Ref.fetch_or(Val); });
333 typename _T = Ty,
class _BinaryOperation = BinaryOp>
334 std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
338 atomic_combine_impl<Space>(
339 ReduVarPtr, [](
auto &&Ref,
auto Val) {
return Ref.fetch_xor(Val); });
344 typename _T = Ty,
class _BinaryOperation = BinaryOp>
345 std::enable_if_t<std::is_same_v<remove_decoration_t<_T>, _T> &&
351 atomic_combine_impl<Space>(
352 ReduVarPtr, [](
auto &&Ref,
auto Val) {
return Ref.fetch_and(Val); });
357 typename _T = Ty,
class _BinaryOperation = BinaryOp>
358 std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
363 atomic_combine_impl<Space>(
364 ReduVarPtr, [](
auto &&Ref,
auto Val) {
return Ref.fetch_min(Val); });
369 typename _T = Ty,
class _BinaryOperation = BinaryOp>
370 std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
375 atomic_combine_impl<Space>(
376 ReduVarPtr, [](
auto &&Ref,
auto Val) {
return Ref.fetch_max(Val); });
382 template <
typename T,
class BinaryOperation,
bool ExplicitIdentity,
383 typename CondT =
void>
385 static_assert(!std::is_same_v<T, T>,
386 "Partial specializations don't cover all possible options!");
390 template <
typename T,
class BinaryOperation,
bool ExplicitIdentity>
392 T, BinaryOperation, ExplicitIdentity,
395 static constexpr
bool has_identity =
true;
407 template <
typename T,
class BinaryOperation>
409 T, BinaryOperation, true,
412 static constexpr
bool has_identity =
true;
426 template <
typename T,
class BinaryOperation>
428 T, BinaryOperation, false,
429 std::enable_if_t<!IsKnownIdentityOp<T, BinaryOperation>::value>> {
431 static constexpr
bool has_identity =
false;
436 template <
typename T,
class BinaryOperation,
bool IsOptional>
438 using value_type = std::conditional_t<IsOptional, std::optional<T>, T>;
440 template <
bool ExplicitIdentity>
441 constexpr value_type GetInitValue(
443 &IdentityContainer) {
444 constexpr
bool ContainerHasIdentity =
446 ExplicitIdentity>::has_identity;
447 static_assert(IsOptional || ContainerHasIdentity);
448 if constexpr (!ContainerHasIdentity)
451 return IdentityContainer.getIdentity();
458 template <
bool ExplicitIdentity>
462 : MValue(GetInitValue(IdentityContainer)) {}
465 if constexpr (IsOptional)
466 MValue = MValue ? BinOp(*MValue, OtherValue) : OtherValue;
468 MValue = BinOp(MValue, OtherValue);
473 if constexpr (IsOptional) {
475 return combine(BinOp, *Other.MValue);
479 return combine(BinOp, Other.MValue);
484 if constexpr (IsOptional)
490 if constexpr (IsOptional)
496 constexpr
explicit operator bool()
const {
497 if constexpr (IsOptional)
498 return MValue.has_value();
512 template <
typename T,
class BinaryOperation,
bool IsOptional>
514 detail::ReducerElement<T, BinaryOperation, IsOptional>>
527 template <
class BinaryOperation,
typename IdentityContainerT>
530 const BinaryOperation
BOp;
541 template <
typename T,
class BinaryOperation,
int Dims,
size_t Extent,
542 typename IdentityContainerT,
bool View>
544 T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
545 std::enable_if_t<Dims == 0 && Extent == 1 && View == false &&
546 !detail::IsKnownIdentityOp<T, BinaryOperation>::value>>
548 reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
550 Dims == 0 && Extent == 1 && View == false &&
551 !detail::IsKnownIdentityOp<T, BinaryOperation>::value>>>,
553 static constexpr
bool has_identity = IdentityContainerT::has_identity;
558 reducer(
const IdentityContainerT &IdentityContainer, BinaryOperation BOp)
559 : MValue(IdentityContainer), MIdentity(IdentityContainer),
563 :
reducer(Token.IdentityContainer, Token.BOp) {}
571 MValue.combine(MBinaryOp, Partial);
575 template <
bool HasIdentityRelay = has_
identity>
576 std::enable_if_t<HasIdentityRelay && (HasIdentityRelay == has_identity), T>
578 return MIdentity.getIdentity();
585 const element_type &getElement(
size_t)
const {
return MValue; }
587 detail::ReducerElement<T, BinaryOperation, !has_identity> MValue;
588 const IdentityContainerT MIdentity;
589 BinaryOperation MBinaryOp;
597 template <
typename T,
class BinaryOperation,
int Dims,
size_t Extent,
598 typename IdentityContainerT,
bool View>
600 T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
601 std::enable_if_t<Dims == 0 && Extent == 1 && View == false &&
602 detail::IsKnownIdentityOp<T, BinaryOperation>::value>>
604 reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
606 Dims == 0 && Extent == 1 && View == false &&
607 detail::IsKnownIdentityOp<T, BinaryOperation>::value>>>,
609 static constexpr
bool has_identity = IdentityContainerT::has_identity;
615 reducer(
const IdentityContainerT & , BinaryOperation)
616 : MValue(getIdentity()) {}
619 :
reducer(Token.IdentityContainer, Token.BOp) {}
628 MValue.combine(BOp, Partial);
637 static constexpr T getIdentity() {
641 element_type &getElement(
size_t) {
return MValue; }
642 const element_type &getElement(
size_t)
const {
return MValue; }
643 detail::ReducerElement<T, BinaryOperation, !has_identity> MValue;
648 template <
typename T,
class BinaryOperation,
int Dims,
size_t Extent,
649 typename IdentityContainerT,
bool View>
650 class reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
651 std::enable_if_t<Dims == 0 && View == true>>
653 reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
654 std::enable_if_t<Dims == 0 && View == true>>>,
656 static constexpr
bool has_identity = IdentityContainerT::has_identity;
662 : MElement(Ref), MBinaryOp(BOp) {}
665 :
reducer(Token.IdentityContainer, Token.BOp) {}
673 MElement.combine(MBinaryOp, Partial);
681 const element_type &getElement(
size_t)
const {
return MElement; }
683 element_type &MElement;
684 BinaryOperation MBinaryOp;
689 template <
typename T,
class BinaryOperation,
int Dims,
size_t Extent,
690 typename IdentityContainerT,
bool View>
692 T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
693 std::enable_if_t<Dims == 1 && View == false &&
694 !detail::IsKnownIdentityOp<T, BinaryOperation>::value>>
696 reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
698 Dims == 1 && View == false &&
699 !detail::IsKnownIdentityOp<T, BinaryOperation>::value>>>,
701 static constexpr
bool has_identity = IdentityContainerT::has_identity;
706 reducer(
const IdentityContainerT &IdentityContainer, BinaryOperation BOp)
707 : MValue(IdentityContainer), MIdentity(IdentityContainer),
711 :
reducer(Token.IdentityContainer, Token.BOp) {}
718 reducer<T, BinaryOperation, Dims - 1, Extent, IdentityContainerT,
true>
720 return {MValue[Index], MBinaryOp};
723 template <
bool HasIdentityRelay = has_
identity>
724 std::enable_if_t<HasIdentityRelay && (HasIdentityRelay == has_identity), T>
726 return MIdentity.getIdentity();
732 element_type &getElement(
size_t E) {
return MValue[E]; }
733 const element_type &getElement(
size_t E)
const {
return MValue[E]; }
736 const IdentityContainerT MIdentity;
737 BinaryOperation MBinaryOp;
742 template <
typename T,
class BinaryOperation,
int Dims,
size_t Extent,
743 typename IdentityContainerT,
bool View>
745 T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
746 std::enable_if_t<Dims == 1 && View == false &&
747 detail::IsKnownIdentityOp<T, BinaryOperation>::value>>
749 reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
751 Dims == 1 && View == false &&
752 detail::IsKnownIdentityOp<T, BinaryOperation>::value>>>,
754 static constexpr
bool has_identity = IdentityContainerT::has_identity;
760 reducer(
const IdentityContainerT & , BinaryOperation)
761 : MValue(getIdentity()) {}
764 :
reducer(Token.IdentityContainer, Token.BOp) {}
773 reducer<T, BinaryOperation, Dims - 1, Extent, IdentityContainerT,
true>
775 return {MValue[Index], BinaryOperation()};
783 static constexpr T getIdentity() {
787 element_type &getElement(
size_t E) {
return MValue[E]; }
788 const element_type &getElement(
size_t E)
const {
return MValue[E]; }
790 marray<element_type, Extent> MValue;
803 accessor<T, AccessorDims,
Mode, access::target::device, IsPH, PropList>> {
804 static constexpr
int value = AccessorDims;
812 template <
class T,
int Dims,
typename AllocatorT>
825 template <
class KernelName>
827 std::conditional_t<std::is_same_v<KernelName, auto_name>,
auto_name,
830 template <
typename T,
class BinaryOperation,
int Dims,
size_t Extent,
831 bool ExplicitIdentity,
typename RedOutVar>
834 ExplicitIdentity, RedOutVar>;
837 static constexpr T chooseIdentity(
const T &Identity) {
873 static constexpr
size_t dims = Dims;
881 static constexpr
bool is_usm = std::is_same_v<RedOutVar, T *>;
887 : MIdentityContainer(chooseIdentity(Identity)), MBinaryOp(BinaryOp),
888 InitializeToIdentity(Init), MRedOut(
std::move(RedOut)) {}
890 template <
typename RelayT = T,
891 typename RelayBinaryOperation = BinaryOperation>
893 BinaryOperation BinaryOp,
bool Init, RedOutVar RedOut,
897 MBinaryOp(BinaryOp), InitializeToIdentity(Init),
898 MRedOut(
std::move(RedOut)) {}
900 template <
typename RelayT = T,
901 typename RelayBinaryOperation = BinaryOperation>
903 BinaryOperation BinaryOp,
bool Init, RedOutVar RedOut,
906 : MIdentityContainer(), MBinaryOp(BinaryOp), InitializeToIdentity(Init),
907 MRedOut(
std::move(RedOut)) {}
910 CGH.addReduction(MOutBufPtr);
914 template <
bool IsOneWG>
918 if constexpr (IsOneWG) {
922 std::make_shared<buffer<reducer_element_type, 1>>(
range<1>(Size));
923 CGH.addReduction(MOutBufPtr);
929 auto Buffer = std::make_shared<buffer<_T, 1>>(
range<1>(Size));
930 CGH.addReduction(Buffer);
942 "Unexpected size of reducer element type.");
948 auto ReinterpretRedOut =
949 MRedOut.template reinterpret<reducer_element_type>();
950 return accessor{ReinterpretRedOut, CGH};
958 std::make_shared<buffer<reducer_element_type, 1>>(
range<1>(Size));
959 CGH.addReduction(MOutBufPtr);
971 template <
typename KernelName,
typename FuncTy,
976 auto DoIt = [&](
auto &Out) {
977 auto RWReduVal = std::make_shared<std::array<T, num_elements>>();
979 (*RWReduVal)[i] = decltype(MIdentityContainer)::getIdentity();
981 auto Buf = std::make_shared<buffer<T, 1>>(RWReduVal.get()->data(),
983 Buf->set_final_data();
993 Buf->template get_access<access::mode::read_write>(CopyHandler);
996 CopyHandler.addReduction(RWReduVal);
997 CopyHandler.addReduction(Buf);
1009 for (
size_t i = 0; i < NElements; ++i) {
1010 if (IsUpdateOfUserVar)
1011 Out[i] = BOp(Out[i], Mem[i]);
1018 CopyHandler.
copy(Mem, OutAcc);
1036 template <
typename KernelName,
typename FuncTy,
1041 "Initialize to identity not allowed for identity-less reductions.");
1046 return MIdentityContainer;
1052 auto CounterMem = std::make_shared<int>(0);
1053 CGH.addReduction(CounterMem);
1054 auto CounterBuf = std::make_shared<buffer<int, 1>>(CounterMem.get(), 1);
1055 CounterBuf->set_final_data();
1056 CGH.addReduction(CounterBuf);
1057 return {*CounterBuf, CGH};
1063 queue q = createSyclObjFromImpl<queue>(CGH.MQueue);
1065 auto Deleter = [=](
auto *Ptr) {
free(Ptr, q); };
1067 std::shared_ptr<int> Counter(malloc_device<int>(1, q), Deleter);
1068 CGH.addReduction(Counter);
1070 auto Event = q.
memset(Counter.get(), 0,
sizeof(
int));
1073 return Counter.get();
1094 std::shared_ptr<buffer<reducer_element_type, 1>> MOutBufPtr;
1096 BinaryOperation MBinaryOp;
1097 bool InitializeToIdentity;
1105 template <
typename T,
class BinaryOperation,
int Dims,
size_t Extent,
1106 bool ExplicitIdentity,
typename RedOutVar>
1110 ExplicitIdentity, RedOutVar> {
1113 ExplicitIdentity, RedOutVar>;
1115 ExplicitIdentity, RedOutVar>;
1122 static_assert(Dims <= 1,
"Multi-dimensional reductions are not supported.");
1126 template <
bool ExplicitIdentityRelay = ExplicitIdentity,
1127 typename = std::enable_if_t<!ExplicitIdentityRelay>>
1129 bool InitializeToIdentity =
false)
1132 if (Var.size() != 1)
1134 "Reduction variable must be a scalar.");
1138 "initialize_to_identity property cannot be "
1139 "used with identityless reductions.");
1144 template <
bool ExplicitIdentityRelay = ExplicitIdentity,
1145 typename = std::enable_if_t<ExplicitIdentityRelay>>
1147 bool InitializeToIdentity)
1150 if (Var.size() != 1)
1152 "Reduction variable must be a scalar.");
1156 template <
class BinaryOp,
int Dims,
size_t Extent,
bool ExplicitIdentity,
1157 typename RedOutVar,
typename... RestTy>
1160 Extent, ExplicitIdentity, RedOutVar>{
1161 RedVar, std::forward<RestTy>(Rest)...};
1167 event E = CGH.finalize();
1168 handler AuxHandler(CGH.MQueue, CGH.MIsHost);
1170 AuxHandler.saveCodeLoc(CGH.MCodeLoc);
1172 CGH.MLastEvent = AuxHandler.finalize();
1182 template <
typename KernelName,
class Reduction>
1184 static_assert(Reduction::is_usm,
1185 "All implementations using this helper are expected to have "
1186 "USM reduction, not a buffer-based one.");
1187 size_t NElements = Reduction::num_elements;
1188 auto InAcc = Redu.getReadAccToPreviousPartialReds(CGH);
1189 auto UserVarPtr = Redu.getUserRedVarAccess(CGH);
1190 bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
1191 auto BOp = Redu.getBinaryOperation();
1193 for (
size_t i = 0; i < NElements; ++i) {
1194 auto Elem = InAcc[i];
1195 if (IsUpdateOfUserVar)
1196 UserVarPtr[i] = BOp(UserVarPtr[i], *Elem);
1198 UserVarPtr[i] = *Elem;
1217 std::conditional_t<std::is_same_v<KernelName, auto_name>,
auto_name,
1218 MainOrAux<KernelName, Strategy, Ts...>>;
1226 template <
typename KernelName,
int Dims,
typename PropertiesT,
1227 typename KernelType,
typename Reduction>
1228 static void run(
handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1230 Reduction &Redu, KernelType &KernelFunc) {
1231 static_assert(Reduction::has_identity,
1232 "Identityless reductions are not supported by the "
1233 "local_atomic_and_atomic_cross_wg strategy.");
1235 std::ignore = Queue;
1239 Redu.template withInitializedMem<Name>(CGH, [&](
auto Out) {
1240 size_t NElements = Reduction::num_elements;
1246 typename Reduction::reducer_type Reducer;
1247 KernelFunc(NDId, Reducer);
1250 auto LID = NDId.get_local_id(0);
1251 for (
size_t E = LID; E < NElements; E += NDId.get_local_range(0)) {
1257 Reducer.template atomic_combine<access::address_space::local_space>(
1264 for (
size_t E = 0; E < NElements; ++E) {
1267 Reducer.template atomic_combine(&Out[0]);
1276 reduction::strategy::group_reduce_and_last_wg_detection> {
1277 template <
typename KernelName,
int Dims,
typename PropertiesT,
1278 typename KernelType,
typename Reduction>
1279 static void run(
handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1281 Reduction &Redu, KernelType &KernelFunc) {
1282 static_assert(Reduction::has_identity,
1283 "Identityless reductions are not supported by the "
1284 "group_reduce_and_last_wg_detection strategy.");
1286 std::ignore = Queue;
1287 size_t NElements = Reduction::num_elements;
1291 auto Out = Redu.getUserRedVarAccess(CGH);
1293 auto &PartialSumsBuf = Redu.getTempBuffer(NWorkGroups * NElements, CGH);
1296 bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
1297 auto Rest = [&](
auto NWorkGroupsFinished) {
1303 decltype(NWorkGroupsFinished)>;
1307 typename Reduction::reducer_type Reducer;
1308 KernelFunc(NDId, Reducer);
1310 typename Reduction::binary_operation BOp;
1311 auto Group = NDId.get_group();
1315 size_t LID = NDId.get_local_id(0);
1316 for (
size_t E = 0; E < NElements; ++E) {
1320 if (NWorkGroups == 1) {
1323 if (IsUpdateOfUserVar)
1324 RedElem = BOp(RedElem, Out[E]);
1327 PartialSums[NDId.get_group_linear_id() * NElements + E] =
1333 if (NWorkGroups == 1)
1345 NWorkGroupsFinished[0]);
1346 DoReducePartialSumsInLastWG[0] =
1347 ++NFinished ==
static_cast<int>(NWorkGroups);
1351 if (DoReducePartialSumsInLastWG[0]) {
1354 for (
size_t E = 0; E < NElements; ++E) {
1356 for (
size_t I = LID; I < NWorkGroups; I += WGSize)
1357 LocalSum = BOp(LocalSum, PartialSums[I * NElements + E]);
1361 if (IsUpdateOfUserVar)
1362 Result = BOp(Result, Out[E]);
1375 !
device.
has(aspect::usm_device_allocations))
1376 Rest(Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH));
1378 Rest(Redu.getGroupsCounterAccDiscrete(CGH));
1388 while ((N >>= 1) != 0)
1393 template <
typename FuncTy>
1409 if (Pivot != WorkSize) {
1410 if (Pivot + LID < WorkSize)
1411 Func(LID, Pivot + LID);
1416 for (
size_t CurPivot = Pivot >> 1; CurPivot > 0; CurPivot >>= 1) {
1418 Func(LID, CurPivot + LID);
1427 typename BinOpTy,
typename AccessFuncTy>
1429 BinOpTy &BOp, AccessFuncTy AccessFunc) {
1431 size_t AdjustedWorkSize;
1438 LocalReds[LID] = AccessFunc(LID);
1439 AdjustedWorkSize = WorkSize;
1444 AdjustedWorkSize = std::min(WorkSize, WGSize);
1445 if (LID < AdjustedWorkSize) {
1446 auto LocalSum = AccessFunc(LID);
1447 for (
size_t I = LID + WGSize; I < WorkSize; I += WGSize)
1448 LocalSum = BOp(LocalSum, AccessFunc(I));
1450 LocalReds[LID] = LocalSum;
1454 LocalReds[I] = BOp(LocalReds[I], LocalReds[J]);
1462 template <
typename... LocalAccT,
typename... BOPsT,
size_t... Is>
1466 std::index_sequence<Is...>) {
1468 auto ProcessOne = [=](
auto &LocalAcc,
auto &BOp) {
1469 LocalAcc[I] = BOp(LocalAcc[I], LocalAcc[J]);
1471 (ProcessOne(std::get<Is>(LocalAccs), std::get<Is>(BOPs)), ...);
1476 template <
typename KernelName,
int Dims,
typename PropertiesT,
1477 typename KernelType,
typename Reduction>
1478 static void run(
handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1480 Reduction &Redu, KernelType &KernelFunc) {
1481 using reducer_type =
typename Reduction::reducer_type;
1488 constexpr
bool UsePartialSumForOutput =
1489 !Reduction::is_usm && Reduction::has_identity;
1491 std::ignore = Queue;
1492 size_t NElements = Reduction::num_elements;
1496 bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
1498 Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);
1500 if constexpr (UsePartialSumForOutput)
1501 return (NWorkGroups == 1)
1503 : Redu.getWriteAccForPartialReds(NElements, CGH);
1505 return Redu.getUserRedVarAccess(CGH);
1508 auto NWorkGroupsFinished =
1509 Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH);
1512 auto IdentityContainer = Redu.getIdentityContainer();
1513 auto BOp = Redu.getBinaryOperation();
1520 reducer_type Reducer = reducer_type(IdentityContainer, BOp);
1521 KernelFunc(NDId, Reducer);
1523 auto ElementCombiner = [&](element_type &LHS,
const element_type &RHS) {
1524 return LHS.combine(BOp, RHS);
1529 size_t LID = NDId.get_local_linear_id();
1530 for (
size_t E = 0; E < NElements; ++E) {
1532 doTreeReduction<WorkSizeGuarantees::Equal>(
1533 WGSize, NDId, LocalReds, ElementCombiner,
1537 auto V = LocalReds[0];
1539 bool IsOneWG = NWorkGroups == 1;
1540 if (IsOneWG && IsUpdateOfUserVar)
1541 V.combine(BOp, Out[E]);
1545 if (UsePartialSumForOutput || !IsOneWG)
1546 PartialSums[NDId.get_group_linear_id() * NElements + E] = V;
1560 NWorkGroupsFinished[0]);
1561 DoReducePartialSumsInLastWG[0] =
1562 ++NFinished == NWorkGroups && NWorkGroups > 1;
1566 if (DoReducePartialSumsInLastWG[0]) {
1569 for (
size_t E = 0; E < NElements; ++E) {
1570 doTreeReduction<WorkSizeGuarantees::None>(
1571 NWorkGroups, NDId, LocalReds, ElementCombiner,
1572 [&](
size_t I) {
return PartialSums[I * NElements + E]; });
1574 auto V = LocalReds[0];
1575 if (IsUpdateOfUserVar)
1576 V.combine(BOp, Out[E]);
1587 template <
typename KernelName,
int Dims,
typename PropertiesT,
1588 typename KernelType,
typename Reduction>
1589 static void run(
handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1591 Reduction &Redu, KernelType &KernelFunc) {
1592 static_assert(Reduction::has_identity,
1593 "Identityless reductions are not supported by the "
1594 "group_reduce_and_atomic_cross_wg strategy.");
1596 std::ignore = Queue;
1600 Redu.template withInitializedMem<Name>(CGH, [&](
auto Out) {
1601 size_t NElements = Reduction::num_elements;
1605 typename Reduction::reducer_type Reducer;
1606 KernelFunc(NDIt, Reducer);
1608 typename Reduction::binary_operation BOp;
1609 for (
size_t E = 0; E < NElements; ++E) {
1613 if (NDIt.get_local_linear_id() == 0)
1614 Reducer.atomic_combine(&Out[0]);
1622 reduction::strategy::local_mem_tree_and_atomic_cross_wg> {
1623 template <
typename KernelName,
int Dims,
typename PropertiesT,
1624 typename KernelType,
typename Reduction>
1625 static void run(
handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1627 Reduction &Redu, KernelType &KernelFunc) {
1628 using reducer_type =
typename Reduction::reducer_type;
1631 std::ignore = Queue;
1635 Redu.template withInitializedMem<Name>(CGH, [&](
auto Out) {
1636 size_t NElements = Reduction::num_elements;
1645 reducer_type Reducer;
1646 KernelFunc(NDIt, Reducer);
1648 size_t WGSize = NDIt.get_local_range().size();
1649 size_t LID = NDIt.get_local_linear_id();
1651 typename Reduction::binary_operation BOp;
1652 auto ElementCombiner = [&](element_type &LHS,
const element_type &RHS) {
1653 return LHS.combine(BOp, RHS);
1658 for (
size_t E = 0; E < NElements; ++E) {
1660 doTreeReduction<WorkSizeGuarantees::Equal>(
1661 WGSize, NDIt, LocalReds, ElementCombiner,
1668 if (E != NElements - 1) {
1674 Reducer.atomic_combine(&Out[0]);
1683 reduction::strategy::group_reduce_and_multiple_kernels> {
1684 template <
typename KernelName,
int Dims,
typename PropertiesT,
1685 typename KernelType,
typename Reduction>
1686 static void run(
handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1688 Reduction &Redu, KernelType &KernelFunc) {
1689 static_assert(Reduction::has_identity,
1690 "Identityless reductions are not supported by the "
1691 "group_reduce_and_multiple_kernels strategy.");
1701 constexpr
bool HFR = Reduction::has_fast_reduce;
1702 size_t OneElemSize = HFR ? 0 :
sizeof(
typename Reduction::result_type);
1709 "The implementation handling parallel_for with"
1710 " reduction requires work group size not bigger"
1712 std::to_string(MaxWGSize));
1714 size_t NElements = Reduction::num_elements;
1716 auto Out = Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);
1718 bool IsUpdateOfUserVar =
1719 !Reduction::is_usm && !Redu.initializeToIdentity() && NWorkGroups == 1;
1727 typename Reduction::reducer_type Reducer;
1728 KernelFunc(NDIt, Reducer);
1731 size_t WGID = NDIt.get_group_linear_id();
1732 typename Reduction::binary_operation BOp;
1733 for (
size_t E = 0; E < NElements; ++E) {
1734 typename Reduction::result_type PSum;
1737 if (NDIt.get_local_linear_id() == 0) {
1738 if (IsUpdateOfUserVar)
1739 PSum = BOp(*Out[E], PSum);
1740 Out[WGID * NElements + E] = PSum;
1754 "The implementation handling parallel_for with "
1755 "reduction requires the maximal work group "
1756 "size to be greater than 1 to converge. "
1757 "The maximal work group size depends on the "
1758 "device and the size of the objects passed to "
1761 while (NWorkItems > 1) {
1763 size_t NElements = Reduction::num_elements;
1770 bool HasUniformWG = NWorkGroups * WGSize == NWorkItems;
1771 if (!Reduction::has_fast_reduce)
1772 HasUniformWG = HasUniformWG && (WGSize & (WGSize - 1)) == 0;
1776 auto In = Redu.getReadAccToPreviousPartialReds(AuxHandler);
1778 Redu.getWriteAccForPartialReds(NWorkGroups * NElements, AuxHandler);
1784 bool IsUpdateOfUserVar = !Reduction::is_usm &&
1785 !Redu.initializeToIdentity() &&
1787 range<1> GlobalRange = {HasUniformWG ? NWorkItems
1788 : NWorkGroups * WGSize};
1791 typename Reduction::binary_operation BOp;
1792 size_t WGID = NDIt.get_group_linear_id();
1793 size_t GID = NDIt.get_global_linear_id();
1795 for (
size_t E = 0; E < NElements; ++E) {
1796 typename Reduction::result_type PSum =
1797 (HasUniformWG || (GID < NWorkItems))
1798 ? *In[GID * NElements + E]
1802 if (NDIt.get_local_linear_id() == 0) {
1803 if (IsUpdateOfUserVar)
1804 PSum = BOp(*Out[E], PSum);
1805 Out[WGID * NElements + E] = PSum;
1809 NWorkItems = NWorkGroups;
1813 if constexpr (Reduction::is_usm) {
1815 reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
1822 template <
typename KernelName,
int Dims,
typename PropertiesT,
1823 typename KernelType,
typename Reduction>
1824 static void run(
handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1826 Reduction &Redu, KernelType &KernelFunc) {
1827 using element_type =
typename Reduction::reducer_element_type;
1829 constexpr
bool HFR = Reduction::has_fast_reduce;
1830 size_t OneElemSize = HFR ? 0 :
sizeof(element_type);
1837 "The implementation handling parallel_for with"
1838 " reduction requires work group size not bigger"
1840 std::to_string(MaxWGSize));
1844 bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
1845 std::ignore = IsUpdateOfUserVar;
1850 auto First = [&](
auto KernelTag) {
1852 constexpr
bool IsOneWG =
1853 std::is_same_v<std::remove_reference_t<decltype(KernelTag)>,
1856 constexpr
size_t NElements = Reduction::num_elements;
1861 if constexpr (IsOneWG)
1862 return Redu.getUserRedVarAccess(CGH);
1864 return Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);
1870 auto BOp = Redu.getBinaryOperation();
1871 auto IdentityContainer = Redu.getIdentityContainer();
1875 decltype(KernelTag)>;
1879 typename Reduction::reducer_type Reducer =
1880 typename Reduction::reducer_type(IdentityContainer, BOp);
1881 KernelFunc(NDIt, Reducer);
1883 size_t WGSize = NDIt.get_local_range().size();
1884 size_t LID = NDIt.get_local_linear_id();
1886 auto ElementCombiner = [&](element_type &LHS,
const element_type &RHS) {
1887 return LHS.combine(BOp, RHS);
1892 for (
size_t E = 0; E < NElements; ++E) {
1894 doTreeReduction<WorkSizeGuarantees::Equal>(
1895 WGSize, NDIt, LocalReds, ElementCombiner,
1900 element_type PSum = LocalReds[0];
1901 if constexpr (IsOneWG) {
1902 if (IsUpdateOfUserVar)
1903 PSum.combine(BOp, Out[E]);
1906 size_t GrID = NDIt.get_group_linear_id();
1907 Out[GrID * NElements + E] = PSum;
1912 if (E != NElements - 1) {
1919 if (NWorkGroups == 1)
1933 "The implementation handling parallel_for with "
1934 "reduction requires the maximal work group "
1935 "size to be greater than 1 to converge. "
1936 "The maximal work group size depends on the "
1937 "device and the size of the objects passed to "
1940 while (NWorkItems > 1) {
1944 auto Rest = [&](
auto KernelTag) {
1947 constexpr
bool IsOneWG =
1948 std::is_same_v<std::remove_reference_t<decltype(KernelTag)>,
1951 constexpr
size_t NElements = Reduction::num_elements;
1957 bool HasUniformWG = NWorkGroups * WGSize == NWorkItems;
1961 auto In = Redu.getReadAccToPreviousPartialReds(AuxHandler);
1964 if constexpr (IsOneWG)
1965 return Redu.getUserRedVarAccess(AuxHandler);
1967 return Redu.getWriteAccForPartialReds(NWorkGroups * NElements,
1971 bool UniformPow2WG = HasUniformWG && (WGSize & (WGSize - 1)) == 0;
1976 auto BOp = Redu.getBinaryOperation();
1979 decltype(KernelTag)>;
1981 range<1> GlobalRange = {UniformPow2WG ? NWorkItems
1982 : NWorkGroups * WGSize};
1985 size_t WGSize = NDIt.get_local_range().size();
1986 size_t LID = NDIt.get_local_linear_id();
1987 size_t GID = NDIt.get_global_linear_id();
1988 size_t GrID = NDIt.get_group_linear_id();
1991 size_t RemainingWorkSize =
1992 sycl::min(WGSize, NWorkItems - GrID * WGSize);
1994 auto ElementCombiner = [&](element_type &LHS,
1995 const element_type &RHS) {
1996 return LHS.combine(BOp, RHS);
1999 for (
size_t E = 0; E < NElements; ++E) {
2001 doTreeReduction<WorkSizeGuarantees::LessOrEqual>(
2002 RemainingWorkSize, NDIt, LocalReds, ElementCombiner,
2003 [&](
size_t) {
return In[GID * NElements + E]; });
2007 element_type PSum = LocalReds[0];
2008 if constexpr (IsOneWG) {
2009 if (IsUpdateOfUserVar)
2010 PSum.combine(BOp, Out[E]);
2013 Out[GrID * NElements + E] = PSum;
2018 if (E != NElements - 1) {
2023 NWorkItems = NWorkGroups;
2027 if (NWorkGroups == 1)
2038 template <
bool IsOneWG,
typename... Reductions,
size_t... Is>
2040 std::tuple<Reductions...> &ReduTuple,
2041 std::index_sequence<Is...>) {
2043 std::get<Is>(ReduTuple).
template getWriteMemForPartialReds<IsOneWG>(
2049 template <
typename OutAccT,
typename LocalAccT,
typename BOPT,
2050 typename IdentityContainerT>
2052 IdentityContainerT IdentityContainer,
2053 bool IsInitializeToIdentity) {
2054 if constexpr (!IdentityContainerT::has_identity) {
2055 return BOP(LocalAcc[0], OutAcc[0]);
2057 return BOP(LocalAcc[0], IsInitializeToIdentity
2058 ? IdentityContainer.getIdentity()
2063 template <
bool IsOneWG,
typename... Reductions,
typename... OutAccT,
2064 typename... LocalAccT,
typename... BOPsT,
typename... Ts,
2070 std::array<
bool,
sizeof...(Reductions)> IsInitializeToIdentity,
2071 std::index_sequence<Is...>) {
2072 if constexpr (IsOneWG) {
2076 std::get<Is>(OutAccs), std::get<Is>(LocalAccs), std::get<Is>(BOPs),
2077 std::get<Is>(IdentityVals), IsInitializeToIdentity[Is])),
2079 ((std::get<Is>(OutAccs)[OutAccIndex] = *std::get<Is>(LocalAccs)[0]), ...);
2083 ((std::get<Is>(OutAccs)[OutAccIndex] = std::get<Is>(LocalAccs)[0]), ...);
2099 template <
size_t... Is,
size_t... Js>
2100 constexpr std::index_sequence<Is..., Js...>
2106 template <
size_t... Is,
size_t... Js,
class... Rs>
2108 std::index_sequence<Js...>, Rs...) {
2113 template <
typename T>
struct Func {
2114 static constexpr
bool value = !std::remove_pointer_t<T>::is_usm;
2119 template <
typename T>
struct Func {
2126 std::conditional_t<Cond, std::index_sequence<I>, std::index_sequence<>>;
2134 template <
typename... T,
typename FunctorT,
size_t... Is,
2135 std::enable_if_t<(
sizeof...(Is) > 0),
int> Z = 0>
2139 Is, std::tuple<T...>>>::value,
2142 template <
typename... T,
typename FunctorT,
size_t... Is,
2143 std::enable_if_t<(
sizeof...(Is) == 0),
int> Z = 0>
2145 return std::index_sequence<>{};
2151 template <
typename... T,
typename FunctorT,
size_t... Is>
2157 template <
typename Reduction>
struct Func {
2159 (Reduction::dims == 0 && Reduction::num_elements == 1);
2164 template <
typename Reduction>
struct Func {
2166 (Reduction::dims == 1 && Reduction::num_elements >= 1);
2170 template <
typename ElementType,
typename BOPT>
2172 return [&](ElementType &LHS,
const ElementType &RHS) {
2173 return LHS.combine(BOP, RHS);
2177 template <
typename... Reductions,
typename... BOPsT,
size_t... Is>
2179 std::index_sequence<Is...>) {
2182 Is, std::tuple<Reductions...>>::reducer_element_type>(
2183 std::get<Is>(BOPsTuple))...);
2186 template <
typename... Reductions,
typename... BOPsT>
2189 BOPsTuple, std::make_index_sequence<
sizeof...(Reductions)>{});
2194 template <
bool IsOneWG,
typename... Reductions,
int Dims,
typename... LocalAccT,
2195 typename... OutAccT,
typename... ReducerT,
typename... Ts,
2196 typename... BOPsT,
size_t... Is>
2201 std::array<
bool,
sizeof...(Reductions)> InitToIdentityProps,
2202 std::index_sequence<Is...> ReduIndices) {
2206 ((std::get<Is>(LocalAccsTuple)[LID] =
2221 GrID, OutAccsTuple, LocalAccsTuple, AdjustedBOPsTuple, IdentitiesTuple,
2222 InitToIdentityProps, ReduIndices);
2227 template <
bool IsOneWG,
typename Reduction,
int Dims,
typename LocalAccT,
2228 typename OutAccT,
typename ReducerT,
typename BOPT>
2230 OutAccT Out, ReducerT &Reducer, BOPT BOp,
2231 bool IsInitializeToIdentity) {
2232 using element_type =
typename Reduction::reducer_element_type;
2237 auto ElementCombiner = [&](element_type &LHS,
const element_type &RHS) {
2238 return LHS.combine(BOp, RHS);
2243 auto NElements = Reduction::num_elements;
2244 for (
size_t E = 0; E < NElements; ++E) {
2245 doTreeReduction<WorkSizeGuarantees::Equal>(
2246 WGSize, NDIt, LocalReds, ElementCombiner,
2252 size_t OutIdx = GrID * NElements + E;
2253 if constexpr (IsOneWG) {
2256 if constexpr (Reduction::has_identity) {
2257 Out[OutIdx] = *ElementCombiner(LocalReds[0], IsInitializeToIdentity
2258 ? Reducer.identity()
2261 Out[OutIdx] = *LocalReds[0];
2265 Out[OutIdx] = LocalReds[0];
2270 if (E != NElements - 1) {
2276 template <
bool IsOneWG,
typename... Reductions,
int Dims,
typename... LocalAccT,
2277 typename... OutAccT,
typename... ReducerT,
typename... BOPsT,
2283 std::array<
bool,
sizeof...(Reductions)> InitToIdentityProps,
2284 std::index_sequence<Is...>) {
2285 using ReductionPack = std::tuple<Reductions...>;
2286 (reduCGFuncImplArrayHelper<IsOneWG, std::tuple_element_t<Is, ReductionPack>>(
2287 NDIt, std::get<Is>(LocalAccsTuple), std::get<Is>(OutAccsTuple),
2288 std::get<Is>(ReducersTuple), std::get<Is>(BOPsTuple),
2289 InitToIdentityProps[Is]),
2293 namespace reduction::main_krn {
2296 template <
typename KernelName,
typename KernelType,
int Dims,
2297 typename PropertiesT,
typename... Reductions,
size_t... Is>
2300 std::tuple<Reductions...> &ReduTuple,
2301 std::index_sequence<Is...> ReduIndices) {
2313 auto ScalarIs =
filterSequence<Reductions...>(ScalarPredicate, ReduIndices);
2316 auto ArrayIs =
filterSequence<Reductions...>(ArrayPredicate, ReduIndices);
2325 auto Rest = [&](
auto KernelTag,
auto OutAccsTuple) {
2326 auto IdentitiesTuple =
2327 makeReduTupleT(std::get<Is>(ReduTuple).getIdentityContainer()...);
2330 std::array InitToIdentityProps{
2331 std::get<Is>(ReduTuple).initializeToIdentity()...};
2335 decltype(KernelTag)>;
2339 constexpr
bool IsOneWG =
2340 std::is_same_v<std::remove_reference_t<decltype(KernelTag)>,
2345 auto ReduIndices = std::index_sequence_for<Reductions...>();
2346 auto ReducerTokensTuple =
2347 std::tuple{
typename Reductions::reducer_token_type{
2348 std::get<Is>(IdentitiesTuple), std::get<Is>(BOPsTuple)}...};
2349 auto ReducersTuple = std::tuple<
typename Reductions::reducer_type...>{
2350 std::get<Is>(ReducerTokensTuple)...};
2351 std::apply([&](
auto &...Reducers) { KernelFunc(NDIt, Reducers...); },
2359 NDIt, LocalAccsTuple, OutAccsTuple, ReducersTuple, IdentitiesTuple,
2360 BOPsTuple, InitToIdentityProps, ScalarIs);
2367 NDIt, LocalAccsTuple, OutAccsTuple, ReducersTuple, BOPsTuple,
2368 InitToIdentityProps, ArrayIs);
2373 if (NWorkGroups == 1)
2375 createReduOutAccs<true>(NWorkGroups, CGH, ReduTuple, ReduIndices));
2378 createReduOutAccs<false>(NWorkGroups, CGH, ReduTuple, ReduIndices));
2382 template <
typename... Reductions,
size_t... Is>
2384 std::tuple<Reductions...> &ReduTuple,
2385 std::index_sequence<Is...>) {
2386 auto ProcessOne = [&CGH](
auto Redu) {
2387 if constexpr (!decltype(Redu)::is_usm)
2388 Redu.getUserRedVarAccess(CGH);
2390 (ProcessOne(std::get<Is>(ReduTuple)), ...);
2395 template <
bool IsOneWG,
typename... Reductions,
int Dims,
typename... LocalAccT,
2396 typename... InAccT,
typename... OutAccT,
typename... Ts,
2397 typename... BOPsT,
size_t... Is>
2399 nd_item<Dims> NDIt,
size_t LID,
size_t GID,
size_t RemainingWorkSize,
2403 std::array<
bool,
sizeof...(Reductions)> InitToIdentityProps,
2404 std::index_sequence<Is...> ReduIndices) {
2407 if (LID < RemainingWorkSize)
2408 ((std::get<Is>(LocalAccsTuple)[LID] = std::get<Is>(InAccsTuple)[GID]), ...);
2415 AdjustedBOPsTuple, ReduIndices);
2421 GrID, OutAccsTuple, LocalAccsTuple, AdjustedBOPsTuple, IdentitiesTuple,
2422 InitToIdentityProps, ReduIndices);
2426 template <
bool IsOneWG,
typename Reduction,
int Dims,
typename LocalAccT,
2427 typename InAccT,
typename OutAccT,
typename T,
typename BOPT>
2429 size_t RemainingWorkSize, LocalAccT LocalReds,
2430 InAccT In, OutAccT Out, T IdentityContainer,
2431 BOPT BOp,
bool IsInitializeToIdentity) {
2432 using element_type =
typename Reduction::reducer_element_type;
2433 auto ElementCombiner = [&](element_type &LHS,
const element_type &RHS) {
2434 return LHS.combine(BOp, RHS);
2439 auto NElements = Reduction::num_elements;
2440 for (
size_t E = 0; E < NElements; ++E) {
2441 doTreeReduction<WorkSizeGuarantees::LessOrEqual>(
2442 RemainingWorkSize, NDIt, LocalReds, ElementCombiner,
2443 [&](
size_t) {
return In[GID * NElements + E]; });
2448 size_t OutIdx = GrID * NElements + E;
2449 if constexpr (IsOneWG) {
2452 if constexpr (Reduction::has_identity) {
2453 Out[OutIdx] = *ElementCombiner(LocalReds[0],
2454 IsInitializeToIdentity
2455 ? IdentityContainer.getIdentity()
2458 Out[OutIdx] = *LocalReds[0];
2462 Out[OutIdx] = LocalReds[0];
2467 if (E != NElements - 1) {
2473 template <
bool IsOneWG,
typename... Reductions,
int Dims,
typename... LocalAccT,
2474 typename... InAccT,
typename... OutAccT,
typename... Ts,
2475 typename... BOPsT,
size_t... Is>
2477 nd_item<Dims> NDIt,
size_t LID,
size_t GID,
size_t RemainingWorkSize,
2481 std::array<
bool,
sizeof...(Reductions)> InitToIdentityProps,
2482 std::index_sequence<Is...>) {
2483 using ReductionPack = std::tuple<Reductions...>;
2485 std::tuple_element_t<Is, ReductionPack>>(
2486 NDIt, LID, GID, RemainingWorkSize, std::get<Is>(LocalAccsTuple),
2487 std::get<Is>(InAccsTuple), std::get<Is>(OutAccsTuple),
2488 std::get<Is>(IdentitiesTuple), std::get<Is>(BOPsTuple),
2489 InitToIdentityProps[Is]),
2493 namespace reduction::aux_krn {
2494 template <
class KernelName,
class Predicate>
struct Multi;
2496 template <
typename KernelName,
typename KernelType,
typename... Reductions,
2499 std::tuple<Reductions...> &ReduTuple,
2500 std::index_sequence<Is...> ReduIndices) {
2504 bool Pow2WG = (WGSize & (WGSize - 1)) == 0;
2505 bool HasUniformWG = Pow2WG && (NWorkGroups * WGSize == NWorkItems);
2509 auto ScalarIs =
filterSequence<Reductions...>(ScalarPredicate, ReduIndices);
2512 auto ArrayIs =
filterSequence<Reductions...>(ArrayPredicate, ReduIndices);
2518 std::get<Is>(ReduTuple).getReadAccToPreviousPartialReds(CGH)...);
2520 auto IdentitiesTuple =
2521 makeReduTupleT(std::get<Is>(ReduTuple).getIdentityContainer()...);
2524 std::array InitToIdentityProps{
2525 std::get<Is>(ReduTuple).initializeToIdentity()...};
2530 auto Rest = [&](
auto Predicate,
auto OutAccsTuple) {
2531 auto AccReduIndices =
filterSequence<Reductions...>(Predicate, ReduIndices);
2535 decltype(Predicate)>;
2537 range<1> GlobalRange = {HasUniformWG ? NWorkItems : NWorkGroups * WGSize};
2541 constexpr
bool IsOneWG =
2542 std::is_same_v<std::remove_reference_t<decltype(Predicate)>,
2546 size_t RemainingWorkSize =
2553 NDIt, LID, GID, RemainingWorkSize, LocalAccsTuple, InAccsTuple,
2554 OutAccsTuple, IdentitiesTuple, BOPsTuple, InitToIdentityProps,
2557 NDIt, LID, GID, RemainingWorkSize, LocalAccsTuple, InAccsTuple,
2558 OutAccsTuple, IdentitiesTuple, BOPsTuple, InitToIdentityProps,
2562 if (NWorkGroups == 1)
2564 createReduOutAccs<true>(NWorkGroups, CGH, ReduTuple, ReduIndices));
2567 createReduOutAccs<false>(NWorkGroups, CGH, ReduTuple, ReduIndices));
2573 return sizeof(
typename Reduction::result_type);
2576 template <
typename Reduction,
typename... RestT>
2578 return sizeof(
typename Reduction::result_type) +
2582 template <
typename... ReductionT,
size_t... Is>
2584 std::index_sequence<Is...>) {
2590 template <
typename TupleT, std::size_t... Is>
2591 std::tuple<std::tuple_element_t<Is, TupleT>...>
2593 return {std::get<Is>(std::move(Tuple))...};
2597 template <
typename KernelName,
int Dims,
typename PropertiesT,
2599 static void run(
handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
2602 std::tuple<RestT...> ArgsTuple(Rest...);
2603 constexpr
size_t NumArgs =
sizeof...(RestT);
2604 auto KernelFunc =
std::get<NumArgs - 1>(ArgsTuple);
2605 auto ReduIndices = std::make_index_sequence<NumArgs - 1>();
2615 "The implementation handling parallel_for with"
2616 " reduction requires work group size not bigger"
2618 std::to_string(MaxWGSize));
2620 reduCGFuncMulti<KernelName>(CGH, KernelFunc, NDRange, Properties, ReduTuple,
2625 while (NWorkItems > 1) {
2627 NWorkItems = reduAuxCGFunc<KernelName, decltype(KernelFunc)>(
2628 AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
2637 template <reduction::strategy Strategy>
2641 template <
typename KernelName,
int Dims,
typename PropertiesT,
2642 typename KernelType,
typename Reduction>
2643 static void run(
handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
2645 Reduction &Redu, KernelType &KernelFunc) {
2646 auto Delegate = [&](
auto Impl) {
2647 Impl.template run<KernelName>(CGH, Queue, NDRange, Properties, Redu,
2651 if constexpr (Reduction::has_float64_atomics) {
2655 if constexpr (Reduction::has_fast_reduce)
2659 }
else if constexpr (Reduction::has_fast_atomics) {
2660 if constexpr (
sizeof(
typename Reduction::result_type) == 8) {
2667 if constexpr (Reduction::has_fast_reduce)
2674 if constexpr (Reduction::has_fast_reduce) {
2680 if constexpr (Reduction::has_fast_reduce)
2686 assert(
false &&
"Must be unreachable!");
2688 template <
typename KernelName,
int Dims,
typename PropertiesT,
2690 static void run(
handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
2699 typename PropertiesT,
typename... RestT>
2701 PropertiesT Properties, RestT... Rest) {
2703 Properties, Rest...);
2706 __SYCL_EXPORT uint32_t
2710 typename PropertiesT,
typename... RestT>
2712 PropertiesT Properties, RestT... Rest) {
2713 std::tuple<RestT...> ArgsTuple(Rest...);
2714 constexpr
size_t NumArgs =
sizeof...(RestT);
2715 static_assert(NumArgs > 1,
"No reduction!");
2716 auto KernelFunc =
std::get<NumArgs - 1>(ArgsTuple);
2717 auto ReduIndices = std::make_index_sequence<NumArgs - 1>();
2722 size_t OneElemSize = [&]() {
2724 if constexpr (
sizeof...(RestT) == 2) {
2726 constexpr
bool IsTreeReduction =
2727 !Reduction::has_fast_reduce && !Reduction::has_fast_atomics;
2728 return IsTreeReduction ?
sizeof(
typename Reduction::result_type) : 0;
2734 uint32_t NumConcurrentWorkGroups =
2735 #ifdef __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS
2736 __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS;
2746 size_t NWorkItems = Range.
size();
2747 size_t WGSize = std::min(NWorkItems, PrefWGSize);
2748 size_t NWorkGroups = NWorkItems / WGSize;
2749 if (NWorkItems % WGSize)
2751 size_t MaxNWorkGroups = NumConcurrentWorkGroups;
2752 NWorkGroups = std::min(NWorkGroups, MaxNWorkGroups);
2753 size_t NDRItems = NWorkGroups * WGSize;
2756 size_t PerGroup = Range.
size() / NWorkGroups;
2763 auto UpdatedKernelFunc = [=](
auto NDId,
auto &...Reducers) {
2767 auto Group = NDId.get_group();
2768 size_t GroupId = Group.get_group_linear_id();
2769 size_t NumGroups = Group.get_group_linear_range();
2770 bool LastGroup = (GroupId == NumGroups - 1);
2771 size_t GroupStart = GroupId * PerGroup;
2772 size_t GroupEnd = LastGroup ? Range.
size() : (GroupStart + PerGroup);
2775 size_t Start = GroupStart + NDId.get_local_id(0);
2776 size_t End = GroupEnd;
2777 size_t Stride = NDId.get_local_range(0);
2778 auto GetDelinearized = [&](
size_t I) {
2780 if constexpr (std::is_invocable_v<decltype(KernelFunc),
id<Dims>,
2781 decltype(Reducers)...>)
2788 for (
size_t I = Start; I < End; I += Stride)
2789 KernelFunc(GetDelinearized(I), Reducers...);
2791 if constexpr (NumArgs == 2) {
2793 auto &Redu = std::get<0>(ReduTuple);
2795 constexpr
auto StrategyToUse = [&]() {
2810 if constexpr (Reduction::has_fast_reduce && Reduction::has_identity)
2812 else if constexpr (Reduction::has_fast_atomics &&
2813 sizeof(
typename Reduction::result_type) != 8)
2819 reduction_parallel_for<KernelName, StrategyToUse>(CGH, NDRange, Properties,
2820 Redu, UpdatedKernelFunc);
2823 [&](
auto &...Reds) {
2824 return reduction_parallel_for<KernelName, Strategy>(
2825 CGH, NDRange, Properties, Reds..., UpdatedKernelFunc);
2836 template <
typename T,
typename AllocatorT,
typename BinaryOperation>
2838 BinaryOperation Combiner,
const property_list &PropList = {}) {
2841 PropList.has_property<property::reduction::initialize_to_identity>();
2842 return detail::make_reduction<BinaryOperation, 0, 1, false>(
2851 template <
typename T,
typename BinaryOperation>
2855 PropList.has_property<property::reduction::initialize_to_identity>();
2856 return detail::make_reduction<BinaryOperation, 0, 1, false>(
2863 template <
typename T,
typename AllocatorT,
typename BinaryOperation>
2865 BinaryOperation Combiner,
const property_list &PropList = {}) {
2868 PropList.has_property<property::reduction::initialize_to_identity>();
2869 return detail::make_reduction<BinaryOperation, 0, 1, true>(
2876 template <
typename T,
typename BinaryOperation>
2877 auto reduction(T *Var,
const T &Identity, BinaryOperation Combiner,
2880 PropList.has_property<property::reduction::initialize_to_identity>();
2881 return detail::make_reduction<BinaryOperation, 0, 1, true>(
2890 template <
typename T,
size_t Extent,
typename BinaryOperation,
2891 typename = std::enable_if_t<Extent != dynamic_extent>>
2895 PropList.has_property<property::reduction::initialize_to_identity>();
2896 return detail::make_reduction<BinaryOperation, 1, Extent, false>(
2903 template <
typename T,
size_t Extent,
typename BinaryOperation,
2904 typename = std::enable_if_t<Extent != dynamic_extent>>
2906 BinaryOperation Combiner,
const property_list &PropList = {}) {
2908 PropList.has_property<property::reduction::initialize_to_identity>();
2909 return detail::make_reduction<BinaryOperation, 1, Extent, true>(
The file contains implementations of accessor class.
Defines a shared array that can be used by kernels in queues.
Helper class for accessing internal reducer member functions.
constexpr auto getIdentity()
auto & getElement(size_t E)
static constexpr auto getIdentityStatic()
ReducerAccess(ReducerT &ReducerRef)
constexpr T & operator*() noexcept
ReducerElement & combine(BinaryOperation BinOp, const T &OtherValue)
ReducerElement & combine(BinaryOperation BinOp, const ReducerElement &Other)
ReducerElement(const ReductionIdentityContainer< T, BinaryOperation, ExplicitIdentity > &IdentityContainer)
constexpr const T & operator*() const noexcept
static constexpr T getIdentity()
Returns the statically known identity value.
ReductionIdentityContainer(const T &)
ReductionIdentityContainer()
T getIdentity() const
Returns the identity value given by user.
ReductionIdentityContainer(const T &Identity)
Templated class for common functionality of all reduction implementation classes.
This class is the default KernelName template parameter type for kernel invocation APIs such as singl...
Use CRTP to avoid redefining shorthand operators in terms of combine.
std::enable_if_t< BasicCheck< _T, Space, _BinaryOperation > &&(IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value||IsReduOptForAtomic64Op< _T, _BinaryOperation >::value) &&IsMaximum< _T, _BinaryOperation >::value > atomic_combine(_T *ReduVarPtr) const
Atomic MAX operation: *ReduVarPtr = sycl::maximum(*ReduVarPtr, MValue);.
std::enable_if_t< BasicCheck< _T, Space, _BinaryOperation > &&(IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value||IsReduOptForAtomic64Op< _T, _BinaryOperation >::value) &&IsPlus< _T, _BinaryOperation >::value > atomic_combine(_T *ReduVarPtr) const
Atomic ADD operation: *ReduVarPtr += MValue;.
std::enable_if_t< BasicCheck< _T, Space, _BinaryOperation > &&IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value &&IsBitOR< _T, _BinaryOperation >::value > atomic_combine(_T *ReduVarPtr) const
Atomic BITWISE OR operation: *ReduVarPtr |= MValue;.
std::enable_if_t< std::is_same_v< remove_decoration_t< _T >, _T > &&IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value &&IsBitAND< _T, _BinaryOperation >::value &&(Space==access::address_space::global_space||Space==access::address_space::local_space)> atomic_combine(_T *ReduVarPtr) const
Atomic BITWISE AND operation: *ReduVarPtr &= MValue;.
std::enable_if_t< BasicCheck< _T, Space, _BinaryOperation > &&(IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value||IsReduOptForAtomic64Op< _T, _BinaryOperation >::value) &&IsMinimum< _T, _BinaryOperation >::value > atomic_combine(_T *ReduVarPtr) const
Atomic MIN operation: *ReduVarPtr = sycl::minimum(*ReduVarPtr, MValue);.
std::enable_if_t< BasicCheck< _T, Space, _BinaryOperation > &&IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value &&IsBitXOR< _T, _BinaryOperation >::value > atomic_combine(_T *ReduVarPtr) const
Atomic BITWISE XOR operation: *ReduVarPtr ^= MValue;.
BinaryOperation binary_operation
static constexpr int dimensions
const identity_container_type & getIdentityContainer()
auto & getTempBuffer(size_t Size, handler &CGH)
BinaryOperation getBinaryOperation() const
Returns the binary operation associated with the reduction.
static constexpr bool has_fast_reduce
static constexpr size_t dims
BinaryOperation binary_operation
typename ReducerTraits< reducer_type >::element_type reducer_element_type
std::enable_if_t<!HasIdentity > withInitializedMem(handler &CGH, FuncTy Func)
static constexpr bool has_fast_atomics
auto getWriteMemForPartialReds(size_t Size, handler &CGH)
static constexpr bool has_float64_atomics
static constexpr size_t num_elements
std::enable_if_t< HasIdentity > withInitializedMem(handler &CGH, FuncTy Func)
Provide Func with a properly initialized memory to write the reduction result to.
static constexpr bool is_known_identity
bool initializeToIdentity() const
auto getUserRedVarAccess(handler &CGH)
reduction_impl_algo(BinaryOperation BinaryOp, bool Init, RedOutVar RedOut, std::enable_if_t<!IsKnownIdentityOp< RelayT, RelayBinaryOperation >::value, int >=0)
static constexpr bool is_usm
reduction_impl_algo(const T &Identity, BinaryOperation BinaryOp, bool Init, RedOutVar RedOut)
reduction_impl_algo(BinaryOperation BinaryOp, bool Init, RedOutVar RedOut, std::enable_if_t< IsKnownIdentityOp< RelayT, RelayBinaryOperation >::value, int >=0)
auto getWriteAccForPartialReds(size_t Size, handler &CGH)
Returns an accessor accessing the memory that will hold the reduction partial sums.
auto getReadAccToPreviousPartialReds(handler &CGH) const
static constexpr bool has_identity
accessor< int, 1, access::mode::read_write, access::target::device, access::placeholder::false_t > getReadWriteAccessorToInitializedGroupsCounter(handler &CGH)
ReductionIdentityContainer< T, BinaryOperation, ExplicitIdentity > identity_container_type
auto getGroupsCounterAccDiscrete(handler &CGH)
Base non-template class which is a base class for all reduction implementation classes.
This class encapsulates the reduction variable/accessor, the reduction operator and an optional opera...
reduction_impl(RedOutVar Var, BinaryOperation BOp, bool InitializeToIdentity=false)
Constructs reduction_impl when no identity is specified.
static constexpr bool is_known_identity
static constexpr bool is_usm
reduction_impl(RedOutVar &Var, const T &Identity, BinaryOperation BOp, bool InitializeToIdentity)
Constructs reduction_impl with an explicit identity value.
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
detail::is_device_info_desc< Param >::return_type get_info() const
Queries this SYCL device for information requested by the template parameter param.
bool has(aspect Aspect) const __SYCL_WARN_IMAGE_ASPECT(Aspect)
Indicates if the SYCL device has the given feature.
Command group handler class.
void depends_on(event Event)
Registers event dependencies on this command group.
void single_task(_KERNELFUNCPARAM(KernelFunc))
Defines and invokes a SYCL kernel function as a function object type.
void parallel_for(range< 1 > NumWorkItems, _KERNELFUNCPARAM(KernelFunc))
void copy(accessor< T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder > Src, std::shared_ptr< T_Dst > Dst)
Copies the content of memory object accessed by Src into the memory pointed by Dst.
Provides a cross-platform math array class template that works on SYCL devices as well as in host C++...
Identifies an instance of the function object executing at each point in an nd_range.
size_t get_local_linear_id() const
size_t __SYCL_ALWAYS_INLINE get_group_linear_id() const
range< Dimensions > get_local_range() const
size_t __SYCL_ALWAYS_INLINE get_global_linear_id() const
void barrier(access::fence_space accessSpace=access::fence_space::global_and_local) const
Defines the iteration domain of both the work-groups and the overall dispatch.
range< Dimensions > get_local_range() const
range< Dimensions > get_group_range() const
Objects of the property_list class are containers for the SYCL properties.
Encapsulates a single SYCL queue which schedules kernels on a SYCL device.
event memset(void *Ptr, int Value, size_t Count, const detail::code_location &CodeLoc=detail::code_location::current())
Fills the memory pointed by a USM pointer with the value specified.
device get_device() const
reducer(const reducer &)=delete
reducer & operator=(const reducer &)=delete
reducer(const IdentityContainerT &IdentityContainer, BinaryOperation BOp)
std::enable_if_t< HasIdentityRelay &&(HasIdentityRelay==has_identity), T > identity() const
reducer(reducer &&)=delete
reducer & operator=(reducer &&)=delete
reducer(const detail::ReducerToken< BinaryOperation, IdentityContainerT > &Token)
reducer & combine(const T &Partial)
reducer(reducer &&)=delete
reducer(const IdentityContainerT &, BinaryOperation)
reducer(const detail::ReducerToken< BinaryOperation, IdentityContainerT > &Token)
reducer< T, BinaryOperation, Dims - 1, Extent, IdentityContainerT, true > operator[](size_t Index)
reducer & operator=(reducer &&)=delete
reducer(const reducer &)=delete
reducer & operator=(const reducer &)=delete
reducer(const reducer &)=delete
reducer & operator=(reducer &&)=delete
reducer(const detail::ReducerToken< BinaryOperation, IdentityContainerT > &Token)
reducer(const IdentityContainerT &IdentityContainer, BinaryOperation BOp)
reducer & operator=(const reducer &)=delete
reducer< T, BinaryOperation, Dims - 1, Extent, IdentityContainerT, true > operator[](size_t Index)
reducer(reducer &&)=delete
std::enable_if_t< HasIdentityRelay &&(HasIdentityRelay==has_identity), T > identity() const
reducer & combine(const T &Partial)
reducer(const reducer &)=delete
reducer & operator=(reducer &&)=delete
reducer & operator=(const reducer &)=delete
reducer(reducer &&)=delete
reducer(const IdentityContainerT &, BinaryOperation)
reducer(const detail::ReducerToken< BinaryOperation, IdentityContainerT > &Token)
reducer(element_type &Ref, BinaryOperation BOp)
reducer(const reducer &)=delete
reducer & operator=(const reducer &)=delete
reducer & operator=(reducer &&)=delete
reducer(const detail::ReducerToken< BinaryOperation, IdentityContainerT > &Token)
reducer & combine(const T &Partial)
reducer(reducer &&)=delete
Class that is used to represent objects that are passed to user's lambda functions and representing u...
constexpr _SYCL_SPAN_INLINE_VISIBILITY pointer data() const noexcept
constexpr tuple_element< I, tuple< Types... > >::type & get(sycl::detail::tuple< Types... > &Arg) noexcept
item< Dims, false > getDelinearizedItem(range< Dims > Range, id< Dims > Id)
void withAuxHandler(handler &CGH, FunctorTy Func)
@ group_reduce_and_last_wg_detection
@ group_reduce_and_atomic_cross_wg
@ local_atomic_and_atomic_cross_wg
@ local_mem_tree_and_atomic_cross_wg
@ group_reduce_and_multiple_kernels
void finalizeHandler(handler &CGH)
void writeReduSumsToOutAccs(size_t OutAccIndex, ReduTupleT< OutAccT... > OutAccs, ReduTupleT< LocalAccT... > LocalAccs, ReduTupleT< BOPsT... > BOPs, ReduTupleT< Ts... > IdentityVals, std::array< bool, sizeof...(Reductions)> IsInitializeToIdentity, std::index_sequence< Is... >)
size_t reduGetMemPerWorkItem(std::tuple< ReductionT... > &ReduTuple, std::index_sequence< Is... >)
device getDeviceFromHandler(handler &CommandGroupHandlerRef)
void doTreeReductionHelper(size_t WorkSize, size_t LID, FuncTy Func)
void reduCGFuncImplScalar(nd_item< Dims > NDIt, ReduTupleT< LocalAccT... > LocalAccsTuple, ReduTupleT< OutAccT... > OutAccsTuple, std::tuple< ReducerT... > &ReducersTuple, ReduTupleT< Ts... > IdentitiesTuple, ReduTupleT< BOPsT... > BOPsTuple, std::array< bool, sizeof...(Reductions)> InitToIdentityProps, std::index_sequence< Is... > ReduIndices)
All scalar reductions are processed together; there is one loop of log2(N) steps, and each reduction ...
uint32_t reduGetMaxNumConcurrentWorkGroups(std::shared_ptr< queue_impl > Queue)
size_t reduGetMaxWGSize(std::shared_ptr< queue_impl > Queue, size_t LocalMemBytesPerWorkItem)
void reduAuxCGFuncImplArray(nd_item< Dims > NDIt, size_t LID, size_t GID, size_t RemainingWorkSize, ReduTupleT< LocalAccT... > LocalAccsTuple, ReduTupleT< InAccT... > InAccsTuple, ReduTupleT< OutAccT... > OutAccsTuple, ReduTupleT< Ts... > IdentitiesTuple, ReduTupleT< BOPsT... > BOPsTuple, std::array< bool, sizeof...(Reductions)> InitToIdentityProps, std::index_sequence< Is... >)
std::bool_constant< std::is_same_v< BinaryOperation, sycl::maximum< T > >||std::is_same_v< BinaryOperation, sycl::maximum< void > >> IsMaximum
void reduCGFuncImplArrayHelper(nd_item< Dims > NDIt, LocalAccT LocalReds, OutAccT Out, ReducerT &Reducer, BOPT BOp, bool IsInitializeToIdentity)
Each array reduction is processed separately.
std::bool_constant< std::is_same_v< BinaryOperation, sycl::bit_or< T > >||std::is_same_v< BinaryOperation, sycl::bit_or< void > >> IsBitOR
std::bool_constant< IsZeroIdentityOp< T, BinaryOperation >::value||IsOneIdentityOp< T, BinaryOperation >::value||IsOnesIdentityOp< T, BinaryOperation >::value||IsMinimumIdentityOp< T, BinaryOperation >::value||IsMaximumIdentityOp< T, BinaryOperation >::value||IsFalseIdentityOp< T, BinaryOperation >::value||IsTrueIdentityOp< T, BinaryOperation >::value > IsKnownIdentityOp
id< 1 > getDelinearizedId(const range< 1 > &, size_t Index)
void reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu)
Copies the final reduction result kept in read-write accessor to user's USM memory.
void reduAuxCGFuncImplArrayHelper(nd_item< Dims > NDIt, size_t LID, size_t GID, size_t RemainingWorkSize, LocalAccT LocalReds, InAccT In, OutAccT Out, T IdentityContainer, BOPT BOp, bool IsInitializeToIdentity)
void doTreeReductionOnTuple(size_t WorkSize, size_t LID, ReduTupleT< LocalAccT... > &LocalAccs, ReduTupleT< BOPsT... > &BOPs, std::index_sequence< Is... >)
std::bool_constant<((is_sgenfloat_v< T > &&sizeof(T)==4)||is_sgeninteger_v< T >)&&IsValidAtomicType< T >::value &&(IsPlus< T, BinaryOperation >::value||IsMinimum< T, BinaryOperation >::value||IsMaximum< T, BinaryOperation >::value||IsBitOR< T, BinaryOperation >::value||IsBitXOR< T, BinaryOperation >::value||IsBitAND< T, BinaryOperation >::value)> IsReduOptForFastAtomicFetch
void associateReduAccsWithHandler(handler &CGH, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
size_t GreatestPowerOfTwo(size_t N)
Computes the greatest power-of-two less than or equal to N.
void doTreeReduction(size_t WorkSize, nd_item< Dim > NDIt, LocalRedsTy &LocalReds, BinOpTy &BOp, AccessFuncTy AccessFunc)
std::conditional_t< std::is_same_v< KernelName, auto_name >, auto_name, reduction::InitMemKrn< KernelName > > __sycl_init_mem_for
A helper to pass undefined (sycl::detail::auto_name) names unmodified.
std::bool_constant< std::is_same_v< BinaryOperation, sycl::multiplies< T > >||std::is_same_v< BinaryOperation, sycl::multiplies< void > >> IsMultiplies
size_t reduGetMemPerWorkItemHelper(Reduction &)
std::bool_constant<((is_sgeninteger_v< T > &&(sizeof(T)==4||sizeof(T)==8))||is_sgenfloat_v< T >)&&(IsPlus< T, BinaryOperation >::value||IsMinimum< T, BinaryOperation >::value||IsMaximum< T, BinaryOperation >::value)> IsReduOptForFastReduce
auto getReducerAccess(ReducerT &Reducer)
std::bool_constant<(IsPlus< T, BinaryOperation >::value||IsMinimum< T, BinaryOperation >::value||IsMaximum< T, BinaryOperation >::value) &&is_sgenfloat_v< T > &&sizeof(T)==8 > IsReduOptForAtomic64Op
constexpr auto filterSequenceHelper(FunctorT, std::index_sequence< Is... >)
For each index 'I' from the given indices pack 'Is' this function initially creates a number of short...
void reduAuxCGFuncImplScalar(nd_item< Dims > NDIt, size_t LID, size_t GID, size_t RemainingWorkSize, ReduTupleT< LocalAccT... > LocalAccsTuple, ReduTupleT< InAccT... > InAccsTuple, ReduTupleT< OutAccT... > OutAccsTuple, ReduTupleT< Ts... > IdentitiesTuple, ReduTupleT< BOPsT... > BOPsTuple, std::array< bool, sizeof...(Reductions)> InitToIdentityProps, std::index_sequence< Is... > ReduIndices)
All scalar reductions are processed together; there is one loop of log2(N) steps, and each reduction ...
std::tuple< std::tuple_element_t< Is, TupleT >... > tuple_select_elements(TupleT Tuple, std::index_sequence< Is... >)
Utility function: for the given tuple.
constexpr tuple< Ts... > make_tuple(Ts... Args)
size_t reduGetPreferredWGSize(std::shared_ptr< queue_impl > &Queue, size_t LocalMemBytesPerWorkItem)
std::bool_constant< std::is_same_v< BinaryOperation, sycl::plus< T > >||std::is_same_v< BinaryOperation, sycl::plus< void > >> IsPlus
size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... > ReduIndices)
constexpr auto filterSequence(FunctorT F, std::index_sequence< Is... > Indices)
For each index 'I' from the given indices pack 'Is' this function returns an index sequence consistin...
void reduCGFuncImplArray(nd_item< Dims > NDIt, ReduTupleT< LocalAccT... > LocalAccsTuple, ReduTupleT< OutAccT... > OutAccsTuple, std::tuple< ReducerT... > &ReducersTuple, ReduTupleT< BOPsT... > BOPsTuple, std::array< bool, sizeof...(Reductions)> InitToIdentityProps, std::index_sequence< Is... >)
ReduTupleT< Ts... > makeReduTupleT(Ts... Elements)
std::conditional_t< std::is_same_v< KernelName, auto_name >, auto_name, MainOrAux< KernelName, Strategy, Ts... > > __sycl_reduction_kernel
A helper to pass undefined (sycl::detail::auto_name) names unmodified.
std::bool_constant< std::is_same_v< BinaryOperation, sycl::minimum< T > >||std::is_same_v< BinaryOperation, sycl::minimum< void > >> IsMinimum
void reduction_parallel_for(handler &CGH, range< Dims > NDRange, PropertiesT Properties, RestT... Rest)
size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize, size_t &NWorkGroups)
constexpr std::index_sequence concat_sequences(std::index_sequence<>)
std::bool_constant< std::is_same_v< BinaryOperation, sycl::bit_and< T > >||std::is_same_v< BinaryOperation, sycl::bit_and< void > >> IsBitAND
auto createReduOutAccs(size_t NWorkGroups, handler &CGH, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
For the given 'Reductions' types pack and indices enumerating them this function either creates new t...
auto getLastCombine(OutAccT OutAcc, LocalAccT LocalAcc, BOPT BOP, IdentityContainerT IdentityContainer, bool IsInitializeToIdentity)
std::bool_constant< std::is_same_v< BinaryOperation, sycl::bit_xor< T > >||std::is_same_v< BinaryOperation, sycl::bit_xor< void > >> IsBitXOR
typename tuple_element< I, T >::type tuple_element_t
constexpr auto makeAdjustedBOPs(ReduTupleT< BOPsT... > &BOPsTuple, std::index_sequence< Is... >)
void reduCGFuncMulti(handler &CGH, KernelType KernelFunc, const nd_range< Dims > &Range, PropertiesT Properties, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... > ReduIndices)
constexpr auto makeAdjustedBOP(BOPT &BOP)
auto make_reduction(RedOutVar RedVar, RestTy &&...Rest)
sycl::detail::tuple< Ts... > ReduTupleT
constexpr mode_tag_t< access_mode::read > read_only
constexpr property::no_init no_init
constexpr mode_tag_t< access_mode::read_write > read_write
std::enable_if_t<(is_group_v< std::decay_t< Group >> &&(detail::is_scalar_arithmetic< T >::value||(detail::is_complex< T >::value &&detail::is_multiplies< T, BinaryOperation >::value)) &&detail::is_native_op< T, BinaryOperation >::value), T > reduce_over_group(Group g, T x, BinaryOperation binary_op)
auto reduction(buffer< T, 1, AllocatorT > Var, handler &CGH, BinaryOperation Combiner, const property_list &PropList={})
Constructs a reduction object using the given buffer Var, handler CGH, reduction operation Combiner,...
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using e and sycl_category()
void free(void *ptr, const context &ctxt, const detail::code_location &CodeLoc=detail::code_location::current())
_Abi const simd< _Tp, _Abi > & noexcept
static constexpr bool value
std::conditional_t< Cond, std::index_sequence< I >, std::index_sequence<> > type
static constexpr bool value
static constexpr bool value
static constexpr bool value
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, RestT... Rest)
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, RestT... Rest)
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
const IdentityContainerT & IdentityContainer
const BinaryOperation BOp
Helper class for accessing reducer-defined types in CRTP May prove to be useful for other things late...
static constexpr int value
is_device_copyable is a user specializable class template to indicate that a type T is device copyabl...