DPC++ Runtime
Runtime libraries for oneAPI DPC++
reduction.hpp
//==---------------- reduction.hpp - SYCL reduction ------------*- C++ -*---==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#pragma once

#include <sycl/accessor.hpp>
#include <sycl/atomic.hpp>
#include <sycl/atomic_ref.hpp>
#include <sycl/detail/tuple.hpp>
#include <sycl/group_algorithm.hpp>
#include <sycl/handler.hpp>
#include <sycl/kernel.hpp>
#include <sycl/known_identity.hpp>
#include <sycl/usm.hpp>

#include <tuple>

namespace sycl {
namespace detail {

// Base non-template class which is a base class for all reduction
// implementation classes. It is needed to detect the reduction classes.
class reduction_impl_base {};

template <typename T> struct IsReduction {
  static constexpr bool value =
      std::is_base_of<reduction_impl_base, std::remove_reference_t<T>>::value;
};

template <typename FirstT, typename... RestT> struct AreAllButLastReductions {
  static constexpr bool value =
      IsReduction<FirstT>::value && AreAllButLastReductions<RestT...>::value;
};

// Specialization for a single element: the last template parameter must not
// be a reduction.
template <typename T> struct AreAllButLastReductions<T> {
  static constexpr bool value = !IsReduction<T>::value;
};
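
// Illustrative sketch (not part of this header): handler::parallel_for uses
// AreAllButLastReductions with std::enable_if to accept argument packs of the
// form "reductions..., kernel functor". A hypothetical overload could be
// constrained like this:
//
//   template <typename... RestT>
//   std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value>
//   parallel_for(range<1> Range, RestT &&...Rest);
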
} // namespace detail

template <typename T, class BinaryOperation, int Dims, size_t Extent,
          bool View = false, typename Subst = void>
class reducer;

namespace detail {
// This type trait is used to detect if the atomic operation BinaryOperation
// used with operands of the type T is available for use in reduction.
// The order in which the atomic operations are performed may be arbitrary and
// thus may cause different results from run to run, even on the same elements
// and on the same device. The macro SYCL_REDUCTION_DETERMINISTIC prohibits
// using atomic operations for reduction and helps to produce stable results.
// SYCL_REDUCTION_DETERMINISTIC is a short-term solution, which may eventually
// be deprecated and replaced by a SYCL property passed to the reduction.
template <typename T, class BinaryOperation>
using IsReduOptForFastAtomicFetch =
#ifdef SYCL_REDUCTION_DETERMINISTIC
    bool_constant<false>;
#else
    bool_constant<((is_sgenfloat<T>::value && sizeof(T) == 4) ||
                   is_sgeninteger<T>::value) &&
                  IsValidAtomicType<T>::value &&
                  (IsPlus<T, BinaryOperation>::value ||
                   IsMinimum<T, BinaryOperation>::value ||
                   IsMaximum<T, BinaryOperation>::value ||
                   IsBitOR<T, BinaryOperation>::value ||
                   IsBitXOR<T, BinaryOperation>::value ||
                   IsBitAND<T, BinaryOperation>::value)>;
#endif

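// Illustrative sketch: defining the macro before including the SYCL headers
// disables these fast paths, trading performance for run-to-run reproducible
// results (mainly relevant for floating-point reductions):
//
//   #define SYCL_REDUCTION_DETERMINISTIC
//   #include <sycl/sycl.hpp>
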
// This type trait is used to detect if the atomic operation BinaryOperation
// used with operands of the type T is available for use in reduction, in
// addition to the cases covered by "IsReduOptForFastAtomicFetch", if the
// device has the atomic64 aspect. This type trait should only be used if the
// device has the atomic64 aspect. Note that this type trait is currently a
// subset of IsReduOptForFastReduce. The macro SYCL_REDUCTION_DETERMINISTIC
// prohibits using the reduce_over_group() algorithm, to produce stable
// results across devices of the same type.
template <typename T, class BinaryOperation>
using IsReduOptForAtomic64Op =
#ifdef SYCL_REDUCTION_DETERMINISTIC
    bool_constant<false>;
#else
    bool_constant<(IsPlus<T, BinaryOperation>::value ||
                   IsMinimum<T, BinaryOperation>::value ||
                   IsMaximum<T, BinaryOperation>::value) &&
                  is_sgenfloat<T>::value && sizeof(T) == 8>;
#endif

// This type trait is used to detect if the group algorithm reduce() used with
// operands of the type T and the operation BinaryOperation is available
// for use in reduction.
// The macro SYCL_REDUCTION_DETERMINISTIC prohibits using the reduce()
// algorithm, to produce stable results across devices of the same type.
template <typename T, class BinaryOperation>
using IsReduOptForFastReduce =
#ifdef SYCL_REDUCTION_DETERMINISTIC
    bool_constant<false>;
#else
    bool_constant<((is_sgeninteger<T>::value &&
                    (sizeof(T) == 4 || sizeof(T) == 8)) ||
                   is_sgenfloat<T>::value) &&
                  (IsPlus<T, BinaryOperation>::value ||
                   IsMinimum<T, BinaryOperation>::value ||
                   IsMaximum<T, BinaryOperation>::value)>;
#endif

// std::tuple seems to be a) too heavy and b) not copyable to device now.
// Thus sycl::detail::tuple is used instead.
// Switching from sycl::detail::tuple to std::tuple can be done by re-defining
// the ReduTupleT type and makeReduTupleT() function below.
template <typename... Ts> using ReduTupleT = sycl::detail::tuple<Ts...>;
template <typename... Ts> ReduTupleT<Ts...> makeReduTupleT(Ts... Elements) {
  return sycl::detail::make_tuple(Elements...);
}
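
// Illustrative sketch: makeReduTupleT deduces the element types, so partial
// sums of heterogeneous reductions can travel in one device-copyable tuple.
// As elsewhere in this file, std::get is assumed to work on ReduTupleT:
//
//   auto Tup = makeReduTupleT(0, 0.0f); // ReduTupleT<int, float>
//   std::get<0>(Tup) = 42;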

__SYCL_EXPORT size_t reduGetMaxWGSize(std::shared_ptr<queue_impl> Queue,
                                      size_t LocalMemBytesPerWorkItem);
__SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
                                       size_t &NWorkGroups);
__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
                                            size_t LocalMemBytesPerWorkItem);

template <typename Reducer> struct ReducerTraits;

template <typename T, class BinaryOperation, int Dims, std::size_t Extent,
          bool View, typename Subst>
struct ReducerTraits<reducer<T, BinaryOperation, Dims, Extent, View, Subst>> {
  using type = T;
  using op = BinaryOperation;
  static constexpr int dims = Dims;
  static constexpr size_t extent = Extent;
};

template <class Reducer> class combiner {
  using Ty = typename ReducerTraits<Reducer>::type;
  using BinaryOp = typename ReducerTraits<Reducer>::op;
  static constexpr int Dims = ReducerTraits<Reducer>::dims;
  static constexpr size_t Extent = ReducerTraits<Reducer>::extent;

public:
  template <typename _T = Ty, int _Dims = Dims>
  enable_if_t<(_Dims == 0) && IsPlus<_T, BinaryOp>::value &&
                  is_geninteger<_T>::value,
              Reducer &>
  operator++() {
    return static_cast<Reducer *>(this)->combine(static_cast<_T>(1));
  }

  template <typename _T = Ty, int _Dims = Dims>
  enable_if_t<(_Dims == 0) && IsPlus<_T, BinaryOp>::value &&
                  is_geninteger<_T>::value,
              Reducer &>
  operator++(int) {
    return static_cast<Reducer *>(this)->combine(static_cast<_T>(1));
  }

  template <typename _T = Ty, int _Dims = Dims>
  enable_if_t<(_Dims == 0) && IsPlus<_T, BinaryOp>::value, Reducer &>
  operator+=(const _T &Partial) {
    return static_cast<Reducer *>(this)->combine(Partial);
  }

  template <typename _T = Ty, int _Dims = Dims>
  enable_if_t<(_Dims == 0) && IsMultiplies<_T, BinaryOp>::value, Reducer &>
  operator*=(const _T &Partial) {
    return static_cast<Reducer *>(this)->combine(Partial);
  }

  template <typename _T = Ty, int _Dims = Dims>
  enable_if_t<(_Dims == 0) && IsBitOR<_T, BinaryOp>::value, Reducer &>
  operator|=(const _T &Partial) {
    return static_cast<Reducer *>(this)->combine(Partial);
  }

  template <typename _T = Ty, int _Dims = Dims>
  enable_if_t<(_Dims == 0) && IsBitXOR<_T, BinaryOp>::value, Reducer &>
  operator^=(const _T &Partial) {
    return static_cast<Reducer *>(this)->combine(Partial);
  }

  template <typename _T = Ty, int _Dims = Dims>
  enable_if_t<(_Dims == 0) && IsBitAND<_T, BinaryOp>::value, Reducer &>
  operator&=(const _T &Partial) {
    return static_cast<Reducer *>(this)->combine(Partial);
  }

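  // Illustrative sketch (not part of this header; assumes a queue Q and USM
  // pointers Data and Sum): inside a reduction kernel these shorthand
  // operators forward to reducer::combine():
  //
  //   Q.parallel_for(range<1>{N}, reduction(Sum, plus<int>()),
  //                  [=](id<1> I, auto &R) {
  //                    R += Data[I]; // equivalent to R.combine(Data[I])
  //                  });
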
private:
  template <access::address_space Space>
  static constexpr memory_scope getMemoryScope() {
    return Space == access::address_space::local_space
               ? memory_scope::work_group
               : memory_scope::device;
  }

  template <access::address_space Space, class T, class AtomicFunctor>
  void atomic_combine_impl(T *ReduVarPtr, AtomicFunctor Functor) const {
    auto reducer = static_cast<const Reducer *>(this);
    for (size_t E = 0; E < Extent; ++E) {
      auto AtomicRef = sycl::atomic_ref<T, memory_order::relaxed,
                                        getMemoryScope<Space>(), Space>(
          address_space_cast<Space, access::decorated::no>(ReduVarPtr)[E]);
      Functor(AtomicRef, reducer->getElement(E));
    }
  }

  template <class _T, access::address_space Space, class BinaryOp>
  static constexpr bool BasicCheck =
      std::is_same<remove_decoration_t<_T>, Ty>::value &&
      (Space == access::address_space::global_space ||
       Space == access::address_space::local_space);

public:
  /// Atomic ADD operation: *ReduVarPtr += MValue;
  template <access::address_space Space = access::address_space::global_space,
            typename _T = Ty, class _BinaryOperation = BinaryOp>
  enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
              (IsReduOptForFastAtomicFetch<_T, _BinaryOperation>::value ||
               IsReduOptForAtomic64Op<_T, _BinaryOperation>::value) &&
              IsPlus<_T, _BinaryOperation>::value>
  atomic_combine(_T *ReduVarPtr) const {
    atomic_combine_impl<Space>(
        ReduVarPtr, [](auto Ref, auto Val) { return Ref.fetch_add(Val); });
  }

  /// Atomic BITWISE OR operation: *ReduVarPtr |= MValue;
  template <access::address_space Space = access::address_space::global_space,
            typename _T = Ty, class _BinaryOperation = BinaryOp>
  enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
              IsReduOptForFastAtomicFetch<_T, _BinaryOperation>::value &&
              IsBitOR<_T, _BinaryOperation>::value>
  atomic_combine(_T *ReduVarPtr) const {
    atomic_combine_impl<Space>(
        ReduVarPtr, [](auto Ref, auto Val) { return Ref.fetch_or(Val); });
  }

  /// Atomic BITWISE XOR operation: *ReduVarPtr ^= MValue;
  template <access::address_space Space = access::address_space::global_space,
            typename _T = Ty, class _BinaryOperation = BinaryOp>
  enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
              IsReduOptForFastAtomicFetch<_T, _BinaryOperation>::value &&
              IsBitXOR<_T, _BinaryOperation>::value>
  atomic_combine(_T *ReduVarPtr) const {
    atomic_combine_impl<Space>(
        ReduVarPtr, [](auto Ref, auto Val) { return Ref.fetch_xor(Val); });
  }

  /// Atomic BITWISE AND operation: *ReduVarPtr &= MValue;
  template <access::address_space Space = access::address_space::global_space,
            typename _T = Ty, class _BinaryOperation = BinaryOp>
  enable_if_t<std::is_same<remove_decoration_t<_T>, Ty>::value &&
              IsReduOptForFastAtomicFetch<_T, _BinaryOperation>::value &&
              IsBitAND<_T, _BinaryOperation>::value &&
              (Space == access::address_space::global_space ||
               Space == access::address_space::local_space)>
  atomic_combine(_T *ReduVarPtr) const {
    atomic_combine_impl<Space>(
        ReduVarPtr, [](auto Ref, auto Val) { return Ref.fetch_and(Val); });
  }

  /// Atomic MIN operation: *ReduVarPtr = min(*ReduVarPtr, MValue);
  template <access::address_space Space = access::address_space::global_space,
            typename _T = Ty, class _BinaryOperation = BinaryOp>
  enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
              (IsReduOptForFastAtomicFetch<_T, _BinaryOperation>::value ||
               IsReduOptForAtomic64Op<_T, _BinaryOperation>::value) &&
              IsMinimum<_T, _BinaryOperation>::value>
  atomic_combine(_T *ReduVarPtr) const {
    atomic_combine_impl<Space>(
        ReduVarPtr, [](auto Ref, auto Val) { return Ref.fetch_min(Val); });
  }

  /// Atomic MAX operation: *ReduVarPtr = max(*ReduVarPtr, MValue);
  template <access::address_space Space = access::address_space::global_space,
            typename _T = Ty, class _BinaryOperation = BinaryOp>
  enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
              (IsReduOptForFastAtomicFetch<_T, _BinaryOperation>::value ||
               IsReduOptForAtomic64Op<_T, _BinaryOperation>::value) &&
              IsMaximum<_T, _BinaryOperation>::value>
  atomic_combine(_T *ReduVarPtr) const {
    atomic_combine_impl<Space>(
        ReduVarPtr, [](auto Ref, auto Val) { return Ref.fetch_max(Val); });
  }
};
} // namespace detail

// Specialization of 'reducer' for scalar reductions whose identity value is
// not statically known.
template <typename T, class BinaryOperation, int Dims, size_t Extent, bool View>
class reducer<
    T, BinaryOperation, Dims, Extent, View,
    std::enable_if_t<Dims == 0 && Extent == 1 && View == false &&
                     !detail::IsKnownIdentityOp<T, BinaryOperation>::value>>
    : public detail::combiner<
          reducer<T, BinaryOperation, Dims, Extent, View,
                  std::enable_if_t<
                      Dims == 0 && Extent == 1 && View == false &&
                      !detail::IsKnownIdentityOp<T, BinaryOperation>::value>>> {
public:
  reducer(const T &Identity, BinaryOperation BOp)
      : MValue(Identity), MIdentity(Identity), MBinaryOp(BOp) {}

  reducer &combine(const T &Partial) {
    MValue = MBinaryOp(MValue, Partial);
    return *this;
  }

  T getIdentity() const { return MIdentity; }

  T &getElement(size_t) { return MValue; }
  const T &getElement(size_t) const { return MValue; }
  T MValue;

private:
  const T MIdentity;
  BinaryOperation MBinaryOp;
};
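
// Illustrative sketch: this specialization backs reductions whose identity is
// only known at run time, e.g. a hypothetical user-defined operation:
//
//   auto Red = reduction(Ptr, /*Identity=*/MyT{}, MyBinaryOp{});
//
// The reducer then stores both the identity and the binary operation.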

// Specialization of 'reducer' for scalar reductions whose identity value is
// statically known.
template <typename T, class BinaryOperation, int Dims, size_t Extent, bool View>
class reducer<
    T, BinaryOperation, Dims, Extent, View,
    std::enable_if_t<Dims == 0 && Extent == 1 && View == false &&
                     detail::IsKnownIdentityOp<T, BinaryOperation>::value>>
    : public detail::combiner<
          reducer<T, BinaryOperation, Dims, Extent, View,
                  std::enable_if_t<
                      Dims == 0 && Extent == 1 && View == false &&
                      detail::IsKnownIdentityOp<T, BinaryOperation>::value>>> {
public:
  reducer() : MValue(getIdentity()) {}
  reducer(const T & /* Identity */, BinaryOperation) : MValue(getIdentity()) {}

  reducer &combine(const T &Partial) {
    BinaryOperation BOp;
    MValue = BOp(MValue, Partial);
    return *this;
  }

  static T getIdentity() {
    return detail::known_identity_impl<BinaryOperation, T>::value;
  }

  T &getElement(size_t) { return MValue; }
  const T &getElement(size_t) const { return MValue; }
  T MValue;
};

// Specialization of 'reducer' providing a view into a single element of an
// array reduction.
template <typename T, class BinaryOperation, int Dims, size_t Extent, bool View>
class reducer<T, BinaryOperation, Dims, Extent, View,
              std::enable_if_t<Dims == 0 && View == true>>
    : public detail::combiner<
          reducer<T, BinaryOperation, Dims, Extent, View,
                  std::enable_if_t<Dims == 0 && View == true>>> {
public:
  reducer(T &Ref, BinaryOperation BOp) : MElement(Ref), MBinaryOp(BOp) {}

  reducer &combine(const T &Partial) {
    MElement = MBinaryOp(MElement, Partial);
    return *this;
  }

private:
  T &MElement;
  BinaryOperation MBinaryOp;
};

// Specialization of 'reducer' for array reductions whose identity value is
// not statically known.
template <typename T, class BinaryOperation, int Dims, size_t Extent, bool View>
class reducer<
    T, BinaryOperation, Dims, Extent, View,
    std::enable_if_t<Dims == 1 && View == false &&
                     !detail::IsKnownIdentityOp<T, BinaryOperation>::value>>
    : public detail::combiner<
          reducer<T, BinaryOperation, Dims, Extent, View,
                  std::enable_if_t<
                      Dims == 1 && View == false &&
                      !detail::IsKnownIdentityOp<T, BinaryOperation>::value>>> {
public:
  reducer(const T &Identity, BinaryOperation BOp)
      : MValue(Identity), MIdentity(Identity), MBinaryOp(BOp) {}

  reducer<T, BinaryOperation, Dims - 1, Extent, true> operator[](size_t Index) {
    return {MValue[Index], MBinaryOp};
  }

  T getIdentity() const { return MIdentity; }
  T &getElement(size_t E) { return MValue[E]; }
  const T &getElement(size_t E) const { return MValue[E]; }

private:
  marray<T, Extent> MValue;
  const T MIdentity;
  BinaryOperation MBinaryOp;
};

// Specialization of 'reducer' for array reductions whose identity value is
// statically known.
template <typename T, class BinaryOperation, int Dims, size_t Extent, bool View>
class reducer<
    T, BinaryOperation, Dims, Extent, View,
    std::enable_if_t<Dims == 1 && View == false &&
                     detail::IsKnownIdentityOp<T, BinaryOperation>::value>>
    : public detail::combiner<
          reducer<T, BinaryOperation, Dims, Extent, View,
                  std::enable_if_t<
                      Dims == 1 && View == false &&
                      detail::IsKnownIdentityOp<T, BinaryOperation>::value>>> {
public:
  reducer() : MValue(getIdentity()) {}
  reducer(const T & /* Identity */, BinaryOperation) : MValue(getIdentity()) {}

  // SYCL 2020 revision 4 says this should be const, but this is a bug;
  // see https://github.com/KhronosGroup/SYCL-Docs/pull/252.
  reducer<T, BinaryOperation, Dims - 1, Extent, true> operator[](size_t Index) {
    return {MValue[Index], BinaryOperation()};
  }

  static T getIdentity() {
    return detail::known_identity_impl<BinaryOperation, T>::value;
  }

  T &getElement(size_t E) { return MValue[E]; }
  const T &getElement(size_t E) const { return MValue[E]; }

private:
  marray<T, Extent> MValue;
};
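
// Illustrative sketch (assumes constexpr size_t Bins, a queue Q, and USM
// arrays Data and Hist): span reductions use the array specializations above;
// operator[] yields a single-element 'View' reducer for one slot:
//
//   Q.parallel_for(range<1>{N},
//                  reduction(span<int, Bins>(Hist, Bins), plus<int>()),
//                  [=](id<1> I, auto &R) { R[Data[I] % Bins] += 1; });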

namespace detail {

template <typename T, class BinaryOperation> class reduction_impl_common {
protected:
  reduction_impl_common(const T &Identity, BinaryOperation BinaryOp,
                        bool Init = false)
      : MIdentity(Identity), MBinaryOp(BinaryOp), InitializeToIdentity(Init) {}

public:
  /// Returns the statically known identity value.
  template <typename _T = T, class _BinaryOperation = BinaryOperation>
  enable_if_t<IsKnownIdentityOp<_T, _BinaryOperation>::value,
              _T> constexpr getIdentity() {
    return known_identity_impl<_BinaryOperation, _T>::value;
  }

  /// Returns the identity value given by the user.
  template <typename _T = T, class _BinaryOperation = BinaryOperation>
  enable_if_t<!IsKnownIdentityOp<_T, _BinaryOperation>::value, _T>
  getIdentity() {
    return MIdentity;
  }

  /// Returns the binary operation associated with the reduction.
  BinaryOperation getBinaryOperation() const { return MBinaryOp; }
  bool initializeToIdentity() const { return InitializeToIdentity; }

protected:
  /// Identity of the BinaryOperation.
  /// The result of BinaryOperation(X, MIdentity) is equal to X for any X.
  const T MIdentity;

  BinaryOperation MBinaryOp;
  bool InitializeToIdentity;
};
514 
515 // Used for determining dimensions for temporary storage (mainly).
516 template <class T> struct data_dim_t {
517  static constexpr int value = 1;
518 };
519 
520 template <class T, int AccessorDims, access::mode Mode,
521  access::placeholder IsPH, typename PropList>
522 struct data_dim_t<
523  accessor<T, AccessorDims, Mode, access::target::device, IsPH, PropList>> {
524  static constexpr int value = AccessorDims;
525 };
526 
527 template <class T> struct get_red_t;
528 template <class T> struct get_red_t<T *> {
529  using type = T;
530 };
531 
532 template <class T, int AccessorDims, access::mode Mode,
533  access::placeholder IsPH, typename PropList>
534 struct get_red_t<
535  accessor<T, AccessorDims, Mode, access::target::device, IsPH, PropList>> {
536  using type = T;
537 };
538 
539 namespace reduction {
540 // Kernel name wrapper for initializing reduction-related memory through
541 // reduction_impl_algo::withInitializedMem.
542 template <typename KernelName> struct InitMemKrn;
543 } // namespace reduction
544 
547 template <class KernelName>
549  std::conditional_t<std::is_same<KernelName, auto_name>::value, auto_name,
551 
template <typename T, class BinaryOperation, int Dims, size_t Extent,
          typename RedOutVar>
class reduction_impl_algo : public reduction_impl_common<T, BinaryOperation> {
  using base = reduction_impl_common<T, BinaryOperation>;

public:
  using reducer_type = reducer<T, BinaryOperation, Dims, Extent>;
  using result_type = T;
  using binary_operation = BinaryOperation;

  static constexpr size_t dims = Dims;
  static constexpr bool has_float64_atomics =
      IsReduOptForAtomic64Op<T, BinaryOperation>::value;
  static constexpr bool has_fast_atomics =
      IsReduOptForFastAtomicFetch<T, BinaryOperation>::value;
  static constexpr bool has_fast_reduce =
      IsReduOptForFastReduce<T, BinaryOperation>::value;

  static constexpr bool is_usm = std::is_same_v<RedOutVar, T *>;

  static constexpr size_t num_elements = Extent;

  reduction_impl_algo(const T &Identity, BinaryOperation BinaryOp, bool Init,
                      RedOutVar RedOut)
      : base(Identity, BinaryOp, Init), MRedOut(std::move(RedOut)) {}

  auto getReadAccToPreviousPartialReds(handler &CGH) const {
    CGH.addReduction(MOutBufPtr);
    return accessor{*MOutBufPtr, CGH, sycl::read_only};
  }

  template <bool IsOneWG>
  auto getWriteMemForPartialReds(size_t Size, handler &CGH) {
    // If there is only one work-group, we can avoid creating a temporary
    // buffer with partial sums and write directly into the user's reduction
    // variable.
    if constexpr (IsOneWG) {
      return MRedOut;
    } else {
      MOutBufPtr = std::make_shared<buffer<T, 1>>(range<1>(Size));
      CGH.addReduction(MOutBufPtr);
      return accessor{*MOutBufPtr, CGH};
    }
  }

  template <class _T = T> auto &getTempBuffer(size_t Size, handler &CGH) {
    auto Buffer = std::make_shared<buffer<_T, 1>>(range<1>(Size));
    CGH.addReduction(Buffer);
    return *Buffer;
  }

  // Returns a write accessor for Size partial sums. If the reduction result
  // is not in USM and a single value is needed, writes go directly to the
  // user's reduction variable.
  auto getWriteAccForPartialReds(size_t Size, handler &CGH) {
    if constexpr (!is_usm) {
      if (Size == 1) {
        CGH.associateWithHandler(&MRedOut, access::target::device);
        return MRedOut;
      }
    }

    // Create a new output buffer and return an accessor to it.
    //
    // Array reductions are performed element-wise to avoid stack growth.
    MOutBufPtr = std::make_shared<buffer<T, 1>>(range<1>(Size));
    CGH.addReduction(MOutBufPtr);
    return accessor{*MOutBufPtr, CGH};
  }

  // Provides Func with properly initialized memory to write the reduction
  // result to.
  //
  // This currently optimizes for the number of kernel instantiations instead
  // of runtime latency. That might change in the future.
  template <typename KernelName, typename FuncTy>
  void withInitializedMem(handler &CGH, FuncTy Func) {
    // "Template" lambda to ensure that only one type of Func (USM/Buf) is
    // instantiated for the code below.
    auto DoIt = [&](auto &Out) {
      auto RWReduVal = std::make_shared<std::array<T, num_elements>>();
      for (int i = 0; i < num_elements; ++i) {
        (*RWReduVal)[i] = base::getIdentity();
      }
      CGH.addReduction(RWReduVal);
      auto Buf = std::make_shared<buffer<T, 1>>(RWReduVal.get()->data(),
                                                range<1>(num_elements));
      Buf->set_final_data();
      CGH.addReduction(Buf);
      accessor Mem{*Buf, CGH};
      Func(Mem);

      reduction::withAuxHandler(CGH, [&](handler &CopyHandler) {
        // MSVC (19.32.31329) has problems compiling the line below when used
        // as a host compiler in c++17 mode (but not in c++latest):
        //   accessor Mem{*Buf, CopyHandler};
        // so use the old-style API.
        auto Mem =
            Buf->template get_access<access::mode::read_write>(CopyHandler);
        if constexpr (is_usm) {
          // Can't capture the whole reduction; copy into distinct variables.
          bool IsUpdateOfUserVar = !base::initializeToIdentity();
          auto BOp = base::getBinaryOperation();

          // Don't use constexpr as non-default host compilers (unlike clang)
          // might actually create a capture resulting in binary differences
          // between host/device in lambda captures.
          size_t NElements = num_elements;

          CopyHandler.single_task<__sycl_init_mem_for<KernelName>>([=] {
            for (int i = 0; i < NElements; ++i) {
              if (IsUpdateOfUserVar)
                Out[i] = BOp(Out[i], Mem[i]);
              else
                Out[i] = Mem[i];
            }
          });
        } else {
          associateWithHandler(CopyHandler, &Out, access::target::device);
          CopyHandler.copy(Mem, Out);
        }
      });
    };
    if constexpr (is_usm) {
      // Don't dispatch based on base::initializeToIdentity() as that would
      // lead to two different instantiations of Func.
      DoIt(MRedOut);
    } else {
      if (base::initializeToIdentity())
        DoIt(MRedOut);
      else
        Func(MRedOut);
    }
  }

  accessor<int, 1, access::mode::read_write, access::target::device,
           access::placeholder::false_t>
  getReadWriteAccessorToInitializedGroupsCounter(handler &CGH) {
    auto CounterMem = std::make_shared<int>(0);
    CGH.addReduction(CounterMem);
    auto CounterBuf = std::make_shared<buffer<int, 1>>(CounterMem.get(), 1);
    CounterBuf->set_final_data();
    CGH.addReduction(CounterBuf);
    return {*CounterBuf, CGH};
  }

  // On discrete (vs. integrated) GPUs it's faster to initialize memory with
  // an extra kernel than to copy it from the host.
  int *getGroupsCounterAccDiscrete(handler &CGH) {
    queue q = createSyclObjFromImpl<queue>(CGH.MQueue);
    device Dev = q.get_device();
    auto Deleter = [=](auto *Ptr) { free(Ptr, q); };

    std::shared_ptr<int> Counter(malloc_device<int>(1, q), Deleter);
    CGH.addReduction(Counter);

    auto Event = q.memset(Counter.get(), 0, sizeof(int));
    CGH.depends_on(Event);

    return Counter.get();
  }

  RedOutVar &getUserRedVar() { return MRedOut; }

private:
  // Array reduction is performed element-wise to avoid stack growth, hence
  // this buffer is always one-dimensional.
  std::shared_ptr<buffer<T, 1>> MOutBufPtr;

  // User's accessor/USM pointer to where the reduction result must be
  // written.
  RedOutVar MRedOut;
};

template <typename T, class BinaryOperation, int Dims, size_t Extent,
          typename RedOutVar>
class reduction_impl
    : private reduction_impl_base,
      public reduction_impl_algo<T, BinaryOperation, Dims, Extent, RedOutVar> {
private:
  using algo = reduction_impl_algo<T, BinaryOperation, Dims, Extent, RedOutVar>;
  using self = reduction_impl<T, BinaryOperation, Dims, Extent, RedOutVar>;

  static constexpr bool is_known_identity =
      IsKnownIdentityOp<T, BinaryOperation>::value;

  // TODO: Do we also need chooseBinOp?
  static constexpr T chooseIdentity(const T &Identity) {
    // For now the implementation ignores the identity value given by the user
    // when the implementation knows the identity.
    // The SPEC could prohibit passing an identity parameter to operations with
    // a known identity, but that could have some bad consequences too.
    // For example, at some moment the implementation may NOT know the identity
    // for a COMPLEX-PLUS reduction. A user may create a program that passes
    // the COMPLEX value (0,0) as the identity for a PLUS reduction. At some
    // later moment, when the implementation starts handling COMPLEX-PLUS as a
    // known operation, the existing user's program remains compilable and
    // working correctly. I.e. with this constructor here, adding more
    // reduction operations to the list of known operations does not break
    // existing programs.
    if constexpr (is_known_identity) {
      (void)Identity;
      return reducer_type::getIdentity();

    } else {
      return Identity;
    }
  }

public:
  using algo::is_usm;

  using reducer_type = typename algo::reducer_type;

  // Only scalar and 1D array reductions are supported by SYCL 2020.
  static_assert(Dims <= 1, "Multi-dimensional reductions are not supported.");

  /// Constructs reduction_impl when no identity is specified; this is only
  /// available when the identity is statically known.
  template <typename _self = self,
            enable_if_t<_self::is_known_identity> * = nullptr>
  reduction_impl(RedOutVar Var, bool InitializeToIdentity = false)
      : algo(reducer_type::getIdentity(), BinaryOperation(),
             InitializeToIdentity, Var) {
    if constexpr (!is_usm)
      if (Var.size() != 1)
        throw sycl::runtime_error(errc::invalid,
                                  "Reduction variable must be a scalar.",
                                  PI_ERROR_INVALID_VALUE);
  }

  /// Constructs reduction_impl with an explicit identity value.
  reduction_impl(RedOutVar &Var, const T &Identity, BinaryOperation BOp,
                 bool InitializeToIdentity)
      : algo(chooseIdentity(Identity), BOp, InitializeToIdentity, Var) {
    if constexpr (!is_usm)
      if (Var.size() != 1)
        throw sycl::runtime_error(errc::invalid,
                                  "Reduction variable must be a scalar.",
                                  PI_ERROR_INVALID_VALUE);
  }
};

template <class BinaryOp, int Dims, size_t Extent, typename RedOutVar,
          typename... RestTy>
auto make_reduction(RedOutVar RedVar, RestTy &&...Rest) {
  return reduction_impl<typename get_red_t<RedOutVar>::type, BinaryOp, Dims,
                        Extent, RedOutVar>{RedVar,
                                           std::forward<RestTy>(Rest)...};
}

namespace reduction {
inline void finalizeHandler(handler &CGH) { CGH.finalize(); }
template <class FunctorTy> void withAuxHandler(handler &CGH, FunctorTy Func) {
  event E = CGH.finalize();
  handler AuxHandler(CGH.MQueue, CGH.MIsHost);
  AuxHandler.depends_on(E);
  AuxHandler.saveCodeLoc(CGH.MCodeLoc);
  Func(AuxHandler);
  CGH.MLastEvent = AuxHandler.finalize();
}
} // namespace reduction

// This method is used for the implementation of parallel_for accepting one
// reduction.
// TODO: remove this method when everything is switched to the general
// algorithm implementing an arbitrary number of reductions in parallel_for().
template <typename KernelName, class Reduction>
void reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu) {
  static_assert(Reduction::is_usm,
                "All implementations using this helper are expected to have "
                "USM reduction, not a buffer-based one.");
  size_t NElements = Reduction::num_elements;
  auto InAcc = Redu.getReadAccToPreviousPartialReds(CGH);
  auto UserVarPtr = Redu.getUserRedVar();
  bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
  auto BOp = Redu.getBinaryOperation();
  CGH.single_task<KernelName>([=] {
    for (int i = 0; i < NElements; ++i) {
      if (IsUpdateOfUserVar)
        UserVarPtr[i] = BOp(UserVarPtr[i], InAcc.get_pointer()[i]);
      else
        UserVarPtr[i] = InAcc.get_pointer()[i];
    }
  });
}

namespace reduction {
template <typename KernelName, strategy S, class... Ts> struct MainKrn;
template <typename KernelName, strategy S, class... Ts> struct AuxKrn;
} // namespace reduction

// A helper to pass undefined (sycl::detail::auto_name) kernel names
// unmodified, to avoid name collisions.
template <template <typename, reduction::strategy, typename...> class MainOrAux,
          class KernelName, reduction::strategy Strategy, class... Ts>
using __sycl_reduction_kernel =
    std::conditional_t<std::is_same<KernelName, auto_name>::value, auto_name,
                       MainOrAux<KernelName, Strategy, Ts...>>;

// Implementations.

template <reduction::strategy> struct NDRangeReduction;

template <>
struct NDRangeReduction<reduction::strategy::local_atomic_and_atomic_cross_wg> {
  template <typename KernelName, int Dims, typename PropertiesT,
            typename KernelType, typename Reduction>
  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
                  nd_range<Dims> NDRange, PropertiesT &Properties,
                  Reduction &Redu, KernelType &KernelFunc) {
    std::ignore = Queue;
    using Name = __sycl_reduction_kernel<
        reduction::MainKrn, KernelName,
        reduction::strategy::local_atomic_and_atomic_cross_wg>;
    Redu.template withInitializedMem<Name>(CGH, [&](auto Out) {
      size_t NElements = Reduction::num_elements;
      local_accessor<typename Reduction::result_type, 1> GroupSum{NElements,
                                                                  CGH};

      CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<1> NDId) {
        // Call user's functions. Reducer.MValue gets initialized there.
        typename Reduction::reducer_type Reducer;
        KernelFunc(NDId, Reducer);

        // The work-group cooperates to initialize multiple reduction
        // variables.
        auto LID = NDId.get_local_id(0);
        for (size_t E = LID; E < NElements; E += NDId.get_local_range(0)) {
          GroupSum[E] = Reducer.getIdentity();
        }
        NDId.barrier();

        // Each work-item combines its own partial results into the
        // work-group's sums.
        Reducer.template atomic_combine<access::address_space::local_space>(
            &GroupSum[0]);

        // A single work-item performs finalization for the entire work-group.
        // TODO: Opportunity to parallelize across elements.
        NDId.barrier();
        if (LID == 0) {
          for (size_t E = 0; E < NElements; ++E) {
            Reducer.getElement(E) = GroupSum[E];
          }
          Reducer.template atomic_combine(&Out[0]);
        }
      });
    });
  }
};

template <>
struct NDRangeReduction<
    reduction::strategy::group_reduce_and_last_wg_detection> {
  template <typename KernelName, int Dims, typename PropertiesT,
            typename KernelType, typename Reduction>
  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
                  nd_range<Dims> NDRange, PropertiesT &Properties,
                  Reduction &Redu, KernelType &KernelFunc) {
    std::ignore = Queue;
    size_t NElements = Reduction::num_elements;
    size_t WGSize = NDRange.get_local_range().size();
    size_t NWorkGroups = NDRange.get_group_range().size();

    auto &Out = Redu.getUserRedVar();
    if constexpr (!Reduction::is_usm)
      associateWithHandler(CGH, &Out, access::target::device);

    auto &PartialSumsBuf = Redu.getTempBuffer(NWorkGroups * NElements, CGH);
    accessor PartialSums(PartialSumsBuf, CGH, sycl::read_write, sycl::no_init);

    bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
    auto Rest = [&](auto NWorkGroupsFinished) {
      local_accessor<int, 1> DoReducePartialSumsInLastWG{1, CGH};

      using Name = __sycl_reduction_kernel<
          reduction::MainKrn, KernelName,
          reduction::strategy::group_reduce_and_last_wg_detection,
          decltype(NWorkGroupsFinished)>;

      CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<1> NDId) {
        // Call user's functions. Reducer.MValue gets initialized there.
        typename Reduction::reducer_type Reducer;
        KernelFunc(NDId, Reducer);

        typename Reduction::binary_operation BOp;
        auto Group = NDId.get_group();

        // If there are multiple values, reduce each separately:
        // reduce_over_group is only defined for each T, not for span<T, ...>.
        size_t LID = NDId.get_local_id(0);
        for (int E = 0; E < NElements; ++E) {
          auto &RedElem = Reducer.getElement(E);
          RedElem = reduce_over_group(Group, RedElem, BOp);
          if (LID == 0) {
            if (NWorkGroups == 1) {
              // Can avoid using the partial sums and write the final result
              // immediately.
              if (IsUpdateOfUserVar)
                RedElem = BOp(RedElem, Out[E]);
              Out[E] = RedElem;
            } else {
              PartialSums[NDId.get_group_linear_id() * NElements + E] =
                  Reducer.getElement(E);
            }
          }
        }

        if (NWorkGroups == 1)
          // We're done.
          return;

        // Signal that this work-group has finished after all values are
        // reduced.
        if (LID == 0) {
          auto NFinished =
              sycl::atomic_ref<int, memory_order::acq_rel, memory_scope::device,
                               access::address_space::global_space>(
                  NWorkGroupsFinished[0]);
          DoReducePartialSumsInLastWG[0] = ++NFinished == NWorkGroups;
        }

        workGroupBarrier();
        if (DoReducePartialSumsInLastWG[0]) {
          // Reduce each result separately.
          // TODO: Opportunity to parallelize across elements.
          for (int E = 0; E < NElements; ++E) {
            auto LocalSum = Reducer.getIdentity();
            for (size_t I = LID; I < NWorkGroups; I += WGSize)
              LocalSum = BOp(LocalSum, PartialSums[I * NElements + E]);
            auto Result = reduce_over_group(Group, LocalSum, BOp);

            if (LID == 0) {
              if (IsUpdateOfUserVar)
                Result = BOp(Result, Out[E]);
              Out[E] = Result;
            }
          }
        }
      });
    };

    auto device = getDeviceFromHandler(CGH);
    // Integrated and discrete GPUs have different fast paths. The fast path
    // for discrete GPUs requires USM device allocations, so check for that as
    // well.
    if (device.get_info<info::device::host_unified_memory>() ||
        !device.has(aspect::usm_device_allocations))
      Rest(Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH));
    else
      Rest(Redu.getGroupsCounterAccDiscrete(CGH));
  }
};

template <typename LocalRedsTy, typename BinOpTy, typename BarrierTy,
          typename IdentityTy>
void doTreeReduction(size_t WGSize, size_t LID, bool DisableExtraElem,
                     IdentityTy Identity, LocalRedsTy &LocalReds, BinOpTy &BOp,
                     BarrierTy Barrier) {
  // For work-groups whose size is not a power of two, local accessors have
  // an additional element with index WGSize that is used by the
  // tree-reduction algorithm. Initialize those additional elements with
  // identity values here.
  if (!DisableExtraElem)
    LocalReds[WGSize] = Identity;
  Barrier();
  size_t PrevStep = WGSize;
  for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
    if (LID < CurStep)
      LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
    else if (!DisableExtraElem && LID == CurStep && (PrevStep & 0x1))
      LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
    Barrier();
    PrevStep = CurStep;
  }
}
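
// Illustrative trace of doTreeReduction for WGSize = 6 (not a power of two)
// with DisableExtraElem == false; LocalReds[6] starts as the identity, and
// "+=" below stands for combining via BOp:
//   CurStep = 3: L[0] += L[3], L[1] += L[4], L[2] += L[5]
//                (PrevStep = 6 is even, so there is no odd-element fix-up)
//   CurStep = 1: L[0] += L[1]; LID == 1 and PrevStep = 3 is odd,
//                so L[6] += L[2]
// After the loop, BOp(LocalReds[0], LocalReds[WGSize]) combines the values of
// all six work-items, which is exactly what the callers below compute.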

template <> struct NDRangeReduction<reduction::strategy::range_basic> {
  template <typename KernelName, int Dims, typename PropertiesT,
            typename KernelType, typename Reduction>
  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
                  nd_range<Dims> NDRange, PropertiesT &Properties,
                  Reduction &Redu, KernelType &KernelFunc) {
    std::ignore = Queue;
    size_t NElements = Reduction::num_elements;
    size_t WGSize = NDRange.get_local_range().size();
    size_t NWorkGroups = NDRange.get_group_range().size();

    bool IsUpdateOfUserVar = !Reduction::is_usm && !Redu.initializeToIdentity();
    auto PartialSums =
        Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);
    auto Out = (NWorkGroups == 1)
                   ? PartialSums
                   : Redu.getWriteAccForPartialReds(NElements, CGH);
    local_accessor<typename Reduction::result_type, 1> LocalReds{WGSize + 1,
                                                                 CGH};
    auto NWorkGroupsFinished =
        Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH);
    local_accessor<int, 1> DoReducePartialSumsInLastWG{1, CGH};

    auto Identity = Redu.getIdentity();
    auto BOp = Redu.getBinaryOperation();

    using Name = __sycl_reduction_kernel<reduction::MainKrn, KernelName,
                                         reduction::strategy::range_basic>;

    CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<1> NDId) {
      // Call user's functions. Reducer.MValue gets initialized there.
      typename Reduction::reducer_type Reducer(Identity, BOp);
      KernelFunc(NDId, Reducer);

      // If there are multiple values, reduce each separately.
      // This prevents local memory from scaling with the number of elements.
      size_t LID = NDId.get_local_linear_id();
      for (int E = 0; E < NElements; ++E) {

        // Copy the element to local memory to prepare it for tree-reduction.
        LocalReds[LID] = Reducer.getElement(E);

        doTreeReduction(WGSize, LID, false, Identity, LocalReds, BOp,
                        [&]() { workGroupBarrier(); });

        if (LID == 0) {
          auto V = BOp(LocalReds[0], LocalReds[WGSize]);
          if (NWorkGroups == 1 && IsUpdateOfUserVar)
            V = BOp(V, Out[E]);
          // If NWorkGroups == 1, then PartialSums and Out point to the same
          // memory.
          PartialSums[NDId.get_group_linear_id() * NElements + E] = V;
        }
      }

      // Signal that this work-group has finished after all values are
      // reduced.
      if (LID == 0) {
        auto NFinished =
            sycl::atomic_ref<int, memory_order::acq_rel, memory_scope::device,
                             access::address_space::global_space>(
                NWorkGroupsFinished[0]);
        DoReducePartialSumsInLastWG[0] =
            ++NFinished == NWorkGroups && NWorkGroups > 1;
      }

      workGroupBarrier();
      if (DoReducePartialSumsInLastWG[0]) {
        // Reduce each result separately.
        // TODO: Opportunity to parallelize across elements.
        for (int E = 0; E < NElements; ++E) {
          auto LocalSum = Identity;
          for (size_t I = LID; I < NWorkGroups; I += WGSize)
            LocalSum = BOp(LocalSum, PartialSums[I * NElements + E]);

          LocalReds[LID] = LocalSum;

          doTreeReduction(WGSize, LID, false, Identity, LocalReds, BOp,
                          [&]() { workGroupBarrier(); });
          if (LID == 0) {
            auto V = BOp(LocalReds[0], LocalReds[WGSize]);
            if (IsUpdateOfUserVar)
              V = BOp(V, Out[E]);
            Out[E] = V;
          }
        }
      }
    });

    if constexpr (Reduction::is_usm)
      reduction::withAuxHandler(CGH, [&](handler &CopyHandler) {
        reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
      });
  }
};

template <>
struct NDRangeReduction<reduction::strategy::group_reduce_and_atomic_cross_wg> {
  template <typename KernelName, int Dims, typename PropertiesT,
            typename KernelType, typename Reduction>
  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
                  nd_range<Dims> NDRange, PropertiesT &Properties,
                  Reduction &Redu, KernelType &KernelFunc) {
    std::ignore = Queue;
    using Name = __sycl_reduction_kernel<
        reduction::MainKrn, KernelName,
        reduction::strategy::group_reduce_and_atomic_cross_wg>;
    Redu.template withInitializedMem<Name>(CGH, [&](auto Out) {
      size_t NElements = Reduction::num_elements;

      CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<Dims> NDIt) {
        // Call user's function. Reducer.MValue gets initialized there.
        typename Reduction::reducer_type Reducer;
        KernelFunc(NDIt, Reducer);

        typename Reduction::binary_operation BOp;
        for (int E = 0; E < NElements; ++E) {
          Reducer.getElement(E) =
              reduce_over_group(NDIt.get_group(), Reducer.getElement(E), BOp);
        }
        if (NDIt.get_local_linear_id() == 0)
          Reducer.atomic_combine(&Out[0]);
      });
    });
  }
};

template <>
struct NDRangeReduction<
    reduction::strategy::local_mem_tree_and_atomic_cross_wg> {
  template <typename KernelName, int Dims, typename PropertiesT,
            typename KernelType, typename Reduction>
  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
                  nd_range<Dims> NDRange, PropertiesT &Properties,
                  Reduction &Redu, KernelType &KernelFunc) {
    std::ignore = Queue;
    using Name = __sycl_reduction_kernel<
        reduction::MainKrn, KernelName,
        reduction::strategy::local_mem_tree_and_atomic_cross_wg>;
    Redu.template withInitializedMem<Name>(CGH, [&](auto Out) {
      size_t NElements = Reduction::num_elements;
      size_t WGSize = NDRange.get_local_range().size();
      bool IsPow2WG = (WGSize & (WGSize - 1)) == 0;

      // Use local memory to reduce elements in work-groups into the zero-th
      // element. If WGSize is not a power of two, then WGSize+1 elements are
      // allocated. The additional last element is used to catch the elements
      // that could otherwise be lost in the tree-reduction algorithm used in
      // the kernel.
      size_t NLocalElements = WGSize + (IsPow2WG ? 0 : 1);
      local_accessor<typename Reduction::result_type, 1> LocalReds{
          NLocalElements, CGH};

      CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<Dims> NDIt) {
        // Call user's functions. Reducer.MValue gets initialized there.
        typename Reduction::reducer_type Reducer;
        KernelFunc(NDIt, Reducer);

        size_t WGSize = NDIt.get_local_range().size();
        size_t LID = NDIt.get_local_linear_id();

        // If there are multiple values, reduce each separately.
        // This prevents local memory from scaling with the number of
        // elements.
        for (int E = 0; E < NElements; ++E) {

          // Copy the element to local memory to prepare it for
          // tree-reduction.
          LocalReds[LID] = Reducer.getElement(E);

          typename Reduction::binary_operation BOp;
          doTreeReduction(WGSize, LID, IsPow2WG, Reducer.getIdentity(),
                          LocalReds, BOp, [&]() { NDIt.barrier(); });

          if (LID == 0) {
            Reducer.getElement(E) =
                IsPow2WG ? LocalReds[0] : BOp(LocalReds[0], LocalReds[WGSize]);
          }

          // Ensure item 0 is finished with LocalReds before the next
          // iteration.
          if (E != NElements - 1) {
            NDIt.barrier();
          }
        }

        if (LID == 0) {
          Reducer.atomic_combine(&Out[0]);
        }
      });
    });
  }
};

template <>
struct NDRangeReduction<
    reduction::strategy::group_reduce_and_multiple_kernels> {
  template <typename KernelName, int Dims, typename PropertiesT,
            typename KernelType, typename Reduction>
  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
                  nd_range<Dims> NDRange, PropertiesT &Properties,
                  Reduction &Redu, KernelType &KernelFunc) {
    // Before running the kernels, check that the device has enough local
    // memory to hold the local arrays that may be required for the reduction
    // algorithm.
    // TODO: If the work-group size is limited by the local memory, then a
    // special version of the main kernel may be created: one that would not
    // use local accessors, which means it would not do the reduction in the
    // main kernel, but simply generate NDRange.get_global_range().size()
    // partial sums, leaving the reduction work to the additional/aux kernels.
    constexpr bool HFR = Reduction::has_fast_reduce;
    size_t OneElemSize = HFR ? 0 : sizeof(typename Reduction::result_type);
    // TODO: currently the maximal work-group size is determined for the given
    // queue/device, while it may be safer to use queries on the kernel
    // compiled for the device.
    size_t MaxWGSize = reduGetMaxWGSize(Queue, OneElemSize);
    if (NDRange.get_local_range().size() > MaxWGSize)
      throw sycl::runtime_error("The implementation handling parallel_for with"
                                " reduction requires work group size not bigger"
                                " than " +
                                    std::to_string(MaxWGSize),
                                PI_ERROR_INVALID_WORK_GROUP_SIZE);

    size_t NElements = Reduction::num_elements;
    size_t NWorkGroups = NDRange.get_group_range().size();
    auto Out = Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);

    bool IsUpdateOfUserVar =
        !Reduction::is_usm && !Redu.initializeToIdentity() && NWorkGroups == 1;

    using Name = __sycl_reduction_kernel<
        reduction::MainKrn, KernelName,
        reduction::strategy::group_reduce_and_multiple_kernels>;

    CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<Dims> NDIt) {
      // Call user's functions. Reducer.MValue gets initialized there.
      typename Reduction::reducer_type Reducer;
      KernelFunc(NDIt, Reducer);

      // Compute the partial sum/reduction for the work-group.
      size_t WGID = NDIt.get_group_linear_id();
      typename Reduction::binary_operation BOp;
      for (int E = 0; E < NElements; ++E) {
        typename Reduction::result_type PSum;
        PSum = Reducer.getElement(E);
        PSum = reduce_over_group(NDIt.get_group(), PSum, BOp);
        if (NDIt.get_local_linear_id() == 0) {
          if (IsUpdateOfUserVar)
            PSum = BOp(Out[E], PSum);
          Out[WGID * NElements + E] = PSum;
        }
      }
    });

    reduction::finalizeHandler(CGH);

    // Run the additional kernel as many times as needed to reduce all partial
    // sums into one scalar.

    // TODO: Create a special slow/sequential version of the kernel that would
    // handle the reduction instead of reporting an assert below.
    if (MaxWGSize <= 1)
      throw sycl::runtime_error("The implementation handling parallel_for with "
                                "reduction requires the maximal work group "
                                "size to be greater than 1 to converge. "
                                "The maximal work group size depends on the "
                                "device and the size of the objects passed to "
                                "the reduction.",
                                PI_ERROR_INVALID_WORK_GROUP_SIZE);
    size_t NWorkItems = NDRange.get_group_range().size();
    while (NWorkItems > 1) {
      reduction::withAuxHandler(CGH, [&](handler &AuxHandler) {
        size_t NElements = Reduction::num_elements;
        size_t NWorkGroups;
        size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups);

        // The last work-group may not be fully loaded with work, or the
        // work-group size may not be a power of two. Both cases are
        // considered inefficient as they require additional code and checks
        // in the kernel.
        bool HasUniformWG = NWorkGroups * WGSize == NWorkItems;
        if (!Reduction::has_fast_reduce)
          HasUniformWG = HasUniformWG && (WGSize & (WGSize - 1)) == 0;

        // Get a read accessor to the buffer that was used as output in the
        // previous kernel.
        auto In = Redu.getReadAccToPreviousPartialReds(AuxHandler);
        auto Out =
            Redu.getWriteAccForPartialReds(NWorkGroups * NElements, AuxHandler);

        using Name = __sycl_reduction_kernel<
            reduction::AuxKrn, KernelName,
            reduction::strategy::group_reduce_and_multiple_kernels>;

        bool IsUpdateOfUserVar = !Reduction::is_usm &&
                                 !Redu.initializeToIdentity() &&
                                 NWorkGroups == 1;
        range<1> GlobalRange = {HasUniformWG ? NWorkItems
                                             : NWorkGroups * WGSize};
        nd_range<1> Range{GlobalRange, range<1>(WGSize)};
        AuxHandler.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
          typename Reduction::binary_operation BOp;
          size_t WGID = NDIt.get_group_linear_id();
          size_t GID = NDIt.get_global_linear_id();

          for (int E = 0; E < NElements; ++E) {
            typename Reduction::result_type PSum =
                (HasUniformWG || (GID < NWorkItems))
                    ? In[GID * NElements + E]
                    : Reduction::reducer_type::getIdentity();
            PSum = reduce_over_group(NDIt.get_group(), PSum, BOp);
            if (NDIt.get_local_linear_id() == 0) {
              if (IsUpdateOfUserVar)
                PSum = BOp(Out[E], PSum);
              Out[WGID * NElements + E] = PSum;
            }
          }
        });
        NWorkItems = NWorkGroups;
      });
    } // end while (NWorkItems > 1)

    if constexpr (Reduction::is_usm) {
      reduction::withAuxHandler(CGH, [&](handler &CopyHandler) {
        reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
      });
    }
  }
};

template <> struct NDRangeReduction<reduction::strategy::basic> {
  template <typename KernelName, int Dims, typename PropertiesT,
            typename KernelType, typename Reduction>
  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
                  nd_range<Dims> NDRange, PropertiesT &Properties,
                  Reduction &Redu, KernelType &KernelFunc) {
    constexpr bool HFR = Reduction::has_fast_reduce;
    size_t OneElemSize = HFR ? 0 : sizeof(typename Reduction::result_type);
    // TODO: currently the maximal work-group size is determined for the given
    // queue/device, while it may be safer to use queries on the kernel
    // compiled for the device.
    size_t MaxWGSize = reduGetMaxWGSize(Queue, OneElemSize);
    if (NDRange.get_local_range().size() > MaxWGSize)
      throw sycl::runtime_error("The implementation handling parallel_for with"
                                " reduction requires work group size not bigger"
                                " than " +
                                    std::to_string(MaxWGSize),
                                PI_ERROR_INVALID_WORK_GROUP_SIZE);

    size_t NElements = Reduction::num_elements;
    size_t WGSize = NDRange.get_local_range().size();
    bool IsPow2WG = (WGSize & (WGSize - 1)) == 0;
    size_t NWorkGroups = NDRange.get_group_range().size();
    auto Out = Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);

    bool IsUpdateOfUserVar =
        !Reduction::is_usm && !Redu.initializeToIdentity() && NWorkGroups == 1;

    // Use local memory to reduce elements in work-groups into the 0-th
    // element. If WGSize is not a power of two, then WGSize+1 elements are
    // allocated. The additional last element is used to catch elements that
    // could otherwise be lost in the tree-reduction algorithm.
    size_t NumLocalElements = WGSize + (IsPow2WG ? 0 : 1);
    local_accessor<typename Reduction::result_type, 1> LocalReds{
        NumLocalElements, CGH};
    typename Reduction::result_type ReduIdentity = Redu.getIdentity();
    using Name = __sycl_reduction_kernel<reduction::MainKrn, KernelName,
                                         reduction::strategy::basic>;

    auto BOp = Redu.getBinaryOperation();
    CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<Dims> NDIt) {
      // Call user's functions. Reducer.MValue gets initialized there.
      typename Reduction::reducer_type Reducer(ReduIdentity, BOp);
      KernelFunc(NDIt, Reducer);

      size_t WGSize = NDIt.get_local_range().size();
      size_t LID = NDIt.get_local_linear_id();

      // If there are multiple values, reduce each separately.
      // This prevents local memory from scaling with the number of elements.
      for (int E = 0; E < NElements; ++E) {

        // Copy the element to local memory to prepare it for tree-reduction.
        LocalReds[LID] = Reducer.getElement(E);

        doTreeReduction(WGSize, LID, IsPow2WG, ReduIdentity, LocalReds, BOp,
                        [&]() { NDIt.barrier(); });

        // Compute the partial sum/reduction for the work-group.
        if (LID == 0) {
          size_t GrID = NDIt.get_group_linear_id();
          typename Reduction::result_type PSum =
              IsPow2WG ? LocalReds[0] : BOp(LocalReds[0], LocalReds[WGSize]);
          if (IsUpdateOfUserVar)
            PSum = BOp(Out[E], PSum);
          Out[GrID * NElements + E] = PSum;
        }

        // Ensure item 0 is finished with LocalReds before the next iteration.
        if (E != NElements - 1) {
          NDIt.barrier();
        }
      }
    });

    reduction::finalizeHandler(CGH);

    // 2. Run the additional kernel as many times as needed to reduce
    // all partial sums into one scalar.

    // TODO: Create a special slow/sequential version of the kernel that would
    // handle the reduction instead of reporting an assert below.
    if (MaxWGSize <= 1)
      throw sycl::runtime_error("The implementation handling parallel_for with "
                                "reduction requires the maximal work group "
                                "size to be greater than 1 to converge. "
                                "The maximal work group size depends on the "
                                "device and the size of the objects passed to "
                                "the reduction.",
                                PI_ERROR_INVALID_WORK_GROUP_SIZE);
    size_t NWorkItems = NDRange.get_group_range().size();
    while (NWorkItems > 1) {
      reduction::withAuxHandler(CGH, [&](handler &AuxHandler) {
        size_t NElements = Reduction::num_elements;
        size_t NWorkGroups;
        size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups);

        // The last work-group may not be fully loaded with work, or the
        // work-group size may not be a power of two. Both cases are
        // considered inefficient as they require additional code and checks
        // in the kernel.
        bool HasUniformWG = NWorkGroups * WGSize == NWorkItems;

        // Get a read accessor to the buffer that was used as output in the
        // previous kernel.
        auto In = Redu.getReadAccToPreviousPartialReds(AuxHandler);
        auto Out =
            Redu.getWriteAccForPartialReds(NWorkGroups * NElements, AuxHandler);

        bool IsUpdateOfUserVar = !Reduction::is_usm &&
                                 !Redu.initializeToIdentity() &&
                                 NWorkGroups == 1;

        bool UniformPow2WG = HasUniformWG && (WGSize & (WGSize - 1)) == 0;
        // Use local memory to reduce elements in work-groups into the 0-th
        // element. If WGSize is not a power of two, then WGSize+1 elements
        // are allocated. The additional last element is used to catch
        // elements that could otherwise be lost in the tree-reduction
        // algorithm.
        size_t NumLocalElements = WGSize + (UniformPow2WG ? 0 : 1);
        local_accessor<typename Reduction::result_type, 1> LocalReds{
            NumLocalElements, AuxHandler};

        auto ReduIdentity = Redu.getIdentity();
        auto BOp = Redu.getBinaryOperation();
        using Name = __sycl_reduction_kernel<reduction::AuxKrn, KernelName,
                                             reduction::strategy::basic>;

        range<1> GlobalRange = {UniformPow2WG ? NWorkItems
                                              : NWorkGroups * WGSize};
        nd_range<1> Range{GlobalRange, range<1>(WGSize)};
        AuxHandler.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
          size_t WGSize = NDIt.get_local_range().size();
          size_t LID = NDIt.get_local_linear_id();
          size_t GID = NDIt.get_global_linear_id();

          for (int E = 0; E < NElements; ++E) {
            // Copy the element to local memory to prepare it for
            // tree-reduction.
            LocalReds[LID] = (UniformPow2WG || GID < NWorkItems)
                                 ? In[GID * NElements + E]
                                 : ReduIdentity;

            doTreeReduction(WGSize, LID, UniformPow2WG, ReduIdentity, LocalReds,
                            BOp, [&]() { NDIt.barrier(); });

            // Compute the partial sum/reduction for the work-group.
            if (LID == 0) {
              size_t GrID = NDIt.get_group_linear_id();
              typename Reduction::result_type PSum =
                  UniformPow2WG ? LocalReds[0]
                                : BOp(LocalReds[0], LocalReds[WGSize]);
              if (IsUpdateOfUserVar)
                PSum = BOp(Out[E], PSum);
              Out[GrID * NElements + E] = PSum;
            }

            // Ensure item 0 is finished with LocalReds before the next
            // iteration.
            if (E != NElements - 1) {
              NDIt.barrier();
            }
          }
        });
        NWorkItems = NWorkGroups;
      });
    } // end while (NWorkItems > 1)

    if constexpr (Reduction::is_usm) {
      reduction::withAuxHandler(CGH, [&](handler &CopyHandler) {
        reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
      });
    }
  }
};

template <bool IsOneWG, typename... Reductions, size_t... Is>
auto createReduOutAccs(size_t NWorkGroups, handler &CGH,
                       std::tuple<Reductions...> &ReduTuple,
                       std::index_sequence<Is...>) {
  return makeReduTupleT(
      std::get<Is>(ReduTuple).template getWriteMemForPartialReds<IsOneWG>(
          NWorkGroups *
              std::tuple_element_t<Is, std::tuple<Reductions...>>::num_elements,
          CGH)...);
}

template <typename... LocalAccT, typename... BOPsT, size_t... Is>
void reduceReduLocalAccs(size_t IndexA, size_t IndexB,
                         ReduTupleT<LocalAccT...> LocalAccs,
                         ReduTupleT<BOPsT...> BOPs,
                         std::index_sequence<Is...>) {
  auto ProcessOne = [=](auto &LocalAcc, auto &BOp) {
    LocalAcc[IndexA] = BOp(LocalAcc[IndexA], LocalAcc[IndexB]);
  };
  (ProcessOne(std::get<Is>(LocalAccs), std::get<Is>(BOPs)), ...);
}

template <typename... Reductions, typename... OutAccT, typename... LocalAccT,
          typename... BOPsT, typename... Ts, size_t... Is>
void writeReduSumsToOutAccs(
    bool Pow2WG, bool IsOneWG, size_t OutAccIndex, size_t WGSize,
    ReduTupleT<OutAccT...> OutAccs, ReduTupleT<LocalAccT...> LocalAccs,
    ReduTupleT<BOPsT...> BOPs, ReduTupleT<Ts...> IdentityVals,
    std::array<bool, sizeof...(Reductions)> IsInitializeToIdentity,
    std::index_sequence<Is...>) {
  // Add the initial value of the user's variable to the final result.
  if (IsOneWG)
    ((std::get<Is>(LocalAccs)[0] = std::get<Is>(BOPs)(
          std::get<Is>(LocalAccs)[0], IsInitializeToIdentity[Is]
                                          ? std::get<Is>(IdentityVals)
                                          : std::get<Is>(OutAccs)[0])),
     ...);

  if (Pow2WG) {
    // The partial sums for the work-group are stored in the 0-th elements of
    // the local accessors. Simply write those sums to the output accessors.
    ((std::get<Is>(OutAccs)[OutAccIndex] = std::get<Is>(LocalAccs)[0]), ...);
  } else {
    // Each of the local accessors keeps two partial sums: in the 0-th and
    // WGSize-th elements. Combine them into final partial sums and write them
    // to the output accessors.
    ((std::get<Is>(OutAccs)[OutAccIndex] = std::get<Is>(BOPs)(
          std::get<Is>(LocalAccs)[0], std::get<Is>(LocalAccs)[WGSize])),
     ...);
  }
}

// Concatenate an empty sequence.
constexpr std::index_sequence<> concat_sequences(std::index_sequence<>) {
  return {};
}

// Concatenate a sequence consisting of one element.
template <size_t I>
constexpr std::index_sequence<I> concat_sequences(std::index_sequence<I>) {
  return {};
}

// Concatenate two potentially empty sequences.
template <size_t... Is, size_t... Js>
constexpr std::index_sequence<Is..., Js...>
concat_sequences(std::index_sequence<Is...>, std::index_sequence<Js...>) {
  return {};
}

// Concatenate more than two sequences.
template <size_t... Is, size_t... Js, class... Rs>
constexpr auto concat_sequences(std::index_sequence<Is...>,
                                std::index_sequence<Js...>, Rs...) {
  return concat_sequences(std::index_sequence<Is..., Js...>{}, Rs{}...);
}
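
// Illustrative sketch: concatenation folds left over the argument pack, e.g.
//
//   static_assert(std::is_same_v<
//       decltype(concat_sequences(std::index_sequence<0>{},
//                                 std::index_sequence<>{},
//                                 std::index_sequence<2, 3>{})),
//       std::index_sequence<0, 2, 3>>);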
1609 
1611  template <typename T> struct Func {
1612  static constexpr bool value = !std::remove_pointer_t<T>::is_usm;
1613  };
1614 };
1615 
1617  template <typename T> struct Func {
1618  static constexpr bool value = false;
1619  };
1620 };
1621 
1622 template <bool Cond, size_t I> struct FilterElement {
1623  using type =
1624  std::conditional_t<Cond, std::index_sequence<I>, std::index_sequence<>>;
1625 };
1626 
1632 template <typename... T, typename FunctorT, size_t... Is,
1633  std::enable_if_t<(sizeof...(Is) > 0), int> Z = 0>
1634 constexpr auto filterSequenceHelper(FunctorT, std::index_sequence<Is...>) {
1635  return concat_sequences(
1636  typename FilterElement<FunctorT::template Func<std::tuple_element_t<
1637  Is, std::tuple<T...>>>::value,
1638  Is>::type{}...);
1639 }
1640 template <typename... T, typename FunctorT, size_t... Is,
1641  std::enable_if_t<(sizeof...(Is) == 0), int> Z = 0>
1642 constexpr auto filterSequenceHelper(FunctorT, std::index_sequence<Is...>) {
1643  return std::index_sequence<>{};
1644 }
1645 
1646 /// For each index 'I' from the given indices pack 'Is' this function returns
1647 /// an index sequence consisting of only those 'I's for which the 'FunctorT'
1648 /// applied to 'T[I]' returns true.
1649 template <typename... T, typename FunctorT, size_t... Is>
1650 constexpr auto filterSequence(FunctorT F, std::index_sequence<Is...> Indices) {
1651  return filterSequenceHelper<T...>(F, Indices);
1652 }
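// Illustrative use (the predicate below is hypothetical, not from this
// header): keep only the indices of pack elements that are 'int'.
// FilterElement maps the non-matching index 1 to an empty sequence, which
// concat_sequences then drops.
struct IsIntPredicateExample {
  template <typename T> struct Func {
    static constexpr bool value = std::is_same_v<T, int>;
  };
};
static_assert(
    std::is_same_v<decltype(filterSequence<int, float, int>(
                       IsIntPredicateExample{}, std::index_sequence<0, 1, 2>{})),
                   std::index_sequence<0, 2>>);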
1653 
1654 struct IsScalarReduction {
1655  template <typename Reduction> struct Func {
1656  static constexpr bool value =
1657  (Reduction::dims == 0 && Reduction::num_elements == 1);
1658  };
1659 };
1660 
1661 struct IsArrayReduction {
1662  template <typename Reduction> struct Func {
1663  static constexpr bool value =
1664  (Reduction::dims == 1 && Reduction::num_elements >= 1);
1665  };
1666 };
1667 
1668 /// All scalar reductions are processed together; there is one loop of log2(N)
1669 /// steps, and each reduction uses its own storage.
1670 template <typename... Reductions, int Dims, typename... LocalAccT,
1671  typename... OutAccT, typename... ReducerT, typename... Ts,
1672  typename... BOPsT, size_t... Is>
1673 void reduCGFuncImplScalar(
1674  bool Pow2WG, bool IsOneWG, nd_item<Dims> NDIt,
1675  ReduTupleT<LocalAccT...> LocalAccsTuple,
1676  ReduTupleT<OutAccT...> OutAccsTuple, std::tuple<ReducerT...> &ReducersTuple,
1677  ReduTupleT<Ts...> IdentitiesTuple, ReduTupleT<BOPsT...> BOPsTuple,
1678  std::array<bool, sizeof...(Reductions)> InitToIdentityProps,
1679  std::index_sequence<Is...> ReduIndices) {
1680  size_t WGSize = NDIt.get_local_range().size();
1681  size_t LID = NDIt.get_local_linear_id();
1682 
1683  ((std::get<Is>(LocalAccsTuple)[LID] = std::get<Is>(ReducersTuple).MValue),
1684  ...);
1685 
1686  // For work-groups whose size is not a power of two, local accessors have an
1687  // additional element at index WGSize that is used by the tree-reduction
1688  // algorithm. Initialize those additional elements with identity values here.
1689  if (!Pow2WG)
1690  ((std::get<Is>(LocalAccsTuple)[WGSize] = std::get<Is>(IdentitiesTuple)),
1691  ...);
1692  NDIt.barrier();
1693 
1694  size_t PrevStep = WGSize;
1695  for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
1696  if (LID < CurStep) {
1697  // LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
1698  reduceReduLocalAccs(LID, LID + CurStep, LocalAccsTuple, BOPsTuple,
1699  ReduIndices);
1700  } else if (!Pow2WG && LID == CurStep && (PrevStep & 0x1)) {
1701  // LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
1702  reduceReduLocalAccs(WGSize, PrevStep - 1, LocalAccsTuple, BOPsTuple,
1703  ReduIndices);
1704  }
1705  NDIt.barrier();
1706  PrevStep = CurStep;
1707  }
1708 
1709  // Compute the partial sum/reduction for the work-group.
1710  if (LID == 0) {
1711  size_t GrID = NDIt.get_group_linear_id();
1712  writeReduSumsToOutAccs<Reductions...>(
1713  Pow2WG, IsOneWG, GrID, WGSize, OutAccsTuple, LocalAccsTuple, BOPsTuple,
1714  IdentitiesTuple, InitToIdentityProps, ReduIndices);
1715  }
1716 }
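// The loop above is a standard halving tree reduction. A minimal serial sketch
// (illustrative only, not used by the runtime) of the same scheme for a
// power-of-two element count:
template <typename ElemT, typename BinOp>
ElemT treeReduceSerialSketch(ElemT *Data, size_t N /* power of two */,
                             BinOp Op) {
  for (size_t Step = N >> 1; Step > 0; Step >>= 1)
    for (size_t I = 0; I < Step; ++I)        // plays the role of LID < CurStep
      Data[I] = Op(Data[I], Data[I + Step]); // work-items barrier here in SYCL
  return Data[0];
}
// For a non-power-of-two WGSize, the code above instead parks the leftover
// partial sum in the extra element at index WGSize whenever PrevStep is odd,
// and folds it in at the end in writeReduSumsToOutAccs.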
1717 
1718 /// Each array reduction is processed separately.
1719 template <typename Reduction, int Dims, typename LocalAccT, typename OutAccT,
1720  typename ReducerT, typename T, typename BOPT>
1721 void reduCGFuncImplArrayHelper(bool Pow2WG, bool IsOneWG, nd_item<Dims> NDIt,
1722  LocalAccT LocalReds, OutAccT Out,
1723  ReducerT &Reducer, T Identity, BOPT BOp,
1724  bool IsInitializeToIdentity) {
1725  size_t WGSize = NDIt.get_local_range().size();
1726  size_t LID = NDIt.get_local_linear_id();
1727 
1728  // If there are multiple values, reduce each separately
1729  // This prevents local memory from scaling with elements
1730  auto NElements = Reduction::num_elements;
1731  for (size_t E = 0; E < NElements; ++E) {
1732 
1733  // Copy the element to local memory to prepare it for tree-reduction.
1734  LocalReds[LID] = Reducer.getElement(E);
1735 
1736  doTreeReduction(WGSize, LID, Pow2WG, Identity, LocalReds, BOp,
1737  [&]() { NDIt.barrier(); });
1738 
1739  // Add the initial value of the user's variable to the final result.
1740  if (LID == 0) {
1741  if (IsOneWG) {
1742  LocalReds[0] =
1743  BOp(LocalReds[0], IsInitializeToIdentity ? Identity : Out[E]);
1744  }
1745 
1746  size_t GrID = NDIt.get_group_linear_id();
1747  Out[GrID * NElements + E] =
1748  Pow2WG ?
1749  // The partial sums for the work-group are stored in the
1750  // 0-th elements of the local accessors. Simply write
1751  // those sums to the output accessors.
1752  LocalReds[0]
1753  :
1754  // Each local accessor keeps two partial sums: in the
1755  // 0-th and WGSize-th elements. Combine them into final
1756  // partial sums and write to the output accessors.
1757  BOp(LocalReds[0], LocalReds[WGSize]);
1758  }
1759 
1760  // Ensure item 0 is finished with LocalReds before next iteration
1761  if (E != NElements - 1) {
1762  NDIt.barrier();
1763  }
1764  }
1765 }
1766 
1767 template <typename... Reductions, int Dims, typename... LocalAccT,
1768  typename... OutAccT, typename... ReducerT, typename... Ts,
1769  typename... BOPsT, size_t... Is>
1770 void reduCGFuncImplArray(
1771  bool Pow2WG, bool IsOneWG, nd_item<Dims> NDIt,
1772  ReduTupleT<LocalAccT...> LocalAccsTuple,
1773  ReduTupleT<OutAccT...> OutAccsTuple, std::tuple<ReducerT...> &ReducersTuple,
1774  ReduTupleT<Ts...> IdentitiesTuple, ReduTupleT<BOPsT...> BOPsTuple,
1775  std::array<bool, sizeof...(Reductions)> InitToIdentityProps,
1776  std::index_sequence<Is...>) {
1777  using ReductionPack = std::tuple<Reductions...>;
1778  (reduCGFuncImplArrayHelper<std::tuple_element_t<Is, ReductionPack>>(
1779  Pow2WG, IsOneWG, NDIt, std::get<Is>(LocalAccsTuple),
1780  std::get<Is>(OutAccsTuple), std::get<Is>(ReducersTuple),
1781  std::get<Is>(IdentitiesTuple), std::get<Is>(BOPsTuple),
1782  InitToIdentityProps[Is]),
1783  ...);
1784 }
1785 
1786 namespace reduction::main_krn {
1787 template <class KernelName, class Accessor> struct NDRangeMulti;
1788 } // namespace reduction::main_krn
1789 template <typename KernelName, typename KernelType, int Dims,
1790  typename PropertiesT, typename... Reductions, size_t... Is>
1791 void reduCGFuncMulti(handler &CGH, KernelType KernelFunc,
1792  const nd_range<Dims> &Range, PropertiesT Properties,
1793  std::tuple<Reductions...> &ReduTuple,
1794  std::index_sequence<Is...> ReduIndices) {
1795  size_t WGSize = Range.get_local_range().size();
1796  bool Pow2WG = (WGSize & (WGSize - 1)) == 0;
1797 
1798  // Split reduction sequence into two:
1799  // 1) Scalar reductions
1800  // 2) Array reductions
1801  // This allows us to reuse the existing implementation for scalar reductions
1802  // and introduce a new implementation for array reductions. Longer term it
1803  // may make sense to generalize the code such that each phase below applies
1804  // to all available reduction implementations -- today all reduction classes
1805  // use the same privatization-based approach, so this is unnecessary.
1806  IsScalarReduction ScalarPredicate;
1807  auto ScalarIs = filterSequence<Reductions...>(ScalarPredicate, ReduIndices);
1808 
1809  IsArrayReduction ArrayPredicate;
1810  auto ArrayIs = filterSequence<Reductions...>(ArrayPredicate, ReduIndices);
1811 
1812  // Create inputs using the global order of all reductions
1813  size_t LocalAccSize = WGSize + (Pow2WG ? 0 : 1);
1814 
1815  auto LocalAccsTuple =
1816  makeReduTupleT(local_accessor<typename Reductions::result_type, 1>{
1817  LocalAccSize, CGH}...);
1818 
1819  size_t NWorkGroups = Range.get_group_range().size();
1820  bool IsOneWG = NWorkGroups == 1;
1821 
1822  // The type of the Out "accessor" differs between the scenarios when there is
1823  // just one work-group and when there are multiple. Use this lambda to write
1824  // the code just once.
1825  auto Rest = [&](auto OutAccsTuple) {
1826  auto IdentitiesTuple =
1827  makeReduTupleT(std::get<Is>(ReduTuple).getIdentity()...);
1828  auto BOPsTuple =
1829  makeReduTupleT(std::get<Is>(ReduTuple).getBinaryOperation()...);
1830  std::array InitToIdentityProps{
1831  std::get<Is>(ReduTuple).initializeToIdentity()...};
1832 
1833  using Name = __sycl_reduction_kernel<reduction::MainKrn, KernelName,
1834  reduction::strategy::multi,
1835  decltype(OutAccsTuple)>;
1836 
1837  CGH.parallel_for<Name>(Range, Properties, [=](nd_item<Dims> NDIt) {
1838  // Pass all reductions to the user's lambda in the same order as supplied.
1839  // Each reducer initializes its own storage
1840  auto ReduIndices = std::index_sequence_for<Reductions...>();
1841  auto ReducersTuple = std::tuple{typename Reductions::reducer_type{
1842  std::get<Is>(IdentitiesTuple), std::get<Is>(BOPsTuple)}...};
1843  std::apply([&](auto &...Reducers) { KernelFunc(NDIt, Reducers...); },
1844  ReducersTuple);
1845 
1846  // Combine and write-back the results of any scalar reductions
1847  // reduCGFuncImplScalar<Reductions...>(NDIt, LocalAccsTuple, OutAccsTuple,
1848  // ReducersTuple, IdentitiesTuple, BOPsTuple, InitToIdentityProps,
1849  // ReduIndices);
1850  reduCGFuncImplScalar<Reductions...>(
1851  Pow2WG, IsOneWG, NDIt, LocalAccsTuple, OutAccsTuple, ReducersTuple,
1852  IdentitiesTuple, BOPsTuple, InitToIdentityProps, ScalarIs);
1853 
1854  // Combine and write-back the results of any array reductions
1855  // These are handled separately to minimize temporary storage and account
1856  // for the fact that each array reduction may have a different number of
1857  // elements to reduce (i.e. a different extent).
1858  reduCGFuncImplArray<Reductions...>(
1859  Pow2WG, IsOneWG, NDIt, LocalAccsTuple, OutAccsTuple, ReducersTuple,
1860  IdentitiesTuple, BOPsTuple, InitToIdentityProps, ArrayIs);
1861  });
1862  };
1863 
1864  if (IsOneWG)
1865  Rest(createReduOutAccs<true>(NWorkGroups, CGH, ReduTuple, ReduIndices));
1866  else
1867  Rest(createReduOutAccs<false>(NWorkGroups, CGH, ReduTuple, ReduIndices));
1868 }
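// User-level view of the "multi" strategy (a hedged sketch; 'Q', 'SumBuf' and
// 'MaxBuf' are hypothetical): two reductions passed to one parallel_for are
// lowered through reduCGFuncMulti above, one reducer argument per reduction.
//
//   Q.submit([&](handler &CGH) {
//     auto SumRed = reduction(SumBuf, CGH, plus<int>{});
//     auto MaxRed = reduction(MaxBuf, CGH, maximum<int>{});
//     CGH.parallel_for(nd_range<1>{{1024}, {128}}, SumRed, MaxRed,
//                      [=](nd_item<1> It, auto &Sum, auto &Max) {
//                        int V = static_cast<int>(It.get_global_linear_id());
//                        Sum += V;
//                        Max.combine(V);
//                      });
//   });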
1869 
1870 template <typename... Reductions, size_t... Is>
1871 void associateReduAccsWithHandler(handler &CGH,
1872  std::tuple<Reductions...> &ReduTuple,
1873  std::index_sequence<Is...>) {
1874  auto ProcessOne = [&CGH](auto Redu) {
1875  if constexpr (!decltype(Redu)::is_usm) {
1876  associateWithHandler(CGH, &Redu.getUserRedVar(), access::target::device);
1877  }
1878  };
1879  (ProcessOne(std::get<Is>(ReduTuple)), ...);
1880 }
1881 
1882 /// All scalar reductions are processed together; there is one loop of log2(N)
1883 /// steps, and each reduction uses its own storage.
1884 template <typename... Reductions, int Dims, typename... LocalAccT,
1885  typename... InAccT, typename... OutAccT, typename... Ts,
1886  typename... BOPsT, size_t... Is>
1887 void reduAuxCGFuncImplScalar(
1888  bool UniformPow2WG, bool IsOneWG, nd_item<Dims> NDIt, size_t LID,
1889  size_t GID, size_t NWorkItems, size_t WGSize,
1890  ReduTupleT<LocalAccT...> LocalAccsTuple, ReduTupleT<InAccT...> InAccsTuple,
1891  ReduTupleT<OutAccT...> OutAccsTuple, ReduTupleT<Ts...> IdentitiesTuple,
1892  ReduTupleT<BOPsT...> BOPsTuple,
1893  std::array<bool, sizeof...(Reductions)> InitToIdentityProps,
1894  std::index_sequence<Is...> ReduIndices) {
1895  // Normally, the local accessors are initialized with elements from the input
1896  // accessors. The exception is the case when (GID >= NWorkItems), which is
1897  // possible only when UniformPow2WG is false. In that case the elements of the
1898  // local accessors are initialized with the identity value, so they do not
1899  // affect the final partial sums computed by the tree-reduction
1900  // algorithm.
1901  ((std::get<Is>(LocalAccsTuple)[LID] = UniformPow2WG || GID < NWorkItems
1902  ? std::get<Is>(InAccsTuple)[GID]
1903  : std::get<Is>(IdentitiesTuple)),
1904  ...);
1905 
1906  // For work-groups whose size is not a power of two, local accessors have an
1907  // additional element at index WGSize that is used by the tree-reduction
1908  // algorithm. Initialize those additional elements with identity values here.
1909  if (!UniformPow2WG)
1910  ((std::get<Is>(LocalAccsTuple)[WGSize] = std::get<Is>(IdentitiesTuple)),
1911  ...);
1912 
1913  NDIt.barrier();
1914 
1915  size_t PrevStep = WGSize;
1916  for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
1917  if (LID < CurStep) {
1918  // LocalAcc[LID] = BOp(LocalAcc[LID], LocalAcc[LID + CurStep]);
1919  reduceReduLocalAccs(LID, LID + CurStep, LocalAccsTuple, BOPsTuple,
1920  ReduIndices);
1921  } else if (!UniformPow2WG && LID == CurStep && (PrevStep & 0x1)) {
1922  // LocalAcc[WGSize] = BOp(LocalAcc[WGSize], LocalAcc[PrevStep - 1]);
1923  reduceReduLocalAccs(WGSize, PrevStep - 1, LocalAccsTuple, BOPsTuple,
1924  ReduIndices);
1925  }
1926  NDIt.barrier();
1927  PrevStep = CurStep;
1928  }
1929 
1930  // Compute the partial sum/reduction for the work-group.
1931  if (LID == 0) {
1932  size_t GrID = NDIt.get_group_linear_id();
1933  writeReduSumsToOutAccs<Reductions...>(
1934  UniformPow2WG, IsOneWG, GrID, WGSize, OutAccsTuple, LocalAccsTuple,
1935  BOPsTuple, IdentitiesTuple, InitToIdentityProps, ReduIndices);
1936  }
1937 }
1938 
1939 template <typename Reduction, int Dims, typename LocalAccT, typename InAccT,
1940  typename OutAccT, typename T, typename BOPT>
1941 void reduAuxCGFuncImplArrayHelper(bool UniformPow2WG, bool IsOneWG,
1942  nd_item<Dims> NDIt, size_t LID, size_t GID,
1943  size_t NWorkItems, size_t WGSize,
1944  LocalAccT LocalReds, InAccT In, OutAccT Out,
1945  T Identity, BOPT BOp,
1946  bool IsInitializeToIdentity) {
1947 
1948  // If there are multiple values, reduce each separately
1949  // This prevents local memory from scaling with elements
1950  auto NElements = Reduction::num_elements;
1951  for (size_t E = 0; E < NElements; ++E) {
1952  // Normally, the local accessors are initialized with elements from the
1953  // input accessors. The exception is the case when (GID >= NWorkItems),
1954  // which is possible only when UniformPow2WG is false. In that case the
1955  // elements of the local accessors are initialized with the identity value,
1956  // so they do not affect the final partial sums computed by the
1957  // tree-reduction algorithm.
1958  if (UniformPow2WG || GID < NWorkItems) {
1959  LocalReds[LID] = In[GID * NElements + E];
1960  } else {
1961  LocalReds[LID] = Identity;
1962  }
1963 
1964  doTreeReduction(WGSize, LID, UniformPow2WG, Identity, LocalReds, BOp,
1965  [&]() { NDIt.barrier(); });
1966 
1967  // Add the initial value of the user's variable to the final result.
1968  if (LID == 0) {
1969  if (IsOneWG) {
1970  LocalReds[0] =
1971  BOp(LocalReds[0], IsInitializeToIdentity ? Identity : Out[E]);
1972  }
1973 
1974  size_t GrID = NDIt.get_group_linear_id();
1975  Out[GrID * NElements + E] =
1976  UniformPow2WG ?
1977  // The partial sums for the work-group are stored in
1978  // the 0-th elements of the local accessors. Simply write
1979  // those sums to the output accessors.
1980  LocalReds[0]
1981  :
1982  // Each local accessor keeps two partial sums: in
1983  // the 0-th and WGSize-th elements. Combine them into
1984  // final partial sums and write to the output accessors.
1985  BOp(LocalReds[0], LocalReds[WGSize]);
1986  }
1987 
1988  // Ensure item 0 is finished with LocalReds before next iteration
1989  if (E != NElements - 1) {
1990  NDIt.barrier();
1991  }
1992  }
1993 }
1994 
1995 template <typename... Reductions, int Dims, typename... LocalAccT,
1996  typename... InAccT, typename... OutAccT, typename... Ts,
1997  typename... BOPsT, size_t... Is>
1998 void reduAuxCGFuncImplArray(
1999  bool UniformPow2WG, bool IsOneWG, nd_item<Dims> NDIt, size_t LID,
2000  size_t GID, size_t NWorkItems, size_t WGSize,
2001  ReduTupleT<LocalAccT...> LocalAccsTuple, ReduTupleT<InAccT...> InAccsTuple,
2002  ReduTupleT<OutAccT...> OutAccsTuple, ReduTupleT<Ts...> IdentitiesTuple,
2003  ReduTupleT<BOPsT...> BOPsTuple,
2004  std::array<bool, sizeof...(Reductions)> InitToIdentityProps,
2005  std::index_sequence<Is...>) {
2006  using ReductionPack = std::tuple<Reductions...>;
2007  (reduAuxCGFuncImplArrayHelper<std::tuple_element_t<Is, ReductionPack>>(
2008  UniformPow2WG, IsOneWG, NDIt, LID, GID, NWorkItems, WGSize,
2009  std::get<Is>(LocalAccsTuple), std::get<Is>(InAccsTuple),
2010  std::get<Is>(OutAccsTuple), std::get<Is>(IdentitiesTuple),
2011  std::get<Is>(BOPsTuple), InitToIdentityProps[Is]),
2012  ...);
2013 }
2014 
2015 namespace reduction::aux_krn {
2016 template <class KernelName, class Predicate> struct Multi;
2017 } // namespace reduction::aux_krn
2018 template <typename KernelName, typename KernelType, typename... Reductions,
2019  size_t... Is>
2020 size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
2021  std::tuple<Reductions...> &ReduTuple,
2022  std::index_sequence<Is...> ReduIndices) {
2023  size_t NWorkGroups;
2024  size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups);
2025 
2026  bool Pow2WG = (WGSize & (WGSize - 1)) == 0;
2027  bool IsOneWG = NWorkGroups == 1;
2028  bool HasUniformWG = Pow2WG && (NWorkGroups * WGSize == NWorkItems);
2029 
2030  // Like reduCGFuncImpl, we also have to split out scalar and array reductions
2031  IsScalarReduction ScalarPredicate;
2032  auto ScalarIs = filterSequence<Reductions...>(ScalarPredicate, ReduIndices);
2033 
2034  IsArrayReduction ArrayPredicate;
2035  auto ArrayIs = filterSequence<Reductions...>(ArrayPredicate, ReduIndices);
2036 
2037  size_t LocalAccSize = WGSize + (HasUniformWG ? 0 : 1);
2038  auto LocalAccsTuple =
2039  makeReduTupleT(local_accessor<typename Reductions::result_type, 1>{
2040  LocalAccSize, CGH}...);
2041  auto InAccsTuple = makeReduTupleT(
2042  std::get<Is>(ReduTuple).getReadAccToPreviousPartialReds(CGH)...);
2043 
2044  auto IdentitiesTuple =
2045  makeReduTupleT(std::get<Is>(ReduTuple).getIdentity()...);
2046  auto BOPsTuple =
2047  makeReduTupleT(std::get<Is>(ReduTuple).getBinaryOperation()...);
2048  std::array InitToIdentityProps{
2049  std::get<Is>(ReduTuple).initializeToIdentity()...};
2050 
2051  // Predicate/OutAccsTuple below have different types depending on whether we
2052  // have just a single WG or multiple WGs. Use this lambda to avoid code
2053  // duplication.
2054  auto Rest = [&](auto Predicate, auto OutAccsTuple) {
2055  auto AccReduIndices = filterSequence<Reductions...>(Predicate, ReduIndices);
2056  associateReduAccsWithHandler(CGH, ReduTuple, AccReduIndices);
2057  using Name = __sycl_reduction_kernel<reduction::AuxKrn, KernelName,
2058  reduction::strategy::multi,
2059  decltype(Predicate)>;
2060  // TODO: Opportunity to parallelize across number of elements
2061  range<1> GlobalRange = {HasUniformWG ? NWorkItems : NWorkGroups * WGSize};
2062  nd_range<1> Range{GlobalRange, range<1>(WGSize)};
2063  CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
2064  size_t WGSize = NDIt.get_local_range().size();
2065  size_t LID = NDIt.get_local_linear_id();
2066  size_t GID = NDIt.get_global_linear_id();
2067 
2068  // Handle scalar and array reductions
2069  reduAuxCGFuncImplScalar<Reductions...>(
2070  HasUniformWG, IsOneWG, NDIt, LID, GID, NWorkItems, WGSize,
2071  LocalAccsTuple, InAccsTuple, OutAccsTuple, IdentitiesTuple, BOPsTuple,
2072  InitToIdentityProps, ScalarIs);
2073  reduAuxCGFuncImplArray<Reductions...>(
2074  HasUniformWG, IsOneWG, NDIt, LID, GID, NWorkItems, WGSize,
2075  LocalAccsTuple, InAccsTuple, OutAccsTuple, IdentitiesTuple, BOPsTuple,
2076  InitToIdentityProps, ArrayIs);
2077  });
2078  };
2079  if (NWorkGroups == 1)
2080  Rest(IsNonUsmReductionPredicate{},
2081  createReduOutAccs<true>(NWorkGroups, CGH, ReduTuple, ReduIndices));
2082  else
2083  Rest(EmptyReductionPredicate{},
2084  createReduOutAccs<false>(NWorkGroups, CGH, ReduTuple, ReduIndices));
2085 
2086  return NWorkGroups;
2087 }
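// Illustrative sizing (assuming reduComputeWGSize selects WGSize == MaxWGSize
// == 256 while enough work remains): reducing 100000 partial sums takes an aux
// pass with ceil(100000/256) == 391 work-groups, then a pass reducing those
// 391 values, and so on until a single work-group produces the final value and
// the caller's loop over the returned NWorkGroups terminates.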
2088 
2089 template <typename Reduction> size_t reduGetMemPerWorkItemHelper(Reduction &) {
2090  return sizeof(typename Reduction::result_type);
2091 }
2092 
2093 template <typename Reduction, typename... RestT>
2094 size_t reduGetMemPerWorkItemHelper(Reduction &, RestT... Rest) {
2095  return sizeof(typename Reduction::result_type) +
2096  reduGetMemPerWorkItemHelper(Rest...);
2097 }
2098 
2099 template <typename... ReductionT, size_t... Is>
2100 size_t reduGetMemPerWorkItem(std::tuple<ReductionT...> &ReduTuple,
2101  std::index_sequence<Is...>) {
2102  return reduGetMemPerWorkItemHelper(std::get<Is>(ReduTuple)...);
2103 }
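// For example, a tuple holding one reduction over 'int' and one over 'double'
// yields sizeof(int) + sizeof(double) bytes (12 on typical targets) of local
// memory needed per work-item.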
2104 
2105 /// Utility function: for the given tuple \p Tuple and index sequence, returns
2106 /// a new tuple consisting of only the elements indexed by the sequence.
2107 template <typename TupleT, std::size_t... Is>
2108 std::tuple<std::tuple_element_t<Is, TupleT>...>
2109 tuple_select_elements(TupleT Tuple, std::index_sequence<Is...>) {
2110  return {std::get<Is>(std::move(Tuple))...};
2111 }
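// Illustrative use: picking elements 0 and 2 out of a 3-tuple.
//   auto T = tuple_select_elements(std::tuple{1, 2.5, 'x'},
//                                  std::index_sequence<0, 2>{});
//   static_assert(std::is_same_v<decltype(T), std::tuple<int, char>>);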
2112 
2113 template <> struct NDRangeReduction<reduction::strategy::multi> {
2114  template <typename KernelName, int Dims, typename PropertiesT,
2115  typename... RestT>
2116  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
2117  nd_range<Dims> NDRange, PropertiesT &Properties,
2118  RestT... Rest) {
2119  std::tuple<RestT...> ArgsTuple(Rest...);
2120  constexpr size_t NumArgs = sizeof...(RestT);
2121  auto KernelFunc = std::get<NumArgs - 1>(ArgsTuple);
2122  auto ReduIndices = std::make_index_sequence<NumArgs - 1>();
2123  auto ReduTuple = detail::tuple_select_elements(ArgsTuple, ReduIndices);
2124 
2125  size_t LocalMemPerWorkItem = reduGetMemPerWorkItem(ReduTuple, ReduIndices);
2126  // TODO: currently the maximal work group size is determined for the given
2127  // queue/device, while it is safer to use queries to the kernel compiled
2128  // for the device.
2129  size_t MaxWGSize = reduGetMaxWGSize(Queue, LocalMemPerWorkItem);
2130  if (NDRange.get_local_range().size() > MaxWGSize)
2131  throw sycl::runtime_error("The implementation handling parallel_for with"
2132  " reduction requires work group size not bigger"
2133  " than " +
2134  std::to_string(MaxWGSize),
2135  PI_ERROR_INVALID_WORK_GROUP_SIZE);
2136 
2137  reduCGFuncMulti<KernelName>(CGH, KernelFunc, NDRange, Properties, ReduTuple,
2138  ReduIndices);
2139  reduction::finalizeHandler(CGH);
2140 
2141  size_t NWorkItems = NDRange.get_group_range().size();
2142  while (NWorkItems > 1) {
2143  reduction::withAuxHandler(CGH, [&](handler &AuxHandler) {
2144  NWorkItems = reduAuxCGFunc<KernelName, decltype(KernelFunc)>(
2145  AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
2146  });
2147  } // end while (NWorkItems > 1)
2148  }
2149 };
2150 
2151 // Auto-dispatch. Must be the last one.
2152 template <> struct NDRangeReduction<reduction::strategy::auto_select> {
2153  // Some readability aliases, to increase signal/noise ratio below.
2154  template <reduction::strategy Strategy>
2155  using Impl = NDRangeReduction<Strategy>;
2156  using Strat = reduction::strategy;
2157 
2158  template <typename KernelName, int Dims, typename PropertiesT,
2159  typename KernelType, typename Reduction>
2160  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
2161  nd_range<Dims> NDRange, PropertiesT &Properties,
2162  Reduction &Redu, KernelType &KernelFunc) {
2163  auto Delegate = [&](auto Impl) {
2164  Impl.template run<KernelName>(CGH, Queue, NDRange, Properties, Redu,
2165  KernelFunc);
2166  };
2167 
2168  if constexpr (Reduction::has_float64_atomics) {
2169  if (getDeviceFromHandler(CGH).has(aspect::atomic64))
2170  return Delegate(Impl<Strat::group_reduce_and_atomic_cross_wg>{});
2171 
2172  if constexpr (Reduction::has_fast_reduce)
2173  return Delegate(Impl<Strat::group_reduce_and_last_wg_detection>{});
2174  else
2175  return Delegate(Impl<Strat::basic>{});
2176  } else if constexpr (Reduction::has_fast_atomics) {
2177  if constexpr (Reduction::has_fast_reduce) {
2178  return Delegate(Impl<Strat::group_reduce_and_atomic_cross_wg>{});
2179  } else {
2180  return Delegate(Impl<Strat::local_atomic_and_atomic_cross_wg>{});
2181  }
2182  } else {
2183  if constexpr (Reduction::has_fast_reduce)
2184  return Delegate(Impl<Strat::group_reduce_and_last_wg_detection>{});
2185  else
2186  return Delegate(Impl<Strat::basic>{});
2187  }
2188 
2189  assert(false && "Must be unreachable!");
2190  }
2191  template <typename KernelName, int Dims, typename PropertiesT,
2192  typename... RestT>
2193  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
2194  nd_range<Dims> NDRange, PropertiesT &Properties,
2195  RestT... Rest) {
2196  return Impl<Strat::multi>::run<KernelName>(CGH, Queue, NDRange, Properties,
2197  Rest...);
2198  }
2199 };
2200 
2201 template <typename KernelName, reduction::strategy Strategy, int Dims,
2202  typename PropertiesT, typename... RestT>
2203 void reduction_parallel_for(handler &CGH, nd_range<Dims> NDRange,
2204  PropertiesT Properties, RestT... Rest) {
2205  NDRangeReduction<Strategy>::template run<KernelName>(CGH, CGH.MQueue, NDRange,
2206  Properties, Rest...);
2207 }
2208 
2209 __SYCL_EXPORT uint32_t
2210 reduGetMaxNumConcurrentWorkGroups(std::shared_ptr<queue_impl> Queue);
2211 
2212 template <typename KernelName, reduction::strategy Strategy, int Dims,
2213  typename PropertiesT, typename... RestT>
2214 void reduction_parallel_for(handler &CGH, range<Dims> Range,
2215  PropertiesT Properties, RestT... Rest) {
2216  std::tuple<RestT...> ArgsTuple(Rest...);
2217  constexpr size_t NumArgs = sizeof...(RestT);
2218  static_assert(NumArgs > 1, "No reduction!");
2219  auto KernelFunc = std::get<NumArgs - 1>(ArgsTuple);
2220  auto ReduIndices = std::make_index_sequence<NumArgs - 1>();
2221  auto ReduTuple = detail::tuple_select_elements(ArgsTuple, ReduIndices);
2222 
2223  // Before running the kernels, check that the device has enough local memory
2224  // to hold the local arrays required for the tree-reduction algorithm.
2225  size_t OneElemSize = [&]() {
2226  // Can't use outlined NumArgs due to a bug in gcc 8.4.
2227  if constexpr (sizeof...(RestT) == 2) {
2228  using Reduction = std::tuple_element_t<0, decltype(ReduTuple)>;
2229  constexpr bool IsTreeReduction =
2230  !Reduction::has_fast_reduce && !Reduction::has_fast_atomics;
2231  return IsTreeReduction ? sizeof(typename Reduction::result_type) : 0;
2232  } else {
2233  return reduGetMemPerWorkItem(ReduTuple, ReduIndices);
2234  }
2235  }();
2236 
2237  uint32_t NumConcurrentWorkGroups =
2238 #ifdef __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS
2239  __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS;
2240 #else
2241  reduGetMaxNumConcurrentWorkGroups(CGH.MQueue);
2242 #endif
2243 
2244  // TODO: currently the preferred work group size is determined for the given
2245  // queue/device, while it is safer to use queries to the kernel pre-compiled
2246  // for the device.
2247  size_t PrefWGSize = reduGetPreferredWGSize(CGH.MQueue, OneElemSize);
2248 
2249  size_t NWorkItems = Range.size();
2250  size_t WGSize = std::min(NWorkItems, PrefWGSize);
2251  size_t NWorkGroups = NWorkItems / WGSize;
2252  if (NWorkItems % WGSize)
2253  NWorkGroups++;
2254  size_t MaxNWorkGroups = NumConcurrentWorkGroups;
2255  NWorkGroups = std::min(NWorkGroups, MaxNWorkGroups);
2256  size_t NDRItems = NWorkGroups * WGSize;
2257  nd_range<1> NDRange{range<1>{NDRItems}, range<1>{WGSize}};
2258 
2259  size_t PerGroup = Range.size() / NWorkGroups;
2260  // Iterate through the index space by assigning contiguous chunks to each
2261  // work-group, then iterating through each chunk using a stride equal to the
2262  // work-group's local range, which gives much better performance than using a
2263  // stride equal to 1. For each index the original KernelFunc is called, and
2264  // the reduction value held in \p Reducer is accumulated across those calls
2265  // (a worked example follows the lambda below).
2266  auto UpdatedKernelFunc = [=](auto NDId, auto &...Reducers) {
2267  // Divide into contiguous chunks and assign each chunk to a Group
2268  // Rely on precomputed division to avoid repeating expensive operations
2269  // TODO: Some devices may prefer alternative remainder handling
2270  auto Group = NDId.get_group();
2271  size_t GroupId = Group.get_group_linear_id();
2272  size_t NumGroups = Group.get_group_linear_range();
2273  bool LastGroup = (GroupId == NumGroups - 1);
2274  size_t GroupStart = GroupId * PerGroup;
2275  size_t GroupEnd = LastGroup ? Range.size() : (GroupStart + PerGroup);
2276 
2277  // Loop over the contiguous chunk
2278  size_t Start = GroupStart + NDId.get_local_id(0);
2279  size_t End = GroupEnd;
2280  size_t Stride = NDId.get_local_range(0);
2281  auto GetDelinearized = [&](size_t I) {
2282  auto Id = getDelinearizedId(Range, I);
2283  if constexpr (std::is_invocable_v<decltype(KernelFunc), id<Dims>,
2284  decltype(Reducers)...>)
2285  return Id;
2286  else
2287  // SYCL doesn't provide parallel_for accepting offset in presence of
2288  // reductions, so use with_offset==false.
2289  return reduction::getDelinearizedItem(Range, Id);
2290  };
2291  for (size_t I = Start; I < End; I += Stride)
2292  KernelFunc(GetDelinearized(I), Reducers...);
2293  };
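  // Worked example of the chunking above (illustrative numbers): with
  // Range.size() == 10 and NWorkGroups == 2, PerGroup == 5; group 0 covers
  // indices [0, 5) and the last group covers [5, 10). With a local range of 2,
  // work-item 0 of group 0 visits indices 0, 2, 4 and work-item 1 visits 1, 3.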
2294  if constexpr (NumArgs == 2) {
2295  using Reduction = std::tuple_element_t<0, decltype(ReduTuple)>;
2296  auto &Redu = std::get<0>(ReduTuple);
2297 
2298  constexpr auto StrategyToUse = [&]() {
2299  if constexpr (Strategy != reduction::strategy::auto_select)
2300  return Strategy;
2301 
2302  // TODO: Both group_reduce_and_last_wg_detection and range_basic require
2303  // memory_order::acq_rel support that isn't guaranteed by the
2304  // specification. However, implementing a run-time check for that would
2305  // result in extra kernel compilation(s). We probably need to
2306  // investigate if the usage of kernel_bundles can mitigate that.
2307  if constexpr (Reduction::has_fast_reduce)
2308  return reduction::strategy::group_reduce_and_last_wg_detection;
2309  else if constexpr (Reduction::has_fast_atomics)
2310  return reduction::strategy::local_atomic_and_atomic_cross_wg;
2311  else
2312  return reduction::strategy::range_basic;
2313  }();
2314 
2315  reduction_parallel_for<KernelName, StrategyToUse>(CGH, NDRange, Properties,
2316  Redu, UpdatedKernelFunc);
2317  } else {
2318  return std::apply(
2319  [&](auto &...Reds) {
2320  return reduction_parallel_for<KernelName, Strategy>(
2321  CGH, NDRange, Properties, Reds..., UpdatedKernelFunc);
2322  },
2323  ReduTuple);
2324  }
2325 }
2326 } // namespace detail
2327 
2330 template <
2331  typename T, typename AllocatorT, typename BinaryOperation,
2332  typename = std::enable_if_t<has_known_identity<BinaryOperation, T>::value>>
2333 auto reduction(buffer<T, 1, AllocatorT> Var, handler &CGH, BinaryOperation,
2334  const property_list &PropList = {}) {
2335  bool InitializeToIdentity =
2336  PropList.has_property<property::reduction::initialize_to_identity>();
2337  return detail::make_reduction<BinaryOperation, 0, 1>(accessor{Var, CGH},
2338  InitializeToIdentity);
2339 }
2340 
2345 template <
2346  typename T, typename AllocatorT, typename BinaryOperation,
2347  typename = std::enable_if_t<!has_known_identity<BinaryOperation, T>::value>>
2348 detail::reduction_impl<
2349  T, BinaryOperation, 0, 1,
2350  accessor<T, 1, access::mode::read_write, access::target::device,
2351  access::placeholder::true_t,
2352  ext::oneapi::accessor_property_list<>>>
2353 reduction(buffer<T, 1, AllocatorT> Var, handler &CGH, BinaryOperation,
2354  const property_list &PropList = {}) {
2355  // TODO: implement reduction that works even when identity is not known.
2356  (void)PropList;
2357  throw runtime_error("Identity-less reductions with unknown identity are not "
2358  "supported yet.",
2359  PI_ERROR_INVALID_VALUE);
2360 }
2361 
2365 template <
2366  typename T, typename BinaryOperation,
2367  typename = std::enable_if_t<has_known_identity<BinaryOperation, T>::value>>
2368 auto reduction(T *Var, BinaryOperation, const property_list &PropList = {}) {
2369  bool InitializeToIdentity =
2370  PropList.has_property<property::reduction::initialize_to_identity>();
2371  return detail::make_reduction<BinaryOperation, 0, 1>(Var,
2372  InitializeToIdentity);
2373 }
2374 
2380 template <
2381  typename T, typename BinaryOperation,
2382  typename = std::enable_if_t<!has_known_identity<BinaryOperation, T>::value>>
2383 detail::reduction_impl<T, BinaryOperation, 0, 1, T *>
2384 reduction(T *, BinaryOperation, const property_list &PropList = {}) {
2385  // TODO: implement reduction that works even when identity is not known.
2386  (void)PropList;
2387  throw runtime_error("Identity-less reductions with unknown identity are not "
2388  "supported yet.",
2389  PI_ERROR_INVALID_VALUE);
2390 }
2391 
2395 template <typename T, typename AllocatorT, typename BinaryOperation>
2396 auto reduction(buffer<T, 1, AllocatorT> Var, handler &CGH, const T &Identity,
2397  BinaryOperation Combiner, const property_list &PropList = {}) {
2398  bool InitializeToIdentity =
2399  PropList.has_property<property::reduction::initialize_to_identity>();
2400  return detail::make_reduction<BinaryOperation, 0, 1>(
2401  accessor{Var, CGH}, Identity, Combiner, InitializeToIdentity);
2402 }
2403 
2407 template <typename T, typename BinaryOperation>
2408 auto reduction(T *Var, const T &Identity, BinaryOperation Combiner,
2409  const property_list &PropList = {}) {
2410  bool InitializeToIdentity =
2411  PropList.has_property<property::reduction::initialize_to_identity>();
2412  return detail::make_reduction<BinaryOperation, 0, 1>(Var, Identity, Combiner,
2413  InitializeToIdentity);
2414 }
2415 
2419 template <
2420  typename T, size_t Extent, typename BinaryOperation,
2421  typename = std::enable_if_t<Extent != dynamic_extent &&
2422  has_known_identity<BinaryOperation, T>::value>>
2423 auto reduction(span<T, Extent> Span, BinaryOperation,
2424  const property_list &PropList = {}) {
2425  bool InitializeToIdentity =
2426  PropList.has_property<property::reduction::initialize_to_identity>();
2427  return detail::make_reduction<BinaryOperation, 1, Extent>(
2428  Span.data(), InitializeToIdentity);
2429 }
2430 
2436 template <
2437  typename T, size_t Extent, typename BinaryOperation,
2438  typename = std::enable_if_t<Extent != dynamic_extent &&
2439  !has_known_identity<BinaryOperation, T>::value>>
2440 detail::reduction_impl<T, BinaryOperation, 1, Extent, T *>
2441 reduction(span<T, Extent>, BinaryOperation,
2442  const property_list &PropList = {}) {
2443  // TODO: implement reduction that works even when identity is not known.
2444  (void)PropList;
2445  throw runtime_error("Identity-less reductions with unknown identity are not "
2446  "supported yet.",
2447  PI_ERROR_INVALID_VALUE);
2448 }
2449 
2453 template <typename T, size_t Extent, typename BinaryOperation,
2454  typename = std::enable_if_t<Extent != dynamic_extent>>
2455 auto reduction(span<T, Extent> Span, const T &Identity,
2456  BinaryOperation Combiner, const property_list &PropList = {}) {
2457  bool InitializeToIdentity =
2458  PropList.has_property<property::reduction::initialize_to_identity>();
2459  return detail::make_reduction<BinaryOperation, 1, Extent>(
2460  Span.data(), Identity, Combiner, InitializeToIdentity);
2461 }
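// Usage sketch for the span overloads above (hedged; 'Q', 'N', 'Data' and the
// USM allocation are hypothetical): a fixed-extent span yields one "array
// reduction" with Extent elements, reduced per-element by the runtime.
//
//   int *Hist = malloc_shared<int>(8, Q);
//   std::memset(Hist, 0, 8 * sizeof(int));
//   Q.submit([&](handler &CGH) {
//     auto Red = reduction(span<int, 8>(Hist, 8), plus<int>{});
//     CGH.parallel_for(range<1>{N}, Red, [=](id<1> I, auto &H) {
//       H[Data[I] % 8] += 1;
//     });
//   });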
2462 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
2463 } // namespace sycl