DPC++ Runtime
Runtime libraries for oneAPI Data Parallel C++
reduction.hpp
Go to the documentation of this file.
1 //==---------------- reduction.hpp - SYCL reduction ------------*- C++ -*---==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 // ===--------------------------------------------------------------------=== //
8 
9 #pragma once
10 
11 #include <CL/sycl/accessor.hpp>
12 #include <CL/sycl/atomic.hpp>
13 #include <CL/sycl/detail/tuple.hpp>
14 #include <CL/sycl/handler.hpp>
15 #include <CL/sycl/kernel.hpp>
19 
20 #include <tuple>
21 
23 namespace sycl {
24 namespace ext {
25 namespace oneapi {
26 
27 namespace detail {
28 
33 
34 // This type trait is used to detect if the atomic operation BinaryOperation
35 // used with operands of the type T is available for using in reduction.
36 // The order in which the atomic operations are performed may be arbitrary and
37 // thus may cause different results from run to run even on the same elements
38 // and on same device. The macro SYCL_REDUCTION_DETERMINISTIC prohibits using
39 // atomic operations for reduction and helps to produce stable results.
40 // SYCL_REDUCTION_DETERMINISTIC is a short term solution, which perhaps become
41 // deprecated eventually and is replaced by a sycl property passed to reduction.
42 template <typename T, class BinaryOperation>
44 #ifdef SYCL_REDUCTION_DETERMINISTIC
45  bool_constant<false>;
46 #else
47  bool_constant<sycl::detail::is_sgeninteger<T>::value &&
48  sycl::detail::IsValidAtomicType<T>::value &&
49  (sycl::detail::IsPlus<T, BinaryOperation>::value ||
50  sycl::detail::IsMinimum<T, BinaryOperation>::value ||
51  sycl::detail::IsMaximum<T, BinaryOperation>::value ||
52  sycl::detail::IsBitOR<T, BinaryOperation>::value ||
53  sycl::detail::IsBitXOR<T, BinaryOperation>::value ||
55 #endif
56 
57 // This type trait is used to detect if the atomic operation BinaryOperation
58 // used with operands of the type T is available for using in reduction, in
59 // addition to the cases covered by "IsReduOptForFastAtomicFetch", if the device
60 // has the atomic64 aspect. This type trait should only be used if the device
61 // has the atomic64 aspect. Note that this type trait is currently a subset of
62 // IsReduOptForFastReduce. The macro SYCL_REDUCTION_DETERMINISTIC prohibits
63 // using the reduce_over_group() algorithm to produce stable results across same
64 // type devices.
65 // TODO 32 bit floating point atomics are eventually expected to be supported by
66 // the has_fast_atomics specialization. Once the reducer class is updated to
67 // replace the deprecated atomic class with atomic_ref, the (sizeof(T) == 4)
68 // case should be removed here and replaced in IsReduOptForFastAtomicFetch.
69 template <typename T, class BinaryOperation>
71 #ifdef SYCL_REDUCTION_DETERMINISTIC
73 #else
76  (sizeof(T) == 4 || sizeof(T) == 8)>;
77 #endif
78 
79 // This type trait is used to detect if the group algorithm reduce() used with
80 // operands of the type T and the operation BinaryOperation is available
81 // for using in reduction.
82 // The macro SYCL_REDUCTION_DETERMINISTIC prohibits using the reduce() algorithm
83 // to produce stable results across same type devices.
84 template <typename T, class BinaryOperation>
86 #ifdef SYCL_REDUCTION_DETERMINISTIC
88 #else
90  (sizeof(T) == 4 || sizeof(T) == 8)) ||
95 #endif
96 
97 // std::tuple seems to be a) too heavy and b) not copyable to device now
98 // Thus sycl::detail::tuple is used instead.
99 // Switching from sycl::device::tuple to std::tuple can be done by re-defining
100 // the ReduTupleT type and makeReduTupleT() function below.
101 template <typename... Ts> using ReduTupleT = sycl::detail::tuple<Ts...>;
102 template <typename... Ts> ReduTupleT<Ts...> makeReduTupleT(Ts... Elements) {
103  return sycl::detail::make_tuple(Elements...);
104 }
105 
106 __SYCL_EXPORT size_t reduGetMaxWGSize(std::shared_ptr<queue_impl> Queue,
107  size_t LocalMemBytesPerWorkItem);
108 __SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
109  size_t &NWorkGroups);
110 
115 template <typename T, class BinaryOperation, typename Subst = void>
116 class reducer {
117 public:
118  reducer(const T &Identity, BinaryOperation BOp)
119  : MValue(Identity), MIdentity(Identity), MBinaryOp(BOp) {}
120  void combine(const T &Partial) { MValue = MBinaryOp(MValue, Partial); }
121 
122  T getIdentity() const { return MIdentity; }
123 
124  template <typename _T = T>
128  combine(static_cast<T>(1));
129  }
130 
131  template <typename _T = T>
134  operator++(int) {
135  combine(static_cast<T>(1));
136  }
137 
138  template <typename _T = T>
140  operator+=(const _T &Partial) {
141  combine(Partial);
142  }
143 
144  template <typename _T = T>
146  operator*=(const _T &Partial) {
147  combine(Partial);
148  }
149 
150  template <typename _T = T>
152  operator|=(const _T &Partial) {
153  combine(Partial);
154  }
155 
156  template <typename _T = T>
158  operator^=(const _T &Partial) {
159  combine(Partial);
160  }
161 
162  template <typename _T = T>
164  operator&=(const _T &Partial) {
165  combine(Partial);
166  }
167 
169 
170 private:
171  const T MIdentity;
172  BinaryOperation MBinaryOp;
173 };
174 
190 //
191 // TODO: More types and ops can be added to here later.
192 template <typename T, class BinaryOperation>
193 class reducer<T, BinaryOperation,
194  enable_if_t<IsKnownIdentityOp<T, BinaryOperation>::value>> {
195 public:
196  reducer() : MValue(getIdentity()) {}
197  reducer(const T &, BinaryOperation) : MValue(getIdentity()) {}
198 
199  void combine(const T &Partial) {
200  BinaryOperation BOp;
201  MValue = BOp(MValue, Partial);
202  }
203 
204  template <typename _T = T, class _BinaryOperation = BinaryOperation>
208  }
209 
210  template <typename _T = T>
214  combine(static_cast<T>(1));
215  }
216 
217  template <typename _T = T>
220  operator++(int) {
221  combine(static_cast<T>(1));
222  }
223 
224  template <typename _T = T>
226  operator+=(const _T &Partial) {
227  combine(Partial);
228  }
229 
230  template <typename _T = T>
232  operator*=(const _T &Partial) {
233  combine(Partial);
234  }
235 
236  template <typename _T = T>
238  operator|=(const _T &Partial) {
239  combine(Partial);
240  }
241 
242  template <typename _T = T>
244  operator^=(const _T &Partial) {
245  combine(Partial);
246  }
247 
248  template <typename _T = T>
250  operator&=(const _T &Partial) {
251  combine(Partial);
252  }
253 
254 private:
255  template <access::address_space Space>
256  static constexpr memory_scope getMemoryScope() {
257  return Space == access::address_space::local_space
258  ? memory_scope::work_group
260  }
261 
262 public:
264  template <access::address_space Space = access::address_space::global_space,
265  typename _T = T, class _BinaryOperation = BinaryOperation>
267  (IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value ||
268  IsReduOptForAtomic64Add<T, _BinaryOperation>::value) &&
270  (Space == access::address_space::global_space ||
271  Space == access::address_space::local_space)>
272  atomic_combine(_T *ReduVarPtr) const {
274  *multi_ptr<T, Space>(ReduVarPtr))
275  .fetch_add(MValue);
276  }
277 
279  template <access::address_space Space = access::address_space::global_space,
280  typename _T = T, class _BinaryOperation = BinaryOperation>
284  (Space == access::address_space::global_space ||
285  Space == access::address_space::local_space)>
286  atomic_combine(_T *ReduVarPtr) const {
288  *multi_ptr<T, Space>(ReduVarPtr))
289  .fetch_or(MValue);
290  }
291 
293  template <access::address_space Space = access::address_space::global_space,
294  typename _T = T, class _BinaryOperation = BinaryOperation>
298  (Space == access::address_space::global_space ||
299  Space == access::address_space::local_space)>
300  atomic_combine(_T *ReduVarPtr) const {
302  *multi_ptr<T, Space>(ReduVarPtr))
303  .fetch_xor(MValue);
304  }
305 
307  template <access::address_space Space = access::address_space::global_space,
308  typename _T = T, class _BinaryOperation = BinaryOperation>
312  (Space == access::address_space::global_space ||
313  Space == access::address_space::local_space)>
314  atomic_combine(_T *ReduVarPtr) const {
316  *multi_ptr<T, Space>(ReduVarPtr))
317  .fetch_and(MValue);
318  }
319 
321  template <access::address_space Space = access::address_space::global_space,
322  typename _T = T, class _BinaryOperation = BinaryOperation>
326  (Space == access::address_space::global_space ||
327  Space == access::address_space::local_space)>
328  atomic_combine(_T *ReduVarPtr) const {
330  *multi_ptr<T, Space>(ReduVarPtr))
331  .fetch_min(MValue);
332  }
333 
335  template <access::address_space Space = access::address_space::global_space,
336  typename _T = T, class _BinaryOperation = BinaryOperation>
340  (Space == access::address_space::global_space ||
341  Space == access::address_space::local_space)>
342  atomic_combine(_T *ReduVarPtr) const {
344  *multi_ptr<T, Space>(ReduVarPtr))
345  .fetch_max(MValue);
346  }
347 
349 };
350 
354 
357 template <typename FirstT, typename... RestT> struct AreAllButLastReductions {
358  static constexpr bool value =
359  std::is_base_of<reduction_impl_base, FirstT>::value &&
360  AreAllButLastReductions<RestT...>::value;
361 };
362 
365 template <typename T> struct AreAllButLastReductions<T> {
366  static constexpr bool value = !std::is_base_of<reduction_impl_base, T>::value;
367 };
368 
371 template <typename T, class BinaryOperation, int Dims, bool IsUSM,
372  access::placeholder IsPlaceholder = access::placeholder::false_t>
373 class reduction_impl : private reduction_impl_base {
374 public:
376  using result_type = T;
377  using binary_operation = BinaryOperation;
378  using rw_accessor_type =
379  accessor<T, Dims, access::mode::read_write, access::target::device,
381  using dw_accessor_type =
382  accessor<T, Dims, access::mode::discard_write, access::target::device,
384  static constexpr int accessor_dim = Dims;
385  static constexpr int buffer_dim = (Dims == 0) ? 1 : Dims;
386 
387  static constexpr bool has_atomic_add_float64 =
389  static constexpr bool has_fast_atomics =
391  static constexpr bool has_fast_reduce =
393  static constexpr bool is_usm = IsUSM;
394  static constexpr bool is_placeholder =
395  (IsPlaceholder == access::placeholder::true_t);
396 
397  // Only scalar (i.e. 0-dim and 1-dim with 1 element) reductions supported now.
398  // TODO: suport (Dims > 1) accessors/reductions.
399  // TODO: support true 1-Dimensional accessors/reductions (size() > 1).
400  // (size() == 1) is checked in the constructor of reduction_impl.
401  static_assert(Dims <= 1,
402  "Multi-dimensional reductions are not supported yet.");
403 
405  template <typename _T = T, class _BinaryOperation = BinaryOperation>
407  _T> constexpr getIdentity() {
408  return reducer_type::getIdentity();
409  }
410 
412  template <typename _T = T, class _BinaryOperation = BinaryOperation>
415  return MIdentity;
416  }
417 
420  template <typename _T, typename AllocatorT,
421  std::enable_if_t<IsKnownIdentityOp<_T, BinaryOperation>::value> * =
422  nullptr>
425  : MRWAcc(std::make_shared<rw_accessor_type>(Buffer)),
426  MIdentity(getIdentity()), InitializeToIdentity(InitializeToIdentity) {
428  if (Buffer.size() != 1)
429  throw sycl::runtime_error("Reduction variable must be a scalar.",
431  }
432 
434  template <
435  typename _T = T,
438  : MRWAcc(new rw_accessor_type(Acc)), MIdentity(getIdentity()),
439  InitializeToIdentity(false) {
440  if (Acc.size() != 1)
441  throw sycl::runtime_error("Reduction variable must be a scalar.",
443  }
444 
446  template <
447  typename _T = T,
450  : MDWAcc(new dw_accessor_type(Acc)), MIdentity(getIdentity()),
451  InitializeToIdentity(true) {
452  if (Acc.size() != 1)
453  throw sycl::runtime_error("Reduction variable must be a scalar.",
455  }
456 
460  template <
461  typename _T, typename AllocatorT,
464  const T & /*Identity*/, BinaryOperation,
466  : MRWAcc(std::make_shared<rw_accessor_type>(Buffer)),
467  MIdentity(getIdentity()), InitializeToIdentity(InitializeToIdentity) {
469  if (Buffer.size() != 1)
470  throw sycl::runtime_error("Reduction variable must be a scalar.",
472  // For now the implementation ignores the identity value given by user
473  // when the implementation knows the identity.
474  // The SPEC could prohibit passing identity parameter to operations with
475  // known identity, but that could have some bad consequences too.
476  // For example, at some moment the implementation may NOT know the identity
477  // for COMPLEX-PLUS reduction. User may create a program that would pass
478  // COMPLEX value (0,0) as identity for PLUS reduction. At some later moment
479  // when the implementation starts handling COMPLEX-PLUS as known operation
480  // the existing user's program remains compilable and working correctly.
481  // I.e. with this constructor here, adding more reduction operations to the
482  // list of known operations does not break the existing programs.
483  }
484 
487  template <
488  typename _T = T,
490  reduction_impl(rw_accessor_type &Acc, const T & /*Identity*/, BinaryOperation)
491  : MRWAcc(new rw_accessor_type(Acc)), MIdentity(getIdentity()),
492  InitializeToIdentity(false) {
493  if (Acc.size() != 1)
494  throw sycl::runtime_error("Reduction variable must be a scalar.",
496  // For now the implementation ignores the identity value given by user
497  // when the implementation knows the identity.
498  // The SPEC could prohibit passing identity parameter to operations with
499  // known identity, but that could have some bad consequences too.
500  // For example, at some moment the implementation may NOT know the identity
501  // for COMPLEX-PLUS reduction. User may create a program that would pass
502  // COMPLEX value (0,0) as identity for PLUS reduction. At some later moment
503  // when the implementation starts handling COMPLEX-PLUS as known operation
504  // the existing user's program remains compilable and working correctly.
505  // I.e. with this constructor here, adding more reduction operations to the
506  // list of known operations does not break the existing programs.
507  }
508 
511  template <
512  typename _T = T,
514  reduction_impl(dw_accessor_type &Acc, const T & /*Identity*/, BinaryOperation)
515  : MDWAcc(new dw_accessor_type(Acc)), MIdentity(getIdentity()),
516  InitializeToIdentity(true) {
517  if (Acc.size() != 1)
518  throw sycl::runtime_error("Reduction variable must be a scalar.",
520  // For now the implementation ignores the identity value given by user
521  // when the implementation knows the identity.
522  // The SPEC could prohibit passing identity parameter to operations with
523  // known identity, but that could have some bad consequences too.
524  // For example, at some moment the implementation may NOT know the identity
525  // for COMPLEX-PLUS reduction. User may create a program that would pass
526  // COMPLEX value (0,0) as identity for PLUS reduction. At some later moment
527  // when the implementation starts handling COMPLEX-PLUS as known operation
528  // the existing user's program remains compilable and working correctly.
529  // I.e. with this constructor here, adding more reduction operations to the
530  // list of known operations does not break the existing programs.
531  }
532 
535  template <
536  typename _T, typename AllocatorT,
539  const T &Identity, BinaryOperation BOp,
541  : MRWAcc(std::make_shared<rw_accessor_type>(Buffer)), MIdentity(Identity),
542  MBinaryOp(BOp), InitializeToIdentity(InitializeToIdentity) {
544  if (Buffer.size() != 1)
545  throw sycl::runtime_error("Reduction variable must be a scalar.",
547  }
548 
550  template <
551  typename _T = T,
553  reduction_impl(rw_accessor_type &Acc, const T &Identity, BinaryOperation BOp)
554  : MRWAcc(new rw_accessor_type(Acc)), MIdentity(Identity), MBinaryOp(BOp),
555  InitializeToIdentity(false) {
556  if (Acc.size() != 1)
557  throw sycl::runtime_error("Reduction variable must be a scalar.",
559  }
560 
562  template <
563  typename _T = T,
565  reduction_impl(dw_accessor_type &Acc, const T &Identity, BinaryOperation BOp)
566  : MDWAcc(new dw_accessor_type(Acc)), MIdentity(Identity), MBinaryOp(BOp),
567  InitializeToIdentity(true) {
568  if (Acc.size() != 1)
569  throw sycl::runtime_error("Reduction variable must be a scalar.",
571  }
572 
577  template <
578  typename _T = T,
580  reduction_impl(T *VarPtr, bool InitializeToIdentity = false)
581  : MIdentity(getIdentity()), MUSMPointer(VarPtr),
583 
589  template <
590  typename _T = T,
592  reduction_impl(T *VarPtr, const T &Identity, BinaryOperation,
593  bool InitializeToIdentity = false)
594  : MIdentity(Identity), MUSMPointer(VarPtr),
596  // For now the implementation ignores the identity value given by user
597  // when the implementation knows the identity.
598  // The SPEC could prohibit passing identity parameter to operations with
599  // known identity, but that could have some bad consequences too.
600  // For example, at some moment the implementation may NOT know the identity
601  // for COMPLEX-PLUS reduction. User may create a program that would pass
602  // COMPLEX value (0,0) as identity for PLUS reduction. At some later moment
603  // when the implementation starts handling COMPLEX-PLUS as known operation
604  // the existing user's program remains compilable and working correctly.
605  // I.e. with this constructor here, adding more reduction operations to the
606  // list of known operations does not break the existing programs.
607  }
608 
613  template <
614  typename _T = T,
616  reduction_impl(T *VarPtr, const T &Identity, BinaryOperation BOp,
617  bool InitializeToIdentity = false)
618  : MIdentity(Identity), MUSMPointer(VarPtr), MBinaryOp(BOp),
620 
625 #ifndef __SYCL_DEVICE_ONLY__
626  if (MRWAcc)
627  CGH.associateWithHandler(MRWAcc.get(), access::target::device);
628  else if (MDWAcc)
629  CGH.associateWithHandler(MDWAcc.get(), access::target::device);
630 #else
631  (void)CGH;
632 #endif
633  }
634 
639  template <typename _T = result_type>
640  static accessor<_T, buffer_dim, access::mode::read_write,
641  access::target::local>
642  getReadWriteLocalAcc(size_t Size, handler &CGH) {
643  return {Size, CGH};
644  }
645 
648  CGH.addReduction(MOutBufPtr);
649  return {*MOutBufPtr, CGH};
650  }
651 
653  template <bool IsOneWG, bool _IsUSM = is_usm>
654  std::enable_if_t<IsOneWG && _IsUSM, result_type *>
656  return getUSMPointer();
657  }
658 
662  template <bool IsOneWG, bool _IsUSM = is_usm>
663  std::enable_if_t<IsOneWG && !_IsUSM, rw_accessor_type>
665  if (MRWAcc)
666  return *MRWAcc;
667  return getWriteMemForPartialReds<false>(1, CGH);
668  }
669 
672  template <bool IsOneWG>
673  std::enable_if_t<!IsOneWG, rw_accessor_type>
674  getWriteMemForPartialReds(size_t Size, handler &CGH) {
675  MOutBufPtr = std::make_shared<buffer<T, buffer_dim>>(range<1>(Size));
676  CGH.addReduction(MOutBufPtr);
677  return createHandlerWiredReadWriteAccessor(CGH, *MOutBufPtr);
678  }
679 
687  if (Size == 1 && MRWAcc != nullptr) {
689  return *MRWAcc;
690  }
691 
692  // Create a new output buffer and return an accessor to it.
693  MOutBufPtr = std::make_shared<buffer<T, buffer_dim>>(range<1>(Size));
694  CGH.addReduction(MOutBufPtr);
695  return createHandlerWiredReadWriteAccessor(CGH, *MOutBufPtr);
696  }
697 
702 
703  template <bool HasFastAtomics = (has_fast_atomics || has_atomic_add_float64)>
704  std::enable_if_t<HasFastAtomics, rw_accessor_type>
706  if (!is_usm && !initializeToIdentity())
707  return *MRWAcc;
708 
709  auto RWReduVal = std::make_shared<T>(MIdentity);
710  CGH.addReduction(RWReduVal);
711  MOutBufPtr = std::make_shared<buffer<T, 1>>(RWReduVal.get(), range<1>(1));
712  CGH.addReduction(MOutBufPtr);
713  return createHandlerWiredReadWriteAccessor(CGH, *MOutBufPtr);
714  }
715 
716  accessor<int, 1, access::mode::read_write, access::target::device,
717  access::placeholder::false_t>
719  auto CounterMem = std::make_shared<int>(0);
720  CGH.addReduction(CounterMem);
721  auto CounterBuf = std::make_shared<buffer<int, 1>>(CounterMem.get(), 1);
722  CGH.addReduction(CounterBuf);
723  return {*CounterBuf, CGH};
724  }
725 
726  bool hasUserDiscardWriteAccessor() { return MDWAcc != nullptr; }
727 
728  template <bool _IsUSM = IsUSM>
729  std::enable_if_t<!_IsUSM, rw_accessor_type &> getUserReadWriteAccessor() {
730  return *MRWAcc;
731  }
732 
733  template <bool _IsUSM = IsUSM>
734  std::enable_if_t<!_IsUSM, dw_accessor_type &> getUserDiscardWriteAccessor() {
735  return *MDWAcc;
736  }
737 
739  assert(is_usm && "Unexpected call of getUSMPointer().");
740  return MUSMPointer;
741  }
742 
743  static inline result_type *getOutPointer(const rw_accessor_type &OutAcc) {
744  return OutAcc.get_pointer().get();
745  }
746 
747  static inline result_type *getOutPointer(result_type *OutPtr) {
748  return OutPtr;
749  }
750 
752  BinaryOperation getBinaryOperation() const { return MBinaryOp; }
754 
755 private:
756  template <typename BufferT, access::placeholder IsPH = IsPlaceholder>
757  std::enable_if_t<IsPH == access::placeholder::false_t, rw_accessor_type>
758  createHandlerWiredReadWriteAccessor(handler &CGH, BufferT Buffer) {
759  return {Buffer, CGH};
760  }
761 
762  template <typename BufferT, access::placeholder IsPH = IsPlaceholder>
763  std::enable_if_t<IsPH == access::placeholder::true_t, rw_accessor_type>
764  createHandlerWiredReadWriteAccessor(handler &CGH, BufferT Buffer) {
765  rw_accessor_type Acc(Buffer);
766  CGH.require(Acc);
767  return Acc;
768  }
769 
772  const T MIdentity;
773 
775  std::shared_ptr<rw_accessor_type> MRWAcc;
776  std::shared_ptr<dw_accessor_type> MDWAcc;
777 
778  std::shared_ptr<buffer<T, buffer_dim>> MOutBufPtr;
779 
782  T *MUSMPointer = nullptr;
783 
784  BinaryOperation MBinaryOp;
785 
787 };
788 
792 template <typename T1, bool B1, bool B2, typename T2>
794 template <typename T1, bool B1, bool B2, typename T2>
796 
801 template <typename Name, typename Type, bool B1, bool B2, typename T3 = void>
804 };
805 template <typename Type, bool B1, bool B2, typename T3>
807  T3> {
809 };
810 template <typename Name, typename Type, bool B1, bool B2, typename T3>
813 };
814 template <typename Type, bool B1, bool B2, typename T3>
816  T3> {
818 };
819 
825 template <typename KernelFunc, int Dims, typename ReducerT>
826 void reductionLoop(const range<Dims> &Range, ReducerT &Reducer,
827  const nd_item<1> &NdId, KernelFunc &F) {
828  size_t Start = NdId.get_global_id(0);
829  size_t End = Range.size();
830  size_t Stride = NdId.get_global_range(0);
831  for (size_t I = Start; I < End; I += Stride)
832  F(sycl::detail::getDelinearizedId(Range, I), Reducer);
833 }
834 
835 template <typename KernelName, typename KernelType, int Dims, class Reduction>
836 std::enable_if_t<Reduction::has_fast_atomics>
837 reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const range<Dims> &Range,
838  const nd_range<1> &NDRange, Reduction &Redu) {
839  auto Out = Redu.getReadWriteAccessorToInitializedMem(CGH);
840  auto GroupSum = Reduction::getReadWriteLocalAcc(1, CGH);
841  using Name =
842  typename get_reduction_main_kernel_name_t<KernelName, KernelType,
843  Reduction::is_usm, false>::name;
844  CGH.parallel_for<Name>(NDRange, [=](nd_item<1> NDId) {
845  // Call user's functions. Reducer.MValue gets initialized there.
846  typename Reduction::reducer_type Reducer;
847  reductionLoop(Range, Reducer, NDId, KernelFunc);
848 
849  auto LID = NDId.get_local_id(0);
850  if (LID == 0)
851  GroupSum[0] = Reducer.getIdentity();
853  Reducer.template atomic_combine<access::address_space::local_space>(
854  &GroupSum[0]);
855 
857  if (LID == 0) {
858  Reducer.MValue = GroupSum[0];
859  Reducer.template atomic_combine(Reduction::getOutPointer(Out));
860  }
861  });
862 }
863 
864 template <typename KernelName, typename KernelType, int Dims, class Reduction>
865 std::enable_if_t<!Reduction::has_fast_atomics && Reduction::has_fast_reduce>
866 reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const range<Dims> &Range,
867  const nd_range<1> &NDRange, Reduction &Redu) {
868  size_t WGSize = NDRange.get_local_range().size();
869  size_t NWorkGroups = NDRange.get_group_range().size();
870 
871  bool IsUpdateOfUserVar = !Reduction::is_usm && !Redu.initializeToIdentity();
872  auto PartialSums = Redu.getWriteAccForPartialReds(NWorkGroups, CGH);
873  auto Out =
874  (NWorkGroups == 1) ? PartialSums : Redu.getWriteAccForPartialReds(1, CGH);
875  auto NWorkGroupsFinished =
876  Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH);
877  auto DoReducePartialSumsInLastWG =
878  Reduction::template getReadWriteLocalAcc<int>(1, CGH);
879 
880  using Name =
881  typename get_reduction_main_kernel_name_t<KernelName, KernelType,
882  Reduction::is_usm, false>::name;
883  CGH.parallel_for<Name>(NDRange, [=](nd_item<1> NDId) {
884  // Call user's functions. Reducer.MValue gets initialized there.
885  typename Reduction::reducer_type Reducer;
886  reductionLoop(Range, Reducer, NDId, KernelFunc);
887 
888  typename Reduction::binary_operation BOp;
889  auto Group = NDId.get_group();
890  Reducer.MValue = reduce_over_group(Group, Reducer.MValue, BOp);
891 
892  size_t LID = NDId.get_local_id(0);
893  if (LID == 0) {
894  if (NWorkGroups == 1 && IsUpdateOfUserVar)
895  Reducer.MValue = BOp(Reducer.MValue, *Reduction::getOutPointer(Out));
896  // if NWorkGroups == 1, then PartialsSum and Out point to same memory.
897  Reduction::getOutPointer(PartialSums)[NDId.get_group_linear_id()] =
898  Reducer.MValue;
899 
900  auto NFinished =
901  atomic_ref<int, memory_order::relaxed, memory_scope::device,
902  access::address_space::global_space>(
903  NWorkGroupsFinished[0]);
904  DoReducePartialSumsInLastWG[0] =
905  ++NFinished == NWorkGroups && NWorkGroups > 1;
906  }
907 
909  if (DoReducePartialSumsInLastWG[0]) {
910  auto LocalSum = Reducer.getIdentity();
911  for (size_t I = LID; I < NWorkGroups; I += WGSize)
912  LocalSum = BOp(LocalSum, PartialSums[I]);
913  Reducer.MValue = reduce_over_group(Group, LocalSum, BOp);
914 
915  if (LID == 0) {
916  if (IsUpdateOfUserVar)
917  Reducer.MValue = BOp(Reducer.MValue, *Reduction::getOutPointer(Out));
918  Reduction::getOutPointer(Out)[0] = Reducer.MValue;
919  }
920  }
921  });
922 }
923 
924 template <typename KernelName, typename KernelType, int Dims, class Reduction>
925 std::enable_if_t<!Reduction::has_fast_atomics && !Reduction::has_fast_reduce>
926 reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const range<Dims> &Range,
927  const nd_range<1> &NDRange, Reduction &Redu) {
928  size_t WGSize = NDRange.get_local_range().size();
929  size_t NWorkGroups = NDRange.get_group_range().size();
930 
931  bool IsUpdateOfUserVar = !Reduction::is_usm && !Redu.initializeToIdentity();
932  auto PartialSums = Redu.getWriteAccForPartialReds(NWorkGroups, CGH);
933  auto Out =
934  (NWorkGroups == 1) ? PartialSums : Redu.getWriteAccForPartialReds(1, CGH);
935  auto LocalReds = Reduction::getReadWriteLocalAcc(WGSize + 1, CGH);
936  auto NWorkGroupsFinished =
937  Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH);
938  auto DoReducePartialSumsInLastWG =
939  Reduction::template getReadWriteLocalAcc<int>(1, CGH);
940 
941  auto Identity = Redu.getIdentity();
942  auto BOp = Redu.getBinaryOperation();
943  using Name =
944  typename get_reduction_main_kernel_name_t<KernelName, KernelType,
945  Reduction::is_usm, false>::name;
946  CGH.parallel_for<Name>(NDRange, [=](nd_item<1> NDId) {
947  // Call user's functions. Reducer.MValue gets initialized there.
948  typename Reduction::reducer_type Reducer(Identity, BOp);
949  reductionLoop(Range, Reducer, NDId, KernelFunc);
950 
951  // Copy the element to local memory to prepare it for tree-reduction.
952  size_t LID = NDId.get_local_linear_id();
953  LocalReds[LID] = Reducer.MValue;
954  if (LID == 0)
955  LocalReds[WGSize] = Identity;
957 
958  // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0].
959  // LocalReds[WGSize] accumulates last/odd elements when the step
960  // of tree-reduction loop is not even.
961  size_t PrevStep = WGSize;
962  for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
963  if (LID < CurStep)
964  LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
965  else if (LID == CurStep && (PrevStep & 0x1))
966  LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
968  PrevStep = CurStep;
969  }
970 
971  if (LID == 0) {
972  auto V = BOp(LocalReds[0], LocalReds[WGSize]);
973  if (NWorkGroups == 1 && IsUpdateOfUserVar)
974  V = BOp(V, *Reduction::getOutPointer(Out));
975  // if NWorkGroups == 1, then PartialsSum and Out point to same memory.
976  Reduction::getOutPointer(PartialSums)[NDId.get_group_linear_id()] = V;
977 
978  auto NFinished =
979  atomic_ref<int, memory_order::relaxed, memory_scope::device,
980  access::address_space::global_space>(
981  NWorkGroupsFinished[0]);
982  DoReducePartialSumsInLastWG[0] =
983  ++NFinished == NWorkGroups && NWorkGroups > 1;
984  }
985 
987  if (DoReducePartialSumsInLastWG[0]) {
988  auto LocalSum = Identity;
989  for (size_t I = LID; I < NWorkGroups; I += WGSize)
990  LocalSum = BOp(LocalSum, Reduction::getOutPointer(PartialSums)[I]);
991 
992  LocalReds[LID] = LocalSum;
993  if (LID == 0)
994  LocalReds[WGSize] = Identity;
996 
997  size_t PrevStep = WGSize;
998  for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
999  if (LID < CurStep)
1000  LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
1001  else if (LID == CurStep && (PrevStep & 0x1))
1002  LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
1004  PrevStep = CurStep;
1005  }
1006  if (LID == 0) {
1007  auto V = BOp(LocalReds[0], LocalReds[WGSize]);
1008  if (IsUpdateOfUserVar)
1009  V = BOp(V, *Reduction::getOutPointer(Out));
1010  Reduction::getOutPointer(Out)[0] = V;
1011  }
1012  }
1013  });
1014 }
1015 
1016 template <typename KernelName, typename KernelType, int Dims, class Reduction>
1017 void reduCGFunc(handler &CGH, KernelType KernelFunc, const range<Dims> &Range,
1018  size_t MaxWGSize, uint32_t NumConcurrentWorkGroups,
1019  Reduction &Redu) {
1020  size_t NWorkItems = Range.size();
1021  size_t WGSize = std::min(NWorkItems, MaxWGSize);
1022  size_t NWorkGroups = NWorkItems / WGSize;
1023  if (NWorkItems % WGSize)
1024  NWorkGroups++;
1025  size_t MaxNWorkGroups = NumConcurrentWorkGroups;
1026  NWorkGroups = std::min(NWorkGroups, MaxNWorkGroups);
1027  size_t NDRItems = NWorkGroups * WGSize;
1028  nd_range<1> NDRange{range<1>{NDRItems}, range<1>{WGSize}};
1029 
1030  reduCGFuncImpl<KernelName>(CGH, KernelFunc, Range, NDRange, Redu);
1031 }
1032 
1042 template <typename KernelName, typename KernelType, int Dims, class Reduction,
1043  bool IsPow2WG>
1045 reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
1046  Reduction &, typename Reduction::rw_accessor_type Out) {
1047  using Name = typename get_reduction_main_kernel_name_t<
1048  KernelName, KernelType, Reduction::is_usm, IsPow2WG>::name;
1049  CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
1050  // Call user's function. Reducer.MValue gets initialized there.
1051  typename Reduction::reducer_type Reducer;
1052  KernelFunc(NDIt, Reducer);
1053 
1054  typename Reduction::binary_operation BOp;
1055  Reducer.MValue = ext::oneapi::reduce(NDIt.get_group(), Reducer.MValue, BOp);
1056  if (NDIt.get_local_linear_id() == 0)
1057  Reducer.atomic_combine(Reduction::getOutPointer(Out));
1058  });
1059 }
1060 
1069 template <typename KernelName, typename KernelType, int Dims, class Reduction,
1070  bool IsPow2WG>
1072 reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
1073  Reduction &, typename Reduction::rw_accessor_type Out) {
1074  size_t WGSize = Range.get_local_range().size();
1075 
1076  // Use local memory to reduce elements in work-groups into zero-th element.
1077  // If WGSize is not power of two, then WGSize+1 elements are allocated.
1078  // The additional last element is used to catch reduce elements that could
1079  // otherwise be lost in the tree-reduction algorithm used in the kernel.
1080  size_t NLocalElements = WGSize + (IsPow2WG ? 0 : 1);
1081  auto LocalReds = Reduction::getReadWriteLocalAcc(NLocalElements, CGH);
1082 
1083  using Name = typename get_reduction_main_kernel_name_t<
1084  KernelName, KernelType, Reduction::is_usm, IsPow2WG>::name;
1085  CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
1086  // Call user's functions. Reducer.MValue gets initialized there.
1087  typename Reduction::reducer_type Reducer;
1088  KernelFunc(NDIt, Reducer);
1089 
1090  size_t WGSize = NDIt.get_local_range().size();
1091  size_t LID = NDIt.get_local_linear_id();
1092 
1093  // Copy the element to local memory to prepare it for tree-reduction.
1094  LocalReds[LID] = Reducer.MValue;
1095  if (!IsPow2WG)
1096  LocalReds[WGSize] = Reducer.getIdentity();
1097  NDIt.barrier();
1098 
1099  // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0].
1100  // LocalReds[WGSize] accumulates last/odd elements when the step
1101  // of tree-reduction loop is not even.
1102  typename Reduction::binary_operation BOp;
1103  size_t PrevStep = WGSize;
1104  for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
1105  if (LID < CurStep)
1106  LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
1107  else if (!IsPow2WG && LID == CurStep && (PrevStep & 0x1))
1108  LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
1109  NDIt.barrier();
1110  PrevStep = CurStep;
1111  }
1112 
1113  if (LID == 0) {
1114  Reducer.MValue =
1115  IsPow2WG ? LocalReds[0] : BOp(LocalReds[0], LocalReds[WGSize]);
1116  Reducer.atomic_combine(Reduction::getOutPointer(Out));
1117  }
1118  });
1119 }
1120 
1121 template <typename KernelName, typename KernelType, int Dims, class Reduction>
1123 reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
1124  Reduction &Redu) {
1125 
1126  size_t WGSize = Range.get_local_range().size();
1127 
1128  // User's initialized read-write accessor is re-used here if
1129  // initialize_to_identity is not set (i.e. if user's variable is initialized).
1130  // Otherwise, a new buffer is initialized with identity value and a new
1131  // read-write accessor to that buffer is created. That is done because
1132  // atomic operations update some initialized memory.
1133  // User's USM pointer is not re-used even when initialize_to_identity is not
1134  // set because it does not worth the creation of an additional variant of
1135  // a user's kernel for that case.
1136  auto Out = Redu.getReadWriteAccessorToInitializedMem(CGH);
1137 
1138  // If the work group size is not pow of 2, then the kernel runs some
1139  // additional code and checks in it.
1140  // If the reduction has fast reduce then the kernel does not care if the work
1141  // group size is pow of 2 or not, assume true for such cases.
1142  bool IsPow2WG = Reduction::has_fast_reduce || ((WGSize & (WGSize - 1)) == 0);
1143  if (IsPow2WG)
1144  reduCGFuncImpl<KernelName, KernelType, Dims, Reduction, true>(
1145  CGH, KernelFunc, Range, Redu, Out);
1146  else
1147  reduCGFuncImpl<KernelName, KernelType, Dims, Reduction, false>(
1148  CGH, KernelFunc, Range, Redu, Out);
1149 }
1150 
1159 template <typename KernelName, typename KernelType, int Dims, class Reduction,
1160  bool IsPow2WG>
1162 reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
1163  Reduction &Redu, typename Reduction::rw_accessor_type Out) {
1164 
1165  size_t NWorkGroups = Range.get_group_range().size();
1166  bool IsUpdateOfUserVar =
1167  !Reduction::is_usm && !Redu.initializeToIdentity() && NWorkGroups == 1;
1168 
1169  using Name = typename get_reduction_main_kernel_name_t<
1170  KernelName, KernelType, Reduction::is_usm, IsPow2WG>::name;
1171  CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
1172  // Call user's functions. Reducer.MValue gets initialized there.
1173  typename Reduction::reducer_type Reducer;
1174  KernelFunc(NDIt, Reducer);
1175 
1176  // Compute the partial sum/reduction for the work-group.
1177  size_t WGID = NDIt.get_group_linear_id();
1178  typename Reduction::result_type PSum = Reducer.MValue;
1179  typename Reduction::binary_operation BOp;
1180  PSum = ext::oneapi::reduce(NDIt.get_group(), PSum, BOp);
1181  if (NDIt.get_local_linear_id() == 0) {
1182  if (IsUpdateOfUserVar)
1183  PSum = BOp(*(Reduction::getOutPointer(Out)), PSum);
1184  Reduction::getOutPointer(Out)[WGID] = PSum;
1185  }
1186  });
1187 }
1188 
1197 template <typename KernelName, typename KernelType, int Dims, class Reduction,
1198  bool IsPow2WG>
1200 reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
1201  Reduction &Redu, typename Reduction::rw_accessor_type Out) {
1202  size_t WGSize = Range.get_local_range().size();
1203  size_t NWorkGroups = Range.get_group_range().size();
1204 
1205  bool IsUpdateOfUserVar =
1206  !Reduction::is_usm && !Redu.initializeToIdentity() && NWorkGroups == 1;
1207 
1208  // Use local memory to reduce elements in work-groups into 0-th element.
1209  // If WGSize is not power of two, then WGSize+1 elements are allocated.
1210  // The additional last element is used to catch elements that could
1211  // otherwise be lost in the tree-reduction algorithm.
1212  size_t NumLocalElements = WGSize + (IsPow2WG ? 0 : 1);
1213  auto LocalReds = Reduction::getReadWriteLocalAcc(NumLocalElements, CGH);
1214  typename Reduction::result_type ReduIdentity = Redu.getIdentity();
1215  using Name = typename get_reduction_main_kernel_name_t<
1216  KernelName, KernelType, Reduction::is_usm, IsPow2WG>::name;
1217  auto BOp = Redu.getBinaryOperation();
1218  CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
1219  // Call user's functions. Reducer.MValue gets initialized there.
1220  typename Reduction::reducer_type Reducer(ReduIdentity, BOp);
1221  KernelFunc(NDIt, Reducer);
1222 
1223  size_t WGSize = NDIt.get_local_range().size();
1224  size_t LID = NDIt.get_local_linear_id();
1225  // Copy the element to local memory to prepare it for tree-reduction.
1226  LocalReds[LID] = Reducer.MValue;
1227  if (!IsPow2WG)
1228  LocalReds[WGSize] = ReduIdentity;
1229  NDIt.barrier();
1230 
1231  // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]
1232  // LocalReds[WGSize] accumulates last/odd elements when the step
1233  // of tree-reduction loop is not even.
1234  size_t PrevStep = WGSize;
1235  for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
1236  if (LID < CurStep)
1237  LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
1238  else if (!IsPow2WG && LID == CurStep && (PrevStep & 0x1))
1239  LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
1240  NDIt.barrier();
1241  PrevStep = CurStep;
1242  }
1243 
1244  // Compute the partial sum/reduction for the work-group.
1245  if (LID == 0) {
1246  size_t GrID = NDIt.get_group_linear_id();
1247  typename Reduction::result_type PSum =
1248  IsPow2WG ? LocalReds[0] : BOp(LocalReds[0], LocalReds[WGSize]);
1249  if (IsUpdateOfUserVar)
1250  PSum = BOp(*(Reduction::getOutPointer(Out)), PSum);
1251  Reduction::getOutPointer(Out)[GrID] = PSum;
1252  }
1253  });
1254 }
1255 
1256 template <typename KernelName, typename KernelType, int Dims, class Reduction>
1258 reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
1259  Reduction &Redu) {
1260  size_t WGSize = Range.get_local_range().size();
1261  size_t NWorkGroups = Range.get_group_range().size();
1262 
1263  // If the work group size is not pow of 2, then the kernel runs some
1264  // additional code and checks in it.
1265  // If the reduction has fast reduce then the kernel does not care if the work
1266  // group size is pow of 2 or not, assume true for such cases.
1267  bool IsPow2WG = Reduction::has_fast_reduce || ((WGSize & (WGSize - 1)) == 0);
1268 
1269  auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, CGH);
1270  if (IsPow2WG)
1271  reduCGFuncImpl<KernelName, KernelType, Dims, Reduction, true>(
1272  CGH, KernelFunc, Range, Redu, Out);
1273  else
1274  reduCGFuncImpl<KernelName, KernelType, Dims, Reduction, false>(
1275  CGH, KernelFunc, Range, Redu, Out);
1276 }
1277 
1286 template <typename KernelName, typename KernelType, bool UniformWG,
1287  class Reduction, typename InputT, typename OutputT>
1289 reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
1290  size_t WGSize, Reduction &Redu, InputT In, OutputT Out) {
1291  using Name = typename get_reduction_aux_kernel_name_t<
1292  KernelName, KernelType, Reduction::is_usm, UniformWG, OutputT>::name;
1293  bool IsUpdateOfUserVar =
1294  !Reduction::is_usm && !Redu.initializeToIdentity() && NWorkGroups == 1;
1295  range<1> GlobalRange = {UniformWG ? NWorkItems : NWorkGroups * WGSize};
1296  nd_range<1> Range{GlobalRange, range<1>(WGSize)};
1297  CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
1298  typename Reduction::binary_operation BOp;
1299  size_t WGID = NDIt.get_group_linear_id();
1300  size_t GID = NDIt.get_global_linear_id();
1301  typename Reduction::result_type PSum =
1302  (UniformWG || (GID < NWorkItems))
1303  ? In[GID]
1304  : Reduction::reducer_type::getIdentity();
1305  PSum = ext::oneapi::reduce(NDIt.get_group(), PSum, BOp);
1306  if (NDIt.get_local_linear_id() == 0) {
1307  if (IsUpdateOfUserVar)
1308  PSum = BOp(*(Reduction::getOutPointer(Out)), PSum);
1309  Reduction::getOutPointer(Out)[WGID] = PSum;
1310  }
1311  });
1312 }
1313 
1321 template <typename KernelName, typename KernelType, bool UniformPow2WG,
1322  class Reduction, typename InputT, typename OutputT>
1324 reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
1325  size_t WGSize, Reduction &Redu, InputT In, OutputT Out) {
1326  bool IsUpdateOfUserVar =
1327  !Reduction::is_usm && !Redu.initializeToIdentity() && NWorkGroups == 1;
1328 
1329  // Use local memory to reduce elements in work-groups into 0-th element.
1330  // If WGSize is not power of two, then WGSize+1 elements are allocated.
1331  // The additional last element is used to catch elements that could
1332  // otherwise be lost in the tree-reduction algorithm.
1333  size_t NumLocalElements = WGSize + (UniformPow2WG ? 0 : 1);
1334  auto LocalReds = Reduction::getReadWriteLocalAcc(NumLocalElements, CGH);
1335 
1336  auto ReduIdentity = Redu.getIdentity();
1337  auto BOp = Redu.getBinaryOperation();
1338  using Name = typename get_reduction_aux_kernel_name_t<
1339  KernelName, KernelType, Reduction::is_usm, UniformPow2WG, OutputT>::name;
1340  range<1> GlobalRange = {UniformPow2WG ? NWorkItems : NWorkGroups * WGSize};
1341  nd_range<1> Range{GlobalRange, range<1>(WGSize)};
1342  CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
1343  size_t WGSize = NDIt.get_local_range().size();
1344  size_t LID = NDIt.get_local_linear_id();
1345  size_t GID = NDIt.get_global_linear_id();
1346 
1347  // Copy the element to local memory to prepare it for tree-reduction.
1348  LocalReds[LID] =
1349  (UniformPow2WG || GID < NWorkItems) ? In[GID] : ReduIdentity;
1350  if (!UniformPow2WG)
1351  LocalReds[WGSize] = ReduIdentity;
1352  NDIt.barrier();
1353 
1354  // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]
1355  // LocalReds[WGSize] accumulates last/odd elements when the step
1356  // of tree-reduction loop is not even.
1357  size_t PrevStep = WGSize;
1358  for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
1359  if (LID < CurStep)
1360  LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
1361  else if (!UniformPow2WG && LID == CurStep && (PrevStep & 0x1))
1362  LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
1363  NDIt.barrier();
1364  PrevStep = CurStep;
1365  }
1366 
1367  // Compute the partial sum/reduction for the work-group.
1368  if (LID == 0) {
1369  size_t GrID = NDIt.get_group_linear_id();
1370  typename Reduction::result_type PSum =
1371  UniformPow2WG ? LocalReds[0] : BOp(LocalReds[0], LocalReds[WGSize]);
1372  if (IsUpdateOfUserVar)
1373  PSum = BOp(*(Reduction::getOutPointer(Out)), PSum);
1374  Reduction::getOutPointer(Out)[GrID] = PSum;
1375  }
1376  });
1377 }
1378 
1383 template <typename KernelName, typename KernelType, class Reduction>
1385 reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
1386  Reduction &Redu) {
1387 
1388  size_t NWorkGroups;
1389  size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups);
1390 
1391  // The last work-group may be not fully loaded with work, or the work group
1392  // size may be not power of two. Those two cases considered inefficient
1393  // as they require additional code and checks in the kernel.
1394  bool HasUniformWG = NWorkGroups * WGSize == NWorkItems;
1395  if (!Reduction::has_fast_reduce)
1396  HasUniformWG = HasUniformWG && (WGSize & (WGSize - 1)) == 0;
1397 
1398  // Get read accessor to the buffer that was used as output
1399  // in the previous kernel.
1400  auto In = Redu.getReadAccToPreviousPartialReds(CGH);
1401  auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, CGH);
1402  if (HasUniformWG)
1403  reduAuxCGFuncImpl<KernelName, KernelType, true>(
1404  CGH, NWorkItems, NWorkGroups, WGSize, Redu, In, Out);
1405  else
1406  reduAuxCGFuncImpl<KernelName, KernelType, false>(
1407  CGH, NWorkItems, NWorkGroups, WGSize, Redu, In, Out);
1408  return NWorkGroups;
1409 }
1410 
1411 // This method is used for implementation of parallel_for accepting 1 reduction.
1412 // TODO: remove this method when everything is switched to general algorithm
1413 // implementing arbitrary number of reductions in parallel_for().
1417 template <typename KernelName, class Reduction>
1418 std::enable_if_t<!Reduction::is_usm>
1419 reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu) {
1420  auto InAcc = Redu.getReadAccToPreviousPartialReds(CGH);
1421  Redu.associateWithHandler(CGH);
1422  if (Redu.hasUserDiscardWriteAccessor())
1423  CGH.copy(InAcc, Redu.getUserDiscardWriteAccessor());
1424  else
1425  CGH.copy(InAcc, Redu.getUserReadWriteAccessor());
1426 }
1427 
1428 // This method is used for implementation of parallel_for accepting 1 reduction.
1429 // TODO: remove this method when everything is switched to general algorithm
1430 // implementing arbitrary number of reductions in parallel_for().
1433 template <typename KernelName, class Reduction>
1434 std::enable_if_t<Reduction::is_usm>
1435 reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu) {
1436  auto InAcc = Redu.getReadAccToPreviousPartialReds(CGH);
1437  auto UserVarPtr = Redu.getUSMPointer();
1438  bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
1439  auto BOp = Redu.getBinaryOperation();
1440  CGH.single_task<KernelName>([=] {
1441  if (IsUpdateOfUserVar)
1442  *UserVarPtr = BOp(*UserVarPtr, *(InAcc.get_pointer()));
1443  else
1444  *UserVarPtr = *(InAcc.get_pointer());
1445  });
1446 }
1447 
1451 template <typename... Reductions, size_t... Is>
1452 auto createReduLocalAccs(size_t Size, handler &CGH,
1453  std::index_sequence<Is...>) {
1454  return makeReduTupleT(
1455  std::tuple_element_t<Is, std::tuple<Reductions...>>::getReadWriteLocalAcc(
1456  Size, CGH)...);
1457 }
1458 
1462 template <bool IsOneWG, typename... Reductions, size_t... Is>
1463 auto createReduOutAccs(size_t NWorkGroups, handler &CGH,
1464  std::tuple<Reductions...> &ReduTuple,
1465  std::index_sequence<Is...>) {
1466  return makeReduTupleT(
1467  std::get<Is>(ReduTuple).template getWriteMemForPartialReds<IsOneWG>(
1468  NWorkGroups, CGH)...);
1469 }
1470 
1474 template <typename... Reductions, size_t... Is>
1476  std::tuple<Reductions...> &ReduTuple,
1477  std::index_sequence<Is...>) {
1478  return makeReduTupleT(
1479  std::get<Is>(ReduTuple).getReadAccToPreviousPartialReds(CGH)...);
1480 }
1481 
1482 template <typename... Reductions, size_t... Is>
1483 ReduTupleT<typename Reductions::result_type...>
1484 getReduIdentities(std::tuple<Reductions...> &ReduTuple,
1485  std::index_sequence<Is...>) {
1486  return {std::get<Is>(ReduTuple).getIdentity()...};
1487 }
1488 
1489 template <typename... Reductions, size_t... Is>
1490 ReduTupleT<typename Reductions::binary_operation...>
1491 getReduBOPs(std::tuple<Reductions...> &ReduTuple, std::index_sequence<Is...>) {
1492  return {std::get<Is>(ReduTuple).getBinaryOperation()...};
1493 }
1494 
1495 template <typename... Reductions, size_t... Is>
1496 std::array<bool, sizeof...(Reductions)>
1497 getInitToIdentityProperties(std::tuple<Reductions...> &ReduTuple,
1498  std::index_sequence<Is...>) {
1499  return {std::get<Is>(ReduTuple).initializeToIdentity()...};
1500 }
1501 
1502 template <typename... Reductions, size_t... Is>
1503 std::tuple<typename Reductions::reducer_type...>
1506  std::index_sequence<Is...>) {
1507  return {typename Reductions::reducer_type{std::get<Is>(Identities),
1508  std::get<Is>(BOPsTuple)}...};
1509 }
1510 
1511 template <typename KernelType, int Dims, typename... ReducerT, size_t... Is>
1513  std::tuple<ReducerT...> &Reducers,
1514  std::index_sequence<Is...>) {
1515  KernelFunc(NDIt, std::get<Is>(Reducers)...);
1516 }
1517 
1518 template <bool Pow2WG, typename... LocalAccT, typename... ReducerT,
1519  typename... ResultT, size_t... Is>
1520 void initReduLocalAccs(size_t LID, size_t WGSize,
1521  ReduTupleT<LocalAccT...> LocalAccs,
1522  const std::tuple<ReducerT...> &Reducers,
1523  ReduTupleT<ResultT...> Identities,
1524  std::index_sequence<Is...>) {
1525  std::tie(std::get<Is>(LocalAccs)[LID]...) =
1526  std::make_tuple(std::get<Is>(Reducers).MValue...);
1527 
1528  // For work-groups, which size is not power of two, local accessors have
1529  // an additional element with index WGSize that is used by the tree-reduction
1530  // algorithm. Initialize those additional elements with identity values here.
1531  if (!Pow2WG)
1532  std::tie(std::get<Is>(LocalAccs)[WGSize]...) =
1533  std::make_tuple(std::get<Is>(Identities)...);
1534 }
1535 
1536 template <bool UniformPow2WG, typename... LocalAccT, typename... InputAccT,
1537  typename... ResultT, size_t... Is>
1538 void initReduLocalAccs(size_t LID, size_t GID, size_t NWorkItems, size_t WGSize,
1539  ReduTupleT<InputAccT...> LocalAccs,
1540  ReduTupleT<LocalAccT...> InputAccs,
1541  ReduTupleT<ResultT...> Identities,
1542  std::index_sequence<Is...>) {
1543  // Normally, the local accessors are initialized with elements from the input
1544  // accessors. The exception is the case when (GID >= NWorkItems), which
1545  // possible only when UniformPow2WG is false. For that case the elements of
1546  // local accessors are initialized with identity value, so they would not
1547  // give any impact into the final partial sums during the tree-reduction
1548  // algorithm work.
1549  if (UniformPow2WG || GID < NWorkItems)
1550  std::tie(std::get<Is>(LocalAccs)[LID]...) =
1551  std::make_tuple(std::get<Is>(InputAccs)[GID]...);
1552  else
1553  std::tie(std::get<Is>(LocalAccs)[LID]...) =
1554  std::make_tuple(std::get<Is>(Identities)...);
1555 
1556  // For work-groups, which size is not power of two, local accessors have
1557  // an additional element with index WGSize that is used by the tree-reduction
1558  // algorithm. Initialize those additional elements with identity values here.
1559  if (!UniformPow2WG)
1560  std::tie(std::get<Is>(LocalAccs)[WGSize]...) =
1561  std::make_tuple(std::get<Is>(Identities)...);
1562 }
1563 
1564 template <typename... LocalAccT, typename... BOPsT, size_t... Is>
1565 void reduceReduLocalAccs(size_t IndexA, size_t IndexB,
1566  ReduTupleT<LocalAccT...> LocalAccs,
1567  ReduTupleT<BOPsT...> BOPs,
1568  std::index_sequence<Is...>) {
1569  std::tie(std::get<Is>(LocalAccs)[IndexA]...) =
1570  std::make_tuple((std::get<Is>(BOPs)(std::get<Is>(LocalAccs)[IndexA],
1571  std::get<Is>(LocalAccs)[IndexB]))...);
1572 }
1573 
1574 template <bool Pow2WG, bool IsOneWG, typename... Reductions,
1575  typename... OutAccT, typename... LocalAccT, typename... BOPsT,
1576  typename... Ts, size_t... Is>
1578  size_t OutAccIndex, size_t WGSize, std::tuple<Reductions...> *,
1580  ReduTupleT<BOPsT...> BOPs, ReduTupleT<Ts...> IdentityVals,
1581  std::array<bool, sizeof...(Reductions)> IsInitializeToIdentity,
1582  std::index_sequence<Is...>) {
1583  // Add the initial value of user's variable to the final result.
1584  if (IsOneWG)
1585  std::tie(std::get<Is>(LocalAccs)[0]...) = std::make_tuple(std::get<Is>(
1586  BOPs)(std::get<Is>(LocalAccs)[0],
1587  IsInitializeToIdentity[Is]
1588  ? std::get<Is>(IdentityVals)
1589  : std::tuple_element_t<Is, std::tuple<Reductions...>>::
1590  getOutPointer(std::get<Is>(OutAccs))[0])...);
1591 
1592  if (Pow2WG) {
1593  // The partial sums for the work-group are stored in 0-th elements of local
1594  // accessors. Simply write those sums to output accessors.
1595  std::tie(std::tuple_element_t<Is, std::tuple<Reductions...>>::getOutPointer(
1596  std::get<Is>(OutAccs))[OutAccIndex]...) =
1597  std::make_tuple(std::get<Is>(LocalAccs)[0]...);
1598  } else {
1599  // Each of local accessors keeps two partial sums: in 0-th and WGsize-th
1600  // elements. Combine them into final partial sums and write to output
1601  // accessors.
1602  std::tie(std::tuple_element_t<Is, std::tuple<Reductions...>>::getOutPointer(
1603  std::get<Is>(OutAccs))[OutAccIndex]...) =
1604  std::make_tuple(std::get<Is>(BOPs)(std::get<Is>(LocalAccs)[0],
1605  std::get<Is>(LocalAccs)[WGSize])...);
1606  }
1607 }
1608 
1609 // Concatenate an empty sequence.
1610 constexpr std::index_sequence<> concat_sequences(std::index_sequence<>) {
1611  return {};
1612 }
1613 
1614 // Concatenate a sequence consisting of 1 element.
1615 template <size_t I>
1616 constexpr std::index_sequence<I> concat_sequences(std::index_sequence<I>) {
1617  return {};
1618 }
1619 
1620 // Concatenate two potentially empty sequences.
1621 template <size_t... Is, size_t... Js>
1622 constexpr std::index_sequence<Is..., Js...>
1623 concat_sequences(std::index_sequence<Is...>, std::index_sequence<Js...>) {
1624  return {};
1625 }
1626 
1627 // Concatenate more than 2 sequences.
1628 template <size_t... Is, size_t... Js, class... Rs>
1629 constexpr auto concat_sequences(std::index_sequence<Is...>,
1630  std::index_sequence<Js...>, Rs...) {
1631  return concat_sequences(std::index_sequence<Is..., Js...>{}, Rs{}...);
1632 }
1633 
1635  template <typename T> struct Func {
1636  static constexpr bool value = !std::remove_pointer_t<T>::is_usm;
1637  };
1638 };
1639 
1641  template <typename T> struct Func { static constexpr bool value = false; };
1642 };
1643 
1644 template <bool Cond, size_t I> struct FilterElement {
1645  using type =
1646  std::conditional_t<Cond, std::index_sequence<I>, std::index_sequence<>>;
1647 };
1648 
1654 template <typename... T, typename FunctorT, size_t... Is,
1655  std::enable_if_t<(sizeof...(Is) > 0), int> Z = 0>
1656 constexpr auto filterSequenceHelper(FunctorT, std::index_sequence<Is...>) {
1657  return concat_sequences(
1658  typename FilterElement<FunctorT::template Func<std::tuple_element_t<
1659  Is, std::tuple<T...>>>::value,
1660  Is>::type{}...);
1661 }
1662 template <typename... T, typename FunctorT, size_t... Is,
1663  std::enable_if_t<(sizeof...(Is) == 0), int> Z = 0>
1664 constexpr auto filterSequenceHelper(FunctorT, std::index_sequence<Is...>) {
1665  return std::index_sequence<>{};
1666 }
1667 
1671 template <typename... T, typename FunctorT, size_t... Is>
1672 constexpr auto filterSequence(FunctorT F, std::index_sequence<Is...> Indices) {
1673  return filterSequenceHelper<T...>(F, Indices);
1674 }
1675 
1676 template <typename KernelName, bool Pow2WG, bool IsOneWG, typename KernelType,
1677  int Dims, typename... Reductions, size_t... Is>
1678 void reduCGFuncImpl(handler &CGH, KernelType KernelFunc,
1679  const nd_range<Dims> &Range,
1680  std::tuple<Reductions...> &ReduTuple,
1681  std::index_sequence<Is...> ReduIndices) {
1682 
1683  size_t WGSize = Range.get_local_range().size();
1684  size_t LocalAccSize = WGSize + (Pow2WG ? 0 : 1);
1685  auto LocalAccsTuple =
1686  createReduLocalAccs<Reductions...>(LocalAccSize, CGH, ReduIndices);
1687 
1688  size_t NWorkGroups = IsOneWG ? 1 : Range.get_group_range().size();
1689  auto OutAccsTuple =
1690  createReduOutAccs<IsOneWG>(NWorkGroups, CGH, ReduTuple, ReduIndices);
1691  auto IdentitiesTuple = getReduIdentities(ReduTuple, ReduIndices);
1692  auto BOPsTuple = getReduBOPs(ReduTuple, ReduIndices);
1693  auto InitToIdentityProps =
1694  getInitToIdentityProperties(ReduTuple, ReduIndices);
1695 
1696  using Name = typename get_reduction_main_kernel_name_t<
1697  KernelName, KernelType, Pow2WG, IsOneWG, decltype(OutAccsTuple)>::name;
1698  CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
1699  auto ReduIndices = std::index_sequence_for<Reductions...>();
1700  auto ReducersTuple =
1701  createReducers<Reductions...>(IdentitiesTuple, BOPsTuple, ReduIndices);
1702  // The .MValue field of each of the elements in ReducersTuple
1703  // gets initialized in this call.
1704  callReduUserKernelFunc(KernelFunc, NDIt, ReducersTuple, ReduIndices);
1705 
1706  size_t WGSize = NDIt.get_local_range().size();
1707  size_t LID = NDIt.get_local_linear_id();
1708  initReduLocalAccs<Pow2WG>(LID, WGSize, LocalAccsTuple, ReducersTuple,
1709  IdentitiesTuple, ReduIndices);
1710  NDIt.barrier();
1711 
1712  size_t PrevStep = WGSize;
1713  for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
1714  if (LID < CurStep) {
1715  // LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
1716  reduceReduLocalAccs(LID, LID + CurStep, LocalAccsTuple, BOPsTuple,
1717  ReduIndices);
1718  } else if (!Pow2WG && LID == CurStep && (PrevStep & 0x1)) {
1719  // LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
1720  reduceReduLocalAccs(WGSize, PrevStep - 1, LocalAccsTuple, BOPsTuple,
1721  ReduIndices);
1722  }
1723  NDIt.barrier();
1724  PrevStep = CurStep;
1725  }
1726 
1727  // Compute the partial sum/reduction for the work-group.
1728  if (LID == 0) {
1729  size_t GrID = NDIt.get_group_linear_id();
1730  writeReduSumsToOutAccs<Pow2WG, IsOneWG>(
1731  GrID, WGSize, (std::tuple<Reductions...> *)nullptr, OutAccsTuple,
1732  LocalAccsTuple, BOPsTuple, IdentitiesTuple, InitToIdentityProps,
1733  ReduIndices);
1734  }
1735  });
1736 }
1737 
1738 template <typename KernelName, typename KernelType, int Dims,
1739  typename... Reductions, size_t... Is>
1740 void reduCGFunc(handler &CGH, KernelType KernelFunc,
1741  const nd_range<Dims> &Range,
1742  std::tuple<Reductions...> &ReduTuple,
1743  std::index_sequence<Is...> ReduIndices) {
1744  size_t WGSize = Range.get_local_range().size();
1745  size_t NWorkGroups = Range.get_group_range().size();
1746  bool Pow2WG = (WGSize & (WGSize - 1)) == 0;
1747  if (NWorkGroups == 1) {
1748  // TODO: consider having only one variant of kernel instead of two here.
1749  // Having two kernels, where one is just slighly more efficient than
1750  // another, and only for the purpose of running 1 work-group may be too
1751  // expensive.
1752  if (Pow2WG)
1753  reduCGFuncImpl<KernelName, true, true>(CGH, KernelFunc, Range, ReduTuple,
1754  ReduIndices);
1755  else
1756  reduCGFuncImpl<KernelName, false, true>(CGH, KernelFunc, Range, ReduTuple,
1757  ReduIndices);
1758  } else {
1759  if (Pow2WG)
1760  reduCGFuncImpl<KernelName, true, false>(CGH, KernelFunc, Range, ReduTuple,
1761  ReduIndices);
1762  else
1763  reduCGFuncImpl<KernelName, false, false>(CGH, KernelFunc, Range,
1764  ReduTuple, ReduIndices);
1765  }
1766 }
1767 
1768 // Specialization for devices with the atomic64 aspect, which guarantees 64 (and
1769 // temporarily 32) bit floating point support for atomic add.
1770 // TODO 32 bit floating point atomics are eventually expected to be supported by
1771 // the has_fast_atomics specialization. Corresponding changes to
1772 // IsReduOptForAtomic64Add, as prescribed in its documentation, should then also
1773 // be made.
1774 template <typename KernelName, typename KernelType, int Dims, class Reduction>
1775 std::enable_if_t<Reduction::has_atomic_add_float64>
1777  const nd_range<Dims> &Range, Reduction &,
1778  typename Reduction::rw_accessor_type Out) {
1779  using Name = typename get_reduction_main_kernel_name_t<
1780  KernelName, KernelType, Reduction::is_usm,
1781  Reduction::has_atomic_add_float64,
1782  typename Reduction::rw_accessor_type>::name;
1783  CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
1784  // Call user's function. Reducer.MValue gets initialized there.
1785  typename Reduction::reducer_type Reducer;
1786  KernelFunc(NDIt, Reducer);
1787 
1788  typename Reduction::binary_operation BOp;
1789  Reducer.MValue = reduce_over_group(NDIt.get_group(), Reducer.MValue, BOp);
1790  if (NDIt.get_local_linear_id() == 0) {
1791  Reducer.atomic_combine(Reduction::getOutPointer(Out));
1792  }
1793  });
1794 }
1795 
1796 // Specialization for devices with the atomic64 aspect, which guarantees 64 (and
1797 // temporarily 32) bit floating point support for atomic add.
1798 // TODO 32 bit floating point atomics are eventually expected to be supported by
1799 // the has_fast_atomics specialization. Corresponding changes to
1800 // IsReduOptForAtomic64Add, as prescribed in its documentation, should then also
1801 // be made.
1802 template <typename KernelName, typename KernelType, int Dims, class Reduction>
1805  const nd_range<Dims> &Range, Reduction &Redu) {
1806 
1807  auto Out = Redu.getReadWriteAccessorToInitializedMem(CGH);
1808  reduCGFuncImplAtomic64<KernelName, KernelType, Dims, Reduction>(
1809  CGH, KernelFunc, Range, Redu, Out);
1810 }
1811 
1813 
1814 template <typename ReductionT>
1815 void associateReduAccsWithHandlerHelper(handler &CGH, ReductionT &Redu) {
1816  Redu.associateWithHandler(CGH);
1817 }
1818 
1819 template <typename ReductionT, typename... RestT,
1820  enable_if_t<(sizeof...(RestT) > 0), int> Z = 0>
1821 void associateReduAccsWithHandlerHelper(handler &CGH, ReductionT &Redu,
1822  RestT &... Rest) {
1823  Redu.associateWithHandler(CGH);
1824  associateReduAccsWithHandlerHelper(CGH, Rest...);
1825 }
1826 
1827 template <typename... Reductions, size_t... Is>
1829  std::tuple<Reductions...> &ReduTuple,
1830  std::index_sequence<Is...>) {
1831  associateReduAccsWithHandlerHelper(CGH, std::get<Is>(ReduTuple)...);
1832 }
1833 
1834 template <typename KernelName, typename KernelType, bool UniformPow2WG,
1835  bool IsOneWG, typename... Reductions, size_t... Is>
1836 void reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
1837  size_t WGSize, std::tuple<Reductions...> &ReduTuple,
1838  std::index_sequence<Is...> ReduIndices) {
1839  // The last kernel DOES write to user's accessor passed to reduction.
1840  // Associate it with handler manually.
1843  Predicate;
1844  auto AccReduIndices = filterSequence<Reductions...>(Predicate, ReduIndices);
1845  associateReduAccsWithHandler(CGH, ReduTuple, AccReduIndices);
1846 
1847  size_t LocalAccSize = WGSize + (UniformPow2WG ? 0 : 1);
1848  auto LocalAccsTuple =
1849  createReduLocalAccs<Reductions...>(LocalAccSize, CGH, ReduIndices);
1850  auto InAccsTuple =
1851  getReadAccsToPreviousPartialReds(CGH, ReduTuple, ReduIndices);
1852  auto OutAccsTuple =
1853  createReduOutAccs<IsOneWG>(NWorkGroups, CGH, ReduTuple, ReduIndices);
1854  auto IdentitiesTuple = getReduIdentities(ReduTuple, ReduIndices);
1855  auto BOPsTuple = getReduBOPs(ReduTuple, ReduIndices);
1856  auto InitToIdentityProps =
1857  getInitToIdentityProperties(ReduTuple, ReduIndices);
1858 
1859  using Name =
1860  typename get_reduction_aux_kernel_name_t<KernelName, KernelType,
1861  UniformPow2WG, IsOneWG,
1862  decltype(OutAccsTuple)>::name;
1863  range<1> GlobalRange = {UniformPow2WG ? NWorkItems : NWorkGroups * WGSize};
1864  nd_range<1> Range{GlobalRange, range<1>(WGSize)};
1865  CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
1866  auto ReduIndices = std::index_sequence_for<Reductions...>();
1867  size_t WGSize = NDIt.get_local_range().size();
1868  size_t LID = NDIt.get_local_linear_id();
1869  size_t GID = NDIt.get_global_linear_id();
1870  initReduLocalAccs<UniformPow2WG>(LID, GID, NWorkItems, WGSize,
1871  LocalAccsTuple, InAccsTuple,
1872  IdentitiesTuple, ReduIndices);
1873  NDIt.barrier();
1874 
1875  size_t PrevStep = WGSize;
1876  for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
1877  if (LID < CurStep) {
1878  // LocalAcc[LID] = BOp(LocalAcc[LID], LocalAcc[LID + CurStep]);
1879  reduceReduLocalAccs(LID, LID + CurStep, LocalAccsTuple, BOPsTuple,
1880  ReduIndices);
1881  } else if (!UniformPow2WG && LID == CurStep && (PrevStep & 0x1)) {
1882  // LocalAcc[WGSize] = BOp(LocalAcc[WGSize], LocalAcc[PrevStep - 1]);
1883  reduceReduLocalAccs(WGSize, PrevStep - 1, LocalAccsTuple, BOPsTuple,
1884  ReduIndices);
1885  }
1886  NDIt.barrier();
1887  PrevStep = CurStep;
1888  }
1889 
1890  // Compute the partial sum/reduction for the work-group.
1891  if (LID == 0) {
1892  size_t GrID = NDIt.get_group_linear_id();
1893  writeReduSumsToOutAccs<UniformPow2WG, IsOneWG>(
1894  GrID, WGSize, (std::tuple<Reductions...> *)nullptr, OutAccsTuple,
1895  LocalAccsTuple, BOPsTuple, IdentitiesTuple, InitToIdentityProps,
1896  ReduIndices);
1897  }
1898  });
1899 }
1900 
1901 template <typename KernelName, typename KernelType, typename... Reductions,
1902  size_t... Is>
1903 size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
1904  std::tuple<Reductions...> &ReduTuple,
1905  std::index_sequence<Is...> ReduIndices) {
1906  size_t NWorkGroups;
1907  size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups);
1908 
1909  bool Pow2WG = (WGSize & (WGSize - 1)) == 0;
1910  bool HasUniformWG = Pow2WG && (NWorkGroups * WGSize == NWorkItems);
1911  if (NWorkGroups == 1) {
1912  if (HasUniformWG)
1913  reduAuxCGFuncImpl<KernelName, KernelType, true, true>(
1914  CGH, NWorkItems, NWorkGroups, WGSize, ReduTuple, ReduIndices);
1915  else
1916  reduAuxCGFuncImpl<KernelName, KernelType, false, true>(
1917  CGH, NWorkItems, NWorkGroups, WGSize, ReduTuple, ReduIndices);
1918  } else {
1919  if (HasUniformWG)
1920  reduAuxCGFuncImpl<KernelName, KernelType, true, false>(
1921  CGH, NWorkItems, NWorkGroups, WGSize, ReduTuple, ReduIndices);
1922  else
1923  reduAuxCGFuncImpl<KernelName, KernelType, false, false>(
1924  CGH, NWorkItems, NWorkGroups, WGSize, ReduTuple, ReduIndices);
1925  }
1926  return NWorkGroups;
1927 }
1928 
1929 inline void
1931  std::shared_ptr<detail::queue_impl>, bool) {}
1932 
1933 template <typename Reduction, typename... RestT>
1934 std::enable_if_t<Reduction::is_usm>
1935 reduSaveFinalResultToUserMemHelper(std::vector<event> &Events,
1936  std::shared_ptr<detail::queue_impl> Queue,
1937  bool IsHost, Reduction &, RestT... Rest) {
1938  // Reductions initialized with USM pointer currently do not require copying
1939  // because the last kernel write directly to USM memory.
1940  reduSaveFinalResultToUserMemHelper(Events, Queue, IsHost, Rest...);
1941 }
1942 
1943 template <typename Reduction, typename... RestT>
1944 std::enable_if_t<!Reduction::is_usm> reduSaveFinalResultToUserMemHelper(
1945  std::vector<event> &Events, std::shared_ptr<detail::queue_impl> Queue,
1946  bool IsHost, Reduction &Redu, RestT... Rest) {
1947  if (Redu.hasUserDiscardWriteAccessor()) {
1948  handler CopyHandler(Queue, IsHost);
1949  auto InAcc = Redu.getReadAccToPreviousPartialReds(CopyHandler);
1950  auto OutAcc = Redu.getUserDiscardWriteAccessor();
1951  Redu.associateWithHandler(CopyHandler);
1952  if (!Events.empty())
1953  CopyHandler.depends_on(Events.back());
1954  CopyHandler.copy(InAcc, OutAcc);
1955  event CopyEvent = CopyHandler.finalize();
1956  Events.push_back(CopyEvent);
1957  }
1958  reduSaveFinalResultToUserMemHelper(Events, Queue, IsHost, Rest...);
1959 }
1960 
1965 template <typename... Reduction, size_t... Is>
1966 std::shared_ptr<event>
1967 reduSaveFinalResultToUserMem(std::shared_ptr<detail::queue_impl> Queue,
1968  bool IsHost, std::tuple<Reduction...> &ReduTuple,
1969  std::index_sequence<Is...>) {
1970  std::vector<event> Events;
1971  reduSaveFinalResultToUserMemHelper(Events, Queue, IsHost,
1972  std::get<Is>(ReduTuple)...);
1973  if (!Events.empty())
1974  return std::make_shared<event>(Events.back());
1975  return std::shared_ptr<event>();
1976 }
1977 
1978 template <typename Reduction> size_t reduGetMemPerWorkItemHelper(Reduction &) {
1979  return sizeof(typename Reduction::result_type);
1980 }
1981 
1982 template <typename Reduction, typename... RestT>
1983 size_t reduGetMemPerWorkItemHelper(Reduction &, RestT... Rest) {
1984  return sizeof(typename Reduction::result_type) +
1985  reduGetMemPerWorkItemHelper(Rest...);
1986 }
1987 
1988 template <typename... ReductionT, size_t... Is>
1989 size_t reduGetMemPerWorkItem(std::tuple<ReductionT...> &ReduTuple,
1990  std::index_sequence<Is...>) {
1991  return reduGetMemPerWorkItemHelper(std::get<Is>(ReduTuple)...);
1992 }
1993 
1996 template <typename TupleT, std::size_t... Is>
1997 std::tuple<std::tuple_element_t<Is, TupleT>...>
1998 tuple_select_elements(TupleT Tuple, std::index_sequence<Is...>) {
1999  return {std::get<Is>(std::move(Tuple))...};
2000 }
2001 
2002 } // namespace detail
2003 
2008 template <typename T, class BinaryOperation, int Dims, access::mode AccMode,
2009  access::placeholder IsPH>
2010 detail::reduction_impl<T, BinaryOperation, Dims, false, IsPH>
2012  const T &Identity, BinaryOperation BOp) {
2013  return {Acc, Identity, BOp};
2014 }
2015 
2020 template <typename T, class BinaryOperation, int Dims, access::mode AccMode,
2021  access::placeholder IsPH>
2022 std::enable_if_t<detail::IsKnownIdentityOp<T, BinaryOperation>::value,
2023  detail::reduction_impl<T, BinaryOperation, Dims, false, IsPH>>
2025  BinaryOperation) {
2026  return {Acc};
2027 }
2028 
2033 template <typename T, class BinaryOperation>
2034 detail::reduction_impl<T, BinaryOperation, 1, true>
2035 reduction(T *VarPtr, const T &Identity, BinaryOperation BOp) {
2036  return {VarPtr, Identity, BOp};
2037 }
2038 
2044 template <typename T, class BinaryOperation>
2045 std::enable_if_t<detail::IsKnownIdentityOp<T, BinaryOperation>::value,
2046  detail::reduction_impl<T, BinaryOperation, 1, true>>
2047 reduction(T *VarPtr, BinaryOperation) {
2048  return {VarPtr};
2049 }
2050 
2051 // ---- has_known_identity
2052 template <typename BinaryOperation, typename AccumulatorT>
2054  : sycl::has_known_identity<BinaryOperation, AccumulatorT> {};
2055 
2056 template <typename BinaryOperation, typename AccumulatorT>
2059 
2060 // ---- known_identity
2061 template <typename BinaryOperation, typename AccumulatorT>
2062 struct known_identity : sycl::known_identity<BinaryOperation, AccumulatorT> {};
2063 
2064 template <typename BinaryOperation, typename AccumulatorT>
2067 
2068 } // namespace oneapi
2069 } // namespace ext
2070 
2071 #ifdef __SYCL_INTERNAL_API
2072 namespace __SYCL2020_DEPRECATED("use 'ext::oneapi' instead") ONEAPI {
2073  using namespace ext::oneapi;
2074  namespace detail {
2076  __SYCL_EXPORT size_t reduGetMaxWGSize(std::shared_ptr<queue_impl> Queue,
2077  size_t LocalMemBytesPerWorkItem);
2078  __SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
2079  size_t &NWorkGroups);
2080  } // namespace detail
2081 } // namespace ONEAPI
2082 #endif // __SYCL_INTERNAL_API
2083 } // namespace sycl
2084 } // __SYCL_INLINE_NAMESPACE(cl)
cl::sycl::detail::associateWithHandler
void associateWithHandler(handler &, AccessorBaseHost *, access::target)
Definition: handler_proxy.cpp:17
cl::sycl::ext::oneapi::reduction
std::enable_if_t< detail::IsKnownIdentityOp< T, BinaryOperation >::value, detail::reduction_impl< T, BinaryOperation, 1, true > > reduction(T *VarPtr, BinaryOperation)
Creates and returns an object implementing the reduction functionality.
Definition: reduction.hpp:2047
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, enable_if_t< IsKnownIdentityOp< T, BinaryOperation >::value > >::operator|=
enable_if_t< sycl::detail::IsBitOR< _T, BinaryOperation >::value > operator|=(const _T &Partial)
Definition: reduction.hpp:238
cl::sycl::nd_range
Defines the iteration domain of both the work-groups and the overall dispatch.
Definition: nd_range.hpp:23
cl::sycl::ext::oneapi::detail::reducer::MValue
T MValue
Definition: reduction.hpp:168
cl::sycl::ext::oneapi::detail::reduction_impl::initializeToIdentity
bool initializeToIdentity() const
Definition: reduction.hpp:753
cl::sycl::ext::oneapi::detail::reduction_impl::reduction_impl
reduction_impl(T *VarPtr, const T &Identity, BinaryOperation, bool InitializeToIdentity=false)
Constructs reduction_impl when the identity value is statically known, and user still passed the iden...
Definition: reduction.hpp:592
cl::sycl::detail::known_identity_impl
Definition: known_identity.hpp:120
cl::sycl::detail::IsBitOR
bool_constant< std::is_same< BinaryOperation, sycl::bit_or< T > >::value||std::is_same< BinaryOperation, sycl::bit_or< void > >::value > IsBitOR
Definition: known_identity.hpp:48
cl::sycl::detail::getDelinearizedId
id< 1 > getDelinearizedId(const range< 1 > &, size_t Index)
Definition: id.hpp:318
type
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, enable_if_t< IsKnownIdentityOp< T, BinaryOperation >::value > >::atomic_combine
enable_if_t< std::is_same< typename remove_AS< _T >::type, T >::value &&IsReduOptForFastAtomicFetch< T, _BinaryOperation >::value &&sycl::detail::IsMaximum< T, _BinaryOperation >::value &&(Space==access::address_space::global_space||Space==access::address_space::local_space)> atomic_combine(_T *ReduVarPtr) const
Atomic MAX operation: *ReduVarPtr = sycl::maximum(*ReduVarPtr, MValue);.
Definition: reduction.hpp:342
cl::sycl::ext::oneapi::detail::reducer
Class that is used to represent objects that are passed to user's lambda functions and representing u...
Definition: reduction.hpp:116
cl::sycl::ext::oneapi::detail::reduGetMemPerWorkItemHelper
size_t reduGetMemPerWorkItemHelper(Reduction &, RestT... Rest)
Definition: reduction.hpp:1983
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, enable_if_t< IsKnownIdentityOp< T, BinaryOperation >::value > >::atomic_combine
enable_if_t< std::is_same< typename remove_AS< _T >::type, T >::value &&IsReduOptForFastAtomicFetch< T, _BinaryOperation >::value &&sycl::detail::IsMinimum< T, _BinaryOperation >::value &&(Space==access::address_space::global_space||Space==access::address_space::local_space)> atomic_combine(_T *ReduVarPtr) const
Atomic MIN operation: *ReduVarPtr = sycl::minimum(*ReduVarPtr, MValue);.
Definition: reduction.hpp:328
cl::sycl::info::device
device
Definition: info_desc.hpp:49
cl::sycl::detail::make_tuple
constexpr tuple< Ts... > make_tuple(Ts... Args)
Definition: tuple.hpp:36
cl::sycl::ext::oneapi::detail::reducer::operator++
enable_if_t< sycl::detail::IsPlus< _T, BinaryOperation >::value &&sycl::detail::is_geninteger< _T >::value > operator++(int)
Definition: reduction.hpp:134
cl::sycl::ext::oneapi::detail::filterSequenceHelper
constexpr auto filterSequenceHelper(FunctorT, std::index_sequence< Is... >)
For each index 'I' from the given indices pack 'Is' this function initially creates a number of short...
Definition: reduction.hpp:1656
cl::sycl::ext::oneapi::detail::reducer::operator|=
enable_if_t< sycl::detail::IsBitOR< _T, BinaryOperation >::value > operator|=(const _T &Partial)
Definition: reduction.hpp:152
__SYCL2020_DEPRECATED
#define __SYCL2020_DEPRECATED(message)
Definition: defines_elementary.hpp:56
cl::sycl::ext::oneapi::known_identity
Definition: reduction.hpp:2062
cl::sycl::ext::oneapi::detail::get_reduction_main_kernel_name_t
Helper structs to get additional kernel name types based on given Name and additional template parame...
Definition: reduction.hpp:802
tuple.hpp
cl::sycl::access::placeholder
placeholder
Definition: access.hpp:43
cl::sycl::detail::IsPlus
bool_constant< std::is_same< BinaryOperation, sycl::plus< T > >::value||std::is_same< BinaryOperation, sycl::plus< void > >::value > IsPlus
Definition: known_identity.hpp:23
cl::sycl::ext::oneapi::detail::FilterElement::type
std::conditional_t< Cond, std::index_sequence< I >, std::index_sequence<> > type
Definition: reduction.hpp:1646
cl::sycl::ext::oneapi::detail::reduSaveFinalResultToUserMem
std::enable_if_t<!Reduction::is_usm > reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu)
Copies the final reduction result kept in read-write accessor to user's accessor.
Definition: reduction.hpp:1419
cl::sycl::ext::oneapi::known_identity_v
__SYCL_INLINE_CONSTEXPR AccumulatorT known_identity_v
Definition: reduction.hpp:2065
cl::sycl::ext::oneapi::detail::__sycl_reduction_main_kernel
These are the forward declaration for the classes that help to create names for additional kernels.
Definition: reduction.hpp:793
cl::sycl::detail::workGroupBarrier
static void workGroupBarrier()
Definition: group.hpp:33
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, enable_if_t< IsKnownIdentityOp< T, BinaryOperation >::value > >::atomic_combine
enable_if_t< std::is_same< typename remove_AS< _T >::type, T >::value &&IsReduOptForFastAtomicFetch< T, _BinaryOperation >::value &&sycl::detail::IsBitXOR< T, _BinaryOperation >::value &&(Space==access::address_space::global_space||Space==access::address_space::local_space)> atomic_combine(_T *ReduVarPtr) const
Atomic BITWISE XOR operation: *ReduVarPtr ^= MValue;.
Definition: reduction.hpp:300
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, enable_if_t< IsKnownIdentityOp< T, BinaryOperation >::value > >::operator++
enable_if_t< sycl::detail::IsPlus< _T, BinaryOperation >::value &&sycl::detail::is_geninteger< _T >::value > operator++(int)
Definition: reduction.hpp:220
cl::sycl::detail::IsKnownIdentityOp
bool_constant< IsZeroIdentityOp< T, BinaryOperation >::value||IsOneIdentityOp< T, BinaryOperation >::value||IsOnesIdentityOp< T, BinaryOperation >::value||IsMinimumIdentityOp< T, BinaryOperation >::value||IsMaximumIdentityOp< T, BinaryOperation >::value||IsFalseIdentityOp< T, BinaryOperation >::value||IsTrueIdentityOp< T, BinaryOperation >::value > IsKnownIdentityOp
Definition: known_identity.hpp:112
cl::sycl::reduce_over_group
detail::enable_if_t<(is_group_v< std::decay_t< Group >> &&detail::is_scalar_arithmetic< T >::value &&detail::is_native_op< T, BinaryOperation >::value), T > reduce_over_group(Group, T x, BinaryOperation binary_op)
Definition: group_algorithm.hpp:127
cl::sycl::ext::oneapi::detail::createReduLocalAccs
auto createReduLocalAccs(size_t Size, handler &CGH, std::index_sequence< Is... >)
For the given 'Reductions' types pack and indices enumerating only the reductions for which a local a...
Definition: reduction.hpp:1452
cl::sycl::ext::oneapi::detail::reduction_impl::getOutPointer
static result_type * getOutPointer(result_type *OutPtr)
Definition: reduction.hpp:747
accessor_property_list.hpp
cl::sycl::ext::oneapi::detail::reduSaveFinalResultToUserMemHelper
std::enable_if_t< Reduction::is_usm > reduSaveFinalResultToUserMemHelper(std::vector< event > &Events, std::shared_ptr< detail::queue_impl > Queue, bool IsHost, Reduction &, RestT... Rest)
Definition: reduction.hpp:1935
cl::sycl::detail::is_contained
Definition: type_list.hpp:54
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, enable_if_t< IsKnownIdentityOp< T, BinaryOperation >::value > >::operator++
enable_if_t< sycl::detail::IsPlus< _T, BinaryOperation >::value &&sycl::detail::is_geninteger< _T >::value > operator++()
Definition: reduction.hpp:213
cl::sycl::ext::oneapi::detail::reduction_impl::reduction_impl
reduction_impl(rw_accessor_type &Acc)
Constructs reduction_impl when the identity value is statically known.
Definition: reduction.hpp:437
cl::sycl::ext::oneapi::detail::filterSequence
constexpr auto filterSequence(FunctorT F, std::index_sequence< Is... > Indices)
For each index 'I' from the given indices pack 'Is' this function returns an index sequence consistin...
Definition: reduction.hpp:1672
cl::sycl::ext::oneapi::detail::writeReduSumsToOutAccs
void writeReduSumsToOutAccs(size_t OutAccIndex, size_t WGSize, std::tuple< Reductions... > *, ReduTupleT< OutAccT... > OutAccs, ReduTupleT< LocalAccT... > LocalAccs, ReduTupleT< BOPsT... > BOPs, ReduTupleT< Ts... > IdentityVals, std::array< bool, sizeof...(Reductions)> IsInitializeToIdentity, std::index_sequence< Is... >)
Definition: reduction.hpp:1577
cl::sycl::ext::oneapi::detail::createReducers
std::tuple< typename Reductions::reducer_type... > createReducers(ReduTupleT< typename Reductions::result_type... > Identities, ReduTupleT< typename Reductions::binary_operation... > BOPsTuple, std::index_sequence< Is... >)
Definition: reduction.hpp:1504
cl::sycl::nd_range::get_group_range
range< dimensions > get_group_range() const
Definition: nd_range.hpp:44
cl::sycl::ext::oneapi::detail::initReduLocalAccs
void initReduLocalAccs(size_t LID, size_t GID, size_t NWorkItems, size_t WGSize, ReduTupleT< InputAccT... > LocalAccs, ReduTupleT< LocalAccT... > InputAccs, ReduTupleT< ResultT... > Identities, std::index_sequence< Is... >)
Definition: reduction.hpp:1538
cl::sycl::ext::oneapi::detail::reduction_impl::getUSMPointer
result_type * getUSMPointer()
Definition: reduction.hpp:738
cl::sycl::ext::oneapi::detail::reduction_impl::getOutPointer
static result_type * getOutPointer(const rw_accessor_type &OutAcc)
Definition: reduction.hpp:743
cl::sycl::ext::oneapi::detail::reducer::operator&=
enable_if_t< sycl::detail::IsBitAND< _T, BinaryOperation >::value > operator&=(const _T &Partial)
Definition: reduction.hpp:164
cl::sycl::ext::oneapi::detail::getReadAccsToPreviousPartialReds
auto getReadAccsToPreviousPartialReds(handler &CGH, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
For the given 'Reductions' types pack and indices enumerating them this function returns accessors to...
Definition: reduction.hpp:1475
bool_constant
cl::sycl::multi_ptr
Provides constructors for address space qualified and non address space qualified pointers to allow i...
Definition: atomic.hpp:32
cl::sycl::ext::oneapi::detail::concat_sequences
constexpr auto concat_sequences(std::index_sequence< Is... >, std::index_sequence< Js... >, Rs...)
Definition: reduction.hpp:1629
cl::sycl::ext::oneapi::detail::reducer::operator*=
enable_if_t< sycl::detail::IsMultiplies< _T, BinaryOperation >::value > operator*=(const _T &Partial)
Definition: reduction.hpp:146
cl::sycl::has_known_identity
Definition: known_identity.hpp:213
cl::sycl::ext::oneapi::detail::callReduUserKernelFunc
void callReduUserKernelFunc(KernelType KernelFunc, nd_item< Dims > NDIt, std::tuple< ReducerT... > &Reducers, std::index_sequence< Is... >)
Definition: reduction.hpp:1512
cl::sycl::ext::oneapi::detail::reduction_impl::getReadWriteAccessorToInitializedMem
std::enable_if_t< HasFastAtomics, rw_accessor_type > getReadWriteAccessorToInitializedMem(handler &CGH)
If reduction is initialized with read-write accessor, which does not require initialization with iden...
Definition: reduction.hpp:705
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, enable_if_t< IsKnownIdentityOp< T, BinaryOperation >::value > >::atomic_combine
enable_if_t< std::is_same< typename remove_AS< _T >::type, T >::value &&(IsReduOptForFastAtomicFetch< T, _BinaryOperation >::value||IsReduOptForAtomic64Add< T, _BinaryOperation >::value) &&sycl::detail::IsPlus< T, _BinaryOperation >::value &&(Space==access::address_space::global_space||Space==access::address_space::local_space)> atomic_combine(_T *ReduVarPtr) const
Atomic ADD operation: *ReduVarPtr += MValue;.
Definition: reduction.hpp:272
cl::sycl::ext::oneapi::detail::IsReduOptForFastReduce
bool_constant<((sycl::detail::is_sgeninteger< T >::value &&(sizeof(T)==4||sizeof(T)==8))||sycl::detail::is_sgenfloat< T >::value) &&(sycl::detail::IsPlus< T, BinaryOperation >::value||sycl::detail::IsMinimum< T, BinaryOperation >::value||sycl::detail::IsMaximum< T, BinaryOperation >::value)> IsReduOptForFastReduce
Definition: reduction.hpp:94
cl::sycl::buffer
Defines a shared array that can be used by kernels in queues.
Definition: buffer.hpp:46
cl::sycl::ext::oneapi::detail::reduction_impl::getUserReadWriteAccessor
std::enable_if_t<!_IsUSM, rw_accessor_type & > getUserReadWriteAccessor()
Definition: reduction.hpp:729
cl::sycl::ext::oneapi::detail::reduction_impl_base
Base non-template class which is a base class for all reduction implementation classes.
Definition: reduction.hpp:353
cl::sycl::ext::oneapi::detail::reduction_impl::reduction_impl
reduction_impl(T *VarPtr, bool InitializeToIdentity=false)
Constructs reduction_impl when the identity value is statically known.
Definition: reduction.hpp:580
cl::sycl::ext::oneapi::detail::IsReduOptForFastAtomicFetch
bool_constant< sycl::detail::is_sgeninteger< T >::value &&sycl::detail::IsValidAtomicType< T >::value &&(sycl::detail::IsPlus< T, BinaryOperation >::value||sycl::detail::IsMinimum< T, BinaryOperation >::value||sycl::detail::IsMaximum< T, BinaryOperation >::value||sycl::detail::IsBitOR< T, BinaryOperation >::value||sycl::detail::IsBitXOR< T, BinaryOperation >::value||sycl::detail::IsBitAND< T, BinaryOperation >::value)> IsReduOptForFastAtomicFetch
Definition: reduction.hpp:54
cl::sycl::range< 1 >
cl::sycl::detail::IsBitAND
bool_constant< std::is_same< BinaryOperation, sycl::bit_and< T > >::value||std::is_same< BinaryOperation, sycl::bit_and< void > >::value > IsBitAND
Definition: known_identity.hpp:43
cl::sycl::detail::IsMinimum
bool_constant< std::is_same< BinaryOperation, sycl::minimum< T > >::value||std::is_same< BinaryOperation, sycl::minimum< void > >::value > IsMinimum
Definition: known_identity.hpp:33
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, enable_if_t< IsKnownIdentityOp< T, BinaryOperation >::value > >::atomic_combine
enable_if_t< std::is_same< typename remove_AS< _T >::type, T >::value &&IsReduOptForFastAtomicFetch< T, _BinaryOperation >::value &&sycl::detail::IsBitOR< T, _BinaryOperation >::value &&(Space==access::address_space::global_space||Space==access::address_space::local_space)> atomic_combine(_T *ReduVarPtr) const
Atomic BITWISE OR operation: *ReduVarPtr |= MValue;.
Definition: reduction.hpp:286
cl::sycl::ext::oneapi::detail::reduction_impl::getReadAccToPreviousPartialReds
accessor< T, buffer_dim, access::mode::read > getReadAccToPreviousPartialReds(handler &CGH) const
Definition: reduction.hpp:647
cl::sycl::handler::depends_on
void depends_on(event Event)
Registers event dependencies on this command group.
Definition: handler.hpp:1234
cl::sycl::buffer::size
size_t size() const noexcept
Definition: buffer.hpp:273
cl::sycl::range::size
size_t size() const
Definition: range.hpp:50
cl::sycl::ext::oneapi::detail::reduGetMaxWGSize
size_t reduGetMaxWGSize(std::shared_ptr< queue_impl > Queue, size_t LocalMemBytesPerWorkItem)
cl::sycl::memory_scope
memory_scope
Definition: memory_enums.hpp:24
cl::sycl::ext::oneapi::detail::reduCGFunc
void reduCGFunc(handler &CGH, KernelType KernelFunc, const range< Dims > &Range, size_t MaxWGSize, uint32_t NumConcurrentWorkGroups, Reduction &Redu)
Definition: reduction.hpp:1017
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, enable_if_t< IsKnownIdentityOp< T, BinaryOperation >::value > >::operator&=
enable_if_t< sycl::detail::IsBitAND< _T, BinaryOperation >::value > operator&=(const _T &Partial)
Definition: reduction.hpp:250
cl::sycl::handler::require
void require(accessor< DataT, Dims, AccMode, AccTarget, access::placeholder::true_t > Acc)
Requires access to the memory object associated with the placeholder accessor.
Definition: handler.hpp:1222
cl::sycl::handler::parallel_for
void parallel_for(range< 1 > NumWorkItems, KernelType KernelFunc)
Definition: handler.hpp:1325
cl::sycl::detail::conditional_t
typename std::conditional< B, T, F >::type conditional_t
Definition: stl_type_traits.hpp:27
cl::sycl::ext::oneapi::detail::reduceReduLocalAccs
void reduceReduLocalAccs(size_t IndexA, size_t IndexB, ReduTupleT< LocalAccT... > LocalAccs, ReduTupleT< BOPsT... > BOPs, std::index_sequence< Is... >)
Definition: reduction.hpp:1565
cl::sycl::ext::oneapi::detail::getInitToIdentityProperties
std::array< bool, sizeof...(Reductions)> getInitToIdentityProperties(std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
Definition: reduction.hpp:1497
kernel.hpp
cl::sycl::ext::oneapi::detail::tuple_select_elements
std::tuple< std::tuple_element_t< Is, TupleT >... > tuple_select_elements(TupleT Tuple, std::index_sequence< Is... >)
Utility function: for the given tuple.
Definition: reduction.hpp:1998
cl::sycl::nd_item::get_global_range
range< dimensions > get_global_range() const
Definition: nd_item.hpp:92
cl::sycl::ext::oneapi::detail::reduction_impl::reduction_impl
reduction_impl(rw_accessor_type &Acc, const T &, BinaryOperation)
Constructs reduction_impl when the identity value is statically known, and user still passed the iden...
Definition: reduction.hpp:490
cl::sycl::ext::oneapi::detail::EmptyReductionPredicate::Func
Definition: reduction.hpp:1641
cl::sycl::detail::bool_constant
std::integral_constant< bool, V > bool_constant
Definition: stl_type_traits.hpp:40
cl::sycl::ext::oneapi::detail::__sycl_reduction_aux_kernel
Definition: reduction.hpp:795
cl::sycl::ext::oneapi::detail::reduction_impl::getUserDiscardWriteAccessor
std::enable_if_t<!_IsUSM, dw_accessor_type & > getUserDiscardWriteAccessor()
Definition: reduction.hpp:734
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, enable_if_t< IsKnownIdentityOp< T, BinaryOperation >::value > >::reducer
reducer(const T &, BinaryOperation)
Definition: reduction.hpp:197
cl::sycl::ext::oneapi::detail::reducer::operator+=
enable_if_t< sycl::detail::IsPlus< _T, BinaryOperation >::value > operator+=(const _T &Partial)
Definition: reduction.hpp:140
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, enable_if_t< IsKnownIdentityOp< T, BinaryOperation >::value > >::combine
void combine(const T &Partial)
Definition: reduction.hpp:199
cl::sycl::ext::oneapi::detail::reduction_impl::result_type
T result_type
Definition: reduction.hpp:376
cl::sycl::ext::oneapi::detail::associateReduAccsWithHandlerHelper
void associateReduAccsWithHandlerHelper(handler &CGH, ReductionT &Redu, RestT &... Rest)
Definition: reduction.hpp:1821
cl::sycl::ext::oneapi::detail::reduction_impl::getIdentity
enable_if_t<!IsKnownIdentityOp< _T, _BinaryOperation >::value, _T > getIdentity()
Returns the identity value given by user.
Definition: reduction.hpp:414
cl::sycl::ext::oneapi::detail::reduAuxCGFunc
enable_if_t<!Reduction::has_fast_atomics, size_t > reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize, Reduction &Redu)
Implements a command group function that enqueues a kernel that does one iteration of reduction of el...
Definition: reduction.hpp:1385
cl::sycl::ext::oneapi::detail::AreAllButLastReductions
Predicate returning true if all template type parameters except the last one are reductions.
Definition: handler.hpp:322
cl::sycl::ext::oneapi::detail::associateReduAccsWithHandler
void associateReduAccsWithHandler(handler &CGH, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
Definition: reduction.hpp:1828
cl::sycl::accessor
Buffer accessor.
Definition: accessor.hpp:225
cl::sycl::ext::oneapi::detail::getReduIdentities
ReduTupleT< typename Reductions::result_type... > getReduIdentities(std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
Definition: reduction.hpp:1484
cl::sycl::ext::oneapi::detail::reduction_impl::reduction_impl
reduction_impl(dw_accessor_type &Acc)
Constructs reduction_impl when the identity value is statically known.
Definition: reduction.hpp:449
cl::sycl::handler::copy
void copy(accessor< T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder > Src, std::shared_ptr< T_Dst > Dst)
Copies the content of memory object accessed by Src into the memory pointed by Dst.
Definition: handler.hpp:2146
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, enable_if_t< IsKnownIdentityOp< T, BinaryOperation >::value > >::reducer
reducer()
Definition: reduction.hpp:196
cl
We provide new interfaces for matrix muliply in this patch:
Definition: access.hpp:13
cl::sycl::ext::oneapi::detail::reduction_impl::reduction_impl
reduction_impl(buffer< _T, 1, AllocatorT > Buffer, handler &CGH, bool InitializeToIdentity)
SYCL-2020.
Definition: reduction.hpp:423
cl::sycl::ext::oneapi::detail::reduCGFuncImplAtomic64
std::enable_if_t< Reduction::has_atomic_add_float64 > reduCGFuncImplAtomic64(handler &CGH, KernelType KernelFunc, const nd_range< Dims > &Range, Reduction &, typename Reduction::rw_accessor_type Out)
Definition: reduction.hpp:1776
cl::sycl::ext::oneapi::detail::reductionLoop
void reductionLoop(const range< Dims > &Range, ReducerT &Reducer, const nd_item< 1 > &NdId, KernelFunc &F)
Called in device code.
Definition: reduction.hpp:826
cl::sycl::detail::IsBitXOR
bool_constant< std::is_same< BinaryOperation, sycl::bit_xor< T > >::value||std::is_same< BinaryOperation, sycl::bit_xor< void > >::value > IsBitXOR
Definition: known_identity.hpp:53
cl::sycl::ext::oneapi::detail::reduAuxCGFuncImpl
void reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups, size_t WGSize, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... > ReduIndices)
Definition: reduction.hpp:1836
cl::sycl::detail::tie
auto tie(Ts &... Args)
Definition: tuple.hpp:40
cl::sycl::ext::oneapi::detail::reduCGFuncAtomic64
enable_if_t< Reduction::has_atomic_add_float64 > reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc, const nd_range< Dims > &Range, Reduction &Redu)
Definition: reduction.hpp:1804
cl::sycl::ext::oneapi::detail::reduction_impl
This class encapsulates the reduction variable/accessor, the reduction operator and an optional opera...
Definition: handler.hpp:245
cl::sycl::ext::oneapi::detail::createReduOutAccs
auto createReduOutAccs(size_t NWorkGroups, handler &CGH, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
For the given 'Reductions' types pack and indices enumerating them this function either creates new t...
Definition: reduction.hpp:1463
cl::sycl::handler
Command group handler class.
Definition: handler.hpp:361
cl::sycl::ext::oneapi::accessor_property_list
Objects of the accessor_property_list class are containers for the SYCL properties.
Definition: property_list.hpp:18
cl::sycl::ext::oneapi::detail::reduction_impl::getReadWriteAccessorToInitializedGroupsCounter
accessor< int, 1, access::mode::read_write, access::target::device, access::placeholder::false_t > getReadWriteAccessorToInitializedGroupsCounter(handler &CGH)
Definition: reduction.hpp:718
cl::sycl::accessor::get_pointer
DataT * get_pointer() const
Definition: accessor.hpp:1605
cl::sycl::ext::oneapi::detail::reduction_impl::reduction_impl
reduction_impl(rw_accessor_type &Acc, const T &Identity, BinaryOperation BOp)
Constructs reduction_impl when the identity value is unknown.
Definition: reduction.hpp:553
cl::sycl::detail::queue_impl
Definition: queue_impl.hpp:53
cl::sycl::ext::oneapi::detail::reduction_impl::getBinaryOperation
BinaryOperation getBinaryOperation() const
Returns the binary operation associated with the reduction.
Definition: reduction.hpp:752
cl::sycl::access::address_space
address_space
Definition: access.hpp:45
accessor.hpp
cl::sycl::ext::oneapi::detail::reducer::operator++
enable_if_t< sycl::detail::IsPlus< _T, BinaryOperation >::value &&sycl::detail::is_geninteger< _T >::value > operator++()
Definition: reduction.hpp:127
cl::sycl::ext::oneapi::detail::getReduBOPs
ReduTupleT< typename Reductions::binary_operation... > getReduBOPs(std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
Definition: reduction.hpp:1491
cl::sycl::ext::oneapi::detail::IsNonUsmReductionPredicate::Func
Definition: reduction.hpp:1635
cl::sycl::nd_item::get_global_id
id< dimensions > get_global_id() const
Definition: nd_item.hpp:40
cl::sycl::ext::oneapi::detail::reduction_impl::getWriteMemForPartialReds
std::enable_if_t<!IsOneWG, rw_accessor_type > getWriteMemForPartialReds(size_t Size, handler &CGH)
Constructs a new temporary buffer to hold partial sums and returns the accessor for that buffer.
Definition: reduction.hpp:674
KernelFunc
std::function< void(const sycl::nd_item< NDims > &)> KernelFunc
Definition: pi_esimd_emulator.cpp:129
cl::sycl::detail::InitializeToIdentity
@ InitializeToIdentity
Definition: property_helper.hpp:34
cl::sycl::ext::oneapi::detail::makeReduTupleT
ReduTupleT< Ts... > makeReduTupleT(Ts... Elements)
Definition: reduction.hpp:102
PI_INVALID_VALUE
@ PI_INVALID_VALUE
Definition: pi.h:87
cl::sycl::detail::remove_AS
Definition: access.hpp:197
cl::sycl::ext::oneapi::detail::reduction_impl::reduction_impl
reduction_impl(T *VarPtr, const T &Identity, BinaryOperation BOp, bool InitializeToIdentity=false)
Constructs reduction_impl when the identity value is unknown.
Definition: reduction.hpp:616
cl::sycl::detail::tuple< Ts... >
atomic.hpp
cl::sycl::detail::auto_name
This class is the default KernelName template parameter type for kernel invocation APIs such as singl...
Definition: kernel.hpp:35
cl::sycl::ext::oneapi::detail::ReduTupleT
sycl::detail::tuple< Ts... > ReduTupleT
Definition: reduction.hpp:101
cl::sycl::accessor::size
size_t size() const noexcept
Definition: accessor.hpp:1524
cl::sycl::nd_item
Identifies an instance of the function object executing at each point in an nd_range.
Definition: helpers.hpp:32
handler.hpp
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, enable_if_t< IsKnownIdentityOp< T, BinaryOperation >::value > >::operator*=
enable_if_t< sycl::detail::IsMultiplies< _T, BinaryOperation >::value > operator*=(const _T &Partial)
Definition: reduction.hpp:232
cl::sycl::detail::IsMaximum
bool_constant< std::is_same< BinaryOperation, sycl::maximum< T > >::value||std::is_same< BinaryOperation, sycl::maximum< void > >::value > IsMaximum
Definition: known_identity.hpp:38
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, enable_if_t< IsKnownIdentityOp< T, BinaryOperation >::value > >::operator^=
enable_if_t< sycl::detail::IsBitXOR< _T, BinaryOperation >::value > operator^=(const _T &Partial)
Definition: reduction.hpp:244
std
Definition: accessor.hpp:2358
cl::sycl::ext::oneapi::detail::IsNonUsmReductionPredicate
Definition: reduction.hpp:1634
__SYCL_INLINE_CONSTEXPR
#define __SYCL_INLINE_CONSTEXPR
Definition: defines_elementary.hpp:65
cl::sycl::ext::oneapi::detail::reduction_impl::reduction_impl
reduction_impl(buffer< _T, 1, AllocatorT > Buffer, handler &CGH, const T &Identity, BinaryOperation BOp, bool InitializeToIdentity)
SYCL-2020.
Definition: reduction.hpp:538
group_algorithm.hpp
cl::sycl::ext::oneapi::detail::reduCGFuncImpl
void reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range< Dims > &Range, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... > ReduIndices)
Definition: reduction.hpp:1678
cl::sycl::ext::oneapi::has_known_identity_v
__SYCL_INLINE_CONSTEXPR bool has_known_identity_v
Definition: reduction.hpp:2057
cl::sycl::ext::oneapi::detail::reducer::operator^=
enable_if_t< sycl::detail::IsBitXOR< _T, BinaryOperation >::value > operator^=(const _T &Partial)
Definition: reduction.hpp:158
cl::sycl::handler::single_task
void single_task(KernelType KernelFunc)
Defines and invokes a SYCL kernel function as a function object type.
Definition: handler.hpp:1304
cl::sycl::ext::oneapi::atomic_ref
Definition: atomic_ref.hpp:665
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, enable_if_t< IsKnownIdentityOp< T, BinaryOperation >::value > >::getIdentity
static enable_if_t< has_known_identity_impl< _BinaryOperation, _T >::value, _T > getIdentity()
Definition: reduction.hpp:206
cl::sycl::known_identity
Definition: known_identity.hpp:223
cl::sycl::detail::tuple_element_t
typename tuple_element< I, T >::type tuple_element_t
Definition: tuple.hpp:56
cl::sycl::ext::oneapi::detail::reduction_impl::reduction_impl
reduction_impl(buffer< _T, 1, AllocatorT > Buffer, handler &CGH, const T &, BinaryOperation, bool InitializeToIdentity)
SYCL-2020.
Definition: reduction.hpp:463
cl::sycl::ext::oneapi::detail::reduction_impl::reduction_impl
reduction_impl(dw_accessor_type &Acc, const T &Identity, BinaryOperation BOp)
Constructs reduction_impl when the identity value is unknown.
Definition: reduction.hpp:565
cl::sycl::ext::oneapi::detail::reducer::getIdentity
T getIdentity() const
Definition: reduction.hpp:122
cl::sycl::ext::oneapi::detail::reduction_impl::getWriteMemForPartialReds
std::enable_if_t< IsOneWG &&_IsUSM, result_type * > getWriteMemForPartialReds(size_t, handler &)
Returns user's USM pointer passed to reduction for editing.
Definition: reduction.hpp:655
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, enable_if_t< IsKnownIdentityOp< T, BinaryOperation >::value > >::atomic_combine
enable_if_t< std::is_same< typename remove_AS< _T >::type, T >::value &&IsReduOptForFastAtomicFetch< T, _BinaryOperation >::value &&sycl::detail::IsBitAND< T, _BinaryOperation >::value &&(Space==access::address_space::global_space||Space==access::address_space::local_space)> atomic_combine(_T *ReduVarPtr) const
Atomic BITWISE AND operation: *ReduVarPtr &= MValue;.
Definition: reduction.hpp:314
cl::sycl::ext::oneapi::detail::FilterElement
Definition: reduction.hpp:1644
cl::sycl::access::mode
mode
Definition: access.hpp:28
cl::sycl::instead
std::uint8_t instead
Definition: aliases.hpp:68
cl::sycl::ext::oneapi::detail::reduGetMemPerWorkItem
size_t reduGetMemPerWorkItem(std::tuple< ReductionT... > &ReduTuple, std::index_sequence< Is... >)
Definition: reduction.hpp:1989
cl::sycl::ext::oneapi::detail::reducer::combine
void combine(const T &Partial)
Definition: reduction.hpp:120
cl::sycl::ext::oneapi::detail::IsReduOptForAtomic64Add
bool_constant< sycl::detail::IsPlus< T, BinaryOperation >::value &&sycl::detail::is_sgenfloat< T >::value &&(sizeof(T)==4||sizeof(T)==8)> IsReduOptForAtomic64Add
Definition: reduction.hpp:76
cl::sycl::ext::oneapi::detail::reduction_impl::binary_operation
BinaryOperation binary_operation
Definition: reduction.hpp:377
cl::sycl::ext::oneapi::detail::reduction_impl::getWriteAccForPartialReds
rw_accessor_type getWriteAccForPartialReds(size_t Size, handler &CGH)
Returns an accessor accessing the memory that will hold the reduction partial sums.
Definition: reduction.hpp:686
cl::sycl::ext::oneapi::detail::reduction_impl::getIdentity
constexpr enable_if_t< IsKnownIdentityOp< _T, _BinaryOperation >::value, _T > getIdentity()
Returns the statically known identity value.
Definition: reduction.hpp:407
cl::sycl::detail::enable_if_t
typename std::enable_if< B, T >::type enable_if_t
Definition: stl_type_traits.hpp:24
cl::sycl::ext::oneapi::has_known_identity
Definition: reduction.hpp:2053
cl::sycl::ext::oneapi::detail::reduction_impl::getReadWriteLocalAcc
static accessor< _T, buffer_dim, access::mode::read_write, access::target::local > getReadWriteLocalAcc(size_t Size, handler &CGH)
Creates and returns a local accessor with the Size elements.
Definition: reduction.hpp:642
known_identity.hpp
cl::sycl::ext::oneapi::detail::reduction_impl::associateWithHandler
void associateWithHandler(handler &CGH)
Associates the reduction accessor to user's memory with CGH handler to keep the accessor alive until ...
Definition: reduction.hpp:624
cl::sycl::ext::oneapi::detail::get_reduction_aux_kernel_name_t
Definition: reduction.hpp:811
cl::sycl::ext::oneapi::detail::EmptyReductionPredicate
Definition: reduction.hpp:1640
cl::sycl::nd_range::get_local_range
range< dimensions > get_local_range() const
Definition: nd_range.hpp:42
cl::sycl::ext::oneapi::detail::reduComputeWGSize
size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize, size_t &NWorkGroups)
Definition: reduction.cpp:20
cl::sycl::ext::oneapi::detail::reducer::reducer
reducer(const T &Identity, BinaryOperation BOp)
Definition: reduction.hpp:118
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, enable_if_t< IsKnownIdentityOp< T, BinaryOperation >::value > >::operator+=
enable_if_t< sycl::detail::IsPlus< _T, BinaryOperation >::value > operator+=(const _T &Partial)
Definition: reduction.hpp:226
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, enable_if_t< IsKnownIdentityOp< T, BinaryOperation >::value > >::MValue
T MValue
Definition: reduction.hpp:348
cl::sycl::ext::oneapi::detail::reduction_impl::reduction_impl
reduction_impl(dw_accessor_type &Acc, const T &, BinaryOperation)
Constructs reduction_impl when the identity value is statically known, and user still passed the iden...
Definition: reduction.hpp:514
__SYCL_INLINE_NAMESPACE
#define __SYCL_INLINE_NAMESPACE(X)
Definition: defines_elementary.hpp:12
cl::sycl::ext::oneapi::detail::reduction_impl::hasUserDiscardWriteAccessor
bool hasUserDiscardWriteAccessor()
Definition: reduction.hpp:726
cl::sycl::ext::oneapi::detail::reduction_impl::getWriteMemForPartialReds
std::enable_if_t< IsOneWG &&!_IsUSM, rw_accessor_type > getWriteMemForPartialReds(size_t, handler &CGH)
Returns user's accessor passed to reduction for editing if that is the read-write accessor.
Definition: reduction.hpp:664