DPC++ Runtime
Runtime libraries for oneAPI DPC++
reduction.hpp
Go to the documentation of this file.
1 //==---------------- reduction.hpp - SYCL reduction ------------*- C++ -*---==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 // ===--------------------------------------------------------------------=== //
8 
9 #pragma once
10 
11 #include <CL/sycl/accessor.hpp>
12 #include <CL/sycl/atomic.hpp>
13 #include <CL/sycl/detail/tuple.hpp>
14 #include <CL/sycl/handler.hpp>
15 #include <CL/sycl/kernel.hpp>
19 
20 #include <tuple>
21 
23 namespace sycl {
24 namespace ext {
25 namespace oneapi {
26 
27 namespace detail {
28 
33 
34 // This type trait is used to detect if the atomic operation BinaryOperation
35 // used with operands of the type T is available for using in reduction.
36 // The order in which the atomic operations are performed may be arbitrary and
37 // thus may cause different results from run to run even on the same elements
38 // and on same device. The macro SYCL_REDUCTION_DETERMINISTIC prohibits using
39 // atomic operations for reduction and helps to produce stable results.
40 // SYCL_REDUCTION_DETERMINISTIC is a short term solution, which perhaps become
41 // deprecated eventually and is replaced by a sycl property passed to reduction.
42 template <typename T, class BinaryOperation>
44 #ifdef SYCL_REDUCTION_DETERMINISTIC
45  bool_constant<false>;
46 #else
47  bool_constant<sycl::detail::is_sgeninteger<T>::value &&
48  sycl::detail::IsValidAtomicType<T>::value &&
49  (sycl::detail::IsPlus<T, BinaryOperation>::value ||
50  sycl::detail::IsMinimum<T, BinaryOperation>::value ||
51  sycl::detail::IsMaximum<T, BinaryOperation>::value ||
52  sycl::detail::IsBitOR<T, BinaryOperation>::value ||
53  sycl::detail::IsBitXOR<T, BinaryOperation>::value ||
55 #endif
56 
57 // This type trait is used to detect if the atomic operation BinaryOperation
58 // used with operands of the type T is available for using in reduction, in
59 // addition to the cases covered by "IsReduOptForFastAtomicFetch", if the device
60 // has the atomic64 aspect. This type trait should only be used if the device
61 // has the atomic64 aspect. Note that this type trait is currently a subset of
62 // IsReduOptForFastReduce. The macro SYCL_REDUCTION_DETERMINISTIC prohibits
63 // using the reduce_over_group() algorithm to produce stable results across same
64 // type devices.
65 // TODO 32 bit floating point atomics are eventually expected to be supported by
66 // the has_fast_atomics specialization. Once the reducer class is updated to
67 // replace the deprecated atomic class with atomic_ref, the (sizeof(T) == 4)
68 // case should be removed here and replaced in IsReduOptForFastAtomicFetch.
69 template <typename T, class BinaryOperation>
71 #ifdef SYCL_REDUCTION_DETERMINISTIC
73 #else
76  (sizeof(T) == 4 || sizeof(T) == 8)>;
77 #endif
78 
79 // This type trait is used to detect if the group algorithm reduce() used with
80 // operands of the type T and the operation BinaryOperation is available
81 // for using in reduction.
82 // The macro SYCL_REDUCTION_DETERMINISTIC prohibits using the reduce() algorithm
83 // to produce stable results across same type devices.
84 template <typename T, class BinaryOperation>
86 #ifdef SYCL_REDUCTION_DETERMINISTIC
88 #else
90  (sizeof(T) == 4 || sizeof(T) == 8)) ||
95 #endif
96 
97 // std::tuple seems to be a) too heavy and b) not copyable to device now
98 // Thus sycl::detail::tuple is used instead.
99 // Switching from sycl::device::tuple to std::tuple can be done by re-defining
100 // the ReduTupleT type and makeReduTupleT() function below.
101 template <typename... Ts> using ReduTupleT = sycl::detail::tuple<Ts...>;
102 template <typename... Ts> ReduTupleT<Ts...> makeReduTupleT(Ts... Elements) {
103  return sycl::detail::make_tuple(Elements...);
104 }
105 
106 __SYCL_EXPORT size_t reduGetMaxWGSize(std::shared_ptr<queue_impl> Queue,
107  size_t LocalMemBytesPerWorkItem);
108 __SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
109  size_t &NWorkGroups);
110 
124 template <typename T, class BinaryOperation, int Dims, size_t Extent,
125  class Algorithm, bool View = false, typename Subst = void>
126 class reducer;
127 
130 template <typename Reducer> struct ReducerTraits;
131 
132 template <typename T, class BinaryOperation, int Dims, std::size_t Extent,
133  class Algorithm, bool View, typename Subst>
135  reducer<T, BinaryOperation, Dims, Extent, Algorithm, View, Subst>> {
136  using type = T;
137  using op = BinaryOperation;
138  static constexpr int dims = Dims;
139  static constexpr size_t extent = Extent;
140 };
141 
153 template <class Reducer> class combiner {
154  using T = typename ReducerTraits<Reducer>::type;
155  using BinaryOperation = typename ReducerTraits<Reducer>::op;
156  static constexpr int Dims = ReducerTraits<Reducer>::dims;
157  static constexpr size_t Extent = ReducerTraits<Reducer>::extent;
158 
159 public:
160  template <typename _T = T, int _Dims = Dims>
161  enable_if_t<(_Dims == 0) &&
164  operator++() {
165  static_cast<Reducer *>(this)->combine(static_cast<T>(1));
166  }
167 
168  template <typename _T = T, int _Dims = Dims>
169  enable_if_t<(_Dims == 0) &&
172  operator++(int) {
173  static_cast<Reducer *>(this)->combine(static_cast<T>(1));
174  }
175 
176  template <typename _T = T, int _Dims = Dims>
178  operator+=(const _T &Partial) {
179  static_cast<Reducer *>(this)->combine(Partial);
180  }
181 
182  template <typename _T = T, int _Dims = Dims>
183  enable_if_t<(_Dims == 0) &&
185  operator*=(const _T &Partial) {
186  static_cast<Reducer *>(this)->combine(Partial);
187  }
188 
189  template <typename _T = T, int _Dims = Dims>
191  operator|=(const _T &Partial) {
192  static_cast<Reducer *>(this)->combine(Partial);
193  }
194 
195  template <typename _T = T, int _Dims = Dims>
196  enable_if_t<(_Dims == 0) &&
198  operator^=(const _T &Partial) {
199  static_cast<Reducer *>(this)->combine(Partial);
200  }
201 
202  template <typename _T = T, int _Dims = Dims>
203  enable_if_t<(_Dims == 0) &&
205  operator&=(const _T &Partial) {
206  static_cast<Reducer *>(this)->combine(Partial);
207  }
208 
209 private:
210  template <access::address_space Space>
211  static constexpr memory_scope getMemoryScope() {
212  return Space == access::address_space::local_space
213  ? memory_scope::work_group
215  }
216 
217 public:
219  template <access::address_space Space = access::address_space::global_space,
220  typename _T = T, class _BinaryOperation = BinaryOperation>
222  (IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value ||
223  IsReduOptForAtomic64Add<T, _BinaryOperation>::value) &&
225  (Space == access::address_space::global_space ||
226  Space == access::address_space::local_space)>
227  atomic_combine(_T *ReduVarPtr) const {
228  auto reducer = static_cast<const Reducer *>(this);
229  for (size_t E = 0; E < Extent; ++E) {
231  multi_ptr<T, Space>(ReduVarPtr)[E])
232  .fetch_add(reducer->getElement(E));
233  }
234  }
235 
237  template <access::address_space Space = access::address_space::global_space,
238  typename _T = T, class _BinaryOperation = BinaryOperation>
242  (Space == access::address_space::global_space ||
243  Space == access::address_space::local_space)>
244  atomic_combine(_T *ReduVarPtr) const {
245  auto reducer = static_cast<const Reducer *>(this);
246  for (size_t E = 0; E < Extent; ++E) {
248  multi_ptr<T, Space>(ReduVarPtr)[E])
249  .fetch_or(reducer->getElement(E));
250  }
251  }
252 
254  template <access::address_space Space = access::address_space::global_space,
255  typename _T = T, class _BinaryOperation = BinaryOperation>
259  (Space == access::address_space::global_space ||
260  Space == access::address_space::local_space)>
261  atomic_combine(_T *ReduVarPtr) const {
262  auto reducer = static_cast<const Reducer *>(this);
263  for (size_t E = 0; E < Extent; ++E) {
265  multi_ptr<T, Space>(ReduVarPtr)[E])
266  .fetch_xor(reducer->getElement(E));
267  }
268  }
269 
271  template <access::address_space Space = access::address_space::global_space,
272  typename _T = T, class _BinaryOperation = BinaryOperation>
276  (Space == access::address_space::global_space ||
277  Space == access::address_space::local_space)>
278  atomic_combine(_T *ReduVarPtr) const {
279  auto reducer = static_cast<const Reducer *>(this);
280  for (size_t E = 0; E < Extent; ++E) {
282  multi_ptr<T, Space>(ReduVarPtr)[E])
283  .fetch_and(reducer->getElement(E));
284  }
285  }
286 
288  template <access::address_space Space = access::address_space::global_space,
289  typename _T = T, class _BinaryOperation = BinaryOperation>
293  (Space == access::address_space::global_space ||
294  Space == access::address_space::local_space)>
295  atomic_combine(_T *ReduVarPtr) const {
296  auto reducer = static_cast<const Reducer *>(this);
297  for (size_t E = 0; E < Extent; ++E) {
299  multi_ptr<T, Space>(ReduVarPtr)[E])
300  .fetch_min(reducer->getElement(E));
301  }
302  }
303 
305  template <access::address_space Space = access::address_space::global_space,
306  typename _T = T, class _BinaryOperation = BinaryOperation>
310  (Space == access::address_space::global_space ||
311  Space == access::address_space::local_space)>
312  atomic_combine(_T *ReduVarPtr) const {
313  auto reducer = static_cast<const Reducer *>(this);
314  for (size_t E = 0; E < Extent; ++E) {
316  multi_ptr<T, Space>(ReduVarPtr)[E])
317  .fetch_max(reducer->getElement(E));
318  }
319  }
320 };
321 
327 template <typename T, class BinaryOperation, int Dims, size_t Extent,
328  class Algorithm, bool View>
329 class reducer<T, BinaryOperation, Dims, Extent, Algorithm, View,
330  enable_if_t<Dims == 0 && Extent == 1 && View == false &&
332  : public combiner<
333  reducer<T, BinaryOperation, Dims, Extent, Algorithm, View,
334  enable_if_t<Dims == 0 && Extent == 1 && View == false &&
336 public:
337  reducer(const T &Identity, BinaryOperation BOp)
338  : MValue(Identity), MIdentity(Identity), MBinaryOp(BOp) {}
339 
340  void combine(const T &Partial) { MValue = MBinaryOp(MValue, Partial); }
341 
342  T getIdentity() const { return MIdentity; }
343 
344  T &getElement(size_t) { return MValue; }
345  const T &getElement(size_t) const { return MValue; }
347 
348 private:
349  const T MIdentity;
350  BinaryOperation MBinaryOp;
351 };
352 
358 template <typename T, class BinaryOperation, int Dims, size_t Extent,
359  class Algorithm, bool View>
360 class reducer<T, BinaryOperation, Dims, Extent, Algorithm, View,
361  enable_if_t<Dims == 0 && Extent == 1 && View == false &&
363  : public combiner<
364  reducer<T, BinaryOperation, Dims, Extent, Algorithm, View,
365  enable_if_t<Dims == 0 && Extent == 1 && View == false &&
367 public:
368  reducer() : MValue(getIdentity()) {}
369  reducer(const T & /* Identity */, BinaryOperation) : MValue(getIdentity()) {}
370 
371  void combine(const T &Partial) {
372  BinaryOperation BOp;
373  MValue = BOp(MValue, Partial);
374  }
375 
376  static T getIdentity() {
378  }
379 
380  T &getElement(size_t) { return MValue; }
381  const T &getElement(size_t) const { return MValue; }
383 };
384 
387 template <typename T, class BinaryOperation, int Dims, size_t Extent,
388  class Algorithm, bool View>
389 class reducer<T, BinaryOperation, Dims, Extent, Algorithm, View,
390  enable_if_t<Dims == 0 && View == true>>
391  : public combiner<reducer<T, BinaryOperation, Dims, Extent, Algorithm, View,
393 public:
394  reducer(T &Ref, BinaryOperation BOp) : MElement(Ref), MBinaryOp(BOp) {}
395 
396  void combine(const T &Partial) { MElement = MBinaryOp(MElement, Partial); }
397 
398 private:
399  T &MElement;
400  BinaryOperation MBinaryOp;
401 };
402 
405 template <typename T, class BinaryOperation, int Dims, size_t Extent,
406  class Algorithm, bool View>
407 class reducer<T, BinaryOperation, Dims, Extent, Algorithm, View,
408  enable_if_t<Dims == 1 && View == false &&
410  : public combiner<
411  reducer<T, BinaryOperation, Dims, Extent, Algorithm, View,
412  enable_if_t<Dims == 1 && View == false &&
414 public:
415  reducer(const T &Identity, BinaryOperation BOp)
416  : MValue(Identity), MIdentity(Identity), MBinaryOp(BOp) {}
417 
418  // SYCL 2020 revision 4 says this should be const, but this is a bug
419  // see https://github.com/KhronosGroup/SYCL-Docs/pull/252
420  reducer<T, BinaryOperation, Dims - 1, Extent, Algorithm, true>
421  operator[](size_t Index) {
422  return {MValue[Index], MBinaryOp};
423  }
424 
425  T getIdentity() const { return MIdentity; }
426  T &getElement(size_t E) { return MValue[E]; }
427  const T &getElement(size_t E) const { return MValue[E]; }
428 
429 private:
430  marray<T, Extent> MValue;
431  const T MIdentity;
432  BinaryOperation MBinaryOp;
433 };
434 
437 template <typename T, class BinaryOperation, int Dims, size_t Extent,
438  class Algorithm, bool View>
439 class reducer<T, BinaryOperation, Dims, Extent, Algorithm, View,
440  enable_if_t<Dims == 1 && View == false &&
442  : public combiner<
443  reducer<T, BinaryOperation, Dims, Extent, Algorithm, View,
444  enable_if_t<Dims == 1 && View == false &&
446 public:
447  reducer() : MValue(getIdentity()) {}
448  reducer(const T & /* Identity */, BinaryOperation) : MValue(getIdentity()) {}
449 
450  // SYCL 2020 revision 4 says this should be const, but this is a bug
451  // see https://github.com/KhronosGroup/SYCL-Docs/pull/252
452  reducer<T, BinaryOperation, Dims - 1, Extent, Algorithm, true>
453  operator[](size_t Index) {
454  return {MValue[Index], BinaryOperation()};
455  }
456 
457  static T getIdentity() {
459  }
460 
461  T &getElement(size_t E) { return MValue[E]; }
462  const T &getElement(size_t E) const { return MValue[E]; }
463 
464 private:
465  marray<T, Extent> MValue;
466 };
467 
471 
474 template <typename T, class BinaryOperation> class reduction_impl_common {
475 protected:
476  reduction_impl_common(const T &Identity, BinaryOperation BinaryOp,
477  bool Init = false)
478  : MIdentity(Identity), MBinaryOp(BinaryOp), InitializeToIdentity(Init) {}
479 
480 public:
482  template <typename _T = T, class _BinaryOperation = BinaryOperation>
484  _T> constexpr getIdentity() {
486  }
487 
489  template <typename _T = T, class _BinaryOperation = BinaryOperation>
492  return MIdentity;
493  }
494 
496  BinaryOperation getBinaryOperation() const { return MBinaryOp; }
498 
499 protected:
502  const T MIdentity;
503 
504  BinaryOperation MBinaryOp;
506 };
507 
510 template <bool IsUSM, access::placeholder IsPlaceholder, int AccessorDims>
512 
514 template <typename T, class BinaryOperation, int Dims, size_t Extent,
515  class Algorithm>
516 class reduction_impl_algo;
517 
520 template <typename T, class BinaryOperation, int Dims, size_t Extent,
521  bool IsUSM, access::placeholder IsPlaceholder, int AccessorDims>
523  T, BinaryOperation, Dims, Extent,
524  default_reduction_algorithm<IsUSM, IsPlaceholder, AccessorDims>>
525  : public reduction_impl_common<T, BinaryOperation> {
527 
528 public:
529  using reducer_type =
530  reducer<T, BinaryOperation, Dims, Extent,
532  using result_type = T;
533  using binary_operation = BinaryOperation;
534 
535  // Buffers and accessors always describe scalar reductions (i.e. Dims == 0)
536  // The input buffer/accessor is allowed to have different dimensionality
537  // AccessorDims also determines the dimensionality of some temp storage
538  static constexpr int accessor_dim = AccessorDims;
539  static constexpr int buffer_dim = (AccessorDims == 0) ? 1 : AccessorDims;
540  using rw_accessor_type = accessor<T, AccessorDims, access::mode::read_write,
541  access::target::device, IsPlaceholder,
543  using dw_accessor_type =
544  accessor<T, AccessorDims, access::mode::discard_write,
545  access::target::device, IsPlaceholder,
547 
548  static constexpr bool has_atomic_add_float64 =
550  static constexpr bool has_fast_atomics =
552  static constexpr bool has_fast_reduce =
554  static constexpr bool is_usm = IsUSM;
555  static constexpr bool is_placeholder =
556  (IsPlaceholder == access::placeholder::true_t);
557 
558  static constexpr size_t dims = Dims;
559  static constexpr size_t num_elements = Extent;
560 
561  reduction_impl_algo(const T &Identity, BinaryOperation BinaryOp, bool Init,
562  std::shared_ptr<rw_accessor_type> AccPointer)
563  : base(Identity, BinaryOp, Init), MRWAcc(AccPointer){};
564  reduction_impl_algo(const T &Identity, BinaryOperation BinaryOp, bool Init,
565  std::shared_ptr<dw_accessor_type> AccPointer)
566  : base(Identity, BinaryOp, Init), MDWAcc(AccPointer){};
567  reduction_impl_algo(const T &Identity, BinaryOperation BinaryOp, bool Init,
568  T *USMPointer)
569  : base(Identity, BinaryOp, Init), MUSMPointer(USMPointer){};
570 
575 #ifndef __SYCL_DEVICE_ONLY__
576  if (MRWAcc)
577  CGH.associateWithHandler(MRWAcc.get(), access::target::device);
578  else if (MDWAcc)
579  CGH.associateWithHandler(MDWAcc.get(), access::target::device);
580 #else
581  (void)CGH;
582 #endif
583  }
584 
589  template <typename _T = result_type>
590  static accessor<_T, buffer_dim, access::mode::read_write,
591  access::target::local>
592  getReadWriteLocalAcc(size_t Size, handler &CGH) {
593  return {Size, CGH};
594  }
595 
598  CGH.addReduction(MOutBufPtr);
599  return {*MOutBufPtr, CGH};
600  }
601 
603  template <bool IsOneWG, bool _IsUSM = is_usm>
604  std::enable_if_t<IsOneWG && _IsUSM, result_type *>
606  return getUSMPointer();
607  }
608 
612  template <bool IsOneWG, bool _IsUSM = is_usm>
613  std::enable_if_t<IsOneWG && !_IsUSM, rw_accessor_type>
615  if (MRWAcc)
616  return *MRWAcc;
617  return getWriteMemForPartialReds<false>(1, CGH);
618  }
619 
622  template <bool IsOneWG>
623  std::enable_if_t<!IsOneWG, rw_accessor_type>
624  getWriteMemForPartialReds(size_t Size, handler &CGH) {
625  MOutBufPtr = std::make_shared<buffer<T, buffer_dim>>(range<1>(Size));
626  CGH.addReduction(MOutBufPtr);
627  return createHandlerWiredReadWriteAccessor(CGH, *MOutBufPtr);
628  }
629 
637  if (Size == 1 && MRWAcc != nullptr) {
639  return *MRWAcc;
640  }
641 
642  // Create a new output buffer and return an accessor to it.
643  MOutBufPtr = std::make_shared<buffer<T, buffer_dim>>(range<1>(Size));
644  CGH.addReduction(MOutBufPtr);
645  return createHandlerWiredReadWriteAccessor(CGH, *MOutBufPtr);
646  }
647 
652  template <bool HasFastAtomics = (has_fast_atomics || has_atomic_add_float64)>
653  std::enable_if_t<HasFastAtomics, rw_accessor_type>
655  if (!is_usm && !base::initializeToIdentity())
656  return *MRWAcc;
657 
658  // TODO: Move to T[] in C++20 to simplify handling here
659  // auto RWReduVal = std::make_shared<T[num_elements]>();
660  auto RWReduVal = std::make_shared<std::array<T, num_elements>>();
661  for (int i = 0; i < num_elements; ++i) {
662  (*RWReduVal)[i] = base::getIdentity();
663  }
664  CGH.addReduction(RWReduVal);
665  MOutBufPtr = std::make_shared<buffer<T, 1>>(RWReduVal.get()->data(),
666  range<1>(num_elements));
667  MOutBufPtr->set_final_data();
668  CGH.addReduction(MOutBufPtr);
669  return createHandlerWiredReadWriteAccessor(CGH, *MOutBufPtr);
670  }
671 
672  accessor<int, 1, access::mode::read_write, access::target::device,
673  access::placeholder::false_t>
675  auto CounterMem = std::make_shared<int>(0);
676  CGH.addReduction(CounterMem);
677  auto CounterBuf = std::make_shared<buffer<int, 1>>(CounterMem.get(), 1);
678  CounterBuf->set_final_data();
679  CGH.addReduction(CounterBuf);
680  return {*CounterBuf, CGH};
681  }
682 
683  bool hasUserDiscardWriteAccessor() { return MDWAcc != nullptr; }
684 
685  template <bool _IsUSM = IsUSM>
686  std::enable_if_t<!_IsUSM, rw_accessor_type &> getUserReadWriteAccessor() {
687  return *MRWAcc;
688  }
689 
690  template <bool _IsUSM = IsUSM>
691  std::enable_if_t<!_IsUSM, dw_accessor_type &> getUserDiscardWriteAccessor() {
692  return *MDWAcc;
693  }
694 
696  assert(is_usm && "Unexpected call of getUSMPointer().");
697  return MUSMPointer;
698  }
699 
700  static inline result_type *getOutPointer(const rw_accessor_type &OutAcc) {
701  return OutAcc.get_pointer().get();
702  }
703 
704  static inline result_type *getOutPointer(result_type *OutPtr) {
705  return OutPtr;
706  }
707 
708 private:
709  template <typename BufferT, access::placeholder IsPH = IsPlaceholder>
710  std::enable_if_t<IsPH == access::placeholder::false_t, rw_accessor_type>
711  createHandlerWiredReadWriteAccessor(handler &CGH, BufferT Buffer) {
712  return {Buffer, CGH};
713  }
714 
715  template <typename BufferT, access::placeholder IsPH = IsPlaceholder>
716  std::enable_if_t<IsPH == access::placeholder::true_t, rw_accessor_type>
717  createHandlerWiredReadWriteAccessor(handler &CGH, BufferT Buffer) {
718  rw_accessor_type Acc(Buffer);
719  CGH.require(Acc);
720  return Acc;
721  }
722 
724  std::shared_ptr<rw_accessor_type> MRWAcc;
725  std::shared_ptr<dw_accessor_type> MDWAcc;
726 
727  std::shared_ptr<buffer<T, buffer_dim>> MOutBufPtr;
728 
731  T *MUSMPointer = nullptr;
732 };
733 
736 template <typename FirstT, typename... RestT> struct AreAllButLastReductions {
737  static constexpr bool value =
738  std::is_base_of<reduction_impl_base,
739  std::remove_reference_t<FirstT>>::value &&
740  AreAllButLastReductions<RestT...>::value;
741 };
742 
745 template <typename T> struct AreAllButLastReductions<T> {
746  static constexpr bool value =
747  !std::is_base_of<reduction_impl_base, std::remove_reference_t<T>>::value;
748 };
749 
752 template <typename T, class BinaryOperation, int Dims, size_t Extent,
753  class Algorithm>
755  : private reduction_impl_base,
756  public reduction_impl_algo<T, BinaryOperation, Dims, Extent, Algorithm> {
757 private:
759 
760 public:
761  using reducer_type = typename algo::reducer_type;
762  using rw_accessor_type = typename algo::rw_accessor_type;
763  using dw_accessor_type = typename algo::dw_accessor_type;
764 
765  // Only scalar and 1D array reductions are supported by SYCL 2020.
766  static_assert(Dims <= 1, "Multi-dimensional reductions are not supported.");
767 
770  template <typename _T, typename AllocatorT,
771  std::enable_if_t<IsKnownIdentityOp<_T, BinaryOperation>::value> * =
772  nullptr>
775  : algo(reducer_type::getIdentity(), BinaryOperation(),
776  InitializeToIdentity, std::make_shared<rw_accessor_type>(Buffer)) {
778  if (Buffer.size() != 1)
779  throw sycl::runtime_error(errc::invalid,
780  "Reduction variable must be a scalar.",
782  }
783 
785  template <
786  typename _T = T,
789  : algo(reducer_type::getIdentity(), BinaryOperation(), false,
790  std::make_shared<rw_accessor_type>(Acc)) {
791  if (Acc.size() != 1)
792  throw sycl::runtime_error(errc::invalid,
793  "Reduction variable must be a scalar.",
795  }
796 
798  template <
799  typename _T = T,
802  : algo(reducer_type::getIdentity(), BinaryOperation(), true,
803  std::make_shared<dw_accessor_type>(Acc)) {
804  if (Acc.size() != 1)
805  throw sycl::runtime_error(errc::invalid,
806  "Reduction variable must be a scalar.",
808  }
809 
813  template <
814  typename _T, typename AllocatorT,
817  const T & /*Identity*/, BinaryOperation,
819  : algo(reducer_type::getIdentity(), BinaryOperation(),
820  InitializeToIdentity, std::make_shared<rw_accessor_type>(Buffer)) {
822  if (Buffer.size() != 1)
823  throw sycl::runtime_error(errc::invalid,
824  "Reduction variable must be a scalar.",
826  // For now the implementation ignores the identity value given by user
827  // when the implementation knows the identity.
828  // The SPEC could prohibit passing identity parameter to operations with
829  // known identity, but that could have some bad consequences too.
830  // For example, at some moment the implementation may NOT know the identity
831  // for COMPLEX-PLUS reduction. User may create a program that would pass
832  // COMPLEX value (0,0) as identity for PLUS reduction. At some later moment
833  // when the implementation starts handling COMPLEX-PLUS as known operation
834  // the existing user's program remains compilable and working correctly.
835  // I.e. with this constructor here, adding more reduction operations to the
836  // list of known operations does not break the existing programs.
837  }
838 
841  template <
842  typename _T = T,
844  reduction_impl(rw_accessor_type &Acc, const T & /*Identity*/, BinaryOperation)
845  : algo(reducer_type::getIdentity(), BinaryOperation(), false,
846  std::make_shared<rw_accessor_type>(Acc)) {
847  if (Acc.size() != 1)
848  throw sycl::runtime_error(errc::invalid,
849  "Reduction variable must be a scalar.",
851  // For now the implementation ignores the identity value given by user
852  // when the implementation knows the identity.
853  // The SPEC could prohibit passing identity parameter to operations with
854  // known identity, but that could have some bad consequences too.
855  // For example, at some moment the implementation may NOT know the identity
856  // for COMPLEX-PLUS reduction. User may create a program that would pass
857  // COMPLEX value (0,0) as identity for PLUS reduction. At some later moment
858  // when the implementation starts handling COMPLEX-PLUS as known operation
859  // the existing user's program remains compilable and working correctly.
860  // I.e. with this constructor here, adding more reduction operations to the
861  // list of known operations does not break the existing programs.
862  }
863 
866  template <
867  typename _T = T,
869  reduction_impl(dw_accessor_type &Acc, const T & /*Identity*/, BinaryOperation)
870  : algo(reducer_type::getIdentity(), BinaryOperation(), true,
871  std::make_shared<dw_accessor_type>(Acc)) {
872  if (Acc.size() != 1)
873  throw sycl::runtime_error(errc::invalid,
874  "Reduction variable must be a scalar.",
876  // For now the implementation ignores the identity value given by user
877  // when the implementation knows the identity.
878  // The SPEC could prohibit passing identity parameter to operations with
879  // known identity, but that could have some bad consequences too.
880  // For example, at some moment the implementation may NOT know the identity
881  // for COMPLEX-PLUS reduction. User may create a program that would pass
882  // COMPLEX value (0,0) as identity for PLUS reduction. At some later moment
883  // when the implementation starts handling COMPLEX-PLUS as known operation
884  // the existing user's program remains compilable and working correctly.
885  // I.e. with this constructor here, adding more reduction operations to the
886  // list of known operations does not break the existing programs.
887  }
888 
891  template <
892  typename _T, typename AllocatorT,
895  const T &Identity, BinaryOperation BOp,
897  : algo(Identity, BOp, InitializeToIdentity,
898  std::make_shared<rw_accessor_type>(Buffer)) {
900  if (Buffer.size() != 1)
901  throw sycl::runtime_error(errc::invalid,
902  "Reduction variable must be a scalar.",
904  }
905 
907  template <
908  typename _T = T,
910  reduction_impl(rw_accessor_type &Acc, const T &Identity, BinaryOperation BOp)
911  : algo(Identity, BOp, false, std::make_shared<rw_accessor_type>(Acc)) {
912  if (Acc.size() != 1)
913  throw sycl::runtime_error(errc::invalid,
914  "Reduction variable must be a scalar.",
916  }
917 
919  template <
920  typename _T = T,
922  reduction_impl(dw_accessor_type &Acc, const T &Identity, BinaryOperation BOp)
923  : algo(Identity, BOp, true, std::make_shared<dw_accessor_type>(Acc)) {
924  if (Acc.size() != 1)
925  throw sycl::runtime_error(errc::invalid,
926  "Reduction variable must be a scalar.",
928  }
929 
934  template <
935  typename _T = T,
937  reduction_impl(T *VarPtr, bool InitializeToIdentity = false)
938  : algo(reducer_type::getIdentity(), BinaryOperation(),
939  InitializeToIdentity, VarPtr) {}
940 
946  template <
947  typename _T = T,
949  reduction_impl(T *VarPtr, const T &Identity, BinaryOperation,
950  bool InitializeToIdentity = false)
951  : algo(Identity, BinaryOperation(), InitializeToIdentity, VarPtr) {
952  // For now the implementation ignores the identity value given by user
953  // when the implementation knows the identity.
954  // The SPEC could prohibit passing identity parameter to operations with
955  // known identity, but that could have some bad consequences too.
956  // For example, at some moment the implementation may NOT know the identity
957  // for COMPLEX-PLUS reduction. User may create a program that would pass
958  // COMPLEX value (0,0) as identity for PLUS reduction. At some later moment
959  // when the implementation starts handling COMPLEX-PLUS as known operation
960  // the existing user's program remains compilable and working correctly.
961  // I.e. with this constructor here, adding more reduction operations to the
962  // list of known operations does not break the existing programs.
963  }
964 
969  template <
970  typename _T = T,
972  reduction_impl(T *VarPtr, const T &Identity, BinaryOperation BOp,
973  bool InitializeToIdentity = false)
974  : algo(Identity, BOp, InitializeToIdentity, VarPtr) {}
975 
976 #if __cplusplus >= 201703L
977  template <
979  typename _T = T,
981  reduction_impl(span<_T, Extent> Span, bool InitializeToIdentity = false)
982  : algo(reducer_type::getIdentity(), BinaryOperation(),
983  InitializeToIdentity, Span.data()) {}
984 
987  template <
988  typename _T = T,
990  reduction_impl(span<_T, Extent> Span, const T & /* Identity */,
991  BinaryOperation BOp, bool InitializeToIdentity = false)
992  : algo(reducer_type::getIdentity(), BOp, InitializeToIdentity,
993  Span.data()) {}
994 
996  template <
997  typename _T = T,
999  reduction_impl(span<T, Extent> Span, const T &Identity, BinaryOperation BOp,
1000  bool InitializeToIdentity = false)
1001  : algo(Identity, BOp, InitializeToIdentity, Span.data()) {}
1002 #endif
1003 };
1004 
1008 template <typename T1, bool B1, bool B2, typename T2>
1010 template <typename T1, bool B1, bool B2, typename T2>
1012 
1017 template <typename Name, typename Type, bool B1, bool B2, typename T3 = void>
1020 };
1021 template <typename Type, bool B1, bool B2, typename T3>
1023  T3> {
1025 };
1026 template <typename Name, typename Type, bool B1, bool B2, typename T3>
1029 };
1030 template <typename Type, bool B1, bool B2, typename T3>
1032  T3> {
1034 };
1035 
1041 template <typename KernelFunc, int Dims, typename ReducerT>
1042 void reductionLoop(const range<Dims> &Range, ReducerT &Reducer,
1043  const nd_item<1> &NdId, KernelFunc &F) {
1044  size_t Start = NdId.get_global_id(0);
1045  size_t End = Range.size();
1046  size_t Stride = NdId.get_global_range(0);
1047  for (size_t I = Start; I < End; I += Stride)
1048  F(sycl::detail::getDelinearizedId(Range, I), Reducer);
1049 }
1050 
1051 template <typename KernelName, typename KernelType, int Dims, class Reduction>
1052 std::enable_if_t<Reduction::has_fast_atomics>
1053 reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const range<Dims> &Range,
1054  const nd_range<1> &NDRange, Reduction &Redu) {
1055  constexpr size_t NElements = Reduction::num_elements;
1056  auto Out = Redu.getReadWriteAccessorToInitializedMem(CGH);
1057  auto GroupSum = Reduction::getReadWriteLocalAcc(NElements, CGH);
1058  using Name =
1059  typename get_reduction_main_kernel_name_t<KernelName, KernelType,
1060  Reduction::is_usm, false>::name;
1061  CGH.parallel_for<Name>(NDRange, [=](nd_item<1> NDId) {
1062  // Call user's functions. Reducer.MValue gets initialized there.
1063  typename Reduction::reducer_type Reducer;
1064  reductionLoop(Range, Reducer, NDId, KernelFunc);
1065 
1066  // Work-group cooperates to initialize multiple reduction variables
1067  auto LID = NDId.get_local_id(0);
1068  for (size_t E = LID; E < NElements; E += NDId.get_local_range(0)) {
1069  GroupSum[E] = Reducer.getIdentity();
1070  }
1072 
1073  // Each work-item has its own reducer to combine
1074  Reducer.template atomic_combine<access::address_space::local_space>(
1075  &GroupSum[0]);
1076 
1077  // Single work-item performs finalization for entire work-group
1078  // TODO: Opportunity to parallelize across elements
1080  if (LID == 0) {
1081  for (size_t E = 0; E < NElements; ++E) {
1082  Reducer.getElement(E) = GroupSum[E];
1083  }
1084  Reducer.template atomic_combine(Reduction::getOutPointer(Out));
1085  }
1086  });
1087 }
1088 
1089 template <typename KernelName, typename KernelType, int Dims, class Reduction>
1090 std::enable_if_t<!Reduction::has_fast_atomics && Reduction::has_fast_reduce>
1091 reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const range<Dims> &Range,
1092  const nd_range<1> &NDRange, Reduction &Redu) {
1093  constexpr size_t NElements = Reduction::num_elements;
1094  size_t WGSize = NDRange.get_local_range().size();
1095  size_t NWorkGroups = NDRange.get_group_range().size();
1096 
1097  bool IsUpdateOfUserVar = !Reduction::is_usm && !Redu.initializeToIdentity();
1098  auto PartialSums =
1099  Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);
1100  auto Out = (NWorkGroups == 1)
1101  ? PartialSums
1102  : Redu.getWriteAccForPartialReds(NElements, CGH);
1103  auto NWorkGroupsFinished =
1104  Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH);
1105  auto DoReducePartialSumsInLastWG =
1106  Reduction::template getReadWriteLocalAcc<int>(1, CGH);
1107 
1108  using Name =
1109  typename get_reduction_main_kernel_name_t<KernelName, KernelType,
1110  Reduction::is_usm, false>::name;
1111  CGH.parallel_for<Name>(NDRange, [=](nd_item<1> NDId) {
1112  // Call user's functions. Reducer.MValue gets initialized there.
1113  typename Reduction::reducer_type Reducer;
1114  reductionLoop(Range, Reducer, NDId, KernelFunc);
1115 
1116  typename Reduction::binary_operation BOp;
1117  auto Group = NDId.get_group();
1118 
1119  // If there are multiple values, reduce each separately
1120  // reduce_over_group is only defined for each T, not for span<T, ...>
1121  size_t LID = NDId.get_local_id(0);
1122  for (int E = 0; E < NElements; ++E) {
1123  Reducer.getElement(E) =
1124  reduce_over_group(Group, Reducer.getElement(E), BOp);
1125 
1126  if (LID == 0) {
1127  if (NWorkGroups == 1 && IsUpdateOfUserVar)
1128  Reducer.getElement(E) =
1129  BOp(Reducer.getElement(E), Reduction::getOutPointer(Out)[E]);
1130 
1131  // if NWorkGroups == 1, then PartialsSum and Out point to same memory.
1132  Reduction::getOutPointer(
1133  PartialSums)[NDId.get_group_linear_id() * NElements + E] =
1134  Reducer.getElement(E);
1135  }
1136  }
1137 
1138  // Signal this work-group has finished after all values are reduced
1139  if (LID == 0) {
1140  auto NFinished =
1141  atomic_ref<int, memory_order::relaxed, memory_scope::device,
1142  access::address_space::global_space>(
1143  NWorkGroupsFinished[0]);
1144  DoReducePartialSumsInLastWG[0] =
1145  ++NFinished == NWorkGroups && NWorkGroups > 1;
1146  }
1147 
1149  if (DoReducePartialSumsInLastWG[0]) {
1150  // Reduce each result separately
1151  // TODO: Opportunity to parallelize across elements
1152  for (int E = 0; E < NElements; ++E) {
1153  auto LocalSum = Reducer.getIdentity();
1154  for (size_t I = LID; I < NWorkGroups; I += WGSize)
1155  LocalSum = BOp(LocalSum, PartialSums[I * NElements + E]);
1156  Reducer.getElement(E) = reduce_over_group(Group, LocalSum, BOp);
1157 
1158  if (LID == 0) {
1159  if (IsUpdateOfUserVar)
1160  Reducer.getElement(E) =
1161  BOp(Reducer.getElement(E), Reduction::getOutPointer(Out)[E]);
1162  Reduction::getOutPointer(Out)[E] = Reducer.getElement(E);
1163  }
1164  }
1165  }
1166  });
1167 }
1168 
1169 template <typename KernelName, typename KernelType, int Dims, class Reduction>
1170 std::enable_if_t<!Reduction::has_fast_atomics && !Reduction::has_fast_reduce>
1171 reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const range<Dims> &Range,
1172  const nd_range<1> &NDRange, Reduction &Redu) {
1173  constexpr size_t NElements = Reduction::num_elements;
1174  size_t WGSize = NDRange.get_local_range().size();
1175  size_t NWorkGroups = NDRange.get_group_range().size();
1176 
1177  bool IsUpdateOfUserVar = !Reduction::is_usm && !Redu.initializeToIdentity();
1178  auto PartialSums =
1179  Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);
1180  auto Out = (NWorkGroups == 1)
1181  ? PartialSums
1182  : Redu.getWriteAccForPartialReds(NElements, CGH);
1183  auto LocalReds = Reduction::getReadWriteLocalAcc(WGSize + 1, CGH);
1184  auto NWorkGroupsFinished =
1185  Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH);
1186  auto DoReducePartialSumsInLastWG =
1187  Reduction::template getReadWriteLocalAcc<int>(1, CGH);
1188 
1189  auto Identity = Redu.getIdentity();
1190  auto BOp = Redu.getBinaryOperation();
1191  using Name =
1192  typename get_reduction_main_kernel_name_t<KernelName, KernelType,
1193  Reduction::is_usm, false>::name;
1194  CGH.parallel_for<Name>(NDRange, [=](nd_item<1> NDId) {
1195  // Call user's functions. Reducer.MValue gets initialized there.
1196  typename Reduction::reducer_type Reducer(Identity, BOp);
1197  reductionLoop(Range, Reducer, NDId, KernelFunc);
1198 
1199  // If there are multiple values, reduce each separately
1200  // This prevents local memory from scaling with elements
1201  size_t LID = NDId.get_local_linear_id();
1202  for (int E = 0; E < NElements; ++E) {
1203 
1204  // Copy the element to local memory to prepare it for tree-reduction.
1205  LocalReds[LID] = Reducer.getElement(E);
1206  if (LID == 0)
1207  LocalReds[WGSize] = Identity;
1209 
1210  // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0].
1211  // LocalReds[WGSize] accumulates last/odd elements when the step
1212  // of tree-reduction loop is not even.
1213  size_t PrevStep = WGSize;
1214  for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
1215  if (LID < CurStep)
1216  LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
1217  else if (LID == CurStep && (PrevStep & 0x1))
1218  LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
1220  PrevStep = CurStep;
1221  }
1222 
1223  if (LID == 0) {
1224  auto V = BOp(LocalReds[0], LocalReds[WGSize]);
1225  if (NWorkGroups == 1 && IsUpdateOfUserVar)
1226  V = BOp(V, Reduction::getOutPointer(Out)[E]);
1227  // if NWorkGroups == 1, then PartialsSum and Out point to same memory.
1228  Reduction::getOutPointer(
1229  PartialSums)[NDId.get_group_linear_id() * NElements + E] = V;
1230  }
1231  }
1232 
1233  // Signal this work-group has finished after all values are reduced
1234  if (LID == 0) {
1235  auto NFinished =
1236  atomic_ref<int, memory_order::relaxed, memory_scope::device,
1237  access::address_space::global_space>(
1238  NWorkGroupsFinished[0]);
1239  DoReducePartialSumsInLastWG[0] =
1240  ++NFinished == NWorkGroups && NWorkGroups > 1;
1241  }
1242 
1244  if (DoReducePartialSumsInLastWG[0]) {
1245  // Reduce each result separately
1246  // TODO: Opportunity to parallelize across elements
1247  for (int E = 0; E < NElements; ++E) {
1248  auto LocalSum = Identity;
1249  for (size_t I = LID; I < NWorkGroups; I += WGSize)
1250  LocalSum =
1251  BOp(LocalSum,
1252  Reduction::getOutPointer(PartialSums)[I * NElements + E]);
1253 
1254  LocalReds[LID] = LocalSum;
1255  if (LID == 0)
1256  LocalReds[WGSize] = Identity;
1258 
1259  size_t PrevStep = WGSize;
1260  for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
1261  if (LID < CurStep)
1262  LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
1263  else if (LID == CurStep && (PrevStep & 0x1))
1264  LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
1266  PrevStep = CurStep;
1267  }
1268  if (LID == 0) {
1269  auto V = BOp(LocalReds[0], LocalReds[WGSize]);
1270  if (IsUpdateOfUserVar)
1271  V = BOp(V, Reduction::getOutPointer(Out)[E]);
1272  Reduction::getOutPointer(Out)[E] = V;
1273  }
1274  }
1275  }
1276  });
1277 }
1278 
1279 template <typename KernelName, typename KernelType, int Dims, class Reduction>
1280 void reduCGFunc(handler &CGH, KernelType KernelFunc, const range<Dims> &Range,
1281  size_t MaxWGSize, uint32_t NumConcurrentWorkGroups,
1282  Reduction &Redu) {
1283  size_t NWorkItems = Range.size();
1284  size_t WGSize = std::min(NWorkItems, MaxWGSize);
1285  size_t NWorkGroups = NWorkItems / WGSize;
1286  if (NWorkItems % WGSize)
1287  NWorkGroups++;
1288  size_t MaxNWorkGroups = NumConcurrentWorkGroups;
1289  NWorkGroups = std::min(NWorkGroups, MaxNWorkGroups);
1290  size_t NDRItems = NWorkGroups * WGSize;
1291  nd_range<1> NDRange{range<1>{NDRItems}, range<1>{WGSize}};
1292 
1293  reduCGFuncImpl<KernelName>(CGH, KernelFunc, Range, NDRange, Redu);
1294 }
1295 
1305 template <typename KernelName, typename KernelType, int Dims, class Reduction,
1306  bool IsPow2WG>
1308 reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
1309  Reduction &, typename Reduction::rw_accessor_type Out) {
1310  constexpr size_t NElements = Reduction::num_elements;
1311  using Name = typename get_reduction_main_kernel_name_t<
1312  KernelName, KernelType, Reduction::is_usm, IsPow2WG>::name;
1313  CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
1314  // Call user's function. Reducer.MValue gets initialized there.
1315  typename Reduction::reducer_type Reducer;
1316  KernelFunc(NDIt, Reducer);
1317 
1318  typename Reduction::binary_operation BOp;
1319  for (int E = 0; E < NElements; ++E) {
1320  Reducer.getElement(E) =
1321  ext::oneapi::reduce(NDIt.get_group(), Reducer.getElement(E), BOp);
1322  }
1323  if (NDIt.get_local_linear_id() == 0)
1324  Reducer.atomic_combine(Reduction::getOutPointer(Out));
1325  });
1326 }
1327 
1336 template <typename KernelName, typename KernelType, int Dims, class Reduction,
1337  bool IsPow2WG>
1339 reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
1340  Reduction &, typename Reduction::rw_accessor_type Out) {
1341  constexpr size_t NElements = Reduction::num_elements;
1342  size_t WGSize = Range.get_local_range().size();
1343 
1344  // Use local memory to reduce elements in work-groups into zero-th element.
1345  // If WGSize is not power of two, then WGSize+1 elements are allocated.
1346  // The additional last element is used to catch reduce elements that could
1347  // otherwise be lost in the tree-reduction algorithm used in the kernel.
1348  size_t NLocalElements = WGSize + (IsPow2WG ? 0 : 1);
1349  auto LocalReds = Reduction::getReadWriteLocalAcc(NLocalElements, CGH);
1350 
1351  using Name = typename get_reduction_main_kernel_name_t<
1352  KernelName, KernelType, Reduction::is_usm, IsPow2WG>::name;
1353  CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
1354  // Call user's functions. Reducer.MValue gets initialized there.
1355  typename Reduction::reducer_type Reducer;
1356  KernelFunc(NDIt, Reducer);
1357 
1358  size_t WGSize = NDIt.get_local_range().size();
1359  size_t LID = NDIt.get_local_linear_id();
1360 
1361  // If there are multiple values, reduce each separately
1362  // This prevents local memory from scaling with elements
1363  for (int E = 0; E < NElements; ++E) {
1364 
1365  // Copy the element to local memory to prepare it for tree-reduction.
1366  LocalReds[LID] = Reducer.getElement(E);
1367  if (!IsPow2WG)
1368  LocalReds[WGSize] = Reducer.getIdentity();
1369  NDIt.barrier();
1370 
1371  // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0].
1372  // LocalReds[WGSize] accumulates last/odd elements when the step
1373  // of tree-reduction loop is not even.
1374  typename Reduction::binary_operation BOp;
1375  size_t PrevStep = WGSize;
1376  for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
1377  if (LID < CurStep)
1378  LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
1379  else if (!IsPow2WG && LID == CurStep && (PrevStep & 0x1))
1380  LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
1381  NDIt.barrier();
1382  PrevStep = CurStep;
1383  }
1384 
1385  if (LID == 0) {
1386  Reducer.getElement(E) =
1387  IsPow2WG ? LocalReds[0] : BOp(LocalReds[0], LocalReds[WGSize]);
1388  }
1389 
1390  // Ensure item 0 is finished with LocalReds before next iteration
1391  if (E != NElements - 1) {
1392  NDIt.barrier();
1393  }
1394  }
1395 
1396  if (LID == 0) {
1397  Reducer.atomic_combine(Reduction::getOutPointer(Out));
1398  }
1399 
1400  });
1401 }
1402 
1403 template <typename KernelName, typename KernelType, int Dims, class Reduction>
1405 reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
1406  Reduction &Redu) {
1407 
1408  size_t WGSize = Range.get_local_range().size();
1409 
1410  // User's initialized read-write accessor is re-used here if
1411  // initialize_to_identity is not set (i.e. if user's variable is initialized).
1412  // Otherwise, a new buffer is initialized with identity value and a new
1413  // read-write accessor to that buffer is created. That is done because
1414  // atomic operations update some initialized memory.
1415  // User's USM pointer is not re-used even when initialize_to_identity is not
1416  // set because it does not worth the creation of an additional variant of
1417  // a user's kernel for that case.
1418  auto Out = Redu.getReadWriteAccessorToInitializedMem(CGH);
1419 
1420  // If the work group size is not pow of 2, then the kernel runs some
1421  // additional code and checks in it.
1422  // If the reduction has fast reduce then the kernel does not care if the work
1423  // group size is pow of 2 or not, assume true for such cases.
1424  bool IsPow2WG = Reduction::has_fast_reduce || ((WGSize & (WGSize - 1)) == 0);
1425  if (IsPow2WG)
1426  reduCGFuncImpl<KernelName, KernelType, Dims, Reduction, true>(
1427  CGH, KernelFunc, Range, Redu, Out);
1428  else
1429  reduCGFuncImpl<KernelName, KernelType, Dims, Reduction, false>(
1430  CGH, KernelFunc, Range, Redu, Out);
1431 }
1432 
1441 template <typename KernelName, typename KernelType, int Dims, class Reduction,
1442  bool IsPow2WG>
1444 reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
1445  Reduction &Redu, typename Reduction::rw_accessor_type Out) {
1446  constexpr size_t NElements = Reduction::num_elements;
1447  size_t NWorkGroups = Range.get_group_range().size();
1448  bool IsUpdateOfUserVar =
1449  !Reduction::is_usm && !Redu.initializeToIdentity() && NWorkGroups == 1;
1450 
1451  using Name = typename get_reduction_main_kernel_name_t<
1452  KernelName, KernelType, Reduction::is_usm, IsPow2WG>::name;
1453  CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
1454  // Call user's functions. Reducer.MValue gets initialized there.
1455  typename Reduction::reducer_type Reducer;
1456  KernelFunc(NDIt, Reducer);
1457 
1458  // Compute the partial sum/reduction for the work-group.
1459  size_t WGID = NDIt.get_group_linear_id();
1460  typename Reduction::binary_operation BOp;
1461  for (int E = 0; E < NElements; ++E) {
1462  typename Reduction::result_type PSum;
1463  PSum = Reducer.getElement(E);
1464  PSum = ext::oneapi::reduce(NDIt.get_group(), PSum, BOp);
1465  if (NDIt.get_local_linear_id() == 0) {
1466  if (IsUpdateOfUserVar)
1467  PSum = BOp(Reduction::getOutPointer(Out)[E], PSum);
1468  Reduction::getOutPointer(Out)[WGID * NElements + E] = PSum;
1469  }
1470  }
1471  });
1472 }
1473 
1482 template <typename KernelName, typename KernelType, int Dims, class Reduction,
1483  bool IsPow2WG>
1485 reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
1486  Reduction &Redu, typename Reduction::rw_accessor_type Out) {
1487  constexpr size_t NElements = Reduction::num_elements;
1488  size_t WGSize = Range.get_local_range().size();
1489  size_t NWorkGroups = Range.get_group_range().size();
1490 
1491  bool IsUpdateOfUserVar =
1492  !Reduction::is_usm && !Redu.initializeToIdentity() && NWorkGroups == 1;
1493 
1494  // Use local memory to reduce elements in work-groups into 0-th element.
1495  // If WGSize is not power of two, then WGSize+1 elements are allocated.
1496  // The additional last element is used to catch elements that could
1497  // otherwise be lost in the tree-reduction algorithm.
1498  size_t NumLocalElements = WGSize + (IsPow2WG ? 0 : 1);
1499  auto LocalReds = Reduction::getReadWriteLocalAcc(NumLocalElements, CGH);
1500  typename Reduction::result_type ReduIdentity = Redu.getIdentity();
1501  using Name = typename get_reduction_main_kernel_name_t<
1502  KernelName, KernelType, Reduction::is_usm, IsPow2WG>::name;
1503  auto BOp = Redu.getBinaryOperation();
1504  CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
1505  // Call user's functions. Reducer.MValue gets initialized there.
1506  typename Reduction::reducer_type Reducer(ReduIdentity, BOp);
1507  KernelFunc(NDIt, Reducer);
1508 
1509  size_t WGSize = NDIt.get_local_range().size();
1510  size_t LID = NDIt.get_local_linear_id();
1511 
1512  // If there are multiple values, reduce each separately
1513  // This prevents local memory from scaling with elements
1514  for (int E = 0; E < NElements; ++E) {
1515 
1516  // Copy the element to local memory to prepare it for tree-reduction.
1517  LocalReds[LID] = Reducer.getElement(E);
1518  if (!IsPow2WG)
1519  LocalReds[WGSize] = ReduIdentity;
1520  NDIt.barrier();
1521 
1522  // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]
1523  // LocalReds[WGSize] accumulates last/odd elements when the step
1524  // of tree-reduction loop is not even.
1525  size_t PrevStep = WGSize;
1526  for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
1527  if (LID < CurStep)
1528  LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
1529  else if (!IsPow2WG && LID == CurStep && (PrevStep & 0x1))
1530  LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
1531  NDIt.barrier();
1532  PrevStep = CurStep;
1533  }
1534 
1535  // Compute the partial sum/reduction for the work-group.
1536  if (LID == 0) {
1537  size_t GrID = NDIt.get_group_linear_id();
1538  typename Reduction::result_type PSum =
1539  IsPow2WG ? LocalReds[0] : BOp(LocalReds[0], LocalReds[WGSize]);
1540  if (IsUpdateOfUserVar)
1541  PSum = BOp(*(Reduction::getOutPointer(Out)), PSum);
1542  Reduction::getOutPointer(Out)[GrID * NElements + E] = PSum;
1543  }
1544 
1545  // Ensure item 0 is finished with LocalReds before next iteration
1546  if (E != NElements - 1) {
1547  NDIt.barrier();
1548  }
1549  }
1550  });
1551 }
1552 
1553 template <typename KernelName, typename KernelType, int Dims, class Reduction>
1555 reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
1556  Reduction &Redu) {
1557  constexpr size_t NElements = Reduction::num_elements;
1558  size_t WGSize = Range.get_local_range().size();
1559  size_t NWorkGroups = Range.get_group_range().size();
1560 
1561  // If the work group size is not pow of 2, then the kernel runs some
1562  // additional code and checks in it.
1563  // If the reduction has fast reduce then the kernel does not care if the work
1564  // group size is pow of 2 or not, assume true for such cases.
1565  bool IsPow2WG = Reduction::has_fast_reduce || ((WGSize & (WGSize - 1)) == 0);
1566 
1567  auto Out = Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);
1568  if (IsPow2WG)
1569  reduCGFuncImpl<KernelName, KernelType, Dims, Reduction, true>(
1570  CGH, KernelFunc, Range, Redu, Out);
1571  else
1572  reduCGFuncImpl<KernelName, KernelType, Dims, Reduction, false>(
1573  CGH, KernelFunc, Range, Redu, Out);
1574 }
1575 
1584 template <typename KernelName, typename KernelType, bool UniformWG,
1585  class Reduction, typename InputT, typename OutputT>
1587 reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
1588  size_t WGSize, Reduction &Redu, InputT In, OutputT Out) {
1589  constexpr size_t NElements = Reduction::num_elements;
1590  using Name = typename get_reduction_aux_kernel_name_t<
1591  KernelName, KernelType, Reduction::is_usm, UniformWG, OutputT>::name;
1592  bool IsUpdateOfUserVar =
1593  !Reduction::is_usm && !Redu.initializeToIdentity() && NWorkGroups == 1;
1594  range<1> GlobalRange = {UniformWG ? NWorkItems : NWorkGroups * WGSize};
1595  nd_range<1> Range{GlobalRange, range<1>(WGSize)};
1596  CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
1597  typename Reduction::binary_operation BOp;
1598  size_t WGID = NDIt.get_group_linear_id();
1599  size_t GID = NDIt.get_global_linear_id();
1600 
1601  for (int E = 0; E < NElements; ++E) {
1602  typename Reduction::result_type PSum =
1603  (UniformWG || (GID < NWorkItems))
1604  ? In[GID * NElements + E]
1605  : Reduction::reducer_type::getIdentity();
1606  PSum = ext::oneapi::reduce(NDIt.get_group(), PSum, BOp);
1607  if (NDIt.get_local_linear_id() == 0) {
1608  if (IsUpdateOfUserVar)
1609  PSum = BOp(Reduction::getOutPointer(Out)[E], PSum);
1610  Reduction::getOutPointer(Out)[WGID * NElements + E] = PSum;
1611  }
1612  }
1613  });
1614 }
1615 
1623 template <typename KernelName, typename KernelType, bool UniformPow2WG,
1624  class Reduction, typename InputT, typename OutputT>
1626 reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
1627  size_t WGSize, Reduction &Redu, InputT In, OutputT Out) {
1628  constexpr size_t NElements = Reduction::num_elements;
1629  bool IsUpdateOfUserVar =
1630  !Reduction::is_usm && !Redu.initializeToIdentity() && NWorkGroups == 1;
1631 
1632  // Use local memory to reduce elements in work-groups into 0-th element.
1633  // If WGSize is not power of two, then WGSize+1 elements are allocated.
1634  // The additional last element is used to catch elements that could
1635  // otherwise be lost in the tree-reduction algorithm.
1636  size_t NumLocalElements = WGSize + (UniformPow2WG ? 0 : 1);
1637  auto LocalReds = Reduction::getReadWriteLocalAcc(NumLocalElements, CGH);
1638 
1639  auto ReduIdentity = Redu.getIdentity();
1640  auto BOp = Redu.getBinaryOperation();
1641  using Name = typename get_reduction_aux_kernel_name_t<
1642  KernelName, KernelType, Reduction::is_usm, UniformPow2WG, OutputT>::name;
1643  range<1> GlobalRange = {UniformPow2WG ? NWorkItems : NWorkGroups * WGSize};
1644  nd_range<1> Range{GlobalRange, range<1>(WGSize)};
1645  CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
1646  size_t WGSize = NDIt.get_local_range().size();
1647  size_t LID = NDIt.get_local_linear_id();
1648  size_t GID = NDIt.get_global_linear_id();
1649 
1650  for (int E = 0; E < NElements; ++E) {
1651  // Copy the element to local memory to prepare it for tree-reduction.
1652  LocalReds[LID] = (UniformPow2WG || GID < NWorkItems)
1653  ? In[GID * NElements + E]
1654  : ReduIdentity;
1655  if (!UniformPow2WG)
1656  LocalReds[WGSize] = ReduIdentity;
1657  NDIt.barrier();
1658 
1659  // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]
1660  // LocalReds[WGSize] accumulates last/odd elements when the step
1661  // of tree-reduction loop is not even.
1662  size_t PrevStep = WGSize;
1663  for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
1664  if (LID < CurStep)
1665  LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
1666  else if (!UniformPow2WG && LID == CurStep && (PrevStep & 0x1))
1667  LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
1668  NDIt.barrier();
1669  PrevStep = CurStep;
1670  }
1671 
1672  // Compute the partial sum/reduction for the work-group.
1673  if (LID == 0) {
1674  size_t GrID = NDIt.get_group_linear_id();
1675  typename Reduction::result_type PSum =
1676  UniformPow2WG ? LocalReds[0] : BOp(LocalReds[0], LocalReds[WGSize]);
1677  if (IsUpdateOfUserVar)
1678  PSum = BOp(*(Reduction::getOutPointer(Out)), PSum);
1679  Reduction::getOutPointer(Out)[GrID * NElements + E] = PSum;
1680  }
1681 
1682  // Ensure item 0 is finished with LocalReds before next iteration
1683  if (E != NElements - 1) {
1684  NDIt.barrier();
1685  }
1686  }
1687  });
1688 }
1689 
1694 template <typename KernelName, typename KernelType, class Reduction>
1696 reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
1697  Reduction &Redu) {
1698 
1699  constexpr size_t NElements = Reduction::num_elements;
1700  size_t NWorkGroups;
1701  size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups);
1702 
1703  // The last work-group may be not fully loaded with work, or the work group
1704  // size may be not power of two. Those two cases considered inefficient
1705  // as they require additional code and checks in the kernel.
1706  bool HasUniformWG = NWorkGroups * WGSize == NWorkItems;
1707  if (!Reduction::has_fast_reduce)
1708  HasUniformWG = HasUniformWG && (WGSize & (WGSize - 1)) == 0;
1709 
1710  // Get read accessor to the buffer that was used as output
1711  // in the previous kernel.
1712  auto In = Redu.getReadAccToPreviousPartialReds(CGH);
1713  auto Out = Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);
1714  if (HasUniformWG)
1715  reduAuxCGFuncImpl<KernelName, KernelType, true>(
1716  CGH, NWorkItems, NWorkGroups, WGSize, Redu, In, Out);
1717  else
1718  reduAuxCGFuncImpl<KernelName, KernelType, false>(
1719  CGH, NWorkItems, NWorkGroups, WGSize, Redu, In, Out);
1720  return NWorkGroups;
1721 }
1722 
1723 // This method is used for implementation of parallel_for accepting 1 reduction.
1724 // TODO: remove this method when everything is switched to general algorithm
1725 // implementing arbitrary number of reductions in parallel_for().
1729 template <typename KernelName, class Reduction>
1730 std::enable_if_t<!Reduction::is_usm>
1731 reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu) {
1732  auto InAcc = Redu.getReadAccToPreviousPartialReds(CGH);
1733  Redu.associateWithHandler(CGH);
1734  if (Redu.hasUserDiscardWriteAccessor())
1735  CGH.copy(InAcc, Redu.getUserDiscardWriteAccessor());
1736  else
1737  CGH.copy(InAcc, Redu.getUserReadWriteAccessor());
1738 }
1739 
1740 // This method is used for implementation of parallel_for accepting 1 reduction.
1741 // TODO: remove this method when everything is switched to general algorithm
1742 // implementing arbitrary number of reductions in parallel_for().
1745 template <typename KernelName, class Reduction>
1746 std::enable_if_t<Reduction::is_usm>
1747 reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu) {
1748  constexpr size_t NElements = Reduction::num_elements;
1749  auto InAcc = Redu.getReadAccToPreviousPartialReds(CGH);
1750  auto UserVarPtr = Redu.getUSMPointer();
1751  bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
1752  auto BOp = Redu.getBinaryOperation();
1753  CGH.single_task<KernelName>([=] {
1754  for (int i = 0; i < NElements; ++i) {
1755  if (IsUpdateOfUserVar)
1756  UserVarPtr[i] = BOp(UserVarPtr[i], InAcc.get_pointer()[i]);
1757  else
1758  UserVarPtr[i] = InAcc.get_pointer()[i];
1759  }
1760  });
1761 }
1762 
1766 template <typename... Reductions, size_t... Is>
1767 auto createReduLocalAccs(size_t Size, handler &CGH,
1768  std::index_sequence<Is...>) {
1769  return makeReduTupleT(
1770  std::tuple_element_t<Is, std::tuple<Reductions...>>::getReadWriteLocalAcc(
1771  Size, CGH)...);
1772 }
1773 
1777 template <bool IsOneWG, typename... Reductions, size_t... Is>
1778 auto createReduOutAccs(size_t NWorkGroups, handler &CGH,
1779  std::tuple<Reductions...> &ReduTuple,
1780  std::index_sequence<Is...>) {
1781  return makeReduTupleT(
1782  std::get<Is>(ReduTuple).template getWriteMemForPartialReds<IsOneWG>(
1783  NWorkGroups *
1784  std::tuple_element_t<Is, std::tuple<Reductions...>>::num_elements,
1785  CGH)...);
1786 }
1787 
1791 template <typename... Reductions, size_t... Is>
1793  std::tuple<Reductions...> &ReduTuple,
1794  std::index_sequence<Is...>) {
1795  return makeReduTupleT(
1796  std::get<Is>(ReduTuple).getReadAccToPreviousPartialReds(CGH)...);
1797 }
1798 
1799 template <typename... Reductions, size_t... Is>
1800 ReduTupleT<typename Reductions::result_type...>
1801 getReduIdentities(std::tuple<Reductions...> &ReduTuple,
1802  std::index_sequence<Is...>) {
1803  return {std::get<Is>(ReduTuple).getIdentity()...};
1804 }
1805 
1806 template <typename... Reductions, size_t... Is>
1807 ReduTupleT<typename Reductions::binary_operation...>
1808 getReduBOPs(std::tuple<Reductions...> &ReduTuple, std::index_sequence<Is...>) {
1809  return {std::get<Is>(ReduTuple).getBinaryOperation()...};
1810 }
1811 
1812 template <typename... Reductions, size_t... Is>
1813 std::array<bool, sizeof...(Reductions)>
1814 getInitToIdentityProperties(std::tuple<Reductions...> &ReduTuple,
1815  std::index_sequence<Is...>) {
1816  return {std::get<Is>(ReduTuple).initializeToIdentity()...};
1817 }
1818 
1819 template <typename... Reductions, size_t... Is>
1820 std::tuple<typename Reductions::reducer_type...>
1823  std::index_sequence<Is...>) {
1824  return {typename Reductions::reducer_type{std::get<Is>(Identities),
1825  std::get<Is>(BOPsTuple)}...};
1826 }
1827 
1828 template <typename KernelType, int Dims, typename... ReducerT, size_t... Is>
1830  std::tuple<ReducerT...> &Reducers,
1831  std::index_sequence<Is...>) {
1832  KernelFunc(NDIt, std::get<Is>(Reducers)...);
1833 }
1834 
1835 template <bool Pow2WG, typename... LocalAccT, typename... ReducerT,
1836  typename... ResultT, size_t... Is>
1837 void initReduLocalAccs(size_t LID, size_t WGSize,
1838  ReduTupleT<LocalAccT...> LocalAccs,
1839  const std::tuple<ReducerT...> &Reducers,
1840  ReduTupleT<ResultT...> Identities,
1841  std::index_sequence<Is...>) {
1842  std::tie(std::get<Is>(LocalAccs)[LID]...) =
1843  std::make_tuple(std::get<Is>(Reducers).MValue...);
1844 
1845  // For work-groups, which size is not power of two, local accessors have
1846  // an additional element with index WGSize that is used by the tree-reduction
1847  // algorithm. Initialize those additional elements with identity values here.
1848  if (!Pow2WG)
1849  std::tie(std::get<Is>(LocalAccs)[WGSize]...) =
1850  std::make_tuple(std::get<Is>(Identities)...);
1851 }
1852 
1853 template <bool UniformPow2WG, typename... LocalAccT, typename... InputAccT,
1854  typename... ResultT, size_t... Is>
1855 void initReduLocalAccs(size_t LID, size_t GID, size_t NWorkItems, size_t WGSize,
1856  ReduTupleT<InputAccT...> LocalAccs,
1857  ReduTupleT<LocalAccT...> InputAccs,
1858  ReduTupleT<ResultT...> Identities,
1859  std::index_sequence<Is...>) {
1860  // Normally, the local accessors are initialized with elements from the input
1861  // accessors. The exception is the case when (GID >= NWorkItems), which
1862  // possible only when UniformPow2WG is false. For that case the elements of
1863  // local accessors are initialized with identity value, so they would not
1864  // give any impact into the final partial sums during the tree-reduction
1865  // algorithm work.
1866  if (UniformPow2WG || GID < NWorkItems)
1867  std::tie(std::get<Is>(LocalAccs)[LID]...) =
1868  std::make_tuple(std::get<Is>(InputAccs)[GID]...);
1869  else
1870  std::tie(std::get<Is>(LocalAccs)[LID]...) =
1871  std::make_tuple(std::get<Is>(Identities)...);
1872 
1873  // For work-groups, which size is not power of two, local accessors have
1874  // an additional element with index WGSize that is used by the tree-reduction
1875  // algorithm. Initialize those additional elements with identity values here.
1876  if (!UniformPow2WG)
1877  std::tie(std::get<Is>(LocalAccs)[WGSize]...) =
1878  std::make_tuple(std::get<Is>(Identities)...);
1879 }
1880 
1881 template <typename... LocalAccT, typename... BOPsT, size_t... Is>
1882 void reduceReduLocalAccs(size_t IndexA, size_t IndexB,
1883  ReduTupleT<LocalAccT...> LocalAccs,
1884  ReduTupleT<BOPsT...> BOPs,
1885  std::index_sequence<Is...>) {
1886  std::tie(std::get<Is>(LocalAccs)[IndexA]...) =
1887  std::make_tuple((std::get<Is>(BOPs)(std::get<Is>(LocalAccs)[IndexA],
1888  std::get<Is>(LocalAccs)[IndexB]))...);
1889 }
1890 
1891 template <bool Pow2WG, bool IsOneWG, typename... Reductions,
1892  typename... OutAccT, typename... LocalAccT, typename... BOPsT,
1893  typename... Ts, size_t... Is>
1895  size_t OutAccIndex, size_t WGSize, std::tuple<Reductions...> *,
1897  ReduTupleT<BOPsT...> BOPs, ReduTupleT<Ts...> IdentityVals,
1898  std::array<bool, sizeof...(Reductions)> IsInitializeToIdentity,
1899  std::index_sequence<Is...>) {
1900  // Add the initial value of user's variable to the final result.
1901  if (IsOneWG)
1902  std::tie(std::get<Is>(LocalAccs)[0]...) = std::make_tuple(std::get<Is>(
1903  BOPs)(std::get<Is>(LocalAccs)[0],
1904  IsInitializeToIdentity[Is]
1905  ? std::get<Is>(IdentityVals)
1906  : std::tuple_element_t<Is, std::tuple<Reductions...>>::
1907  getOutPointer(std::get<Is>(OutAccs))[0])...);
1908 
1909  if (Pow2WG) {
1910  // The partial sums for the work-group are stored in 0-th elements of local
1911  // accessors. Simply write those sums to output accessors.
1912  std::tie(std::tuple_element_t<Is, std::tuple<Reductions...>>::getOutPointer(
1913  std::get<Is>(OutAccs))[OutAccIndex]...) =
1914  std::make_tuple(std::get<Is>(LocalAccs)[0]...);
1915  } else {
1916  // Each of local accessors keeps two partial sums: in 0-th and WGsize-th
1917  // elements. Combine them into final partial sums and write to output
1918  // accessors.
1919  std::tie(std::tuple_element_t<Is, std::tuple<Reductions...>>::getOutPointer(
1920  std::get<Is>(OutAccs))[OutAccIndex]...) =
1921  std::make_tuple(std::get<Is>(BOPs)(std::get<Is>(LocalAccs)[0],
1922  std::get<Is>(LocalAccs)[WGSize])...);
1923  }
1924 }
1925 
1926 // Concatenate an empty sequence.
1927 constexpr std::index_sequence<> concat_sequences(std::index_sequence<>) {
1928  return {};
1929 }
1930 
1931 // Concatenate a sequence consisting of 1 element.
1932 template <size_t I>
1933 constexpr std::index_sequence<I> concat_sequences(std::index_sequence<I>) {
1934  return {};
1935 }
1936 
1937 // Concatenate two potentially empty sequences.
1938 template <size_t... Is, size_t... Js>
1939 constexpr std::index_sequence<Is..., Js...>
1940 concat_sequences(std::index_sequence<Is...>, std::index_sequence<Js...>) {
1941  return {};
1942 }
1943 
1944 // Concatenate more than 2 sequences.
1945 template <size_t... Is, size_t... Js, class... Rs>
1946 constexpr auto concat_sequences(std::index_sequence<Is...>,
1947  std::index_sequence<Js...>, Rs...) {
1948  return concat_sequences(std::index_sequence<Is..., Js...>{}, Rs{}...);
1949 }
1950 
1952  template <typename T> struct Func {
1953  static constexpr bool value = !std::remove_pointer_t<T>::is_usm;
1954  };
1955 };
1956 
1958  template <typename T> struct Func { static constexpr bool value = false; };
1959 };
1960 
1961 template <bool Cond, size_t I> struct FilterElement {
1962  using type =
1963  std::conditional_t<Cond, std::index_sequence<I>, std::index_sequence<>>;
1964 };
1965 
1971 template <typename... T, typename FunctorT, size_t... Is,
1972  std::enable_if_t<(sizeof...(Is) > 0), int> Z = 0>
1973 constexpr auto filterSequenceHelper(FunctorT, std::index_sequence<Is...>) {
1974  return concat_sequences(
1975  typename FilterElement<FunctorT::template Func<std::tuple_element_t<
1976  Is, std::tuple<T...>>>::value,
1977  Is>::type{}...);
1978 }
1979 template <typename... T, typename FunctorT, size_t... Is,
1980  std::enable_if_t<(sizeof...(Is) == 0), int> Z = 0>
1981 constexpr auto filterSequenceHelper(FunctorT, std::index_sequence<Is...>) {
1982  return std::index_sequence<>{};
1983 }
1984 
1988 template <typename... T, typename FunctorT, size_t... Is>
1989 constexpr auto filterSequence(FunctorT F, std::index_sequence<Is...> Indices) {
1990  return filterSequenceHelper<T...>(F, Indices);
1991 }
1992 
1994  template <typename Reduction> struct Func {
1995  static constexpr bool value =
1996  (Reduction::dims == 0 && Reduction::num_elements == 1);
1997  };
1998 };
1999 
2001  template <typename Reduction> struct Func {
2002  static constexpr bool value =
2003  (Reduction::dims == 1 && Reduction::num_elements >= 1);
2004  };
2005 };
2006 
2009 template <bool Pow2WG, bool IsOneWG, typename... Reductions, int Dims,
2010  typename... LocalAccT, typename... OutAccT, typename... ReducerT,
2011  typename... Ts, typename... BOPsT, size_t... Is>
2013  nd_item<Dims> NDIt, ReduTupleT<LocalAccT...> LocalAccsTuple,
2014  ReduTupleT<OutAccT...> OutAccsTuple, std::tuple<ReducerT...> &ReducersTuple,
2015  ReduTupleT<Ts...> IdentitiesTuple, ReduTupleT<BOPsT...> BOPsTuple,
2016  std::array<bool, sizeof...(Reductions)> InitToIdentityProps,
2017  std::index_sequence<Is...> ReduIndices) {
2018  size_t WGSize = NDIt.get_local_range().size();
2019  size_t LID = NDIt.get_local_linear_id();
2020  initReduLocalAccs<Pow2WG>(LID, WGSize, LocalAccsTuple, ReducersTuple,
2021  IdentitiesTuple, ReduIndices);
2022  NDIt.barrier();
2023 
2024  size_t PrevStep = WGSize;
2025  for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
2026  if (LID < CurStep) {
2027  // LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
2028  reduceReduLocalAccs(LID, LID + CurStep, LocalAccsTuple, BOPsTuple,
2029  ReduIndices);
2030  } else if (!Pow2WG && LID == CurStep && (PrevStep & 0x1)) {
2031  // LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
2032  reduceReduLocalAccs(WGSize, PrevStep - 1, LocalAccsTuple, BOPsTuple,
2033  ReduIndices);
2034  }
2035  NDIt.barrier();
2036  PrevStep = CurStep;
2037  }
2038 
2039  // Compute the partial sum/reduction for the work-group.
2040  if (LID == 0) {
2041  size_t GrID = NDIt.get_group_linear_id();
2042  writeReduSumsToOutAccs<Pow2WG, IsOneWG>(
2043  GrID, WGSize, (std::tuple<Reductions...> *)nullptr, OutAccsTuple,
2044  LocalAccsTuple, BOPsTuple, IdentitiesTuple, InitToIdentityProps,
2045  ReduIndices);
2046  }
2047 }
2048 
2050 template <bool Pow2WG, bool IsOneWG, typename Reduction, int Dims,
2051  typename LocalAccT, typename OutAccT, typename ReducerT, typename T,
2052  typename BOPT>
2053 void reduCGFuncImplArrayHelper(nd_item<Dims> NDIt, LocalAccT LocalReds,
2054  OutAccT Out, ReducerT &Reducer, T Identity,
2055  BOPT BOp, bool IsInitializeToIdentity) {
2056  size_t WGSize = NDIt.get_local_range().size();
2057  size_t LID = NDIt.get_local_linear_id();
2058 
2059  // If there are multiple values, reduce each separately
2060  // This prevents local memory from scaling with elements
2061  auto NElements = Reduction::num_elements;
2062  for (size_t E = 0; E < NElements; ++E) {
2063 
2064  // Copy the element to local memory to prepare it for tree-reduction.
2065  LocalReds[LID] = Reducer.getElement(E);
2066  if (!Pow2WG)
2067  LocalReds[WGSize] = Identity;
2068  NDIt.barrier();
2069 
2070  size_t PrevStep = WGSize;
2071  for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
2072  if (LID < CurStep) {
2073  LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
2074  } else if (!Pow2WG && LID == CurStep && (PrevStep & 0x1)) {
2075  LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
2076  }
2077  NDIt.barrier();
2078  PrevStep = CurStep;
2079  }
2080 
2081  // Add the initial value of user's variable to the final result.
2082  if (LID == 0) {
2083  if (IsOneWG) {
2084  LocalReds[0] =
2085  BOp(LocalReds[0], IsInitializeToIdentity
2086  ? Identity
2087  : Reduction::getOutPointer(Out)[E]);
2088  }
2089 
2090  size_t GrID = NDIt.get_group_linear_id();
2091  if (Pow2WG) {
2092  // The partial sums for the work-group are stored in 0-th elements of
2093  // local accessors. Simply write those sums to output accessors.
2094  Reduction::getOutPointer(Out)[GrID * NElements + E] = LocalReds[0];
2095  } else {
2096  // Each of local accessors keeps two partial sums: in 0-th and WGsize-th
2097  // elements. Combine them into final partial sums and write to output
2098  // accessors.
2099  Reduction::getOutPointer(Out)[GrID * NElements + E] =
2100  BOp(LocalReds[0], LocalReds[WGSize]);
2101  }
2102  }
2103 
2104  // Ensure item 0 is finished with LocalReds before next iteration
2105  if (E != NElements - 1) {
2106  NDIt.barrier();
2107  }
2108  }
2109 }
2110 
2111 template <bool Pow2WG, bool IsOneWG, typename... Reductions, int Dims,
2112  typename... LocalAccT, typename... OutAccT, typename... ReducerT,
2113  typename... Ts, typename... BOPsT, size_t... Is>
2115  nd_item<Dims> NDIt, ReduTupleT<LocalAccT...> LocalAccsTuple,
2116  ReduTupleT<OutAccT...> OutAccsTuple, std::tuple<ReducerT...> &ReducersTuple,
2117  ReduTupleT<Ts...> IdentitiesTuple, ReduTupleT<BOPsT...> BOPsTuple,
2118  std::array<bool, sizeof...(Reductions)> InitToIdentityProps,
2119  std::index_sequence<Is...>) {
2120  using ReductionPack = std::tuple<Reductions...>;
2121  (reduCGFuncImplArrayHelper<Pow2WG, IsOneWG,
2122  std::tuple_element_t<Is, ReductionPack>>(
2123  NDIt, std::get<Is>(LocalAccsTuple), std::get<Is>(OutAccsTuple),
2124  std::get<Is>(ReducersTuple), std::get<Is>(IdentitiesTuple),
2125  std::get<Is>(BOPsTuple), InitToIdentityProps[Is]),
2126  ...);
2127 }
2128 
2129 template <typename KernelName, bool Pow2WG, bool IsOneWG, typename KernelType,
2130  int Dims, typename... Reductions, size_t... Is>
2131 void reduCGFuncImpl(handler &CGH, KernelType KernelFunc,
2132  const nd_range<Dims> &Range,
2133  std::tuple<Reductions...> &ReduTuple,
2134  std::index_sequence<Is...> ReduIndices) {
2135 
2136  // Split reduction sequence into two:
2137  // 1) Scalar reductions
2138  // 2) Array reductions
2139  // This allows us to reuse the existing implementation for scalar reductions
2140  // and introduce a new implementation for array reductions. Longer term it
2141  // may make sense to generalize the code such that each phase below applies
2142  // to all available reduction implementations -- today all reduction classes
2143  // use the same privatization-based approach, so this is unnecessary.
2144  IsScalarReduction ScalarPredicate;
2145  auto ScalarIs = filterSequence<Reductions...>(ScalarPredicate, ReduIndices);
2146 
2147  IsArrayReduction ArrayPredicate;
2148  auto ArrayIs = filterSequence<Reductions...>(ArrayPredicate, ReduIndices);
2149 
2150  // Create inputs using the global order of all reductions
2151  size_t WGSize = Range.get_local_range().size();
2152  size_t LocalAccSize = WGSize + (Pow2WG ? 0 : 1);
2153  auto LocalAccsTuple =
2154  createReduLocalAccs<Reductions...>(LocalAccSize, CGH, ReduIndices);
2155 
2156  size_t NWorkGroups = IsOneWG ? 1 : Range.get_group_range().size();
2157  auto OutAccsTuple =
2158  createReduOutAccs<IsOneWG>(NWorkGroups, CGH, ReduTuple, ReduIndices);
2159  auto IdentitiesTuple = getReduIdentities(ReduTuple, ReduIndices);
2160  auto BOPsTuple = getReduBOPs(ReduTuple, ReduIndices);
2161  auto InitToIdentityProps =
2162  getInitToIdentityProperties(ReduTuple, ReduIndices);
2163 
2164  using Name = typename get_reduction_main_kernel_name_t<
2165  KernelName, KernelType, Pow2WG, IsOneWG, decltype(OutAccsTuple)>::name;
2166  CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
2167  // Pass all reductions to user's lambda in the same order as supplied
2168  // Each reducer initializes its own storage
2169  auto ReduIndices = std::index_sequence_for<Reductions...>();
2170  auto ReducersTuple =
2171  createReducers<Reductions...>(IdentitiesTuple, BOPsTuple, ReduIndices);
2172  callReduUserKernelFunc(KernelFunc, NDIt, ReducersTuple, ReduIndices);
2173 
2174  // Combine and write-back the results of any scalar reductions
2175  // reduCGFuncImplScalar<Reductions...>(NDIt, LocalAccsTuple, OutAccsTuple,
2176  // ReducersTuple, IdentitiesTuple, BOPsTuple, InitToIdentityProps,
2177  // ReduIndices);
2178  reduCGFuncImplScalar<Pow2WG, IsOneWG, Reductions...>(
2179  NDIt, LocalAccsTuple, OutAccsTuple, ReducersTuple, IdentitiesTuple,
2180  BOPsTuple, InitToIdentityProps, ScalarIs);
2181 
2182  // Combine and write-back the results of any array reductions
2183  // These are handled separately to minimize temporary storage and account
2184  // for the fact that each array reduction may have a different number of
2185  // elements to reduce (i.e. a different extent).
2186  reduCGFuncImplArray<Pow2WG, IsOneWG, Reductions...>(
2187  NDIt, LocalAccsTuple, OutAccsTuple, ReducersTuple, IdentitiesTuple,
2188  BOPsTuple, InitToIdentityProps, ArrayIs);
2189  });
2190 }
2191 
2192 template <typename KernelName, typename KernelType, int Dims,
2193  typename... Reductions, size_t... Is>
2194 void reduCGFunc(handler &CGH, KernelType KernelFunc,
2195  const nd_range<Dims> &Range,
2196  std::tuple<Reductions...> &ReduTuple,
2197  std::index_sequence<Is...> ReduIndices) {
2198  size_t WGSize = Range.get_local_range().size();
2199  size_t NWorkGroups = Range.get_group_range().size();
2200  bool Pow2WG = (WGSize & (WGSize - 1)) == 0;
2201  if (NWorkGroups == 1) {
2202  // TODO: consider having only one variant of kernel instead of two here.
2203  // Having two kernels, where one is just slighly more efficient than
2204  // another, and only for the purpose of running 1 work-group may be too
2205  // expensive.
2206  if (Pow2WG)
2207  reduCGFuncImpl<KernelName, true, true>(CGH, KernelFunc, Range, ReduTuple,
2208  ReduIndices);
2209  else
2210  reduCGFuncImpl<KernelName, false, true>(CGH, KernelFunc, Range, ReduTuple,
2211  ReduIndices);
2212  } else {
2213  if (Pow2WG)
2214  reduCGFuncImpl<KernelName, true, false>(CGH, KernelFunc, Range, ReduTuple,
2215  ReduIndices);
2216  else
2217  reduCGFuncImpl<KernelName, false, false>(CGH, KernelFunc, Range,
2218  ReduTuple, ReduIndices);
2219  }
2220 }
2221 
2222 // Specialization for devices with the atomic64 aspect, which guarantees 64 (and
2223 // temporarily 32) bit floating point support for atomic add.
2224 // TODO 32 bit floating point atomics are eventually expected to be supported by
2225 // the has_fast_atomics specialization. Corresponding changes to
2226 // IsReduOptForAtomic64Add, as prescribed in its documentation, should then also
2227 // be made.
2228 template <typename KernelName, typename KernelType, int Dims, class Reduction>
2229 std::enable_if_t<Reduction::has_atomic_add_float64>
2231  const nd_range<Dims> &Range, Reduction &,
2232  typename Reduction::rw_accessor_type Out) {
2233  constexpr size_t NElements = Reduction::num_elements;
2234  using Name = typename get_reduction_main_kernel_name_t<
2235  KernelName, KernelType, Reduction::is_usm,
2236  Reduction::has_atomic_add_float64,
2237  typename Reduction::rw_accessor_type>::name;
2238  CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
2239  // Call user's function. Reducer.MValue gets initialized there.
2240  typename Reduction::reducer_type Reducer;
2241  KernelFunc(NDIt, Reducer);
2242 
2243  // If there are multiple values, reduce each separately
2244  // reduce_over_group is only defined for each T, not for span<T, ...>
2245  for (int E = 0; E < NElements; ++E) {
2246  typename Reduction::binary_operation BOp;
2247  Reducer.getElement(E) =
2248  reduce_over_group(NDIt.get_group(), Reducer.getElement(E), BOp);
2249  }
2250 
2251  if (NDIt.get_local_linear_id() == 0) {
2252  Reducer.atomic_combine(Reduction::getOutPointer(Out));
2253  }
2254  });
2255 }
2256 
2257 // Specialization for devices with the atomic64 aspect, which guarantees 64 (and
2258 // temporarily 32) bit floating point support for atomic add.
2259 // TODO 32 bit floating point atomics are eventually expected to be supported by
2260 // the has_fast_atomics specialization. Corresponding changes to
2261 // IsReduOptForAtomic64Add, as prescribed in its documentation, should then also
2262 // be made.
2263 template <typename KernelName, typename KernelType, int Dims, class Reduction>
2266  const nd_range<Dims> &Range, Reduction &Redu) {
2267 
2268  auto Out = Redu.getReadWriteAccessorToInitializedMem(CGH);
2269  reduCGFuncImplAtomic64<KernelName, KernelType, Dims, Reduction>(
2270  CGH, KernelFunc, Range, Redu, Out);
2271 }
2272 
2274 
2275 template <typename ReductionT>
2276 void associateReduAccsWithHandlerHelper(handler &CGH, ReductionT &Redu) {
2277  Redu.associateWithHandler(CGH);
2278 }
2279 
2280 template <typename ReductionT, typename... RestT,
2281  enable_if_t<(sizeof...(RestT) > 0), int> Z = 0>
2282 void associateReduAccsWithHandlerHelper(handler &CGH, ReductionT &Redu,
2283  RestT &... Rest) {
2284  Redu.associateWithHandler(CGH);
2285  associateReduAccsWithHandlerHelper(CGH, Rest...);
2286 }
2287 
2288 template <typename... Reductions, size_t... Is>
2290  std::tuple<Reductions...> &ReduTuple,
2291  std::index_sequence<Is...>) {
2292  associateReduAccsWithHandlerHelper(CGH, std::get<Is>(ReduTuple)...);
2293 }
2294 
2297 template <bool UniformPow2WG, bool IsOneWG, typename... Reductions, int Dims,
2298  typename... LocalAccT, typename... InAccT, typename... OutAccT,
2299  typename... Ts, typename... BOPsT, size_t... Is>
2301  nd_item<Dims> NDIt, size_t LID, size_t GID, size_t NWorkItems,
2302  size_t WGSize, ReduTupleT<LocalAccT...> LocalAccsTuple,
2303  ReduTupleT<InAccT...> InAccsTuple, ReduTupleT<OutAccT...> OutAccsTuple,
2304  ReduTupleT<Ts...> IdentitiesTuple, ReduTupleT<BOPsT...> BOPsTuple,
2305  std::array<bool, sizeof...(Reductions)> InitToIdentityProps,
2306  std::index_sequence<Is...> ReduIndices) {
2307  initReduLocalAccs<UniformPow2WG>(LID, GID, NWorkItems, WGSize, LocalAccsTuple,
2308  InAccsTuple, IdentitiesTuple, ReduIndices);
2309  NDIt.barrier();
2310 
2311  size_t PrevStep = WGSize;
2312  for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
2313  if (LID < CurStep) {
2314  // LocalAcc[LID] = BOp(LocalAcc[LID], LocalAcc[LID + CurStep]);
2315  reduceReduLocalAccs(LID, LID + CurStep, LocalAccsTuple, BOPsTuple,
2316  ReduIndices);
2317  } else if (!UniformPow2WG && LID == CurStep && (PrevStep & 0x1)) {
2318  // LocalAcc[WGSize] = BOp(LocalAcc[WGSize], LocalAcc[PrevStep - 1]);
2319  reduceReduLocalAccs(WGSize, PrevStep - 1, LocalAccsTuple, BOPsTuple,
2320  ReduIndices);
2321  }
2322  NDIt.barrier();
2323  PrevStep = CurStep;
2324  }
2325 
2326  // Compute the partial sum/reduction for the work-group.
2327  if (LID == 0) {
2328  size_t GrID = NDIt.get_group_linear_id();
2329  writeReduSumsToOutAccs<UniformPow2WG, IsOneWG>(
2330  GrID, WGSize, (std::tuple<Reductions...> *)nullptr, OutAccsTuple,
2331  LocalAccsTuple, BOPsTuple, IdentitiesTuple, InitToIdentityProps,
2332  ReduIndices);
2333  }
2334 }
2335 
2336 template <bool UniformPow2WG, bool IsOneWG, typename Reduction, int Dims,
2337  typename LocalAccT, typename InAccT, typename OutAccT, typename T,
2338  typename BOPT>
2339 void reduAuxCGFuncImplArrayHelper(nd_item<Dims> NDIt, size_t LID, size_t GID,
2340  size_t NWorkItems, size_t WGSize,
2341  LocalAccT LocalReds, InAccT In, OutAccT Out,
2342  T Identity, BOPT BOp,
2343  bool IsInitializeToIdentity) {
2344 
2345  // If there are multiple values, reduce each separately
2346  // This prevents local memory from scaling with elements
2347  auto NElements = Reduction::num_elements;
2348  for (size_t E = 0; E < NElements; ++E) {
2349  // Normally, the local accessors are initialized with elements from the
2350  // input accessors. The exception is the case when (GID >= NWorkItems),
2351  // which possible only when UniformPow2WG is false. For that case the
2352  // elements of local accessors are initialized with identity value, so they
2353  // would not give any impact into the final partial sums during the
2354  // tree-reduction algorithm work.
2355  if (UniformPow2WG || GID < NWorkItems) {
2356  LocalReds[LID] = In[GID * NElements + E];
2357  } else {
2358  LocalReds[LID] = Identity;
2359  }
2360 
2361  // For work-groups, which size is not power of two, local accessors have
2362  // an additional element with index WGSize that is used by the
2363  // tree-reduction algorithm. Initialize those additional elements with
2364  // identity values here.
2365  if (!UniformPow2WG) {
2366  LocalReds[WGSize] = Identity;
2367  }
2368 
2369  NDIt.barrier();
2370 
2371  // Tree reduction in local memory
2372  size_t PrevStep = WGSize;
2373  for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
2374  if (LID < CurStep) {
2375  LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
2376  } else if (!UniformPow2WG && LID == CurStep && (PrevStep & 0x1)) {
2377  LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
2378  }
2379  NDIt.barrier();
2380  PrevStep = CurStep;
2381  }
2382 
2383  // Add the initial value of user's variable to the final result.
2384  if (LID == 0) {
2385  if (IsOneWG) {
2386  LocalReds[0] =
2387  BOp(LocalReds[0], IsInitializeToIdentity
2388  ? Identity
2389  : Reduction::getOutPointer(Out)[E]);
2390  }
2391 
2392  size_t GrID = NDIt.get_group_linear_id();
2393  if (UniformPow2WG) {
2394  // The partial sums for the work-group are stored in 0-th elements of
2395  // local accessors. Simply write those sums to output accessors.
2396  Reduction::getOutPointer(Out)[GrID * NElements + E] = LocalReds[0];
2397  } else {
2398  // Each of local accessors keeps two partial sums: in 0-th and WGsize-th
2399  // elements. Combine them into final partial sums and write to output
2400  // accessors.
2401  Reduction::getOutPointer(Out)[GrID * NElements + E] =
2402  BOp(LocalReds[0], LocalReds[WGSize]);
2403  }
2404  }
2405 
2406  // Ensure item 0 is finished with LocalReds before next iteration
2407  if (E != NElements - 1) {
2408  NDIt.barrier();
2409  }
2410  }
2411 }
2412 
2413 template <bool UniformPow2WG, bool IsOneWG, typename... Reductions, int Dims,
2414  typename... LocalAccT, typename... InAccT, typename... OutAccT,
2415  typename... Ts, typename... BOPsT, size_t... Is>
2417  nd_item<Dims> NDIt, size_t LID, size_t GID, size_t NWorkItems,
2418  size_t WGSize, ReduTupleT<LocalAccT...> LocalAccsTuple,
2419  ReduTupleT<InAccT...> InAccsTuple, ReduTupleT<OutAccT...> OutAccsTuple,
2420  ReduTupleT<Ts...> IdentitiesTuple, ReduTupleT<BOPsT...> BOPsTuple,
2421  std::array<bool, sizeof...(Reductions)> InitToIdentityProps,
2422  std::index_sequence<Is...>) {
2423  using ReductionPack = std::tuple<Reductions...>;
2424  (reduAuxCGFuncImplArrayHelper<UniformPow2WG, IsOneWG,
2425  std::tuple_element_t<Is, ReductionPack>>(
2426  NDIt, LID, GID, NWorkItems, WGSize, std::get<Is>(LocalAccsTuple),
2427  std::get<Is>(InAccsTuple), std::get<Is>(OutAccsTuple),
2428  std::get<Is>(IdentitiesTuple), std::get<Is>(BOPsTuple),
2429  InitToIdentityProps[Is]),
2430  ...);
2431 }
2432 
2433 template <typename KernelName, typename KernelType, bool UniformPow2WG,
2434  bool IsOneWG, typename... Reductions, size_t... Is>
2435 void reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
2436  size_t WGSize, std::tuple<Reductions...> &ReduTuple,
2437  std::index_sequence<Is...> ReduIndices) {
2438 
2439  // Like reduCGFuncImpl, we also have to split out scalar and array reductions
2440  IsScalarReduction ScalarPredicate;
2441  auto ScalarIs = filterSequence<Reductions...>(ScalarPredicate, ReduIndices);
2442 
2443  IsArrayReduction ArrayPredicate;
2444  auto ArrayIs = filterSequence<Reductions...>(ArrayPredicate, ReduIndices);
2445 
2446  // The last kernel DOES write to user's accessor passed to reduction.
2447  // Associate it with handler manually.
2450  Predicate;
2451  auto AccReduIndices = filterSequence<Reductions...>(Predicate, ReduIndices);
2452  associateReduAccsWithHandler(CGH, ReduTuple, AccReduIndices);
2453 
2454  size_t LocalAccSize = WGSize + (UniformPow2WG ? 0 : 1);
2455  auto LocalAccsTuple =
2456  createReduLocalAccs<Reductions...>(LocalAccSize, CGH, ReduIndices);
2457  auto InAccsTuple =
2458  getReadAccsToPreviousPartialReds(CGH, ReduTuple, ReduIndices);
2459  auto OutAccsTuple =
2460  createReduOutAccs<IsOneWG>(NWorkGroups, CGH, ReduTuple, ReduIndices);
2461  auto IdentitiesTuple = getReduIdentities(ReduTuple, ReduIndices);
2462  auto BOPsTuple = getReduBOPs(ReduTuple, ReduIndices);
2463  auto InitToIdentityProps =
2464  getInitToIdentityProperties(ReduTuple, ReduIndices);
2465 
2466  using Name =
2467  typename get_reduction_aux_kernel_name_t<KernelName, KernelType,
2468  UniformPow2WG, IsOneWG,
2469  decltype(OutAccsTuple)>::name;
2470  // TODO: Opportunity to parallelize across number of elements
2471  range<1> GlobalRange = {UniformPow2WG ? NWorkItems : NWorkGroups * WGSize};
2472  nd_range<1> Range{GlobalRange, range<1>(WGSize)};
2473  CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
2474  size_t WGSize = NDIt.get_local_range().size();
2475  size_t LID = NDIt.get_local_linear_id();
2476  size_t GID = NDIt.get_global_linear_id();
2477 
2478  // Handle scalar and array reductions
2479  reduAuxCGFuncImplScalar<UniformPow2WG, IsOneWG, Reductions...>(
2480  NDIt, LID, GID, NWorkItems, WGSize, LocalAccsTuple, InAccsTuple,
2481  OutAccsTuple, IdentitiesTuple, BOPsTuple, InitToIdentityProps,
2482  ScalarIs);
2483  reduAuxCGFuncImplArray<UniformPow2WG, IsOneWG, Reductions...>(
2484  NDIt, LID, GID, NWorkItems, WGSize, LocalAccsTuple, InAccsTuple,
2485  OutAccsTuple, IdentitiesTuple, BOPsTuple, InitToIdentityProps, ArrayIs);
2486  });
2487 }
2488 
2489 template <typename KernelName, typename KernelType, typename... Reductions,
2490  size_t... Is>
2491 size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
2492  std::tuple<Reductions...> &ReduTuple,
2493  std::index_sequence<Is...> ReduIndices) {
2494  size_t NWorkGroups;
2495  size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups);
2496 
2497  bool Pow2WG = (WGSize & (WGSize - 1)) == 0;
2498  bool HasUniformWG = Pow2WG && (NWorkGroups * WGSize == NWorkItems);
2499  if (NWorkGroups == 1) {
2500  if (HasUniformWG)
2501  reduAuxCGFuncImpl<KernelName, KernelType, true, true>(
2502  CGH, NWorkItems, NWorkGroups, WGSize, ReduTuple, ReduIndices);
2503  else
2504  reduAuxCGFuncImpl<KernelName, KernelType, false, true>(
2505  CGH, NWorkItems, NWorkGroups, WGSize, ReduTuple, ReduIndices);
2506  } else {
2507  if (HasUniformWG)
2508  reduAuxCGFuncImpl<KernelName, KernelType, true, false>(
2509  CGH, NWorkItems, NWorkGroups, WGSize, ReduTuple, ReduIndices);
2510  else
2511  reduAuxCGFuncImpl<KernelName, KernelType, false, false>(
2512  CGH, NWorkItems, NWorkGroups, WGSize, ReduTuple, ReduIndices);
2513  }
2514  return NWorkGroups;
2515 }
2516 
2517 inline void
2519  std::shared_ptr<detail::queue_impl>, bool) {}
2520 
2521 template <typename Reduction, typename... RestT>
2522 std::enable_if_t<Reduction::is_usm>
2523 reduSaveFinalResultToUserMemHelper(std::vector<event> &Events,
2524  std::shared_ptr<detail::queue_impl> Queue,
2525  bool IsHost, Reduction &, RestT... Rest) {
2526  // Reductions initialized with USM pointer currently do not require copying
2527  // because the last kernel write directly to USM memory.
2528  reduSaveFinalResultToUserMemHelper(Events, Queue, IsHost, Rest...);
2529 }
2530 
2531 template <typename Reduction, typename... RestT>
2532 std::enable_if_t<!Reduction::is_usm> reduSaveFinalResultToUserMemHelper(
2533  std::vector<event> &Events, std::shared_ptr<detail::queue_impl> Queue,
2534  bool IsHost, Reduction &Redu, RestT... Rest) {
2535  if (Redu.hasUserDiscardWriteAccessor()) {
2536  handler CopyHandler(Queue, IsHost);
2537  auto InAcc = Redu.getReadAccToPreviousPartialReds(CopyHandler);
2538  auto OutAcc = Redu.getUserDiscardWriteAccessor();
2539  Redu.associateWithHandler(CopyHandler);
2540  if (!Events.empty())
2541  CopyHandler.depends_on(Events.back());
2542  CopyHandler.copy(InAcc, OutAcc);
2543  event CopyEvent = CopyHandler.finalize();
2544  Events.push_back(CopyEvent);
2545  }
2546  reduSaveFinalResultToUserMemHelper(Events, Queue, IsHost, Rest...);
2547 }
2548 
2553 template <typename... Reduction, size_t... Is>
2554 std::shared_ptr<event>
2555 reduSaveFinalResultToUserMem(std::shared_ptr<detail::queue_impl> Queue,
2556  bool IsHost, std::tuple<Reduction...> &ReduTuple,
2557  std::index_sequence<Is...>) {
2558  std::vector<event> Events;
2559  reduSaveFinalResultToUserMemHelper(Events, Queue, IsHost,
2560  std::get<Is>(ReduTuple)...);
2561  if (!Events.empty())
2562  return std::make_shared<event>(Events.back());
2563  return std::shared_ptr<event>();
2564 }
2565 
2566 template <typename Reduction> size_t reduGetMemPerWorkItemHelper(Reduction &) {
2567  return sizeof(typename Reduction::result_type);
2568 }
2569 
2570 template <typename Reduction, typename... RestT>
2571 size_t reduGetMemPerWorkItemHelper(Reduction &, RestT... Rest) {
2572  return sizeof(typename Reduction::result_type) +
2573  reduGetMemPerWorkItemHelper(Rest...);
2574 }
2575 
2576 template <typename... ReductionT, size_t... Is>
2577 size_t reduGetMemPerWorkItem(std::tuple<ReductionT...> &ReduTuple,
2578  std::index_sequence<Is...>) {
2579  return reduGetMemPerWorkItemHelper(std::get<Is>(ReduTuple)...);
2580 }
2581 
2584 template <typename TupleT, std::size_t... Is>
2585 std::tuple<std::tuple_element_t<Is, TupleT>...>
2586 tuple_select_elements(TupleT Tuple, std::index_sequence<Is...>) {
2587  return {std::get<Is>(std::move(Tuple))...};
2588 }
2589 
2590 } // namespace detail
2591 
2596 template <typename T, class BinaryOperation, int Dims, access::mode AccMode,
2597  access::placeholder IsPH>
2598 detail::reduction_impl<T, BinaryOperation, 0, 1,
2599  detail::default_reduction_algorithm<false, IsPH, Dims>>
2601  const T &Identity, BinaryOperation BOp) {
2602  return {Acc, Identity, BOp};
2603 }
2604 
2609 template <typename T, class BinaryOperation, int Dims, access::mode AccMode,
2610  access::placeholder IsPH>
2611 std::enable_if_t<detail::IsKnownIdentityOp<T, BinaryOperation>::value,
2612  detail::reduction_impl<
2613  T, BinaryOperation, 0, 1,
2614  detail::default_reduction_algorithm<false, IsPH, Dims>>>
2616  BinaryOperation) {
2617  return {Acc};
2618 }
2619 
2624 template <typename T, class BinaryOperation>
2625 detail::reduction_impl<
2626  T, BinaryOperation, 0, 1,
2627  detail::default_reduction_algorithm<true, access::placeholder::false_t, 1>>
2628 reduction(T *VarPtr, const T &Identity, BinaryOperation BOp) {
2629  return {VarPtr, Identity, BOp};
2630 }
2631 
2637 template <typename T, class BinaryOperation>
2640  detail::reduction_impl<T, BinaryOperation, 0, 1,
2641  detail::default_reduction_algorithm<
2642  true, access::placeholder::false_t, 1>>>
2643 reduction(T *VarPtr, BinaryOperation) {
2644  return {VarPtr};
2645 }
2646 
2647 // ---- has_known_identity
2648 template <typename BinaryOperation, typename AccumulatorT>
2650  : sycl::has_known_identity<BinaryOperation, AccumulatorT> {};
2651 
2652 template <typename BinaryOperation, typename AccumulatorT>
2655 
2656 // ---- known_identity
2657 template <typename BinaryOperation, typename AccumulatorT>
2658 struct known_identity : sycl::known_identity<BinaryOperation, AccumulatorT> {};
2659 
2660 template <typename BinaryOperation, typename AccumulatorT>
2663 
2664 } // namespace oneapi
2665 } // namespace ext
2666 
2667 #ifdef __SYCL_INTERNAL_API
2668 namespace __SYCL2020_DEPRECATED("use 'ext::oneapi' instead") ONEAPI {
2669  using namespace ext::oneapi;
2670  namespace detail {
2672  __SYCL_EXPORT size_t reduGetMaxWGSize(std::shared_ptr<queue_impl> Queue,
2673  size_t LocalMemBytesPerWorkItem);
2674  __SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
2675  size_t &NWorkGroups);
2676  } // namespace detail
2677 } // namespace ONEAPI
2678 #endif // __SYCL_INTERNAL_API
2679 } // namespace sycl
2680 } // __SYCL_INLINE_NAMESPACE(cl)
cl::sycl::detail::associateWithHandler
void associateWithHandler(handler &, AccessorBaseHost *, access::target)
Definition: handler_proxy.cpp:17
cl::sycl::nd_range
Defines the iteration domain of both the work-groups and the overall dispatch.
Definition: nd_range.hpp:23
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==0 &&Extent==1 &&View==false &&IsKnownIdentityOp< T, BinaryOperation >::value > >::getElement
T & getElement(size_t)
Definition: reduction.hpp:380
cl::sycl::ext::oneapi::detail::reduction_impl::reduction_impl
reduction_impl(T *VarPtr, const T &Identity, BinaryOperation, bool InitializeToIdentity=false)
Constructs reduction_impl when the identity value is statically known, and user still passed the iden...
Definition: reduction.hpp:949
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==1 &&View==false &&IsKnownIdentityOp< T, BinaryOperation >::value > >::getElement
T & getElement(size_t E)
Definition: reduction.hpp:461
cl::sycl::ext::oneapi::detail::reduction_impl::reduction_impl
reduction_impl(dw_accessor_type &Acc, const T &, BinaryOperation)
Constructs reduction_impl when the identity value is statically known, and user still passed the iden...
Definition: reduction.hpp:869
cl::sycl::detail::known_identity_impl
Definition: known_identity.hpp:122
cl::sycl::ext::oneapi::detail::reduction_impl::reduction_impl
reduction_impl(dw_accessor_type &Acc)
Constructs reduction_impl when the identity value is statically known.
Definition: reduction.hpp:801
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==0 &&Extent==1 &&View==false &&!IsKnownIdentityOp< T, BinaryOperation >::value > >::combine
void combine(const T &Partial)
Definition: reduction.hpp:340
cl::sycl::ext::oneapi::detail::reduction_impl_common::MIdentity
const T MIdentity
Identity of the BinaryOperation.
Definition: reduction.hpp:502
cl::sycl::ext::oneapi::detail::reduction_impl_algo
Templated class for implementations of specific reduction algorithms.
Definition: handler.hpp:245
cl::sycl::detail::IsBitOR
bool_constant< std::is_same< BinaryOperation, sycl::bit_or< T > >::value||std::is_same< BinaryOperation, sycl::bit_or< void > >::value > IsBitOR
Definition: known_identity.hpp:49
cl::sycl::detail::getDelinearizedId
id< 1 > getDelinearizedId(const range< 1 > &, size_t Index)
Definition: id.hpp:318
cl::sycl::ext::oneapi::detail::reduction_impl_algo< T, BinaryOperation, Dims, Extent, default_reduction_algorithm< IsUSM, IsPlaceholder, AccessorDims > >::binary_operation
BinaryOperation binary_operation
Definition: reduction.hpp:533
cl::sycl::ext::oneapi::detail::reducer
Class that is used to represent objects that are passed to user's lambda functions and representing u...
Definition: reduction.hpp:126
cl::sycl::ext::oneapi::detail::reduGetMemPerWorkItemHelper
size_t reduGetMemPerWorkItemHelper(Reduction &, RestT... Rest)
Definition: reduction.hpp:2571
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==0 &&Extent==1 &&View==false &&IsKnownIdentityOp< T, BinaryOperation >::value > >::getElement
const T & getElement(size_t) const
Definition: reduction.hpp:381
T
cl::sycl::info::device
device
Definition: info_desc.hpp:53
cl::sycl::detail::make_tuple
constexpr tuple< Ts... > make_tuple(Ts... Args)
Definition: tuple.hpp:36
cl::sycl::ext::oneapi::detail::filterSequenceHelper
constexpr auto filterSequenceHelper(FunctorT, std::index_sequence< Is... >)
For each index 'I' from the given indices pack 'Is' this function initially creates a number of short...
Definition: reduction.hpp:1973
__SYCL2020_DEPRECATED
#define __SYCL2020_DEPRECATED(message)
Definition: defines_elementary.hpp:56
cl::sycl::ext::oneapi::known_identity
Definition: reduction.hpp:2658
cl::sycl::ext::oneapi::detail::get_reduction_main_kernel_name_t
Helper structs to get additional kernel name types based on given Name and additional template parame...
Definition: reduction.hpp:1018
tuple.hpp
cl::sycl::access::placeholder
placeholder
Definition: access.hpp:43
cl::sycl::detail::IsPlus
bool_constant< std::is_same< BinaryOperation, sycl::plus< T > >::value||std::is_same< BinaryOperation, sycl::plus< void > >::value > IsPlus
Definition: known_identity.hpp:24
cl::sycl::ext::oneapi::detail::FilterElement::type
std::conditional_t< Cond, std::index_sequence< I >, std::index_sequence<> > type
Definition: reduction.hpp:1963
cl::sycl::ext::oneapi::detail::IsScalarReduction::Func
Definition: reduction.hpp:1994
cl::sycl::ext::oneapi::detail::reduSaveFinalResultToUserMem
std::enable_if_t<!Reduction::is_usm > reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu)
Copies the final reduction result kept in read-write accessor to user's accessor.
Definition: reduction.hpp:1731
cl::sycl::ext::oneapi::known_identity_v
__SYCL_INLINE_CONSTEXPR AccumulatorT known_identity_v
Definition: reduction.hpp:2661
cl::sycl::ext::oneapi::detail::reduction_impl_algo< T, BinaryOperation, Dims, Extent, default_reduction_algorithm< IsUSM, IsPlaceholder, AccessorDims > >::getWriteMemForPartialReds
std::enable_if_t< IsOneWG &&!_IsUSM, rw_accessor_type > getWriteMemForPartialReds(size_t, handler &CGH)
Returns user's accessor passed to reduction for editing if that is the read-write accessor.
Definition: reduction.hpp:614
cl::sycl::ext::oneapi::detail::reduction_impl::reduction_impl
reduction_impl(rw_accessor_type &Acc, const T &Identity, BinaryOperation BOp)
Constructs reduction_impl when the identity value is unknown.
Definition: reduction.hpp:910
cl::sycl::ext::oneapi::detail::__sycl_reduction_main_kernel
These are the forward declaration for the classes that help to create names for additional kernels.
Definition: reduction.hpp:1009
cl::sycl::nd_item::get_local_linear_id
size_t get_local_linear_id() const
Definition: nd_item.hpp:62
cl::sycl::detail::workGroupBarrier
static void workGroupBarrier()
Definition: group.hpp:33
cl::sycl::ext::oneapi::detail::reduction_impl::reduction_impl
reduction_impl(T *VarPtr, bool InitializeToIdentity=false)
Constructs reduction_impl when the identity value is statically known.
Definition: reduction.hpp:937
cl::sycl::ext::oneapi::detail::combiner::atomic_combine
enable_if_t< std::is_same< typename remove_AS< _T >::type, T >::value &&IsReduOptForFastAtomicFetch< T, _BinaryOperation >::value &&sycl::detail::IsMaximum< T, _BinaryOperation >::value &&(Space==access::address_space::global_space||Space==access::address_space::local_space)> atomic_combine(_T *ReduVarPtr) const
Atomic MAX operation: *ReduVarPtr = sycl::maximum(*ReduVarPtr, MValue);.
Definition: reduction.hpp:312
cl::sycl::nd_item::get_group_linear_id
size_t __SYCL_ALWAYS_INLINE get_group_linear_id() const
Definition: nd_item.hpp:78
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==1 &&View==false &&IsKnownIdentityOp< T, BinaryOperation >::value > >::getIdentity
static T getIdentity()
Definition: reduction.hpp:457
cl::sycl::detail::IsKnownIdentityOp
bool_constant< IsZeroIdentityOp< T, BinaryOperation >::value||IsOneIdentityOp< T, BinaryOperation >::value||IsOnesIdentityOp< T, BinaryOperation >::value||IsMinimumIdentityOp< T, BinaryOperation >::value||IsMaximumIdentityOp< T, BinaryOperation >::value||IsFalseIdentityOp< T, BinaryOperation >::value||IsTrueIdentityOp< T, BinaryOperation >::value > IsKnownIdentityOp
Definition: known_identity.hpp:114
cl::sycl::reduce_over_group
detail::enable_if_t<(is_group_v< std::decay_t< Group >> &&detail::is_scalar_arithmetic< T >::value &&detail::is_native_op< T, BinaryOperation >::value), T > reduce_over_group(Group, T x, BinaryOperation binary_op)
Definition: group_algorithm.hpp:176
cl::sycl::ext::oneapi::detail::createReduLocalAccs
auto createReduLocalAccs(size_t Size, handler &CGH, std::index_sequence< Is... >)
For the given 'Reductions' types pack and indices enumerating only the reductions for which a local a...
Definition: reduction.hpp:1767
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==0 &&Extent==1 &&View==false &&!IsKnownIdentityOp< T, BinaryOperation >::value > >::getElement
T & getElement(size_t)
Definition: reduction.hpp:344
accessor_property_list.hpp
cl::sycl::ext::oneapi::detail::reduSaveFinalResultToUserMemHelper
std::enable_if_t< Reduction::is_usm > reduSaveFinalResultToUserMemHelper(std::vector< event > &Events, std::shared_ptr< detail::queue_impl > Queue, bool IsHost, Reduction &, RestT... Rest)
Definition: reduction.hpp:2523
cl::sycl::detail::is_contained
Definition: type_list.hpp:54
cl::sycl::ext::oneapi::detail::filterSequence
constexpr auto filterSequence(FunctorT F, std::index_sequence< Is... > Indices)
For each index 'I' from the given indices pack 'Is' this function returns an index sequence consistin...
Definition: reduction.hpp:1989
cl::sycl::ext::oneapi::detail::writeReduSumsToOutAccs
void writeReduSumsToOutAccs(size_t OutAccIndex, size_t WGSize, std::tuple< Reductions... > *, ReduTupleT< OutAccT... > OutAccs, ReduTupleT< LocalAccT... > LocalAccs, ReduTupleT< BOPsT... > BOPs, ReduTupleT< Ts... > IdentityVals, std::array< bool, sizeof...(Reductions)> IsInitializeToIdentity, std::index_sequence< Is... >)
Definition: reduction.hpp:1894
cl::sycl::ext::oneapi::detail::createReducers
std::tuple< typename Reductions::reducer_type... > createReducers(ReduTupleT< typename Reductions::result_type... > Identities, ReduTupleT< typename Reductions::binary_operation... > BOPsTuple, std::index_sequence< Is... >)
Definition: reduction.hpp:1821
cl::sycl::ext::oneapi::detail::reduction_impl_algo< T, BinaryOperation, Dims, Extent, default_reduction_algorithm< IsUSM, IsPlaceholder, AccessorDims > >::getUserDiscardWriteAccessor
std::enable_if_t<!_IsUSM, dw_accessor_type & > getUserDiscardWriteAccessor()
Definition: reduction.hpp:691
cl::sycl::nd_range::get_group_range
range< dimensions > get_group_range() const
Definition: nd_range.hpp:44
cl::sycl::ext::oneapi::detail::initReduLocalAccs
void initReduLocalAccs(size_t LID, size_t GID, size_t NWorkItems, size_t WGSize, ReduTupleT< InputAccT... > LocalAccs, ReduTupleT< LocalAccT... > InputAccs, ReduTupleT< ResultT... > Identities, std::index_sequence< Is... >)
Definition: reduction.hpp:1855
cl::sycl::ext::oneapi::detail::reduction_impl::reduction_impl
reduction_impl(rw_accessor_type &Acc)
Constructs reduction_impl when the identity value is statically known.
Definition: reduction.hpp:788
cl::sycl::ext::oneapi::detail::reduAuxCGFuncImplArray
void reduAuxCGFuncImplArray(nd_item< Dims > NDIt, size_t LID, size_t GID, size_t NWorkItems, size_t WGSize, ReduTupleT< LocalAccT... > LocalAccsTuple, ReduTupleT< InAccT... > InAccsTuple, ReduTupleT< OutAccT... > OutAccsTuple, ReduTupleT< Ts... > IdentitiesTuple, ReduTupleT< BOPsT... > BOPsTuple, std::array< bool, sizeof...(Reductions)> InitToIdentityProps, std::index_sequence< Is... >)
Definition: reduction.hpp:2416
sycl
Definition: invoke_simd.hpp:68
cl::sycl::ext::oneapi::detail::getReadAccsToPreviousPartialReds
auto getReadAccsToPreviousPartialReds(handler &CGH, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
For the given 'Reductions' types pack and indices enumerating them this function returns accessors to...
Definition: reduction.hpp:1792
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==1 &&View==false &&!IsKnownIdentityOp< T, BinaryOperation >::value > >::reducer
reducer(const T &Identity, BinaryOperation BOp)
Definition: reduction.hpp:415
bool_constant
cl::sycl::multi_ptr
Provides constructors for address space qualified and non address space qualified pointers to allow i...
Definition: atomic.hpp:32
cl::sycl::ext::oneapi::detail::concat_sequences
constexpr auto concat_sequences(std::index_sequence< Is... >, std::index_sequence< Js... >, Rs...)
Definition: reduction.hpp:1946
cl::sycl::ext::oneapi::detail::reduction_impl::reduction_impl
reduction_impl(T *VarPtr, const T &Identity, BinaryOperation BOp, bool InitializeToIdentity=false)
Constructs reduction_impl when the identity value is unknown.
Definition: reduction.hpp:972
cl::sycl::ext::oneapi::detail::reduction_impl_algo< T, BinaryOperation, Dims, Extent, default_reduction_algorithm< IsUSM, IsPlaceholder, AccessorDims > >::getReadWriteLocalAcc
static accessor< _T, buffer_dim, access::mode::read_write, access::target::local > getReadWriteLocalAcc(size_t Size, handler &CGH)
Creates and returns a local accessor with the Size elements.
Definition: reduction.hpp:592
cl::sycl::has_known_identity
Definition: known_identity.hpp:379
cl::sycl::ext::oneapi::detail::callReduUserKernelFunc
void callReduUserKernelFunc(KernelType KernelFunc, nd_item< Dims > NDIt, std::tuple< ReducerT... > &Reducers, std::index_sequence< Is... >)
Definition: reduction.hpp:1829
cl::sycl::detail::IsMultiplies
bool_constant< std::is_same< BinaryOperation, sycl::multiplies< T > >::value||std::is_same< BinaryOperation, sycl::multiplies< void > >::value > IsMultiplies
Definition: known_identity.hpp:29
cl::sycl::ext::oneapi::detail::IsReduOptForFastReduce
bool_constant<((sycl::detail::is_sgeninteger< T >::value &&(sizeof(T)==4||sizeof(T)==8))||sycl::detail::is_sgenfloat< T >::value) &&(sycl::detail::IsPlus< T, BinaryOperation >::value||sycl::detail::IsMinimum< T, BinaryOperation >::value||sycl::detail::IsMaximum< T, BinaryOperation >::value)> IsReduOptForFastReduce
Definition: reduction.hpp:94
cl::sycl::buffer
Defines a shared array that can be used by kernels in queues.
Definition: buffer.hpp:58
cl::sycl::nd_item::get_local_range
range< dimensions > get_local_range() const
Definition: nd_item.hpp:98
cl::sycl::ext::oneapi::detail::reduction_impl_base
Base non-template class which is a base class for all reduction implementation classes.
Definition: reduction.hpp:470
cl::sycl::nd_item::barrier
void barrier(access::fence_space accessSpace=access::fence_space::global_and_local) const
Definition: nd_item.hpp:112
cl::sycl::ext::oneapi::detail::IsArrayReduction
Definition: reduction.hpp:2000
cl::sycl::ext::oneapi::detail::IsReduOptForFastAtomicFetch
bool_constant< sycl::detail::is_sgeninteger< T >::value &&sycl::detail::IsValidAtomicType< T >::value &&(sycl::detail::IsPlus< T, BinaryOperation >::value||sycl::detail::IsMinimum< T, BinaryOperation >::value||sycl::detail::IsMaximum< T, BinaryOperation >::value||sycl::detail::IsBitOR< T, BinaryOperation >::value||sycl::detail::IsBitXOR< T, BinaryOperation >::value||sycl::detail::IsBitAND< T, BinaryOperation >::value)> IsReduOptForFastAtomicFetch
Definition: reduction.hpp:54
cl::sycl::range< 1 >
cl::sycl::ext::oneapi::detail::reduCGFuncImplArray
void reduCGFuncImplArray(nd_item< Dims > NDIt, ReduTupleT< LocalAccT... > LocalAccsTuple, ReduTupleT< OutAccT... > OutAccsTuple, std::tuple< ReducerT... > &ReducersTuple, ReduTupleT< Ts... > IdentitiesTuple, ReduTupleT< BOPsT... > BOPsTuple, std::array< bool, sizeof...(Reductions)> InitToIdentityProps, std::index_sequence< Is... >)
Definition: reduction.hpp:2114
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==0 &&Extent==1 &&View==false &&!IsKnownIdentityOp< T, BinaryOperation >::value > >::getIdentity
T getIdentity() const
Definition: reduction.hpp:342
cl::sycl::ext::oneapi::detail::ReducerTraits
Helper class for accessing reducer-defined types in CRTP May prove to be useful for other things late...
Definition: reduction.hpp:130
cl::sycl::detail::IsBitAND
bool_constant< std::is_same< BinaryOperation, sycl::bit_and< T > >::value||std::is_same< BinaryOperation, sycl::bit_and< void > >::value > IsBitAND
Definition: known_identity.hpp:44
cl::sycl::detail::IsMinimum
bool_constant< std::is_same< BinaryOperation, sycl::minimum< T > >::value||std::is_same< BinaryOperation, sycl::minimum< void > >::value > IsMinimum
Definition: known_identity.hpp:34
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==0 &&Extent==1 &&View==false &&!IsKnownIdentityOp< T, BinaryOperation >::value > >::MValue
T MValue
Definition: reduction.hpp:346
cl::sycl::handler::depends_on
void depends_on(event Event)
Registers event dependencies on this command group.
Definition: handler.cpp:799
cl::sycl::buffer::size
size_t size() const noexcept
Definition: buffer.hpp:376
cl::sycl::ext::oneapi::detail::reduction_impl::reduction_impl
reduction_impl(rw_accessor_type &Acc, const T &, BinaryOperation)
Constructs reduction_impl when the identity value is statically known, and user still passed the iden...
Definition: reduction.hpp:844
cl::sycl::ext::oneapi::detail::reduction_impl_common::InitializeToIdentity
bool InitializeToIdentity
Definition: reduction.hpp:505
cl::sycl::range::size
size_t size() const
Definition: range.hpp:50
cl::sycl::ext::oneapi::detail::reduGetMaxWGSize
size_t reduGetMaxWGSize(std::shared_ptr< queue_impl > Queue, size_t LocalMemBytesPerWorkItem)
cl::sycl::memory_scope
memory_scope
Definition: memory_enums.hpp:26
cl::sycl::ext::oneapi::detail::reduCGFunc
void reduCGFunc(handler &CGH, KernelType KernelFunc, const range< Dims > &Range, size_t MaxWGSize, uint32_t NumConcurrentWorkGroups, Reduction &Redu)
Definition: reduction.hpp:1280
cl::sycl::ext::oneapi::detail::reduction_impl_algo< T, BinaryOperation, Dims, Extent, default_reduction_algorithm< IsUSM, IsPlaceholder, AccessorDims > >::getUSMPointer
result_type * getUSMPointer()
Definition: reduction.hpp:695
cl::sycl::handler::require
void require(accessor< DataT, Dims, AccMode, AccTarget, access::placeholder::true_t > Acc)
Requires access to the memory object associated with the placeholder accessor.
Definition: handler.hpp:1359
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==1 &&View==false &&!IsKnownIdentityOp< T, BinaryOperation >::value > >::getElement
const T & getElement(size_t E) const
Definition: reduction.hpp:427
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==0 &&Extent==1 &&View==false &&IsKnownIdentityOp< T, BinaryOperation >::value > >::MValue
T MValue
Definition: reduction.hpp:382
cl::sycl::handler::parallel_for
void parallel_for(range< 1 > NumWorkItems, KernelType KernelFunc)
Definition: handler.hpp:1456
cl::sycl::detail::conditional_t
typename std::conditional< B, T, F >::type conditional_t
Definition: stl_type_traits.hpp:27
cl::sycl::ext::oneapi::detail::reduceReduLocalAccs
void reduceReduLocalAccs(size_t IndexA, size_t IndexB, ReduTupleT< LocalAccT... > LocalAccs, ReduTupleT< BOPsT... > BOPs, std::index_sequence< Is... >)
Definition: reduction.hpp:1882
cl::sycl::ext::oneapi::detail::getInitToIdentityProperties
std::array< bool, sizeof...(Reductions)> getInitToIdentityProperties(std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
Definition: reduction.hpp:1814
kernel.hpp
cl::sycl::ext::oneapi::detail::tuple_select_elements
std::tuple< std::tuple_element_t< Is, TupleT >... > tuple_select_elements(TupleT Tuple, std::index_sequence< Is... >)
Utility function: for the given tuple.
Definition: reduction.hpp:2586
cl::sycl::nd_item::get_global_range
range< dimensions > get_global_range() const
Definition: nd_item.hpp:92
cl::sycl::ext::oneapi::detail::EmptyReductionPredicate::Func
Definition: reduction.hpp:1958
cl::sycl::detail::bool_constant
std::integral_constant< bool, V > bool_constant
Definition: stl_type_traits.hpp:40
cl::sycl::ext::oneapi::detail::__sycl_reduction_aux_kernel
Definition: reduction.hpp:1011
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==1 &&View==false &&IsKnownIdentityOp< T, BinaryOperation >::value > >::operator[]
reducer< T, BinaryOperation, Dims - 1, Extent, Algorithm, true > operator[](size_t Index)
Definition: reduction.hpp:453
cl::sycl::ext::oneapi::detail::reduction_impl_algo< T, BinaryOperation, Dims, Extent, default_reduction_algorithm< IsUSM, IsPlaceholder, AccessorDims > >::getReadWriteAccessorToInitializedGroupsCounter
accessor< int, 1, access::mode::read_write, access::target::device, access::placeholder::false_t > getReadWriteAccessorToInitializedGroupsCounter(handler &CGH)
Definition: reduction.hpp:674
cl::sycl::ext::oneapi::detail::associateReduAccsWithHandlerHelper
void associateReduAccsWithHandlerHelper(handler &CGH, ReductionT &Redu, RestT &... Rest)
Definition: reduction.hpp:2282
cl::sycl::ext::oneapi::detail::ReducerTraits< reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, Subst > >::type
T type
Definition: reduction.hpp:136
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==0 &&View==true > >::combine
void combine(const T &Partial)
Definition: reduction.hpp:396
cl::sycl::ext::oneapi::detail::combiner::atomic_combine
enable_if_t< std::is_same< typename remove_AS< _T >::type, T >::value &&IsReduOptForFastAtomicFetch< T, _BinaryOperation >::value &&sycl::detail::IsMinimum< T, _BinaryOperation >::value &&(Space==access::address_space::global_space||Space==access::address_space::local_space)> atomic_combine(_T *ReduVarPtr) const
Atomic MIN operation: *ReduVarPtr = sycl::minimum(*ReduVarPtr, MValue);.
Definition: reduction.hpp:295
cl::sycl::ext::oneapi::detail::reduction_impl_algo< T, BinaryOperation, Dims, Extent, default_reduction_algorithm< IsUSM, IsPlaceholder, AccessorDims > >::getWriteMemForPartialReds
std::enable_if_t<!IsOneWG, rw_accessor_type > getWriteMemForPartialReds(size_t Size, handler &CGH)
Constructs a new temporary buffer to hold partial sums and returns the accessor for that buffer.
Definition: reduction.hpp:624
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==0 &&Extent==1 &&View==false &&IsKnownIdentityOp< T, BinaryOperation >::value > >::getIdentity
static T getIdentity()
Definition: reduction.hpp:376
cl::sycl::ext::oneapi::detail::IsArrayReduction::Func
Definition: reduction.hpp:2001
cl::sycl::ext::oneapi::detail::reduAuxCGFunc
enable_if_t<!Reduction::has_fast_atomics, size_t > reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize, Reduction &Redu)
Implements a command group function that enqueues a kernel that does one iteration of reduction of el...
Definition: reduction.hpp:1696
cl::sycl::ext::oneapi::detail::AreAllButLastReductions
Predicate returning true if all template type parameters except the last one are reductions.
Definition: handler.hpp:322
cl::sycl::ext::oneapi::detail::associateReduAccsWithHandler
void associateReduAccsWithHandler(handler &CGH, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
Definition: reduction.hpp:2289
cl::sycl::accessor
Buffer accessor.
Definition: accessor.hpp:223
cl::sycl::ext::oneapi::detail::reduction_impl_common::getIdentity
constexpr enable_if_t< IsKnownIdentityOp< _T, _BinaryOperation >::value, _T > getIdentity()
Returns the statically known identity value.
Definition: reduction.hpp:484
cl::sycl::ext::oneapi::detail::getReduIdentities
ReduTupleT< typename Reductions::result_type... > getReduIdentities(std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
Definition: reduction.hpp:1801
cl::sycl::ext::oneapi::detail::reduction_impl_algo< T, BinaryOperation, Dims, Extent, default_reduction_algorithm< IsUSM, IsPlaceholder, AccessorDims > >::getReadAccToPreviousPartialReds
accessor< T, buffer_dim, access::mode::read > getReadAccToPreviousPartialReds(handler &CGH) const
Definition: reduction.hpp:597
cl::sycl::ext::oneapi::detail::reduction_impl_algo< T, BinaryOperation, Dims, Extent, default_reduction_algorithm< IsUSM, IsPlaceholder, AccessorDims > >::getUserReadWriteAccessor
std::enable_if_t<!_IsUSM, rw_accessor_type & > getUserReadWriteAccessor()
Definition: reduction.hpp:686
cl::sycl::handler::copy
void copy(accessor< T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder > Src, std::shared_ptr< T_Dst > Dst)
Copies the content of memory object accessed by Src into the memory pointed by Dst.
Definition: handler.hpp:2281
cl
We provide new interfaces for matrix muliply in this patch:
Definition: access.hpp:13
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==1 &&View==false &&IsKnownIdentityOp< T, BinaryOperation >::value > >::reducer
reducer()
Definition: reduction.hpp:447
cl::sycl::ext::oneapi::detail::reduCGFuncImplAtomic64
std::enable_if_t< Reduction::has_atomic_add_float64 > reduCGFuncImplAtomic64(handler &CGH, KernelType KernelFunc, const nd_range< Dims > &Range, Reduction &, typename Reduction::rw_accessor_type Out)
Definition: reduction.hpp:2230
cl::sycl::ext::oneapi::detail::combiner::atomic_combine
enable_if_t< std::is_same< typename remove_AS< _T >::type, T >::value &&IsReduOptForFastAtomicFetch< T, _BinaryOperation >::value &&sycl::detail::IsBitAND< T, _BinaryOperation >::value &&(Space==access::address_space::global_space||Space==access::address_space::local_space)> atomic_combine(_T *ReduVarPtr) const
Atomic BITWISE AND operation: *ReduVarPtr &= MValue;.
Definition: reduction.hpp:278
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==1 &&View==false &&!IsKnownIdentityOp< T, BinaryOperation >::value > >::operator[]
reducer< T, BinaryOperation, Dims - 1, Extent, Algorithm, true > operator[](size_t Index)
Definition: reduction.hpp:421
cl::sycl::ext::oneapi::detail::reductionLoop
void reductionLoop(const range< Dims > &Range, ReducerT &Reducer, const nd_item< 1 > &NdId, KernelFunc &F)
Called in device code.
Definition: reduction.hpp:1042
cl::sycl::detail::IsBitXOR
bool_constant< std::is_same< BinaryOperation, sycl::bit_xor< T > >::value||std::is_same< BinaryOperation, sycl::bit_xor< void > >::value > IsBitXOR
Definition: known_identity.hpp:54
cl::sycl::nd_item::get_group
group< dimensions > get_group() const
Definition: nd_item.hpp:68
cl::sycl::ext::oneapi::detail::reduAuxCGFuncImpl
void reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups, size_t WGSize, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... > ReduIndices)
Definition: reduction.hpp:2435
cl::sycl::ext::oneapi::detail::reduCGFuncImplArrayHelper
void reduCGFuncImplArrayHelper(nd_item< Dims > NDIt, LocalAccT LocalReds, OutAccT Out, ReducerT &Reducer, T Identity, BOPT BOp, bool IsInitializeToIdentity)
Each array reduction is processed separately.
Definition: reduction.hpp:2053
cl::sycl::ext::oneapi::detail::reduction_impl_algo< T, BinaryOperation, Dims, Extent, default_reduction_algorithm< IsUSM, IsPlaceholder, AccessorDims > >::getWriteMemForPartialReds
std::enable_if_t< IsOneWG &&_IsUSM, result_type * > getWriteMemForPartialReds(size_t, handler &)
Returns user's USM pointer passed to reduction for editing.
Definition: reduction.hpp:605
cl::sycl::detail::tie
auto tie(Ts &... Args)
Definition: tuple.hpp:40
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==0 &&Extent==1 &&View==false &&!IsKnownIdentityOp< T, BinaryOperation >::value > >::getElement
const T & getElement(size_t) const
Definition: reduction.hpp:345
cl::sycl::ext::oneapi::detail::reduCGFuncAtomic64
enable_if_t< Reduction::has_atomic_add_float64 > reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc, const nd_range< Dims > &Range, Reduction &Redu)
Definition: reduction.hpp:2265
cl::sycl::atomic_ref
Definition: atomic_ref.hpp:661
cl::sycl::ext::oneapi::detail::reduction_impl_common::MBinaryOp
BinaryOperation MBinaryOp
Definition: reduction.hpp:504
cl::sycl::ext::oneapi::detail::combiner::atomic_combine
enable_if_t< std::is_same< typename remove_AS< _T >::type, T >::value &&IsReduOptForFastAtomicFetch< T, _BinaryOperation >::value &&sycl::detail::IsBitOR< T, _BinaryOperation >::value &&(Space==access::address_space::global_space||Space==access::address_space::local_space)> atomic_combine(_T *ReduVarPtr) const
Atomic BITWISE OR operation: *ReduVarPtr |= MValue;.
Definition: reduction.hpp:244
cl::sycl::ext::oneapi::detail::reduction_impl
This class encapsulates the reduction variable/accessor, the reduction operator and an optional opera...
Definition: reduction.hpp:754
cl::sycl::ext::oneapi::detail::combiner
Use CRTP to avoid redefining shorthand operators in terms of combine.
Definition: reduction.hpp:153
cl::sycl::ext::oneapi::detail::createReduOutAccs
auto createReduOutAccs(size_t NWorkGroups, handler &CGH, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
For the given 'Reductions' types pack and indices enumerating them this function either creates new t...
Definition: reduction.hpp:1778
cl::sycl::ext::oneapi::detail::reduction_impl::rw_accessor_type
typename algo::rw_accessor_type rw_accessor_type
Definition: reduction.hpp:762
cl::sycl::handler
Command group handler class.
Definition: handler.hpp:361
cl::sycl::ext::oneapi::accessor_property_list
Objects of the accessor_property_list class are containers for the SYCL properties.
Definition: property_list.hpp:19
cl::sycl::accessor::get_pointer
DataT * get_pointer() const
Definition: accessor.hpp:1777
cl::sycl::detail::queue_impl
Definition: queue_impl.hpp:54
cl::sycl::ext::oneapi::detail::reduction_impl::reduction_impl
reduction_impl(dw_accessor_type &Acc, const T &Identity, BinaryOperation BOp)
Constructs reduction_impl when the identity value is unknown.
Definition: reduction.hpp:922
cl::sycl::access::address_space
address_space
Definition: access.hpp:45
cl::sycl::nd_item::get_global_linear_id
size_t __SYCL_ALWAYS_INLINE get_global_linear_id() const
Definition: nd_item.hpp:48
accessor.hpp
cl::sycl::ext::oneapi::detail::getReduBOPs
ReduTupleT< typename Reductions::binary_operation... > getReduBOPs(std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
Definition: reduction.hpp:1808
cl::sycl::ext::oneapi::detail::IsNonUsmReductionPredicate::Func
Definition: reduction.hpp:1952
cl::sycl::ext::oneapi::detail::ReducerTraits< reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, Subst > >::op
BinaryOperation op
Definition: reduction.hpp:137
cl::sycl::ext::oneapi::detail::reduction_impl::dw_accessor_type
typename algo::dw_accessor_type dw_accessor_type
Definition: reduction.hpp:763
cl::sycl::nd_item::get_global_id
id< dimensions > get_global_id() const
Definition: nd_item.hpp:40
KernelFunc
std::function< void(const sycl::nd_item< NDims > &)> KernelFunc
Definition: pi_esimd_emulator.cpp:166
cl::sycl::detail::InitializeToIdentity
@ InitializeToIdentity
Definition: property_helper.hpp:34
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==0 &&View==true > >::reducer
reducer(T &Ref, BinaryOperation BOp)
Definition: reduction.hpp:394
cl::sycl::ext::oneapi::detail::makeReduTupleT
ReduTupleT< Ts... > makeReduTupleT(Ts... Elements)
Definition: reduction.hpp:102
PI_INVALID_VALUE
@ PI_INVALID_VALUE
Definition: pi.h:91
cl::sycl::detail::remove_AS
Definition: access.hpp:204
cl::sycl::detail::tuple< Ts... >
atomic.hpp
cl::sycl::ext::oneapi::detail::reduction_impl_algo< T, BinaryOperation, Dims, Extent, default_reduction_algorithm< IsUSM, IsPlaceholder, AccessorDims > >::hasUserDiscardWriteAccessor
bool hasUserDiscardWriteAccessor()
Definition: reduction.hpp:683
cl::sycl::detail::auto_name
This class is the default KernelName template parameter type for kernel invocation APIs such as singl...
Definition: kernel.hpp:35
cl::sycl::ext::oneapi::detail::ReduTupleT
sycl::detail::tuple< Ts... > ReduTupleT
Definition: reduction.hpp:101
cl::sycl::ext::oneapi::detail::reduction_impl_algo< T, BinaryOperation, Dims, Extent, default_reduction_algorithm< IsUSM, IsPlaceholder, AccessorDims > >::getOutPointer
static result_type * getOutPointer(result_type *OutPtr)
Definition: reduction.hpp:704
cl::sycl::nd_item
Identifies an instance of the function object executing at each point in an nd_range.
Definition: helpers.hpp:32
handler.hpp
cl::sycl::ext::oneapi::detail::reduction_impl_algo< T, BinaryOperation, Dims, Extent, default_reduction_algorithm< IsUSM, IsPlaceholder, AccessorDims > >::reduction_impl_algo
reduction_impl_algo(const T &Identity, BinaryOperation BinaryOp, bool Init, std::shared_ptr< rw_accessor_type > AccPointer)
Definition: reduction.hpp:561
cl::sycl::ext::oneapi::detail::combiner::atomic_combine
enable_if_t< std::is_same< typename remove_AS< _T >::type, T >::value &&IsReduOptForFastAtomicFetch< T, _BinaryOperation >::value &&sycl::detail::IsBitXOR< T, _BinaryOperation >::value &&(Space==access::address_space::global_space||Space==access::address_space::local_space)> atomic_combine(_T *ReduVarPtr) const
Atomic BITWISE XOR operation: *ReduVarPtr ^= MValue;.
Definition: reduction.hpp:261
cl::sycl::detail::IsMaximum
bool_constant< std::is_same< BinaryOperation, sycl::maximum< T > >::value||std::is_same< BinaryOperation, sycl::maximum< void > >::value > IsMaximum
Definition: known_identity.hpp:39
cl::sycl::ext::oneapi::detail::reduction_impl_algo< T, BinaryOperation, Dims, Extent, default_reduction_algorithm< IsUSM, IsPlaceholder, AccessorDims > >::getOutPointer
static result_type * getOutPointer(const rw_accessor_type &OutAcc)
Definition: reduction.hpp:700
std
Definition: accessor.hpp:2616
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==0 &&Extent==1 &&View==false &&IsKnownIdentityOp< T, BinaryOperation >::value > >::combine
void combine(const T &Partial)
Definition: reduction.hpp:371
cl::sycl::ext::oneapi::detail::IsNonUsmReductionPredicate
Definition: reduction.hpp:1951
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==1 &&View==false &&IsKnownIdentityOp< T, BinaryOperation >::value > >::reducer
reducer(const T &, BinaryOperation)
Definition: reduction.hpp:448
cl::sycl::ext::oneapi::detail::reduAuxCGFuncImplArrayHelper
void reduAuxCGFuncImplArrayHelper(nd_item< Dims > NDIt, size_t LID, size_t GID, size_t NWorkItems, size_t WGSize, LocalAccT LocalReds, InAccT In, OutAccT Out, T Identity, BOPT BOp, bool IsInitializeToIdentity)
Definition: reduction.hpp:2339
__SYCL_INLINE_CONSTEXPR
#define __SYCL_INLINE_CONSTEXPR
Definition: defines_elementary.hpp:65
cl::sycl::ext::oneapi::detail::reduction_impl_common::reduction_impl_common
reduction_impl_common(const T &Identity, BinaryOperation BinaryOp, bool Init=false)
Definition: reduction.hpp:476
reduce
_Tp reduce(const simd< _Tp, _Abi > &, _BinaryOp=_BinaryOp())
cl::sycl::marray< T, Extent >
group_algorithm.hpp
cl::sycl::ext::oneapi::detail::reduCGFuncImpl
void reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range< Dims > &Range, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... > ReduIndices)
Definition: reduction.hpp:2131
cl::sycl::ext::oneapi::detail::reduction_impl_common::initializeToIdentity
bool initializeToIdentity() const
Definition: reduction.hpp:497
cl::sycl::ext::oneapi::has_known_identity_v
__SYCL_INLINE_CONSTEXPR bool has_known_identity_v
Definition: reduction.hpp:2653
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==0 &&Extent==1 &&View==false &&IsKnownIdentityOp< T, BinaryOperation >::value > >::reducer
reducer()
Definition: reduction.hpp:368
cl::sycl::handler::single_task
void single_task(KernelType KernelFunc)
Defines and invokes a SYCL kernel function as a function object type.
Definition: handler.hpp:1435
cl::sycl::ext::oneapi::detail::reduction_impl_common
Templated class for common functionality of all reduction implementation classes.
Definition: reduction.hpp:474
cl::sycl::known_identity
Definition: known_identity.hpp:389
cl::sycl::ext::oneapi::detail::reduction_impl_algo< T, BinaryOperation, Dims, Extent, default_reduction_algorithm< IsUSM, IsPlaceholder, AccessorDims > >::reduction_impl_algo
reduction_impl_algo(const T &Identity, BinaryOperation BinaryOp, bool Init, T *USMPointer)
Definition: reduction.hpp:567
cl::sycl::detail::tuple_element_t
typename tuple_element< I, T >::type tuple_element_t
Definition: tuple.hpp:56
cl::sycl::ext::oneapi::detail::FilterElement
Definition: reduction.hpp:1961
cl::sycl::access::mode
mode
Definition: access.hpp:28
cl::sycl::instead
std::uint8_t instead
Definition: aliases.hpp:68
cl::sycl::ext::oneapi::detail::reduGetMemPerWorkItem
size_t reduGetMemPerWorkItem(std::tuple< ReductionT... > &ReduTuple, std::index_sequence< Is... >)
Definition: reduction.hpp:2577
cl::sycl::ext::oneapi::detail::reduction_impl_algo< T, BinaryOperation, Dims, Extent, default_reduction_algorithm< IsUSM, IsPlaceholder, AccessorDims > >::result_type
T result_type
Definition: reduction.hpp:532
cl::sycl::ext::oneapi::detail::reduction_impl_common::getIdentity
enable_if_t<!IsKnownIdentityOp< _T, _BinaryOperation >::value, _T > getIdentity()
Returns the identity value given by user.
Definition: reduction.hpp:491
cl::sycl::ext::oneapi::detail::reduction_impl::reducer_type
typename algo::reducer_type reducer_type
Definition: reduction.hpp:761
cl::sycl::ext::oneapi::detail::reduction_impl::reduction_impl
reduction_impl(buffer< _T, 1, AllocatorT > Buffer, handler &CGH, const T &, BinaryOperation, bool InitializeToIdentity)
SYCL-2020.
Definition: reduction.hpp:816
cl::sycl::ext::oneapi::detail::IsReduOptForAtomic64Add
bool_constant< sycl::detail::IsPlus< T, BinaryOperation >::value &&sycl::detail::is_sgenfloat< T >::value &&(sizeof(T)==4||sizeof(T)==8)> IsReduOptForAtomic64Add
Definition: reduction.hpp:76
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==1 &&View==false &&IsKnownIdentityOp< T, BinaryOperation >::value > >::getElement
const T & getElement(size_t E) const
Definition: reduction.hpp:462
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==0 &&Extent==1 &&View==false &&IsKnownIdentityOp< T, BinaryOperation >::value > >::reducer
reducer(const T &, BinaryOperation)
Definition: reduction.hpp:369
cl::sycl::ext::oneapi::reduction
std::enable_if_t< detail::IsKnownIdentityOp< T, BinaryOperation >::value, detail::reduction_impl< T, BinaryOperation, 0, 1, detail::default_reduction_algorithm< true, access::placeholder::false_t, 1 > > > reduction(T *VarPtr, BinaryOperation)
Creates and returns an object implementing the reduction functionality.
Definition: reduction.hpp:2643
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==1 &&View==false &&!IsKnownIdentityOp< T, BinaryOperation >::value > >::getElement
T & getElement(size_t E)
Definition: reduction.hpp:426
cl::sycl::ext::oneapi::detail::reduction_impl::reduction_impl
reduction_impl(buffer< _T, 1, AllocatorT > Buffer, handler &CGH, const T &Identity, BinaryOperation BOp, bool InitializeToIdentity)
SYCL-2020.
Definition: reduction.hpp:894
cl::sycl::detail::enable_if_t
typename std::enable_if< B, T >::type enable_if_t
Definition: stl_type_traits.hpp:24
cl::sycl::ext::oneapi::has_known_identity
Definition: reduction.hpp:2649
known_identity.hpp
cl::sycl::ext::oneapi::detail::get_reduction_aux_kernel_name_t
Definition: reduction.hpp:1027
cl::sycl::ext::oneapi::detail::reduction_impl_algo< T, BinaryOperation, Dims, Extent, default_reduction_algorithm< IsUSM, IsPlaceholder, AccessorDims > >::getReadWriteAccessorToInitializedMem
std::enable_if_t< HasFastAtomics, rw_accessor_type > getReadWriteAccessorToInitializedMem(handler &CGH)
If reduction is initialized with read-write accessor, which does not require initialization with iden...
Definition: reduction.hpp:654
cl::sycl::ext::oneapi::detail::EmptyReductionPredicate
Definition: reduction.hpp:1957
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==1 &&View==false &&!IsKnownIdentityOp< T, BinaryOperation >::value > >::getIdentity
T getIdentity() const
Definition: reduction.hpp:425
cl::sycl::ext::oneapi::detail::reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==0 &&Extent==1 &&View==false &&!IsKnownIdentityOp< T, BinaryOperation >::value > >::reducer
reducer(const T &Identity, BinaryOperation BOp)
Definition: reduction.hpp:337
cl::sycl::ext::oneapi::detail::IsScalarReduction
Definition: reduction.hpp:1993
cl::sycl::ext::oneapi::detail::combiner::atomic_combine
enable_if_t< std::is_same< typename remove_AS< _T >::type, T >::value &&(IsReduOptForFastAtomicFetch< T, _BinaryOperation >::value||IsReduOptForAtomic64Add< T, _BinaryOperation >::value) &&sycl::detail::IsPlus< T, _BinaryOperation >::value &&(Space==access::address_space::global_space||Space==access::address_space::local_space)> atomic_combine(_T *ReduVarPtr) const
Atomic ADD operation: *ReduVarPtr += MValue;.
Definition: reduction.hpp:227
cl::sycl::ext::oneapi::detail::reduction_impl_common::getBinaryOperation
BinaryOperation getBinaryOperation() const
Returns the binary operation associated with the reduction.
Definition: reduction.hpp:496
cl::sycl::nd_range::get_local_range
range< dimensions > get_local_range() const
Definition: nd_range.hpp:42
cl::sycl::ext::oneapi::detail::reduCGFuncImplScalar
void reduCGFuncImplScalar(nd_item< Dims > NDIt, ReduTupleT< LocalAccT... > LocalAccsTuple, ReduTupleT< OutAccT... > OutAccsTuple, std::tuple< ReducerT... > &ReducersTuple, ReduTupleT< Ts... > IdentitiesTuple, ReduTupleT< BOPsT... > BOPsTuple, std::array< bool, sizeof...(Reductions)> InitToIdentityProps, std::index_sequence< Is... > ReduIndices)
All scalar reductions are processed together; there is one loop of log2(N) steps, and each reduction ...
Definition: reduction.hpp:2012
cl::sycl::ext::oneapi::detail::reduction_impl_algo< T, BinaryOperation, Dims, Extent, default_reduction_algorithm< IsUSM, IsPlaceholder, AccessorDims > >::reduction_impl_algo
reduction_impl_algo(const T &Identity, BinaryOperation BinaryOp, bool Init, std::shared_ptr< dw_accessor_type > AccPointer)
Definition: reduction.hpp:564
cl::sycl::ext::oneapi::detail::reduComputeWGSize
size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize, size_t &NWorkGroups)
Definition: reduction.cpp:20
cl::sycl::ext::oneapi::detail::reduction_impl_algo< T, BinaryOperation, Dims, Extent, default_reduction_algorithm< IsUSM, IsPlaceholder, AccessorDims > >::getWriteAccForPartialReds
rw_accessor_type getWriteAccForPartialReds(size_t Size, handler &CGH)
Returns an accessor accessing the memory that will hold the reduction partial sums.
Definition: reduction.hpp:636
cl::sycl::ext::oneapi::detail::reduAuxCGFuncImplScalar
void reduAuxCGFuncImplScalar(nd_item< Dims > NDIt, size_t LID, size_t GID, size_t NWorkItems, size_t WGSize, ReduTupleT< LocalAccT... > LocalAccsTuple, ReduTupleT< InAccT... > InAccsTuple, ReduTupleT< OutAccT... > OutAccsTuple, ReduTupleT< Ts... > IdentitiesTuple, ReduTupleT< BOPsT... > BOPsTuple, std::array< bool, sizeof...(Reductions)> InitToIdentityProps, std::index_sequence< Is... > ReduIndices)
All scalar reductions are processed together; there is one loop of log2(N) steps, and each reduction ...
Definition: reduction.hpp:2300
cl::sycl::ext::oneapi::detail::reduction_impl_algo< T, BinaryOperation, Dims, Extent, default_reduction_algorithm< IsUSM, IsPlaceholder, AccessorDims > >::associateWithHandler
void associateWithHandler(handler &CGH)
Associates the reduction accessor to user's memory with CGH handler to keep the accessor alive until ...
Definition: reduction.hpp:574
cl::sycl::ext::oneapi::detail::default_reduction_algorithm
Types representing specific reduction algorithms Enables reduction_impl_algo to take additional algor...
Definition: reduction.hpp:511
__SYCL_INLINE_NAMESPACE
#define __SYCL_INLINE_NAMESPACE(X)
Definition: defines_elementary.hpp:12
cl::sycl::ext::oneapi::detail::reduction_impl::reduction_impl
reduction_impl(buffer< _T, 1, AllocatorT > Buffer, handler &CGH, bool InitializeToIdentity)
SYCL-2020.
Definition: reduction.hpp:773