// DPC++ Runtime
// Runtime libraries for oneAPI DPC++
// reduction.hpp
// (Documentation-listing header converted to a comment so the file parses.)
1 //==---------------- reduction.hpp - SYCL reduction ------------*- C++ -*---==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 // ===--------------------------------------------------------------------=== //
8 
9 #pragma once
10 
11 #include <sycl/accessor.hpp>
12 #include <sycl/atomic.hpp>
13 #include <sycl/atomic_ref.hpp>
14 #include <sycl/detail/tuple.hpp>
16 #include <sycl/group_algorithm.hpp>
17 #include <sycl/handler.hpp>
18 #include <sycl/kernel.hpp>
19 #include <sycl/known_identity.hpp>
22 #include <sycl/usm.hpp>
23 
24 #include <optional>
25 #include <tuple>
26 
27 namespace sycl {
29 namespace detail {
30 
34 
36 template <typename T> struct IsReduction {
37  static constexpr bool value =
38  std::is_base_of_v<reduction_impl_base, std::remove_reference_t<T>>;
39 };
40 
43 template <typename FirstT, typename... RestT> struct AreAllButLastReductions {
44  static constexpr bool value =
46 };
47 
/// Base case of the recursion: with a single type left, that type is the last
/// argument and therefore must NOT be a reduction.
template <typename T> struct AreAllButLastReductions<T> {
  static constexpr bool value = !IsReduction<T>::value;
};
53 } // namespace detail
54 
/// Forward declaration of the generic reducer class; the partial
/// specializations appear further below. Subst is an enable_if slot used to
/// select the specialization.
template <typename T, class BinaryOperation, int Dims, size_t Extent,
          typename IdentityContainerT, bool View = false, typename Subst = void>
class reducer;
71 
72 namespace detail {
73 // This type trait is used to detect if the atomic operation BinaryOperation
74 // used with operands of the type T is available for using in reduction.
75 // The order in which the atomic operations are performed may be arbitrary and
76 // thus may cause different results from run to run even on the same elements
77 // and on same device. The macro SYCL_REDUCTION_DETERMINISTIC prohibits using
78 // atomic operations for reduction and helps to produce stable results.
79 // SYCL_REDUCTION_DETERMINISTIC is a short term solution, which perhaps become
80 // deprecated eventually and is replaced by a sycl property passed to reduction.
81 template <typename T, class BinaryOperation>
83 #ifdef SYCL_REDUCTION_DETERMINISTIC
84  std::bool_constant<false>;
85 #else
86  std::bool_constant<((is_sgenfloat<T>::value && sizeof(T) == 4) ||
87  is_sgeninteger<T>::value) &&
88  IsValidAtomicType<T>::value &&
89  (IsPlus<T, BinaryOperation>::value ||
90  IsMinimum<T, BinaryOperation>::value ||
91  IsMaximum<T, BinaryOperation>::value ||
92  IsBitOR<T, BinaryOperation>::value ||
93  IsBitXOR<T, BinaryOperation>::value ||
95 #endif
96 
97 // This type trait is used to detect if the atomic operation BinaryOperation
98 // used with operands of the type T is available for using in reduction, in
99 // addition to the cases covered by "IsReduOptForFastAtomicFetch", if the device
100 // has the atomic64 aspect. This type trait should only be used if the device
101 // has the atomic64 aspect. Note that this type trait is currently a subset of
102 // IsReduOptForFastReduce. The macro SYCL_REDUCTION_DETERMINISTIC prohibits
103 // using the reduce_over_group() algorithm to produce stable results across same
104 // type devices.
105 template <typename T, class BinaryOperation>
107 #ifdef SYCL_REDUCTION_DETERMINISTIC
108  std::bool_constant<false>;
109 #else
110  std::bool_constant<(IsPlus<T, BinaryOperation>::value ||
113  is_sgenfloat<T>::value && sizeof(T) == 8>;
114 #endif
115 
116 // This type trait is used to detect if the group algorithm reduce() used with
117 // operands of the type T and the operation BinaryOperation is available
118 // for using in reduction.
119 // The macro SYCL_REDUCTION_DETERMINISTIC prohibits using the reduce() algorithm
120 // to produce stable results across same type devices.
121 template <typename T, class BinaryOperation>
123 #ifdef SYCL_REDUCTION_DETERMINISTIC
124  std::bool_constant<false>;
125 #else
126  std::bool_constant<((is_sgeninteger<T>::value &&
127  (sizeof(T) == 4 || sizeof(T) == 8)) ||
132 #endif
133 
134 // std::tuple seems to be a) too heavy and b) not copyable to device now
135 // Thus sycl::detail::tuple is used instead.
136 // Switching from sycl::device::tuple to std::tuple can be done by re-defining
137 // the ReduTupleT type and makeReduTupleT() function below.
138 template <typename... Ts> using ReduTupleT = sycl::detail::tuple<Ts...>;
139 template <typename... Ts> ReduTupleT<Ts...> makeReduTupleT(Ts... Elements) {
140  return sycl::detail::make_tuple(Elements...);
141 }
142 
// Returns the maximum work-group size usable by a reduction given its
// per-work-item local-memory requirement.
__SYCL_EXPORT size_t reduGetMaxWGSize(std::shared_ptr<queue_impl> Queue,
                                      size_t LocalMemBytesPerWorkItem);
// Computes the work-group size for NWorkItems and reports, via NWorkGroups,
// how many groups will be launched.
__SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
                                       size_t &NWorkGroups);
// Returns the preferred work-group size for reductions on the given queue.
__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
                                            size_t LocalMemBytesPerWorkItem);
149 
// Forward declaration of ReducerElement (defined later in this file).
// NOTE(review): the declaration line itself was lost in the original listing;
// restored to match the definition below.
template <typename T, class BinaryOperation, bool IsOptional>
class ReducerElement;
155 template <typename Reducer> struct ReducerTraits;
156 
157 template <typename T, class BinaryOperation, int Dims, std::size_t Extent,
158  typename IdentityContainerT, bool View, typename Subst>
159 struct ReducerTraits<reducer<T, BinaryOperation, Dims, Extent,
160  IdentityContainerT, View, Subst>> {
161  using type = T;
162  using op = BinaryOperation;
163  static constexpr int dims = Dims;
164  static constexpr size_t extent = Extent;
165  static constexpr bool has_identity = IdentityContainerT::has_identity;
167 };
168 
170 template <typename ReducerT> class ReducerAccess {
171 public:
172  ReducerAccess(ReducerT &ReducerRef) : MReducerRef(ReducerRef) {}
173 
174  template <typename ReducerRelayT = ReducerT> auto &getElement(size_t E) {
175  return MReducerRef.getElement(E);
176  }
177 
178  template <typename ReducerRelayT = ReducerT> constexpr auto getIdentity() {
180  "Identity unavailable.");
181  return MReducerRef.getIdentity();
182  }
183 
184  // MSVC does not like static overloads of non-static functions, even if they
185  // are made mutually exclusive through SFINAE. Instead we use a new static
186  // function to be used when a static function is needed.
187  template <typename ReducerRelayT = ReducerT>
188  static constexpr auto getIdentityStatic() {
189  static_assert(
191  typename ReducerTraits<ReducerRelayT>::op>::value,
192  "Static identity unavailable.");
193  return ReducerT::getIdentity();
194  }
195 
196 private:
197  ReducerT &MReducerRef;
198 };
199 
200 // Helper function to simplify the use of ReducerAccess. This avoids the need
201 // for potentially unsupported deduction guides.
202 template <typename ReducerT> auto getReducerAccess(ReducerT &Reducer) {
203  return ReducerAccess<ReducerT>{Reducer};
204 }
205 
217 template <class Reducer> class combiner {
218  using Ty = typename ReducerTraits<Reducer>::type;
219  using BinaryOp = typename ReducerTraits<Reducer>::op;
220  static constexpr int Dims = ReducerTraits<Reducer>::dims;
221  static constexpr size_t Extent = ReducerTraits<Reducer>::extent;
222 
223 public:
224  template <typename _T = Ty, int _Dims = Dims>
225  std::enable_if_t<(_Dims == 0) && IsPlus<_T, BinaryOp>::value &&
227  Reducer &>
228  operator++() {
229  return static_cast<Reducer *>(this)->combine(static_cast<_T>(1));
230  }
231 
232  template <typename _T = Ty, int _Dims = Dims>
233  std::enable_if_t<(_Dims == 0) && IsPlus<_T, BinaryOp>::value &&
235  Reducer &>
236  operator++(int) {
237  return static_cast<Reducer *>(this)->combine(static_cast<_T>(1));
238  }
239 
240  template <typename _T = Ty, int _Dims = Dims>
241  std::enable_if_t<(_Dims == 0) && IsPlus<_T, BinaryOp>::value, Reducer &>
242  operator+=(const _T &Partial) {
243  return static_cast<Reducer *>(this)->combine(Partial);
244  }
245 
246  template <typename _T = Ty, int _Dims = Dims>
247  std::enable_if_t<(_Dims == 0) && IsMultiplies<_T, BinaryOp>::value, Reducer &>
248  operator*=(const _T &Partial) {
249  return static_cast<Reducer *>(this)->combine(Partial);
250  }
251 
252  template <typename _T = Ty, int _Dims = Dims>
253  std::enable_if_t<(_Dims == 0) && IsBitOR<_T, BinaryOp>::value, Reducer &>
254  operator|=(const _T &Partial) {
255  return static_cast<Reducer *>(this)->combine(Partial);
256  }
257 
258  template <typename _T = Ty, int _Dims = Dims>
259  std::enable_if_t<(_Dims == 0) && IsBitXOR<_T, BinaryOp>::value, Reducer &>
260  operator^=(const _T &Partial) {
261  return static_cast<Reducer *>(this)->combine(Partial);
262  }
263 
264  template <typename _T = Ty, int _Dims = Dims>
265  std::enable_if_t<(_Dims == 0) && IsBitAND<_T, BinaryOp>::value, Reducer &>
266  operator&=(const _T &Partial) {
267  return static_cast<Reducer *>(this)->combine(Partial);
268  }
269 
270 private:
271  template <access::address_space Space>
272  static constexpr memory_scope getMemoryScope() {
273  return Space == access::address_space::local_space
274  ? memory_scope::work_group
275  : memory_scope::device;
276  }
277 
278  template <access::address_space Space, class T, class AtomicFunctor>
279  void atomic_combine_impl(T *ReduVarPtr, AtomicFunctor Functor) const {
280  auto reducer = static_cast<const Reducer *>(this);
281  for (size_t E = 0; E < Extent; ++E) {
282  const auto &ReducerElem = getReducerAccess(*reducer).getElement(E);
283 
284  // If the reducer element doesn't have a value we can skip the combine.
285  if (!ReducerElem)
286  continue;
287 
288  auto AtomicRef = sycl::atomic_ref<T, memory_order::relaxed,
289  getMemoryScope<Space>(), Space>(
290  address_space_cast<Space, access::decorated::no>(ReduVarPtr)[E]);
291  Functor(std::move(AtomicRef), *ReducerElem);
292  }
293  }
294 
295  template <class _T, access::address_space Space, class BinaryOp>
296  static constexpr bool BasicCheck =
297  std::is_same_v<remove_decoration_t<_T>, Ty> &&
298  (Space == access::address_space::global_space ||
299  Space == access::address_space::local_space);
300 
301 public:
303  template <access::address_space Space = access::address_space::global_space,
304  typename _T = Ty, class _BinaryOperation = BinaryOp>
305  std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
306  (IsReduOptForFastAtomicFetch<_T, _BinaryOperation>::value ||
307  IsReduOptForAtomic64Op<_T, _BinaryOperation>::value) &&
308  IsPlus<_T, _BinaryOperation>::value>
309  atomic_combine(_T *ReduVarPtr) const {
310  atomic_combine_impl<Space>(
311  ReduVarPtr, [](auto &&Ref, auto Val) { return Ref.fetch_add(Val); });
312  }
313 
315  template <access::address_space Space = access::address_space::global_space,
316  typename _T = Ty, class _BinaryOperation = BinaryOp>
317  std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
320  atomic_combine(_T *ReduVarPtr) const {
321  atomic_combine_impl<Space>(
322  ReduVarPtr, [](auto &&Ref, auto Val) { return Ref.fetch_or(Val); });
323  }
324 
326  template <access::address_space Space = access::address_space::global_space,
327  typename _T = Ty, class _BinaryOperation = BinaryOp>
328  std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
331  atomic_combine(_T *ReduVarPtr) const {
332  atomic_combine_impl<Space>(
333  ReduVarPtr, [](auto &&Ref, auto Val) { return Ref.fetch_xor(Val); });
334  }
335 
337  template <access::address_space Space = access::address_space::global_space,
338  typename _T = Ty, class _BinaryOperation = BinaryOp>
339  std::enable_if_t<std::is_same_v<remove_decoration_t<_T>, _T> &&
342  (Space == access::address_space::global_space ||
343  Space == access::address_space::local_space)>
344  atomic_combine(_T *ReduVarPtr) const {
345  atomic_combine_impl<Space>(
346  ReduVarPtr, [](auto &&Ref, auto Val) { return Ref.fetch_and(Val); });
347  }
348 
350  template <access::address_space Space = access::address_space::global_space,
351  typename _T = Ty, class _BinaryOperation = BinaryOp>
352  std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
356  atomic_combine(_T *ReduVarPtr) const {
357  atomic_combine_impl<Space>(
358  ReduVarPtr, [](auto &&Ref, auto Val) { return Ref.fetch_min(Val); });
359  }
360 
362  template <access::address_space Space = access::address_space::global_space,
363  typename _T = Ty, class _BinaryOperation = BinaryOp>
364  std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
368  atomic_combine(_T *ReduVarPtr) const {
369  atomic_combine_impl<Space>(
370  ReduVarPtr, [](auto &&Ref, auto Val) { return Ref.fetch_max(Val); });
371  }
372 };
373 
376 template <typename T, class BinaryOperation, bool ExplicitIdentity,
377  typename CondT = void>
379  static_assert(!std::is_same_v<T, T>,
380  "Partial specializations don't cover all possible options!");
381 };
382 
383 // Specialization for reductions with explicit identity.
384 template <typename T, class BinaryOperation, bool ExplicitIdentity>
386  T, BinaryOperation, ExplicitIdentity,
387  enable_if_t<IsKnownIdentityOp<T, BinaryOperation>::value>> {
388 public:
389  static constexpr bool has_identity = true;
390 
393 
395  static constexpr T getIdentity() {
397  }
398 };
399 
400 // Specialization for reductions with explicit identity.
401 template <typename T, class BinaryOperation>
403  T, BinaryOperation, true,
404  enable_if_t<!IsKnownIdentityOp<T, BinaryOperation>::value>> {
405 public:
406  static constexpr bool has_identity = true;
407 
408  ReductionIdentityContainer(const T &Identity) : MIdentity(Identity) {}
409 
411  T getIdentity() const { return MIdentity; }
412 
413 private:
416  const T MIdentity;
417 };
418 
419 // Specialization for identityless reductions.
420 template <typename T, class BinaryOperation>
422  T, BinaryOperation, false,
423  std::enable_if_t<!IsKnownIdentityOp<T, BinaryOperation>::value>> {
424 public:
425  static constexpr bool has_identity = false;
426 };
427 
428 // Class representing a single reducer element. In cases where the identity is
429 // unknown this is allowed to not contain a value.
430 template <typename T, class BinaryOperation, bool IsOptional>
431 class ReducerElement {
432  using value_type = std::conditional_t<IsOptional, std::optional<T>, T>;
433 
434  template <bool ExplicitIdentity>
435  constexpr value_type GetInitValue(
437  &IdentityContainer) {
438  constexpr bool ContainerHasIdentity =
439  ReductionIdentityContainer<T, BinaryOperation,
440  ExplicitIdentity>::has_identity;
441  static_assert(IsOptional || ContainerHasIdentity);
442  if constexpr (!ContainerHasIdentity)
443  return std::nullopt;
444  else
445  return IdentityContainer.getIdentity();
446  }
447 
448 public:
449  ReducerElement() = default;
450  ReducerElement(T Value) : MValue{Value} {}
451 
452  template <bool ExplicitIdentity>
455  &IdentityContainer)
456  : MValue(GetInitValue(IdentityContainer)) {}
457 
458  ReducerElement &combine(BinaryOperation BinOp, const T &OtherValue) {
459  if constexpr (IsOptional)
460  MValue = MValue ? BinOp(*MValue, OtherValue) : OtherValue;
461  else
462  MValue = BinOp(MValue, OtherValue);
463  return *this;
464  }
465 
466  ReducerElement &combine(BinaryOperation BinOp, const ReducerElement &Other) {
467  if constexpr (IsOptional) {
468  if (Other.MValue)
469  return combine(BinOp, *Other.MValue);
470  // If the other value doesn't have a value it is a no-op.
471  return *this;
472  } else {
473  return combine(BinOp, Other.MValue);
474  }
475  }
476 
477  constexpr T &operator*() noexcept {
478  if constexpr (IsOptional)
479  return *MValue;
480  else
481  return MValue;
482  }
483  constexpr const T &operator*() const noexcept {
484  if constexpr (IsOptional)
485  return *MValue;
486  else
487  return MValue;
488  }
489 
490  constexpr explicit operator bool() const {
491  if constexpr (IsOptional)
492  return MValue.has_value();
493  return true;
494  }
495 
496 private:
497  value_type MValue;
498 };
499 
// Types and constants shared by every reducer specialization; these implement
// the member aliases required by the SYCL 2020 reducer interface.
template <typename T, class BinaryOperation, int Dims> class reducer_common {
public:
  using value_type = T;
  using binary_operation = BinaryOperation;
  static constexpr int dimensions = Dims;
};
506 
// Token class to help with the in-place construction of reducers.
// Bundles the identity container and binary operation so a reducer can be
// constructed from a single argument.
template <class BinaryOperation, typename IdentityContainerT>
struct ReducerToken {
  const IdentityContainerT &IdentityContainer;
  const BinaryOperation BOp;
};
513 
514 } // namespace detail
515 
522 template <typename T, class BinaryOperation, int Dims, size_t Extent,
523  typename IdentityContainerT, bool View>
524 class reducer<
525  T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
526  std::enable_if_t<Dims == 0 && Extent == 1 && View == false &&
527  !detail::IsKnownIdentityOp<T, BinaryOperation>::value>>
528  : public detail::combiner<
529  reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
530  std::enable_if_t<
531  Dims == 0 && Extent == 1 && View == false &&
532  !detail::IsKnownIdentityOp<T, BinaryOperation>::value>>>,
533  public detail::reducer_common<T, BinaryOperation, Dims> {
534  static constexpr bool has_identity = IdentityContainerT::has_identity;
535  using element_type =
537 
538 public:
539  reducer(const IdentityContainerT &IdentityContainer, BinaryOperation BOp)
540  : MValue(IdentityContainer), MIdentity(IdentityContainer),
541  MBinaryOp(BOp) {}
544  : reducer(Token.IdentityContainer, Token.BOp) {}
545 
546  reducer(const reducer &) = delete;
547  reducer(reducer &&) = delete;
548  reducer &operator=(const reducer &) = delete;
549  reducer &operator=(reducer &&) = delete;
550 
551  reducer &combine(const T &Partial) {
552  MValue.combine(MBinaryOp, Partial);
553  return *this;
554  }
555 
556  template <bool HasIdentityRelay = has_identity>
557  std::enable_if_t<HasIdentityRelay && (HasIdentityRelay == has_identity), T>
558  identity() const {
559  return MIdentity.getIdentity();
560  }
561 
562 private:
563  template <typename ReducerT> friend class detail::ReducerAccess;
564 
565  element_type &getElement(size_t) { return MValue; }
566  const element_type &getElement(size_t) const { return MValue; }
567 
568  detail::ReducerElement<T, BinaryOperation, !has_identity> MValue;
569  const IdentityContainerT MIdentity;
570  BinaryOperation MBinaryOp;
571 };
572 
578 template <typename T, class BinaryOperation, int Dims, size_t Extent,
579  typename IdentityContainerT, bool View>
580 class reducer<
581  T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
582  std::enable_if_t<Dims == 0 && Extent == 1 && View == false &&
583  detail::IsKnownIdentityOp<T, BinaryOperation>::value>>
584  : public detail::combiner<
585  reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
586  std::enable_if_t<
587  Dims == 0 && Extent == 1 && View == false &&
588  detail::IsKnownIdentityOp<T, BinaryOperation>::value>>>,
589  public detail::reducer_common<T, BinaryOperation, Dims> {
590  static constexpr bool has_identity = IdentityContainerT::has_identity;
591  using element_type =
593 
594 public:
595  reducer() : MValue(getIdentity()) {}
596  reducer(const IdentityContainerT & /* Identity */, BinaryOperation)
597  : MValue(getIdentity()) {}
600  : reducer(Token.IdentityContainer, Token.BOp) {}
601 
602  reducer(const reducer &) = delete;
603  reducer(reducer &&) = delete;
604  reducer &operator=(const reducer &) = delete;
605  reducer &operator=(reducer &&) = delete;
606 
607  reducer &combine(const T &Partial) {
608  BinaryOperation BOp;
609  MValue.combine(BOp, Partial);
610  return *this;
611  }
612 
613  T identity() const { return getIdentity(); }
614 
615 private:
616  template <typename ReducerT> friend class detail::ReducerAccess;
617 
618  static constexpr T getIdentity() {
620  }
621 
622  element_type &getElement(size_t) { return MValue; }
623  const element_type &getElement(size_t) const { return MValue; }
624  detail::ReducerElement<T, BinaryOperation, !has_identity> MValue;
625 };
626 
629 template <typename T, class BinaryOperation, int Dims, size_t Extent,
630  typename IdentityContainerT, bool View>
631 class reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
632  std::enable_if_t<Dims == 0 && View == true>>
633  : public detail::combiner<
634  reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
635  std::enable_if_t<Dims == 0 && View == true>>>,
636  public detail::reducer_common<T, BinaryOperation, Dims> {
637  static constexpr bool has_identity = IdentityContainerT::has_identity;
638  using element_type =
640 
641 public:
642  reducer(element_type &Ref, BinaryOperation BOp)
643  : MElement(Ref), MBinaryOp(BOp) {}
646  : reducer(Token.IdentityContainer, Token.BOp) {}
647 
648  reducer(const reducer &) = delete;
649  reducer(reducer &&) = delete;
650  reducer &operator=(const reducer &) = delete;
651  reducer &operator=(reducer &&) = delete;
652 
653  reducer &combine(const T &Partial) {
654  MElement.combine(MBinaryOp, Partial);
655  return *this;
656  }
657 
658 private:
659  template <typename ReducerT> friend class detail::ReducerAccess;
660 
661  element_type &getElement(size_t) { return MElement; }
662  const element_type &getElement(size_t) const { return MElement; }
663 
664  element_type &MElement;
665  BinaryOperation MBinaryOp;
666 };
667 
670 template <typename T, class BinaryOperation, int Dims, size_t Extent,
671  typename IdentityContainerT, bool View>
672 class reducer<
673  T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
674  std::enable_if_t<Dims == 1 && View == false &&
675  !detail::IsKnownIdentityOp<T, BinaryOperation>::value>>
676  : public detail::combiner<
677  reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
678  std::enable_if_t<
679  Dims == 1 && View == false &&
680  !detail::IsKnownIdentityOp<T, BinaryOperation>::value>>>,
681  public detail::reducer_common<T, BinaryOperation, Dims> {
682  static constexpr bool has_identity = IdentityContainerT::has_identity;
683  using element_type =
685 
686 public:
687  reducer(const IdentityContainerT &IdentityContainer, BinaryOperation BOp)
688  : MValue(IdentityContainer), MIdentity(IdentityContainer),
689  MBinaryOp(BOp) {}
692  : reducer(Token.IdentityContainer, Token.BOp) {}
693 
694  reducer(const reducer &) = delete;
695  reducer(reducer &&) = delete;
696  reducer &operator=(const reducer &) = delete;
697  reducer &operator=(reducer &&) = delete;
698 
699  reducer<T, BinaryOperation, Dims - 1, Extent, IdentityContainerT, true>
700  operator[](size_t Index) {
701  return {MValue[Index], MBinaryOp};
702  }
703 
704  template <bool HasIdentityRelay = has_identity>
705  std::enable_if_t<HasIdentityRelay && (HasIdentityRelay == has_identity), T>
706  identity() const {
707  return MIdentity.getIdentity();
708  }
709 
710 private:
711  template <typename ReducerT> friend class detail::ReducerAccess;
712 
713  element_type &getElement(size_t E) { return MValue[E]; }
714  const element_type &getElement(size_t E) const { return MValue[E]; }
715 
716  marray<element_type, Extent> MValue;
717  const IdentityContainerT MIdentity;
718  BinaryOperation MBinaryOp;
719 };
720 
723 template <typename T, class BinaryOperation, int Dims, size_t Extent,
724  typename IdentityContainerT, bool View>
725 class reducer<
726  T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
727  std::enable_if_t<Dims == 1 && View == false &&
728  detail::IsKnownIdentityOp<T, BinaryOperation>::value>>
729  : public detail::combiner<
730  reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
731  std::enable_if_t<
732  Dims == 1 && View == false &&
733  detail::IsKnownIdentityOp<T, BinaryOperation>::value>>>,
734  public detail::reducer_common<T, BinaryOperation, Dims> {
735  static constexpr bool has_identity = IdentityContainerT::has_identity;
736  using element_type =
738 
739 public:
740  reducer() : MValue(getIdentity()) {}
741  reducer(const IdentityContainerT & /* Identity */, BinaryOperation)
742  : MValue(getIdentity()) {}
745  : reducer(Token.IdentityContainer, Token.BOp) {}
746 
747  reducer(const reducer &) = delete;
748  reducer(reducer &&) = delete;
749  reducer &operator=(const reducer &) = delete;
750  reducer &operator=(reducer &&) = delete;
751 
752  // SYCL 2020 revision 4 says this should be const, but this is a bug
753  // see https://github.com/KhronosGroup/SYCL-Docs/pull/252
754  reducer<T, BinaryOperation, Dims - 1, Extent, IdentityContainerT, true>
755  operator[](size_t Index) {
756  return {MValue[Index], BinaryOperation()};
757  }
758 
759  T identity() const { return getIdentity(); }
760 
761 private:
762  template <typename ReducerT> friend class detail::ReducerAccess;
763 
764  static constexpr T getIdentity() {
766  }
767 
768  element_type &getElement(size_t E) { return MValue[E]; }
769  const element_type &getElement(size_t E) const { return MValue[E]; }
770 
771  marray<element_type, Extent> MValue;
772 };
773 
774 namespace detail {
775 
// Used for determining dimensions for temporary storage (mainly).
// Primary template: any non-accessor output variable is treated as
// one-dimensional.
template <class T> struct data_dim_t {
  static constexpr int value = 1;
};

// Device accessors report their own dimensionality.
template <class T, int AccessorDims, access::mode Mode,
          access::placeholder IsPH, typename PropList>
struct data_dim_t<
    accessor<T, AccessorDims, Mode, access::target::device, IsPH, PropList>> {
  static constexpr int value = AccessorDims;
};
787 
// Maps a reduction output variable type (USM pointer or buffer) to the
// underlying element type being reduced.
template <class T> struct get_red_t;
template <class T> struct get_red_t<T *> {
  using type = T;
};

template <class T, int Dims, typename AllocatorT>
struct get_red_t<buffer<T, Dims, AllocatorT>> {
  using type = T;
};
797 
namespace reduction {
// Kernel name wrapper for initializing reduction-related memory through
// reduction_impl_algo::withInitializedMem.
template <typename KernelName> struct InitMemKrn;
} // namespace reduction
803 
806 template <class KernelName>
807 using __sycl_init_mem_for =
808  std::conditional_t<std::is_same_v<KernelName, auto_name>, auto_name,
810 
811 template <typename T, class BinaryOperation, int Dims, size_t Extent,
812  bool ExplicitIdentity, typename RedOutVar>
814  using self = reduction_impl_algo<T, BinaryOperation, Dims, Extent,
815  ExplicitIdentity, RedOutVar>;
816 
817  // TODO: Do we also need chooseBinOp?
818  static constexpr T chooseIdentity(const T &Identity) {
819  // For now the implementation ignores the identity value given by user
820  // when the implementation knows the identity.
821  // The SPEC could prohibit passing identity parameter to operations with
822  // known identity, but that could have some bad consequences too.
823  // For example, at some moment the implementation may NOT know the identity
824  // for COMPLEX-PLUS reduction. User may create a program that would pass
825  // COMPLEX value (0,0) as identity for PLUS reduction. At some later moment
826  // when the implementation starts handling COMPLEX-PLUS as known operation
827  // the existing user's program remains compilable and working correctly.
828  // I.e. with this constructor here, adding more reduction operations to the
829  // list of known operations does not break the existing programs.
830  if constexpr (is_known_identity) {
831  (void)Identity;
833  } else {
834  return Identity;
835  }
836  }
837 
838 public:
839  static constexpr bool is_known_identity =
841  static constexpr bool has_identity = is_known_identity || ExplicitIdentity;
842 
845  using reducer_token_type =
847  using reducer_type =
849  using reducer_element_type =
851  using result_type = T;
852  using binary_operation = BinaryOperation;
853 
854  static constexpr size_t dims = Dims;
855  static constexpr bool has_float64_atomics =
857  static constexpr bool has_fast_atomics =
859  static constexpr bool has_fast_reduce =
861 
862  static constexpr bool is_usm = std::is_same_v<RedOutVar, T *>;
863 
864  static constexpr size_t num_elements = Extent;
865 
  // Constructs the reduction from an explicit (user-provided) identity value;
  // chooseIdentity() may still substitute the statically known identity.
  reduction_impl_algo(const T &Identity, BinaryOperation BinaryOp, bool Init,
                      RedOutVar RedOut)
      : MIdentityContainer(chooseIdentity(Identity)), MBinaryOp(BinaryOp),
        InitializeToIdentity(Init), MRedOut(std::move(RedOut)) {}
870 
871  template <typename RelayT = T,
872  typename RelayBinaryOperation = BinaryOperation>
874  BinaryOperation BinaryOp, bool Init, RedOutVar RedOut,
876  int> = 0)
877  : MIdentityContainer(ReducerAccess<reducer_type>::getIdentityStatic()),
878  MBinaryOp(BinaryOp), InitializeToIdentity(Init),
879  MRedOut(std::move(RedOut)) {}
880 
881  template <typename RelayT = T,
882  typename RelayBinaryOperation = BinaryOperation>
884  BinaryOperation BinaryOp, bool Init, RedOutVar RedOut,
886  int> = 0)
887  : MIdentityContainer(), MBinaryOp(BinaryOp), InitializeToIdentity(Init),
888  MRedOut(std::move(RedOut)) {}
889 
891  CGH.addReduction(MOutBufPtr);
892  return accessor{*MOutBufPtr, CGH, sycl::read_only};
893  }
894 
  // Returns write memory for partial results: with a single work-group the
  // user's reduction variable is written directly; otherwise a temporary
  // buffer of partial results is allocated and registered with the handler.
  template <bool IsOneWG>
  auto getWriteMemForPartialReds(size_t Size, handler &CGH) {
    // If there is only one WG we can avoid creation of temporary buffer with
    // partial sums and write directly into user's reduction variable.
    if constexpr (IsOneWG) {
      return getUserRedVarAccess(CGH);
    } else {
      MOutBufPtr =
          std::make_shared<buffer<reducer_element_type, 1>>(range<1>(Size));
      CGH.addReduction(MOutBufPtr);
      return accessor{*MOutBufPtr, CGH};
    }
  }
908 
  // Creates a temporary buffer of Size elements of type _T; its lifetime is
  // tied to the command group via addReduction, so returning a reference to
  // the shared buffer is safe.
  template <class _T = T> auto &getTempBuffer(size_t Size, handler &CGH) {
    auto Buffer = std::make_shared<buffer<_T, 1>>(range<1>(Size));
    CGH.addReduction(Buffer);
    return *Buffer;
  }
914 
  // Returns a write accessor for partial reduction results. When the output
  // is a buffer, an identity is available and only a single result is needed,
  // the user's variable is reinterpreted and used directly; otherwise a
  // temporary buffer is created and registered with the handler.
  auto getWriteAccForPartialReds(size_t Size, handler &CGH) {
    static_assert(!has_identity || sizeof(reducer_element_type) == sizeof(T),
                  "Unexpected size of reducer element type.");

    // We can only use the output memory directly if it is not USM and we have
    // an identity, i.e. it has a thin element wrapper.
    if constexpr (!is_usm && has_identity) {
      if (Size == 1) {
        auto ReinterpretRedOut =
            MRedOut.template reinterpret<reducer_element_type>();
        return accessor{ReinterpretRedOut, CGH};
      }
    }

    // Create a new output buffer and return an accessor to it.
    //
    // Array reductions are performed element-wise to avoid stack growth.
    MOutBufPtr =
        std::make_shared<buffer<reducer_element_type, 1>>(range<1>(Size));
    CGH.addReduction(MOutBufPtr);
    return accessor{*MOutBufPtr, CGH};
  }
943 
949  //
950  // This currently optimizes for a number of kernel instantiations instead of
951  // runtime latency. That might change in future.
952  template <typename KernelName, typename FuncTy,
953  bool HasIdentity = has_identity>
954  std::enable_if_t<HasIdentity> withInitializedMem(handler &CGH, FuncTy Func) {
955  // "Template" lambda to ensure that only one type of Func (USM/Buf) is
956  // instantiated for the code below.
957  auto DoIt = [&](auto &Out) {
958  auto RWReduVal = std::make_shared<std::array<T, num_elements>>();
959  for (int i = 0; i < num_elements; ++i) {
960  (*RWReduVal)[i] = decltype(MIdentityContainer)::getIdentity();
961  }
962  CGH.addReduction(RWReduVal);
963  auto Buf = std::make_shared<buffer<T, 1>>(RWReduVal.get()->data(),
964  range<1>(num_elements));
965  Buf->set_final_data();
966  CGH.addReduction(Buf);
967  accessor Mem{*Buf, CGH};
968  Func(Mem);
969 
970  reduction::withAuxHandler(CGH, [&](handler &CopyHandler) {
971  // MSVC (19.32.31329) has problems compiling the line below when used
972  // as a host compiler in c++17 mode (but not in c++latest)
973  // accessor Mem{*Buf, CopyHandler};
974  // so use the old-style API.
975  auto Mem =
976  Buf->template get_access<access::mode::read_write>(CopyHandler);
977  if constexpr (is_usm) {
978  // Can't capture whole reduction, copy into distinct variables.
979  bool IsUpdateOfUserVar = !initializeToIdentity();
980  auto BOp = getBinaryOperation();
981 
982  // Don't use constexpr as non-default host compilers (unlike clang)
983  // might actually create a capture resulting in binary differences
984  // between host/device in lambda captures.
985  size_t NElements = num_elements;
986 
988  for (int i = 0; i < NElements; ++i) {
989  if (IsUpdateOfUserVar)
990  Out[i] = BOp(Out[i], Mem[i]);
991  else
992  Out[i] = Mem[i];
993  }
994  });
995  } else {
996  accessor OutAcc{Out, CGH};
997  CopyHandler.copy(Mem, OutAcc);
998  }
999  });
1000  };
1001  if constexpr (is_usm) {
1002  // Don't dispatch based on initializeToIdentity() as that would lead
1003  // to two different instantiations of Func.
1004  DoIt(MRedOut);
1005  } else {
1006  if (initializeToIdentity())
1007  DoIt(MRedOut);
1008  else
1009  Func(accessor{MRedOut, CGH});
1010  }
1011  }
1012 
1013  // Overload of withInitializedMem for reducer without identity. Initializing
1014  // to identity is not allowed in this case.
1015  template <typename KernelName, typename FuncTy,
1016  bool HasIdentity = has_identity>
1017  std::enable_if_t<!HasIdentity> withInitializedMem(handler &CGH, FuncTy Func) {
1018  std::ignore = CGH;
1019  assert(!initializeToIdentity() &&
1020  "Initialize to identity not allowed for identity-less reductions.");
1021  Func(accessor{MRedOut, CGH});
1022  }
1023 
1025  return MIdentityContainer;
1026  }
1027 
1028  accessor<int, 1, access::mode::read_write, access::target::device,
1029  access::placeholder::false_t>
1031  auto CounterMem = std::make_shared<int>(0);
1032  CGH.addReduction(CounterMem);
1033  auto CounterBuf = std::make_shared<buffer<int, 1>>(CounterMem.get(), 1);
1034  CounterBuf->set_final_data();
1035  CGH.addReduction(CounterBuf);
1036  return {*CounterBuf, CGH};
1037  }
1038 
1039  // On discrete (vs. integrated) GPUs it's faster to initialize memory with an
1040  // extra kernel than copy it from the host.
1042  queue q = createSyclObjFromImpl<queue>(CGH.MQueue);
1043  device Dev = q.get_device();
1044  auto Deleter = [=](auto *Ptr) { free(Ptr, q); };
1045 
1046  std::shared_ptr<int> Counter(malloc_device<int>(1, q), Deleter);
1047  CGH.addReduction(Counter);
1048 
1049  auto Event = q.memset(Counter.get(), 0, sizeof(int));
1050  CGH.depends_on(Event);
1051 
1052  return Counter.get();
1053  }
1054 
1056  BinaryOperation getBinaryOperation() const { return MBinaryOp; }
1058 
1060  std::ignore = CGH;
1061  if constexpr (is_usm)
1062  return MRedOut;
1063  else
1064  return accessor{MRedOut, CGH};
1065  }
1066 
private:
  // Object holding the identity if available.
  identity_container_type MIdentityContainer;

  // Array reduction is performed element-wise to avoid stack growth, hence
  // 1-dimensional always.
  std::shared_ptr<buffer<reducer_element_type, 1>> MOutBufPtr;

  // Binary operation used to combine partial results.
  BinaryOperation MBinaryOp;
  // When true, the output is overwritten with the identity before combining;
  // otherwise the user's existing value is combined into the result.
  bool InitializeToIdentity;

  // User's reduction output variable (accessor/buffer or USM pointer).
  RedOutVar MRedOut;
1080 };
1081 
1084 template <typename T, class BinaryOperation, int Dims, size_t Extent,
1085  bool ExplicitIdentity, typename RedOutVar>
1087  : private reduction_impl_base,
1088  public reduction_impl_algo<T, BinaryOperation, Dims, Extent,
1089  ExplicitIdentity, RedOutVar> {
1090 private:
1091  using algo = reduction_impl_algo<T, BinaryOperation, Dims, Extent,
1092  ExplicitIdentity, RedOutVar>;
1093  using self = reduction_impl<T, BinaryOperation, Dims, Extent,
1094  ExplicitIdentity, RedOutVar>;
1095 
1096 public:
1097  using algo::is_known_identity;
1098  using algo::is_usm;
1099 
1100  // Only scalar and 1D array reductions are supported by SYCL 2020.
1101  static_assert(Dims <= 1, "Multi-dimensional reductions are not supported.");
1102 
1105  template <bool ExplicitIdentityRelay = ExplicitIdentity,
1106  typename = std::enable_if_t<!ExplicitIdentityRelay>>
1107  reduction_impl(RedOutVar Var, BinaryOperation BOp,
1108  bool InitializeToIdentity = false)
1109  : algo(BOp, InitializeToIdentity, Var) {
1110  if constexpr (!is_usm)
1111  if (Var.size() != 1)
1112  throw sycl::runtime_error(errc::invalid,
1113  "Reduction variable must be a scalar.",
1114  PI_ERROR_INVALID_VALUE);
1115  if constexpr (!is_known_identity)
1117  throw sycl::runtime_error(errc::invalid,
1118  "initialize_to_identity property cannot be "
1119  "used with identityless reductions.",
1120  PI_ERROR_INVALID_VALUE);
1121  }
1122 
1125  template <bool ExplicitIdentityRelay = ExplicitIdentity,
1126  typename = std::enable_if_t<ExplicitIdentityRelay>>
1127  reduction_impl(RedOutVar &Var, const T &Identity, BinaryOperation BOp,
1128  bool InitializeToIdentity)
1129  : algo(Identity, BOp, InitializeToIdentity, Var) {
1130  if constexpr (!is_usm)
1131  if (Var.size() != 1)
1132  throw sycl::runtime_error(errc::invalid,
1133  "Reduction variable must be a scalar.",
1134  PI_ERROR_INVALID_VALUE);
1135  }
1136 };
1137 
// Factory helper: constructs the matching reduction_impl from the reduction
// output variable plus the remaining arguments (identity and/or binary
// operation, initialize-to-identity flag).
// NOTE(review): the first line of the return statement (which deduces the
// reduction value type from RedOutVar) was lost in extraction of this
// excerpt — restore it from the full source before compiling.
template <class BinaryOp, int Dims, size_t Extent, bool ExplicitIdentity,
          typename RedOutVar, typename... RestTy>
auto make_reduction(RedOutVar RedVar, RestTy &&...Rest) {
                        Extent, ExplicitIdentity, RedOutVar>{
      RedVar, std::forward<RestTy>(Rest)...};
}
1145 
1146 namespace reduction {
1147 inline void finalizeHandler(handler &CGH) { CGH.finalize(); }
1148 template <class FunctorTy> void withAuxHandler(handler &CGH, FunctorTy Func) {
1149  event E = CGH.finalize();
1150  handler AuxHandler(CGH.MQueue, CGH.MIsHost);
1151  AuxHandler.depends_on(E);
1152  AuxHandler.saveCodeLoc(CGH.MCodeLoc);
1153  Func(AuxHandler);
1154  CGH.MLastEvent = AuxHandler.finalize();
1155  return;
1156 }
1157 } // namespace reduction
1158 
1159 // This method is used for implementation of parallel_for accepting 1 reduction.
1160 // TODO: remove this method when everything is switched to general algorithm
1161 // implementing arbitrary number of reductions in parallel_for().
1164 template <typename KernelName, class Reduction>
1165 void reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu) {
1166  static_assert(Reduction::is_usm,
1167  "All implementations using this helper are expected to have "
1168  "USM reduction, not a buffer-based one.");
1169  size_t NElements = Reduction::num_elements;
1170  auto InAcc = Redu.getReadAccToPreviousPartialReds(CGH);
1171  auto UserVarPtr = Redu.getUserRedVarAccess(CGH);
1172  bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
1173  auto BOp = Redu.getBinaryOperation();
1174  CGH.single_task<KernelName>([=] {
1175  for (int i = 0; i < NElements; ++i) {
1176  auto Elem = InAcc[i];
1177  if (IsUpdateOfUserVar)
1178  UserVarPtr[i] = BOp(UserVarPtr[i], *Elem);
1179  else
1180  UserVarPtr[i] = *Elem;
1181  }
1182  });
1183 }
1184 
namespace reduction {
// Kernel-name wrapper structs for the main and auxiliary reduction kernels.
// Parameterized by the user's kernel name, the strategy and optional extra
// types so each instantiation gets a unique kernel name.
template <typename KernelName, strategy S, class... Ts> struct MainKrn;
template <typename KernelName, strategy S, class... Ts> struct AuxKrn;
} // namespace reduction
1189 
1190 // Tag structs to help creating unique kernels for multi-reduction cases.
1191 struct KernelOneWGTag {};
1193 
1196 template <template <typename, reduction::strategy, typename...> class MainOrAux,
1197  class KernelName, reduction::strategy Strategy, class... Ts>
1199  std::conditional_t<std::is_same_v<KernelName, auto_name>, auto_name,
1200  MainOrAux<KernelName, Strategy, Ts...>>;
1201 
// Implementations.

// Primary template; specialized below once per reduction::strategy value.
template <reduction::strategy> struct NDRangeReduction;
1205 
1206 template <>
1207 struct NDRangeReduction<reduction::strategy::local_atomic_and_atomic_cross_wg> {
1208  template <typename KernelName, int Dims, typename PropertiesT,
1209  typename KernelType, typename Reduction>
1210  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1211  nd_range<Dims> NDRange, PropertiesT &Properties,
1212  Reduction &Redu, KernelType &KernelFunc) {
1213  static_assert(Reduction::has_identity,
1214  "Identityless reductions are not supported by the "
1215  "local_atomic_and_atomic_cross_wg strategy.");
1216 
1217  std::ignore = Queue;
1218  using Name = __sycl_reduction_kernel<
1219  reduction::MainKrn, KernelName,
1220  reduction::strategy::local_atomic_and_atomic_cross_wg>;
1221  Redu.template withInitializedMem<Name>(CGH, [&](auto Out) {
1222  size_t NElements = Reduction::num_elements;
1224  CGH};
1225 
1226  CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<1> NDId) {
1227  // Call user's functions. Reducer.MValue gets initialized there.
1228  typename Reduction::reducer_type Reducer;
1229  KernelFunc(NDId, Reducer);
1230 
1231  // Work-group cooperates to initialize multiple reduction variables
1232  auto LID = NDId.get_local_id(0);
1233  for (size_t E = LID; E < NElements; E += NDId.get_local_range(0)) {
1234  GroupSum[E] = getReducerAccess(Reducer).getIdentity();
1235  }
1236  workGroupBarrier();
1237 
1238  // Each work-item has its own reducer to combine
1239  Reducer.template atomic_combine<access::address_space::local_space>(
1240  &GroupSum[0]);
1241 
1242  // Single work-item performs finalization for entire work-group
1243  // TODO: Opportunity to parallelize across elements
1244  workGroupBarrier();
1245  if (LID == 0) {
1246  for (size_t E = 0; E < NElements; ++E) {
1247  *getReducerAccess(Reducer).getElement(E) = GroupSum[E];
1248  }
1249  Reducer.template atomic_combine(&Out[0]);
1250  }
1251  });
1252  });
1253  }
1254 };
1255 
1256 template <>
1258  reduction::strategy::group_reduce_and_last_wg_detection> {
1259  template <typename KernelName, int Dims, typename PropertiesT,
1260  typename KernelType, typename Reduction>
1261  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1262  nd_range<Dims> NDRange, PropertiesT &Properties,
1263  Reduction &Redu, KernelType &KernelFunc) {
1264  static_assert(Reduction::has_identity,
1265  "Identityless reductions are not supported by the "
1266  "group_reduce_and_last_wg_detection strategy.");
1267 
1268  std::ignore = Queue;
1269  size_t NElements = Reduction::num_elements;
1270  size_t WGSize = NDRange.get_local_range().size();
1271  size_t NWorkGroups = NDRange.get_group_range().size();
1272 
1273  auto Out = Redu.getUserRedVarAccess(CGH);
1274 
1275  auto &PartialSumsBuf = Redu.getTempBuffer(NWorkGroups * NElements, CGH);
1276  accessor PartialSums(PartialSumsBuf, CGH, sycl::read_write, sycl::no_init);
1277 
1278  bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
1279  auto Rest = [&](auto NWorkGroupsFinished) {
1280  local_accessor<int, 1> DoReducePartialSumsInLastWG{1, CGH};
1281 
1282  using Name = __sycl_reduction_kernel<
1283  reduction::MainKrn, KernelName,
1284  reduction::strategy::group_reduce_and_last_wg_detection,
1285  decltype(NWorkGroupsFinished)>;
1286 
1287  CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<1> NDId) {
1288  // Call user's functions. Reducer.MValue gets initialized there.
1289  typename Reduction::reducer_type Reducer;
1290  KernelFunc(NDId, Reducer);
1291 
1292  typename Reduction::binary_operation BOp;
1293  auto Group = NDId.get_group();
1294 
1295  // If there are multiple values, reduce each separately
1296  // reduce_over_group is only defined for each T, not for span<T, ...>
1297  size_t LID = NDId.get_local_id(0);
1298  for (int E = 0; E < NElements; ++E) {
1299  auto &RedElem = *getReducerAccess(Reducer).getElement(E);
1300  RedElem = reduce_over_group(Group, RedElem, BOp);
1301  if (LID == 0) {
1302  if (NWorkGroups == 1) {
1303  // Can avoid using partial sum and write the final result
1304  // immediately.
1305  if (IsUpdateOfUserVar)
1306  RedElem = BOp(RedElem, Out[E]);
1307  Out[E] = RedElem;
1308  } else {
1309  PartialSums[NDId.get_group_linear_id() * NElements + E] =
1310  *getReducerAccess(Reducer).getElement(E);
1311  }
1312  }
1313  }
1314 
1315  if (NWorkGroups == 1)
1316  // We're done.
1317  return;
1318 
1319  // Signal this work-group has finished after all values are reduced
1320  if (LID == 0) {
1321  auto NFinished =
1322  sycl::atomic_ref<int, memory_order::acq_rel, memory_scope::device,
1323  access::address_space::global_space>(
1324  NWorkGroupsFinished[0]);
1325  DoReducePartialSumsInLastWG[0] = ++NFinished == NWorkGroups;
1326  }
1327 
1328  workGroupBarrier();
1329  if (DoReducePartialSumsInLastWG[0]) {
1330  // Reduce each result separately
1331  // TODO: Opportunity to parallelize across elements.
1332  for (int E = 0; E < NElements; ++E) {
1333  auto LocalSum = getReducerAccess(Reducer).getIdentity();
1334  for (size_t I = LID; I < NWorkGroups; I += WGSize)
1335  LocalSum = BOp(LocalSum, PartialSums[I * NElements + E]);
1336  auto Result = reduce_over_group(Group, LocalSum, BOp);
1337 
1338  if (LID == 0) {
1339  if (IsUpdateOfUserVar)
1340  Result = BOp(Result, Out[E]);
1341  Out[E] = Result;
1342  }
1343  }
1344  }
1345  });
1346  };
1347 
1348  auto device = getDeviceFromHandler(CGH);
1349  // Integrated/discrete GPUs have different faster path. For discrete GPUs
1350  // fast path requires USM device allocations though, so check for that as
1351  // well.
1352  if (device.get_info<info::device::host_unified_memory>() ||
1353  !device.has(aspect::usm_device_allocations))
1354  Rest(Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH));
1355  else
1356  Rest(Redu.getGroupsCounterAccDiscrete(CGH));
1357  }
1358 };
1359 
/// Returns the greatest power of two that is less than or equal to \p N,
/// or 0 when \p N is 0.
static inline size_t GreatestPowerOfTwo(size_t N) {
  size_t Result = 0;
  // Walk candidate powers of two upwards while they still fit in N; the
  // `Candidate != 0` check stops the loop when the shift overflows past the
  // top bit.
  for (size_t Candidate = 1; Candidate != 0 && Candidate <= N; Candidate <<= 1)
    Result = Candidate;
  return Result;
}
1370 
1371 template <typename FuncTy>
1372 void doTreeReductionHelper(size_t WorkSize, size_t LID, FuncTy Func) {
1373  workGroupBarrier();
1374 
1375  // Initial pivot is the greatest power-of-two value smaller or equal to the
1376  // work size.
1377  size_t Pivot = GreatestPowerOfTwo(WorkSize);
1378 
1379  // If the pivot is not the same as the work size, it needs to do an initial
1380  // reduction where we only reduce the N last elements into the first N
1381  // elements, where N is WorkSize - Pivot.
1382  // 0 Pivot WorkSize Power of two
1383  // | | | |
1384  // +-----------------------+------+------------------------+------+
1385  // |
1386  // WorkSize - Pivot
1387  if (Pivot != WorkSize) {
1388  if (Pivot + LID < WorkSize)
1389  Func(LID, Pivot + LID);
1390  workGroupBarrier();
1391  }
1392 
1393  // Now the amount of work must be power-of-two, so do the tree reduction.
1394  for (size_t CurPivot = Pivot >> 1; CurPivot > 0; CurPivot >>= 1) {
1395  if (LID < CurPivot)
1396  Func(LID, CurPivot + LID);
1397  workGroupBarrier();
1398  }
1399 }
1400 
1401 // Enum for specifying work size guarantees in tree-reduction.
1403 
// Tree-reduces WorkSize values into LocalReds[0] using BOp. AccessFunc(I)
// yields the I-th input value. WSGuarantee states how WorkSize relates to the
// work-group size and selects the cheapest load strategy.
template <WorkSizeGuarantees WSGuarantee, int Dim, typename LocalRedsTy,
          typename BinOpTy, typename AccessFuncTy>
void doTreeReduction(size_t WorkSize, nd_item<Dim> NDIt, LocalRedsTy &LocalReds,
                     BinOpTy &BOp, AccessFuncTy AccessFunc) {
  size_t LID = NDIt.get_local_linear_id();
  size_t AdjustedWorkSize;
  if constexpr (WSGuarantee == WorkSizeGuarantees::LessOrEqual ||
                WSGuarantee == WorkSizeGuarantees::Equal) {
    // If there is less-or-equal number of items and amount of work, we just
    // load the work into the local memory and start reducing. If we know it is
    // equal we can let the optimizer remove the check.
    if (WSGuarantee == WorkSizeGuarantees::Equal || LID < WorkSize)
      LocalReds[LID] = AccessFunc(LID);
    AdjustedWorkSize = WorkSize;
  } else {
    // Otherwise we have no guarantee and we need to first reduce the amount of
    // work to fit into the local memory.
    size_t WGSize = NDIt.get_local_range().size();
    AdjustedWorkSize = std::min(WorkSize, WGSize);
    if (LID < AdjustedWorkSize) {
      // Each work-item serially folds every WGSize-th element before the
      // shared tree reduction starts.
      auto LocalSum = AccessFunc(LID);
      for (size_t I = LID + WGSize; I < WorkSize; I += WGSize)
        LocalSum = BOp(LocalSum, AccessFunc(I));

      LocalReds[LID] = LocalSum;
    }
  }
  doTreeReductionHelper(AdjustedWorkSize, LID, [&](size_t I, size_t J) {
    LocalReds[I] = BOp(LocalReds[I], LocalReds[J]);
  });
}
1435 
1436 // Tree-reduction over tuples of accessors. This assumes that WorkSize is
1437 // less than or equal to the work-group size.
1438 // TODO: For variadics/tuples we don't provide such a high-level abstraction as
1439 // for the scalar case above. Is there some C++ magic to level them?
1440 template <typename... LocalAccT, typename... BOPsT, size_t... Is>
1441 void doTreeReductionOnTuple(size_t WorkSize, size_t LID,
1442  ReduTupleT<LocalAccT...> &LocalAccs,
1443  ReduTupleT<BOPsT...> &BOPs,
1444  std::index_sequence<Is...>) {
1445  doTreeReductionHelper(WorkSize, LID, [&](size_t I, size_t J) {
1446  auto ProcessOne = [=](auto &LocalAcc, auto &BOp) {
1447  LocalAcc[I] = BOp(LocalAcc[I], LocalAcc[J]);
1448  };
1449  (ProcessOne(std::get<Is>(LocalAccs), std::get<Is>(BOPs)), ...);
1450  });
1451 }
1452 
// Strategy: per-work-group tree reduction in local memory; the last work-group
// to finish (detected via a global atomic counter) tree-reduces the partial
// sums and writes the final value. Works with identityless reductions too.
template <> struct NDRangeReduction<reduction::strategy::range_basic> {
  template <typename KernelName, int Dims, typename PropertiesT,
            typename KernelType, typename Reduction>
  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
                  nd_range<Dims> NDRange, PropertiesT &Properties,
                  Reduction &Redu, KernelType &KernelFunc) {
    using reducer_type = typename Reduction::reducer_type;
    using element_type = typename ReducerTraits<reducer_type>::element_type;

    // If reduction has an identity and is not USM, the reducer element is just
    // a thin wrapper around the result type so the partial sum will use the
    // output memory iff NWorkGroups == 1. Otherwise, we need to make sure the
    // right output buffer is written in case NWorkGroups == 1.
    constexpr bool UsePartialSumForOutput =
        !Reduction::is_usm && Reduction::has_identity;

    std::ignore = Queue;
    size_t NElements = Reduction::num_elements;
    size_t WGSize = NDRange.get_local_range().size();
    size_t NWorkGroups = NDRange.get_group_range().size();

    bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
    auto PartialSums =
        Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);
    // Immediately-invoked lambda: the type of Out differs per branch.
    auto Out = [&]() {
      if constexpr (UsePartialSumForOutput)
        return (NWorkGroups == 1)
                   ? PartialSums
                   : Redu.getWriteAccForPartialReds(NElements, CGH);
      else
        return Redu.getUserRedVarAccess(CGH);
    }();
    local_accessor<element_type, 1> LocalReds{WGSize, CGH};
    auto NWorkGroupsFinished =
        Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH);
    local_accessor<int, 1> DoReducePartialSumsInLastWG{1, CGH};

    auto IdentityContainer = Redu.getIdentityContainer();
    auto BOp = Redu.getBinaryOperation();

    using Name = __sycl_reduction_kernel<reduction::MainKrn, KernelName,
                                         reduction::strategy::range_basic>;

    CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<1> NDId) {
      // Call user's functions. Reducer.MValue gets initialized there.
      reducer_type Reducer = reducer_type(IdentityContainer, BOp);
      KernelFunc(NDId, Reducer);

      // Combines reducer elements (handles the identityless wrapper type).
      auto ElementCombiner = [&](element_type &LHS, const element_type &RHS) {
        return LHS.combine(BOp, RHS);
      };

      // If there are multiple values, reduce each separately
      // This prevents local memory from scaling with elements
      size_t LID = NDId.get_local_linear_id();
      for (int E = 0; E < NElements; ++E) {

        doTreeReduction<WorkSizeGuarantees::Equal>(
            WGSize, NDId, LocalReds, ElementCombiner,
            [&](size_t) { return getReducerAccess(Reducer).getElement(E); });

        if (LID == 0) {
          auto V = LocalReds[0];

          bool IsOneWG = NWorkGroups == 1;
          if (IsOneWG && IsUpdateOfUserVar)
            V.combine(BOp, Out[E]);

          // if NWorkGroups == 1 && UsePartialSumForOutput, then PartialsSum
          // and Out point to same memory.
          if (UsePartialSumForOutput || !IsOneWG)
            PartialSums[NDId.get_group_linear_id() * NElements + E] = V;
          else if (V)
            Out[E] = *V;
        }
      }

      // Signal this work-group has finished after all values are reduced
      if (LID == 0) {
        auto NFinished =
            sycl::atomic_ref<int, memory_order::acq_rel, memory_scope::device,
                             access::address_space::global_space>(
                NWorkGroupsFinished[0]);
        DoReducePartialSumsInLastWG[0] =
            ++NFinished == NWorkGroups && NWorkGroups > 1;
      }

      workGroupBarrier();
      if (DoReducePartialSumsInLastWG[0]) {
        // Reduce each result separately
        // TODO: Opportunity to parallelize across elements
        for (int E = 0; E < NElements; ++E) {
          doTreeReduction<WorkSizeGuarantees::None>(
              NWorkGroups, NDId, LocalReds, ElementCombiner,
              [&](size_t I) { return PartialSums[I * NElements + E]; });
          if (LID == 0) {
            auto V = LocalReds[0];
            if (IsUpdateOfUserVar)
              V.combine(BOp, Out[E]);
            Out[E] = *V;
          }
        }
      }
    });
  }
};
1559 
// Strategy: reduce within each work-group with reduce_over_group, then
// work-item 0 atomically combines the group result into the output memory.
template <>
struct NDRangeReduction<reduction::strategy::group_reduce_and_atomic_cross_wg> {
  template <typename KernelName, int Dims, typename PropertiesT,
            typename KernelType, typename Reduction>
  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
                  nd_range<Dims> NDRange, PropertiesT &Properties,
                  Reduction &Redu, KernelType &KernelFunc) {
    static_assert(Reduction::has_identity,
                  "Identityless reductions are not supported by the "
                  "group_reduce_and_atomic_cross_wg strategy.");

    std::ignore = Queue;
    using Name = __sycl_reduction_kernel<
        reduction::MainKrn, KernelName,
        reduction::strategy::group_reduce_and_atomic_cross_wg>;
    Redu.template withInitializedMem<Name>(CGH, [&](auto Out) {
      size_t NElements = Reduction::num_elements;

      CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<Dims> NDIt) {
        // Call user's function. Reducer.MValue gets initialized there.
        typename Reduction::reducer_type Reducer;
        KernelFunc(NDIt, Reducer);

        // Reduce each array element separately across the group.
        typename Reduction::binary_operation BOp;
        for (int E = 0; E < NElements; ++E) {
          auto &ReducerElem = getReducerAccess(Reducer).getElement(E);
          *ReducerElem = reduce_over_group(NDIt.get_group(), *ReducerElem, BOp);
        }
        // One item per group publishes the group's result atomically.
        if (NDIt.get_local_linear_id() == 0)
          Reducer.atomic_combine(&Out[0]);
      });
    });
  }
};
1594 
1595 template <>
1597  reduction::strategy::local_mem_tree_and_atomic_cross_wg> {
1598  template <typename KernelName, int Dims, typename PropertiesT,
1599  typename KernelType, typename Reduction>
1600  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1601  nd_range<Dims> NDRange, PropertiesT &Properties,
1602  Reduction &Redu, KernelType &KernelFunc) {
1603  using reducer_type = typename Reduction::reducer_type;
1604  using element_type = typename ReducerTraits<reducer_type>::element_type;
1605 
1606  std::ignore = Queue;
1607  using Name = __sycl_reduction_kernel<
1608  reduction::MainKrn, KernelName,
1609  reduction::strategy::local_mem_tree_and_atomic_cross_wg>;
1610  Redu.template withInitializedMem<Name>(CGH, [&](auto Out) {
1611  size_t NElements = Reduction::num_elements;
1612  size_t WGSize = NDRange.get_local_range().size();
1613 
1614  // Use local memory to reduce elements in work-groups into zero-th
1615  // element.
1616  local_accessor<element_type, 1> LocalReds{WGSize, CGH};
1617 
1618  CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<Dims> NDIt) {
1619  // Call user's functions. Reducer.MValue gets initialized there.
1620  reducer_type Reducer;
1621  KernelFunc(NDIt, Reducer);
1622 
1623  size_t WGSize = NDIt.get_local_range().size();
1624  size_t LID = NDIt.get_local_linear_id();
1625 
1626  typename Reduction::binary_operation BOp;
1627  auto ElementCombiner = [&](element_type &LHS, const element_type &RHS) {
1628  return LHS.combine(BOp, RHS);
1629  };
1630 
1631  // If there are multiple values, reduce each separately
1632  // This prevents local memory from scaling with elements
1633  for (int E = 0; E < NElements; ++E) {
1634 
1635  doTreeReduction<WorkSizeGuarantees::Equal>(
1636  WGSize, NDIt, LocalReds, ElementCombiner,
1637  [&](size_t) { return getReducerAccess(Reducer).getElement(E); });
1638 
1639  if (LID == 0)
1640  getReducerAccess(Reducer).getElement(E) = LocalReds[0];
1641 
1642  // Ensure item 0 is finished with LocalReds before next iteration
1643  if (E != NElements - 1) {
1644  NDIt.barrier();
1645  }
1646  }
1647 
1648  if (LID == 0) {
1649  Reducer.atomic_combine(&Out[0]);
1650  }
1651  });
1652  });
1653  }
1654 };
1655 
1656 template <>
1658  reduction::strategy::group_reduce_and_multiple_kernels> {
1659  template <typename KernelName, int Dims, typename PropertiesT,
1660  typename KernelType, typename Reduction>
1661  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1662  nd_range<Dims> NDRange, PropertiesT &Properties,
1663  Reduction &Redu, KernelType &KernelFunc) {
1664  static_assert(Reduction::has_identity,
1665  "Identityless reductions are not supported by the "
1666  "group_reduce_and_multiple_kernels strategy.");
1667 
1668  // Before running the kernels, check that device has enough local memory
1669  // to hold local arrays that may be required for the reduction algorithm.
1670  // TODO: If the work-group-size is limited by the local memory, then
1671  // a special version of the main kernel may be created. The one that would
1672  // not use local accessors, which means it would not do the reduction in
1673  // the main kernel, but simply generate Range.get_global_range.size() number
1674  // of partial sums, leaving the reduction work to the additional/aux
1675  // kernels.
1676  constexpr bool HFR = Reduction::has_fast_reduce;
1677  size_t OneElemSize = HFR ? 0 : sizeof(typename Reduction::result_type);
1678  // TODO: currently the maximal work group size is determined for the given
1679  // queue/device, while it may be safer to use queries to the kernel compiled
1680  // for the device.
1681  size_t MaxWGSize = reduGetMaxWGSize(Queue, OneElemSize);
1682  if (NDRange.get_local_range().size() > MaxWGSize)
1683  throw sycl::runtime_error("The implementation handling parallel_for with"
1684  " reduction requires work group size not bigger"
1685  " than " +
1686  std::to_string(MaxWGSize),
1687  PI_ERROR_INVALID_WORK_GROUP_SIZE);
1688 
1689  size_t NElements = Reduction::num_elements;
1690  size_t NWorkGroups = NDRange.get_group_range().size();
1691  auto Out = Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);
1692 
1693  bool IsUpdateOfUserVar =
1694  !Reduction::is_usm && !Redu.initializeToIdentity() && NWorkGroups == 1;
1695 
1696  using Name = __sycl_reduction_kernel<
1697  reduction::MainKrn, KernelName,
1698  reduction::strategy::group_reduce_and_multiple_kernels>;
1699 
1700  CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<Dims> NDIt) {
1701  // Call user's functions. Reducer.MValue gets initialized there.
1702  typename Reduction::reducer_type Reducer;
1703  KernelFunc(NDIt, Reducer);
1704 
1705  // Compute the partial sum/reduction for the work-group.
1706  size_t WGID = NDIt.get_group_linear_id();
1707  typename Reduction::binary_operation BOp;
1708  for (int E = 0; E < NElements; ++E) {
1709  typename Reduction::result_type PSum;
1710  PSum = *getReducerAccess(Reducer).getElement(E);
1711  PSum = reduce_over_group(NDIt.get_group(), PSum, BOp);
1712  if (NDIt.get_local_linear_id() == 0) {
1713  if (IsUpdateOfUserVar)
1714  PSum = BOp(*Out[E], PSum);
1715  Out[WGID * NElements + E] = PSum;
1716  }
1717  }
1718  });
1719 
1721 
1722  // Run the additional kernel as many times as needed to reduce all partial
1723  // sums into one scalar.
1724 
1725  // TODO: Create a special slow/sequential version of the kernel that would
1726  // handle the reduction instead of reporting an assert below.
1727  if (MaxWGSize <= 1)
1728  throw sycl::runtime_error("The implementation handling parallel_for with "
1729  "reduction requires the maximal work group "
1730  "size to be greater than 1 to converge. "
1731  "The maximal work group size depends on the "
1732  "device and the size of the objects passed to "
1733  "the reduction.",
1734  PI_ERROR_INVALID_WORK_GROUP_SIZE);
1735  size_t NWorkItems = NDRange.get_group_range().size();
1736  while (NWorkItems > 1) {
1737  reduction::withAuxHandler(CGH, [&](handler &AuxHandler) {
1738  size_t NElements = Reduction::num_elements;
1739  size_t NWorkGroups;
1740  size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups);
1741 
1742  // The last work-group may be not fully loaded with work, or the work
1743  // group size may be not power of two. Those two cases considered
1744  // inefficient as they require additional code and checks in the kernel.
1745  bool HasUniformWG = NWorkGroups * WGSize == NWorkItems;
1746  if (!Reduction::has_fast_reduce)
1747  HasUniformWG = HasUniformWG && (WGSize & (WGSize - 1)) == 0;
1748 
1749  // Get read accessor to the buffer that was used as output
1750  // in the previous kernel.
1751  auto In = Redu.getReadAccToPreviousPartialReds(AuxHandler);
1752  auto Out =
1753  Redu.getWriteAccForPartialReds(NWorkGroups * NElements, AuxHandler);
1754 
1755  using Name = __sycl_reduction_kernel<
1756  reduction::AuxKrn, KernelName,
1757  reduction::strategy::group_reduce_and_multiple_kernels>;
1758 
1759  bool IsUpdateOfUserVar = !Reduction::is_usm &&
1760  !Redu.initializeToIdentity() &&
1761  NWorkGroups == 1;
1762  range<1> GlobalRange = {HasUniformWG ? NWorkItems
1763  : NWorkGroups * WGSize};
1764  nd_range<1> Range{GlobalRange, range<1>(WGSize)};
1765  AuxHandler.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
1766  typename Reduction::binary_operation BOp;
1767  size_t WGID = NDIt.get_group_linear_id();
1768  size_t GID = NDIt.get_global_linear_id();
1769 
1770  for (int E = 0; E < NElements; ++E) {
1771  typename Reduction::result_type PSum =
1772  (HasUniformWG || (GID < NWorkItems))
1773  ? *In[GID * NElements + E]
1776  PSum = reduce_over_group(NDIt.get_group(), PSum, BOp);
1777  if (NDIt.get_local_linear_id() == 0) {
1778  if (IsUpdateOfUserVar)
1779  PSum = BOp(*Out[E], PSum);
1780  Out[WGID * NElements + E] = PSum;
1781  }
1782  }
1783  });
1784  NWorkItems = NWorkGroups;
1785  });
1786  } // end while (NWorkItems > 1)
1787 
1788  if constexpr (Reduction::is_usm) {
1789  reduction::withAuxHandler(CGH, [&](handler &CopyHandler) {
1790  reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
1791  });
1792  }
1793  }
1794 };
1795 
1796 template <> struct NDRangeReduction<reduction::strategy::basic> {
1797  template <typename KernelName, int Dims, typename PropertiesT,
1798  typename KernelType, typename Reduction>
1799  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1800  nd_range<Dims> NDRange, PropertiesT &Properties,
1801  Reduction &Redu, KernelType &KernelFunc) {
1802  using element_type = typename Reduction::reducer_element_type;
1803 
1804  constexpr bool HFR = Reduction::has_fast_reduce;
1805  size_t OneElemSize = HFR ? 0 : sizeof(element_type);
1806  // TODO: currently the maximal work group size is determined for the given
1807  // queue/device, while it may be safer to use queries to the kernel
1808  // compiled for the device.
1809  size_t MaxWGSize = reduGetMaxWGSize(Queue, OneElemSize);
1810  if (NDRange.get_local_range().size() > MaxWGSize)
1811  throw sycl::runtime_error("The implementation handling parallel_for with"
1812  " reduction requires work group size not bigger"
1813  " than " +
1814  std::to_string(MaxWGSize),
1815  PI_ERROR_INVALID_WORK_GROUP_SIZE);
1816 
1817  size_t NWorkGroups = NDRange.get_group_range().size();
1818 
1819  bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
1820  std::ignore = IsUpdateOfUserVar;
1821 
1822  // The type of the Out "accessor" differs between scenarios when there is
1823  // just one WorkGroup and when there are multiple. Use this lambda to write
1824  // the code just once.
1825  auto First = [&](auto KernelTag) {
1826  // We can deduce IsOneWG from the tag type.
1827  constexpr bool IsOneWG =
1828  std::is_same_v<std::remove_reference_t<decltype(KernelTag)>,
1829  KernelOneWGTag>;
1830 
1831  constexpr size_t NElements = Reduction::num_elements;
1832 
1833  size_t WGSize = NDRange.get_local_range().size();
1834 
1835  auto Out = [&]() {
1836  if constexpr (IsOneWG)
1837  return Redu.getUserRedVarAccess(CGH);
1838  else
1839  return Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);
1840  }();
1841 
1842  // Use local memory to reduce elements in work-groups into 0-th element.
1843  local_accessor<element_type, 1> LocalReds{WGSize, CGH};
1844 
1845  auto BOp = Redu.getBinaryOperation();
1846  auto IdentityContainer = Redu.getIdentityContainer();
1847 
1848  using Name = __sycl_reduction_kernel<reduction::MainKrn, KernelName,
1849  reduction::strategy::basic,
1850  decltype(KernelTag)>;
1851 
1852  CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<Dims> NDIt) {
1853  // Call user's functions. Reducer.MValue gets initialized there.
1854  typename Reduction::reducer_type Reducer =
1855  typename Reduction::reducer_type(IdentityContainer, BOp);
1856  KernelFunc(NDIt, Reducer);
1857 
1858  size_t WGSize = NDIt.get_local_range().size();
1859  size_t LID = NDIt.get_local_linear_id();
1860 
1861  auto ElementCombiner = [&](element_type &LHS, const element_type &RHS) {
1862  return LHS.combine(BOp, RHS);
1863  };
1864 
1865  // If there are multiple values, reduce each separately
1866  // This prevents local memory from scaling with elements
1867  for (int E = 0; E < NElements; ++E) {
1868 
1869  doTreeReduction<WorkSizeGuarantees::Equal>(
1870  WGSize, NDIt, LocalReds, ElementCombiner,
1871  [&](size_t) { return getReducerAccess(Reducer).getElement(E); });
1872 
1873  // Compute the partial sum/reduction for the work-group.
1874  if (LID == 0) {
1875  element_type PSum = LocalReds[0];
1876  if constexpr (IsOneWG) {
1877  if (IsUpdateOfUserVar)
1878  PSum.combine(BOp, Out[E]);
1879  Out[E] = *PSum;
1880  } else {
1881  size_t GrID = NDIt.get_group_linear_id();
1882  Out[GrID * NElements + E] = PSum;
1883  }
1884  }
1885 
1886  // Ensure item 0 is finished with LocalReds before next iteration
1887  if (E != NElements - 1) {
1888  NDIt.barrier();
1889  }
1890  }
1891  });
1892  };
1893 
1894  if (NWorkGroups == 1)
1895  First(KernelOneWGTag{});
1896  else
1897  First(KernelMultipleWGTag{});
1898 
1900 
1901  // 2. Run the additional kernel as many times as needed to reduce
1902  // all partial sums into one scalar.
1903 
1904  // TODO: Create a special slow/sequential version of the kernel that would
1905  // handle the reduction instead of reporting an assert below.
1906  if (MaxWGSize <= 1)
1907  throw sycl::runtime_error("The implementation handling parallel_for with "
1908  "reduction requires the maximal work group "
1909  "size to be greater than 1 to converge. "
1910  "The maximal work group size depends on the "
1911  "device and the size of the objects passed to "
1912  "the reduction.",
1913  PI_ERROR_INVALID_WORK_GROUP_SIZE);
1914  size_t NWorkItems = NDRange.get_group_range().size();
1915  while (NWorkItems > 1) {
1916  size_t NWorkGroups;
1917  size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups);
1918 
1919  auto Rest = [&](auto KernelTag) {
1920  reduction::withAuxHandler(CGH, [&](handler &AuxHandler) {
1921  // We can deduce IsOneWG from the tag type.
1922  constexpr bool IsOneWG =
1923  std::is_same_v<std::remove_reference_t<decltype(KernelTag)>,
1924  KernelOneWGTag>;
1925 
1926  constexpr size_t NElements = Reduction::num_elements;
1927 
1928  // The last work-group may be not fully loaded with work, or the work
1929  // group size may be not power of two. Those two cases considered
1930  // inefficient as they require additional code and checks in the
1931  // kernel.
1932  bool HasUniformWG = NWorkGroups * WGSize == NWorkItems;
1933 
1934  // Get read accessor to the buffer that was used as output
1935  // in the previous kernel.
1936  auto In = Redu.getReadAccToPreviousPartialReds(AuxHandler);
1937 
1938  auto Out = [&]() {
1939  if constexpr (IsOneWG)
1940  return Redu.getUserRedVarAccess(AuxHandler);
1941  else
1942  return Redu.getWriteAccForPartialReds(NWorkGroups * NElements,
1943  AuxHandler);
1944  }();
1945 
1946  bool UniformPow2WG = HasUniformWG && (WGSize & (WGSize - 1)) == 0;
1947  // Use local memory to reduce elements in work-groups into 0-th
1948  // element.
1949  local_accessor<element_type, 1> LocalReds{WGSize, AuxHandler};
1950 
1951  auto BOp = Redu.getBinaryOperation();
1952  using Name = __sycl_reduction_kernel<reduction::AuxKrn, KernelName,
1953  reduction::strategy::basic,
1954  decltype(KernelTag)>;
1955 
1956  range<1> GlobalRange = {UniformPow2WG ? NWorkItems
1957  : NWorkGroups * WGSize};
1958  nd_range<1> Range{GlobalRange, range<1>(WGSize)};
1959  AuxHandler.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
1960  size_t WGSize = NDIt.get_local_range().size();
1961  size_t LID = NDIt.get_local_linear_id();
1962  size_t GID = NDIt.get_global_linear_id();
1963  size_t GrID = NDIt.get_group_linear_id();
1964 
1965  // The last work-group may not have enough work for all its items.
1966  size_t RemainingWorkSize =
1967  sycl::min(WGSize, NWorkItems - GrID * WGSize);
1968 
1969  auto ElementCombiner = [&](element_type &LHS,
1970  const element_type &RHS) {
1971  return LHS.combine(BOp, RHS);
1972  };
1973 
1974  for (int E = 0; E < NElements; ++E) {
1975 
1976  doTreeReduction<WorkSizeGuarantees::LessOrEqual>(
1977  RemainingWorkSize, NDIt, LocalReds, ElementCombiner,
1978  [&](size_t) { return In[GID * NElements + E]; });
1979 
1980  // Compute the partial sum/reduction for the work-group.
1981  if (LID == 0) {
1982  element_type PSum = LocalReds[0];
1983  if constexpr (IsOneWG) {
1984  if (IsUpdateOfUserVar)
1985  PSum.combine(BOp, Out[E]);
1986  Out[E] = *PSum;
1987  } else {
1988  Out[GrID * NElements + E] = PSum;
1989  }
1990  }
1991 
1992  // Ensure item 0 is finished with LocalReds before next iteration
1993  if (E != NElements - 1) {
1994  NDIt.barrier();
1995  }
1996  }
1997  });
1998  NWorkItems = NWorkGroups;
1999  });
2000  };
2001 
2002  if (NWorkGroups == 1)
2003  Rest(KernelOneWGTag{});
2004  else
2005  Rest(KernelMultipleWGTag{});
2006  } // end while (NWorkItems > 1)
2007  }
2008 };
2009 
2013 template <bool IsOneWG, typename... Reductions, size_t... Is>
2014 auto createReduOutAccs(size_t NWorkGroups, handler &CGH,
2015  std::tuple<Reductions...> &ReduTuple,
2016  std::index_sequence<Is...>) {
2017  return makeReduTupleT(
2018  std::get<Is>(ReduTuple).template getWriteMemForPartialReds<IsOneWG>(
2019  NWorkGroups *
2020  std::tuple_element_t<Is, std::tuple<Reductions...>>::num_elements,
2021  CGH)...);
2022 }
2023 
/// Combines the tree-reduced value in LocalAcc[0] with the "previous" value
/// for the final write-back. If the reduction has a known identity and the
/// user requested initialize_to_identity, the identity is used as the second
/// operand instead of the current content of the output.
template <typename OutAccT, typename LocalAccT, typename BOPT,
          typename IdentityContainerT>
auto getLastCombine(OutAccT OutAcc, LocalAccT LocalAcc, BOPT BOP,
                    IdentityContainerT IdentityContainer,
                    bool IsInitializeToIdentity) {
  if constexpr (IdentityContainerT::has_identity) {
    auto SecondOperand = IsInitializeToIdentity
                             ? IdentityContainer.getIdentity()
                             : OutAcc[0];
    return BOP(LocalAcc[0], SecondOperand);
  } else {
    // No known identity: the output always participates in the combine.
    return BOP(LocalAcc[0], OutAcc[0]);
  }
}
2037 
2038 template <bool IsOneWG, typename... Reductions, typename... OutAccT,
2039  typename... LocalAccT, typename... BOPsT, typename... Ts,
2040  size_t... Is>
2042  size_t OutAccIndex, ReduTupleT<OutAccT...> OutAccs,
2044  ReduTupleT<Ts...> IdentityVals,
2045  std::array<bool, sizeof...(Reductions)> IsInitializeToIdentity,
2046  std::index_sequence<Is...>) {
2047  if constexpr (IsOneWG) {
2048  // Add the initial value of user's variable to the final result.
2049  // After this we know there will be a value in the 0th element.
2050  ((std::get<Is>(LocalAccs)[0] = getLastCombine(
2051  std::get<Is>(OutAccs), std::get<Is>(LocalAccs), std::get<Is>(BOPs),
2052  std::get<Is>(IdentityVals), IsInitializeToIdentity[Is])),
2053  ...);
2054  ((std::get<Is>(OutAccs)[OutAccIndex] = *std::get<Is>(LocalAccs)[0]), ...);
2055  } else {
2056  // The partial sums for the work-group are stored in 0-th elements of local
2057  // accessors. Simply write those sums to output accessors.
2058  ((std::get<Is>(OutAccs)[OutAccIndex] = std::get<Is>(LocalAccs)[0]), ...);
2059  }
2060 }
2061 
// Concatenating a single empty sequence yields an empty sequence.
constexpr std::index_sequence<> concat_sequences(std::index_sequence<> Seq) {
  return Seq;
}

// Concatenating a sequence of exactly one element yields that sequence.
template <size_t I>
constexpr std::index_sequence<I> concat_sequences(std::index_sequence<I> Seq) {
  return Seq;
}

// Concatenate two (potentially empty) sequences.
template <size_t... Is, size_t... Js>
constexpr std::index_sequence<Is..., Js...>
concat_sequences(std::index_sequence<Is...>, std::index_sequence<Js...>) {
  return std::index_sequence<Is..., Js...>{};
}

// Concatenate three or more sequences by folding them pairwise from the left.
template <size_t... Is, size_t... Js, class... Rs>
constexpr auto concat_sequences(std::index_sequence<Is...> First,
                                std::index_sequence<Js...> Second,
                                Rs... Rest) {
  return concat_sequences(concat_sequences(First, Second), Rest...);
}
2086 
2088  template <typename T> struct Func {
2089  static constexpr bool value = !std::remove_pointer_t<T>::is_usm;
2090  };
2091 };
2092 
/// Predicate for filterSequence that selects nothing (always false). Struct
/// header restored after extraction dropped it; the name is visible at its
/// use site in reduAuxCGFunc.
struct EmptyReductionPredicate {
  template <typename T> struct Func {
    static constexpr bool value = false;
  };
};
2098 
/// Maps the pair (Cond, I) to either the singleton sequence <I> (kept) or the
/// empty sequence (filtered out). Implemented via partial specialization.
template <bool Cond, size_t I> struct FilterElement {
  using type = std::index_sequence<I>;
};
template <size_t I> struct FilterElement<false, I> {
  using type = std::index_sequence<>;
};
2103 
2109 template <typename... T, typename FunctorT, size_t... Is,
2110  std::enable_if_t<(sizeof...(Is) > 0), int> Z = 0>
2111 constexpr auto filterSequenceHelper(FunctorT, std::index_sequence<Is...>) {
2112  return concat_sequences(
2113  typename FilterElement<FunctorT::template Func<std::tuple_element_t<
2114  Is, std::tuple<T...>>>::value,
2115  Is>::type{}...);
2116 }
2117 template <typename... T, typename FunctorT, size_t... Is,
2118  std::enable_if_t<(sizeof...(Is) == 0), int> Z = 0>
2119 constexpr auto filterSequenceHelper(FunctorT, std::index_sequence<Is...>) {
2120  return std::index_sequence<>{};
2121 }
2122 
2126 template <typename... T, typename FunctorT, size_t... Is>
2127 constexpr auto filterSequence(FunctorT F, std::index_sequence<Is...> Indices) {
2128  return filterSequenceHelper<T...>(F, Indices);
2129 }
2130 
/// Predicate selecting scalar reductions: dims == 0 and a single element.
/// Struct header restored after extraction dropped it; the name is visible at
/// its use sites in reduCGFuncMulti/reduAuxCGFunc.
struct IsScalarReduction {
  template <typename Reduction> struct Func {
    static constexpr bool value =
        (Reduction::dims == 0 && Reduction::num_elements == 1);
  };
};
2137 
/// Predicate selecting array (span) reductions: dims == 1 with one or more
/// elements. Struct header restored after extraction dropped it; the name is
/// visible at its use sites in reduCGFuncMulti/reduAuxCGFunc.
struct IsArrayReduction {
  template <typename Reduction> struct Func {
    static constexpr bool value =
        (Reduction::dims == 1 && Reduction::num_elements >= 1);
  };
};
2144 
/// Wraps a user binary operation into a callable that combines two reducer
/// elements in place via ElementType::combine. BOP is captured by reference,
/// so it must outlive the returned callable.
template <typename ElementType, typename BOPT>
constexpr auto makeAdjustedBOP(BOPT &BOP) {
  return [&BOP](ElementType &LHS, const ElementType &RHS) {
    return LHS.combine(BOP, RHS);
  };
}
2151 
2152 template <typename... Reductions, typename... BOPsT, size_t... Is>
2153 constexpr auto makeAdjustedBOPs(ReduTupleT<BOPsT...> &BOPsTuple,
2154  std::index_sequence<Is...>) {
2155  return makeReduTupleT(
2157  Is, std::tuple<Reductions...>>::reducer_element_type>(
2158  std::get<Is>(BOPsTuple))...);
2159 }
2160 
2161 template <typename... Reductions, typename... BOPsT>
2162 constexpr auto makeAdjustedBOPs(ReduTupleT<BOPsT...> &BOPsTuple) {
2163  return makeAdjustedBOPs<Reductions...>(
2164  BOPsTuple, std::make_index_sequence<sizeof...(Reductions)>{});
2165 }
2166 
2169 template <bool IsOneWG, typename... Reductions, int Dims, typename... LocalAccT,
2170  typename... OutAccT, typename... ReducerT, typename... Ts,
2171  typename... BOPsT, size_t... Is>
2173  nd_item<Dims> NDIt, ReduTupleT<LocalAccT...> LocalAccsTuple,
2174  ReduTupleT<OutAccT...> OutAccsTuple, std::tuple<ReducerT...> &ReducersTuple,
2175  ReduTupleT<Ts...> IdentitiesTuple, ReduTupleT<BOPsT...> BOPsTuple,
2176  std::array<bool, sizeof...(Reductions)> InitToIdentityProps,
2177  std::index_sequence<Is...> ReduIndices) {
2178  size_t WGSize = NDIt.get_local_range().size();
2179  size_t LID = NDIt.get_local_linear_id();
2180 
2181  ((std::get<Is>(LocalAccsTuple)[LID] =
2182  getReducerAccess(std::get<Is>(ReducersTuple)).getElement(0)),
2183  ...);
2184 
2185  // We apply tree-reduction on reducer elements so we adjust the operations
2186  // to combine these.
2187  auto AdjustedBOPsTuple = makeAdjustedBOPs<Reductions...>(BOPsTuple);
2188 
2189  doTreeReductionOnTuple(WGSize, LID, LocalAccsTuple, AdjustedBOPsTuple,
2190  ReduIndices);
2191 
2192  // Compute the partial sum/reduction for the work-group.
2193  if (LID == 0) {
2194  size_t GrID = NDIt.get_group_linear_id();
2195  writeReduSumsToOutAccs<IsOneWG, Reductions...>(
2196  GrID, OutAccsTuple, LocalAccsTuple, AdjustedBOPsTuple, IdentitiesTuple,
2197  InitToIdentityProps, ReduIndices);
2198  }
2199 }
2200 
2202 template <bool IsOneWG, typename Reduction, int Dims, typename LocalAccT,
2203  typename OutAccT, typename ReducerT, typename BOPT>
2204 void reduCGFuncImplArrayHelper(nd_item<Dims> NDIt, LocalAccT LocalReds,
2205  OutAccT Out, ReducerT &Reducer, BOPT BOp,
2206  bool IsInitializeToIdentity) {
2207  using element_type = typename Reduction::reducer_element_type;
2208 
2209  size_t WGSize = NDIt.get_local_range().size();
2210  size_t LID = NDIt.get_local_linear_id();
2211 
2212  auto ElementCombiner = [&](element_type &LHS, const element_type &RHS) {
2213  return LHS.combine(BOp, RHS);
2214  };
2215 
2216  // If there are multiple values, reduce each separately
2217  // This prevents local memory from scaling with elements
2218  auto NElements = Reduction::num_elements;
2219  for (size_t E = 0; E < NElements; ++E) {
2220  doTreeReduction<WorkSizeGuarantees::Equal>(
2221  WGSize, NDIt, LocalReds, ElementCombiner,
2222  [&](size_t) { return getReducerAccess(Reducer).getElement(E); });
2223 
2224  // Add the initial value of user's variable to the final result.
2225  if (LID == 0) {
2226  size_t GrID = NDIt.get_group_linear_id();
2227  size_t OutIdx = GrID * NElements + E;
2228  if constexpr (IsOneWG) {
2229  // If there is only a single work-group, the output will be an actual
2230  // value rather than a potentially optional value.
2231  if constexpr (Reduction::has_identity) {
2232  Out[OutIdx] = *ElementCombiner(LocalReds[0], IsInitializeToIdentity
2233  ? Reducer.identity()
2234  : Out[E]);
2235  } else {
2236  Out[OutIdx] = *LocalReds[0];
2237  }
2238  } else {
2239  // Otherwise we propagate a potentially optional value.
2240  Out[OutIdx] = LocalReds[0];
2241  }
2242  }
2243 
2244  // Ensure item 0 is finished with LocalReds before next iteration
2245  if (E != NElements - 1) {
2246  NDIt.barrier();
2247  }
2248  }
2249 }
2250 
2251 template <bool IsOneWG, typename... Reductions, int Dims, typename... LocalAccT,
2252  typename... OutAccT, typename... ReducerT, typename... BOPsT,
2253  size_t... Is>
2255  nd_item<Dims> NDIt, ReduTupleT<LocalAccT...> LocalAccsTuple,
2256  ReduTupleT<OutAccT...> OutAccsTuple, std::tuple<ReducerT...> &ReducersTuple,
2257  ReduTupleT<BOPsT...> BOPsTuple,
2258  std::array<bool, sizeof...(Reductions)> InitToIdentityProps,
2259  std::index_sequence<Is...>) {
2260  using ReductionPack = std::tuple<Reductions...>;
2261  (reduCGFuncImplArrayHelper<IsOneWG, std::tuple_element_t<Is, ReductionPack>>(
2262  NDIt, std::get<Is>(LocalAccsTuple), std::get<Is>(OutAccsTuple),
2263  std::get<Is>(ReducersTuple), std::get<Is>(BOPsTuple),
2264  InitToIdentityProps[Is]),
2265  ...);
2266 }
2267 
// Kernel-name tag for the main multi-reduction kernel.
namespace reduction::main_krn {
template <class KernelName, class Accessor> struct NDRangeMulti;
} // namespace reduction::main_krn
2271 template <typename KernelName, typename KernelType, int Dims,
2272  typename PropertiesT, typename... Reductions, size_t... Is>
2273 void reduCGFuncMulti(handler &CGH, KernelType KernelFunc,
2274  const nd_range<Dims> &Range, PropertiesT Properties,
2275  std::tuple<Reductions...> &ReduTuple,
2276  std::index_sequence<Is...> ReduIndices) {
2277  size_t WGSize = Range.get_local_range().size();
2278 
2279  // Split reduction sequence into two:
2280  // 1) Scalar reductions
2281  // 2) Array reductions
2282  // This allows us to reuse the existing implementation for scalar reductions
2283  // and introduce a new implementation for array reductions. Longer term it
2284  // may make sense to generalize the code such that each phase below applies
2285  // to all available reduction implementations -- today all reduction classes
2286  // use the same privatization-based approach, so this is unnecessary.
2287  IsScalarReduction ScalarPredicate;
2288  auto ScalarIs = filterSequence<Reductions...>(ScalarPredicate, ReduIndices);
2289 
2290  IsArrayReduction ArrayPredicate;
2291  auto ArrayIs = filterSequence<Reductions...>(ArrayPredicate, ReduIndices);
2292 
2293  auto LocalAccsTuple = makeReduTupleT(
2295  CGH}...);
2296 
2297  // The type of the Out "accessor" differs between scenarios when there is just
2298  // one WorkGroup and when there are multiple. Use this lambda to write the
2299  // code just once.
2300  auto Rest = [&](auto KernelTag, auto OutAccsTuple) {
2301  auto IdentitiesTuple =
2302  makeReduTupleT(std::get<Is>(ReduTuple).getIdentityContainer()...);
2303  auto BOPsTuple =
2304  makeReduTupleT(std::get<Is>(ReduTuple).getBinaryOperation()...);
2305  std::array InitToIdentityProps{
2306  std::get<Is>(ReduTuple).initializeToIdentity()...};
2307 
2308  using Name = __sycl_reduction_kernel<reduction::MainKrn, KernelName,
2309  reduction::strategy::multi,
2310  decltype(KernelTag)>;
2311 
2312  CGH.parallel_for<Name>(Range, Properties, [=](nd_item<Dims> NDIt) {
2313  // We can deduce IsOneWG from the tag type.
2314  constexpr bool IsOneWG =
2315  std::is_same_v<std::remove_reference_t<decltype(KernelTag)>,
2316  KernelOneWGTag>;
2317 
2318  // Pass all reductions to user's lambda in the same order as supplied
2319  // Each reducer initializes its own storage
2320  auto ReduIndices = std::index_sequence_for<Reductions...>();
2321  auto ReducerTokensTuple =
2322  std::tuple{typename Reductions::reducer_token_type{
2323  std::get<Is>(IdentitiesTuple), std::get<Is>(BOPsTuple)}...};
2324  auto ReducersTuple = std::tuple<typename Reductions::reducer_type...>{
2325  std::get<Is>(ReducerTokensTuple)...};
2326  std::apply([&](auto &...Reducers) { KernelFunc(NDIt, Reducers...); },
2327  ReducersTuple);
2328 
2329  // Combine and write-back the results of any scalar reductions
2330  // reduCGFuncImplScalar<Reductions...>(NDIt, LocalAccsTuple, OutAccsTuple,
2331  // ReducersTuple, IdentitiesTuple, BOPsTuple, InitToIdentityProps,
2332  // ReduIndices);
2333  reduCGFuncImplScalar<IsOneWG, Reductions...>(
2334  NDIt, LocalAccsTuple, OutAccsTuple, ReducersTuple, IdentitiesTuple,
2335  BOPsTuple, InitToIdentityProps, ScalarIs);
2336 
2337  // Combine and write-back the results of any array reductions
2338  // These are handled separately to minimize temporary storage and account
2339  // for the fact that each array reduction may have a different number of
2340  // elements to reduce (i.e. a different extent).
2341  reduCGFuncImplArray<IsOneWG, Reductions...>(
2342  NDIt, LocalAccsTuple, OutAccsTuple, ReducersTuple, BOPsTuple,
2343  InitToIdentityProps, ArrayIs);
2344  });
2345  };
2346 
2347  size_t NWorkGroups = Range.get_group_range().size();
2348  if (NWorkGroups == 1)
2349  Rest(KernelOneWGTag{},
2350  createReduOutAccs<true>(NWorkGroups, CGH, ReduTuple, ReduIndices));
2351  else
2352  Rest(KernelMultipleWGTag{},
2353  createReduOutAccs<false>(NWorkGroups, CGH, ReduTuple, ReduIndices));
2354 }
2355 
2356 // TODO: Is this still needed?
2357 template <typename... Reductions, size_t... Is>
2359  std::tuple<Reductions...> &ReduTuple,
2360  std::index_sequence<Is...>) {
2361  auto ProcessOne = [&CGH](auto Redu) {
2362  if constexpr (!decltype(Redu)::is_usm)
2363  Redu.getUserRedVarAccess(CGH);
2364  };
2365  (ProcessOne(std::get<Is>(ReduTuple)), ...);
2366 }
2367 
2370 template <bool IsOneWG, typename... Reductions, int Dims, typename... LocalAccT,
2371  typename... InAccT, typename... OutAccT, typename... Ts,
2372  typename... BOPsT, size_t... Is>
2374  nd_item<Dims> NDIt, size_t LID, size_t GID, size_t RemainingWorkSize,
2375  ReduTupleT<LocalAccT...> LocalAccsTuple, ReduTupleT<InAccT...> InAccsTuple,
2376  ReduTupleT<OutAccT...> OutAccsTuple, ReduTupleT<Ts...> IdentitiesTuple,
2377  ReduTupleT<BOPsT...> BOPsTuple,
2378  std::array<bool, sizeof...(Reductions)> InitToIdentityProps,
2379  std::index_sequence<Is...> ReduIndices) {
2380  // The end work-group may have less work than the rest, so we only need to
2381  // read the value of the elements that still have work left.
2382  if (LID < RemainingWorkSize)
2383  ((std::get<Is>(LocalAccsTuple)[LID] = std::get<Is>(InAccsTuple)[GID]), ...);
2384 
2385  // We apply tree-reduction on reducer elements so we adjust the operations
2386  // to combine these.
2387  auto AdjustedBOPsTuple = makeAdjustedBOPs<Reductions...>(BOPsTuple);
2388 
2389  doTreeReductionOnTuple(RemainingWorkSize, LID, LocalAccsTuple,
2390  AdjustedBOPsTuple, ReduIndices);
2391 
2392  // Compute the partial sum/reduction for the work-group.
2393  if (LID == 0) {
2394  size_t GrID = NDIt.get_group_linear_id();
2395  writeReduSumsToOutAccs<IsOneWG, Reductions...>(
2396  GrID, OutAccsTuple, LocalAccsTuple, AdjustedBOPsTuple, IdentitiesTuple,
2397  InitToIdentityProps, ReduIndices);
2398  }
2399 }
2400 
2401 template <bool IsOneWG, typename Reduction, int Dims, typename LocalAccT,
2402  typename InAccT, typename OutAccT, typename T, typename BOPT>
2403 void reduAuxCGFuncImplArrayHelper(nd_item<Dims> NDIt, size_t LID, size_t GID,
2404  size_t RemainingWorkSize, LocalAccT LocalReds,
2405  InAccT In, OutAccT Out, T IdentityContainer,
2406  BOPT BOp, bool IsInitializeToIdentity) {
2407  using element_type = typename Reduction::reducer_element_type;
2408  auto ElementCombiner = [&](element_type &LHS, const element_type &RHS) {
2409  return LHS.combine(BOp, RHS);
2410  };
2411 
2412  // If there are multiple values, reduce each separately
2413  // This prevents local memory from scaling with elements
2414  auto NElements = Reduction::num_elements;
2415  for (size_t E = 0; E < NElements; ++E) {
2416  doTreeReduction<WorkSizeGuarantees::LessOrEqual>(
2417  RemainingWorkSize, NDIt, LocalReds, ElementCombiner,
2418  [&](size_t) { return In[GID * NElements + E]; });
2419 
2420  // Add the initial value of user's variable to the final result.
2421  if (LID == 0) {
2422  size_t GrID = NDIt.get_group_linear_id();
2423  size_t OutIdx = GrID * NElements + E;
2424  if constexpr (IsOneWG) {
2425  // If there is only a single work-group, the output will be an actual
2426  // value rather than a potentially optional value.
2427  if constexpr (Reduction::has_identity) {
2428  Out[OutIdx] = *ElementCombiner(LocalReds[0],
2429  IsInitializeToIdentity
2430  ? IdentityContainer.getIdentity()
2431  : Out[E]);
2432  } else {
2433  Out[OutIdx] = *LocalReds[0];
2434  }
2435  } else {
2436  // Otherwise we propagate a potentially optional value.
2437  Out[OutIdx] = LocalReds[0];
2438  }
2439  }
2440 
2441  // Ensure item 0 is finished with LocalReds before next iteration
2442  if (E != NElements - 1) {
2443  NDIt.barrier();
2444  }
2445  }
2446 }
2447 
2448 template <bool IsOneWG, typename... Reductions, int Dims, typename... LocalAccT,
2449  typename... InAccT, typename... OutAccT, typename... Ts,
2450  typename... BOPsT, size_t... Is>
2452  nd_item<Dims> NDIt, size_t LID, size_t GID, size_t RemainingWorkSize,
2453  ReduTupleT<LocalAccT...> LocalAccsTuple, ReduTupleT<InAccT...> InAccsTuple,
2454  ReduTupleT<OutAccT...> OutAccsTuple, ReduTupleT<Ts...> IdentitiesTuple,
2455  ReduTupleT<BOPsT...> BOPsTuple,
2456  std::array<bool, sizeof...(Reductions)> InitToIdentityProps,
2457  std::index_sequence<Is...>) {
2458  using ReductionPack = std::tuple<Reductions...>;
2460  std::tuple_element_t<Is, ReductionPack>>(
2461  NDIt, LID, GID, RemainingWorkSize, std::get<Is>(LocalAccsTuple),
2462  std::get<Is>(InAccsTuple), std::get<Is>(OutAccsTuple),
2463  std::get<Is>(IdentitiesTuple), std::get<Is>(BOPsTuple),
2464  InitToIdentityProps[Is]),
2465  ...);
2466 }
2467 
// Kernel-name tag for the auxiliary multi-reduction kernel.
namespace reduction::aux_krn {
template <class KernelName, class Predicate> struct Multi;
} // namespace reduction::aux_krn
2471 template <typename KernelName, typename KernelType, typename... Reductions,
2472  size_t... Is>
2473 size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
2474  std::tuple<Reductions...> &ReduTuple,
2475  std::index_sequence<Is...> ReduIndices) {
2476  size_t NWorkGroups;
2477  size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups);
2478 
2479  bool Pow2WG = (WGSize & (WGSize - 1)) == 0;
2480  bool HasUniformWG = Pow2WG && (NWorkGroups * WGSize == NWorkItems);
2481 
2482  // Like reduCGFuncImpl, we also have to split out scalar and array reductions
2483  IsScalarReduction ScalarPredicate;
2484  auto ScalarIs = filterSequence<Reductions...>(ScalarPredicate, ReduIndices);
2485 
2486  IsArrayReduction ArrayPredicate;
2487  auto ArrayIs = filterSequence<Reductions...>(ArrayPredicate, ReduIndices);
2488 
2489  auto LocalAccsTuple = makeReduTupleT(
2491  CGH}...);
2492  auto InAccsTuple = makeReduTupleT(
2493  std::get<Is>(ReduTuple).getReadAccToPreviousPartialReds(CGH)...);
2494 
2495  auto IdentitiesTuple =
2496  makeReduTupleT(std::get<Is>(ReduTuple).getIdentityContainer()...);
2497  auto BOPsTuple =
2498  makeReduTupleT(std::get<Is>(ReduTuple).getBinaryOperation()...);
2499  std::array InitToIdentityProps{
2500  std::get<Is>(ReduTuple).initializeToIdentity()...};
2501 
2502  // Predicate/OutAccsTuple below have different type depending on us having
2503  // just a single WG or multiple WGs. Use this lambda to avoid code
2504  // duplication.
2505  auto Rest = [&](auto Predicate, auto OutAccsTuple) {
2506  auto AccReduIndices = filterSequence<Reductions...>(Predicate, ReduIndices);
2507  associateReduAccsWithHandler(CGH, ReduTuple, AccReduIndices);
2508  using Name = __sycl_reduction_kernel<reduction::AuxKrn, KernelName,
2509  reduction::strategy::multi,
2510  decltype(Predicate)>;
2511  // TODO: Opportunity to parallelize across number of elements
2512  range<1> GlobalRange = {HasUniformWG ? NWorkItems : NWorkGroups * WGSize};
2513  nd_range<1> Range{GlobalRange, range<1>(WGSize)};
2514  CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
2515  // We can deduce IsOneWG from the predicate type.
2516  constexpr bool IsOneWG =
2517  std::is_same_v<std::remove_reference_t<decltype(Predicate)>,
2519 
2520  size_t WGSize = NDIt.get_local_range().size();
2521  size_t RemainingWorkSize =
2522  sycl::min(WGSize, NWorkItems - WGSize * NDIt.get_group_linear_id());
2523  size_t LID = NDIt.get_local_linear_id();
2524  size_t GID = NDIt.get_global_linear_id();
2525 
2526  // Handle scalar and array reductions
2527  reduAuxCGFuncImplScalar<IsOneWG, Reductions...>(
2528  NDIt, LID, GID, RemainingWorkSize, LocalAccsTuple, InAccsTuple,
2529  OutAccsTuple, IdentitiesTuple, BOPsTuple, InitToIdentityProps,
2530  ScalarIs);
2531  reduAuxCGFuncImplArray<IsOneWG, Reductions...>(
2532  NDIt, LID, GID, RemainingWorkSize, LocalAccsTuple, InAccsTuple,
2533  OutAccsTuple, IdentitiesTuple, BOPsTuple, InitToIdentityProps,
2534  ArrayIs);
2535  });
2536  };
2537  if (NWorkGroups == 1)
2539  createReduOutAccs<true>(NWorkGroups, CGH, ReduTuple, ReduIndices));
2540  else
2541  Rest(EmptyReductionPredicate{},
2542  createReduOutAccs<false>(NWorkGroups, CGH, ReduTuple, ReduIndices));
2543 
2544  return NWorkGroups;
2545 }
2546 
/// Base case: local-memory bytes needed per work-item for one reduction.
template <typename Reduction> size_t reduGetMemPerWorkItemHelper(Reduction &) {
  return sizeof(typename Reduction::result_type);
}
2550 
/// Recursive case: sums the per-work-item local-memory requirement of the
/// first reduction and all remaining ones.
template <typename Reduction, typename... RestT>
size_t reduGetMemPerWorkItemHelper(Reduction &, RestT... Rest) {
  return sizeof(typename Reduction::result_type) +
         reduGetMemPerWorkItemHelper(Rest...);
}
2556 
/// Returns the total per-work-item local-memory requirement of all reductions
/// in the tuple.
template <typename... ReductionT, size_t... Is>
size_t reduGetMemPerWorkItem(std::tuple<ReductionT...> &ReduTuple,
                             std::index_sequence<Is...>) {
  return reduGetMemPerWorkItemHelper(std::get<Is>(ReduTuple)...);
}
2562 
/// Builds a new tuple containing the elements of \p Tuple at the positions
/// named by the index sequence (elements are moved out of \p Tuple).
template <typename TupleT, std::size_t... Is>
std::tuple<std::tuple_element_t<Is, TupleT>...>
tuple_select_elements(TupleT Tuple, std::index_sequence<Is...>) {
  std::tuple<std::tuple_element_t<Is, TupleT>...> Selected{
      std::get<Is>(std::move(Tuple))...};
  return Selected;
}
2570 
2571 template <> struct NDRangeReduction<reduction::strategy::multi> {
2572  template <typename KernelName, int Dims, typename PropertiesT,
2573  typename... RestT>
2574  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
2575  nd_range<Dims> NDRange, PropertiesT &Properties,
2576  RestT... Rest) {
2577  std::tuple<RestT...> ArgsTuple(Rest...);
2578  constexpr size_t NumArgs = sizeof...(RestT);
2579  auto KernelFunc = std::get<NumArgs - 1>(ArgsTuple);
2580  auto ReduIndices = std::make_index_sequence<NumArgs - 1>();
2581  auto ReduTuple = detail::tuple_select_elements(ArgsTuple, ReduIndices);
2582 
2583  size_t LocalMemPerWorkItem = reduGetMemPerWorkItem(ReduTuple, ReduIndices);
2584  // TODO: currently the maximal work group size is determined for the given
2585  // queue/device, while it is safer to use queries to the kernel compiled
2586  // for the device.
2587  size_t MaxWGSize = reduGetMaxWGSize(Queue, LocalMemPerWorkItem);
2588  if (NDRange.get_local_range().size() > MaxWGSize)
2589  throw sycl::runtime_error("The implementation handling parallel_for with"
2590  " reduction requires work group size not bigger"
2591  " than " +
2592  std::to_string(MaxWGSize),
2593  PI_ERROR_INVALID_WORK_GROUP_SIZE);
2594 
2595  reduCGFuncMulti<KernelName>(CGH, KernelFunc, NDRange, Properties, ReduTuple,
2596  ReduIndices);
2598 
2599  size_t NWorkItems = NDRange.get_group_range().size();
2600  while (NWorkItems > 1) {
2601  reduction::withAuxHandler(CGH, [&](handler &AuxHandler) {
2602  NWorkItems = reduAuxCGFunc<KernelName, decltype(KernelFunc)>(
2603  AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
2604  });
2605  } // end while (NWorkItems > 1)
2606  }
2607 };
2608 
2609 // Auto-dispatch. Must be the last one.
2610 template <> struct NDRangeReduction<reduction::strategy::auto_select> {
2611  // Some readability aliases, to increase signal/noise ratio below.
2612  template <reduction::strategy Strategy>
2615 
2616  template <typename KernelName, int Dims, typename PropertiesT,
2617  typename KernelType, typename Reduction>
2618  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
2619  nd_range<Dims> NDRange, PropertiesT &Properties,
2620  Reduction &Redu, KernelType &KernelFunc) {
2621  auto Delegate = [&](auto Impl) {
2622  Impl.template run<KernelName>(CGH, Queue, NDRange, Properties, Redu,
2623  KernelFunc);
2624  };
2625 
2626  if constexpr (Reduction::has_float64_atomics) {
2627  if (getDeviceFromHandler(CGH).has(aspect::atomic64))
2629 
2630  if constexpr (Reduction::has_fast_reduce)
2632  else
2633  return Delegate(Impl<Strat::basic>{});
2634  } else if constexpr (Reduction::has_fast_atomics) {
2635  if constexpr (sizeof(typename Reduction::result_type) == 8) {
2636  // Both group_reduce_and_atomic_cross_wg and
2637  // local_mem_tree_and_atomic_cross_wg implicitly require
2638  // aspect::atomic64 if the result type of the reduction is 64-bit. If
2639  // the device does not support this, we need to fall back to more
2640  // reliable strategies.
2641  if (!getDeviceFromHandler(CGH).has(aspect::atomic64)) {
2642  if constexpr (Reduction::has_fast_reduce)
2644  else
2645  return Delegate(Impl<Strat::basic>{});
2646  }
2647  }
2648 
2649  if constexpr (Reduction::has_fast_reduce) {
2651  } else {
2653  }
2654  } else {
2655  if constexpr (Reduction::has_fast_reduce)
2657  else
2658  return Delegate(Impl<Strat::basic>{});
2659  }
2660 
2661  assert(false && "Must be unreachable!");
2662  }
2663  template <typename KernelName, int Dims, typename PropertiesT,
2664  typename... RestT>
2665  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
2666  nd_range<Dims> NDRange, PropertiesT &Properties,
2667  RestT... Rest) {
2668  return Impl<Strat::multi>::run<KernelName>(CGH, Queue, NDRange, Properties,
2669  Rest...);
2670  }
2671 };
2672 
2673 template <typename KernelName, reduction::strategy Strategy, int Dims,
2674  typename PropertiesT, typename... RestT>
2676  PropertiesT Properties, RestT... Rest) {
2677  NDRangeReduction<Strategy>::template run<KernelName>(CGH, CGH.MQueue, NDRange,
2678  Properties, Rest...);
2679 }
2680 
2681 __SYCL_EXPORT uint32_t
2682 reduGetMaxNumConcurrentWorkGroups(std::shared_ptr<queue_impl> Queue);
2683 
2684 template <typename KernelName, reduction::strategy Strategy, int Dims,
2685  typename PropertiesT, typename... RestT>
2687  PropertiesT Properties, RestT... Rest) {
2688  std::tuple<RestT...> ArgsTuple(Rest...);
2689  constexpr size_t NumArgs = sizeof...(RestT);
2690  static_assert(NumArgs > 1, "No reduction!");
2691  auto KernelFunc = std::get<NumArgs - 1>(ArgsTuple);
2692  auto ReduIndices = std::make_index_sequence<NumArgs - 1>();
2693  auto ReduTuple = detail::tuple_select_elements(ArgsTuple, ReduIndices);
2694 
2695  // Before running the kernels, check that the device has enough local memory
2696  // to hold the local arrays required for the tree-reduction algorithm.
2697  size_t OneElemSize = [&]() {
2698  // Can't use outlined NumArgs due to a bug in gcc 8.4.
2699  if constexpr (sizeof...(RestT) == 2) {
2700  using Reduction = std::tuple_element_t<0, decltype(ReduTuple)>;
2701  constexpr bool IsTreeReduction =
2702  !Reduction::has_fast_reduce && !Reduction::has_fast_atomics;
2703  return IsTreeReduction ? sizeof(typename Reduction::result_type) : 0;
2704  } else {
2705  return reduGetMemPerWorkItem(ReduTuple, ReduIndices);
2706  }
2707  }();
2708 
2709  uint32_t NumConcurrentWorkGroups =
2710 #ifdef __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS
2711  __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS;
2712 #else
2714 #endif
2715 
2716  // TODO: currently the preferred work group size is determined for the given
2717  // queue/device, while it is safer to use queries to the kernel pre-compiled
2718  // for the device.
2719  size_t PrefWGSize = reduGetPreferredWGSize(CGH.MQueue, OneElemSize);
2720 
2721  size_t NWorkItems = Range.size();
2722  size_t WGSize = std::min(NWorkItems, PrefWGSize);
2723  size_t NWorkGroups = NWorkItems / WGSize;
2724  if (NWorkItems % WGSize)
2725  NWorkGroups++;
2726  size_t MaxNWorkGroups = NumConcurrentWorkGroups;
2727  NWorkGroups = std::min(NWorkGroups, MaxNWorkGroups);
2728  size_t NDRItems = NWorkGroups * WGSize;
2729  nd_range<1> NDRange{range<1>{NDRItems}, range<1>{WGSize}};
2730 
2731  size_t PerGroup = Range.size() / NWorkGroups;
2732  // Iterate through the index space by assigning contiguous chunks to each
2733  // work-group, then iterating through each chunk using a stride equal to the
2734  // work-group's local range, which gives much better performance than using
2735  // stride equal to 1. For each index, the original KernelFunc is called
2736  // and the reduction value held in \p Reducer is accumulated in those
2737  // calls.
2738  auto UpdatedKernelFunc = [=](auto NDId, auto &...Reducers) {
2739  // Divide into contiguous chunks and assign each chunk to a Group
2740  // Rely on precomputed division to avoid repeating expensive operations
2741  // TODO: Some devices may prefer alternative remainder handling
2742  auto Group = NDId.get_group();
2743  size_t GroupId = Group.get_group_linear_id();
2744  size_t NumGroups = Group.get_group_linear_range();
2745  bool LastGroup = (GroupId == NumGroups - 1);
2746  size_t GroupStart = GroupId * PerGroup;
2747  size_t GroupEnd = LastGroup ? Range.size() : (GroupStart + PerGroup);
2748 
2749  // Loop over the contiguous chunk
2750  size_t Start = GroupStart + NDId.get_local_id(0);
2751  size_t End = GroupEnd;
2752  size_t Stride = NDId.get_local_range(0);
2753  auto GetDelinearized = [&](size_t I) {
2754  auto Id = getDelinearizedId(Range, I);
2755  if constexpr (std::is_invocable_v<decltype(KernelFunc), id<Dims>,
2756  decltype(Reducers)...>)
2757  return Id;
2758  else
2759  // SYCL doesn't provide a parallel_for accepting an offset in the presence
2760  // of reductions, so use with_offset==false.
2761  return reduction::getDelinearizedItem(Range, Id);
2762  };
2763  for (size_t I = Start; I < End; I += Stride)
2764  KernelFunc(GetDelinearized(I), Reducers...);
2765  };
2766  if constexpr (NumArgs == 2) {
2767  using Reduction = std::tuple_element_t<0, decltype(ReduTuple)>;
2768  auto &Redu = std::get<0>(ReduTuple);
2769 
2770  constexpr auto StrategyToUse = [&]() {
2771  if constexpr (Strategy != reduction::strategy::auto_select)
2772  return Strategy;
2773 
2774  // TODO: Both group_reduce_and_last_wg_detection and range_basic require
2775  // memory_order::acq_rel support that isn't guaranteed by the
2776  // specification. However, implementing a run-time check for that would
2777  // result in extra kernel compilation(s). We probably need to
2778  // investigate if the usage of kernel_bundles can mitigate that.
2779  // TODO: local_atomic_and_atomic_cross_wg uses atomics on the partial
2780  // results, which may add an implicit requirement on aspect::atomic64. As
2781  // a temporary work-around we do not pick this if the result type is
2782  // 64-bit. In the future this selection should be done at runtime based
2783  // on the device.
2784  // Note: Identityless reductions cannot use group reductions.
2785  if constexpr (Reduction::has_fast_reduce && Reduction::has_identity)
2786  return reduction::strategy::group_reduce_and_last_wg_detection;
2787  else if constexpr (Reduction::has_fast_atomics &&
2788  sizeof(typename Reduction::result_type) != 8)
2789  return reduction::strategy::local_atomic_and_atomic_cross_wg;
2790  else
2791  return reduction::strategy::range_basic;
2792  }();
2793 
2794  reduction_parallel_for<KernelName, StrategyToUse>(CGH, NDRange, Properties,
2795  Redu, UpdatedKernelFunc);
2796  } else {
2797  return std::apply(
2798  [&](auto &...Reds) {
2799  return reduction_parallel_for<KernelName, Strategy>(
2800  CGH, NDRange, Properties, Reds..., UpdatedKernelFunc);
2801  },
2802  ReduTuple);
2803  }
2804 }
2805 } // namespace detail
2806 
2811 template <typename T, typename AllocatorT, typename BinaryOperation>
2813  BinaryOperation Combiner, const property_list &PropList = {}) {
2814  std::ignore = CGH;
2815  bool InitializeToIdentity =
2816  PropList.has_property<property::reduction::initialize_to_identity>();
2817  return detail::make_reduction<BinaryOperation, 0, 1, false>(
2818  Var, Combiner, InitializeToIdentity);
2819 }
2820 
2826 template <typename T, typename BinaryOperation>
2827 auto reduction(T *Var, BinaryOperation Combiner,
2828  const property_list &PropList = {}) {
2829  bool InitializeToIdentity =
2830  PropList.has_property<property::reduction::initialize_to_identity>();
2831  return detail::make_reduction<BinaryOperation, 0, 1, false>(
2832  Var, Combiner, InitializeToIdentity);
2833 }
2834 
2838 template <typename T, typename AllocatorT, typename BinaryOperation>
2839 auto reduction(buffer<T, 1, AllocatorT> Var, handler &CGH, const T &Identity,
2840  BinaryOperation Combiner, const property_list &PropList = {}) {
2841  std::ignore = CGH;
2842  bool InitializeToIdentity =
2843  PropList.has_property<property::reduction::initialize_to_identity>();
2844  return detail::make_reduction<BinaryOperation, 0, 1, true>(
2845  Var, Identity, Combiner, InitializeToIdentity);
2846 }
2847 
2851 template <typename T, typename BinaryOperation>
2852 auto reduction(T *Var, const T &Identity, BinaryOperation Combiner,
2853  const property_list &PropList = {}) {
2854  bool InitializeToIdentity =
2855  PropList.has_property<property::reduction::initialize_to_identity>();
2856  return detail::make_reduction<BinaryOperation, 0, 1, true>(
2857  Var, Identity, Combiner, InitializeToIdentity);
2858 }
2859 
2865 template <typename T, size_t Extent, typename BinaryOperation,
2866  typename = std::enable_if_t<Extent != dynamic_extent>>
2867 auto reduction(span<T, Extent> Span, BinaryOperation Combiner,
2868  const property_list &PropList = {}) {
2869  bool InitializeToIdentity =
2870  PropList.has_property<property::reduction::initialize_to_identity>();
2871  return detail::make_reduction<BinaryOperation, 1, Extent, false>(
2872  Span.data(), Combiner, InitializeToIdentity);
2873 }
2874 
2878 template <typename T, size_t Extent, typename BinaryOperation,
2879  typename = std::enable_if_t<Extent != dynamic_extent>>
2880 auto reduction(span<T, Extent> Span, const T &Identity,
2881  BinaryOperation Combiner, const property_list &PropList = {}) {
2882  bool InitializeToIdentity =
2883  PropList.has_property<property::reduction::initialize_to_identity>();
2884  return detail::make_reduction<BinaryOperation, 1, Extent, true>(
2885  Span.data(), Identity, Combiner, InitializeToIdentity);
2886 }
2887 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
2888 } // namespace sycl
sycl::_V1::reducer< T, BinaryOperation, Dims, Extent, IdentityContainerT, View, std::enable_if_t< Dims==1 &&View==false &&!detail::IsKnownIdentityOp< T, BinaryOperation >::value > >::identity
std::enable_if_t< HasIdentityRelay &&(HasIdentityRelay==has_identity), T > identity() const
Definition: reduction.hpp:706
sycl::_V1::handler::copy
void copy(accessor< T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder > Src, std::shared_ptr< T_Dst > Dst)
Copies the content of memory object accessed by Src into the memory pointed by Dst.
Definition: handler.hpp:2198
sycl::_V1::detail::reduGetMaxNumConcurrentWorkGroups
uint32_t reduGetMaxNumConcurrentWorkGroups(std::shared_ptr< queue_impl > Queue)
sycl::_V1::detail::concat_sequences
constexpr auto concat_sequences(std::index_sequence< Is... >, std::index_sequence< Js... >, Rs...)
Definition: reduction.hpp:2082
sycl::_V1::reducer< T, BinaryOperation, Dims, Extent, IdentityContainerT, View, std::enable_if_t< Dims==0 &&View==true > >::reducer
reducer(const detail::ReducerToken< BinaryOperation, IdentityContainerT > &Token)
Definition: reduction.hpp:644
sycl::_V1::reducer
Class that is used to represent objects that are passed to user's lambda functions and representing u...
Definition: reduction.hpp:70
sycl::_V1::detail::get_red_t< T * >::type
T type
Definition: reduction.hpp:790
sycl::_V1::reducer< T, BinaryOperation, Dims, Extent, IdentityContainerT, View, std::enable_if_t< Dims==1 &&View==false &&detail::IsKnownIdentityOp< T, BinaryOperation >::value > >::reducer
reducer(const IdentityContainerT &, BinaryOperation)
Definition: reduction.hpp:741
sycl::_V1::property_list
Objects of the property_list class are containers for the SYCL properties.
Definition: property_list.hpp:24
sycl::_V1::detail::writeReduSumsToOutAccs
void writeReduSumsToOutAccs(size_t OutAccIndex, ReduTupleT< OutAccT... > OutAccs, ReduTupleT< LocalAccT... > LocalAccs, ReduTupleT< BOPsT... > BOPs, ReduTupleT< Ts... > IdentityVals, std::array< bool, sizeof...(Reductions)> IsInitializeToIdentity, std::index_sequence< Is... >)
Definition: reduction.hpp:2041
sycl::_V1::nd_range::get_group_range
range< dimensions > get_group_range() const
Definition: nd_range.hpp:44
sycl::_V1::reducer< T, BinaryOperation, Dims, Extent, IdentityContainerT, View, std::enable_if_t< Dims==1 &&View==false &&!detail::IsKnownIdentityOp< T, BinaryOperation >::value > >::reducer
reducer(const detail::ReducerToken< BinaryOperation, IdentityContainerT > &Token)
Definition: reduction.hpp:690
sycl::_V1::reducer< T, BinaryOperation, Dims, Extent, IdentityContainerT, View, std::enable_if_t< Dims==0 &&Extent==1 &&View==false &&!detail::IsKnownIdentityOp< T, BinaryOperation >::value > >::reducer
reducer(const IdentityContainerT &IdentityContainer, BinaryOperation BOp)
Definition: reduction.hpp:539
sycl::_V1::detail::reduction::AuxKrn
Definition: reduction.hpp:1187
sycl::_V1::detail::workGroupBarrier
static void workGroupBarrier()
Definition: group.hpp:33
sycl::_V1::detail::reduCGFuncImplScalar
void reduCGFuncImplScalar(nd_item< Dims > NDIt, ReduTupleT< LocalAccT... > LocalAccsTuple, ReduTupleT< OutAccT... > OutAccsTuple, std::tuple< ReducerT... > &ReducersTuple, ReduTupleT< Ts... > IdentitiesTuple, ReduTupleT< BOPsT... > BOPsTuple, std::array< bool, sizeof...(Reductions)> InitToIdentityProps, std::index_sequence< Is... > ReduIndices)
All scalar reductions are processed together; there is one loop of log2(N) steps, and each reduction ...
Definition: reduction.hpp:2172
sycl::_V1::access::mode
mode
Definition: access.hpp:30
sycl::_V1::range::size
size_t size() const
Definition: range.hpp:50
sycl::_V1::detail::NDRangeReduction< reduction::strategy::local_mem_tree_and_atomic_cross_wg >::run
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
Definition: reduction.hpp:1600
sycl::_V1::detail::NDRangeReduction< reduction::strategy::group_reduce_and_last_wg_detection >::run
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
Definition: reduction.hpp:1261
sycl::_V1::detail::ReductionIdentityContainer< T, BinaryOperation, ExplicitIdentity, enable_if_t< IsKnownIdentityOp< T, BinaryOperation >::value > >::getIdentity
static constexpr T getIdentity()
Returns the statically known identity value.
Definition: reduction.hpp:395
sycl::_V1::detail::WorkSizeGuarantees
WorkSizeGuarantees
Definition: reduction.hpp:1402
T
sycl::_V1::detail::combiner::atomic_combine
std::enable_if_t< BasicCheck< _T, Space, _BinaryOperation > &&(IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value||IsReduOptForAtomic64Op< _T, _BinaryOperation >::value) &&IsMinimum< _T, _BinaryOperation >::value > atomic_combine(_T *ReduVarPtr) const
Atomic MIN operation: *ReduVarPtr = sycl::minimum(*ReduVarPtr, MValue);.
Definition: reduction.hpp:356
sycl::_V1::detail::reduction_impl_algo
Definition: reduction.hpp:813
sycl::_V1::detail::ReductionIdentityContainer
Templated class for common functionality of all reduction implementation classes.
Definition: reduction.hpp:378
sycl::_V1::detail::NDRangeReduction< reduction::strategy::range_basic >::run
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
Definition: reduction.hpp:1456
sycl::_V1::detail::IsScalarReduction::Func
Definition: reduction.hpp:2132
sycl::_V1::detail::reduction_impl_algo::reducer_element_type
typename ReducerTraits< reducer_type >::element_type reducer_element_type
Definition: reduction.hpp:850
sycl::_V1::detail::NDRangeReduction< reduction::strategy::group_reduce_and_atomic_cross_wg >::run
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
Definition: reduction.hpp:1564
sycl::_V1::reducer< T, BinaryOperation, Dims, Extent, IdentityContainerT, View, std::enable_if_t< Dims==0 &&View==true > >::combine
reducer & combine(const T &Partial)
Definition: reduction.hpp:653
sycl::_V1::detail::makeReduTupleT
ReduTupleT< Ts... > makeReduTupleT(Ts... Elements)
Definition: reduction.hpp:139
sycl::_V1::detail::data_dim_t
Definition: reduction.hpp:777
sycl::_V1::detail::auto_name
This class is the default KernelName template parameter type for kernel invocation APIs such as singl...
Definition: kernel.hpp:40
sycl::_V1::detail::ReducerTraits< reducer< T, BinaryOperation, Dims, Extent, IdentityContainerT, View, Subst > >::type
T type
Definition: reduction.hpp:161
sycl::_V1::detail::reduction_impl_algo::getWriteMemForPartialReds
auto getWriteMemForPartialReds(size_t Size, handler &CGH)
Definition: reduction.hpp:896
sycl::_V1::reducer< T, BinaryOperation, Dims, Extent, IdentityContainerT, View, std::enable_if_t< Dims==1 &&View==false &&detail::IsKnownIdentityOp< T, BinaryOperation >::value > >::identity
T identity() const
Definition: reduction.hpp:759
sycl::_V1::detail::reduction_impl
This class encapsulates the reduction variable/accessor, the reduction operator and an optional opera...
Definition: reduction.hpp:1086
tuple.hpp
sycl::_V1::detail::ReducerElement::combine
ReducerElement & combine(BinaryOperation BinOp, const T &OtherValue)
Definition: reduction.hpp:458
sycl::_V1::detail::IsBitXOR
std::bool_constant< std::is_same_v< BinaryOperation, sycl::bit_xor< T > >||std::is_same_v< BinaryOperation, sycl::bit_xor< void > >> IsBitXOR
Definition: known_identity.hpp:54
__SYCL_INLINE_VER_NAMESPACE
#define __SYCL_INLINE_VER_NAMESPACE(X)
Definition: defines_elementary.hpp:11
sycl::_V1::detail::reducer_common
Definition: reduction.hpp:500
sycl::_V1::detail::ReducerElement::operator*
constexpr const T & operator*() const noexcept
Definition: reduction.hpp:483
sycl::_V1::detail::__sycl_init_mem_for
std::conditional_t< std::is_same_v< KernelName, auto_name >, auto_name, reduction::InitMemKrn< KernelName > > __sycl_init_mem_for
A helper to pass undefined (sycl::detail::auto_name) names unmodified.
Definition: reduction.hpp:809
sycl::_V1::buffer
Defines a shared array that can be used by kernels in queues.
Definition: buffer.hpp:37
sycl::_V1::detail::reduCGFuncImplArrayHelper
void reduCGFuncImplArrayHelper(nd_item< Dims > NDIt, LocalAccT LocalReds, OutAccT Out, ReducerT &Reducer, BOPT BOp, bool IsInitializeToIdentity)
Each array reduction is processed separately.
Definition: reduction.hpp:2204
sycl::_V1::nd_item::get_local_range
range< dimensions > get_local_range() const
Definition: nd_item.hpp:102
sycl::_V1::detail::makeAdjustedBOPs
constexpr auto makeAdjustedBOPs(ReduTupleT< BOPsT... > &BOPsTuple)
Definition: reduction.hpp:2162
sycl::_V1::detail::reduction::aux_krn::Multi
Definition: reduction.hpp:2469
sycl::_V1::detail::reduAuxCGFuncImplArray
void reduAuxCGFuncImplArray(nd_item< Dims > NDIt, size_t LID, size_t GID, size_t RemainingWorkSize, ReduTupleT< LocalAccT... > LocalAccsTuple, ReduTupleT< InAccT... > InAccsTuple, ReduTupleT< OutAccT... > OutAccsTuple, ReduTupleT< Ts... > IdentitiesTuple, ReduTupleT< BOPsT... > BOPsTuple, std::array< bool, sizeof...(Reductions)> InitToIdentityProps, std::index_sequence< Is... >)
Definition: reduction.hpp:2451
sycl::_V1::detail::ReducerAccess::getIdentity
constexpr auto getIdentity()
Definition: reduction.hpp:178
sycl::_V1::detail::__sycl_reduction_kernel
std::conditional_t< std::is_same_v< KernelName, auto_name >, auto_name, MainOrAux< KernelName, Strategy, Ts... > > __sycl_reduction_kernel
A helper to pass undefined (sycl::detail::auto_name) names unmodified.
Definition: reduction.hpp:1200
sycl::_V1::device::has
bool has(aspect Aspect) const __SYCL_WARN_IMAGE_ASPECT(Aspect)
Indicates if the SYCL device has the given feature.
Definition: device.cpp:209
sycl::_V1::detail::ReducerTraits
Helper class for accessing reducer-defined types in CRTP May prove to be useful for other things late...
Definition: reduction.hpp:155
accessor_property_list.hpp
sycl::_V1::detail::reduComputeWGSize
size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize, size_t &NWorkGroups)
Definition: reduction.cpp:19
sycl::_V1::detail::doTreeReductionHelper
void doTreeReductionHelper(size_t WorkSize, size_t LID, FuncTy Func)
Definition: reduction.hpp:1372
sycl::_V1::detail::filterSequenceHelper
constexpr auto filterSequenceHelper(FunctorT, std::index_sequence< Is... >)
For each index 'I' from the given indices pack 'Is' this function initially creates a number of short...
Definition: reduction.hpp:2111
sycl::_V1::queue::memset
event memset(void *Ptr, int Value, size_t Count _CODELOCPARAM(&CodeLoc))
Fills the memory pointed by a USM pointer with the value specified.
sycl::_V1::detail::reduction_impl_algo::getWriteAccForPartialReds
auto getWriteAccForPartialReds(size_t Size, handler &CGH)
Returns an accessor accessing the memory that will hold the reduction partial sums.
Definition: reduction.hpp:921
sycl::_V1::detail::createReduOutAccs
auto createReduOutAccs(size_t NWorkGroups, handler &CGH, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
For the given 'Reductions' types pack and indices enumerating them this function either creates new t...
Definition: reduction.hpp:2014
sycl::_V1::detail::IsReduction
Predicate returning true if a type is a reduction.
Definition: reduction.hpp:36
sycl::_V1::span
Definition: sycl_span.hpp:152
sycl::_V1::detail::ReducerToken::IdentityContainer
const IdentityContainerT & IdentityContainer
Definition: reduction.hpp:510
sycl::_V1::detail::known_identity_impl
Definition: known_identity.hpp:136
sycl
----- Error handling, matching OpenCL plugin semantics.
Definition: access.hpp:14
sycl::_V1::detail::ReducerElement::ReducerElement
ReducerElement(const ReductionIdentityContainer< T, BinaryOperation, ExplicitIdentity > &IdentityContainer)
Definition: reduction.hpp:453
sycl::_V1::detail::reducer_common::binary_operation
BinaryOperation binary_operation
Definition: reduction.hpp:503
sycl::_V1::detail::associateReduAccsWithHandler
void associateReduAccsWithHandler(handler &CGH, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
Definition: reduction.hpp:2358
usm.hpp
sycl::_V1::detail::reduction::main_krn::NDRangeMulti
Definition: reduction.hpp:2269
sycl::_V1::detail::EmptyReductionPredicate
Definition: reduction.hpp:2093
sycl::_V1::detail::NDRangeReduction< reduction::strategy::local_atomic_and_atomic_cross_wg >::run
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
Definition: reduction.hpp:1210
sycl::_V1::no_init
constexpr property::no_init no_init
Definition: accessor_properties.hpp:28
sycl::_V1::detail::combiner
Use CRTP to avoid redefining shorthand operators in terms of combine.
Definition: reduction.hpp:217
sycl::_V1::detail::make_tuple
constexpr tuple< Ts... > make_tuple(Ts... Args)
Definition: tuple.hpp:36
sycl::_V1::detail::KernelOneWGTag
Definition: reduction.hpp:1191
sycl::_V1::detail::FilterElement
Definition: reduction.hpp:2099
sycl::_V1::detail::reduGetMemPerWorkItem
size_t reduGetMemPerWorkItem(std::tuple< ReductionT... > &ReduTuple, std::index_sequence< Is... >)
Definition: reduction.hpp:2558
sycl::_V1::detail::combiner::atomic_combine
std::enable_if_t< BasicCheck< _T, Space, _BinaryOperation > &&(IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value||IsReduOptForAtomic64Op< _T, _BinaryOperation >::value) &&IsMaximum< _T, _BinaryOperation >::value > atomic_combine(_T *ReduVarPtr) const
Atomic MAX operation: *ReduVarPtr = sycl::maximum(*ReduVarPtr, MValue);.
Definition: reduction.hpp:368
sycl::_V1::handler::single_task
void single_task(_KERNELFUNCPARAM(KernelFunc))
Defines and invokes a SYCL kernel function as a function object type.
Definition: handler.hpp:1638
sycl::_V1::nd_item::get_global_linear_id
size_t __SYCL_ALWAYS_INLINE get_global_linear_id() const
Definition: nd_item.hpp:52
sycl::_V1::detail::memory_scope
sycl::memory_scope memory_scope
Definition: atomic_ref.hpp:31
sycl::_V1::detail::ReduTupleT
sycl::detail::tuple< Ts... > ReduTupleT
Definition: reduction.hpp:138
sycl::_V1::id< Dims >
sycl::_V1::detail::ReducerAccess
Helper class for accessing internal reducer member functions.
Definition: reduction.hpp:170
atomic_ref.hpp
reduction_properties.hpp
sycl::_V1::detail::combiner::atomic_combine
std::enable_if_t< std::is_same_v< remove_decoration_t< _T >, _T > &&IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value &&IsBitAND< _T, _BinaryOperation >::value &&(Space==access::address_space::global_space||Space==access::address_space::local_space)> atomic_combine(_T *ReduVarPtr) const
Atomic BITWISE AND operation: *ReduVarPtr &= MValue;.
Definition: reduction.hpp:344
sycl::_V1::detail::ReducerAccess::getElement
auto & getElement(size_t E)
Definition: reduction.hpp:174
sycl::_V1::detail::reduction_impl_algo::withInitializedMem
std::enable_if_t< HasIdentity > withInitializedMem(handler &CGH, FuncTy Func)
Provide Func with a properly initialized memory to write the reduction result to.
Definition: reduction.hpp:954
sycl::_V1::detail::reduction::finalizeHandler
void finalizeHandler(handler &CGH)
Definition: reduction.hpp:1147
sycl::_V1::detail::IsMinimum
std::bool_constant< std::is_same_v< BinaryOperation, sycl::minimum< T > >||std::is_same_v< BinaryOperation, sycl::minimum< void > >> IsMinimum
Definition: known_identity.hpp:34
sycl::_V1::detail::reduCGFuncImplArray
void reduCGFuncImplArray(nd_item< Dims > NDIt, ReduTupleT< LocalAccT... > LocalAccsTuple, ReduTupleT< OutAccT... > OutAccsTuple, std::tuple< ReducerT... > &ReducersTuple, ReduTupleT< BOPsT... > BOPsTuple, std::array< bool, sizeof...(Reductions)> InitToIdentityProps, std::index_sequence< Is... >)
Definition: reduction.hpp:2254
sycl::_V1::detail::reduGetPreferredWGSize
size_t reduGetPreferredWGSize(std::shared_ptr< queue_impl > &Queue, size_t LocalMemBytesPerWorkItem)
Definition: reduction.cpp:105
sycl::_V1::detail::reduction_impl_algo::result_type
T result_type
Definition: reduction.hpp:851
sycl::_V1::nd_item::barrier
void barrier(access::fence_space accessSpace=access::fence_space::global_and_local) const
Definition: nd_item.hpp:116
sycl::_V1::detail::reduGetMemPerWorkItemHelper
size_t reduGetMemPerWorkItemHelper(Reduction &, RestT... Rest)
Definition: reduction.hpp:2552
sycl::_V1::detail::reduction_impl::reduction_impl
reduction_impl(RedOutVar &Var, const T &Identity, BinaryOperation BOp, bool InitializeToIdentity)
Constructs reduction_impl with an explicit identity value.
Definition: reduction.hpp:1127
sycl::_V1::detail::GreatestPowerOfTwo
static size_t GreatestPowerOfTwo(size_t N)
Computes the greatest power-of-two less than or equal to N.
Definition: reduction.hpp:1361
sycl::_V1::detail::IsKnownIdentityOp
std::bool_constant< IsZeroIdentityOp< T, BinaryOperation >::value||IsOneIdentityOp< T, BinaryOperation >::value||IsOnesIdentityOp< T, BinaryOperation >::value||IsMinimumIdentityOp< T, BinaryOperation >::value||IsMaximumIdentityOp< T, BinaryOperation >::value||IsFalseIdentityOp< T, BinaryOperation >::value||IsTrueIdentityOp< T, BinaryOperation >::value > IsKnownIdentityOp
Definition: known_identity.hpp:128
sycl::_V1::range< 1 >
sycl::_V1::detail::ReductionIdentityContainer< T, BinaryOperation, true, enable_if_t<!IsKnownIdentityOp< T, BinaryOperation >::value > >::ReductionIdentityContainer
ReductionIdentityContainer(const T &Identity)
Definition: reduction.hpp:408
sycl::_V1::detail::ReducerElement
Definition: reduction.hpp:151
sycl::_V1::access::placeholder
placeholder
Definition: access.hpp:45
sycl::_V1::detail::reduSaveFinalResultToUserMem
void reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu)
Copies the final reduction result kept in read-write accessor to user's USM memory.
Definition: reduction.hpp:1165
sycl::_V1::reducer< T, BinaryOperation, Dims, Extent, IdentityContainerT, View, std::enable_if_t< Dims==1 &&View==false &&!detail::IsKnownIdentityOp< T, BinaryOperation >::value > >::operator[]
reducer< T, BinaryOperation, Dims - 1, Extent, IdentityContainerT, true > operator[](size_t Index)
Definition: reduction.hpp:700
sycl::_V1::nd_item
Identifies an instance of the function object executing at each point in an nd_range.
Definition: helpers.hpp:33
std::get
constexpr tuple_element< I, tuple< Types... > >::type & get(sycl::detail::tuple< Types... > &Arg) noexcept
Definition: tuple.hpp:199
sycl::_V1::detail::ReductionIdentityContainer< T, BinaryOperation, true, enable_if_t<!IsKnownIdentityOp< T, BinaryOperation >::value > >::getIdentity
T getIdentity() const
Returns the identity value given by user.
Definition: reduction.hpp:411
sycl::_V1::detail::KernelMultipleWGTag
Definition: reduction.hpp:1192
sycl::_V1::detail::IsPlus
std::bool_constant< std::is_same_v< BinaryOperation, sycl::plus< T > >||std::is_same_v< BinaryOperation, sycl::plus< void > >> IsPlus
Definition: known_identity.hpp:24
sycl::_V1::detail::IsBitAND
std::bool_constant< std::is_same_v< BinaryOperation, sycl::bit_and< T > >||std::is_same_v< BinaryOperation, sycl::bit_and< void > >> IsBitAND
Definition: known_identity.hpp:44
sycl::_V1::handler::parallel_for
void parallel_for(range< 1 > NumWorkItems, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:1644
sycl::_V1::reducer< T, BinaryOperation, Dims, Extent, IdentityContainerT, View, std::enable_if_t< Dims==1 &&View==false &&detail::IsKnownIdentityOp< T, BinaryOperation >::value > >::reducer
reducer()
Definition: reduction.hpp:740
kernel.hpp
sycl::_V1::detail::ReducerTraits< reducer< T, BinaryOperation, Dims, Extent, IdentityContainerT, View, Subst > >::op
BinaryOperation op
Definition: reduction.hpp:162
sycl::_V1::detail::IsMultiplies
std::bool_constant< std::is_same_v< BinaryOperation, sycl::multiplies< T > >||std::is_same_v< BinaryOperation, sycl::multiplies< void > >> IsMultiplies
Definition: known_identity.hpp:29
sycl::_V1::detail::WorkSizeGuarantees::LessOrEqual
@ LessOrEqual
sycl::_V1::detail::getReducerAccess
auto getReducerAccess(ReducerT &Reducer)
Definition: reduction.hpp:202
sycl::_V1::detail::reducer_common::value_type
T value_type
Definition: reduction.hpp:502
sycl::_V1::detail::NDRangeReduction< reduction::strategy::group_reduce_and_multiple_kernels >::run
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
Definition: reduction.hpp:1661
sycl::_V1::detail::combiner::atomic_combine
std::enable_if_t< BasicCheck< _T, Space, _BinaryOperation > &&(IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value||IsReduOptForAtomic64Op< _T, _BinaryOperation >::value) &&IsPlus< _T, _BinaryOperation >::value > atomic_combine(_T *ReduVarPtr) const
Atomic ADD operation: *ReduVarPtr += MValue;.
Definition: reduction.hpp:309
group_algorithm.hpp
sycl::_V1::detail::get_red_t
Definition: reduction.hpp:788
sycl::_V1::queue
Encapsulates a single SYCL queue which schedules kernels on a SYCL device.
Definition: queue.hpp:89
sycl::_V1::ext::oneapi::experimental::operator=
annotated_arg & operator=(annotated_arg &)=default
sycl::_V1::detail::reduction::InitMemKrn
Definition: reduction.hpp:801
sycl::_V1::detail::IsScalarReduction
Definition: reduction.hpp:2131
sycl::_V1::reduction
auto reduction(span< T, Extent > Span, const T &Identity, BinaryOperation Combiner, const property_list &PropList={})
Constructs a reduction object using the reduction variable referenced by the given sycl::span Span,...
Definition: reduction.hpp:2880
sycl::_V1::detail::IsArrayReduction
Definition: reduction.hpp:2138
sycl::_V1::detail::tuple_select_elements
std::tuple< std::tuple_element_t< Is, TupleT >... > tuple_select_elements(TupleT Tuple, std::index_sequence< Is... >)
Utility function: for the given tuple.
Definition: reduction.hpp:2567
sycl::_V1::reducer< T, BinaryOperation, Dims, Extent, IdentityContainerT, View, std::enable_if_t< Dims==0 &&Extent==1 &&View==false &&detail::IsKnownIdentityOp< T, BinaryOperation >::value > >::reducer
reducer()
Definition: reduction.hpp:595
sycl::_V1::detail::doTreeReductionOnTuple
void doTreeReductionOnTuple(size_t WorkSize, size_t LID, ReduTupleT< LocalAccT... > &LocalAccs, ReduTupleT< BOPsT... > &BOPs, std::index_sequence< Is... >)
Definition: reduction.hpp:1441
sycl::_V1::handler
Command group handler class.
Definition: handler.hpp:325
sycl::_V1::detail::reduction::withAuxHandler
void withAuxHandler(handler &CGH, FunctorTy Func)
Definition: reduction.hpp:1148
sycl::_V1::detail::makeAdjustedBOP
constexpr auto makeAdjustedBOP(BOPT &BOP)
Definition: reduction.hpp:2146
sycl::_V1::detail::ReducerElement::operator*
constexpr T & operator*() noexcept
Definition: reduction.hpp:477
sycl::_V1::detail::reduction::MainKrn
Definition: reduction.hpp:1186
sycl::_V1::detail::reduction_impl::reduction_impl
reduction_impl(RedOutVar Var, BinaryOperation BOp, bool InitializeToIdentity=false)
Constructs reduction_impl when no identity is specified.
Definition: reduction.hpp:1107
reduction_forward.hpp
sycl::_V1::detail::ReducerElement::ReducerElement
ReducerElement(T Value)
Definition: reduction.hpp:450
sycl::_V1::nd_range
Defines the iteration domain of both the work-groups and the overall dispatch.
Definition: uniform.hpp:36
sycl::_V1::detail::reduction_impl_algo::initializeToIdentity
bool initializeToIdentity() const
Definition: reduction.hpp:1057
sycl::_V1::detail::reduction_impl_algo::getGroupsCounterAccDiscrete
auto getGroupsCounterAccDiscrete(handler &CGH)
Definition: reduction.hpp:1041
sycl::_V1::device
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Definition: device.hpp:49
sycl::_V1::detail::ReducerAccess::ReducerAccess
ReducerAccess(ReducerT &ReducerRef)
Definition: reduction.hpp:172
sycl::_V1::detail::reduCGFuncMulti
void reduCGFuncMulti(handler &CGH, KernelType KernelFunc, const nd_range< Dims > &Range, PropertiesT Properties, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... > ReduIndices)
Definition: reduction.hpp:2273
sycl::_V1::detail::reduAuxCGFunc
size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... > ReduIndices)
Definition: reduction.hpp:2473
sycl::_V1::read_write
constexpr mode_tag_t< access_mode::read_write > read_write
Definition: access.hpp:76
sycl::_V1::detail::FilterElement::type
std::conditional_t< Cond, std::index_sequence< I >, std::index_sequence<> > type
Definition: reduction.hpp:2101
sycl::_V1::detail::reduction_impl_algo::getBinaryOperation
BinaryOperation getBinaryOperation() const
Returns the binary operation associated with the reduction.
Definition: reduction.hpp:1056
sycl::_V1::detail::reduction_impl_algo::binary_operation
BinaryOperation binary_operation
Definition: reduction.hpp:852
sycl::_V1::detail::reduction_impl_algo::getUserRedVarAccess
auto getUserRedVarAccess(handler &CGH)
Definition: reduction.hpp:1059
sycl::_V1::reducer< T, BinaryOperation, Dims, Extent, IdentityContainerT, View, std::enable_if_t< Dims==0 &&Extent==1 &&View==false &&detail::IsKnownIdentityOp< T, BinaryOperation >::value > >::combine
reducer & combine(const T &Partial)
Definition: reduction.hpp:607
sycl::_V1::nd_range::get_local_range
range< dimensions > get_local_range() const
Definition: nd_range.hpp:42
sycl::_V1::detail::IsMaximum
std::bool_constant< std::is_same_v< BinaryOperation, sycl::maximum< T > >||std::is_same_v< BinaryOperation, sycl::maximum< void > >> IsMaximum
Definition: known_identity.hpp:39
sycl::_V1::accessor
Definition: accessor.hpp:225
sycl::_V1::nd_item::get_local_linear_id
size_t get_local_linear_id() const
Definition: nd_item.hpp:66
sycl::_V1::reducer< T, BinaryOperation, Dims, Extent, IdentityContainerT, View, std::enable_if_t< Dims==0 &&View==true > >::reducer
reducer(element_type &Ref, BinaryOperation BOp)
Definition: reduction.hpp:642
accessor.hpp
sycl::_V1::detail::ReductionIdentityContainer< T, BinaryOperation, ExplicitIdentity, enable_if_t< IsKnownIdentityOp< T, BinaryOperation >::value > >::ReductionIdentityContainer
ReductionIdentityContainer()
Definition: reduction.hpp:392
sycl::_V1::detail::NDRangeReduction< reduction::strategy::basic >::run
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
Definition: reduction.hpp:1799
sycl::_V1::span::data
constexpr _SYCL_SPAN_INLINE_VISIBILITY pointer data() const noexcept
Definition: sycl_span.hpp:377
KernelFunc
std::function< void(const sycl::nd_item< NDims > &)> KernelFunc
Definition: pi_esimd_emulator.cpp:192
sycl::_V1::detail::IsNonUsmReductionPredicate
Definition: reduction.hpp:2087
sycl::_V1::ext::oneapi::experimental::reduce_over_group
std::enable_if_t<(is_group_helper_v< GroupHelper >), T > reduce_over_group(GroupHelper group_helper, T x, BinaryOperation binary_op)
Definition: user_defined_reductions.hpp:48
sycl::_V1::nd_item::get_group_linear_id
size_t __SYCL_ALWAYS_INLINE get_group_linear_id() const
Definition: nd_item.hpp:82
sycl::_V1::detail::doTreeReduction
void doTreeReduction(size_t WorkSize, nd_item< Dim > NDIt, LocalRedsTy &LocalReds, BinOpTy &BOp, AccessFuncTy AccessFunc)
Definition: reduction.hpp:1406
sycl::_V1::detail::IsReduOptForAtomic64Op
std::bool_constant<(IsPlus< T, BinaryOperation >::value||IsMinimum< T, BinaryOperation >::value||IsMaximum< T, BinaryOperation >::value) &&is_sgenfloat< T >::value &&sizeof(T)==8 > IsReduOptForAtomic64Op
Definition: reduction.hpp:113
sycl::_V1::detail::reduction_impl_algo::getReadAccToPreviousPartialReds
auto getReadAccToPreviousPartialReds(handler &CGH) const
Definition: reduction.hpp:890
sycl::_V1::reducer< T, BinaryOperation, Dims, Extent, IdentityContainerT, View, std::enable_if_t< Dims==0 &&Extent==1 &&View==false &&detail::IsKnownIdentityOp< T, BinaryOperation >::value > >::reducer
reducer(const IdentityContainerT &, BinaryOperation)
Definition: reduction.hpp:596
sycl::_V1::reducer< T, BinaryOperation, Dims, Extent, IdentityContainerT, View, std::enable_if_t< Dims==1 &&View==false &&detail::IsKnownIdentityOp< T, BinaryOperation >::value > >::reducer
reducer(const detail::ReducerToken< BinaryOperation, IdentityContainerT > &Token)
Definition: reduction.hpp:743
handler.hpp
sycl::_V1::device::get_info
detail::is_device_info_desc< Param >::return_type get_info() const
Queries this SYCL device for information requested by the template parameter param.
Definition: device.cpp:136
sycl::_V1::detail::EmptyReductionPredicate::Func
Definition: reduction.hpp:2094
sycl::_V1::local_accessor
Definition: multi_ptr.hpp:68
std
Definition: accessor.hpp:3922
sycl::_V1::reducer< T, BinaryOperation, Dims, Extent, IdentityContainerT, View, std::enable_if_t< Dims==0 &&Extent==1 &&View==false &&!detail::IsKnownIdentityOp< T, BinaryOperation >::value > >::reducer
reducer(const detail::ReducerToken< BinaryOperation, IdentityContainerT > &Token)
Definition: reduction.hpp:542
sycl::_V1::detail::getDeviceFromHandler
device getDeviceFromHandler(handler &CommandGroupHandlerRef)
Definition: accessor.cpp:15
sycl::_V1::detail::tuple_element_t
typename tuple_element< I, T >::type tuple_element_t
Definition: tuple.hpp:56
sycl::_V1::detail::is_contained
Definition: type_list.hpp:55
sycl::_V1::detail::IsArrayReduction::Func
Definition: reduction.hpp:2139
sycl::_V1::detail::AreAllButLastReductions
Predicate returning true if all template type parameters except the last one are reductions.
Definition: reduction.hpp:43
sycl::_V1::reducer< T, BinaryOperation, Dims, Extent, IdentityContainerT, View, std::enable_if_t< Dims==0 &&Extent==1 &&View==false &&detail::IsKnownIdentityOp< T, BinaryOperation >::value > >::identity
T identity() const
Definition: reduction.hpp:613
sycl::_V1::detail::combiner::atomic_combine
std::enable_if_t< BasicCheck< _T, Space, _BinaryOperation > &&IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value &&IsBitOR< _T, _BinaryOperation >::value > atomic_combine(_T *ReduVarPtr) const
Atomic BITWISE OR operation: *ReduVarPtr |= MValue;.
Definition: reduction.hpp:320
sycl::_V1::detail::combiner::atomic_combine
std::enable_if_t< BasicCheck< _T, Space, _BinaryOperation > &&IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value &&IsBitXOR< _T, _BinaryOperation >::value > atomic_combine(_T *ReduVarPtr) const
Atomic BITWISE XOR operation: *ReduVarPtr ^= MValue;.
Definition: reduction.hpp:331
sycl::_V1::detail::reduAuxCGFuncImplScalar
void reduAuxCGFuncImplScalar(nd_item< Dims > NDIt, size_t LID, size_t GID, size_t RemainingWorkSize, ReduTupleT< LocalAccT... > LocalAccsTuple, ReduTupleT< InAccT... > InAccsTuple, ReduTupleT< OutAccT... > OutAccsTuple, ReduTupleT< Ts... > IdentitiesTuple, ReduTupleT< BOPsT... > BOPsTuple, std::array< bool, sizeof...(Reductions)> InitToIdentityProps, std::index_sequence< Is... > ReduIndices)
All scalar reductions are processed together; there is one loop of log2(N) steps, and each reduction ...
Definition: reduction.hpp:2373
sycl::_V1::detail::reduction_impl_algo::getIdentityContainer
const identity_container_type & getIdentityContainer()
Definition: reduction.hpp:1024
sycl::_V1::detail::getDelinearizedId
id< 1 > getDelinearizedId(const range< 1 > &, size_t Index)
Definition: id.hpp:323
sycl::_V1::detail::reduction_impl_algo::getReadWriteAccessorToInitializedGroupsCounter
accessor< int, 1, access::mode::read_write, access::target::device, access::placeholder::false_t > getReadWriteAccessorToInitializedGroupsCounter(handler &CGH)
Definition: reduction.hpp:1030
sycl::_V1::detail::WorkSizeGuarantees::None
@ None
sycl::_V1::detail::get_red_t< buffer< T, Dims, AllocatorT > >::type
T type
Definition: reduction.hpp:795
sycl::_V1::detail::reduction::getDelinearizedItem
item< Dims, false > getDelinearizedItem(range< Dims > Range, id< Dims > Id)
Definition: reduction_forward.hpp:47
sycl::_V1::detail::reduction_impl_base
Base non-template class which is a base class for all reduction implementation classes.
Definition: reduction.hpp:33
sycl::_V1::detail::reduction_impl_algo::getTempBuffer
auto & getTempBuffer(size_t Size, handler &CGH)
Definition: reduction.hpp:909
sycl::_V1::detail::reduAuxCGFuncImplArrayHelper
void reduAuxCGFuncImplArrayHelper(nd_item< Dims > NDIt, size_t LID, size_t GID, size_t RemainingWorkSize, LocalAccT LocalReds, InAccT In, OutAccT Out, T IdentityContainer, BOPT BOp, bool IsInitializeToIdentity)
Definition: reduction.hpp:2403
sycl::_V1::reducer< T, BinaryOperation, Dims, Extent, IdentityContainerT, View, std::enable_if_t< Dims==1 &&View==false &&detail::IsKnownIdentityOp< T, BinaryOperation >::value > >::operator[]
reducer< T, BinaryOperation, Dims - 1, Extent, IdentityContainerT, true > operator[](size_t Index)
Definition: reduction.hpp:755
sycl::_V1::detail::ReductionIdentityContainer< T, BinaryOperation, ExplicitIdentity, enable_if_t< IsKnownIdentityOp< T, BinaryOperation >::value > >::ReductionIdentityContainer
ReductionIdentityContainer(const T &)
Definition: reduction.hpp:391
sycl::_V1::detail::ReducerToken
Definition: reduction.hpp:509
sycl::_V1::detail::filterSequence
constexpr auto filterSequence(FunctorT F, std::index_sequence< Is... > Indices)
For each index 'I' from the given indices pack 'Is' this function returns an index sequence consistin...
Definition: reduction.hpp:2127
sycl::_V1::detail::ReducerElement::combine
ReducerElement & combine(BinaryOperation BinOp, const ReducerElement &Other)
Definition: reduction.hpp:466
sycl::_V1::detail::usm::free
void free(void *Ptr, const context &Ctxt, const code_location &CL)
Definition: usm_impl.cpp:277
sycl::_V1::reducer< T, BinaryOperation, Dims, Extent, IdentityContainerT, View, std::enable_if_t< Dims==0 &&Extent==1 &&View==false &&!detail::IsKnownIdentityOp< T, BinaryOperation >::value > >::identity
std::enable_if_t< HasIdentityRelay &&(HasIdentityRelay==has_identity), T > identity() const
Definition: reduction.hpp:558
sycl::_V1::detail::NDRangeReduction< reduction::strategy::multi >::run
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, RestT... Rest)
Definition: reduction.hpp:2574
sycl::_V1::detail::NDRangeReduction< reduction::strategy::auto_select >::run
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
Definition: reduction.hpp:2618
sycl::_V1::reducer< T, BinaryOperation, Dims, Extent, IdentityContainerT, View, std::enable_if_t< Dims==0 &&Extent==1 &&View==false &&!detail::IsKnownIdentityOp< T, BinaryOperation >::value > >::combine
reducer & combine(const T &Partial)
Definition: reduction.hpp:551
sycl::_V1::detail::getLastCombine
auto getLastCombine(OutAccT OutAcc, LocalAccT LocalAcc, BOPT BOP, IdentityContainerT IdentityContainer, bool IsInitializeToIdentity)
Definition: reduction.hpp:2026
sycl::_V1::detail::IsReduOptForFastAtomicFetch
std::bool_constant<((is_sgenfloat< T >::value &&sizeof(T)==4)||is_sgeninteger< T >::value) &&IsValidAtomicType< T >::value &&(IsPlus< T, BinaryOperation >::value||IsMinimum< T, BinaryOperation >::value||IsMaximum< T, BinaryOperation >::value||IsBitOR< T, BinaryOperation >::value||IsBitXOR< T, BinaryOperation >::value||IsBitAND< T, BinaryOperation >::value)> IsReduOptForFastAtomicFetch
Definition: reduction.hpp:94
sycl::_V1::queue::get_device
device get_device() const
Definition: queue.cpp:77
sycl::_V1::detail::reduGetMaxWGSize
size_t reduGetMaxWGSize(std::shared_ptr< queue_impl > Queue, size_t LocalMemBytesPerWorkItem)
sycl::_V1::detail::ReducerToken::BOp
const BinaryOperation BOp
Definition: reduction.hpp:511
sycl::_V1::detail::IsBitOR
std::bool_constant< std::is_same_v< BinaryOperation, sycl::bit_or< T > >||std::is_same_v< BinaryOperation, sycl::bit_or< void > >> IsBitOR
Definition: known_identity.hpp:49
sycl::_V1::reducer< T, BinaryOperation, Dims, Extent, IdentityContainerT, View, std::enable_if_t< Dims==0 &&Extent==1 &&View==false &&detail::IsKnownIdentityOp< T, BinaryOperation >::value > >::reducer
reducer(const detail::ReducerToken< BinaryOperation, IdentityContainerT > &Token)
Definition: reduction.hpp:598
sycl::_V1::detail::reduction_impl_algo::reduction_impl_algo
reduction_impl_algo(BinaryOperation BinaryOp, bool Init, RedOutVar RedOut, std::enable_if_t<!IsKnownIdentityOp< RelayT, RelayBinaryOperation >::value, int >=0)
Definition: reduction.hpp:883
sycl::_V1::detail::InitializeToIdentity
@ InitializeToIdentity
Definition: property_helper.hpp:34
sycl::_V1::detail::IsReduOptForFastReduce
std::bool_constant<((is_sgeninteger< T >::value &&(sizeof(T)==4||sizeof(T)==8))||is_sgenfloat< T >::value) &&(IsPlus< T, BinaryOperation >::value||IsMinimum< T, BinaryOperation >::value||IsMaximum< T, BinaryOperation >::value)> IsReduOptForFastReduce
Definition: reduction.hpp:131
sycl::_V1::handler::depends_on
void depends_on(event Event)
Registers event dependencies on this command group.
Definition: handler.cpp:784
sycl::_V1::detail::reduction_parallel_for
void reduction_parallel_for(handler &CGH, range< Dims > Range, PropertiesT Properties, RestT... Rest)
Definition: reduction.hpp:2686
sycl::_V1::detail::WorkSizeGuarantees::Equal
@ Equal
known_identity.hpp
sycl::_V1::detail::reduction::strategy
strategy
Definition: reduction_forward.hpp:25
sycl::_V1::detail::NDRangeReduction< reduction::strategy::auto_select >::run
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, RestT... Rest)
Definition: reduction.hpp:2665
sycl::_V1::Space
Space
Definition: multi_ptr.hpp:1307
atomic.hpp
sycl::_V1::detail::NDRangeReduction
Definition: reduction.hpp:1204
sycl::_V1::detail::reduction_impl_algo::reduction_impl_algo
reduction_impl_algo(const T &Identity, BinaryOperation BinaryOp, bool Init, RedOutVar RedOut)
Definition: reduction.hpp:866
sycl::_V1::reducer< T, BinaryOperation, Dims, Extent, IdentityContainerT, View, std::enable_if_t< Dims==1 &&View==false &&!detail::IsKnownIdentityOp< T, BinaryOperation >::value > >::reducer
reducer(const IdentityContainerT &IdentityContainer, BinaryOperation BOp)
Definition: reduction.hpp:687
sycl::_V1::detail::IsNonUsmReductionPredicate::Func
Definition: reduction.hpp:2088
sycl::_V1::detail::ReducerAccess::getIdentityStatic
static constexpr auto getIdentityStatic()
Definition: reduction.hpp:188
sycl::_V1::detail::make_reduction
auto make_reduction(RedOutVar RedVar, RestTy &&...Rest)
Definition: reduction.hpp:1140
sycl::_V1::detail::reduction_impl_algo::withInitializedMem
std::enable_if_t<!HasIdentity > withInitializedMem(handler &CGH, FuncTy Func)
Definition: reduction.hpp:1017
sycl::_V1::detail::reduction_impl_algo::reduction_impl_algo
reduction_impl_algo(BinaryOperation BinaryOp, bool Init, RedOutVar RedOut, std::enable_if_t< IsKnownIdentityOp< RelayT, RelayBinaryOperation >::value, int >=0)
Definition: reduction.hpp:873
sycl::_V1::access::address_space
address_space
Definition: access.hpp:47