DPC++ Runtime
Runtime libraries for oneAPI DPC++
reduction.hpp
//==---------------- reduction.hpp - SYCL reduction ------------*- C++ -*---==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#pragma once

#include <sycl/access/access.hpp>                  // for address_s...
#include <sycl/accessor.hpp>                       // for local_acc...
#include <sycl/aspects.hpp>                        // for aspect
#include <sycl/atomic.hpp>                         // for IsValidAt...
#include <sycl/atomic_ref.hpp>                     // for atomic_ref
#include <sycl/buffer.hpp>                         // for buffer
#include <sycl/builtins.hpp>                       // for min
#include <sycl/detail/export.hpp>                  // for __SYCL_EX...
#include <sycl/detail/generic_type_traits.hpp>     // for is_sgenfloat
#include <sycl/detail/impl_utils.hpp>              // for createSyc...
#include <sycl/detail/item_base.hpp>               // for id
#include <sycl/detail/reduction_forward.hpp>       // for strategy
#include <sycl/detail/tuple.hpp>                   // for make_tuple
#include <sycl/device.hpp>                         // for device
#include <sycl/event.hpp>                          // for event
#include <sycl/exception.hpp>                      // for make_erro...
#include <sycl/exception_list.hpp>                 // for queue_impl
#include <sycl/group.hpp>                          // for workGroup...
#include <sycl/group_algorithm.hpp>                // for reduce_ov...
#include <sycl/handler.hpp>                        // for handler
#include <sycl/id.hpp>                             // for getDeline...
#include <sycl/kernel.hpp>                         // for auto_name
#include <sycl/known_identity.hpp>                 // for IsKnownId...
#include <sycl/marray.hpp>                         // for marray
#include <sycl/memory_enums.hpp>                   // for memory_order
#include <sycl/multi_ptr.hpp>                      // for address_s...
#include <sycl/nd_item.hpp>                        // for nd_item
#include <sycl/nd_range.hpp>                       // for nd_range
#include <sycl/properties/accessor_properties.hpp> // for no_init
#include <sycl/properties/reduction_properties.hpp> // for initializ...
#include <sycl/property_list.hpp>                  // for property_...
#include <sycl/queue.hpp>                          // for queue
#include <sycl/range.hpp>                          // for range
#include <sycl/sycl_span.hpp>                      // for dynamic_e...
#include <sycl/usm.hpp>                            // for malloc_de...

#include <algorithm>   // for min
#include <array>       // for array
#include <assert.h>    // for assert
#include <cstddef>     // for size_t
#include <memory>      // for shared_ptr
#include <optional>    // for nullopt
#include <stdint.h>    // for uint32_t
#include <string>      // for operator+
#include <tuple>       // for _Swallow_...
#include <type_traits> // for enable_if_t
#include <utility>     // for index_seq...
#include <variant>     // for tuple

namespace sycl {
inline namespace _V1 {

template <typename T, class BinaryOperation, int Dims, size_t Extent,
          typename IdentityContainerT, bool View = false, typename Subst = void>
class reducer;

namespace detail {
// This type trait is used to detect if the atomic operation BinaryOperation
// used with operands of the type T is available for use in reduction.
// The order in which the atomic operations are performed may be arbitrary and
// thus may cause different results from run to run, even on the same elements
// and on the same device. The macro SYCL_REDUCTION_DETERMINISTIC prohibits
// using atomic operations for reduction and helps to produce stable results.
// SYCL_REDUCTION_DETERMINISTIC is a short-term solution which may eventually
// be deprecated and replaced by a SYCL property passed to the reduction.
template <typename T, class BinaryOperation>
using IsReduOptForFastAtomicFetch =
#ifdef SYCL_REDUCTION_DETERMINISTIC
    std::bool_constant<false>;
#else
    std::bool_constant<((is_sgenfloat_v<T> && sizeof(T) == 4) ||
                        is_sgeninteger_v<T>) &&
                       IsValidAtomicType<T>::value &&
                       (IsPlus<T, BinaryOperation>::value ||
                        IsMinimum<T, BinaryOperation>::value ||
                        IsMaximum<T, BinaryOperation>::value ||
                        IsBitOR<T, BinaryOperation>::value ||
                        IsBitXOR<T, BinaryOperation>::value ||
                        IsBitAND<T, BinaryOperation>::value)>;
#endif
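
// Illustrative sketch (not part of this header): defining the macro before
// any SYCL include disables the atomic and group-reduce fast paths selected
// by the traits below, forcing the deterministic tree-reduction code so that
// results are reproducible from run to run:
//
//   #define SYCL_REDUCTION_DETERMINISTIC
//   #include <sycl/sycl.hpp>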

// This type trait is used to detect if the atomic operation BinaryOperation
// used with operands of the type T is available for use in reduction, in
// addition to the cases covered by "IsReduOptForFastAtomicFetch", if the
// device has the atomic64 aspect. This type trait should only be used if the
// device has the atomic64 aspect. Note that this type trait is currently a
// subset of IsReduOptForFastReduce. The macro SYCL_REDUCTION_DETERMINISTIC
// prohibits using the reduce_over_group() algorithm so that results are
// stable across devices of the same type.
template <typename T, class BinaryOperation>
using IsReduOptForAtomic64Op =
#ifdef SYCL_REDUCTION_DETERMINISTIC
    std::bool_constant<false>;
#else
    std::bool_constant<(IsPlus<T, BinaryOperation>::value ||
                        IsMinimum<T, BinaryOperation>::value ||
                        IsMaximum<T, BinaryOperation>::value) &&
                       is_sgenfloat_v<T> && sizeof(T) == 8>;
#endif

// This type trait is used to detect if the group algorithm reduce() used with
// operands of the type T and the operation BinaryOperation is available
// for use in reduction.
// The macro SYCL_REDUCTION_DETERMINISTIC prohibits using the reduce()
// algorithm so that results are stable across devices of the same type.
template <typename T, class BinaryOperation>
using IsReduOptForFastReduce =
#ifdef SYCL_REDUCTION_DETERMINISTIC
    std::bool_constant<false>;
#else
    std::bool_constant<((is_sgeninteger_v<T> &&
                         (sizeof(T) == 4 || sizeof(T) == 8)) ||
                        is_sgenfloat_v<T>) &&
                       (IsPlus<T, BinaryOperation>::value ||
                        IsMinimum<T, BinaryOperation>::value ||
                        IsMaximum<T, BinaryOperation>::value)>;
#endif

// std::tuple seems to be a) too heavy and b) not copyable to device right now.
// Thus sycl::detail::tuple is used instead.
// Switching from sycl::detail::tuple to std::tuple can be done by re-defining
// the ReduTupleT type and the makeReduTupleT() function below.
template <typename... Ts> using ReduTupleT = sycl::detail::tuple<Ts...>;
template <typename... Ts> ReduTupleT<Ts...> makeReduTupleT(Ts... Elements) {
  return sycl::detail::make_tuple(Elements...);
}
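
// For example, makeReduTupleT(size_t{0}, 1.0f) yields
// ReduTupleT<size_t, float>, i.e. sycl::detail::tuple<size_t, float>.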

__SYCL_EXPORT size_t reduGetMaxWGSize(std::shared_ptr<queue_impl> Queue,
                                      size_t LocalMemBytesPerWorkItem);
__SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
                                       size_t &NWorkGroups);
__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
                                            size_t LocalMemBytesPerWorkItem);

template <typename T, class BinaryOperation, bool IsOptional>
class ReducerElement;

template <typename Reducer> struct ReducerTraits;

template <typename T, class BinaryOperation, int Dims, std::size_t Extent,
          typename IdentityContainerT, bool View, typename Subst>
struct ReducerTraits<reducer<T, BinaryOperation, Dims, Extent,
                             IdentityContainerT, View, Subst>> {
  using type = T;
  using op = BinaryOperation;
  static constexpr int dims = Dims;
  static constexpr size_t extent = Extent;
  static constexpr bool has_identity = IdentityContainerT::has_identity;
  using element_type = ReducerElement<T, BinaryOperation, !has_identity>;
};

template <typename ReducerT> class ReducerAccess {
public:
  ReducerAccess(ReducerT &ReducerRef) : MReducerRef(ReducerRef) {}

  template <typename ReducerRelayT = ReducerT> auto &getElement(size_t E) {
    return MReducerRef.getElement(E);
  }

  template <typename ReducerRelayT = ReducerT> constexpr auto getIdentity() {
    static_assert(ReducerTraits<ReducerRelayT>::has_identity,
                  "Identity unavailable.");
    return MReducerRef.getIdentity();
  }

  // MSVC does not like static overloads of non-static functions, even if they
  // are made mutually exclusive through SFINAE. Instead we use a new static
  // function to be used when a static function is needed.
  template <typename ReducerRelayT = ReducerT>
  static constexpr auto getIdentityStatic() {
    static_assert(
        IsKnownIdentityOp<typename ReducerTraits<ReducerRelayT>::type,
                          typename ReducerTraits<ReducerRelayT>::op>::value,
        "Static identity unavailable.");
    return ReducerT::getIdentity();
  }

private:
  ReducerT &MReducerRef;
};

// Helper function to simplify the use of ReducerAccess. This avoids the need
// for potentially unsupported deduction guides.
template <typename ReducerT> auto getReducerAccess(ReducerT &Reducer) {
  return ReducerAccess<ReducerT>{Reducer};
}
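
// Minimal usage sketch (Reducer is a hypothetical reducer object):
//
//   auto &Elem = getReducerAccess(Reducer).getElement(0);
//
// which is equivalent to ReducerAccess<decltype(Reducer)>{Reducer} but needs
// no class template argument deduction.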

template <class Reducer> class combiner {
  using Ty = typename ReducerTraits<Reducer>::type;
  using BinaryOp = typename ReducerTraits<Reducer>::op;
  static constexpr int Dims = ReducerTraits<Reducer>::dims;
  static constexpr size_t Extent = ReducerTraits<Reducer>::extent;

public:
  template <typename _T = Ty, int _Dims = Dims>
  std::enable_if_t<(_Dims == 0) && IsPlus<_T, BinaryOp>::value &&
                       is_geninteger_v<_T>,
                   Reducer &>
  operator++() {
    return static_cast<Reducer *>(this)->combine(static_cast<_T>(1));
  }

  template <typename _T = Ty, int _Dims = Dims>
  std::enable_if_t<(_Dims == 0) && IsPlus<_T, BinaryOp>::value &&
                       is_geninteger_v<_T>,
                   Reducer &>
  operator++(int) {
    return static_cast<Reducer *>(this)->combine(static_cast<_T>(1));
  }

  template <typename _T = Ty, int _Dims = Dims>
  std::enable_if_t<(_Dims == 0) && IsPlus<_T, BinaryOp>::value, Reducer &>
  operator+=(const _T &Partial) {
    return static_cast<Reducer *>(this)->combine(Partial);
  }

  template <typename _T = Ty, int _Dims = Dims>
  std::enable_if_t<(_Dims == 0) && IsMultiplies<_T, BinaryOp>::value, Reducer &>
  operator*=(const _T &Partial) {
    return static_cast<Reducer *>(this)->combine(Partial);
  }

  template <typename _T = Ty, int _Dims = Dims>
  std::enable_if_t<(_Dims == 0) && IsBitOR<_T, BinaryOp>::value, Reducer &>
  operator|=(const _T &Partial) {
    return static_cast<Reducer *>(this)->combine(Partial);
  }

  template <typename _T = Ty, int _Dims = Dims>
  std::enable_if_t<(_Dims == 0) && IsBitXOR<_T, BinaryOp>::value, Reducer &>
  operator^=(const _T &Partial) {
    return static_cast<Reducer *>(this)->combine(Partial);
  }

  template <typename _T = Ty, int _Dims = Dims>
  std::enable_if_t<(_Dims == 0) && IsBitAND<_T, BinaryOp>::value, Reducer &>
  operator&=(const _T &Partial) {
    return static_cast<Reducer *>(this)->combine(Partial);
  }

private:
  template <access::address_space Space>
  static constexpr memory_scope getMemoryScope() {
    return Space == access::address_space::local_space
               ? memory_scope::work_group
               : memory_scope::device;
  }

  template <access::address_space Space, class T, class AtomicFunctor>
  void atomic_combine_impl(T *ReduVarPtr, AtomicFunctor Functor) const {
    auto reducer = static_cast<const Reducer *>(this);
    for (size_t E = 0; E < Extent; ++E) {
      const auto &ReducerElem = getReducerAccess(*reducer).getElement(E);

      // If the reducer element doesn't have a value we can skip the combine.
      if (!ReducerElem)
        continue;

      auto AtomicRef = sycl::atomic_ref<T, memory_order::relaxed,
                                        getMemoryScope<Space>(), Space>(
          address_space_cast<Space, access::decorated::no>(ReduVarPtr)[E]);
      Functor(std::move(AtomicRef), *ReducerElem);
    }
  }

  template <class _T, access::address_space Space, class BinaryOp>
  static constexpr bool BasicCheck =
      std::is_same_v<remove_decoration_t<_T>, Ty> &&
      (Space == access::address_space::global_space ||
       Space == access::address_space::local_space);

public:
  // Atomic ADD operation: *ReduVarPtr += MValue;
  template <access::address_space Space = access::address_space::global_space,
            typename _T = Ty, class _BinaryOperation = BinaryOp>
  std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
                   (IsReduOptForFastAtomicFetch<_T, _BinaryOperation>::value ||
                    IsReduOptForAtomic64Op<_T, _BinaryOperation>::value) &&
                   IsPlus<_T, _BinaryOperation>::value>
  atomic_combine(_T *ReduVarPtr) const {
    atomic_combine_impl<Space>(
        ReduVarPtr, [](auto &&Ref, auto Val) { return Ref.fetch_add(Val); });
  }

  // Atomic BITWISE OR operation: *ReduVarPtr |= MValue;
  template <access::address_space Space = access::address_space::global_space,
            typename _T = Ty, class _BinaryOperation = BinaryOp>
  std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
                   IsReduOptForFastAtomicFetch<_T, _BinaryOperation>::value &&
                   IsBitOR<_T, _BinaryOperation>::value>
  atomic_combine(_T *ReduVarPtr) const {
    atomic_combine_impl<Space>(
        ReduVarPtr, [](auto &&Ref, auto Val) { return Ref.fetch_or(Val); });
  }

  // Atomic BITWISE XOR operation: *ReduVarPtr ^= MValue;
  template <access::address_space Space = access::address_space::global_space,
            typename _T = Ty, class _BinaryOperation = BinaryOp>
  std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
                   IsReduOptForFastAtomicFetch<_T, _BinaryOperation>::value &&
                   IsBitXOR<_T, _BinaryOperation>::value>
  atomic_combine(_T *ReduVarPtr) const {
    atomic_combine_impl<Space>(
        ReduVarPtr, [](auto &&Ref, auto Val) { return Ref.fetch_xor(Val); });
  }

  // Atomic BITWISE AND operation: *ReduVarPtr &= MValue;
  template <access::address_space Space = access::address_space::global_space,
            typename _T = Ty, class _BinaryOperation = BinaryOp>
  std::enable_if_t<std::is_same_v<remove_decoration_t<_T>, _T> &&
                   IsReduOptForFastAtomicFetch<_T, _BinaryOperation>::value &&
                   IsBitAND<_T, _BinaryOperation>::value &&
                   (Space == access::address_space::global_space ||
                    Space == access::address_space::local_space)>
  atomic_combine(_T *ReduVarPtr) const {
    atomic_combine_impl<Space>(
        ReduVarPtr, [](auto &&Ref, auto Val) { return Ref.fetch_and(Val); });
  }

  // Atomic MIN operation: *ReduVarPtr = min(*ReduVarPtr, MValue);
  template <access::address_space Space = access::address_space::global_space,
            typename _T = Ty, class _BinaryOperation = BinaryOp>
  std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
                   (IsReduOptForFastAtomicFetch<_T, _BinaryOperation>::value ||
                    IsReduOptForAtomic64Op<_T, _BinaryOperation>::value) &&
                   IsMinimum<_T, _BinaryOperation>::value>
  atomic_combine(_T *ReduVarPtr) const {
    atomic_combine_impl<Space>(
        ReduVarPtr, [](auto &&Ref, auto Val) { return Ref.fetch_min(Val); });
  }

  // Atomic MAX operation: *ReduVarPtr = max(*ReduVarPtr, MValue);
  template <access::address_space Space = access::address_space::global_space,
            typename _T = Ty, class _BinaryOperation = BinaryOp>
  std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
                   (IsReduOptForFastAtomicFetch<_T, _BinaryOperation>::value ||
                    IsReduOptForAtomic64Op<_T, _BinaryOperation>::value) &&
                   IsMaximum<_T, _BinaryOperation>::value>
  atomic_combine(_T *ReduVarPtr) const {
    atomic_combine_impl<Space>(
        ReduVarPtr, [](auto &&Ref, auto Val) { return Ref.fetch_max(Val); });
  }
};
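
// Illustrative sketch of the shorthands this class gives user code (queue Q,
// extent N, and USM pointer Sum are hypothetical):
//
//   Q.submit([&](sycl::handler &CGH) {
//     auto Red = sycl::reduction(Sum, sycl::plus<int>());
//     CGH.parallel_for(sycl::range<1>{N}, Red,
//                      [=](sycl::id<1>, auto &Reducer) {
//       Reducer += 2; // operator+= forwards to Reducer.combine(2)
//       ++Reducer;    // operator++ combines 1 for integral sycl::plus
//     });
//   });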

template <typename T, class BinaryOperation, bool ExplicitIdentity,
          typename CondT = void>
class ReductionIdentityContainer {
  static_assert(!std::is_same_v<T, T>,
                "Partial specializations don't cover all possible options!");
};

// Specialization for reductions with a statically known identity.
template <typename T, class BinaryOperation, bool ExplicitIdentity>
class ReductionIdentityContainer<
    T, BinaryOperation, ExplicitIdentity,
    enable_if_t<IsKnownIdentityOp<T, BinaryOperation>::value>> {
public:
  static constexpr bool has_identity = true;

  ReductionIdentityContainer() = default;
  ReductionIdentityContainer(const T &) {}

  // Returns the statically known identity value.
  static constexpr T getIdentity() {
    return known_identity_impl<BinaryOperation, T>::value;
  }
};

// Specialization for reductions with an explicitly specified identity.
template <typename T, class BinaryOperation>
class ReductionIdentityContainer<
    T, BinaryOperation, true,
    enable_if_t<!IsKnownIdentityOp<T, BinaryOperation>::value>> {
public:
  static constexpr bool has_identity = true;

  ReductionIdentityContainer(const T &Identity) : MIdentity(Identity) {}

  // Returns the identity value given at construction time.
  T getIdentity() const { return MIdentity; }

private:
  // Identity provided by the user.
  const T MIdentity;
};

// Specialization for identityless reductions.
template <typename T, class BinaryOperation>
class ReductionIdentityContainer<
    T, BinaryOperation, false,
    std::enable_if_t<!IsKnownIdentityOp<T, BinaryOperation>::value>> {
public:
  static constexpr bool has_identity = false;
};

// Class representing a single reducer element. In cases where the identity is
// unknown this is allowed to not contain a value.
template <typename T, class BinaryOperation, bool IsOptional>
class ReducerElement {
  using value_type = std::conditional_t<IsOptional, std::optional<T>, T>;

  template <bool ExplicitIdentity>
  constexpr value_type GetInitValue(
      const ReductionIdentityContainer<T, BinaryOperation, ExplicitIdentity>
          &IdentityContainer) {
    constexpr bool ContainerHasIdentity =
        ReductionIdentityContainer<T, BinaryOperation,
                                   ExplicitIdentity>::has_identity;
    static_assert(IsOptional || ContainerHasIdentity);
    if constexpr (!ContainerHasIdentity)
      return std::nullopt;
    else
      return IdentityContainer.getIdentity();
  }

public:
  ReducerElement() = default;
  ReducerElement(T Value) : MValue{Value} {}

  template <bool ExplicitIdentity>
  ReducerElement(
      const ReductionIdentityContainer<T, BinaryOperation, ExplicitIdentity>
          &IdentityContainer)
      : MValue(GetInitValue(IdentityContainer)) {}

  ReducerElement &combine(BinaryOperation BinOp, const T &OtherValue) {
    if constexpr (IsOptional)
      MValue = MValue ? BinOp(*MValue, OtherValue) : OtherValue;
    else
      MValue = BinOp(MValue, OtherValue);
    return *this;
  }

  ReducerElement &combine(BinaryOperation BinOp, const ReducerElement &Other) {
    if constexpr (IsOptional) {
      if (Other.MValue)
        return combine(BinOp, *Other.MValue);
      // If the other element doesn't have a value, combining is a no-op.
      return *this;
    } else {
      return combine(BinOp, Other.MValue);
    }
  }

  constexpr T &operator*() noexcept {
    if constexpr (IsOptional)
      return *MValue;
    else
      return MValue;
  }
  constexpr const T &operator*() const noexcept {
    if constexpr (IsOptional)
      return *MValue;
    else
      return MValue;
  }

  constexpr explicit operator bool() const {
    if constexpr (IsOptional)
      return MValue.has_value();
    return true;
  }

private:
  value_type MValue;
};
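
// Behavioral sketch (illustrative, host-side): for an identityless reduction
// the element starts empty and the first combine stores the operand as-is.
//
//   ReducerElement<int, std::multiplies<int>, /*IsOptional=*/true> E;
//   assert(!E);                            // no value yet
//   E.combine(std::multiplies<int>{}, 5);  // stores 5
//   E.combine(std::multiplies<int>{}, 2);  // 5 * 2
//   assert(*E == 10);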
} // namespace detail

// We explicitly claim std::optional as device-copyable in sycl/types.hpp.
// However, that information isn't propagated to the struct/class types that
// use it as a member, so we need to provide this partial specialization as
// well. This is needed to support GNU STL, where
// std::is_trivially_copyable_v<std::optional> is false (e.g., GCC 7.5.*).
template <typename T, class BinaryOperation, bool IsOptional>
struct is_device_copyable<
    detail::ReducerElement<T, BinaryOperation, IsOptional>>
    : is_device_copyable<std::conditional_t<IsOptional, std::optional<T>, T>> {
};

namespace detail {
template <typename T, class BinaryOperation, int Dims> class reducer_common {
public:
  using value_type = T;
  using binary_operation = BinaryOperation;
  static constexpr int dimensions = Dims;
};

// Token class to help with the in-place construction of reducers.
template <class BinaryOperation, typename IdentityContainerT>
struct ReducerToken {
  const IdentityContainerT &IdentityContainer;
  const BinaryOperation BOp;
};

} // namespace detail

template <typename T, class BinaryOperation, int Dims, size_t Extent,
          typename IdentityContainerT, bool View>
class reducer<
    T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
    std::enable_if_t<Dims == 0 && Extent == 1 && View == false &&
                     !detail::IsKnownIdentityOp<T, BinaryOperation>::value>>
    : public detail::combiner<
          reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
                  std::enable_if_t<
                      Dims == 0 && Extent == 1 && View == false &&
                      !detail::IsKnownIdentityOp<T, BinaryOperation>::value>>>,
      public detail::reducer_common<T, BinaryOperation, Dims> {
  static constexpr bool has_identity = IdentityContainerT::has_identity;
  using element_type =
      detail::ReducerElement<T, BinaryOperation, !has_identity>;

public:
  reducer(const IdentityContainerT &IdentityContainer, BinaryOperation BOp)
      : MValue(IdentityContainer), MIdentity(IdentityContainer),
        MBinaryOp(BOp) {}
  reducer(const detail::ReducerToken<BinaryOperation, IdentityContainerT>
              &Token)
      : reducer(Token.IdentityContainer, Token.BOp) {}

  reducer(const reducer &) = delete;
  reducer(reducer &&) = delete;
  reducer &operator=(const reducer &) = delete;
  reducer &operator=(reducer &&) = delete;

  reducer &combine(const T &Partial) {
    MValue.combine(MBinaryOp, Partial);
    return *this;
  }

  template <bool HasIdentityRelay = has_identity>
  std::enable_if_t<HasIdentityRelay && (HasIdentityRelay == has_identity), T>
  identity() const {
    return MIdentity.getIdentity();
  }

private:
  template <typename ReducerT> friend class detail::ReducerAccess;

  element_type &getElement(size_t) { return MValue; }
  const element_type &getElement(size_t) const { return MValue; }

  detail::ReducerElement<T, BinaryOperation, !has_identity> MValue;
  const IdentityContainerT MIdentity;
  BinaryOperation MBinaryOp;
};

template <typename T, class BinaryOperation, int Dims, size_t Extent,
          typename IdentityContainerT, bool View>
class reducer<
    T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
    std::enable_if_t<Dims == 0 && Extent == 1 && View == false &&
                     detail::IsKnownIdentityOp<T, BinaryOperation>::value>>
    : public detail::combiner<
          reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
                  std::enable_if_t<
                      Dims == 0 && Extent == 1 && View == false &&
                      detail::IsKnownIdentityOp<T, BinaryOperation>::value>>>,
      public detail::reducer_common<T, BinaryOperation, Dims> {
  static constexpr bool has_identity = IdentityContainerT::has_identity;
  using element_type =
      detail::ReducerElement<T, BinaryOperation, !has_identity>;

public:
  reducer() : MValue(getIdentity()) {}
  reducer(const IdentityContainerT & /* Identity */, BinaryOperation)
      : MValue(getIdentity()) {}
  reducer(const detail::ReducerToken<BinaryOperation, IdentityContainerT>
              &Token)
      : reducer(Token.IdentityContainer, Token.BOp) {}

  reducer(const reducer &) = delete;
  reducer(reducer &&) = delete;
  reducer &operator=(const reducer &) = delete;
  reducer &operator=(reducer &&) = delete;

  reducer &combine(const T &Partial) {
    BinaryOperation BOp;
    MValue.combine(BOp, Partial);
    return *this;
  }

  T identity() const { return getIdentity(); }

private:
  template <typename ReducerT> friend class detail::ReducerAccess;

  static constexpr T getIdentity() {
    return detail::known_identity_impl<BinaryOperation, T>::value;
  }

  element_type &getElement(size_t) { return MValue; }
  const element_type &getElement(size_t) const { return MValue; }
  detail::ReducerElement<T, BinaryOperation, !has_identity> MValue;
};

template <typename T, class BinaryOperation, int Dims, size_t Extent,
          typename IdentityContainerT, bool View>
class reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
              std::enable_if_t<Dims == 0 && View == true>>
    : public detail::combiner<
          reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
                  std::enable_if_t<Dims == 0 && View == true>>>,
      public detail::reducer_common<T, BinaryOperation, Dims> {
  static constexpr bool has_identity = IdentityContainerT::has_identity;
  using element_type =
      detail::ReducerElement<T, BinaryOperation, !has_identity>;

public:
  reducer(element_type &Ref, BinaryOperation BOp)
      : MElement(Ref), MBinaryOp(BOp) {}
  reducer(const detail::ReducerToken<BinaryOperation, IdentityContainerT>
              &Token)
      : reducer(Token.IdentityContainer, Token.BOp) {}

  reducer(const reducer &) = delete;
  reducer(reducer &&) = delete;
  reducer &operator=(const reducer &) = delete;
  reducer &operator=(reducer &&) = delete;

  reducer &combine(const T &Partial) {
    MElement.combine(MBinaryOp, Partial);
    return *this;
  }

private:
  template <typename ReducerT> friend class detail::ReducerAccess;

  element_type &getElement(size_t) { return MElement; }
  const element_type &getElement(size_t) const { return MElement; }

  element_type &MElement;
  BinaryOperation MBinaryOp;
};

template <typename T, class BinaryOperation, int Dims, size_t Extent,
          typename IdentityContainerT, bool View>
class reducer<
    T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
    std::enable_if_t<Dims == 1 && View == false &&
                     !detail::IsKnownIdentityOp<T, BinaryOperation>::value>>
    : public detail::combiner<
          reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
                  std::enable_if_t<
                      Dims == 1 && View == false &&
                      !detail::IsKnownIdentityOp<T, BinaryOperation>::value>>>,
      public detail::reducer_common<T, BinaryOperation, Dims> {
  static constexpr bool has_identity = IdentityContainerT::has_identity;
  using element_type =
      detail::ReducerElement<T, BinaryOperation, !has_identity>;

public:
  reducer(const IdentityContainerT &IdentityContainer, BinaryOperation BOp)
      : MValue(IdentityContainer), MIdentity(IdentityContainer),
        MBinaryOp(BOp) {}
  reducer(const detail::ReducerToken<BinaryOperation, IdentityContainerT>
              &Token)
      : reducer(Token.IdentityContainer, Token.BOp) {}

  reducer(const reducer &) = delete;
  reducer(reducer &&) = delete;
  reducer &operator=(const reducer &) = delete;
  reducer &operator=(reducer &&) = delete;

  reducer<T, BinaryOperation, Dims - 1, Extent, IdentityContainerT, true>
  operator[](size_t Index) {
    return {MValue[Index], MBinaryOp};
  }

  template <bool HasIdentityRelay = has_identity>
  std::enable_if_t<HasIdentityRelay && (HasIdentityRelay == has_identity), T>
  identity() const {
    return MIdentity.getIdentity();
  }

private:
  template <typename ReducerT> friend class detail::ReducerAccess;

  element_type &getElement(size_t E) { return MValue[E]; }
  const element_type &getElement(size_t E) const { return MValue[E]; }

  marray<element_type, Extent> MValue;
  const IdentityContainerT MIdentity;
  BinaryOperation MBinaryOp;
};

template <typename T, class BinaryOperation, int Dims, size_t Extent,
          typename IdentityContainerT, bool View>
class reducer<
    T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
    std::enable_if_t<Dims == 1 && View == false &&
                     detail::IsKnownIdentityOp<T, BinaryOperation>::value>>
    : public detail::combiner<
          reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
                  std::enable_if_t<
                      Dims == 1 && View == false &&
                      detail::IsKnownIdentityOp<T, BinaryOperation>::value>>>,
      public detail::reducer_common<T, BinaryOperation, Dims> {
  static constexpr bool has_identity = IdentityContainerT::has_identity;
  using element_type =
      detail::ReducerElement<T, BinaryOperation, !has_identity>;

public:
  reducer() : MValue(getIdentity()) {}
  reducer(const IdentityContainerT & /* Identity */, BinaryOperation)
      : MValue(getIdentity()) {}
  reducer(const detail::ReducerToken<BinaryOperation, IdentityContainerT>
              &Token)
      : reducer(Token.IdentityContainer, Token.BOp) {}

  reducer(const reducer &) = delete;
  reducer(reducer &&) = delete;
  reducer &operator=(const reducer &) = delete;
  reducer &operator=(reducer &&) = delete;

  // SYCL 2020 revision 4 says this should be const, but this is a bug;
  // see https://github.com/KhronosGroup/SYCL-Docs/pull/252
  reducer<T, BinaryOperation, Dims - 1, Extent, IdentityContainerT, true>
  operator[](size_t Index) {
    return {MValue[Index], BinaryOperation()};
  }

  T identity() const { return getIdentity(); }

private:
  template <typename ReducerT> friend class detail::ReducerAccess;

  static constexpr T getIdentity() {
    return detail::known_identity_impl<BinaryOperation, T>::value;
  }

  element_type &getElement(size_t E) { return MValue[E]; }
  const element_type &getElement(size_t E) const { return MValue[E]; }

  marray<element_type, Extent> MValue;
};
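
// Illustrative sketch of the 1D (array/span) reducer interface in user code
// (Ptr and N are hypothetical):
//
//   sycl::span<int, 4> Span(Ptr, 4);
//   auto Red = sycl::reduction(Span, sycl::plus<int>());
//   CGH.parallel_for(sycl::range<1>{N}, Red,
//                    [=](sycl::id<1> I, auto &Reducer) {
//     Reducer[I % 4] += 1; // operator[] yields a (Dims - 1) "View" reducer
//   });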

namespace detail {

// Used for determining dimensions for temporary storage (mainly).
template <class T> struct data_dim_t {
  static constexpr int value = 1;
};

template <class T, int AccessorDims, access::mode Mode,
          access::placeholder IsPH, typename PropList>
struct data_dim_t<
    accessor<T, AccessorDims, Mode, access::target::device, IsPH, PropList>> {
  static constexpr int value = AccessorDims;
};

template <class T> struct get_red_t;
template <class T> struct get_red_t<T *> {
  using type = T;
};

template <class T, int Dims, typename AllocatorT>
struct get_red_t<buffer<T, Dims, AllocatorT>> {
  using type = T;
};

namespace reduction {
// Kernel name wrapper for initializing reduction-related memory through
// reduction_impl_algo::withInitializedMem.
template <typename KernelName> struct InitMemKrn;
} // namespace reduction

// A helper to pass undefined (sycl::detail::auto_name) kernel names
// unmodified when naming the memory-initialization kernel.
template <class KernelName>
using __sycl_init_mem_for =
    std::conditional_t<std::is_same_v<KernelName, auto_name>, auto_name,
                       reduction::InitMemKrn<KernelName>>;

template <typename T, class BinaryOperation, int Dims, size_t Extent,
          bool ExplicitIdentity, typename RedOutVar>
class reduction_impl_algo {
  using self = reduction_impl_algo<T, BinaryOperation, Dims, Extent,
                                   ExplicitIdentity, RedOutVar>;

  // TODO: Do we also need chooseBinOp?
  static constexpr T chooseIdentity(const T &Identity) {
    // For now the implementation ignores the identity value given by the user
    // when the implementation knows the identity.
    // The SPEC could prohibit passing an identity parameter to operations with
    // a known identity, but that could have some bad consequences too.
    // For example, at some moment the implementation may NOT know the identity
    // for a COMPLEX-PLUS reduction. A user may create a program that passes
    // the COMPLEX value (0,0) as the identity for PLUS. If, at some later
    // moment, the implementation starts handling COMPLEX-PLUS as a known
    // operation, the existing user program remains compilable and works
    // correctly. I.e. with this constructor here, adding more reduction
    // operations to the list of known operations does not break existing
    // programs.
    if constexpr (is_known_identity) {
      (void)Identity;
      return ReducerAccess<reducer_type>::getIdentityStatic();
    } else {
      return Identity;
    }
  }

public:
  static constexpr bool is_known_identity =
      IsKnownIdentityOp<T, BinaryOperation>::value;
  static constexpr bool has_identity = is_known_identity || ExplicitIdentity;

  using identity_container_type =
      ReductionIdentityContainer<T, BinaryOperation, ExplicitIdentity>;
  using reducer_type =
      reducer<T, BinaryOperation, Dims, Extent, identity_container_type>;
  using reducer_element_type =
      typename ReducerTraits<reducer_type>::element_type;
  using result_type = T;
  using binary_operation = BinaryOperation;

  static constexpr size_t dims = Dims;
  static constexpr bool has_float64_atomics =
      IsReduOptForAtomic64Op<T, BinaryOperation>::value;
  static constexpr bool has_fast_atomics =
      IsReduOptForFastAtomicFetch<T, BinaryOperation>::value;
  static constexpr bool has_fast_reduce =
      IsReduOptForFastReduce<T, BinaryOperation>::value;

  static constexpr bool is_usm = std::is_same_v<RedOutVar, T *>;

  static constexpr size_t num_elements = Extent;

  reduction_impl_algo(const T &Identity, BinaryOperation BinaryOp, bool Init,
                      RedOutVar RedOut)
      : MIdentityContainer(chooseIdentity(Identity)), MBinaryOp(BinaryOp),
        InitializeToIdentity(Init), MRedOut(std::move(RedOut)) {}

  template <typename RelayT = T,
            typename RelayBinaryOperation = BinaryOperation>
  reduction_impl_algo(
      BinaryOperation BinaryOp, bool Init, RedOutVar RedOut,
      std::enable_if_t<IsKnownIdentityOp<RelayT, RelayBinaryOperation>::value,
                       int> = 0)
      : MIdentityContainer(ReducerAccess<reducer_type>::getIdentityStatic()),
        MBinaryOp(BinaryOp), InitializeToIdentity(Init),
        MRedOut(std::move(RedOut)) {}

  template <typename RelayT = T,
            typename RelayBinaryOperation = BinaryOperation>
  reduction_impl_algo(
      BinaryOperation BinaryOp, bool Init, RedOutVar RedOut,
      std::enable_if_t<!IsKnownIdentityOp<RelayT, RelayBinaryOperation>::value,
                       int> = 0)
      : MIdentityContainer(), MBinaryOp(BinaryOp), InitializeToIdentity(Init),
        MRedOut(std::move(RedOut)) {}

  auto getReadAccToPreviousPartialReds(handler &CGH) const {
    CGH.addReduction(MOutBufPtr);
    return accessor{*MOutBufPtr, CGH, sycl::read_only};
  }

  template <bool IsOneWG>
  auto getWriteMemForPartialReds(size_t Size, handler &CGH) {
    // If there is only one work-group, we can avoid creating a temporary
    // buffer with partial sums and write directly into the user's reduction
    // variable.
    if constexpr (IsOneWG) {
      return getUserRedVarAccess(CGH);
    } else {
      MOutBufPtr =
          std::make_shared<buffer<reducer_element_type, 1>>(range<1>(Size));
      CGH.addReduction(MOutBufPtr);
      return accessor{*MOutBufPtr, CGH};
    }
  }

  template <class _T = T> auto &getTempBuffer(size_t Size, handler &CGH) {
    auto Buffer = std::make_shared<buffer<_T, 1>>(range<1>(Size));
    CGH.addReduction(Buffer);
    return *Buffer;
  }

  auto getWriteAccForPartialReds(size_t Size, handler &CGH) {
    static_assert(!has_identity || sizeof(reducer_element_type) == sizeof(T),
                  "Unexpected size of reducer element type.");

    // We can only use the output memory directly if it is not USM and we have
    // an identity, i.e. the reducer element is a thin wrapper over the type.
    if constexpr (!is_usm && has_identity) {
      if (Size == 1) {
        auto ReinterpretRedOut =
            MRedOut.template reinterpret<reducer_element_type>();
        return accessor{ReinterpretRedOut, CGH};
      }
    }

    // Create a new output buffer and return an accessor to it.
    //
    // Array reductions are performed element-wise to avoid stack growth.
    MOutBufPtr =
        std::make_shared<buffer<reducer_element_type, 1>>(range<1>(Size));
    CGH.addReduction(MOutBufPtr);
    return accessor{*MOutBufPtr, CGH};
  }

  // Provides Func with initialized memory and ensures the result ends up in
  // the user's reduction variable afterwards.
  //
  // This currently optimizes for the number of kernel instantiations instead
  // of runtime latency. That might change in the future.
  template <typename KernelName, typename FuncTy,
            bool HasIdentity = has_identity>
  std::enable_if_t<HasIdentity> withInitializedMem(handler &CGH, FuncTy Func) {
    // "Template" lambda to ensure that only one type of Func (USM/Buf) is
    // instantiated for the code below.
    auto DoIt = [&](auto &Out) {
      auto RWReduVal = std::make_shared<std::array<T, num_elements>>();
      for (size_t i = 0; i < num_elements; ++i) {
        (*RWReduVal)[i] = decltype(MIdentityContainer)::getIdentity();
      }
      auto Buf = std::make_shared<buffer<T, 1>>(RWReduVal.get()->data(),
                                                range<1>(num_elements));
      Buf->set_final_data();
      accessor Mem{*Buf, CGH};
      Func(Mem);

      reduction::withAuxHandler(CGH, [&](handler &CopyHandler) {
        // MSVC (19.32.31329) has problems compiling the line below when used
        // as a host compiler in c++17 mode (but not in c++latest):
        //   accessor Mem{*Buf, CopyHandler};
        // so use the old-style API.
        auto Mem =
            Buf->template get_access<access::mode::read_write>(CopyHandler);
        // Since this CG is dependent on the one associated with CGH,
        // registering the auxiliary resources here is enough.
        CopyHandler.addReduction(RWReduVal);
        CopyHandler.addReduction(Buf);
        if constexpr (is_usm) {
          // Can't capture the whole reduction; copy into distinct variables.
          bool IsUpdateOfUserVar = !initializeToIdentity();
          auto BOp = getBinaryOperation();

          // Don't use constexpr as non-default host compilers (unlike clang)
          // might actually create a capture resulting in binary differences
          // between host/device in lambda captures.
          size_t NElements = num_elements;

          CopyHandler.single_task<__sycl_init_mem_for<KernelName>>([=] {
            for (size_t i = 0; i < NElements; ++i) {
              if (IsUpdateOfUserVar)
                Out[i] = BOp(Out[i], Mem[i]);
              else
                Out[i] = Mem[i];
            }
          });
        } else {
          accessor OutAcc{Out, CGH};
          CopyHandler.copy(Mem, OutAcc);
        }
      });
    };
    if constexpr (is_usm) {
      // Don't dispatch based on initializeToIdentity() as that would lead
      // to two different instantiations of Func.
      DoIt(MRedOut);
    } else {
      if (initializeToIdentity())
        DoIt(MRedOut);
      else
        Func(accessor{MRedOut, CGH});
    }
  }

  // Overload of withInitializedMem for reducers without an identity.
  // Initializing to identity is not allowed in this case.
  template <typename KernelName, typename FuncTy,
            bool HasIdentity = has_identity>
  std::enable_if_t<!HasIdentity> withInitializedMem(handler &CGH, FuncTy Func) {
    std::ignore = CGH;
    assert(!initializeToIdentity() &&
           "Initialize to identity not allowed for identity-less reductions.");
    Func(accessor{MRedOut, CGH});
  }

  const identity_container_type &getIdentityContainer() {
    return MIdentityContainer;
  }

  accessor<int, 1, access::mode::read_write, access::target::device>
  getReadWriteAccessorToInitializedGroupsCounter(handler &CGH) {
    auto CounterMem = std::make_shared<int>(0);
    CGH.addReduction(CounterMem);
    auto CounterBuf = std::make_shared<buffer<int, 1>>(CounterMem.get(), 1);
    CounterBuf->set_final_data();
    CGH.addReduction(CounterBuf);
    return {*CounterBuf, CGH};
  }

  // On discrete (vs. integrated) GPUs it's faster to initialize memory with
  // an extra kernel than to copy it from the host.
  int *getGroupsCounterAccDiscrete(handler &CGH) {
    queue q = createSyclObjFromImpl<queue>(CGH.MQueue);
    device Dev = q.get_device();
    auto Deleter = [=](auto *Ptr) { free(Ptr, q); };

    std::shared_ptr<int> Counter(malloc_device<int>(1, q), Deleter);
    CGH.addReduction(Counter);

    auto Event = q.memset(Counter.get(), 0, sizeof(int));
    CGH.depends_on(Event);

    return Counter.get();
  }

  // Returns the binary operation associated with the reduction.
  BinaryOperation getBinaryOperation() const { return MBinaryOp; }
  bool initializeToIdentity() const { return InitializeToIdentity; }

  auto getUserRedVarAccess(handler &CGH) {
    std::ignore = CGH;
    if constexpr (is_usm)
      return MRedOut;
    else
      return accessor{MRedOut, CGH};
  }

private:
  // Object holding the identity if available.
  identity_container_type MIdentityContainer;

  // Array reduction is performed element-wise to avoid stack growth, hence
  // 1-dimensional always.
  std::shared_ptr<buffer<reducer_element_type, 1>> MOutBufPtr;

  BinaryOperation MBinaryOp;
  bool InitializeToIdentity;

  // User's accessor/USM pointer to which the final reduction value is written.
  RedOutVar MRedOut;
};

template <typename T, class BinaryOperation, int Dims, size_t Extent,
          bool ExplicitIdentity, typename RedOutVar>
class reduction_impl
    : private reduction_impl_base,
      public reduction_impl_algo<T, BinaryOperation, Dims, Extent,
                                 ExplicitIdentity, RedOutVar> {
private:
  using algo = reduction_impl_algo<T, BinaryOperation, Dims, Extent,
                                   ExplicitIdentity, RedOutVar>;
  using self = reduction_impl<T, BinaryOperation, Dims, Extent,
                              ExplicitIdentity, RedOutVar>;

public:
  using algo::is_known_identity;
  using algo::is_usm;

  // Only scalar and 1D array reductions are supported by SYCL 2020.
  static_assert(Dims <= 1, "Multi-dimensional reductions are not supported.");

  // Constructs reduction_impl when no identity is specified.
  template <bool ExplicitIdentityRelay = ExplicitIdentity,
            typename = std::enable_if_t<!ExplicitIdentityRelay>>
  reduction_impl(RedOutVar Var, BinaryOperation BOp,
                 bool InitializeToIdentity = false)
      : algo(BOp, InitializeToIdentity, Var) {
    if constexpr (!is_usm)
      if (Var.size() != 1)
        throw sycl::exception(make_error_code(errc::invalid),
                              "Reduction variable must be a scalar.");
    if constexpr (!is_known_identity)
      if (InitializeToIdentity)
        throw sycl::exception(make_error_code(errc::invalid),
                              "initialize_to_identity property cannot be "
                              "used with identityless reductions.");
  }

  // Constructs reduction_impl with an explicit identity value.
  template <bool ExplicitIdentityRelay = ExplicitIdentity,
            typename = std::enable_if_t<ExplicitIdentityRelay>>
  reduction_impl(RedOutVar &Var, const T &Identity, BinaryOperation BOp,
                 bool InitializeToIdentity)
      : algo(Identity, BOp, InitializeToIdentity, Var) {
    if constexpr (!is_usm)
      if (Var.size() != 1)
        throw sycl::exception(make_error_code(errc::invalid),
                              "Reduction variable must be a scalar.");
  }
};
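
// Illustrative user-side sketch of requesting initialization to the identity
// via the reduction property (Buf and CGH are hypothetical):
//
//   auto Red = sycl::reduction(
//       Buf, CGH, sycl::plus<int>(),
//       {sycl::property::reduction::initialize_to_identity{}});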

template <class BinaryOp, int Dims, size_t Extent, bool ExplicitIdentity,
          typename RedOutVar, typename... RestTy>
auto make_reduction(RedOutVar RedVar, RestTy &&...Rest) {
  return reduction_impl<typename get_red_t<RedOutVar>::type, BinaryOp, Dims,
                        Extent, ExplicitIdentity, RedOutVar>{
      RedVar, std::forward<RestTy>(Rest)...};
}

namespace reduction {
inline void finalizeHandler(handler &CGH) { CGH.finalize(); }
template <class FunctorTy> void withAuxHandler(handler &CGH, FunctorTy Func) {
  event E = CGH.finalize();
  handler AuxHandler(CGH.MQueue, CGH.MIsHost);
  AuxHandler.depends_on(E);
  AuxHandler.saveCodeLoc(CGH.MCodeLoc);
  Func(AuxHandler);
  CGH.MLastEvent = AuxHandler.finalize();
}
} // namespace reduction

// This method is used for the implementation of parallel_for accepting one
// reduction.
// TODO: remove this method when everything is switched to the general
// algorithm implementing an arbitrary number of reductions in parallel_for().
template <typename KernelName, class Reduction>
void reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu) {
  static_assert(Reduction::is_usm,
                "All implementations using this helper are expected to have "
                "USM reduction, not a buffer-based one.");
  size_t NElements = Reduction::num_elements;
  auto InAcc = Redu.getReadAccToPreviousPartialReds(CGH);
  auto UserVarPtr = Redu.getUserRedVarAccess(CGH);
  bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
  auto BOp = Redu.getBinaryOperation();
  CGH.single_task<KernelName>([=] {
    for (size_t i = 0; i < NElements; ++i) {
      auto Elem = InAcc[i];
      if (IsUpdateOfUserVar)
        UserVarPtr[i] = BOp(UserVarPtr[i], *Elem);
      else
        UserVarPtr[i] = *Elem;
    }
  });
}

namespace reduction {
template <typename KernelName, strategy S, class... Ts> struct MainKrn;
template <typename KernelName, strategy S, class... Ts> struct AuxKrn;
} // namespace reduction

// Tag structs to help creating unique kernels for multi-reduction cases.
struct KernelOneWGTag {};
struct KernelMultipleWGTag {};

// A helper to pass undefined (sycl::detail::auto_name) kernel names
// unmodified when naming the main/aux reduction kernels.
template <template <typename, reduction::strategy, typename...> class MainOrAux,
          class KernelName, reduction::strategy Strategy, class... Ts>
using __sycl_reduction_kernel =
    std::conditional_t<std::is_same_v<KernelName, auto_name>, auto_name,
                       MainOrAux<KernelName, Strategy, Ts...>>;

// Implementations.

template <reduction::strategy> struct NDRangeReduction;

template <>
struct NDRangeReduction<reduction::strategy::local_atomic_and_atomic_cross_wg> {
  template <typename KernelName, int Dims, typename PropertiesT,
            typename KernelType, typename Reduction>
  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
                  nd_range<Dims> NDRange, PropertiesT &Properties,
                  Reduction &Redu, KernelType &KernelFunc) {
    static_assert(Reduction::has_identity,
                  "Identityless reductions are not supported by the "
                  "local_atomic_and_atomic_cross_wg strategy.");

    std::ignore = Queue;
    using Name = __sycl_reduction_kernel<
        reduction::MainKrn, KernelName,
        reduction::strategy::local_atomic_and_atomic_cross_wg>;
    Redu.template withInitializedMem<Name>(CGH, [&](auto Out) {
      size_t NElements = Reduction::num_elements;
      local_accessor<typename Reduction::result_type, 1> GroupSum{NElements,
                                                                  CGH};

      CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<1> NDId) {
        // Call user's function. Reducer.MValue gets initialized there.
        typename Reduction::reducer_type Reducer;
        KernelFunc(NDId, Reducer);

        // The work-group cooperates to initialize multiple reduction variables
        auto LID = NDId.get_local_id(0);
        for (size_t E = LID; E < NElements; E += NDId.get_local_range(0)) {
          GroupSum[E] = getReducerAccess(Reducer).getIdentity();
        }
        workGroupBarrier();

        // Each work-item has its own reducer to combine
        Reducer.template atomic_combine<access::address_space::local_space>(
            &GroupSum[0]);

        // A single work-item performs finalization for the entire work-group
        // TODO: Opportunity to parallelize across elements
        workGroupBarrier();
        if (LID == 0) {
          for (size_t E = 0; E < NElements; ++E) {
            *getReducerAccess(Reducer).getElement(E) = GroupSum[E];
          }
          Reducer.template atomic_combine(&Out[0]);
        }
      });
    });
  }
};

template <>
struct NDRangeReduction<
    reduction::strategy::group_reduce_and_last_wg_detection> {
  template <typename KernelName, int Dims, typename PropertiesT,
            typename KernelType, typename Reduction>
  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
                  nd_range<Dims> NDRange, PropertiesT &Properties,
                  Reduction &Redu, KernelType &KernelFunc) {
    static_assert(Reduction::has_identity,
                  "Identityless reductions are not supported by the "
                  "group_reduce_and_last_wg_detection strategy.");

    std::ignore = Queue;
    size_t NElements = Reduction::num_elements;
    size_t WGSize = NDRange.get_local_range().size();
    size_t NWorkGroups = NDRange.get_group_range().size();

    auto Out = Redu.getUserRedVarAccess(CGH);

    auto &PartialSumsBuf = Redu.getTempBuffer(NWorkGroups * NElements, CGH);
    accessor PartialSums(PartialSumsBuf, CGH, sycl::read_write, sycl::no_init);

    bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
    auto Rest = [&](auto NWorkGroupsFinished) {
      local_accessor<int, 1> DoReducePartialSumsInLastWG{1, CGH};

      using Name = __sycl_reduction_kernel<
          reduction::MainKrn, KernelName,
          reduction::strategy::group_reduce_and_last_wg_detection,
          decltype(NWorkGroupsFinished)>;

      CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<1> NDId) {
        // Call user's function. Reducer.MValue gets initialized there.
        typename Reduction::reducer_type Reducer;
        KernelFunc(NDId, Reducer);

        typename Reduction::binary_operation BOp;
        auto Group = NDId.get_group();

        // If there are multiple values, reduce each separately;
        // reduce_over_group is only defined for each T, not for span<T, ...>.
        size_t LID = NDId.get_local_id(0);
        for (size_t E = 0; E < NElements; ++E) {
          auto &RedElem = *getReducerAccess(Reducer).getElement(E);
          RedElem = reduce_over_group(Group, RedElem, BOp);
          if (LID == 0) {
            if (NWorkGroups == 1) {
              // Can avoid using the partial sum and write the final result
              // immediately.
              if (IsUpdateOfUserVar)
                RedElem = BOp(RedElem, Out[E]);
              Out[E] = RedElem;
            } else {
              PartialSums[NDId.get_group_linear_id() * NElements + E] =
                  *getReducerAccess(Reducer).getElement(E);
            }
          }
        }

        if (NWorkGroups == 1)
          // We're done.
          return;

        // Signal that this work-group has finished after all values are
        // reduced. We had an implicit work-group barrier in reduce_over_group
        // and all the work since has been done in the (LID == 0) work-item,
        // so no extra sync is needed.
        if (LID == 0) {
          auto NFinished =
              sycl::atomic_ref<int, memory_order::relaxed,
                               memory_scope::device,
                               access::address_space::global_space>(
                  NWorkGroupsFinished[0]);
          DoReducePartialSumsInLastWG[0] =
              ++NFinished == static_cast<int>(NWorkGroups);
        }

        workGroupBarrier();
        if (DoReducePartialSumsInLastWG[0]) {
          // Reduce each result separately.
          // TODO: Opportunity to parallelize across elements.
          for (size_t E = 0; E < NElements; ++E) {
            auto LocalSum = getReducerAccess(Reducer).getIdentity();
            for (size_t I = LID; I < NWorkGroups; I += WGSize)
              LocalSum = BOp(LocalSum, PartialSums[I * NElements + E]);
            auto Result = reduce_over_group(Group, LocalSum, BOp);

            if (LID == 0) {
              if (IsUpdateOfUserVar)
                Result = BOp(Result, Out[E]);
              Out[E] = Result;
            }
          }
        }
      });
    };

    auto device = getDeviceFromHandler(CGH);
    // Integrated and discrete GPUs have different fast paths. The fast path
    // for discrete GPUs requires USM device allocations, so check for that as
    // well.
    if (device.get_info<info::device::host_unified_memory>() ||
        !device.has(aspect::usm_device_allocations))
      Rest(Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH));
    else
      Rest(Redu.getGroupsCounterAccDiscrete(CGH));
  }
};

// Returns the greatest power of two that is not greater than N.
inline size_t GreatestPowerOfTwo(size_t N) {
  if (N == 0)
    return 0;

  size_t Ret = 1;
  while ((N >>= 1) != 0)
    Ret <<= 1;
  return Ret;
}
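
// For example: GreatestPowerOfTwo(6) == 4, GreatestPowerOfTwo(8) == 8,
// GreatestPowerOfTwo(1) == 1, and GreatestPowerOfTwo(0) == 0.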

template <typename FuncTy>
void doTreeReductionHelper(size_t WorkSize, size_t LID, FuncTy Func) {
  workGroupBarrier();

  // The initial pivot is the greatest power-of-two value smaller than or
  // equal to the work size.
  size_t Pivot = GreatestPowerOfTwo(WorkSize);

  // If the pivot is not the same as the work size, we need an initial
  // reduction step where we only reduce the N last elements into the first N
  // elements, with N being WorkSize - Pivot.
  //
  //   0                       Pivot       WorkSize     (next power of two)
  //   |                         |             |                 |
  //   +-------------------------+------+------+-----------------+
  //                                    |
  //                            WorkSize - Pivot
  if (Pivot != WorkSize) {
    if (Pivot + LID < WorkSize)
      Func(LID, Pivot + LID);
    workGroupBarrier();
  }

  // Now the amount of work must be power-of-two, so do the tree reduction.
  for (size_t CurPivot = Pivot >> 1; CurPivot > 0; CurPivot >>= 1) {
    if (LID < CurPivot)
      Func(LID, CurPivot + LID);
    workGroupBarrier();
  }
}
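
// Worked example: WorkSize == 6 gives Pivot == 4, so work-items 0 and 1 first
// fold elements 4 and 5 into elements 0 and 1 (6 - 4 == 2 tail elements).
// The power-of-two sweep then folds with CurPivot == 2 (0<-2, 1<-3) and
// CurPivot == 1 (0<-1), leaving the result in element 0.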

// Enum for specifying work size guarantees in tree-reduction.
enum class WorkSizeGuarantees { None, Equal, LessOrEqual };

template <WorkSizeGuarantees WSGuarantee, int Dim, typename LocalRedsTy,
          typename BinOpTy, typename AccessFuncTy>
void doTreeReduction(size_t WorkSize, nd_item<Dim> NDIt, LocalRedsTy &LocalReds,
                     BinOpTy &BOp, AccessFuncTy AccessFunc) {
  size_t LID = NDIt.get_local_linear_id();
  size_t AdjustedWorkSize;
  if constexpr (WSGuarantee == WorkSizeGuarantees::LessOrEqual ||
                WSGuarantee == WorkSizeGuarantees::Equal) {
    // If the amount of work is guaranteed to be less than or equal to the
    // number of work-items, we just load the work into local memory and start
    // reducing. If we know they are equal we can let the optimizer remove the
    // check.
    if (WSGuarantee == WorkSizeGuarantees::Equal || LID < WorkSize)
      LocalReds[LID] = AccessFunc(LID);
    AdjustedWorkSize = WorkSize;
  } else {
    // Otherwise we have no guarantee and we need to first reduce the amount
    // of work to fit into the local memory.
    size_t WGSize = NDIt.get_local_range().size();
    AdjustedWorkSize = std::min(WorkSize, WGSize);
    if (LID < AdjustedWorkSize) {
      auto LocalSum = AccessFunc(LID);
      for (size_t I = LID + WGSize; I < WorkSize; I += WGSize)
        LocalSum = BOp(LocalSum, AccessFunc(I));

      LocalReds[LID] = LocalSum;
    }
  }
  doTreeReductionHelper(AdjustedWorkSize, LID, [&](size_t I, size_t J) {
    LocalReds[I] = BOp(LocalReds[I], LocalReds[J]);
  });
}

// Tree-reduction over tuples of accessors. This assumes that WorkSize is
// less than or equal to the work-group size.
// TODO: For variadics/tuples we don't provide such a high-level abstraction
// as for the scalar case above. Is there some C++ magic to level them?
template <typename... LocalAccT, typename... BOPsT, size_t... Is>
void doTreeReductionOnTuple(size_t WorkSize, size_t LID,
                            ReduTupleT<LocalAccT...> &LocalAccs,
                            ReduTupleT<BOPsT...> &BOPs,
                            std::index_sequence<Is...>) {
  doTreeReductionHelper(WorkSize, LID, [&](size_t I, size_t J) {
    auto ProcessOne = [=](auto &LocalAcc, auto &BOp) {
      LocalAcc[I] = BOp(LocalAcc[I], LocalAcc[J]);
    };
    (ProcessOne(std::get<Is>(LocalAccs), std::get<Is>(BOPs)), ...);
  });
}

template <> struct NDRangeReduction<reduction::strategy::range_basic> {
  template <typename KernelName, int Dims, typename PropertiesT,
            typename KernelType, typename Reduction>
  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
                  nd_range<Dims> NDRange, PropertiesT &Properties,
                  Reduction &Redu, KernelType &KernelFunc) {
    using reducer_type = typename Reduction::reducer_type;
    using element_type = typename ReducerTraits<reducer_type>::element_type;

    // If the reduction has an identity and is not USM, the reducer element is
    // just a thin wrapper around the result type, so the partial sum will use
    // the output memory iff NWorkGroups == 1. Otherwise, we need to make sure
    // the right output buffer is written in case NWorkGroups == 1.
    constexpr bool UsePartialSumForOutput =
        !Reduction::is_usm && Reduction::has_identity;

    std::ignore = Queue;
    size_t NElements = Reduction::num_elements;
    size_t WGSize = NDRange.get_local_range().size();
    size_t NWorkGroups = NDRange.get_group_range().size();

    bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
    auto PartialSums =
        Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);
    auto Out = [&]() {
      if constexpr (UsePartialSumForOutput)
        return (NWorkGroups == 1)
                   ? PartialSums
                   : Redu.getWriteAccForPartialReds(NElements, CGH);
      else
        return Redu.getUserRedVarAccess(CGH);
    }();
    local_accessor<element_type, 1> LocalReds{WGSize, CGH};
    auto NWorkGroupsFinished =
        Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH);
    local_accessor<int, 1> DoReducePartialSumsInLastWG{1, CGH};

    auto IdentityContainer = Redu.getIdentityContainer();
    auto BOp = Redu.getBinaryOperation();

    using Name = __sycl_reduction_kernel<reduction::MainKrn, KernelName,
                                         reduction::strategy::range_basic>;

    CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<1> NDId) {
      // Call user's function. Reducer.MValue gets initialized there.
      reducer_type Reducer = reducer_type(IdentityContainer, BOp);
      KernelFunc(NDId, Reducer);

      auto ElementCombiner = [&](element_type &LHS, const element_type &RHS) {
        return LHS.combine(BOp, RHS);
      };

      // If there are multiple values, reduce each separately.
      // This prevents local memory from scaling with the number of elements.
      size_t LID = NDId.get_local_linear_id();
      for (size_t E = 0; E < NElements; ++E) {

        doTreeReduction<WorkSizeGuarantees::Equal>(
            WGSize, NDId, LocalReds, ElementCombiner,
            [&](size_t) { return getReducerAccess(Reducer).getElement(E); });

        if (LID == 0) {
          auto V = LocalReds[0];

          bool IsOneWG = NWorkGroups == 1;
          if (IsOneWG && IsUpdateOfUserVar)
            V.combine(BOp, Out[E]);

          // If NWorkGroups == 1 && UsePartialSumForOutput, then PartialSums
          // and Out point to the same memory.
          if (UsePartialSumForOutput || !IsOneWG)
            PartialSums[NDId.get_group_linear_id() * NElements + E] = V;
          else if (V)
            Out[E] = *V;
        }
      }

      // Signal that this work-group has finished after all values are
      // reduced. We had an implicit work-group barrier in doTreeReduction and
      // all the work since has been done in the (LID == 0) work-item, so no
      // extra sync is needed.
      if (LID == 0) {
        auto NFinished =
            sycl::atomic_ref<int, memory_order::relaxed, memory_scope::device,
                             access::address_space::global_space>(
                NWorkGroupsFinished[0]);
        DoReducePartialSumsInLastWG[0] =
            ++NFinished == NWorkGroups && NWorkGroups > 1;
      }

      workGroupBarrier();
      if (DoReducePartialSumsInLastWG[0]) {
        // Reduce each result separately.
        // TODO: Opportunity to parallelize across elements.
        for (size_t E = 0; E < NElements; ++E) {
          doTreeReduction<WorkSizeGuarantees::None>(
              NWorkGroups, NDId, LocalReds, ElementCombiner,
              [&](size_t I) { return PartialSums[I * NElements + E]; });
          if (LID == 0) {
            auto V = LocalReds[0];
            if (IsUpdateOfUserVar)
              V.combine(BOp, Out[E]);
            Out[E] = *V;
          }
        }
      }
    });
  }
};

template <>
struct NDRangeReduction<reduction::strategy::group_reduce_and_atomic_cross_wg> {
  template <typename KernelName, int Dims, typename PropertiesT,
            typename KernelType, typename Reduction>
  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
                  nd_range<Dims> NDRange, PropertiesT &Properties,
                  Reduction &Redu, KernelType &KernelFunc) {
    static_assert(Reduction::has_identity,
                  "Identityless reductions are not supported by the "
                  "group_reduce_and_atomic_cross_wg strategy.");

    std::ignore = Queue;
    using Name = __sycl_reduction_kernel<
        reduction::MainKrn, KernelName,
        reduction::strategy::group_reduce_and_atomic_cross_wg>;
    Redu.template withInitializedMem<Name>(CGH, [&](auto Out) {
      size_t NElements = Reduction::num_elements;

      CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<Dims> NDIt) {
        // Call user's function. Reducer.MValue gets initialized there.
        typename Reduction::reducer_type Reducer;
        KernelFunc(NDIt, Reducer);

        typename Reduction::binary_operation BOp;
        for (size_t E = 0; E < NElements; ++E) {
          auto &ReducerElem = getReducerAccess(Reducer).getElement(E);
          *ReducerElem = reduce_over_group(NDIt.get_group(), *ReducerElem, BOp);
        }
        if (NDIt.get_local_linear_id() == 0)
          Reducer.atomic_combine(&Out[0]);
      });
    });
  }
};

template <>
struct NDRangeReduction<
    reduction::strategy::local_mem_tree_and_atomic_cross_wg> {
  template <typename KernelName, int Dims, typename PropertiesT,
            typename KernelType, typename Reduction>
  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
                  nd_range<Dims> NDRange, PropertiesT &Properties,
                  Reduction &Redu, KernelType &KernelFunc) {
    using reducer_type = typename Reduction::reducer_type;
    using element_type = typename ReducerTraits<reducer_type>::element_type;

    std::ignore = Queue;
    using Name = __sycl_reduction_kernel<
        reduction::MainKrn, KernelName,
        reduction::strategy::local_mem_tree_and_atomic_cross_wg>;
    Redu.template withInitializedMem<Name>(CGH, [&](auto Out) {
      size_t NElements = Reduction::num_elements;
      size_t WGSize = NDRange.get_local_range().size();

      // Use local memory to reduce elements in work-groups into the zero-th
      // element.
      local_accessor<element_type, 1> LocalReds{WGSize, CGH};

      CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<Dims> NDIt) {
        // Call user's function. Reducer.MValue gets initialized there.
        reducer_type Reducer;
        KernelFunc(NDIt, Reducer);

        size_t WGSize = NDIt.get_local_range().size();
        size_t LID = NDIt.get_local_linear_id();

        typename Reduction::binary_operation BOp;
        auto ElementCombiner = [&](element_type &LHS,
                                   const element_type &RHS) {
          return LHS.combine(BOp, RHS);
        };

        // If there are multiple values, reduce each separately.
        // This prevents local memory from scaling with the number of elements.
        for (size_t E = 0; E < NElements; ++E) {

          doTreeReduction<WorkSizeGuarantees::Equal>(
              WGSize, NDIt, LocalReds, ElementCombiner,
              [&](size_t) { return getReducerAccess(Reducer).getElement(E); });

          if (LID == 0)
            getReducerAccess(Reducer).getElement(E) = LocalReds[0];

          // Ensure item 0 is finished with LocalReds before the next
          // iteration.
          if (E != NElements - 1) {
            NDIt.barrier();
          }
        }

        if (LID == 0) {
          Reducer.atomic_combine(&Out[0]);
        }
      });
    });
  }
};
1680 
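A minimal sketch of the local-memory tree reduction this strategy builds on, assuming a power-of-two work-group size; the real doTreeReduction also handles non-power-of-two and partially filled groups.

template <typename T, typename BinOp>
void tree_reduce_sketch(sycl::nd_item<1> It,
                        sycl::local_accessor<T, 1> Scratch, BinOp Op) {
  size_t LID = It.get_local_linear_id();
  // Halve the number of active items each step; each survivor folds in the
  // element Stride positions away.
  for (size_t Stride = It.get_local_range(0) / 2; Stride > 0; Stride /= 2) {
    if (LID < Stride)
      Scratch[LID] = Op(Scratch[LID], Scratch[LID + Stride]);
    It.barrier(); // everyone must finish this step before the next halving
  }
  // Scratch[0] now holds the work-group's partial result.
}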
1681 template <>
1682 struct NDRangeReduction<
1683  reduction::strategy::group_reduce_and_multiple_kernels> {
1684  template <typename KernelName, int Dims, typename PropertiesT,
1685  typename KernelType, typename Reduction>
1686  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1687  nd_range<Dims> NDRange, PropertiesT &Properties,
1688  Reduction &Redu, KernelType &KernelFunc) {
1689  static_assert(Reduction::has_identity,
1690  "Identityless reductions are not supported by the "
1691  "group_reduce_and_multiple_kernels strategy.");
1692 
1693  // Before running the kernels, check that device has enough local memory
1694  // to hold local arrays that may be required for the reduction algorithm.
1695  // TODO: If the work-group-size is limited by the local memory, then
1696  // a special version of the main kernel may be created: one that would not
1697  // use local accessors and thus would not do the reduction in the main
1698  // kernel, but simply generate Range.get_global_range().size() partial
1699  // sums, leaving the reduction work to the additional/aux
1700  // kernels.
1701  constexpr bool HFR = Reduction::has_fast_reduce;
1702  size_t OneElemSize = HFR ? 0 : sizeof(typename Reduction::result_type);
1703  // TODO: currently the maximal work group size is determined for the given
1704  // queue/device, while it may be safer to use queries to the kernel compiled
1705  // for the device.
1706  size_t MaxWGSize = reduGetMaxWGSize(Queue, OneElemSize);
1707  if (NDRange.get_local_range().size() > MaxWGSize)
1708  throw sycl::exception(make_error_code(errc::nd_range),
1709  "The implementation handling parallel_for with"
1710  " reduction requires work group size not bigger"
1711  " than " +
1712  std::to_string(MaxWGSize));
1713 
1714  size_t NElements = Reduction::num_elements;
1715  size_t NWorkGroups = NDRange.get_group_range().size();
1716  auto Out = Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);
1717 
1718  bool IsUpdateOfUserVar =
1719  !Reduction::is_usm && !Redu.initializeToIdentity() && NWorkGroups == 1;
1720 
1721  using Name = __sycl_reduction_kernel<
1722  reduction::MainKrn, KernelName,
1723  reduction::strategy::group_reduce_and_multiple_kernels>;
1724 
1725  CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<Dims> NDIt) {
1726  // Call user's function. Reducer.MValue gets initialized there.
1727  typename Reduction::reducer_type Reducer;
1728  KernelFunc(NDIt, Reducer);
1729 
1730  // Compute the partial sum/reduction for the work-group.
1731  size_t WGID = NDIt.get_group_linear_id();
1732  typename Reduction::binary_operation BOp;
1733  for (size_t E = 0; E < NElements; ++E) {
1734  typename Reduction::result_type PSum;
1735  PSum = *getReducerAccess(Reducer).getElement(E);
1736  PSum = reduce_over_group(NDIt.get_group(), PSum, BOp);
1737  if (NDIt.get_local_linear_id() == 0) {
1738  if (IsUpdateOfUserVar)
1739  PSum = BOp(*Out[E], PSum);
1740  Out[WGID * NElements + E] = PSum;
1741  }
1742  }
1743  });
1744 
1744  reduction::finalizeHandler(CGH);
1745 
1746 
1747  // Run the additional kernel as many times as needed to reduce all partial
1748  // sums into one scalar.
1749 
1750  // TODO: Create a special slow/sequential version of the kernel that would
1751  // handle the reduction instead of reporting an assert below.
1752  if (MaxWGSize <= 1)
1753  throw sycl::exception(make_error_code(errc::nd_range),
1754  "The implementation handling parallel_for with "
1755  "reduction requires the maximal work group "
1756  "size to be greater than 1 to converge. "
1757  "The maximal work group size depends on the "
1758  "device and the size of the objects passed to "
1759  "the reduction.");
1760  size_t NWorkItems = NDRange.get_group_range().size();
1761  while (NWorkItems > 1) {
1762  reduction::withAuxHandler(CGH, [&](handler &AuxHandler) {
1763  size_t NElements = Reduction::num_elements;
1764  size_t NWorkGroups;
1765  size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups);
1766 
1767  // The last work-group may not be fully loaded with work, or the
1768  // work-group size may not be a power of two. Both cases are considered
1769  // inefficient as they require additional code and checks in the kernel.
1770  bool HasUniformWG = NWorkGroups * WGSize == NWorkItems;
1771  if (!Reduction::has_fast_reduce)
1772  HasUniformWG = HasUniformWG && (WGSize & (WGSize - 1)) == 0;
1773 
1774  // Get read accessor to the buffer that was used as output
1775  // in the previous kernel.
1776  auto In = Redu.getReadAccToPreviousPartialReds(AuxHandler);
1777  auto Out =
1778  Redu.getWriteAccForPartialReds(NWorkGroups * NElements, AuxHandler);
1779 
1780  using Name = __sycl_reduction_kernel<
1781  reduction::AuxKrn, KernelName,
1782  reduction::strategy::group_reduce_and_multiple_kernels>;
1783 
1784  bool IsUpdateOfUserVar = !Reduction::is_usm &&
1785  !Redu.initializeToIdentity() &&
1786  NWorkGroups == 1;
1787  range<1> GlobalRange = {HasUniformWG ? NWorkItems
1788  : NWorkGroups * WGSize};
1789  nd_range<1> Range{GlobalRange, range<1>(WGSize)};
1790  AuxHandler.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
1791  typename Reduction::binary_operation BOp;
1792  size_t WGID = NDIt.get_group_linear_id();
1793  size_t GID = NDIt.get_global_linear_id();
1794 
1795  for (size_t E = 0; E < NElements; ++E) {
1796  typename Reduction::result_type PSum =
1797  (HasUniformWG || (GID < NWorkItems))
1798  ? *In[GID * NElements + E]
1799  : ReducerAccess<typename Reduction::reducer_type>::
1800  getIdentityStatic();
1801  PSum = reduce_over_group(NDIt.get_group(), PSum, BOp);
1802  if (NDIt.get_local_linear_id() == 0) {
1803  if (IsUpdateOfUserVar)
1804  PSum = BOp(*Out[E], PSum);
1805  Out[WGID * NElements + E] = PSum;
1806  }
1807  }
1808  });
1809  NWorkItems = NWorkGroups;
1810  });
1811  } // end while (NWorkItems > 1)
1812 
1813  if constexpr (Reduction::is_usm) {
1814  reduction::withAuxHandler(CGH, [&](handler &CopyHandler) {
1815  reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
1816  });
1817  }
1818  }
1819 };
1820 
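The auxiliary-kernel loop above converges because each pass shrinks the number of partial sums from NWorkItems to ceil(NWorkItems / WGSize), which strictly decreases while MaxWGSize > 1. A host-side sketch of the pass count (hypothetical helper, illustrative only):

size_t count_aux_passes(size_t NWorkItems, size_t WGSize) {
  size_t Passes = 0;
  while (NWorkItems > 1) {
    NWorkItems = (NWorkItems + WGSize - 1) / WGSize; // ceil-divide
    ++Passes;
  }
  return Passes; // e.g. 1'000'000 items with WGSize 256 -> 3 passes
}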
1821 template <> struct NDRangeReduction<reduction::strategy::basic> {
1822  template <typename KernelName, int Dims, typename PropertiesT,
1823  typename KernelType, typename Reduction>
1824  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1825  nd_range<Dims> NDRange, PropertiesT &Properties,
1826  Reduction &Redu, KernelType &KernelFunc) {
1827  using element_type = typename Reduction::reducer_element_type;
1828 
1829  constexpr bool HFR = Reduction::has_fast_reduce;
1830  size_t OneElemSize = HFR ? 0 : sizeof(element_type);
1831  // TODO: currently the maximal work group size is determined for the given
1832  // queue/device, while it may be safer to use queries to the kernel
1833  // compiled for the device.
1834  size_t MaxWGSize = reduGetMaxWGSize(Queue, OneElemSize);
1835  if (NDRange.get_local_range().size() > MaxWGSize)
1836  throw sycl::exception(make_error_code(errc::nd_range),
1837  "The implementation handling parallel_for with"
1838  " reduction requires work group size not bigger"
1839  " than " +
1840  std::to_string(MaxWGSize));
1841 
1842  size_t NWorkGroups = NDRange.get_group_range().size();
1843 
1844  bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
1845  std::ignore = IsUpdateOfUserVar;
1846 
1847  // The type of the Out "accessor" differs between the case of a single
1848  // work-group and the case of multiple work-groups. Use this lambda to
1849  // write the code just once.
1850  auto First = [&](auto KernelTag) {
1851  // We can deduce IsOneWG from the tag type.
1852  constexpr bool IsOneWG =
1853  std::is_same_v<std::remove_reference_t<decltype(KernelTag)>,
1854  KernelOneWGTag>;
1855 
1856  constexpr size_t NElements = Reduction::num_elements;
1857 
1858  size_t WGSize = NDRange.get_local_range().size();
1859 
1860  auto Out = [&]() {
1861  if constexpr (IsOneWG)
1862  return Redu.getUserRedVarAccess(CGH);
1863  else
1864  return Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);
1865  }();
1866 
1867  // Use local memory to reduce elements in work-groups into 0-th element.
1868  local_accessor<element_type, 1> LocalReds{WGSize, CGH};
1869 
1870  auto BOp = Redu.getBinaryOperation();
1871  auto IdentityContainer = Redu.getIdentityContainer();
1872 
1873  using Name = __sycl_reduction_kernel<reduction::MainKrn, KernelName,
1874  reduction::strategy::basic,
1875  decltype(KernelTag)>;
1876 
1877  CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<Dims> NDIt) {
1878  // Call user's function. Reducer.MValue gets initialized there.
1879  typename Reduction::reducer_type Reducer =
1880  typename Reduction::reducer_type(IdentityContainer, BOp);
1881  KernelFunc(NDIt, Reducer);
1882 
1883  size_t WGSize = NDIt.get_local_range().size();
1884  size_t LID = NDIt.get_local_linear_id();
1885 
1886  auto ElementCombiner = [&](element_type &LHS, const element_type &RHS) {
1887  return LHS.combine(BOp, RHS);
1888  };
1889 
1890  // If there are multiple values, reduce each separately
1891  // This prevents local memory from scaling with elements
1892  for (size_t E = 0; E < NElements; ++E) {
1893 
1894  doTreeReduction<WorkSizeGuarantees::Equal>(
1895  WGSize, NDIt, LocalReds, ElementCombiner,
1896  [&](size_t) { return getReducerAccess(Reducer).getElement(E); });
1897 
1898  // Compute the partial sum/reduction for the work-group.
1899  if (LID == 0) {
1900  element_type PSum = LocalReds[0];
1901  if constexpr (IsOneWG) {
1902  if (IsUpdateOfUserVar)
1903  PSum.combine(BOp, Out[E]);
1904  Out[E] = *PSum;
1905  } else {
1906  size_t GrID = NDIt.get_group_linear_id();
1907  Out[GrID * NElements + E] = PSum;
1908  }
1909  }
1910 
1911  // Ensure item 0 is finished with LocalReds before next iteration
1912  if (E != NElements - 1) {
1913  NDIt.barrier();
1914  }
1915  }
1916  });
1917  };
1918 
1919  if (NWorkGroups == 1)
1920  First(KernelOneWGTag{});
1921  else
1922  First(KernelMultipleWGTag{});
1923 
1924  reduction::finalizeHandler(CGH);
1925 
1926  // 2. Run the additional kernel as many times as needed to reduce
1927  // all partial sums into one scalar.
1928 
1929  // TODO: Create a special slow/sequential version of the kernel that would
1930  // handle the reduction instead of reporting an assert below.
1931  if (MaxWGSize <= 1)
1932  throw sycl::exception(make_error_code(errc::nd_range),
1933  "The implementation handling parallel_for with "
1934  "reduction requires the maximal work group "
1935  "size to be greater than 1 to converge. "
1936  "The maximal work group size depends on the "
1937  "device and the size of the objects passed to "
1938  "the reduction.");
1939  size_t NWorkItems = NDRange.get_group_range().size();
1940  while (NWorkItems > 1) {
1941  size_t NWorkGroups;
1942  size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups);
1943 
1944  auto Rest = [&](auto KernelTag) {
1945  reduction::withAuxHandler(CGH, [&](handler &AuxHandler) {
1946  // We can deduce IsOneWG from the tag type.
1947  constexpr bool IsOneWG =
1948  std::is_same_v<std::remove_reference_t<decltype(KernelTag)>,
1949  KernelOneWGTag>;
1950 
1951  constexpr size_t NElements = Reduction::num_elements;
1952 
1953  // The last work-group may not be fully loaded with work, or the
1954  // work-group size may not be a power of two. Both cases are considered
1955  // inefficient as they require additional code and checks in the
1956  // kernel.
1957  bool HasUniformWG = NWorkGroups * WGSize == NWorkItems;
1958 
1959  // Get read accessor to the buffer that was used as output
1960  // in the previous kernel.
1961  auto In = Redu.getReadAccToPreviousPartialReds(AuxHandler);
1962 
1963  auto Out = [&]() {
1964  if constexpr (IsOneWG)
1965  return Redu.getUserRedVarAccess(AuxHandler);
1966  else
1967  return Redu.getWriteAccForPartialReds(NWorkGroups * NElements,
1968  AuxHandler);
1969  }();
1970 
1971  bool UniformPow2WG = HasUniformWG && (WGSize & (WGSize - 1)) == 0;
1972  // Use local memory to reduce elements in work-groups into 0-th
1973  // element.
1974  local_accessor<element_type, 1> LocalReds{WGSize, AuxHandler};
1975 
1976  auto BOp = Redu.getBinaryOperation();
1977  using Name = __sycl_reduction_kernel<reduction::AuxKrn, KernelName,
1978  reduction::strategy::basic,
1979  decltype(KernelTag)>;
1980 
1981  range<1> GlobalRange = {UniformPow2WG ? NWorkItems
1982  : NWorkGroups * WGSize};
1983  nd_range<1> Range{GlobalRange, range<1>(WGSize)};
1984  AuxHandler.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
1985  size_t WGSize = NDIt.get_local_range().size();
1986  size_t LID = NDIt.get_local_linear_id();
1987  size_t GID = NDIt.get_global_linear_id();
1988  size_t GrID = NDIt.get_group_linear_id();
1989 
1990  // The last work-group may not have enough work for all its items.
1991  size_t RemainingWorkSize =
1992  sycl::min(WGSize, NWorkItems - GrID * WGSize);
1993 
1994  auto ElementCombiner = [&](element_type &LHS,
1995  const element_type &RHS) {
1996  return LHS.combine(BOp, RHS);
1997  };
1998 
1999  for (size_t E = 0; E < NElements; ++E) {
2000 
2001  doTreeReduction<WorkSizeGuarantees::LessOrEqual>(
2002  RemainingWorkSize, NDIt, LocalReds, ElementCombiner,
2003  [&](size_t) { return In[GID * NElements + E]; });
2004 
2005  // Compute the partial sum/reduction for the work-group.
2006  if (LID == 0) {
2007  element_type PSum = LocalReds[0];
2008  if constexpr (IsOneWG) {
2009  if (IsUpdateOfUserVar)
2010  PSum.combine(BOp, Out[E]);
2011  Out[E] = *PSum;
2012  } else {
2013  Out[GrID * NElements + E] = PSum;
2014  }
2015  }
2016 
2017  // Ensure item 0 is finished with LocalReds before next iteration
2018  if (E != NElements - 1) {
2019  NDIt.barrier();
2020  }
2021  }
2022  });
2023  NWorkItems = NWorkGroups;
2024  });
2025  };
2026 
2027  if (NWorkGroups == 1)
2028  Rest(KernelOneWGTag{});
2029  else
2030  Rest(KernelMultipleWGTag{});
2031  } // end while (NWorkItems > 1)
2032  }
2033 };
2034 
2038 template <bool IsOneWG, typename... Reductions, size_t... Is>
2039 auto createReduOutAccs(size_t NWorkGroups, handler &CGH,
2040  std::tuple<Reductions...> &ReduTuple,
2041  std::index_sequence<Is...>) {
2042  return makeReduTupleT(
2043  std::get<Is>(ReduTuple).template getWriteMemForPartialReds<IsOneWG>(
2044  NWorkGroups *
2045  std::tuple_element_t<Is, std::tuple<Reductions...>>::num_elements,
2046  CGH)...);
2047 }
2048 
2049 template <typename OutAccT, typename LocalAccT, typename BOPT,
2050  typename IdentityContainerT>
2051 auto getLastCombine(OutAccT OutAcc, LocalAccT LocalAcc, BOPT BOP,
2052  IdentityContainerT IdentityContainer,
2053  bool IsInitializeToIdentity) {
2054  if constexpr (!IdentityContainerT::has_identity) {
2055  return BOP(LocalAcc[0], OutAcc[0]);
2056  } else {
2057  return BOP(LocalAcc[0], IsInitializeToIdentity
2058  ? IdentityContainer.getIdentity()
2059  : OutAcc[0]);
2060  }
2061 }
2062 
2063 template <bool IsOneWG, typename... Reductions, typename... OutAccT,
2064  typename... LocalAccT, typename... BOPsT, typename... Ts,
2065  size_t... Is>
2066 void writeReduSumsToOutAccs(
2067  size_t OutAccIndex, ReduTupleT<OutAccT...> OutAccs,
2068  ReduTupleT<LocalAccT...> LocalAccs, ReduTupleT<BOPsT...> BOPs,
2069  ReduTupleT<Ts...> IdentityVals,
2070  std::array<bool, sizeof...(Reductions)> IsInitializeToIdentity,
2071  std::index_sequence<Is...>) {
2072  if constexpr (IsOneWG) {
2073  // Add the initial value of user's variable to the final result.
2074  // After this we know there will be a value in the 0th element.
2075  ((std::get<Is>(LocalAccs)[0] = getLastCombine(
2076  std::get<Is>(OutAccs), std::get<Is>(LocalAccs), std::get<Is>(BOPs),
2077  std::get<Is>(IdentityVals), IsInitializeToIdentity[Is])),
2078  ...);
2079  ((std::get<Is>(OutAccs)[OutAccIndex] = *std::get<Is>(LocalAccs)[0]), ...);
2080  } else {
2081  // The partial sums for the work-group are stored in 0-th elements of local
2082  // accessors. Simply write those sums to output accessors.
2083  ((std::get<Is>(OutAccs)[OutAccIndex] = std::get<Is>(LocalAccs)[0]), ...);
2084  }
2085 }
2086 
2087 // Concatenate an empty sequence.
2088 constexpr std::index_sequence<> concat_sequences(std::index_sequence<>) {
2089  return {};
2090 }
2091 
2092 // Concatenate a sequence consisting of 1 element.
2093 template <size_t I>
2094 constexpr std::index_sequence<I> concat_sequences(std::index_sequence<I>) {
2095  return {};
2096 }
2097 
2098 // Concatenate two potentially empty sequences.
2099 template <size_t... Is, size_t... Js>
2100 constexpr std::index_sequence<Is..., Js...>
2101 concat_sequences(std::index_sequence<Is...>, std::index_sequence<Js...>) {
2102  return {};
2103 }
2104 
2105 // Concatenate more than 2 sequences.
2106 template <size_t... Is, size_t... Js, class... Rs>
2107 constexpr auto concat_sequences(std::index_sequence<Is...>,
2108  std::index_sequence<Js...>, Rs...) {
2109  return concat_sequences(std::index_sequence<Is..., Js...>{}, Rs{}...);
2110 }
2111 
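An illustrative compile-time check of how these overloads compose (written as if inside this namespace):

static_assert(
    std::is_same_v<decltype(concat_sequences(std::index_sequence<0, 1>{},
                                             std::index_sequence<>{},
                                             std::index_sequence<4>{})),
                   std::index_sequence<0, 1, 4>>);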
2112 struct IsNonUsmReductionPredicate {
2113  template <typename T> struct Func {
2114  static constexpr bool value = !std::remove_pointer_t<T>::is_usm;
2115  };
2116 };
2117 
2118 struct EmptyReductionPredicate {
2119  template <typename T> struct Func {
2120  static constexpr bool value = false;
2121  };
2122 };
2123 
2124 template <bool Cond, size_t I> struct FilterElement {
2125  using type =
2126  std::conditional_t<Cond, std::index_sequence<I>, std::index_sequence<>>;
2127 };
2128 
2134 template <typename... T, typename FunctorT, size_t... Is,
2135  std::enable_if_t<(sizeof...(Is) > 0), int> Z = 0>
2136 constexpr auto filterSequenceHelper(FunctorT, std::index_sequence<Is...>) {
2137  return concat_sequences(
2138  typename FilterElement<FunctorT::template Func<std::tuple_element_t<
2139  Is, std::tuple<T...>>>::value,
2140  Is>::type{}...);
2141 }
2142 template <typename... T, typename FunctorT, size_t... Is,
2143  std::enable_if_t<(sizeof...(Is) == 0), int> Z = 0>
2144 constexpr auto filterSequenceHelper(FunctorT, std::index_sequence<Is...>) {
2145  return std::index_sequence<>{};
2146 }
2147 
2151 template <typename... T, typename FunctorT, size_t... Is>
2152 constexpr auto filterSequence(FunctorT F, std::index_sequence<Is...> Indices) {
2153  return filterSequenceHelper<T...>(F, Indices);
2154 }
2155 
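A usage sketch of filterSequence with a hypothetical predicate that keeps the indices of pointer types in a pack:

struct IsPointerPredicate {
  template <typename T> struct Func {
    static constexpr bool value = std::is_pointer_v<T>;
  };
};
static_assert(
    std::is_same_v<decltype(filterSequence<int, float *, char, double *>(
                       IsPointerPredicate{}, std::make_index_sequence<4>{})),
                   std::index_sequence<1, 3>>);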
2156 struct IsScalarReduction {
2157  template <typename Reduction> struct Func {
2158  static constexpr bool value =
2159  (Reduction::dims == 0 && Reduction::num_elements == 1);
2160  };
2161 };
2162 
2163 struct IsArrayReduction {
2164  template <typename Reduction> struct Func {
2165  static constexpr bool value =
2166  (Reduction::dims == 1 && Reduction::num_elements >= 1);
2167  };
2168 };
2169 
2170 template <typename ElementType, typename BOPT>
2171 constexpr auto makeAdjustedBOP(BOPT &BOP) {
2172  return [&](ElementType &LHS, const ElementType &RHS) {
2173  return LHS.combine(BOP, RHS);
2174  };
2175 }
2176 
2177 template <typename... Reductions, typename... BOPsT, size_t... Is>
2178 constexpr auto makeAdjustedBOPs(ReduTupleT<BOPsT...> &BOPsTuple,
2179  std::index_sequence<Is...>) {
2180  return makeReduTupleT(
2181  makeAdjustedBOP<typename std::tuple_element_t<
2182  Is, std::tuple<Reductions...>>::reducer_element_type>(
2183  std::get<Is>(BOPsTuple))...);
2184 }
2185 
2186 template <typename... Reductions, typename... BOPsT>
2187 constexpr auto makeAdjustedBOPs(ReduTupleT<BOPsT...> &BOPsTuple) {
2188  return makeAdjustedBOPs<Reductions...>(
2189  BOPsTuple, std::make_index_sequence<sizeof...(Reductions)>{});
2190 }
2191 
2192 /// All scalar reductions are processed together; there is one loop of
2193 /// log2(N) steps, and each reduction uses its own storage.
2194 template <bool IsOneWG, typename... Reductions, int Dims, typename... LocalAccT,
2195  typename... OutAccT, typename... ReducerT, typename... Ts,
2196  typename... BOPsT, size_t... Is>
2197 void reduCGFuncImplScalar(
2198  nd_item<Dims> NDIt, ReduTupleT<LocalAccT...> LocalAccsTuple,
2199  ReduTupleT<OutAccT...> OutAccsTuple, std::tuple<ReducerT...> &ReducersTuple,
2200  ReduTupleT<Ts...> IdentitiesTuple, ReduTupleT<BOPsT...> BOPsTuple,
2201  std::array<bool, sizeof...(Reductions)> InitToIdentityProps,
2202  std::index_sequence<Is...> ReduIndices) {
2203  size_t WGSize = NDIt.get_local_range().size();
2204  size_t LID = NDIt.get_local_linear_id();
2205 
2206  ((std::get<Is>(LocalAccsTuple)[LID] =
2207  getReducerAccess(std::get<Is>(ReducersTuple)).getElement(0)),
2208  ...);
2209 
2210  // We apply tree-reduction on reducer elements so we adjust the operations
2211  // to combine these.
2212  auto AdjustedBOPsTuple = makeAdjustedBOPs<Reductions...>(BOPsTuple);
2213 
2214  doTreeReductionOnTuple(WGSize, LID, LocalAccsTuple, AdjustedBOPsTuple,
2215  ReduIndices);
2216 
2217  // Compute the partial sum/reduction for the work-group.
2218  if (LID == 0) {
2219  size_t GrID = NDIt.get_group_linear_id();
2220  writeReduSumsToOutAccs<IsOneWG, Reductions...>(
2221  GrID, OutAccsTuple, LocalAccsTuple, AdjustedBOPsTuple, IdentitiesTuple,
2222  InitToIdentityProps, ReduIndices);
2223  }
2224 }
2225 
2226 /// Each array reduction is processed separately.
2227 template <bool IsOneWG, typename Reduction, int Dims, typename LocalAccT,
2228  typename OutAccT, typename ReducerT, typename BOPT>
2229 void reduCGFuncImplArrayHelper(nd_item<Dims> NDIt, LocalAccT LocalReds,
2230  OutAccT Out, ReducerT &Reducer, BOPT BOp,
2231  bool IsInitializeToIdentity) {
2232  using element_type = typename Reduction::reducer_element_type;
2233 
2234  size_t WGSize = NDIt.get_local_range().size();
2235  size_t LID = NDIt.get_local_linear_id();
2236 
2237  auto ElementCombiner = [&](element_type &LHS, const element_type &RHS) {
2238  return LHS.combine(BOp, RHS);
2239  };
2240 
2241  // If there are multiple values, reduce each separately
2242  // This prevents local memory from scaling with elements
2243  auto NElements = Reduction::num_elements;
2244  for (size_t E = 0; E < NElements; ++E) {
2245  doTreeReduction<WorkSizeGuarantees::Equal>(
2246  WGSize, NDIt, LocalReds, ElementCombiner,
2247  [&](size_t) { return getReducerAccess(Reducer).getElement(E); });
2248 
2249  // Add the initial value of user's variable to the final result.
2250  if (LID == 0) {
2251  size_t GrID = NDIt.get_group_linear_id();
2252  size_t OutIdx = GrID * NElements + E;
2253  if constexpr (IsOneWG) {
2254  // If there is only a single work-group, the output will be an actual
2255  // value rather than a potentially optional value.
2256  if constexpr (Reduction::has_identity) {
2257  Out[OutIdx] = *ElementCombiner(LocalReds[0], IsInitializeToIdentity
2258  ? Reducer.identity()
2259  : Out[E]);
2260  } else {
2261  Out[OutIdx] = *LocalReds[0];
2262  }
2263  } else {
2264  // Otherwise we propagate a potentially optional value.
2265  Out[OutIdx] = LocalReds[0];
2266  }
2267  }
2268 
2269  // Ensure item 0 is finished with LocalReds before next iteration
2270  if (E != NElements - 1) {
2271  NDIt.barrier();
2272  }
2273  }
2274 }
2275 
2276 template <bool IsOneWG, typename... Reductions, int Dims, typename... LocalAccT,
2277  typename... OutAccT, typename... ReducerT, typename... BOPsT,
2278  size_t... Is>
2279 void reduCGFuncImplArray(
2280  nd_item<Dims> NDIt, ReduTupleT<LocalAccT...> LocalAccsTuple,
2281  ReduTupleT<OutAccT...> OutAccsTuple, std::tuple<ReducerT...> &ReducersTuple,
2282  ReduTupleT<BOPsT...> BOPsTuple,
2283  std::array<bool, sizeof...(Reductions)> InitToIdentityProps,
2284  std::index_sequence<Is...>) {
2285  using ReductionPack = std::tuple<Reductions...>;
2286  (reduCGFuncImplArrayHelper<IsOneWG, std::tuple_element_t<Is, ReductionPack>>(
2287  NDIt, std::get<Is>(LocalAccsTuple), std::get<Is>(OutAccsTuple),
2288  std::get<Is>(ReducersTuple), std::get<Is>(BOPsTuple),
2289  InitToIdentityProps[Is]),
2290  ...);
2291 }
2292 
2293 namespace reduction::main_krn {
2294 template <class KernelName, class Accessor> struct NDRangeMulti;
2295 } // namespace reduction::main_krn
2296 template <typename KernelName, typename KernelType, int Dims,
2297  typename PropertiesT, typename... Reductions, size_t... Is>
2298 void reduCGFuncMulti(handler &CGH, KernelType KernelFunc,
2299  const nd_range<Dims> &Range, PropertiesT Properties,
2300  std::tuple<Reductions...> &ReduTuple,
2301  std::index_sequence<Is...> ReduIndices) {
2302  size_t WGSize = Range.get_local_range().size();
2303 
2304  // Split reduction sequence into two:
2305  // 1) Scalar reductions
2306  // 2) Array reductions
2307  // This allows us to reuse the existing implementation for scalar reductions
2308  // and introduce a new implementation for array reductions. Longer term it
2309  // may make sense to generalize the code such that each phase below applies
2310  // to all available reduction implementations -- today all reduction classes
2311  // use the same privatization-based approach, so this is unnecessary.
2312  IsScalarReduction ScalarPredicate;
2313  auto ScalarIs = filterSequence<Reductions...>(ScalarPredicate, ReduIndices);
2314 
2315  IsArrayReduction ArrayPredicate;
2316  auto ArrayIs = filterSequence<Reductions...>(ArrayPredicate, ReduIndices);
2317 
2318  auto LocalAccsTuple = makeReduTupleT(
2319  local_accessor<typename Reductions::reducer_element_type, 1>{WGSize,
2320  CGH}...);
2321 
2322  // The type of the Out "accessor" differs between the case of a single
2323  // work-group and the case of multiple work-groups. Use this lambda to
2324  // write the code just once.
2325  auto Rest = [&](auto KernelTag, auto OutAccsTuple) {
2326  auto IdentitiesTuple =
2327  makeReduTupleT(std::get<Is>(ReduTuple).getIdentityContainer()...);
2328  auto BOPsTuple =
2329  makeReduTupleT(std::get<Is>(ReduTuple).getBinaryOperation()...);
2330  std::array InitToIdentityProps{
2331  std::get<Is>(ReduTuple).initializeToIdentity()...};
2332 
2333  using Name = __sycl_reduction_kernel<reduction::MainKrn, KernelName,
2334  reduction::strategy::multi,
2335  decltype(KernelTag)>;
2336 
2337  CGH.parallel_for<Name>(Range, Properties, [=](nd_item<Dims> NDIt) {
2338  // We can deduce IsOneWG from the tag type.
2339  constexpr bool IsOneWG =
2340  std::is_same_v<std::remove_reference_t<decltype(KernelTag)>,
2341  KernelOneWGTag>;
2342 
2343  // Pass all reductions to user's lambda in the same order as supplied
2344  // Each reducer initializes its own storage
2345  auto ReduIndices = std::index_sequence_for<Reductions...>();
2346  auto ReducerTokensTuple =
2347  std::tuple{typename Reductions::reducer_token_type{
2348  std::get<Is>(IdentitiesTuple), std::get<Is>(BOPsTuple)}...};
2349  auto ReducersTuple = std::tuple<typename Reductions::reducer_type...>{
2350  std::get<Is>(ReducerTokensTuple)...};
2351  std::apply([&](auto &...Reducers) { KernelFunc(NDIt, Reducers...); },
2352  ReducersTuple);
2353 
2354  // Combine and write-back the results of any scalar reductions
2355  // reduCGFuncImplScalar<Reductions...>(NDIt, LocalAccsTuple, OutAccsTuple,
2356  // ReducersTuple, IdentitiesTuple, BOPsTuple, InitToIdentityProps,
2357  // ReduIndices);
2358  reduCGFuncImplScalar<IsOneWG, Reductions...>(
2359  NDIt, LocalAccsTuple, OutAccsTuple, ReducersTuple, IdentitiesTuple,
2360  BOPsTuple, InitToIdentityProps, ScalarIs);
2361 
2362  // Combine and write-back the results of any array reductions
2363  // These are handled separately to minimize temporary storage and account
2364  // for the fact that each array reduction may have a different number of
2365  // elements to reduce (i.e. a different extent).
2366  reduCGFuncImplArray<IsOneWG, Reductions...>(
2367  NDIt, LocalAccsTuple, OutAccsTuple, ReducersTuple, BOPsTuple,
2368  InitToIdentityProps, ArrayIs);
2369  });
2370  };
2371 
2372  size_t NWorkGroups = Range.get_group_range().size();
2373  if (NWorkGroups == 1)
2374  Rest(KernelOneWGTag{},
2375  createReduOutAccs<true>(NWorkGroups, CGH, ReduTuple, ReduIndices));
2376  else
2377  Rest(KernelMultipleWGTag{},
2378  createReduOutAccs<false>(NWorkGroups, CGH, ReduTuple, ReduIndices));
2379 }
2380 
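At the user level, reduCGFuncMulti backs parallel_for calls that carry several reductions at once. A minimal sketch of such a call (hypothetical names Q, Data, SumPtr, MaxPtr, N, WG; USM device-accessible pointers assumed):

void sum_and_max(sycl::queue &Q, const int *Data, int *SumPtr, int *MaxPtr,
                 size_t N, size_t WG) {
  Q.submit([&](sycl::handler &CGH) {
    auto SumRed = sycl::reduction(SumPtr, sycl::plus<int>{});
    auto MaxRed = sycl::reduction(MaxPtr, sycl::maximum<int>{});
    // One reducer per reduction is passed to the kernel, in order.
    CGH.parallel_for(sycl::nd_range<1>{N, WG}, SumRed, MaxRed,
                     [=](sycl::nd_item<1> It, auto &Sum, auto &Max) {
                       int V = Data[It.get_global_linear_id()];
                       Sum += V;
                       Max.combine(V);
                     });
  });
}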
2381 // TODO: Is this still needed?
2382 template <typename... Reductions, size_t... Is>
2383 void associateReduAccsWithHandler(handler &CGH,
2384  std::tuple<Reductions...> &ReduTuple,
2385  std::index_sequence<Is...>) {
2386  auto ProcessOne = [&CGH](auto Redu) {
2387  if constexpr (!decltype(Redu)::is_usm)
2388  Redu.getUserRedVarAccess(CGH);
2389  };
2390  (ProcessOne(std::get<Is>(ReduTuple)), ...);
2391 }
2392 
2395 template <bool IsOneWG, typename... Reductions, int Dims, typename... LocalAccT,
2396  typename... InAccT, typename... OutAccT, typename... Ts,
2397  typename... BOPsT, size_t... Is>
2398 void reduAuxCGFuncImplScalar(
2399  nd_item<Dims> NDIt, size_t LID, size_t GID, size_t RemainingWorkSize,
2400  ReduTupleT<LocalAccT...> LocalAccsTuple, ReduTupleT<InAccT...> InAccsTuple,
2401  ReduTupleT<OutAccT...> OutAccsTuple, ReduTupleT<Ts...> IdentitiesTuple,
2402  ReduTupleT<BOPsT...> BOPsTuple,
2403  std::array<bool, sizeof...(Reductions)> InitToIdentityProps,
2404  std::index_sequence<Is...> ReduIndices) {
2405  // The last work-group may have less work than the rest, so we only need
2406  // to read the values of the elements that still have work left.
2407  if (LID < RemainingWorkSize)
2408  ((std::get<Is>(LocalAccsTuple)[LID] = std::get<Is>(InAccsTuple)[GID]), ...);
2409 
2410  // We apply tree-reduction on reducer elements so we adjust the operations
2411  // to combine these.
2412  auto AdjustedBOPsTuple = makeAdjustedBOPs<Reductions...>(BOPsTuple);
2413 
2414  doTreeReductionOnTuple(RemainingWorkSize, LID, LocalAccsTuple,
2415  AdjustedBOPsTuple, ReduIndices);
2416 
2417  // Compute the partial sum/reduction for the work-group.
2418  if (LID == 0) {
2419  size_t GrID = NDIt.get_group_linear_id();
2420  writeReduSumsToOutAccs<IsOneWG, Reductions...>(
2421  GrID, OutAccsTuple, LocalAccsTuple, AdjustedBOPsTuple, IdentitiesTuple,
2422  InitToIdentityProps, ReduIndices);
2423  }
2424 }
2425 
2426 template <bool IsOneWG, typename Reduction, int Dims, typename LocalAccT,
2427  typename InAccT, typename OutAccT, typename T, typename BOPT>
2428 void reduAuxCGFuncImplArrayHelper(nd_item<Dims> NDIt, size_t LID, size_t GID,
2429  size_t RemainingWorkSize, LocalAccT LocalReds,
2430  InAccT In, OutAccT Out, T IdentityContainer,
2431  BOPT BOp, bool IsInitializeToIdentity) {
2432  using element_type = typename Reduction::reducer_element_type;
2433  auto ElementCombiner = [&](element_type &LHS, const element_type &RHS) {
2434  return LHS.combine(BOp, RHS);
2435  };
2436 
2437  // If there are multiple values, reduce each separately
2438  // This prevents local memory from scaling with elements
2439  auto NElements = Reduction::num_elements;
2440  for (size_t E = 0; E < NElements; ++E) {
2441  doTreeReduction<WorkSizeGuarantees::LessOrEqual>(
2442  RemainingWorkSize, NDIt, LocalReds, ElementCombiner,
2443  [&](size_t) { return In[GID * NElements + E]; });
2444 
2445  // Add the initial value of user's variable to the final result.
2446  if (LID == 0) {
2447  size_t GrID = NDIt.get_group_linear_id();
2448  size_t OutIdx = GrID * NElements + E;
2449  if constexpr (IsOneWG) {
2450  // If there is only a single work-group, the output will be an actual
2451  // value rather than a potentially optional value.
2452  if constexpr (Reduction::has_identity) {
2453  Out[OutIdx] = *ElementCombiner(LocalReds[0],
2454  IsInitializeToIdentity
2455  ? IdentityContainer.getIdentity()
2456  : Out[E]);
2457  } else {
2458  Out[OutIdx] = *LocalReds[0];
2459  }
2460  } else {
2461  // Otherwise we propagate a potentially optional value.
2462  Out[OutIdx] = LocalReds[0];
2463  }
2464  }
2465 
2466  // Ensure item 0 is finished with LocalReds before next iteration
2467  if (E != NElements - 1) {
2468  NDIt.barrier();
2469  }
2470  }
2471 }
2472 
2473 template <bool IsOneWG, typename... Reductions, int Dims, typename... LocalAccT,
2474  typename... InAccT, typename... OutAccT, typename... Ts,
2475  typename... BOPsT, size_t... Is>
2476 void reduAuxCGFuncImplArray(
2477  nd_item<Dims> NDIt, size_t LID, size_t GID, size_t RemainingWorkSize,
2478  ReduTupleT<LocalAccT...> LocalAccsTuple, ReduTupleT<InAccT...> InAccsTuple,
2479  ReduTupleT<OutAccT...> OutAccsTuple, ReduTupleT<Ts...> IdentitiesTuple,
2480  ReduTupleT<BOPsT...> BOPsTuple,
2481  std::array<bool, sizeof...(Reductions)> InitToIdentityProps,
2482  std::index_sequence<Is...>) {
2483  using ReductionPack = std::tuple<Reductions...>;
2484  (reduAuxCGFuncImplArrayHelper<IsOneWG,
2485  std::tuple_element_t<Is, ReductionPack>>(
2486  NDIt, LID, GID, RemainingWorkSize, std::get<Is>(LocalAccsTuple),
2487  std::get<Is>(InAccsTuple), std::get<Is>(OutAccsTuple),
2488  std::get<Is>(IdentitiesTuple), std::get<Is>(BOPsTuple),
2489  InitToIdentityProps[Is]),
2490  ...);
2491 }
2492 
2493 namespace reduction::aux_krn {
2494 template <class KernelName, class Predicate> struct Multi;
2495 } // namespace reduction::aux_krn
2496 template <typename KernelName, typename KernelType, typename... Reductions,
2497  size_t... Is>
2498 size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
2499  std::tuple<Reductions...> &ReduTuple,
2500  std::index_sequence<Is...> ReduIndices) {
2501  size_t NWorkGroups;
2502  size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups);
2503 
2504  bool Pow2WG = (WGSize & (WGSize - 1)) == 0;
2505  bool HasUniformWG = Pow2WG && (NWorkGroups * WGSize == NWorkItems);
2506 
2507  // Like reduCGFuncImpl, we also have to split out scalar and array reductions
2508  IsScalarReduction ScalarPredicate;
2509  auto ScalarIs = filterSequence<Reductions...>(ScalarPredicate, ReduIndices);
2510 
2511  IsArrayReduction ArrayPredicate;
2512  auto ArrayIs = filterSequence<Reductions...>(ArrayPredicate, ReduIndices);
2513 
2514  auto LocalAccsTuple = makeReduTupleT(
2515  local_accessor<typename Reductions::reducer_element_type, 1>{WGSize,
2516  CGH}...);
2517  auto InAccsTuple = makeReduTupleT(
2518  std::get<Is>(ReduTuple).getReadAccToPreviousPartialReds(CGH)...);
2519 
2520  auto IdentitiesTuple =
2521  makeReduTupleT(std::get<Is>(ReduTuple).getIdentityContainer()...);
2522  auto BOPsTuple =
2523  makeReduTupleT(std::get<Is>(ReduTuple).getBinaryOperation()...);
2524  std::array InitToIdentityProps{
2525  std::get<Is>(ReduTuple).initializeToIdentity()...};
2526 
2527  // Predicate/OutAccsTuple below have different types depending on whether
2528  // we have just a single WG or multiple WGs. Use this lambda to avoid code
2529  // duplication.
2530  auto Rest = [&](auto Predicate, auto OutAccsTuple) {
2531  auto AccReduIndices = filterSequence<Reductions...>(Predicate, ReduIndices);
2532  associateReduAccsWithHandler(CGH, ReduTuple, AccReduIndices);
2533  using Name = __sycl_reduction_kernel<reduction::AuxKrn, KernelName,
2534  reduction::strategy::multi,
2535  decltype(Predicate)>;
2536  // TODO: Opportunity to parallelize across number of elements
2537  range<1> GlobalRange = {HasUniformWG ? NWorkItems : NWorkGroups * WGSize};
2538  nd_range<1> Range{GlobalRange, range<1>(WGSize)};
2539  CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
2540  // We can deduce IsOneWG from the predicate type.
2541  constexpr bool IsOneWG =
2542  std::is_same_v<std::remove_reference_t<decltype(Predicate)>,
2543  IsNonUsmReductionPredicate>;
2544 
2545  size_t WGSize = NDIt.get_local_range().size();
2546  size_t RemainingWorkSize =
2547  sycl::min(WGSize, NWorkItems - WGSize * NDIt.get_group_linear_id());
2548  size_t LID = NDIt.get_local_linear_id();
2549  size_t GID = NDIt.get_global_linear_id();
2550 
2551  // Handle scalar and array reductions
2552  reduAuxCGFuncImplScalar<IsOneWG, Reductions...>(
2553  NDIt, LID, GID, RemainingWorkSize, LocalAccsTuple, InAccsTuple,
2554  OutAccsTuple, IdentitiesTuple, BOPsTuple, InitToIdentityProps,
2555  ScalarIs);
2556  reduAuxCGFuncImplArray<IsOneWG, Reductions...>(
2557  NDIt, LID, GID, RemainingWorkSize, LocalAccsTuple, InAccsTuple,
2558  OutAccsTuple, IdentitiesTuple, BOPsTuple, InitToIdentityProps,
2559  ArrayIs);
2560  });
2561  };
2562  if (NWorkGroups == 1)
2563  Rest(IsNonUsmReductionPredicate{},
2564  createReduOutAccs<true>(NWorkGroups, CGH, ReduTuple, ReduIndices));
2565  else
2566  Rest(EmptyReductionPredicate{},
2567  createReduOutAccs<false>(NWorkGroups, CGH, ReduTuple, ReduIndices));
2568 
2569  return NWorkGroups;
2570 }
2571 
2572 template <typename Reduction> size_t reduGetMemPerWorkItemHelper(Reduction &) {
2573  return sizeof(typename Reduction::result_type);
2574 }
2575 
2576 template <typename Reduction, typename... RestT>
2577 size_t reduGetMemPerWorkItemHelper(Reduction &, RestT... Rest) {
2578  return sizeof(typename Reduction::result_type) +
2579  reduGetMemPerWorkItemHelper(Rest...);
2580 }
2581 
2582 template <typename... ReductionT, size_t... Is>
2583 size_t reduGetMemPerWorkItem(std::tuple<ReductionT...> &ReduTuple,
2584  std::index_sequence<Is...>) {
2585  return reduGetMemPerWorkItemHelper(std::get<Is>(ReduTuple)...);
2586 }
2587 
2590 template <typename TupleT, std::size_t... Is>
2591 std::tuple<std::tuple_element_t<Is, TupleT>...>
2592 tuple_select_elements(TupleT Tuple, std::index_sequence<Is...>) {
2593  return {std::get<Is>(std::move(Tuple))...};
2594 }
2595 
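An illustrative compile-time check (written as if inside this namespace): selecting elements 0 and 2 of a tuple<int, double, char> yields tuple<int, char>.

static_assert(std::is_same_v<
              decltype(tuple_select_elements(
                  std::declval<std::tuple<int, double, char>>(),
                  std::index_sequence<0, 2>{})),
              std::tuple<int, char>>);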
2596 template <> struct NDRangeReduction<reduction::strategy::multi> {
2597  template <typename KernelName, int Dims, typename PropertiesT,
2598  typename... RestT>
2599  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
2600  nd_range<Dims> NDRange, PropertiesT &Properties,
2601  RestT... Rest) {
2602  std::tuple<RestT...> ArgsTuple(Rest...);
2603  constexpr size_t NumArgs = sizeof...(RestT);
2604  auto KernelFunc = std::get<NumArgs - 1>(ArgsTuple);
2605  auto ReduIndices = std::make_index_sequence<NumArgs - 1>();
2606  auto ReduTuple = detail::tuple_select_elements(ArgsTuple, ReduIndices);
2607 
2608  size_t LocalMemPerWorkItem = reduGetMemPerWorkItem(ReduTuple, ReduIndices);
2609  // TODO: currently the maximal work group size is determined for the given
2610  // queue/device, while it is safer to use queries to the kernel compiled
2611  // for the device.
2612  size_t MaxWGSize = reduGetMaxWGSize(Queue, LocalMemPerWorkItem);
2613  if (NDRange.get_local_range().size() > MaxWGSize)
2614  throw sycl::exception(make_error_code(errc::nd_range),
2615  "The implementation handling parallel_for with"
2616  " reduction requires work group size not bigger"
2617  " than " +
2618  std::to_string(MaxWGSize));
2619 
2620  reduCGFuncMulti<KernelName>(CGH, KernelFunc, NDRange, Properties, ReduTuple,
2621  ReduIndices);
2622  reduction::finalizeHandler(CGH);
2623 
2624  size_t NWorkItems = NDRange.get_group_range().size();
2625  while (NWorkItems > 1) {
2626  reduction::withAuxHandler(CGH, [&](handler &AuxHandler) {
2627  NWorkItems = reduAuxCGFunc<KernelName, decltype(KernelFunc)>(
2628  AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
2629  });
2630  } // end while (NWorkItems > 1)
2631  }
2632 };
2633 
2634 // Auto-dispatch. Must be the last one.
2635 template <> struct NDRangeReduction<reduction::strategy::auto_select> {
2636  // Some readability aliases, to increase signal/noise ratio below.
2637  template <reduction::strategy Strategy>
2638  using Impl = NDRangeReduction<Strategy>;
2639  using Strat = reduction::strategy;
2640 
2641  template <typename KernelName, int Dims, typename PropertiesT,
2642  typename KernelType, typename Reduction>
2643  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
2644  nd_range<Dims> NDRange, PropertiesT &Properties,
2645  Reduction &Redu, KernelType &KernelFunc) {
2646  auto Delegate = [&](auto Impl) {
2647  Impl.template run<KernelName>(CGH, Queue, NDRange, Properties, Redu,
2648  KernelFunc);
2649  };
2650 
2651  if constexpr (Reduction::has_float64_atomics) {
2652  if (getDeviceFromHandler(CGH).has(aspect::atomic64))
2653  return Delegate(Impl<Strat::group_reduce_and_atomic_cross_wg>{});
2654 
2655  if constexpr (Reduction::has_fast_reduce)
2656  return Delegate(Impl<Strat::group_reduce_and_multiple_kernels>{});
2657  else
2658  return Delegate(Impl<Strat::basic>{});
2659  } else if constexpr (Reduction::has_fast_atomics) {
2660  if constexpr (sizeof(typename Reduction::result_type) == 8) {
2661  // Both group_reduce_and_atomic_cross_wg and
2662  // local_mem_tree_and_atomic_cross_wg implicitly require
2663  // aspect::atomic64 if the result type of the reduction is 64-bit. If
2664  // the device does not support this, we need to fall back to more
2665  // reliable strategies.
2666  if (!getDeviceFromHandler(CGH).has(aspect::atomic64)) {
2667  if constexpr (Reduction::has_fast_reduce)
2668  return Delegate(Impl<Strat::group_reduce_and_multiple_kernels>{});
2669  else
2670  return Delegate(Impl<Strat::basic>{});
2671  }
2672  }
2673 
2674  if constexpr (Reduction::has_fast_reduce) {
2675  return Delegate(Impl<Strat::group_reduce_and_atomic_cross_wg>{});
2676  } else {
2677  return Delegate(Impl<Strat::local_mem_tree_and_atomic_cross_wg>{});
2678  }
2679  } else {
2680  if constexpr (Reduction::has_fast_reduce)
2681  return Delegate(Impl<Strat::group_reduce_and_last_wg_detection>{});
2682  else
2683  return Delegate(Impl<Strat::basic>{});
2684  }
2685 
2686  assert(false && "Must be unreachable!");
2687  }
2688  template <typename KernelName, int Dims, typename PropertiesT,
2689  typename... RestT>
2690  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
2691  nd_range<Dims> NDRange, PropertiesT &Properties,
2692  RestT... Rest) {
2693  return Impl<Strat::multi>::run<KernelName>(CGH, Queue, NDRange, Properties,
2694  Rest...);
2695  }
2696 };
2697 
2698 template <typename KernelName, reduction::strategy Strategy, int Dims,
2699  typename PropertiesT, typename... RestT>
2700 void reduction_parallel_for(handler &CGH, nd_range<Dims> NDRange,
2701  PropertiesT Properties, RestT... Rest) {
2702  NDRangeReduction<Strategy>::template run<KernelName>(CGH, CGH.MQueue, NDRange,
2703  Properties, Rest...);
2704 }
2705 
2706 __SYCL_EXPORT uint32_t
2707 reduGetMaxNumConcurrentWorkGroups(std::shared_ptr<queue_impl> Queue);
2708 
2709 template <typename KernelName, reduction::strategy Strategy, int Dims,
2710  typename PropertiesT, typename... RestT>
2711 void reduction_parallel_for(handler &CGH, range<Dims> Range,
2712  PropertiesT Properties, RestT... Rest) {
2713  std::tuple<RestT...> ArgsTuple(Rest...);
2714  constexpr size_t NumArgs = sizeof...(RestT);
2715  static_assert(NumArgs > 1, "No reduction!");
2716  auto KernelFunc = std::get<NumArgs - 1>(ArgsTuple);
2717  auto ReduIndices = std::make_index_sequence<NumArgs - 1>();
2718  auto ReduTuple = detail::tuple_select_elements(ArgsTuple, ReduIndices);
2719 
2720  // Before running the kernels, check that device has enough local memory
2721  // to hold local arrays required for the tree-reduction algorithm.
2722  size_t OneElemSize = [&]() {
2723  // Can't use outlined NumArgs due to a bug in gcc 8.4.
2724  if constexpr (sizeof...(RestT) == 2) {
2725  using Reduction = std::tuple_element_t<0, decltype(ReduTuple)>;
2726  constexpr bool IsTreeReduction =
2727  !Reduction::has_fast_reduce && !Reduction::has_fast_atomics;
2728  return IsTreeReduction ? sizeof(typename Reduction::result_type) : 0;
2729  } else {
2730  return reduGetMemPerWorkItem(ReduTuple, ReduIndices);
2731  }
2732  }();
2733 
2734  uint32_t NumConcurrentWorkGroups =
2735 #ifdef __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS
2736  __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS;
2737 #else
2738  reduGetMaxNumConcurrentWorkGroups(CGH.MQueue);
2739 #endif
2740 
2741  // TODO: currently the preferred work group size is determined for the given
2742  // queue/device, while it is safer to use queries to the kernel pre-compiled
2743  // for the device.
2744  size_t PrefWGSize = reduGetPreferredWGSize(CGH.MQueue, OneElemSize);
2745 
2746  size_t NWorkItems = Range.size();
2747  size_t WGSize = std::min(NWorkItems, PrefWGSize);
2748  size_t NWorkGroups = NWorkItems / WGSize;
2749  if (NWorkItems % WGSize)
2750  NWorkGroups++;
2751  size_t MaxNWorkGroups = NumConcurrentWorkGroups;
2752  NWorkGroups = std::min(NWorkGroups, MaxNWorkGroups);
2753  size_t NDRItems = NWorkGroups * WGSize;
2754  nd_range<1> NDRange{range<1>{NDRItems}, range<1>{WGSize}};
2755 
2756  size_t PerGroup = Range.size() / NWorkGroups;
2757  // Iterate through the index space by assigning contiguous chunks to each
2758  // work-group, then iterating through each chunk using a stride equal to the
2759  // work-group's local range, which gives much better performance than using
2760  // stride equal to 1. For each index the original KernelFunc is called and
2761  // the reduction value held in \p Reducer is accumulated across
2762  // those calls.
2763  auto UpdatedKernelFunc = [=](auto NDId, auto &...Reducers) {
2764  // Divide into contiguous chunks and assign each chunk to a Group
2765  // Rely on precomputed division to avoid repeating expensive operations
2766  // TODO: Some devices may prefer alternative remainder handling
2767  auto Group = NDId.get_group();
2768  size_t GroupId = Group.get_group_linear_id();
2769  size_t NumGroups = Group.get_group_linear_range();
2770  bool LastGroup = (GroupId == NumGroups - 1);
2771  size_t GroupStart = GroupId * PerGroup;
2772  size_t GroupEnd = LastGroup ? Range.size() : (GroupStart + PerGroup);
2773 
2774  // Loop over the contiguous chunk
2775  size_t Start = GroupStart + NDId.get_local_id(0);
2776  size_t End = GroupEnd;
2777  size_t Stride = NDId.get_local_range(0);
2778  auto GetDelinearized = [&](size_t I) {
2779  auto Id = getDelinearizedId(Range, I);
2780  if constexpr (std::is_invocable_v<decltype(KernelFunc), id<Dims>,
2781  decltype(Reducers)...>)
2782  return Id;
2783  else
2784  // SYCL doesn't provide parallel_for accepting offset in presence of
2785  // reductions, so use with_offset==false.
2786  return reduction::getDelinearizedItem(Range, Id);
2787  };
2788  for (size_t I = Start; I < End; I += Stride)
2789  KernelFunc(GetDelinearized(I), Reducers...);
2790  };
2791  if constexpr (NumArgs == 2) {
2792  using Reduction = std::tuple_element_t<0, decltype(ReduTuple)>;
2793  auto &Redu = std::get<0>(ReduTuple);
2794 
2795  constexpr auto StrategyToUse = [&]() {
2796  if constexpr (Strategy != reduction::strategy::auto_select)
2797  return Strategy;
2798 
2799  // TODO: Both group_reduce_and_last_wg_detection and range_basic require
2800  // memory_order::acq_rel support that isn't guaranteed by the
2801  // specification. However, implementing run-time check for that would
2802  // result in an extra kernel compilation(s). We probably need to
2803  // investigate if the usage of kernel_bundles can mitigate that.
2804  // TODO: local_atomic_and_atomic_cross_wg uses atomics on the partial
2805  // results, which may add an implicit requirement on aspect::atomic64. As
2806  // a temporary work-around we do not pick this if the result type is
2807  // 64-bit. In the future this selection should be done at runtime based
2808  // on the device.
2809  // Note: Identityless reductions cannot use group reductions.
2810  if constexpr (Reduction::has_fast_reduce && Reduction::has_identity)
2811  return reduction::strategy::group_reduce_and_last_wg_detection;
2812  else if constexpr (Reduction::has_fast_atomics &&
2813  sizeof(typename Reduction::result_type) != 8)
2814  return reduction::strategy::local_atomic_and_atomic_cross_wg;
2815  else
2816  return reduction::strategy::range_basic;
2817  }();
2818 
2819  reduction_parallel_for<KernelName, StrategyToUse>(CGH, NDRange, Properties,
2820  Redu, UpdatedKernelFunc);
2821  } else {
2822  return std::apply(
2823  [&](auto &...Reds) {
2824  return reduction_parallel_for<KernelName, Strategy>(
2825  CGH, NDRange, Properties, Reds..., UpdatedKernelFunc);
2826  },
2827  ReduTuple);
2828  }
2829 }
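A host-side sketch of the chunk assignment computed above: each work-group gets a contiguous [GroupStart, GroupEnd) slice, and the last group also takes the remainder (hypothetical helper, illustrative only).

void chunk_bounds(size_t RangeSize, size_t NumGroups, size_t GroupId,
                  size_t &GroupStart, size_t &GroupEnd) {
  size_t PerGroup = RangeSize / NumGroups;
  GroupStart = GroupId * PerGroup;
  GroupEnd = (GroupId == NumGroups - 1) ? RangeSize : GroupStart + PerGroup;
  // e.g. RangeSize = 10, NumGroups = 3 gives slices [0,3), [3,6), [6,10).
}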
2830 } // namespace detail
2831 
2836 template <typename T, typename AllocatorT, typename BinaryOperation>
2837 auto reduction(buffer<T, 1, AllocatorT> Var, handler &CGH,
2838  BinaryOperation Combiner, const property_list &PropList = {}) {
2839  std::ignore = CGH;
2840  bool InitializeToIdentity =
2841  PropList.has_property<property::reduction::initialize_to_identity>();
2842  return detail::make_reduction<BinaryOperation, 0, 1, false>(
2843  Var, Combiner, InitializeToIdentity);
2844 }
2845 
2851 template <typename T, typename BinaryOperation>
2852 auto reduction(T *Var, BinaryOperation Combiner,
2853  const property_list &PropList = {}) {
2854  bool InitializeToIdentity =
2855  PropList.has_property<property::reduction::initialize_to_identity>();
2856  return detail::make_reduction<BinaryOperation, 0, 1, false>(
2857  Var, Combiner, InitializeToIdentity);
2858 }
2859 
2863 template <typename T, typename AllocatorT, typename BinaryOperation>
2864 auto reduction(buffer<T, 1, AllocatorT> Var, handler &CGH, const T &Identity,
2865  BinaryOperation Combiner, const property_list &PropList = {}) {
2866  std::ignore = CGH;
2867  bool InitializeToIdentity =
2868  PropList.has_property<property::reduction::initialize_to_identity>();
2869  return detail::make_reduction<BinaryOperation, 0, 1, true>(
2870  Var, Identity, Combiner, InitializeToIdentity);
2871 }
2872 
2876 template <typename T, typename BinaryOperation>
2877 auto reduction(T *Var, const T &Identity, BinaryOperation Combiner,
2878  const property_list &PropList = {}) {
2879  bool InitializeToIdentity =
2880  PropList.has_property<property::reduction::initialize_to_identity>();
2881  return detail::make_reduction<BinaryOperation, 0, 1, true>(
2882  Var, Identity, Combiner, InitializeToIdentity);
2883 }
2884 
2890 template <typename T, size_t Extent, typename BinaryOperation,
2891  typename = std::enable_if_t<Extent != dynamic_extent>>
2892 auto reduction(span<T, Extent> Span, BinaryOperation Combiner,
2893  const property_list &PropList = {}) {
2894  bool InitializeToIdentity =
2895  PropList.has_property<property::reduction::initialize_to_identity>();
2896  return detail::make_reduction<BinaryOperation, 1, Extent, false>(
2897  Span.data(), Combiner, InitializeToIdentity);
2898 }
2899 
2903 template <typename T, size_t Extent, typename BinaryOperation,
2904  typename = std::enable_if_t<Extent != dynamic_extent>>
2905 auto reduction(span<T, Extent> Span, const T &Identity,
2906  BinaryOperation Combiner, const property_list &PropList = {}) {
2907  bool InitializeToIdentity =
2908  PropList.has_property<property::reduction::initialize_to_identity>();
2909  return detail::make_reduction<BinaryOperation, 1, Extent, true>(
2910  Span.data(), Identity, Combiner, InitializeToIdentity);
2911 }
2912 } // namespace _V1
2913 } // namespace sycl
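For context, a minimal self-contained program using the buffer overload of sycl::reduction declared above (illustrative only):

#include <sycl/sycl.hpp>

int main() {
  sycl::queue Q;
  sycl::buffer<int, 1> SumBuf{sycl::range<1>{1}};
  Q.submit([&](sycl::handler &CGH) {
    auto Red = sycl::reduction(
        SumBuf, CGH, sycl::plus<int>{},
        sycl::property_list{
            sycl::property::reduction::initialize_to_identity{}});
    CGH.parallel_for(sycl::range<1>{1024}, Red,
                     [=](sycl::item<1> It, auto &Sum) {
                       Sum += static_cast<int>(It.get_linear_id());
                     });
  });
  sycl::host_accessor Result{SumBuf};
  return Result[0] == 523776 ? 0 : 1; // 0 + 1 + ... + 1023 == 523776
}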
The file contains implementations of accessor class.
Defines a shared array that can be used by kernels in queues.
Definition: buffer.hpp:168
Helper class for accessing internal reducer member functions.
Definition: reduction.hpp:176
static constexpr auto getIdentityStatic()
Definition: reduction.hpp:194
ReducerAccess(ReducerT &ReducerRef)
Definition: reduction.hpp:178
constexpr T & operator*() noexcept
Definition: reduction.hpp:483
ReducerElement & combine(BinaryOperation BinOp, const T &OtherValue)
Definition: reduction.hpp:464
ReducerElement & combine(BinaryOperation BinOp, const ReducerElement &Other)
Definition: reduction.hpp:472
ReducerElement(const ReductionIdentityContainer< T, BinaryOperation, ExplicitIdentity > &IdentityContainer)
Definition: reduction.hpp:459
constexpr const T & operator*() const noexcept
Definition: reduction.hpp:489
Templated class for common functionality of all reduction implementation classes.
Definition: reduction.hpp:384
This class is the default KernelName template parameter type for kernel invocation APIs such as singl...
Definition: kernel.hpp:45
Use CRTP to avoid redefining shorthand operators in terms of combine.
Definition: reduction.hpp:223
std::enable_if_t< BasicCheck< _T, Space, _BinaryOperation > &&(IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value||IsReduOptForAtomic64Op< _T, _BinaryOperation >::value) &&IsMaximum< _T, _BinaryOperation >::value > atomic_combine(_T *ReduVarPtr) const
Atomic MAX operation: *ReduVarPtr = sycl::maximum(*ReduVarPtr, MValue);.
Definition: reduction.hpp:374
std::enable_if_t< BasicCheck< _T, Space, _BinaryOperation > &&(IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value||IsReduOptForAtomic64Op< _T, _BinaryOperation >::value) &&IsPlus< _T, _BinaryOperation >::value > atomic_combine(_T *ReduVarPtr) const
Atomic ADD operation: *ReduVarPtr += MValue;.
Definition: reduction.hpp:315
std::enable_if_t< BasicCheck< _T, Space, _BinaryOperation > &&IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value &&IsBitOR< _T, _BinaryOperation >::value > atomic_combine(_T *ReduVarPtr) const
Atomic BITWISE OR operation: *ReduVarPtr |= MValue;.
Definition: reduction.hpp:326
std::enable_if_t< std::is_same_v< remove_decoration_t< _T >, _T > &&IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value &&IsBitAND< _T, _BinaryOperation >::value &&(Space==access::address_space::global_space||Space==access::address_space::local_space)> atomic_combine(_T *ReduVarPtr) const
Atomic BITWISE AND operation: *ReduVarPtr &= MValue;.
Definition: reduction.hpp:350
std::enable_if_t< BasicCheck< _T, Space, _BinaryOperation > &&(IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value||IsReduOptForAtomic64Op< _T, _BinaryOperation >::value) &&IsMinimum< _T, _BinaryOperation >::value > atomic_combine(_T *ReduVarPtr) const
Atomic MIN operation: *ReduVarPtr = sycl::minimum(*ReduVarPtr, MValue);.
Definition: reduction.hpp:362
std::enable_if_t< BasicCheck< _T, Space, _BinaryOperation > &&IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value &&IsBitXOR< _T, _BinaryOperation >::value > atomic_combine(_T *ReduVarPtr) const
Atomic BITWISE XOR operation: *ReduVarPtr ^= MValue;.
Definition: reduction.hpp:337
static constexpr int dimensions
Definition: reduction.hpp:523
const identity_container_type & getIdentityContainer()
Definition: reduction.hpp:1045
auto & getTempBuffer(size_t Size, handler &CGH)
Definition: reduction.hpp:928
BinaryOperation getBinaryOperation() const
Returns the binary operation associated with the reduction.
Definition: reduction.hpp:1077
static constexpr bool has_fast_reduce
Definition: reduction.hpp:878
typename ReducerTraits< reducer_type >::element_type reducer_element_type
Definition: reduction.hpp:869
std::enable_if_t<!HasIdentity > withInitializedMem(handler &CGH, FuncTy Func)
Definition: reduction.hpp:1038
static constexpr bool has_fast_atomics
Definition: reduction.hpp:876
auto getWriteMemForPartialReds(size_t Size, handler &CGH)
Definition: reduction.hpp:915
static constexpr bool has_float64_atomics
Definition: reduction.hpp:874
static constexpr size_t num_elements
Definition: reduction.hpp:883
std::enable_if_t< HasIdentity > withInitializedMem(handler &CGH, FuncTy Func)
Provide Func with a properly initialized memory to write the reduction result to.
Definition: reduction.hpp:973
static constexpr bool is_known_identity
Definition: reduction.hpp:858
reduction_impl_algo(BinaryOperation BinaryOp, bool Init, RedOutVar RedOut, std::enable_if_t<!IsKnownIdentityOp< RelayT, RelayBinaryOperation >::value, int >=0)
Definition: reduction.hpp:902
reduction_impl_algo(const T &Identity, BinaryOperation BinaryOp, bool Init, RedOutVar RedOut)
Definition: reduction.hpp:885
reduction_impl_algo(BinaryOperation BinaryOp, bool Init, RedOutVar RedOut, std::enable_if_t< IsKnownIdentityOp< RelayT, RelayBinaryOperation >::value, int >=0)
Definition: reduction.hpp:892
auto getWriteAccForPartialReds(size_t Size, handler &CGH)
Returns an accessor accessing the memory that will hold the reduction partial sums.
Definition: reduction.hpp:940
auto getReadAccToPreviousPartialReds(handler &CGH) const
Definition: reduction.hpp:909
static constexpr bool has_identity
Definition: reduction.hpp:860
accessor< int, 1, access::mode::read_write, access::target::device, access::placeholder::false_t > getReadWriteAccessorToInitializedGroupsCounter(handler &CGH)
Definition: reduction.hpp:1051
ReductionIdentityContainer< T, BinaryOperation, ExplicitIdentity > identity_container_type
Definition: reduction.hpp:863
auto getGroupsCounterAccDiscrete(handler &CGH)
Definition: reduction.hpp:1062
Base non-template class which is a base class for all reduction implementation classes.
This class encapsulates the reduction variable/accessor, the reduction operator and an optional opera...
Definition: reduction.hpp:1110
reduction_impl(RedOutVar Var, BinaryOperation BOp, bool InitializeToIdentity=false)
Constructs reduction_impl when no identity is specified.
Definition: reduction.hpp:1128
static constexpr bool is_known_identity
Definition: reduction.hpp:858
static constexpr bool is_usm
Definition: reduction.hpp:881
reduction_impl(RedOutVar &Var, const T &Identity, BinaryOperation BOp, bool InitializeToIdentity)
Constructs reduction_impl with an explicit identity value.
Definition: reduction.hpp:1146
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Definition: device.hpp:64
detail::is_device_info_desc< Param >::return_type get_info() const
Queries this SYCL device for information requested by the template parameter param.
Definition: device.hpp:223
bool has(aspect Aspect) const __SYCL_WARN_IMAGE_ASPECT(Aspect)
Indicates if the SYCL device has the given feature.
Definition: device.cpp:219
Command group handler class.
Definition: handler.hpp:458
void depends_on(event Event)
Registers event dependencies on this command group.
Definition: handler.cpp:1376
void single_task(_KERNELFUNCPARAM(KernelFunc))
Defines and invokes a SYCL kernel function as a function object type.
Definition: handler.hpp:2008
void parallel_for(range< 1 > NumWorkItems, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2014
void copy(accessor< T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder > Src, std::shared_ptr< T_Dst > Dst)
Copies the content of memory object accessed by Src into the memory pointed by Dst.
Definition: handler.hpp:2594
Provides a cross-platform math array class template that works on SYCL devices as well as in host C++...
Definition: marray.hpp:49
Identifies an instance of the function object executing at each point in an nd_range.
Definition: nd_item.hpp:48
size_t get_local_linear_id() const
Definition: nd_item.hpp:101
size_t __SYCL_ALWAYS_INLINE get_group_linear_id() const
Definition: nd_item.hpp:132
range< Dimensions > get_local_range() const
Definition: nd_item.hpp:180
size_t __SYCL_ALWAYS_INLINE get_global_linear_id() const
Definition: nd_item.hpp:68
void barrier(access::fence_space accessSpace=access::fence_space::global_and_local) const
Definition: nd_item.hpp:212
Defines the iteration domain of both the work-groups and the overall dispatch.
Definition: nd_range.hpp:22
range< Dimensions > get_local_range() const
Definition: nd_range.hpp:45
range< Dimensions > get_group_range() const
Definition: nd_range.hpp:47
Objects of the property_list class are containers for the SYCL properties.
Encapsulates a single SYCL queue which schedules kernels on a SYCL device.
Definition: queue.hpp:111
event memset(void *Ptr, int Value, size_t Count, const detail::code_location &CodeLoc=detail::code_location::current())
Fills the memory pointed to by a USM pointer with the specified value.
Definition: queue.cpp:106
device get_device() const
Definition: queue.cpp:76
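For illustration: memset operates on bytes, so it pairs naturally with USM allocations; a minimal sketch with an illustrative size, guarded by the corresponding aspect.

    #include <sycl/sycl.hpp>

    int main() {
      sycl::queue Q;
      if (!Q.get_device().has(sycl::aspect::usm_device_allocations))
        return 0; // malloc_device is unsupported on this device
      int *Ptr = sycl::malloc_device<int>(1024, Q);
      Q.memset(Ptr, 0, 1024 * sizeof(int)).wait(); // byte-wise fill
      sycl::free(Ptr, Q);
    }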
size_t size() const
Definition: range.hpp:56
Class used to represent the objects that are passed to the user's lambda functions and that represent the user's reduction variable.
Definition: reduction.hpp:78
constexpr _SYCL_SPAN_INLINE_VISIBILITY pointer data() const noexcept
Definition: sycl_span.hpp:378
constexpr tuple_element< I, tuple< Types... > >::type & get(sycl::detail::tuple< Types... > &Arg) noexcept
Definition: tuple.hpp:198
item< Dims, false > getDelinearizedItem(range< Dims > Range, id< Dims > Id)
void withAuxHandler(handler &CGH, FunctorTy Func)
Definition: reduction.hpp:1166
void finalizeHandler(handler &CGH)
Definition: reduction.hpp:1165
void writeReduSumsToOutAccs(size_t OutAccIndex, ReduTupleT< OutAccT... > OutAccs, ReduTupleT< LocalAccT... > LocalAccs, ReduTupleT< BOPsT... > BOPs, ReduTupleT< Ts... > IdentityVals, std::array< bool, sizeof...(Reductions)> IsInitializeToIdentity, std::index_sequence< Is... >)
Definition: reduction.hpp:2066
size_t reduGetMemPerWorkItem(std::tuple< ReductionT... > &ReduTuple, std::index_sequence< Is... >)
Definition: reduction.hpp:2583
device getDeviceFromHandler(handler &CommandGroupHandlerRef)
Definition: accessor.cpp:16
void doTreeReductionHelper(size_t WorkSize, size_t LID, FuncTy Func)
Definition: reduction.hpp:1394
void reduCGFuncImplScalar(nd_item< Dims > NDIt, ReduTupleT< LocalAccT... > LocalAccsTuple, ReduTupleT< OutAccT... > OutAccsTuple, std::tuple< ReducerT... > &ReducersTuple, ReduTupleT< Ts... > IdentitiesTuple, ReduTupleT< BOPsT... > BOPsTuple, std::array< bool, sizeof...(Reductions)> InitToIdentityProps, std::index_sequence< Is... > ReduIndices)
All scalar reductions are processed together; there is one loop of log2(N) steps, and each reduction uses its own accessor.
Definition: reduction.hpp:2197
uint32_t reduGetMaxNumConcurrentWorkGroups(std::shared_ptr< queue_impl > Queue)
size_t reduGetMaxWGSize(std::shared_ptr< queue_impl > Queue, size_t LocalMemBytesPerWorkItem)
void reduAuxCGFuncImplArray(nd_item< Dims > NDIt, size_t LID, size_t GID, size_t RemainingWorkSize, ReduTupleT< LocalAccT... > LocalAccsTuple, ReduTupleT< InAccT... > InAccsTuple, ReduTupleT< OutAccT... > OutAccsTuple, ReduTupleT< Ts... > IdentitiesTuple, ReduTupleT< BOPsT... > BOPsTuple, std::array< bool, sizeof...(Reductions)> InitToIdentityProps, std::index_sequence< Is... >)
Definition: reduction.hpp:2476
std::bool_constant< std::is_same_v< BinaryOperation, sycl::maximum< T > >||std::is_same_v< BinaryOperation, sycl::maximum< void > >> IsMaximum
void reduCGFuncImplArrayHelper(nd_item< Dims > NDIt, LocalAccT LocalReds, OutAccT Out, ReducerT &Reducer, BOPT BOp, bool IsInitializeToIdentity)
Each array reduction is processed separately.
Definition: reduction.hpp:2229
std::bool_constant< std::is_same_v< BinaryOperation, sycl::bit_or< T > >||std::is_same_v< BinaryOperation, sycl::bit_or< void > >> IsBitOR
std::bool_constant< IsZeroIdentityOp< T, BinaryOperation >::value||IsOneIdentityOp< T, BinaryOperation >::value||IsOnesIdentityOp< T, BinaryOperation >::value||IsMinimumIdentityOp< T, BinaryOperation >::value||IsMaximumIdentityOp< T, BinaryOperation >::value||IsFalseIdentityOp< T, BinaryOperation >::value||IsTrueIdentityOp< T, BinaryOperation >::value > IsKnownIdentityOp
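These detail traits underpin the public known-identity machinery; for illustration, identities for known operations can be queried through the SYCL 2020 interface:

    #include <sycl/sycl.hpp>
    #include <iostream>

    int main() {
      static_assert(sycl::has_known_identity_v<sycl::plus<int>, int>);
      // 0 for plus<int>, 1 for multiplies<int>, etc.
      std::cout << sycl::known_identity_v<sycl::plus<int>, int> << " "
                << sycl::known_identity_v<sycl::multiplies<int>, int> << "\n";
    }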
id< 1 > getDelinearizedId(const range< 1 > &, size_t Index)
Definition: id.hpp:313
void reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu)
Copies the final reduction result kept in a read-write accessor to the user's USM memory.
Definition: reduction.hpp:1183
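A sketch of the general pattern only, with the reduction internals replaced by a plain buffer and a raw USM pointer; the actual helper at reduction.hpp:1183 operates on the Reduction object.

    #include <sycl/sycl.hpp>

    // Hypothetical stand-in: copy the single reduction value to user memory.
    template <typename T>
    void saveResultToUserMem(sycl::handler &CGH, sycl::buffer<T, 1> &Result,
                             T *UserUSMPtr) {
      sycl::accessor Acc(Result, CGH, sycl::read_only);
      CGH.copy(Acc, UserUSMPtr); // one element: the final reduction value
    }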
void reduAuxCGFuncImplArrayHelper(nd_item< Dims > NDIt, size_t LID, size_t GID, size_t RemainingWorkSize, LocalAccT LocalReds, InAccT In, OutAccT Out, T IdentityContainer, BOPT BOp, bool IsInitializeToIdentity)
Definition: reduction.hpp:2428
void doTreeReductionOnTuple(size_t WorkSize, size_t LID, ReduTupleT< LocalAccT... > &LocalAccs, ReduTupleT< BOPsT... > &BOPs, std::index_sequence< Is... >)
Definition: reduction.hpp:1463
std::bool_constant<((is_sgenfloat_v< T > &&sizeof(T)==4)||is_sgeninteger_v< T >)&&IsValidAtomicType< T >::value &&(IsPlus< T, BinaryOperation >::value||IsMinimum< T, BinaryOperation >::value||IsMaximum< T, BinaryOperation >::value||IsBitOR< T, BinaryOperation >::value||IsBitXOR< T, BinaryOperation >::value||IsBitAND< T, BinaryOperation >::value)> IsReduOptForFastAtomicFetch
Definition: reduction.hpp:101
void associateReduAccsWithHandler(handler &CGH, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
Definition: reduction.hpp:2383
size_t GreatestPowerOfTwo(size_t N)
Computes the greatest power-of-two less than or equal to N.
Definition: reduction.hpp:1383
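A plausible standalone implementation of such a helper (a sketch, not the code at reduction.hpp:1383):

    #include <cstddef>

    // Greatest power of two <= N; 0 maps to 0.
    size_t greatestPowerOfTwoLE(size_t N) {
      if (N == 0)
        return 0;
      size_t P = 1;
      while (P <= N / 2) // comparing against N / 2 avoids overflow of P * 2
        P *= 2;
      return P;
    }
    // greatestPowerOfTwoLE(48) == 32, greatestPowerOfTwoLE(64) == 64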
void doTreeReduction(size_t WorkSize, nd_item< Dim > NDIt, LocalRedsTy &LocalReds, BinOpTy &BOp, AccessFuncTy AccessFunc)
Definition: reduction.hpp:1428
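Conceptually, a work-group tree reduction halves the active range log2(WorkSize) times, which is why the power-of-two helper above matters; a sketch with hypothetical names, assuming WorkSize is a power of two and LocalReds is pre-filled with per-work-item partial values.

    #include <sycl/sycl.hpp>

    template <typename T, typename BinOp>
    void treeReduce(size_t WorkSize, sycl::nd_item<1> NDIt,
                    sycl::local_accessor<T, 1> &LocalReds, BinOp BOp) {
      size_t LID = NDIt.get_local_linear_id();
      for (size_t Step = WorkSize / 2; Step > 0; Step /= 2) {
        if (LID < Step)
          LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + Step]);
        NDIt.barrier(sycl::access::fence_space::local_space);
      }
      // LocalReds[0] now holds the work-group's combined value.
    }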
std::conditional_t< std::is_same_v< KernelName, auto_name >, auto_name, reduction::InitMemKrn< KernelName > > __sycl_init_mem_for
A helper to pass undefined (sycl::detail::auto_name) names unmodified.
Definition: reduction.hpp:828
std::bool_constant< std::is_same_v< BinaryOperation, sycl::multiplies< T > >||std::is_same_v< BinaryOperation, sycl::multiplies< void > >> IsMultiplies
size_t reduGetMemPerWorkItemHelper(Reduction &)
Definition: reduction.hpp:2572
std::bool_constant<((is_sgeninteger_v< T > &&(sizeof(T)==4||sizeof(T)==8))||is_sgenfloat_v< T >)&&(IsPlus< T, BinaryOperation >::value||IsMinimum< T, BinaryOperation >::value||IsMaximum< T, BinaryOperation >::value)> IsReduOptForFastReduce
Definition: reduction.hpp:137
auto getReducerAccess(ReducerT &Reducer)
Definition: reduction.hpp:208
std::bool_constant<(IsPlus< T, BinaryOperation >::value||IsMinimum< T, BinaryOperation >::value||IsMaximum< T, BinaryOperation >::value) &&is_sgenfloat_v< T > &&sizeof(T)==8 > IsReduOptForAtomic64Op
Definition: reduction.hpp:120
constexpr auto filterSequenceHelper(FunctorT, std::index_sequence< Is... >)
For each index 'I' from the given indices pack 'Is' this function initially creates a number of short index sequences, each either empty or holding 'I', and then concatenates them into the result sequence.
Definition: reduction.hpp:2136
void reduAuxCGFuncImplScalar(nd_item< Dims > NDIt, size_t LID, size_t GID, size_t RemainingWorkSize, ReduTupleT< LocalAccT... > LocalAccsTuple, ReduTupleT< InAccT... > InAccsTuple, ReduTupleT< OutAccT... > OutAccsTuple, ReduTupleT< Ts... > IdentitiesTuple, ReduTupleT< BOPsT... > BOPsTuple, std::array< bool, sizeof...(Reductions)> InitToIdentityProps, std::index_sequence< Is... > ReduIndices)
All scalar reductions are processed together; there is one loop of log2(N) steps, and each reduction uses its own accessor.
Definition: reduction.hpp:2398
std::tuple< std::tuple_element_t< Is, TupleT >... > tuple_select_elements(TupleT Tuple, std::index_sequence< Is... >)
Utility function: for the given tuple, returns a new tuple consisting of the elements at the indices 'Is'.
Definition: reduction.hpp:2592
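An equivalent standalone utility over std::tuple, for illustration:

    #include <cstddef>
    #include <tuple>
    #include <utility>

    template <typename TupleT, std::size_t... Is>
    auto selectElements(TupleT Tuple, std::index_sequence<Is...>) {
      return std::make_tuple(std::get<Is>(Tuple)...);
    }
    // selectElements(std::make_tuple(1, 2.0, 'c'), std::index_sequence<0, 2>{})
    // yields std::tuple<int, char>{1, 'c'}.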
constexpr tuple< Ts... > make_tuple(Ts... Args)
Definition: tuple.hpp:35
size_t reduGetPreferredWGSize(std::shared_ptr< queue_impl > &Queue, size_t LocalMemBytesPerWorkItem)
Definition: reduction.cpp:115
std::bool_constant< std::is_same_v< BinaryOperation, sycl::plus< T > >||std::is_same_v< BinaryOperation, sycl::plus< void > >> IsPlus
size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... > ReduIndices)
Definition: reduction.hpp:2498
constexpr auto filterSequence(FunctorT F, std::index_sequence< Is... > Indices)
For each index 'I' from the given indices pack 'Is' this function returns an index sequence consisting only of those indices for which the functor 'F' returns true.
Definition: reduction.hpp:2152
void reduCGFuncImplArray(nd_item< Dims > NDIt, ReduTupleT< LocalAccT... > LocalAccsTuple, ReduTupleT< OutAccT... > OutAccsTuple, std::tuple< ReducerT... > &ReducersTuple, ReduTupleT< BOPsT... > BOPsTuple, std::array< bool, sizeof...(Reductions)> InitToIdentityProps, std::index_sequence< Is... >)
Definition: reduction.hpp:2279
void workGroupBarrier()
Definition: group.hpp:42
ReduTupleT< Ts... > makeReduTupleT(Ts... Elements)
Definition: reduction.hpp:145
std::conditional_t< std::is_same_v< KernelName, auto_name >, auto_name, MainOrAux< KernelName, Strategy, Ts... > > __sycl_reduction_kernel
A helper to pass undefined (sycl::detail::auto_name) names unmodified.
Definition: reduction.hpp:1218
std::bool_constant< std::is_same_v< BinaryOperation, sycl::minimum< T > >||std::is_same_v< BinaryOperation, sycl::minimum< void > >> IsMinimum
void reduction_parallel_for(handler &CGH, range< Dims > NDRange, PropertiesT Properties, RestT... Rest)
Definition: reduction.hpp:2711
size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize, size_t &NWorkGroups)
Definition: reduction.cpp:19
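A plausible shape for this computation, shown as a sketch (the actual code in reduction.cpp may balance the distribution differently):

    #include <algorithm>
    #include <cstddef>

    size_t computeWGSize(size_t NWorkItems, size_t MaxWGSize,
                         size_t &NWorkGroups) {
      NWorkGroups = (NWorkItems + MaxWGSize - 1) / MaxWGSize; // ceiling division
      return std::min(NWorkItems, MaxWGSize);
    }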
constexpr std::index_sequence concat_sequences(std::index_sequence<>)
Definition: reduction.hpp:2088
std::bool_constant< std::is_same_v< BinaryOperation, sycl::bit_and< T > >||std::is_same_v< BinaryOperation, sycl::bit_and< void > >> IsBitAND
auto createReduOutAccs(size_t NWorkGroups, handler &CGH, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
For the given 'Reductions' types pack and indices enumerating them this function either creates new temporary accessors for partial sums or reuses the user's output accessors/USM pointers.
Definition: reduction.hpp:2039
auto getLastCombine(OutAccT OutAcc, LocalAccT LocalAcc, BOPT BOP, IdentityContainerT IdentityContainer, bool IsInitializeToIdentity)
Definition: reduction.hpp:2051
std::bool_constant< std::is_same_v< BinaryOperation, sycl::bit_xor< T > >||std::is_same_v< BinaryOperation, sycl::bit_xor< void > >> IsBitXOR
typename tuple_element< I, T >::type tuple_element_t
Definition: tuple.hpp:55
constexpr auto makeAdjustedBOPs(ReduTupleT< BOPsT... > &BOPsTuple, std::index_sequence< Is... >)
Definition: reduction.hpp:2178
void reduCGFuncMulti(handler &CGH, KernelType KernelFunc, const nd_range< Dims > &Range, PropertiesT Properties, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... > ReduIndices)
Definition: reduction.hpp:2298
constexpr auto makeAdjustedBOP(BOPT &BOP)
Definition: reduction.hpp:2171
auto make_reduction(RedOutVar RedVar, RestTy &&...Rest)
Definition: reduction.hpp:1158
sycl::detail::tuple< Ts... > ReduTupleT
Definition: reduction.hpp:144
constexpr mode_tag_t< access_mode::read > read_only
Definition: access.hpp:84
constexpr property::no_init no_init
constexpr mode_tag_t< access_mode::read_write > read_write
Definition: access.hpp:85
std::enable_if_t<(is_group_v< std::decay_t< Group >> &&(detail::is_scalar_arithmetic< T >::value||(detail::is_complex< T >::value &&detail::is_multiplies< T, BinaryOperation >::value)) &&detail::is_native_op< T, BinaryOperation >::value), T > reduce_over_group(Group g, T x, BinaryOperation binary_op)
auto reduction(buffer< T, 1, AllocatorT > Var, handler &CGH, BinaryOperation Combiner, const property_list &PropList={})
Constructs a reduction object using the given buffer Var, handler CGH, reduction operation Combiner, and optional reduction properties PropList.
Definition: reduction.hpp:2837
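End-to-end usage of this overload through the public interface; the range, operation, and the initialize_to_identity property are illustrative choices.

    #include <sycl/sycl.hpp>
    #include <iostream>

    int main() {
      sycl::queue Q;
      sycl::buffer<int, 1> SumBuf{sycl::range<1>(1)};
      Q.submit([&](sycl::handler &CGH) {
        auto Sum = sycl::reduction(
            SumBuf, CGH, sycl::plus<int>(),
            sycl::property_list{
                sycl::property::reduction::initialize_to_identity{}});
        CGH.parallel_for(sycl::range<1>(1024), Sum,
                         [=](sycl::id<1> I, auto &Red) { Red += int(I[0]); });
      });
      sycl::host_accessor Result(SumBuf, sycl::read_only);
      std::cout << Result[0] << "\n"; // 523776 == 1023 * 1024 / 2
    }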
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using E and sycl_category().
Definition: exception.cpp:87
const void value_type
Definition: multi_ptr.hpp:457
void free(void *ptr, const context &ctxt, const detail::code_location &CodeLoc=detail::code_location::current())
Definition: usm_impl.cpp:374
Definition: access.hpp:18
std::conditional_t< Cond, std::index_sequence< I >, std::index_sequence<> > type
Definition: reduction.hpp:2126
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
Definition: reduction.hpp:2643
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, RestT... Rest)
Definition: reduction.hpp:2690
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
Definition: reduction.hpp:1824
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
Definition: reduction.hpp:1589
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
Definition: reduction.hpp:1279
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
Definition: reduction.hpp:1686
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
Definition: reduction.hpp:1228
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
Definition: reduction.hpp:1625
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, RestT... Rest)
Definition: reduction.hpp:2599
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
Definition: reduction.hpp:1478
const IdentityContainerT & IdentityContainer
Definition: reduction.hpp:529
const BinaryOperation BOp
Definition: reduction.hpp:530
Helper class for accessing reducer-defined types in CRTP. May prove to be useful for other things later.
Definition: reduction.hpp:161
static constexpr int value
Definition: reduction.hpp:797
is_device_copyable is a user-specializable class template to indicate that a type T is device copyable.