DPC++ Runtime
Runtime libraries for oneAPI DPC++
reduction.hpp
1 //==---------------- reduction.hpp - SYCL reduction ------------*- C++ -*---==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 // ===--------------------------------------------------------------------=== //
8 
9 #pragma once
10 
11 #include <sycl/access/access.hpp> // for address_s...
12 #include <sycl/accessor.hpp> // for local_acc...
13 #include <sycl/aspects.hpp> // for aspect
14 #include <sycl/atomic.hpp> // for IsValidAt...
15 #include <sycl/atomic_ref.hpp> // for atomic_ref
16 #include <sycl/buffer.hpp> // for buffer
17 #include <sycl/builtins.hpp> // for min
18 #include <sycl/detail/export.hpp> // for __SYCL_EX...
19 #include <sycl/detail/generic_type_traits.hpp> // for is_sgenfloat
20 #include <sycl/detail/impl_utils.hpp> // for createSyc...
21 #include <sycl/detail/item_base.hpp> // for id
22 #include <sycl/detail/reduction_forward.hpp> // for strategy
23 #include <sycl/detail/tuple.hpp> // for make_tuple
24 #include <sycl/device.hpp> // for device
25 #include <sycl/event.hpp> // for event
26 #include <sycl/exception.hpp> // for make_erro...
27 #include <sycl/exception_list.hpp> // for queue_impl
29 #include <sycl/group.hpp> // for workGroup...
30 #include <sycl/group_algorithm.hpp> // for reduce_ov...
31 #include <sycl/handler.hpp> // for handler
32 #include <sycl/id.hpp> // for getDeline...
33 #include <sycl/kernel.hpp> // for auto_name
34 #include <sycl/known_identity.hpp> // for IsKnownId...
35 #include <sycl/marray.hpp> // for marray
36 #include <sycl/memory_enums.hpp> // for memory_order
37 #include <sycl/multi_ptr.hpp> // for address_s...
38 #include <sycl/nd_item.hpp> // for nd_item
39 #include <sycl/nd_range.hpp> // for nd_range
40 #include <sycl/properties/accessor_properties.hpp> // for no_init
41 #include <sycl/properties/reduction_properties.hpp> // for initializ...
42 #include <sycl/property_list.hpp> // for property_...
43 #include <sycl/queue.hpp> // for queue
44 #include <sycl/range.hpp> // for range
45 #include <sycl/sycl_span.hpp> // for dynamic_e...
46 #include <sycl/usm.hpp> // for malloc_de...
47 
48 #include <algorithm> // for min
49 #include <array> // for array
50 #include <assert.h> // for assert
51 #include <cstddef> // for size_t
52 #include <memory> // for shared_ptr
53 #include <optional> // for nullopt
54 #include <stdint.h> // for uint32_t
55 #include <string> // for operator+
56 #include <tuple> // for _Swallow_...
57 #include <type_traits> // for enable_if_t
58 #include <utility> // for index_seq...
59 #include <variant> // for tuple
60 
61 namespace sycl {
62 inline namespace _V1 {
76 template <typename T, class BinaryOperation, int Dims, size_t Extent,
77  typename IdentityContainerT, bool View = false, typename Subst = void>
78 class reducer;
79 
80 namespace detail {
81 // This type trait is used to detect if the atomic operation BinaryOperation
82 // used with operands of type T is available for use in a reduction.
83 // The order in which the atomic operations are performed may be arbitrary and
84 // thus may cause different results from run to run even on the same elements
85 // and on the same device. The macro SYCL_REDUCTION_DETERMINISTIC prohibits
86 // using atomic operations for reduction and helps to produce stable results.
87 // SYCL_REDUCTION_DETERMINISTIC is a short-term solution, which may eventually
88 // be deprecated and replaced by a SYCL property passed to the reduction.
89 template <typename T, class BinaryOperation>
90 using IsReduOptForFastAtomicFetch =
91 #ifdef SYCL_REDUCTION_DETERMINISTIC
92  std::bool_constant<false>;
93 #else
94  std::bool_constant<((is_sgenfloat_v<T> && sizeof(T) == 4) ||
95  is_sgeninteger_v<T>) &&
96  IsValidAtomicType<T>::value &&
97  (IsPlus<T, BinaryOperation>::value ||
98  IsMinimum<T, BinaryOperation>::value ||
99  IsMaximum<T, BinaryOperation>::value ||
100  IsBitOR<T, BinaryOperation>::value ||
101  IsBitXOR<T, BinaryOperation>::value ||
102  IsBitAND<T, BinaryOperation>::value)>;
103 #endif
104 
105 // This type trait is used to detect if the atomic operation BinaryOperation
106 // used with operands of type T is available for use in a reduction, in
107 // addition to the cases covered by "IsReduOptForFastAtomicFetch", if the device
108 // has the atomic64 aspect. This type trait should only be used if the device
109 // has the atomic64 aspect. Note that this type trait is currently a subset of
110 // IsReduOptForFastReduce. The macro SYCL_REDUCTION_DETERMINISTIC disables
111 // these atomic operations so that stable results are produced across devices
112 // of the same type.
113 template <typename T, class BinaryOperation>
114 using IsReduOptForAtomic64Op =
115 #ifdef SYCL_REDUCTION_DETERMINISTIC
116  std::bool_constant<false>;
117 #else
118  std::bool_constant<(IsPlus<T, BinaryOperation>::value ||
119  IsMinimum<T, BinaryOperation>::value ||
120  IsMaximum<T, BinaryOperation>::value) &&
121  is_sgenfloat_v<T> && sizeof(T) == 8>;
122 #endif
123 
124 // This type trait is used to detect if the group algorithm reduce() used with
125 // operands of type T and the operation BinaryOperation is available
126 // for use in a reduction.
127 // The macro SYCL_REDUCTION_DETERMINISTIC prohibits using the reduce() algorithm
128 // to produce stable results across devices of the same type.
129 template <typename T, class BinaryOperation>
130 using IsReduOptForFastReduce =
131 #ifdef SYCL_REDUCTION_DETERMINISTIC
132  std::bool_constant<false>;
133 #else
134  std::bool_constant<((is_sgeninteger_v<T> &&
135  (sizeof(T) == 4 || sizeof(T) == 8)) ||
136  is_sgenfloat_v<T>) &&
137  (IsPlus<T, BinaryOperation>::value ||
138  IsMinimum<T, BinaryOperation>::value ||
139  IsMaximum<T, BinaryOperation>::value)>;
140 #endif
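For orientation, the following sketch shows how these three traits resolve for some common type/operation pairs. It is illustrative only: the traits live in the internal sycl::detail namespace, and the results assume a build where SYCL_REDUCTION_DETERMINISTIC is not defined.

 // Illustrative compile-time checks; not part of this header.
 static_assert(sycl::detail::IsReduOptForFastAtomicFetch<
               int, sycl::plus<int>>::value);       // 32-bit integer: atomic path
 static_assert(sycl::detail::IsReduOptForAtomic64Op<
               double, sycl::plus<double>>::value); // needs aspect::atomic64
 static_assert(sycl::detail::IsReduOptForFastReduce<
               float, sycl::plus<float>>::value);   // reduce_over_group() path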
141 
142 // std::tuple seems to be a) too heavy and b) not copyable to device now.
143 // Thus sycl::detail::tuple is used instead.
144 // Switching from sycl::detail::tuple to std::tuple can be done by re-defining
145 // the ReduTupleT type and makeReduTupleT() function below.
146 template <typename... Ts> using ReduTupleT = sycl::detail::tuple<Ts...>;
147 template <typename... Ts> ReduTupleT<Ts...> makeReduTupleT(Ts... Elements) {
148  return sycl::detail::make_tuple(Elements...);
149 }
150 
151 __SYCL_EXPORT size_t reduGetMaxWGSize(std::shared_ptr<queue_impl> Queue,
152  size_t LocalMemBytesPerWorkItem);
153 __SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
154  size_t &NWorkGroups);
155 __SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
156  size_t LocalMemBytesPerWorkItem);
157 
158 template <typename T, class BinaryOperation, bool IsOptional>
159 class ReducerElement;
160 
161 // Helper class for accessing reducer-defined types in CRTP
162 // May prove to be useful for other things later
163 template <typename Reducer> struct ReducerTraits;
164 
165 template <typename T, class BinaryOperation, int Dims, std::size_t Extent,
166  typename IdentityContainerT, bool View, typename Subst>
167 struct ReducerTraits<reducer<T, BinaryOperation, Dims, Extent,
168  IdentityContainerT, View, Subst>> {
169  using type = T;
170  using op = BinaryOperation;
171  static constexpr int dims = Dims;
172  static constexpr size_t extent = Extent;
173  static constexpr bool has_identity = IdentityContainerT::has_identity;
174  using element_type = ReducerElement<T, BinaryOperation, !has_identity>;
175 };
176 
177 /// Helper class for accessing internal reducer member functions.
178 template <typename ReducerT> class ReducerAccess {
179 public:
180  ReducerAccess(ReducerT &ReducerRef) : MReducerRef(ReducerRef) {}
181 
182  template <typename ReducerRelayT = ReducerT> auto &getElement(size_t E) {
183  return MReducerRef.getElement(E);
184  }
185 
186  template <typename ReducerRelayT = ReducerT> constexpr auto getIdentity() {
187  static_assert(ReducerTraits<ReducerRelayT>::has_identity,
188  "Identity unavailable.");
189  return MReducerRef.getIdentity();
190  }
191 
192  // MSVC does not like static overloads of non-static functions, even if they
193  // are made mutually exclusive through SFINAE. Instead we use a new static
194  // function to be used when a static function is needed.
195  template <typename ReducerRelayT = ReducerT>
196  static constexpr auto getIdentityStatic() {
197  static_assert(
198  IsKnownIdentityOp<typename ReducerTraits<ReducerRelayT>::type,
199  typename ReducerTraits<ReducerRelayT>::op>::value,
200  "Static identity unavailable.");
201  return ReducerT::getIdentity();
202  }
203 
204 private:
205  ReducerT &MReducerRef;
206 };
207 
208 // Helper function to simplify the use of ReducerAccess. This avoids the need
209 // for potentially unsupported deduction guides.
210 template <typename ReducerT> auto getReducerAccess(ReducerT &Reducer) {
211  return ReducerAccess<ReducerT>{Reducer};
212 }
213 
225 template <class Reducer> class combiner {
226  using Ty = typename ReducerTraits<Reducer>::type;
227  using BinaryOp = typename ReducerTraits<Reducer>::op;
228  static constexpr int Dims = ReducerTraits<Reducer>::dims;
229  static constexpr size_t Extent = ReducerTraits<Reducer>::extent;
230 
231 public:
232  template <typename _T = Ty, int _Dims = Dims>
233  std::enable_if_t<(_Dims == 0) && IsPlus<_T, BinaryOp>::value &&
234  is_geninteger_v<_T>,
235  Reducer &>
236  operator++() {
237  return static_cast<Reducer *>(this)->combine(static_cast<_T>(1));
238  }
239 
240  template <typename _T = Ty, int _Dims = Dims>
241  std::enable_if_t<(_Dims == 0) && IsPlus<_T, BinaryOp>::value &&
242  is_geninteger_v<_T>,
243  Reducer &>
244  operator++(int) {
245  return static_cast<Reducer *>(this)->combine(static_cast<_T>(1));
246  }
247 
248  template <typename _T = Ty, int _Dims = Dims>
249  std::enable_if_t<(_Dims == 0) && IsPlus<_T, BinaryOp>::value, Reducer &>
250  operator+=(const _T &Partial) {
251  return static_cast<Reducer *>(this)->combine(Partial);
252  }
253 
254  template <typename _T = Ty, int _Dims = Dims>
255  std::enable_if_t<(_Dims == 0) && IsMultiplies<_T, BinaryOp>::value, Reducer &>
256  operator*=(const _T &Partial) {
257  return static_cast<Reducer *>(this)->combine(Partial);
258  }
259 
260  template <typename _T = Ty, int _Dims = Dims>
261  std::enable_if_t<(_Dims == 0) && IsBitOR<_T, BinaryOp>::value, Reducer &>
262  operator|=(const _T &Partial) {
263  return static_cast<Reducer *>(this)->combine(Partial);
264  }
265 
266  template <typename _T = Ty, int _Dims = Dims>
267  std::enable_if_t<(_Dims == 0) && IsBitXOR<_T, BinaryOp>::value, Reducer &>
268  operator^=(const _T &Partial) {
269  return static_cast<Reducer *>(this)->combine(Partial);
270  }
271 
272  template <typename _T = Ty, int _Dims = Dims>
273  std::enable_if_t<(_Dims == 0) && IsBitAND<_T, BinaryOp>::value, Reducer &>
274  operator&=(const _T &Partial) {
275  return static_cast<Reducer *>(this)->combine(Partial);
276  }
277 
278 private:
279  template <access::address_space Space>
280  static constexpr memory_scope getMemoryScope() {
281  return Space == access::address_space::local_space
282  ? memory_scope::work_group
283  : memory_scope::device;
284  }
285 
286  template <access::address_space Space, class T, class AtomicFunctor>
287  void atomic_combine_impl(T *ReduVarPtr, AtomicFunctor Functor) const {
288  auto reducer = static_cast<const Reducer *>(this);
289  for (size_t E = 0; E < Extent; ++E) {
290  const auto &ReducerElem = getReducerAccess(*reducer).getElement(E);
291 
292  // If the reducer element doesn't have a value we can skip the combine.
293  if (!ReducerElem)
294  continue;
295 
296  auto AtomicRef = sycl::atomic_ref<T, memory_order::relaxed,
297  getMemoryScope<Space>(), Space>(
298  address_space_cast<Space, access::decorated::no>(ReduVarPtr)[E]);
299  Functor(std::move(AtomicRef), *ReducerElem);
300  }
301  }
302 
303  template <class _T, access::address_space Space, class BinaryOp>
304  static constexpr bool BasicCheck =
305  std::is_same_v<remove_decoration_t<_T>, Ty> &&
306  (Space == access::address_space::global_space ||
307  Space == access::address_space::local_space);
308 
309 public:
310  /// Atomic ADD operation: *ReduVarPtr += MValue;
311  template <access::address_space Space = access::address_space::global_space,
312  typename _T = Ty, class _BinaryOperation = BinaryOp>
313  std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
314  (IsReduOptForFastAtomicFetch<_T, _BinaryOperation>::value ||
315  IsReduOptForAtomic64Op<_T, _BinaryOperation>::value) &&
316  IsPlus<_T, _BinaryOperation>::value>
317  atomic_combine(_T *ReduVarPtr) const {
318  atomic_combine_impl<Space>(
319  ReduVarPtr, [](auto &&Ref, auto Val) { return Ref.fetch_add(Val); });
320  }
321 
322  /// Atomic BITWISE OR operation: *ReduVarPtr |= MValue;
323  template <access::address_space Space = access::address_space::global_space,
324  typename _T = Ty, class _BinaryOperation = BinaryOp>
325  std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
326  IsReduOptForFastAtomicFetch<_T, _BinaryOperation>::value &&
327  IsBitOR<_T, _BinaryOperation>::value>
328  atomic_combine(_T *ReduVarPtr) const {
329  atomic_combine_impl<Space>(
330  ReduVarPtr, [](auto &&Ref, auto Val) { return Ref.fetch_or(Val); });
331  }
332 
333  /// Atomic BITWISE XOR operation: *ReduVarPtr ^= MValue;
334  template <access::address_space Space = access::address_space::global_space,
335  typename _T = Ty, class _BinaryOperation = BinaryOp>
336  std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
337  IsReduOptForFastAtomicFetch<_T, _BinaryOperation>::value &&
338  IsBitXOR<_T, _BinaryOperation>::value>
339  atomic_combine(_T *ReduVarPtr) const {
340  atomic_combine_impl<Space>(
341  ReduVarPtr, [](auto &&Ref, auto Val) { return Ref.fetch_xor(Val); });
342  }
343 
344  /// Atomic BITWISE AND operation: *ReduVarPtr &= MValue;
345  template <access::address_space Space = access::address_space::global_space,
346  typename _T = Ty, class _BinaryOperation = BinaryOp>
347  std::enable_if_t<std::is_same_v<remove_decoration_t<_T>, _T> &&
348  IsReduOptForFastAtomicFetch<_T, _BinaryOperation>::value &&
349  IsBitAND<_T, _BinaryOperation>::value &&
350  (Space == access::address_space::global_space ||
351  Space == access::address_space::local_space)>
352  atomic_combine(_T *ReduVarPtr) const {
353  atomic_combine_impl<Space>(
354  ReduVarPtr, [](auto &&Ref, auto Val) { return Ref.fetch_and(Val); });
355  }
356 
357  /// Atomic MIN operation: *ReduVarPtr = sycl::minimum(*ReduVarPtr, MValue);
358  template <access::address_space Space = access::address_space::global_space,
359  typename _T = Ty, class _BinaryOperation = BinaryOp>
360  std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
361  (IsReduOptForFastAtomicFetch<_T, _BinaryOperation>::value ||
362  IsReduOptForAtomic64Op<_T, _BinaryOperation>::value) &&
363  IsMinimum<_T, _BinaryOperation>::value>
364  atomic_combine(_T *ReduVarPtr) const {
365  atomic_combine_impl<Space>(
366  ReduVarPtr, [](auto &&Ref, auto Val) { return Ref.fetch_min(Val); });
367  }
368 
369  /// Atomic MAX operation: *ReduVarPtr = sycl::maximum(*ReduVarPtr, MValue);
370  template <access::address_space Space = access::address_space::global_space,
371  typename _T = Ty, class _BinaryOperation = BinaryOp>
372  std::enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
373  (IsReduOptForFastAtomicFetch<_T, _BinaryOperation>::value ||
374  IsReduOptForAtomic64Op<_T, _BinaryOperation>::value) &&
375  IsMaximum<_T, _BinaryOperation>::value>
376  atomic_combine(_T *ReduVarPtr) const {
377  atomic_combine_impl<Space>(
378  ReduVarPtr, [](auto &&Ref, auto Val) { return Ref.fetch_max(Val); });
379  }
380 };
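For context, these shorthand operators are what make the user-facing pattern below work: each forwards to the derived reducer's combine(), and atomic_combine() is what finally folds the per-work-item value into the reduction variable on the fast paths. A minimal usage sketch (queue setup and names are illustrative, not part of this header):

 void example() {
   sycl::queue Q;
   int *Sum = sycl::malloc_shared<int>(1, Q);
   *Sum = 0;
   Q.parallel_for(sycl::range<1>{1024},
                  sycl::reduction(Sum, sycl::plus<int>()),
                  [=](sycl::id<1>, auto &Red) {
                    Red += 1; // combiner::operator+= -> reducer::combine(1)
                  }).wait();
   // *Sum is now 1024.
   sycl::free(Sum, Q);
 }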
381 
384 template <typename T, class BinaryOperation, bool ExplicitIdentity,
385  typename CondT = void>
386 class ReductionIdentityContainer {
387  static_assert(!std::is_same_v<T, T>,
388  "Partial specializations don't cover all possible options!");
389 };
390 
391 // Specialization for reductions with known identity.
392 template <typename T, class BinaryOperation, bool ExplicitIdentity>
393 class ReductionIdentityContainer<
394  T, BinaryOperation, ExplicitIdentity,
395  enable_if_t<IsKnownIdentityOp<T, BinaryOperation>::value>> {
396 public:
397  static constexpr bool has_identity = true;
398 
401 
403  static constexpr T getIdentity() {
404  return known_identity_impl<BinaryOperation, T>::value;
405  }
406 };
407 
408 // Specialization for reductions with an explicit identity.
409 template <typename T, class BinaryOperation>
410 class ReductionIdentityContainer<
411  T, BinaryOperation, true,
412  enable_if_t<!IsKnownIdentityOp<T, BinaryOperation>::value>> {
413 public:
414  static constexpr bool has_identity = true;
415 
416  ReductionIdentityContainer(const T &Identity) : MIdentity(Identity) {}
417 
419  T getIdentity() const { return MIdentity; }
420 
421 private:
424  const T MIdentity;
425 };
426 
427 // Specialization for identityless reductions.
428 template <typename T, class BinaryOperation>
429 class ReductionIdentityContainer<
430  T, BinaryOperation, false,
431  std::enable_if_t<!IsKnownIdentityOp<T, BinaryOperation>::value>> {
432 public:
433  static constexpr bool has_identity = false;
434 };
435 
436 // Class representing a single reducer element. In cases where the identity is
437 // unknown this is allowed to not contain a value.
438 template <typename T, class BinaryOperation, bool IsOptional>
439 class ReducerElement {
440  using value_type = std::conditional_t<IsOptional, std::optional<T>, T>;
441 
442  template <bool ExplicitIdentity>
443  constexpr value_type GetInitValue(
444  const ReductionIdentityContainer<T, BinaryOperation, ExplicitIdentity>
445  &IdentityContainer) {
446  constexpr bool ContainerHasIdentity =
447  ReductionIdentityContainer<T, BinaryOperation,
448  ExplicitIdentity>::has_identity;
449  static_assert(IsOptional || ContainerHasIdentity);
450  if constexpr (!ContainerHasIdentity)
451  return std::nullopt;
452  else
453  return IdentityContainer.getIdentity();
454  }
455 
456 public:
457  ReducerElement() = default;
458  ReducerElement(T Value) : MValue{Value} {}
459 
460  template <bool ExplicitIdentity>
461  ReducerElement(
462  const ReductionIdentityContainer<T, BinaryOperation, ExplicitIdentity>
463  &IdentityContainer)
464  : MValue(GetInitValue(IdentityContainer)) {}
465 
466  ReducerElement &combine(BinaryOperation BinOp, const T &OtherValue) {
467  if constexpr (IsOptional)
468  MValue = MValue ? BinOp(*MValue, OtherValue) : OtherValue;
469  else
470  MValue = BinOp(MValue, OtherValue);
471  return *this;
472  }
473 
474  ReducerElement &combine(BinaryOperation BinOp, const ReducerElement &Other) {
475  if constexpr (IsOptional) {
476  if (Other.MValue)
477  return combine(BinOp, *Other.MValue);
478  // If the other value doesn't have a value it is a no-op.
479  return *this;
480  } else {
481  return combine(BinOp, Other.MValue);
482  }
483  }
484 
485  constexpr T &operator*() noexcept {
486  if constexpr (IsOptional)
487  return *MValue;
488  else
489  return MValue;
490  }
491  constexpr const T &operator*() const noexcept {
492  if constexpr (IsOptional)
493  return *MValue;
494  else
495  return MValue;
496  }
497 
498  constexpr explicit operator bool() const {
499  if constexpr (IsOptional)
500  return MValue.has_value();
501  return true;
502  }
503 
504 private:
505  value_type MValue;
506 };
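A host-side sketch of the semantics this class gives identityless reductions (IsOptional == true): the element starts without a value, the first combine adopts the operand unchanged, and only later combines apply the binary operation, so no identity is ever required. The names below are illustrative.

 #include <functional>
 #include <optional>

 void sketch() {
   std::optional<int> Value; // empty element: no identity available
   auto Combine = [&](int Other) {
     Value = Value ? std::plus<int>{}(*Value, Other) : Other;
   };
   Combine(5); // Value == 5: the first operand is adopted as-is
   Combine(7); // Value == 12: subsequent operands are combined
 }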
507 } // namespace detail
508 
509 // We explicitly claim std::optional as device-copyable in sycl/types.hpp.
510 // However, that information isn't propagated to the struct/class types that use
511 // it as a member, so we need to provide this partial specialization as well.
512 // This is needed to support the GNU STL, where
513 // std::is_trivially_copyable_v<std::optional> is false (e.g., GCC 7.5.*).
514 template <typename T, class BinaryOperation, bool IsOptional>
515 struct is_device_copyable<
516  detail::ReducerElement<T, BinaryOperation, IsOptional>>
517  : is_device_copyable<std::conditional_t<IsOptional, std::optional<T>, T>> {
518 };
519 
520 namespace detail {
521 template <typename T, class BinaryOperation, int Dims> class reducer_common {
522 public:
523  using value_type = T;
524  using binary_operation = BinaryOperation;
525  static constexpr int dimensions = Dims;
526 };
527 
528 // Token class to help with the in-place construction of reducers.
529 template <class BinaryOperation, typename IdentityContainerT>
530 struct ReducerToken {
531  const IdentityContainerT &IdentityContainer;
532  const BinaryOperation BOp;
533 };
534 
535 } // namespace detail
536 
543 template <typename T, class BinaryOperation, int Dims, size_t Extent,
544  typename IdentityContainerT, bool View>
545 class reducer<
546  T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
547  std::enable_if_t<Dims == 0 && Extent == 1 && View == false &&
548  !detail::IsKnownIdentityOp<T, BinaryOperation>::value>>
549  : public detail::combiner<
550  reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
551  std::enable_if_t<
552  Dims == 0 && Extent == 1 && View == false &&
553  !detail::IsKnownIdentityOp<T, BinaryOperation>::value>>>,
554  public detail::reducer_common<T, BinaryOperation, Dims> {
555  static constexpr bool has_identity = IdentityContainerT::has_identity;
556  using element_type =
557  detail::ReducerElement<T, BinaryOperation, !has_identity>;
558 
559 public:
560  reducer(const IdentityContainerT &IdentityContainer, BinaryOperation BOp)
561  : MValue(IdentityContainer), MIdentity(IdentityContainer),
562  MBinaryOp(BOp) {}
563  reducer(const detail::ReducerToken<BinaryOperation, IdentityContainerT>
564  &Token)
565  : reducer(Token.IdentityContainer, Token.BOp) {}
566 
567  reducer(const reducer &) = delete;
568  reducer(reducer &&) = delete;
569  reducer &operator=(const reducer &) = delete;
570  reducer &operator=(reducer &&) = delete;
571 
572  reducer &combine(const T &Partial) {
573  MValue.combine(MBinaryOp, Partial);
574  return *this;
575  }
576 
577  template <bool HasIdentityRelay = has_identity>
578  std::enable_if_t<HasIdentityRelay && (HasIdentityRelay == has_identity), T>
579  identity() const {
580  return MIdentity.getIdentity();
581  }
582 
583 private:
584  template <typename ReducerT> friend class detail::ReducerAccess;
585 
586  element_type &getElement(size_t) { return MValue; }
587  const element_type &getElement(size_t) const { return MValue; }
588 
589  detail::ReducerElement<T, BinaryOperation, !has_identity> MValue;
590  const IdentityContainerT MIdentity;
591  BinaryOperation MBinaryOp;
592 };
593 
599 template <typename T, class BinaryOperation, int Dims, size_t Extent,
600  typename IdentityContainerT, bool View>
601 class reducer<
602  T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
603  std::enable_if_t<Dims == 0 && Extent == 1 && View == false &&
604  detail::IsKnownIdentityOp<T, BinaryOperation>::value>>
605  : public detail::combiner<
606  reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
607  std::enable_if_t<
608  Dims == 0 && Extent == 1 && View == false &&
609  detail::IsKnownIdentityOp<T, BinaryOperation>::value>>>,
610  public detail::reducer_common<T, BinaryOperation, Dims> {
611  static constexpr bool has_identity = IdentityContainerT::has_identity;
612  using element_type =
613  detail::ReducerElement<T, BinaryOperation, !has_identity>;
614 
615 public:
616  reducer() : MValue(getIdentity()) {}
617  reducer(const IdentityContainerT & /* Identity */, BinaryOperation)
618  : MValue(getIdentity()) {}
619  reducer(const detail::ReducerToken<BinaryOperation, IdentityContainerT>
620  &Token)
621  : reducer(Token.IdentityContainer, Token.BOp) {}
622 
623  reducer(const reducer &) = delete;
624  reducer(reducer &&) = delete;
625  reducer &operator=(const reducer &) = delete;
626  reducer &operator=(reducer &&) = delete;
627 
628  reducer &combine(const T &Partial) {
629  BinaryOperation BOp;
630  MValue.combine(BOp, Partial);
631  return *this;
632  }
633 
634  T identity() const { return getIdentity(); }
635 
636 private:
637  template <typename ReducerT> friend class detail::ReducerAccess;
638 
639  static constexpr T getIdentity() {
640  return detail::known_identity_impl<BinaryOperation, T>::value;
641  }
642 
643  element_type &getElement(size_t) { return MValue; }
644  const element_type &getElement(size_t) const { return MValue; }
645  detail::ReducerElement<T, BinaryOperation, !has_identity> MValue;
646 };
647 
650 template <typename T, class BinaryOperation, int Dims, size_t Extent,
651  typename IdentityContainerT, bool View>
652 class reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
653  std::enable_if_t<Dims == 0 && View == true>>
654  : public detail::combiner<
655  reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
656  std::enable_if_t<Dims == 0 && View == true>>>,
657  public detail::reducer_common<T, BinaryOperation, Dims> {
658  static constexpr bool has_identity = IdentityContainerT::has_identity;
659  using element_type =
660  detail::ReducerElement<T, BinaryOperation, !has_identity>;
661 
662 public:
663  reducer(element_type &Ref, BinaryOperation BOp)
664  : MElement(Ref), MBinaryOp(BOp) {}
665  reducer(const detail::ReducerToken<BinaryOperation, IdentityContainerT>
666  &Token)
667  : reducer(Token.IdentityContainer, Token.BOp) {}
668 
669  reducer(const reducer &) = delete;
670  reducer(reducer &&) = delete;
671  reducer &operator=(const reducer &) = delete;
672  reducer &operator=(reducer &&) = delete;
673 
674  reducer &combine(const T &Partial) {
675  MElement.combine(MBinaryOp, Partial);
676  return *this;
677  }
678 
679 private:
680  template <typename ReducerT> friend class detail::ReducerAccess;
681 
682  element_type &getElement(size_t) { return MElement; }
683  const element_type &getElement(size_t) const { return MElement; }
684 
685  element_type &MElement;
686  BinaryOperation MBinaryOp;
687 };
688 
691 template <typename T, class BinaryOperation, int Dims, size_t Extent,
692  typename IdentityContainerT, bool View>
693 class reducer<
694  T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
695  std::enable_if_t<Dims == 1 && View == false &&
696  !detail::IsKnownIdentityOp<T, BinaryOperation>::value>>
697  : public detail::combiner<
698  reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
699  std::enable_if_t<
700  Dims == 1 && View == false &&
701  !detail::IsKnownIdentityOp<T, BinaryOperation>::value>>>,
702  public detail::reducer_common<T, BinaryOperation, Dims> {
703  static constexpr bool has_identity = IdentityContainerT::has_identity;
704  using element_type =
705  detail::ReducerElement<T, BinaryOperation, !has_identity>;
706 
707 public:
708  reducer(const IdentityContainerT &IdentityContainer, BinaryOperation BOp)
709  : MValue(IdentityContainer), MIdentity(IdentityContainer),
710  MBinaryOp(BOp) {}
711  reducer(const detail::ReducerToken<BinaryOperation, IdentityContainerT>
712  &Token)
713  : reducer(Token.IdentityContainer, Token.BOp) {}
714 
715  reducer(const reducer &) = delete;
716  reducer(reducer &&) = delete;
717  reducer &operator=(const reducer &) = delete;
718  reducer &operator=(reducer &&) = delete;
719 
720  reducer<T, BinaryOperation, Dims - 1, Extent, IdentityContainerT, true>
721  operator[](size_t Index) {
722  return {MValue[Index], MBinaryOp};
723  }
724 
725  template <bool HasIdentityRelay = has_identity>
726  std::enable_if_t<HasIdentityRelay && (HasIdentityRelay == has_identity), T>
727  identity() const {
728  return MIdentity.getIdentity();
729  }
730 
731 private:
732  template <typename ReducerT> friend class detail::ReducerAccess;
733 
734  element_type &getElement(size_t E) { return MValue[E]; }
735  const element_type &getElement(size_t E) const { return MValue[E]; }
736 
737  marray<element_type, Extent> MValue;
738  const IdentityContainerT MIdentity;
739  BinaryOperation MBinaryOp;
740 };
741 
744 template <typename T, class BinaryOperation, int Dims, size_t Extent,
745  typename IdentityContainerT, bool View>
746 class reducer<
747  T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
748  std::enable_if_t<Dims == 1 && View == false &&
749  detail::IsKnownIdentityOp<T, BinaryOperation>::value>>
750  : public detail::combiner<
751  reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
752  std::enable_if_t<
753  Dims == 1 && View == false &&
754  detail::IsKnownIdentityOp<T, BinaryOperation>::value>>>,
755  public detail::reducer_common<T, BinaryOperation, Dims> {
756  static constexpr bool has_identity = IdentityContainerT::has_identity;
757  using element_type =
758  detail::ReducerElement<T, BinaryOperation, !has_identity>;
759 
760 public:
761  reducer() : MValue(getIdentity()) {}
762  reducer(const IdentityContainerT & /* Identity */, BinaryOperation)
763  : MValue(getIdentity()) {}
764  reducer(const detail::ReducerToken<BinaryOperation, IdentityContainerT>
765  &Token)
766  : reducer(Token.IdentityContainer, Token.BOp) {}
767 
768  reducer(const reducer &) = delete;
769  reducer(reducer &&) = delete;
770  reducer &operator=(const reducer &) = delete;
771  reducer &operator=(reducer &&) = delete;
772 
773  // SYCL 2020 revision 4 says this should be const, but this is a bug;
774  // see https://github.com/KhronosGroup/SYCL-Docs/pull/252
775  reducer<T, BinaryOperation, Dims - 1, Extent, IdentityContainerT, true>
776  operator[](size_t Index) {
777  return {MValue[Index], BinaryOperation()};
778  }
779 
780  T identity() const { return getIdentity(); }
781 
782 private:
783  template <typename ReducerT> friend class detail::ReducerAccess;
784 
785  static constexpr T getIdentity() {
786  return detail::known_identity_impl<BinaryOperation, T>::value;
787  }
788 
789  element_type &getElement(size_t E) { return MValue[E]; }
790  const element_type &getElement(size_t E) const { return MValue[E]; }
791 
792  marray<element_type, Extent> MValue;
793 };
794 
795 namespace detail {
796 
797 // Used for determining dimensions for temporary storage (mainly).
798 template <class T> struct data_dim_t {
799  static constexpr int value = 1;
800 };
801 
802 template <class T, int AccessorDims, access::mode Mode,
803  access::placeholder IsPH, typename PropList>
804 struct data_dim_t<
805  accessor<T, AccessorDims, Mode, access::target::device, IsPH, PropList>> {
806  static constexpr int value = AccessorDims;
807 };
808 
809 template <class T> struct get_red_t;
810 template <class T> struct get_red_t<T *> {
811  using type = T;
812 };
813 
814 template <class T, int Dims, typename AllocatorT>
815 struct get_red_t<buffer<T, Dims, AllocatorT>> {
816  using type = T;
817 };
818 
819 namespace reduction {
820 // Kernel name wrapper for initializing reduction-related memory through
821 // reduction_impl_algo::withInitializedMem.
822 template <typename KernelName> struct InitMemKrn;
823 } // namespace reduction
824 
827 template <class KernelName>
828 using __sycl_init_mem_for =
829  std::conditional_t<std::is_same_v<KernelName, auto_name>, auto_name,
830  reduction::InitMemKrn<KernelName>>;
831 
832 __SYCL_EXPORT void
833 addCounterInit(handler &CGH, std::shared_ptr<sycl::detail::queue_impl> &Queue,
834  std::shared_ptr<int> &Counter);
835 
836 template <typename T, class BinaryOperation, int Dims, size_t Extent,
837  bool ExplicitIdentity, typename RedOutVar>
838 class reduction_impl_algo {
839  using self = reduction_impl_algo<T, BinaryOperation, Dims, Extent,
840  ExplicitIdentity, RedOutVar>;
841 
842  // TODO: Do we also need chooseBinOp?
843  static constexpr T chooseIdentity(const T &Identity) {
844  // For now the implementation ignores the identity value given by the user
845  // when the implementation knows the identity.
846  // The spec could prohibit passing an identity parameter to operations with
847  // a known identity, but that could have some bad consequences too.
848  // For example, at some moment the implementation may NOT know the identity
849  // for a COMPLEX-PLUS reduction. A user may create a program that passes the
850  // COMPLEX value (0,0) as the identity for PLUS reduction. At some later
851  // moment, when the implementation starts handling COMPLEX-PLUS as a known
852  // operation, the existing user program remains compilable and working
853  // correctly. I.e. with this constructor here, adding more reduction
854  // operations to the list of known operations does not break existing programs.
855  if constexpr (is_known_identity) {
856  (void)Identity;
857  return ReducerAccess<reducer_type>::getIdentityStatic();
858  } else {
859  return Identity;
860  }
861  }
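In user-facing terms, both of the following calls are accepted, and for an operation with a known identity the user-supplied value is simply ignored. This is a sketch; Ptr and the queue are illustrative.

 void sketch(sycl::queue &Q) {
   int *Ptr = sycl::malloc_device<int>(1, Q);
   auto R1 = sycl::reduction(Ptr, sycl::plus<int>());    // identity deduced (0)
   auto R2 = sycl::reduction(Ptr, 0, sycl::plus<int>()); // explicit 0 accepted,
                                                         // but the known
                                                         // identity is used
   (void)R1; (void)R2;
   sycl::free(Ptr, Q);
 }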
862 
863 public:
864  static constexpr bool is_known_identity =
865  IsKnownIdentityOp<T, BinaryOperation>::value;
866  static constexpr bool has_identity = is_known_identity || ExplicitIdentity;
867 
868  using identity_container_type =
869  ReductionIdentityContainer<T, BinaryOperation, ExplicitIdentity>;
872  using reducer_type =
873  reducer<T, BinaryOperation, Dims, Extent, identity_container_type>;
874  using reducer_element_type =
875  typename ReducerTraits<reducer_type>::element_type;
876  using result_type = T;
877  using binary_operation = BinaryOperation;
878 
879  static constexpr size_t dims = Dims;
880  static constexpr bool has_float64_atomics =
881  IsReduOptForAtomic64Op<T, BinaryOperation>::value;
882  static constexpr bool has_fast_atomics =
883  IsReduOptForFastAtomicFetch<T, BinaryOperation>::value;
884  static constexpr bool has_fast_reduce =
885  IsReduOptForFastReduce<T, BinaryOperation>::value;
886 
887  static constexpr bool is_usm = std::is_same_v<RedOutVar, T *>;
888 
889  static constexpr size_t num_elements = Extent;
890 
891  reduction_impl_algo(const T &Identity, BinaryOperation BinaryOp, bool Init,
892  RedOutVar RedOut)
893  : MIdentityContainer(chooseIdentity(Identity)), MBinaryOp(BinaryOp),
894  InitializeToIdentity(Init), MRedOut(std::move(RedOut)) {}
895 
896  template <typename RelayT = T,
897  typename RelayBinaryOperation = BinaryOperation>
898  reduction_impl_algo(
899  BinaryOperation BinaryOp, bool Init, RedOutVar RedOut,
900  std::enable_if_t<IsKnownIdentityOp<RelayT, RelayBinaryOperation>::value,
901  int> = 0)
902  : MIdentityContainer(ReducerAccess<reducer_type>::getIdentityStatic()),
903  MBinaryOp(BinaryOp), InitializeToIdentity(Init),
904  MRedOut(std::move(RedOut)) {}
905 
906  template <typename RelayT = T,
907  typename RelayBinaryOperation = BinaryOperation>
908  reduction_impl_algo(
909  BinaryOperation BinaryOp, bool Init, RedOutVar RedOut,
910  std::enable_if_t<!IsKnownIdentityOp<RelayT, RelayBinaryOperation>::value,
911  int> = 0)
912  : MIdentityContainer(), MBinaryOp(BinaryOp), InitializeToIdentity(Init),
913  MRedOut(std::move(RedOut)) {}
914 
915  auto getReadAccToPreviousPartialReds(handler &CGH) const {
916  CGH.addReduction(MOutBufPtr);
917  return accessor{*MOutBufPtr, CGH, sycl::read_only};
918  }
919 
920  template <bool IsOneWG>
921  auto getWriteMemForPartialReds(size_t Size, handler &CGH) {
922  // If there is only one WG we can avoid creation of temporary buffer with
923  // partial sums and write directly into user's reduction variable.
924  if constexpr (IsOneWG) {
925  return getUserRedVarAccess(CGH);
926  } else {
927  MOutBufPtr =
928  std::make_shared<buffer<reducer_element_type, 1>>(range<1>(Size));
929  CGH.addReduction(MOutBufPtr);
930  return accessor{*MOutBufPtr, CGH};
931  }
932  }
933 
934  template <class _T = T> auto &getTempBuffer(size_t Size, handler &CGH) {
935  auto Buffer = std::make_shared<buffer<_T, 1>>(range<1>(Size));
936  CGH.addReduction(Buffer);
937  return *Buffer;
938  }
939 
940  /// Returns an accessor accessing the memory that will hold the reduction
941  /// partial sums.
942  /// If \p Size is equal to one, then the reduction result is final and
943  /// needs to be written to the user's read-write accessor (if there is one).
944  /// Otherwise, a new buffer is created and an accessor to that buffer is
945  /// returned.
946  auto getWriteAccForPartialReds(size_t Size, handler &CGH) {
947  static_assert(!has_identity || sizeof(reducer_element_type) == sizeof(T),
948  "Unexpected size of reducer element type.");
949 
950  // We can only use the output memory directly if it is not USM and we have
951  // an identity, i.e. it has a thin element wrapper.
952  if constexpr (!is_usm && has_identity) {
953  if (Size == 1) {
954  auto ReinterpretRedOut =
955  MRedOut.template reinterpret<reducer_element_type>();
956  return accessor{ReinterpretRedOut, CGH};
957  }
958  }
959 
960  // Create a new output buffer and return an accessor to it.
961  //
962  // Array reductions are performed element-wise to avoid stack growth.
963  MOutBufPtr =
964  std::make_shared<buffer<reducer_element_type, 1>>(range<1>(Size));
965  CGH.addReduction(MOutBufPtr);
966  return accessor{*MOutBufPtr, CGH};
967  }
968 
969  /// Provide \p Func with a properly initialized memory to write the
970  /// reduction result to. It can either be the original user's reduction
971  /// variable or a newly allocated memory initialized with the reduction's
972  /// identity. In the latter case, after the reduction is done, the result
973  /// must be written back to the user's variable.
974  //
975  // This currently optimizes for a number of kernel instantiations instead of
976  // runtime latency. That might change in future.
977  template <typename KernelName, typename FuncTy,
978  bool HasIdentity = has_identity>
979  std::enable_if_t<HasIdentity> withInitializedMem(handler &CGH, FuncTy Func) {
980  // "Template" lambda to ensure that only one type of Func (USM/Buf) is
981  // instantiated for the code below.
982  auto DoIt = [&](auto &Out) {
983  auto RWReduVal = std::make_shared<std::array<T, num_elements>>();
984  for (size_t i = 0; i < num_elements; ++i) {
985  (*RWReduVal)[i] = decltype(MIdentityContainer)::getIdentity();
986  }
987  auto Buf = std::make_shared<buffer<T, 1>>(RWReduVal.get()->data(),
988  range<1>(num_elements));
989  Buf->set_final_data();
990  accessor Mem{*Buf, CGH};
991  Func(Mem);
992 
993  reduction::withAuxHandler(CGH, [&](handler &CopyHandler) {
994  // MSVC (19.32.31329) has problems compiling the line below when used
995  // as a host compiler in c++17 mode (but not in c++latest)
996  // accessor Mem{*Buf, CopyHandler};
997  // so use the old-style API.
998  auto Mem =
999  Buf->template get_access<access::mode::read_write>(CopyHandler);
1000  // Since this CG is dependent on the one associated with CGH,
1001  // registering the auxiliary resources here is enough.
1002  CopyHandler.addReduction(RWReduVal);
1003  CopyHandler.addReduction(Buf);
1004  if constexpr (is_usm) {
1005  // Can't capture whole reduction, copy into distinct variables.
1006  bool IsUpdateOfUserVar = !initializeToIdentity();
1007  auto BOp = getBinaryOperation();
1008 
1009  // Don't use constexpr as non-default host compilers (unlike clang)
1010  // might actually create a capture resulting in binary differences
1011  // between host/device in lambda captures.
1012  size_t NElements = num_elements;
1013 
1014  CopyHandler.single_task<__sycl_init_mem_for<KernelName>>([=] {
1015  for (size_t i = 0; i < NElements; ++i) {
1016  if (IsUpdateOfUserVar)
1017  Out[i] = BOp(Out[i], Mem[i]);
1018  else
1019  Out[i] = Mem[i];
1020  }
1021  });
1022  } else {
1023  accessor OutAcc{Out, CGH};
1024  CopyHandler.copy(Mem, OutAcc);
1025  }
1026  });
1027  };
1028  if constexpr (is_usm) {
1029  // Don't dispatch based on initializeToIdentity() as that would lead
1030  // to two different instantiations of Func.
1031  DoIt(MRedOut);
1032  } else {
1033  if (initializeToIdentity())
1034  DoIt(MRedOut);
1035  else
1036  Func(accessor{MRedOut, CGH});
1037  }
1038  }
1039 
1040  // Overload of withInitializedMem for reducer without identity. Initializing
1041  // to identity is not allowed in this case.
1042  template <typename KernelName, typename FuncTy,
1043  bool HasIdentity = has_identity>
1044  std::enable_if_t<!HasIdentity> withInitializedMem(handler &CGH, FuncTy Func) {
1045  std::ignore = CGH;
1046  assert(!initializeToIdentity() &&
1047  "Initialize to identity not allowed for identity-less reductions.");
1048  Func(accessor{MRedOut, CGH});
1049  }
1050 
1051  const identity_container_type &getIdentityContainer() {
1052  return MIdentityContainer;
1053  }
1054 
1056  accessor<int, 1, access::mode::read_write, access::target::device>
1057  getReadWriteAccessorToInitializedGroupsCounter(handler &CGH) {
1058  auto CounterMem = std::make_shared<int>(0);
1059  CGH.addReduction(CounterMem);
1060  auto CounterBuf = std::make_shared<buffer<int, 1>>(CounterMem.get(), 1);
1061  CounterBuf->set_final_data();
1062  CGH.addReduction(CounterBuf);
1063  return {*CounterBuf, CGH};
1064  }
1065 
1066  // On discrete (vs. integrated) GPUs it's faster to initialize memory with an
1067  // extra kernel than copy it from the host.
1068  int *getGroupsCounterAccDiscrete(handler &CGH) {
1069  queue q = createSyclObjFromImpl<queue>(CGH.MQueue);
1070  device Dev = q.get_device();
1071  auto Deleter = [=](auto *Ptr) { free(Ptr, q); };
1072 
1073  std::shared_ptr<int> Counter(malloc_device<int>(1, q), Deleter);
1074  CGH.addReduction(Counter);
1075 
1076  addCounterInit(CGH, CGH.MQueue, Counter);
1077 
1078  return Counter.get();
1079  }
1080 
1082  BinaryOperation getBinaryOperation() const { return MBinaryOp; }
1083  bool initializeToIdentity() const { return InitializeToIdentity; }
1084 
1085  auto getUserRedVarAccess(handler &CGH) {
1086  std::ignore = CGH;
1087  if constexpr (is_usm)
1088  return MRedOut;
1089  else
1090  return accessor{MRedOut, CGH};
1091  }
1092 
1093 private:
1094  // Object holding the identity if available.
1095  identity_container_type MIdentityContainer;
1096 
1097  // Array reduction is performed element-wise to avoid stack growth, hence
1098  // 1-dimensional always.
1099  std::shared_ptr<buffer<reducer_element_type, 1>> MOutBufPtr;
1100 
1101  BinaryOperation MBinaryOp;
1102  bool InitializeToIdentity;
1103 
1105  RedOutVar MRedOut;
1106 };
1107 
1110 template <typename T, class BinaryOperation, int Dims, size_t Extent,
1111  bool ExplicitIdentity, typename RedOutVar>
1112 class reduction_impl
1113  : private reduction_impl_base,
1114  public reduction_impl_algo<T, BinaryOperation, Dims, Extent,
1115  ExplicitIdentity, RedOutVar> {
1116 private:
1117  using algo = reduction_impl_algo<T, BinaryOperation, Dims, Extent,
1118  ExplicitIdentity, RedOutVar>;
1119  using self = reduction_impl<T, BinaryOperation, Dims, Extent,
1120  ExplicitIdentity, RedOutVar>;
1121 
1122 public:
1123  using algo::is_known_identity;
1124  using algo::is_usm;
1125 
1126  // Only scalar and 1D array reductions are supported by SYCL 2020.
1127  static_assert(Dims <= 1, "Multi-dimensional reductions are not supported.");
1128 
1131  template <bool ExplicitIdentityRelay = ExplicitIdentity,
1132  typename = std::enable_if_t<!ExplicitIdentityRelay>>
1133  reduction_impl(RedOutVar Var, BinaryOperation BOp,
1134  bool InitializeToIdentity = false)
1135  : algo(BOp, InitializeToIdentity, Var) {
1136  if constexpr (!is_usm)
1137  if (Var.size() != 1)
1138  throw sycl::exception(make_error_code(errc::invalid),
1139  "Reduction variable must be a scalar.");
1140  if constexpr (!is_known_identity)
1141  if (InitializeToIdentity)
1142  throw sycl::exception(make_error_code(errc::invalid),
1143  "initialize_to_identity property cannot be "
1144  "used with identityless reductions.");
1145  }
1146 
1149  template <bool ExplicitIdentityRelay = ExplicitIdentity,
1150  typename = std::enable_if_t<ExplicitIdentityRelay>>
1151  reduction_impl(RedOutVar &Var, const T &Identity, BinaryOperation BOp,
1152  bool InitializeToIdentity)
1153  : algo(Identity, BOp, InitializeToIdentity, Var) {
1154  if constexpr (!is_usm)
1155  if (Var.size() != 1)
1156  throw sycl::exception(make_error_code(errc::invalid),
1157  "Reduction variable must be a scalar.");
1158  }
1159 };
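The size check above is what backs the buffer flavor of the public API: the buffer passed to sycl::reduction must hold exactly one element, or the constructor above throws. A minimal sketch of that buffer-based form (names illustrative):

 void sketch(sycl::queue &Q) {
   sycl::buffer<int, 1> SumBuf{1}; // exactly one element
   Q.submit([&](sycl::handler &CGH) {
     auto Red = sycl::reduction(
         SumBuf, CGH, sycl::plus<int>(),
         sycl::property::reduction::initialize_to_identity{});
     CGH.parallel_for(sycl::range<1>{1024}, Red,
                      [=](sycl::id<1>, auto &R) { R += 1; });
   });
 }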
1160 
1161 template <class BinaryOp, int Dims, size_t Extent, bool ExplicitIdentity,
1162  typename RedOutVar, typename... RestTy>
1163 auto make_reduction(RedOutVar RedVar, RestTy &&...Rest) {
1164  return reduction_impl<typename get_red_t<RedOutVar>::type, BinaryOp, Dims,
1165  Extent, ExplicitIdentity, RedOutVar>{
1166  RedVar, std::forward<RestTy>(Rest)...};
1167 }
1168 
1169 namespace reduction {
1170 inline void finalizeHandler(handler &CGH) { CGH.finalize(); }
1171 template <class FunctorTy> void withAuxHandler(handler &CGH, FunctorTy Func) {
1172  event E = CGH.finalize();
1173  handler AuxHandler(CGH.MQueue, CGH.eventNeeded());
1174  if (!createSyclObjFromImpl<queue>(CGH.MQueue).is_in_order())
1175  AuxHandler.depends_on(E);
1176  AuxHandler.saveCodeLoc(CGH.MCodeLoc);
1177  Func(AuxHandler);
1178  CGH.MLastEvent = AuxHandler.finalize();
1179  return;
1180 }
1181 } // namespace reduction
1182 
1183 // This method is used for the implementation of parallel_for accepting one
1184 // reduction. TODO: remove this method when everything is switched to the
1185 // general algorithm implementing an arbitrary number of reductions in parallel_for().
1186 /// Copies the final reduction result kept in the read-write accessor to the
1187 /// user's data, i.e. the reduction variable.
1188 template <typename KernelName, class Reduction>
1189 void reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu) {
1190  static_assert(Reduction::is_usm,
1191  "All implementations using this helper are expected to have "
1192  "USM reduction, not a buffer-based one.");
1193  size_t NElements = Reduction::num_elements;
1194  auto InAcc = Redu.getReadAccToPreviousPartialReds(CGH);
1195  auto UserVarPtr = Redu.getUserRedVarAccess(CGH);
1196  bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
1197  auto BOp = Redu.getBinaryOperation();
1198  CGH.single_task<KernelName>([=] {
1199  for (size_t i = 0; i < NElements; ++i) {
1200  auto Elem = InAcc[i];
1201  if (IsUpdateOfUserVar)
1202  UserVarPtr[i] = BOp(UserVarPtr[i], *Elem);
1203  else
1204  UserVarPtr[i] = *Elem;
1205  }
1206  });
1207 }
1208 
1209 namespace reduction {
1210 template <typename KernelName, strategy S, class... Ts> struct MainKrn;
1211 template <typename KernelName, strategy S, class... Ts> struct AuxKrn;
1212 } // namespace reduction
1213 
1214 // Tag structs to help creating unique kernels for multi-reduction cases.
1215 struct KernelOneWGTag {};
1216 struct KernelMultipleWGTag {};
1217 
1220 template <template <typename, reduction::strategy, typename...> class MainOrAux,
1221  class KernelName, reduction::strategy Strategy, class... Ts>
1222 using __sycl_reduction_kernel =
1223  std::conditional_t<std::is_same_v<KernelName, auto_name>, auto_name,
1224  MainOrAux<KernelName, Strategy, Ts...>>;
1225 
1226 // Implementations.
1227 
1228 template <reduction::strategy> struct NDRangeReduction;
1229 
1230 template <>
1231 struct NDRangeReduction<reduction::strategy::local_atomic_and_atomic_cross_wg> {
1232  template <typename KernelName, int Dims, typename PropertiesT,
1233  typename KernelType, typename Reduction>
1234  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1235  nd_range<Dims> NDRange, PropertiesT &Properties,
1236  Reduction &Redu, KernelType &KernelFunc) {
1237  static_assert(Reduction::has_identity,
1238  "Identityless reductions are not supported by the "
1239  "local_atomic_and_atomic_cross_wg strategy.");
1240 
1241  std::ignore = Queue;
1242  using Name = __sycl_reduction_kernel<
1243  reduction::MainKrn, KernelName,
1244  reduction::strategy::local_atomic_and_atomic_cross_wg>;
1245  Redu.template withInitializedMem<Name>(CGH, [&](auto Out) {
1246  size_t NElements = Reduction::num_elements;
1247  local_accessor<typename Reduction::result_type, 1> GroupSum{NElements,
1248  CGH};
1249 
1250  CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<1> NDId) {
1251  // Call user's functions. Reducer.MValue gets initialized there.
1252  typename Reduction::reducer_type Reducer;
1253  KernelFunc(NDId, Reducer);
1254 
1255  // Work-group cooperates to initialize multiple reduction variables
1256  auto LID = NDId.get_local_id(0);
1257  for (size_t E = LID; E < NElements; E += NDId.get_local_range(0)) {
1258  GroupSum[E] = getReducerAccess(Reducer).getIdentity();
1259  }
1260  workGroupBarrier();
1261 
1262  // Each work-item has its own reducer to combine
1263  Reducer.template atomic_combine<access::address_space::local_space>(
1264  &GroupSum[0]);
1265 
1266  // Single work-item performs finalization for entire work-group
1267  // TODO: Opportunity to parallelize across elements
1268  workGroupBarrier();
1269  if (LID == 0) {
1270  for (size_t E = 0; E < NElements; ++E) {
1271  *getReducerAccess(Reducer).getElement(E) = GroupSum[E];
1272  }
1273  Reducer.atomic_combine(&Out[0]);
1274  }
1275  });
1276  });
1277  }
1278 };
1279 
1280 template <>
1281 struct NDRangeReduction<
1282  reduction::strategy::group_reduce_and_last_wg_detection> {
1283  template <typename KernelName, int Dims, typename PropertiesT,
1284  typename KernelType, typename Reduction>
1285  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1286  nd_range<Dims> NDRange, PropertiesT &Properties,
1287  Reduction &Redu, KernelType &KernelFunc) {
1288  static_assert(Reduction::has_identity,
1289  "Identityless reductions are not supported by the "
1290  "group_reduce_and_last_wg_detection strategy.");
1291 
1292  std::ignore = Queue;
1293  size_t NElements = Reduction::num_elements;
1294  size_t WGSize = NDRange.get_local_range().size();
1295  size_t NWorkGroups = NDRange.get_group_range().size();
1296 
1297  auto Out = Redu.getUserRedVarAccess(CGH);
1298 
1299  auto &PartialSumsBuf = Redu.getTempBuffer(NWorkGroups * NElements, CGH);
1300  accessor PartialSums(PartialSumsBuf, CGH, sycl::read_write, sycl::no_init);
1301 
1302  bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
1303  auto Rest = [&](auto NWorkGroupsFinished) {
1304  local_accessor<int, 1> DoReducePartialSumsInLastWG{1, CGH};
1305 
1306  using Name = __sycl_reduction_kernel<
1307  reduction::MainKrn, KernelName,
1308  reduction::strategy::group_reduce_and_last_wg_detection,
1309  decltype(NWorkGroupsFinished)>;
1310 
1311  CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<1> NDId) {
1312  // Call user's functions. Reducer.MValue gets initialized there.
1313  typename Reduction::reducer_type Reducer;
1314  KernelFunc(NDId, Reducer);
1315 
1316  typename Reduction::binary_operation BOp;
1317  auto Group = NDId.get_group();
1318 
1319  // If there are multiple values, reduce each separately
1320  // reduce_over_group is only defined for each T, not for span<T, ...>
1321  size_t LID = NDId.get_local_id(0);
1322  for (size_t E = 0; E < NElements; ++E) {
1323  auto &RedElem = *getReducerAccess(Reducer).getElement(E);
1324  RedElem = reduce_over_group(Group, RedElem, BOp);
1325  if (LID == 0) {
1326  if (NWorkGroups == 1) {
1327  // Can avoid using partial sum and write the final result
1328  // immediately.
1329  if (IsUpdateOfUserVar)
1330  RedElem = BOp(RedElem, Out[E]);
1331  Out[E] = RedElem;
1332  } else {
1333  PartialSums[NDId.get_group_linear_id() * NElements + E] =
1334  *getReducerAccess(Reducer).getElement(E);
1335  }
1336  }
1337  }
1338 
1339  if (NWorkGroups == 1)
1340  // We're done.
1341  return;
1342 
1343  // Signal this work-group has finished after all values are reduced. We
1344  // had an implicit work-group barrier in reduce_over_group and all the
1345  // work since has been done in (LID == 0) work-item, so no extra sync is
1346  // needed.
1347  if (LID == 0) {
1348  auto NFinished =
1349  sycl::atomic_ref<int, memory_order::relaxed, memory_scope::device,
1350  access::address_space::global_space>(
1351  NWorkGroupsFinished[0]);
1352  DoReducePartialSumsInLastWG[0] =
1353  ++NFinished == static_cast<int>(NWorkGroups);
1354  }
1355 
1356  workGroupBarrier();
1357  if (DoReducePartialSumsInLastWG[0]) {
1358  // Reduce each result separately
1359  // TODO: Opportunity to parallelize across elements.
1360  for (size_t E = 0; E < NElements; ++E) {
1361  auto LocalSum = getReducerAccess(Reducer).getIdentity();
1362  for (size_t I = LID; I < NWorkGroups; I += WGSize)
1363  LocalSum = BOp(LocalSum, PartialSums[I * NElements + E]);
1364  auto Result = reduce_over_group(Group, LocalSum, BOp);
1365 
1366  if (LID == 0) {
1367  if (IsUpdateOfUserVar)
1368  Result = BOp(Result, Out[E]);
1369  Out[E] = Result;
1370  }
1371  }
1372  }
1373  });
1374  };
1375 
1376  auto device = getDeviceFromHandler(CGH);
1377  // Integrated and discrete GPUs have different fast paths. For discrete
1378  // GPUs the fast path requires USM device allocations, so check for that
1379  // as well.
1380  if (device.get_info<info::device::host_unified_memory>() ||
1381  !device.has(aspect::usm_device_allocations))
1382  Rest(Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH));
1383  else
1384  Rest(Redu.getGroupsCounterAccDiscrete(CGH));
1385  }
1386 };
1387 
1388 /// Computes the greatest power-of-two less than or equal to N.
1389 inline size_t GreatestPowerOfTwo(size_t N) {
1390  if (N == 0)
1391  return 0;
1392 
1393  size_t Ret = 1;
1394  while ((N >>= 1) != 0)
1395  Ret <<= 1;
1396  return Ret;
1397 }
1398 
1399 template <typename FuncTy>
1400 void doTreeReductionHelper(size_t WorkSize, size_t LID, FuncTy Func) {
1401  workGroupBarrier();
1402 
1403  // Initial pivot is the greatest power-of-two value smaller or equal to the
1404  // work size.
1405  size_t Pivot = GreatestPowerOfTwo(WorkSize);
1406 
1407  // If the pivot is not the same as the work size, it needs to do an initial
1408  // reduction where we only reduce the N last elements into the first N
1409  // elements, where N is WorkSize - Pivot.
1410  // 0 Pivot WorkSize Power of two
1411  // | | | |
1412  // +-----------------------+------+------------------------+------+
1413  // |
1414  // WorkSize - Pivot
1415  if (Pivot != WorkSize) {
1416  if (Pivot + LID < WorkSize)
1417  Func(LID, Pivot + LID);
1418  workGroupBarrier();
1419  }
1420 
1421  // Now the amount of work must be power-of-two, so do the tree reduction.
1422  for (size_t CurPivot = Pivot >> 1; CurPivot > 0; CurPivot >>= 1) {
1423  if (LID < CurPivot)
1424  Func(LID, CurPivot + LID);
1425  workGroupBarrier();
1426  }
1427 }
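A worked trace for a non-power-of-two work size, following the code above:

 // WorkSize = 6: Pivot = GreatestPowerOfTwo(6) = 4.
 // Pre-step (Pivot != WorkSize): Func(0, 4) and Func(1, 5) fold the last
 // two elements into the first two, then a barrier.
 // Tree phase: CurPivot = 2 -> Func(0, 2), Func(1, 3); barrier.
 //             CurPivot = 1 -> Func(0, 1); barrier.
 // Element 0 now holds the combination of all six elements.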
1428 
1429 // Enum for specifying work size guarantees in tree-reduction.
1430 enum class WorkSizeGuarantees { None, Equal, LessOrEqual };
1431 
1432 template <WorkSizeGuarantees WSGuarantee, int Dim, typename LocalRedsTy,
1433  typename BinOpTy, typename AccessFuncTy>
1434 void doTreeReduction(size_t WorkSize, nd_item<Dim> NDIt, LocalRedsTy &LocalReds,
1435  BinOpTy &BOp, AccessFuncTy AccessFunc) {
1436  size_t LID = NDIt.get_local_linear_id();
1437  size_t AdjustedWorkSize;
1438  if constexpr (WSGuarantee == WorkSizeGuarantees::LessOrEqual ||
1439  WSGuarantee == WorkSizeGuarantees::Equal) {
1440  // If there is less-or-equal number of items and amount of work, we just
1441  // load the work into the local memory and start reducing. If we know it is
1442  // equal we can let the optimizer remove the check.
1443  if (WSGuarantee == WorkSizeGuarantees::Equal || LID < WorkSize)
1444  LocalReds[LID] = AccessFunc(LID);
1445  AdjustedWorkSize = WorkSize;
1446  } else {
1447  // Otherwise we have no guarantee and we need to first reduce the amount of
1448  // work to fit into the local memory.
1449  size_t WGSize = NDIt.get_local_range().size();
1450  AdjustedWorkSize = std::min(WorkSize, WGSize);
1451  if (LID < AdjustedWorkSize) {
1452  auto LocalSum = AccessFunc(LID);
1453  for (size_t I = LID + WGSize; I < WorkSize; I += WGSize)
1454  LocalSum = BOp(LocalSum, AccessFunc(I));
1455 
1456  LocalReds[LID] = LocalSum;
1457  }
1458  }
1459  doTreeReductionHelper(AdjustedWorkSize, LID, [&](size_t I, size_t J) {
1460  LocalReds[I] = BOp(LocalReds[I], LocalReds[J]);
1461  });
1462 }
1463 
1464 // Tree-reduction over tuples of accessors. This assumes that WorkSize is
1465 // less than or equal to the work-group size.
1466 // TODO: For variadics/tuples we don't provide such a high-level abstraction as
1467 // for the scalar case above. Is there some C++ magic to level them?
1468 template <typename... LocalAccT, typename... BOPsT, size_t... Is>
1469 void doTreeReductionOnTuple(size_t WorkSize, size_t LID,
1470  ReduTupleT<LocalAccT...> &LocalAccs,
1471  ReduTupleT<BOPsT...> &BOPs,
1472  std::index_sequence<Is...>) {
1473  doTreeReductionHelper(WorkSize, LID, [&](size_t I, size_t J) {
1474  auto ProcessOne = [=](auto &LocalAcc, auto &BOp) {
1475  LocalAcc[I] = BOp(LocalAcc[I], LocalAcc[J]);
1476  };
1477  (ProcessOne(std::get<Is>(LocalAccs), std::get<Is>(BOPs)), ...);
1478  });
1479 }
1480 
1481 template <> struct NDRangeReduction<reduction::strategy::range_basic> {
1482  template <typename KernelName, int Dims, typename PropertiesT,
1483  typename KernelType, typename Reduction>
1484  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1485  nd_range<Dims> NDRange, PropertiesT &Properties,
1486  Reduction &Redu, KernelType &KernelFunc) {
1487  using reducer_type = typename Reduction::reducer_type;
1488  using element_type = typename ReducerTraits<reducer_type>::element_type;
1489 
1490  // If reduction has an identity and is not USM, the reducer element is just
1491  // a thin wrapper around the result type so the partial sum will use the
1492  // output memory iff NWorkGroups == 1. Otherwise, we need to make sure the
1493  // right output buffer is written in case NWorkGroups == 1.
1494  constexpr bool UsePartialSumForOutput =
1495  !Reduction::is_usm && Reduction::has_identity;
1496 
1497  std::ignore = Queue;
1498  size_t NElements = Reduction::num_elements;
1499  size_t WGSize = NDRange.get_local_range().size();
1500  size_t NWorkGroups = NDRange.get_group_range().size();
1501 
1502  bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
1503  auto PartialSums =
1504  Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);
1505  auto Out = [&]() {
1506  if constexpr (UsePartialSumForOutput)
1507  return (NWorkGroups == 1)
1508  ? PartialSums
1509  : Redu.getWriteAccForPartialReds(NElements, CGH);
1510  else
1511  return Redu.getUserRedVarAccess(CGH);
1512  }();
1513  local_accessor<element_type, 1> LocalReds{WGSize, CGH};
1514  auto NWorkGroupsFinished =
1515  Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH);
1516  local_accessor<int, 1> DoReducePartialSumsInLastWG{1, CGH};
1517 
1518  auto IdentityContainer = Redu.getIdentityContainer();
1519  auto BOp = Redu.getBinaryOperation();
1520 
1521  using Name = __sycl_reduction_kernel<reduction::MainKrn, KernelName,
1522  reduction::strategy::range_basic>;
1523 
1524  CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<1> NDId) {
1525  // Call user's functions. Reducer.MValue gets initialized there.
1526  reducer_type Reducer = reducer_type(IdentityContainer, BOp);
1527  KernelFunc(NDId, Reducer);
1528 
1529  auto ElementCombiner = [&](element_type &LHS, const element_type &RHS) {
1530  return LHS.combine(BOp, RHS);
1531  };
1532 
1533  // If there are multiple values, reduce each separately
1534  // This prevents local memory from scaling with elements
1535  size_t LID = NDId.get_local_linear_id();
1536  for (size_t E = 0; E < NElements; ++E) {
1537 
1538  doTreeReduction<WorkSizeGuarantees::Equal>(
1539  WGSize, NDId, LocalReds, ElementCombiner,
1540  [&](size_t) { return getReducerAccess(Reducer).getElement(E); });
1541 
1542  if (LID == 0) {
1543  auto V = LocalReds[0];
1544 
1545  bool IsOneWG = NWorkGroups == 1;
1546  if (IsOneWG && IsUpdateOfUserVar)
1547  V.combine(BOp, Out[E]);
1548 
1549  // if NWorkGroups == 1 && UsePartialSumForOutput, then PartialsSum
1550  // and Out point to same memory.
1551  if (UsePartialSumForOutput || !IsOneWG)
1552  PartialSums[NDId.get_group_linear_id() * NElements + E] = V;
1553  else if (V)
1554  Out[E] = *V;
1555  }
1556  }
1557 
1558  // Signal this work-group has finished after all values are reduced. We
1559  // had an implicit work-group barrier in doTreeReduction and all the
1560  // work since has been done in (LID == 0) work-item, so no extra sync is
1561  // needed.
1562  if (LID == 0) {
1563  auto NFinished =
1564  sycl::atomic_ref<int, memory_order::relaxed, memory_scope::device,
1565  access::address_space::global_space>(
1566  NWorkGroupsFinished[0]);
1567  DoReducePartialSumsInLastWG[0] =
1568  ++NFinished == NWorkGroups && NWorkGroups > 1;
1569  }
1570 
1571  workGroupBarrier();
1572  if (DoReducePartialSumsInLastWG[0]) {
1573  // Reduce each result separately
1574  // TODO: Opportunity to parallelize across elements
1575  for (size_t E = 0; E < NElements; ++E) {
1576  doTreeReduction<WorkSizeGuarantees::None>(
1577  NWorkGroups, NDId, LocalReds, ElementCombiner,
1578  [&](size_t I) { return PartialSums[I * NElements + E]; });
1579  if (LID == 0) {
1580  auto V = LocalReds[0];
1581  if (IsUpdateOfUserVar)
1582  V.combine(BOp, Out[E]);
1583  Out[E] = *V;
1584  }
1585  }
1586  }
1587  });
1588  }
1589 };
1590 
1591 template <>
1592 struct NDRangeReduction<reduction::strategy::group_reduce_and_atomic_cross_wg> {
1593  template <typename KernelName, int Dims, typename PropertiesT,
1594  typename KernelType, typename Reduction>
1595  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1596  nd_range<Dims> NDRange, PropertiesT &Properties,
1597  Reduction &Redu, KernelType &KernelFunc) {
1598  static_assert(Reduction::has_identity,
1599  "Identityless reductions are not supported by the "
1600  "group_reduce_and_atomic_cross_wg strategy.");
1601 
1602  std::ignore = Queue;
1603  using Name = __sycl_reduction_kernel<
1604  reduction::MainKrn, KernelName,
1605  reduction::strategy::group_reduce_and_atomic_cross_wg>;
1606  Redu.template withInitializedMem<Name>(CGH, [&](auto Out) {
1607  size_t NElements = Reduction::num_elements;
1608 
1609  CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<Dims> NDIt) {
1610  // Call user's function. Reducer.MValue gets initialized there.
1611  typename Reduction::reducer_type Reducer;
1612  KernelFunc(NDIt, Reducer);
1613 
1614  typename Reduction::binary_operation BOp;
1615  for (size_t E = 0; E < NElements; ++E) {
1616  auto &ReducerElem = getReducerAccess(Reducer).getElement(E);
1617  *ReducerElem = reduce_over_group(NDIt.get_group(), *ReducerElem, BOp);
1618  }
1619  if (NDIt.get_local_linear_id() == 0)
1620  Reducer.atomic_combine(&Out[0]);
1621  });
1622  });
1623  }
1624 };
1625 
1626 template <>
1627 struct NDRangeReduction<
1628  reduction::strategy::local_mem_tree_and_atomic_cross_wg> {
1629  template <typename KernelName, int Dims, typename PropertiesT,
1630  typename KernelType, typename Reduction>
1631  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1632  nd_range<Dims> NDRange, PropertiesT &Properties,
1633  Reduction &Redu, KernelType &KernelFunc) {
1634  using reducer_type = typename Reduction::reducer_type;
1635  using element_type = typename ReducerTraits<reducer_type>::element_type;
1636 
1637  std::ignore = Queue;
1638  using Name = __sycl_reduction_kernel<
1639  reduction::MainKrn, KernelName,
1640  reduction::strategy::local_mem_tree_and_atomic_cross_wg>;
1641  Redu.template withInitializedMem<Name>(CGH, [&](auto Out) {
1642  size_t NElements = Reduction::num_elements;
1643  size_t WGSize = NDRange.get_local_range().size();
1644 
1645  // Use local memory to reduce elements in work-groups into zero-th
1646  // element.
1647  local_accessor<element_type, 1> LocalReds{WGSize, CGH};
1648 
1649  CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<Dims> NDIt) {
1650  // Call user's functions. Reducer.MValue gets initialized there.
1651  reducer_type Reducer;
1652  KernelFunc(NDIt, Reducer);
1653 
1654  size_t WGSize = NDIt.get_local_range().size();
1655  size_t LID = NDIt.get_local_linear_id();
1656 
1657  typename Reduction::binary_operation BOp;
1658  auto ElementCombiner = [&](element_type &LHS, const element_type &RHS) {
1659  return LHS.combine(BOp, RHS);
1660  };
1661 
1662  // If there are multiple values, reduce each separately
1663  // This prevents local memory from scaling with elements
1664  for (size_t E = 0; E < NElements; ++E) {
1665 
1666  doTreeReduction<WorkSizeGuarantees::Equal>(
1667  WGSize, NDIt, LocalReds, ElementCombiner,
1668  [&](size_t) { return getReducerAccess(Reducer).getElement(E); });
1669 
1670  if (LID == 0)
1671  getReducerAccess(Reducer).getElement(E) = LocalReds[0];
1672 
1673  // Ensure item 0 is finished with LocalReds before next iteration
1674  if (E != NElements - 1) {
1675  NDIt.barrier();
1676  }
1677  }
1678 
1679  if (LID == 0) {
1680  Reducer.atomic_combine(&Out[0]);
1681  }
1682  });
1683  });
1684  }
1685 };
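// A minimal sketch (editor's annotation) of the local-memory tree reduction
// that doTreeReduction performs above, assuming a power-of-two work-group
// size; the real helper also handles non-power-of-two and partial sizes:
//
//   for (size_t Step = WGSize / 2; Step > 0; Step /= 2) {
//     if (LID < Step)
//       ElementCombiner(LocalReds[LID], LocalReds[LID + Step]);
//     NDIt.barrier(sycl::access::fence_space::local_space);
//   }
//   // LocalReds[0] now holds the work-group's combined value.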
1686 
1687 template <>
1688 struct NDRangeReduction<
1689  reduction::strategy::group_reduce_and_multiple_kernels> {
1690  template <typename KernelName, int Dims, typename PropertiesT,
1691  typename KernelType, typename Reduction>
1692  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1693  nd_range<Dims> NDRange, PropertiesT &Properties,
1694  Reduction &Redu, KernelType &KernelFunc) {
1695  static_assert(Reduction::has_identity,
1696  "Identityless reductions are not supported by the "
1697  "group_reduce_and_multiple_kernels strategy.");
1698 
1699  // Before running the kernels, check that the device has enough local memory
1700  // to hold the local arrays that may be required for the reduction algorithm.
1701  // TODO: If the work-group-size is limited by the local memory, then
1702  // a special version of the main kernel may be created: one that would
1703  // not use local accessors, which means it would not do the reduction in
1704  // the main kernel, but simply generate Range.get_global_range().size() number
1705  // of partial sums, leaving the reduction work to the additional/aux
1706  // kernels.
1707  constexpr bool HFR = Reduction::has_fast_reduce;
1708  size_t OneElemSize = HFR ? 0 : sizeof(typename Reduction::result_type);
1709  // TODO: currently the maximal work group size is determined for the given
1710  // queue/device, while it may be safer to use queries to the kernel compiled
1711  // for the device.
1712  size_t MaxWGSize = reduGetMaxWGSize(Queue, OneElemSize);
1713  if (NDRange.get_local_range().size() > MaxWGSize)
1715  "The implementation handling parallel_for with"
1716  " reduction requires work group size not bigger"
1717  " than " +
1718  std::to_string(MaxWGSize));
1719 
1720  size_t NElements = Reduction::num_elements;
1721  size_t NWorkGroups = NDRange.get_group_range().size();
1722  auto Out = Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);
1723 
1724  bool IsUpdateOfUserVar =
1725  !Reduction::is_usm && !Redu.initializeToIdentity() && NWorkGroups == 1;
1726 
1727  using Name = __sycl_reduction_kernel<
1728  reduction::MainKrn, KernelName,
1729  reduction::strategy::group_reduce_and_multiple_kernels>;
1730 
1731  CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<Dims> NDIt) {
1732  // Call user's functions. Reducer.MValue gets initialized there.
1733  typename Reduction::reducer_type Reducer;
1734  KernelFunc(NDIt, Reducer);
1735 
1736  // Compute the partial sum/reduction for the work-group.
1737  size_t WGID = NDIt.get_group_linear_id();
1738  typename Reduction::binary_operation BOp;
1739  for (size_t E = 0; E < NElements; ++E) {
1740  typename Reduction::result_type PSum;
1741  PSum = *getReducerAccess(Reducer).getElement(E);
1742  PSum = reduce_over_group(NDIt.get_group(), PSum, BOp);
1743  if (NDIt.get_local_linear_id() == 0) {
1744  if (IsUpdateOfUserVar)
1745  PSum = BOp(*Out[E], PSum);
1746  Out[WGID * NElements + E] = PSum;
1747  }
1748  }
1749  });
1750 
1752 
1753  // Run the additional kernel as many times as needed to reduce all partial
1754  // sums into one scalar.
1755 
1756  // TODO: Create a special slow/sequential version of the kernel that would
1757  // handle the reduction instead of reporting an assert below.
1758  if (MaxWGSize <= 1)
1759  throw sycl::exception(make_error_code(errc::nd_range),
1760  "The implementation handling parallel_for with "
1761  "reduction requires the maximal work group "
1762  "size to be greater than 1 to converge. "
1763  "The maximal work group size depends on the "
1764  "device and the size of the objects passed to "
1765  "the reduction.");
1766  size_t NWorkItems = NDRange.get_group_range().size();
1767  while (NWorkItems > 1) {
1768  reduction::withAuxHandler(CGH, [&](handler &AuxHandler) {
1769  size_t NElements = Reduction::num_elements;
1770  size_t NWorkGroups;
1771  size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups);
1772 
1773  // The last work-group may not be fully loaded with work, and the
1774  // work-group size may not be a power of two. Both cases are considered
1775  // inefficient as they require additional code and checks in the kernel.
1776  bool HasUniformWG = NWorkGroups * WGSize == NWorkItems;
1777  if (!Reduction::has_fast_reduce)
1778  HasUniformWG = HasUniformWG && (WGSize & (WGSize - 1)) == 0;
1779 
1780  // Get read accessor to the buffer that was used as output
1781  // in the previous kernel.
1782  auto In = Redu.getReadAccToPreviousPartialReds(AuxHandler);
1783  auto Out =
1784  Redu.getWriteAccForPartialReds(NWorkGroups * NElements, AuxHandler);
1785 
1786  using Name = __sycl_reduction_kernel<
1787  reduction::AuxKrn, KernelName,
1788  reduction::strategy::group_reduce_and_multiple_kernels>;
1789 
1790  bool IsUpdateOfUserVar = !Reduction::is_usm &&
1791  !Redu.initializeToIdentity() &&
1792  NWorkGroups == 1;
1793  range<1> GlobalRange = {HasUniformWG ? NWorkItems
1794  : NWorkGroups * WGSize};
1795  nd_range<1> Range{GlobalRange, range<1>(WGSize)};
1796  AuxHandler.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
1797  typename Reduction::binary_operation BOp;
1798  size_t WGID = NDIt.get_group_linear_id();
1799  size_t GID = NDIt.get_global_linear_id();
1800 
1801  for (size_t E = 0; E < NElements; ++E) {
1802  typename Reduction::result_type PSum =
1803  (HasUniformWG || (GID < NWorkItems))
1804  ? *In[GID * NElements + E]
1805  : ReducerAccess<typename Reduction::reducer_type>::
1806  getIdentityStatic();
1807  PSum = reduce_over_group(NDIt.get_group(), PSum, BOp);
1808  if (NDIt.get_local_linear_id() == 0) {
1809  if (IsUpdateOfUserVar)
1810  PSum = BOp(*Out[E], PSum);
1811  Out[WGID * NElements + E] = PSum;
1812  }
1813  }
1814  });
1815  NWorkItems = NWorkGroups;
1816  });
1817  } // end while (NWorkItems > 1)
1818 
1819  if constexpr (Reduction::is_usm) {
1820  reduction::withAuxHandler(CGH, [&](handler &CopyHandler) {
1821  reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
1822  });
1823  }
1824  }
1825 };
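// Worked example (editor's annotation, illustrative numbers): with 4096
// partial sums left by the main kernel and MaxWGSize == 256, the first
// auxiliary pass reduces them using 16 work-groups (4096 / 256), the second
// pass reduces the resulting 16 values with a single group, and the loop
// exits with NWorkItems == 1 after ceil(log(4096) / log(256)) == 2 passes.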
1826 
1827 template <> struct NDRangeReduction<reduction::strategy::basic> {
1828  template <typename KernelName, int Dims, typename PropertiesT,
1829  typename KernelType, typename Reduction>
1830  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1831  nd_range<Dims> NDRange, PropertiesT &Properties,
1832  Reduction &Redu, KernelType &KernelFunc) {
1833  using element_type = typename Reduction::reducer_element_type;
1834 
1835  constexpr bool HFR = Reduction::has_fast_reduce;
1836  size_t OneElemSize = HFR ? 0 : sizeof(element_type);
1837  // TODO: currently the maximal work group size is determined for the given
1838  // queue/device, while it may be safer to use queries to the kernel
1839  // compiled for the device.
1840  size_t MaxWGSize = reduGetMaxWGSize(Queue, OneElemSize);
1841  if (NDRange.get_local_range().size() > MaxWGSize)
1842  throw sycl::exception(make_error_code(errc::nd_range),
1843  "The implementation handling parallel_for with"
1844  " reduction requires work group size not bigger"
1845  " than " +
1846  std::to_string(MaxWGSize));
1847 
1848  size_t NWorkGroups = NDRange.get_group_range().size();
1849 
1850  bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
1851  std::ignore = IsUpdateOfUserVar;
1852 
1853  // The type of the Out "accessor" differs between scenarios when there is
1854  // just one WorkGroup and when there are multiple. Use this lambda to write
1855  // the code just once.
1856  auto First = [&](auto KernelTag) {
1857  // We can deduce IsOneWG from the tag type.
1858  constexpr bool IsOneWG =
1859  std::is_same_v<std::remove_reference_t<decltype(KernelTag)>,
1860  KernelOneWGTag>;
1861 
1862  constexpr size_t NElements = Reduction::num_elements;
1863 
1864  size_t WGSize = NDRange.get_local_range().size();
1865 
1866  auto Out = [&]() {
1867  if constexpr (IsOneWG)
1868  return Redu.getUserRedVarAccess(CGH);
1869  else
1870  return Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);
1871  }();
1872 
1873  // Use local memory to reduce elements in work-groups into 0-th element.
1874  local_accessor<element_type, 1> LocalReds{WGSize, CGH};
1875 
1876  auto BOp = Redu.getBinaryOperation();
1877  auto IdentityContainer = Redu.getIdentityContainer();
1878 
1879  using Name = __sycl_reduction_kernel<reduction::MainKrn, KernelName,
1880  reduction::strategy::basic,
1881  decltype(KernelTag)>;
1882 
1883  CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<Dims> NDIt) {
1884  // Call user's functions. Reducer.MValue gets initialized there.
1885  typename Reduction::reducer_type Reducer =
1886  typename Reduction::reducer_type(IdentityContainer, BOp);
1887  KernelFunc(NDIt, Reducer);
1888 
1889  size_t WGSize = NDIt.get_local_range().size();
1890  size_t LID = NDIt.get_local_linear_id();
1891 
1892  auto ElementCombiner = [&](element_type &LHS, const element_type &RHS) {
1893  return LHS.combine(BOp, RHS);
1894  };
1895 
1896  // If there are multiple values, reduce each separately
1897  // This prevents local memory from scaling with elements
1898  for (size_t E = 0; E < NElements; ++E) {
1899 
1900  doTreeReduction<WorkSizeGuarantees::Equal>(
1901  WGSize, NDIt, LocalReds, ElementCombiner,
1902  [&](size_t) { return getReducerAccess(Reducer).getElement(E); });
1903 
1904  // Compute the partial sum/reduction for the work-group.
1905  if (LID == 0) {
1906  element_type PSum = LocalReds[0];
1907  if constexpr (IsOneWG) {
1908  if (IsUpdateOfUserVar)
1909  PSum.combine(BOp, Out[E]);
1910  Out[E] = *PSum;
1911  } else {
1912  size_t GrID = NDIt.get_group_linear_id();
1913  Out[GrID * NElements + E] = PSum;
1914  }
1915  }
1916 
1917  // Ensure item 0 is finished with LocalReds before next iteration
1918  if (E != NElements - 1) {
1919  NDIt.barrier();
1920  }
1921  }
1922  });
1923  };
1924 
1925  if (NWorkGroups == 1)
1926  First(KernelOneWGTag{});
1927  else
1928  First(KernelMultipleWGTag{});
1929 
1931 
1932  // 2. Run the additional kernel as many times as needed to reduce
1933  // all partial sums into one scalar.
1934 
1935  // TODO: Create a special slow/sequential version of the kernel that would
1936  // handle the reduction instead of reporting an assert below.
1937  if (MaxWGSize <= 1)
1938  throw sycl::exception(make_error_code(errc::nd_range),
1939  "The implementation handling parallel_for with "
1940  "reduction requires the maximal work group "
1941  "size to be greater than 1 to converge. "
1942  "The maximal work group size depends on the "
1943  "device and the size of the objects passed to "
1944  "the reduction.");
1945  size_t NWorkItems = NDRange.get_group_range().size();
1946  while (NWorkItems > 1) {
1947  size_t NWorkGroups;
1948  size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups);
1949 
1950  auto Rest = [&](auto KernelTag) {
1951  reduction::withAuxHandler(CGH, [&](handler &AuxHandler) {
1952  // We can deduce IsOneWG from the tag type.
1953  constexpr bool IsOneWG =
1954  std::is_same_v<std::remove_reference_t<decltype(KernelTag)>,
1955  KernelOneWGTag>;
1956 
1957  constexpr size_t NElements = Reduction::num_elements;
1958 
1959  // The last work-group may not be fully loaded with work, and the
1960  // work-group size may not be a power of two. Both cases are considered
1961  // inefficient as they require additional code and checks in the
1962  // kernel.
1963  bool HasUniformWG = NWorkGroups * WGSize == NWorkItems;
1964 
1965  // Get read accessor to the buffer that was used as output
1966  // in the previous kernel.
1967  auto In = Redu.getReadAccToPreviousPartialReds(AuxHandler);
1968 
1969  auto Out = [&]() {
1970  if constexpr (IsOneWG)
1971  return Redu.getUserRedVarAccess(AuxHandler);
1972  else
1973  return Redu.getWriteAccForPartialReds(NWorkGroups * NElements,
1974  AuxHandler);
1975  }();
1976 
1977  bool UniformPow2WG = HasUniformWG && (WGSize & (WGSize - 1)) == 0;
1978  // Use local memory to reduce elements in work-groups into 0-th
1979  // element.
1980  local_accessor<element_type, 1> LocalReds{WGSize, AuxHandler};
1981 
1982  auto BOp = Redu.getBinaryOperation();
1983  using Name = __sycl_reduction_kernel<reduction::AuxKrn, KernelName,
1984  reduction::strategy::basic,
1985  decltype(KernelTag)>;
1986 
1987  range<1> GlobalRange = {UniformPow2WG ? NWorkItems
1988  : NWorkGroups * WGSize};
1989  nd_range<1> Range{GlobalRange, range<1>(WGSize)};
1990  AuxHandler.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
1991  size_t WGSize = NDIt.get_local_range().size();
1992  size_t LID = NDIt.get_local_linear_id();
1993  size_t GID = NDIt.get_global_linear_id();
1994  size_t GrID = NDIt.get_group_linear_id();
1995 
1996  // The last work-group may not have enough work for all its items.
1997  size_t RemainingWorkSize =
1998  sycl::min(WGSize, NWorkItems - GrID * WGSize);
1999 
2000  auto ElementCombiner = [&](element_type &LHS,
2001  const element_type &RHS) {
2002  return LHS.combine(BOp, RHS);
2003  };
2004 
2005  for (size_t E = 0; E < NElements; ++E) {
2006 
2007  doTreeReduction<WorkSizeGuarantees::LessOrEqual>(
2008  RemainingWorkSize, NDIt, LocalReds, ElementCombiner,
2009  [&](size_t) { return In[GID * NElements + E]; });
2010 
2011  // Compute the partial sum/reduction for the work-group.
2012  if (LID == 0) {
2013  element_type PSum = LocalReds[0];
2014  if constexpr (IsOneWG) {
2015  if (IsUpdateOfUserVar)
2016  PSum.combine(BOp, Out[E]);
2017  Out[E] = *PSum;
2018  } else {
2019  Out[GrID * NElements + E] = PSum;
2020  }
2021  }
2022 
2023  // Ensure item 0 is finished with LocalReds before next iteration
2024  if (E != NElements - 1) {
2025  NDIt.barrier();
2026  }
2027  }
2028  });
2029  NWorkItems = NWorkGroups;
2030  });
2031  };
2032 
2033  if (NWorkGroups == 1)
2034  Rest(KernelOneWGTag{});
2035  else
2036  Rest(KernelMultipleWGTag{});
2037  } // end while (NWorkItems > 1)
2038  }
2039 };
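// Worked example (editor's annotation, illustrative numbers) for the
// RemainingWorkSize computation in the auxiliary kernel above: with
// NWorkItems == 300 and WGSize == 128, groups 0 and 1 process 128 items
// each, while the last group processes min(128, 300 - 2 * 128) == 44, which
// is why its tree reduction runs under the LessOrEqual work-size guarantee.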
2040 
2041 /// For the given 'Reductions' types pack and indices enumerating them, this
2042 /// function either creates new temporary accessors for partial sums (if
2043 /// IsOneWG is false) or returns the user's accessor/USM-pointer (otherwise).
2044 template <bool IsOneWG, typename... Reductions, size_t... Is>
2045 auto createReduOutAccs(size_t NWorkGroups, handler &CGH,
2046  std::tuple<Reductions...> &ReduTuple,
2047  std::index_sequence<Is...>) {
2048  return makeReduTupleT(
2049  std::get<Is>(ReduTuple).template getWriteMemForPartialReds<IsOneWG>(
2050  NWorkGroups *
2051  std::tuple_element_t<Is, std::tuple<Reductions...>>::num_elements,
2052  CGH)...);
2053 }
2054 
2055 template <typename OutAccT, typename LocalAccT, typename BOPT,
2056  typename IdentityContainerT>
2057 auto getLastCombine(OutAccT OutAcc, LocalAccT LocalAcc, BOPT BOP,
2058  IdentityContainerT IdentityContainer,
2059  bool IsInitializeToIdentity) {
2060  if constexpr (!IdentityContainerT::has_identity) {
2061  return BOP(LocalAcc[0], OutAcc[0]);
2062  } else {
2063  return BOP(LocalAcc[0], IsInitializeToIdentity
2064  ? IdentityContainer.getIdentity()
2065  : OutAcc[0]);
2066  }
2067 }
2068 
2069 template <bool IsOneWG, typename... Reductions, typename... OutAccT,
2070  typename... LocalAccT, typename... BOPsT, typename... Ts,
2071  size_t... Is>
2072 void writeReduSumsToOutAccs(
2073  size_t OutAccIndex, ReduTupleT<OutAccT...> OutAccs,
2074  ReduTupleT<LocalAccT...> LocalAccs, ReduTupleT<BOPsT...> BOPs,
2075  ReduTupleT<Ts...> IdentityVals,
2076  std::array<bool, sizeof...(Reductions)> IsInitializeToIdentity,
2077  std::index_sequence<Is...>) {
2078  if constexpr (IsOneWG) {
2079  // Add the initial value of user's variable to the final result.
2080  // After this we know there will be a value in the 0th element.
2081  ((std::get<Is>(LocalAccs)[0] = getLastCombine(
2082  std::get<Is>(OutAccs), std::get<Is>(LocalAccs), std::get<Is>(BOPs),
2083  std::get<Is>(IdentityVals), IsInitializeToIdentity[Is])),
2084  ...);
2085  ((std::get<Is>(OutAccs)[OutAccIndex] = *std::get<Is>(LocalAccs)[0]), ...);
2086  } else {
2087  // The partial sums for the work-group are stored in 0-th elements of local
2088  // accessors. Simply write those sums to output accessors.
2089  ((std::get<Is>(OutAccs)[OutAccIndex] = std::get<Is>(LocalAccs)[0]), ...);
2090  }
2091 }
2092 
2093 // Concatenate an empty sequence.
2094 constexpr std::index_sequence<> concat_sequences(std::index_sequence<>) {
2095  return {};
2096 }
2097 
2098 // Concatenate a sequence consisting of 1 element.
2099 template <size_t I>
2100 constexpr std::index_sequence<I> concat_sequences(std::index_sequence<I>) {
2101  return {};
2102 }
2103 
2104 // Concatenate two potentially empty sequences.
2105 template <size_t... Is, size_t... Js>
2106 constexpr std::index_sequence<Is..., Js...>
2107 concat_sequences(std::index_sequence<Is...>, std::index_sequence<Js...>) {
2108  return {};
2109 }
2110 
2111 // Concatenate more than 2 sequences.
2112 template <size_t... Is, size_t... Js, class... Rs>
2113 constexpr auto concat_sequences(std::index_sequence<Is...>,
2114  std::index_sequence<Js...>, Rs...) {
2115  return concat_sequences(std::index_sequence<Is..., Js...>{}, Rs{}...);
2116 }
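// Example (editor's annotation): concatenation happens entirely at the type
// level, so the result can be checked with a static_assert.
//
//   static_assert(std::is_same_v<
//       decltype(concat_sequences(std::index_sequence<0, 1>{},
//                                 std::index_sequence<>{},
//                                 std::index_sequence<4>{})),
//       std::index_sequence<0, 1, 4>>);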
2117 
2118 struct IsNonUsmReductionPredicate {
2119  template <typename T> struct Func {
2120  static constexpr bool value = !std::remove_pointer_t<T>::is_usm;
2121  };
2122 };
2123 
2124 struct EmptyReductionPredicate {
2125  template <typename T> struct Func {
2126  static constexpr bool value = false;
2127  };
2128 };
2129 
2130 template <bool Cond, size_t I> struct FilterElement {
2131  using type =
2132  std::conditional_t<Cond, std::index_sequence<I>, std::index_sequence<>>;
2133 };
2134 
2135 /// For each index 'I' from the given indices pack 'Is' this function initially
2136 /// creates a number of short index_sequences, where each of such short
2137 /// index sequences is either empty (if the given Functor returns false for the
2138 /// type T[I]) or 1 element 'I' (otherwise). After that this function
2139 /// concatenates those short sequences into one and returns the result sequence.
2140 template <typename... T, typename FunctorT, size_t... Is,
2141  std::enable_if_t<(sizeof...(Is) > 0), int> Z = 0>
2142 constexpr auto filterSequenceHelper(FunctorT, std::index_sequence<Is...>) {
2143  return concat_sequences(
2144  typename FilterElement<FunctorT::template Func<std::tuple_element_t<
2145  Is, std::tuple<T...>>>::value,
2146  Is>::type{}...);
2147 }
2148 template <typename... T, typename FunctorT, size_t... Is,
2149  std::enable_if_t<(sizeof...(Is) == 0), int> Z = 0>
2150 constexpr auto filterSequenceHelper(FunctorT, std::index_sequence<Is...>) {
2151  return std::index_sequence<>{};
2152 }
2153 
2154 /// Filters the given index sequence, keeping only those indices 'I' for
2155 /// which the given Functor returns true when instantiated with the
2156 /// corresponding type 'T[I]' from the parameter pack.
2157 template <typename... T, typename FunctorT, size_t... Is>
2158 constexpr auto filterSequence(FunctorT F, std::index_sequence<Is...> Indices) {
2159  return filterSequenceHelper<T...>(F, Indices);
2160 }
2161 
2162 struct IsScalarReduction {
2163  template <typename Reduction> struct Func {
2164  static constexpr bool value =
2165  (Reduction::dims == 0 && Reduction::num_elements == 1);
2166  };
2167 };
2168 
2169 struct IsArrayReduction {
2170  template <typename Reduction> struct Func {
2171  static constexpr bool value =
2172  (Reduction::dims == 1 && Reduction::num_elements >= 1);
2173  };
2174 };
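// Example (editor's annotation) of the predicates above combined with
// filterSequence; R0 and R1 are hypothetical stand-ins for reduction types:
//
//   struct R0 { static constexpr int dims = 0;
//               static constexpr size_t num_elements = 1; }; // scalar
//   struct R1 { static constexpr int dims = 1;
//               static constexpr size_t num_elements = 4; }; // array
//   constexpr auto ArrayIs =
//       filterSequence<R0, R1>(IsArrayReduction{}, std::index_sequence<0, 1>{});
//   static_assert(std::is_same_v<std::remove_const_t<decltype(ArrayIs)>,
//                                std::index_sequence<1>>);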
2175 
2176 template <typename ElementType, typename BOPT>
2177 constexpr auto makeAdjustedBOP(BOPT &BOP) {
2178  return [&](ElementType &LHS, const ElementType &RHS) {
2179  return LHS.combine(BOP, RHS);
2180  };
2181 }
2182 
2183 template <typename... Reductions, typename... BOPsT, size_t... Is>
2184 constexpr auto makeAdjustedBOPs(ReduTupleT<BOPsT...> &BOPsTuple,
2185  std::index_sequence<Is...>) {
2186  return makeReduTupleT(
2187  makeAdjustedBOP<typename std::tuple_element_t<
2188  Is, std::tuple<Reductions...>>::reducer_element_type>(
2189  std::get<Is>(BOPsTuple))...);
2190 }
2191 
2192 template <typename... Reductions, typename... BOPsT>
2193 constexpr auto makeAdjustedBOPs(ReduTupleT<BOPsT...> &BOPsTuple) {
2194  return makeAdjustedBOPs<Reductions...>(
2195  BOPsTuple, std::make_index_sequence<sizeof...(Reductions)>{});
2196 }
2197 
2198 /// All scalar reductions are processed together; there is one loop of log2(N)
2199 /// steps, and each reduction uses its own storage.
2200 template <bool IsOneWG, typename... Reductions, int Dims, typename... LocalAccT,
2201  typename... OutAccT, typename... ReducerT, typename... Ts,
2202  typename... BOPsT, size_t... Is>
2203 void reduCGFuncImplScalar(
2204  nd_item<Dims> NDIt, ReduTupleT<LocalAccT...> LocalAccsTuple,
2205  ReduTupleT<OutAccT...> OutAccsTuple, std::tuple<ReducerT...> &ReducersTuple,
2206  ReduTupleT<Ts...> IdentitiesTuple, ReduTupleT<BOPsT...> BOPsTuple,
2207  std::array<bool, sizeof...(Reductions)> InitToIdentityProps,
2208  std::index_sequence<Is...> ReduIndices) {
2209  size_t WGSize = NDIt.get_local_range().size();
2210  size_t LID = NDIt.get_local_linear_id();
2211 
2212  ((std::get<Is>(LocalAccsTuple)[LID] =
2213  getReducerAccess(std::get<Is>(ReducersTuple)).getElement(0)),
2214  ...);
2215 
2216  // We apply tree-reduction on reducer elements so we adjust the operations
2217  // to combine these.
2218  auto AdjustedBOPsTuple = makeAdjustedBOPs<Reductions...>(BOPsTuple);
2219 
2220  doTreeReductionOnTuple(WGSize, LID, LocalAccsTuple, AdjustedBOPsTuple,
2221  ReduIndices);
2222 
2223  // Compute the partial sum/reduction for the work-group.
2224  if (LID == 0) {
2225  size_t GrID = NDIt.get_group_linear_id();
2226  writeReduSumsToOutAccs<IsOneWG, Reductions...>(
2227  GrID, OutAccsTuple, LocalAccsTuple, AdjustedBOPsTuple, IdentitiesTuple,
2228  InitToIdentityProps, ReduIndices);
2229  }
2230 }
2231 
2232 /// Each array reduction is processed separately.
2233 template <bool IsOneWG, typename Reduction, int Dims, typename LocalAccT,
2234  typename OutAccT, typename ReducerT, typename BOPT>
2235 void reduCGFuncImplArrayHelper(nd_item<Dims> NDIt, LocalAccT LocalReds,
2236  OutAccT Out, ReducerT &Reducer, BOPT BOp,
2237  bool IsInitializeToIdentity) {
2238  using element_type = typename Reduction::reducer_element_type;
2239 
2240  size_t WGSize = NDIt.get_local_range().size();
2241  size_t LID = NDIt.get_local_linear_id();
2242 
2243  auto ElementCombiner = [&](element_type &LHS, const element_type &RHS) {
2244  return LHS.combine(BOp, RHS);
2245  };
2246 
2247  // If there are multiple values, reduce each separately
2248  // This prevents local memory from scaling with elements
2249  auto NElements = Reduction::num_elements;
2250  for (size_t E = 0; E < NElements; ++E) {
2251  doTreeReduction<WorkSizeGuarantees::Equal>(
2252  WGSize, NDIt, LocalReds, ElementCombiner,
2253  [&](size_t) { return getReducerAccess(Reducer).getElement(E); });
2254 
2255  // Add the initial value of user's variable to the final result.
2256  if (LID == 0) {
2257  size_t GrID = NDIt.get_group_linear_id();
2258  size_t OutIdx = GrID * NElements + E;
2259  if constexpr (IsOneWG) {
2260  // If there is only a single work-group, the output will be an actual
2261  // value rather than a potentially optional value.
2262  if constexpr (Reduction::has_identity) {
2263  Out[OutIdx] = *ElementCombiner(LocalReds[0], IsInitializeToIdentity
2264  ? Reducer.identity()
2265  : Out[E]);
2266  } else {
2267  Out[OutIdx] = *LocalReds[0];
2268  }
2269  } else {
2270  // Otherwise we propagate a potentially optional value.
2271  Out[OutIdx] = LocalReds[0];
2272  }
2273  }
2274 
2275  // Ensure item 0 is finished with LocalReds before next iteration
2276  if (E != NElements - 1) {
2277  NDIt.barrier();
2278  }
2279  }
2280 }
2281 
2282 template <bool IsOneWG, typename... Reductions, int Dims, typename... LocalAccT,
2283  typename... OutAccT, typename... ReducerT, typename... BOPsT,
2284  size_t... Is>
2285 void reduCGFuncImplArray(
2286  nd_item<Dims> NDIt, ReduTupleT<LocalAccT...> LocalAccsTuple,
2287  ReduTupleT<OutAccT...> OutAccsTuple, std::tuple<ReducerT...> &ReducersTuple,
2288  ReduTupleT<BOPsT...> BOPsTuple,
2289  std::array<bool, sizeof...(Reductions)> InitToIdentityProps,
2290  std::index_sequence<Is...>) {
2291  using ReductionPack = std::tuple<Reductions...>;
2292  (reduCGFuncImplArrayHelper<IsOneWG, std::tuple_element_t<Is, ReductionPack>>(
2293  NDIt, std::get<Is>(LocalAccsTuple), std::get<Is>(OutAccsTuple),
2294  std::get<Is>(ReducersTuple), std::get<Is>(BOPsTuple),
2295  InitToIdentityProps[Is]),
2296  ...);
2297 }
2298 
2299 namespace reduction::main_krn {
2300 template <class KernelName, class Accessor> struct NDRangeMulti;
2301 } // namespace reduction::main_krn
2302 template <typename KernelName, typename KernelType, int Dims,
2303  typename PropertiesT, typename... Reductions, size_t... Is>
2304 void reduCGFuncMulti(handler &CGH, KernelType KernelFunc,
2305  const nd_range<Dims> &Range, PropertiesT Properties,
2306  std::tuple<Reductions...> &ReduTuple,
2307  std::index_sequence<Is...> ReduIndices) {
2308  size_t WGSize = Range.get_local_range().size();
2309 
2310  // Split reduction sequence into two:
2311  // 1) Scalar reductions
2312  // 2) Array reductions
2313  // This allows us to reuse the existing implementation for scalar reductions
2314  // and introduce a new implementation for array reductions. Longer term it
2315  // may make sense to generalize the code such that each phase below applies
2316  // to all available reduction implementations -- today all reduction classes
2317  // use the same privatization-based approach, so this is unnecessary.
2318  IsScalarReduction ScalarPredicate;
2319  auto ScalarIs = filterSequence<Reductions...>(ScalarPredicate, ReduIndices);
2320 
2321  IsArrayReduction ArrayPredicate;
2322  auto ArrayIs = filterSequence<Reductions...>(ArrayPredicate, ReduIndices);
2323 
2324  auto LocalAccsTuple = makeReduTupleT(
2325  local_accessor<typename Reductions::reducer_element_type, 1>{WGSize,
2326  CGH}...);
2327 
2328  // The type of the Out "accessor" differs between scenarios when there is just
2329  // one WorkGroup and when there are multiple. Use this lambda to write the
2330  // code just once.
2331  auto Rest = [&](auto KernelTag, auto OutAccsTuple) {
2332  auto IdentitiesTuple =
2333  makeReduTupleT(std::get<Is>(ReduTuple).getIdentityContainer()...);
2334  auto BOPsTuple =
2335  makeReduTupleT(std::get<Is>(ReduTuple).getBinaryOperation()...);
2336  std::array InitToIdentityProps{
2337  std::get<Is>(ReduTuple).initializeToIdentity()...};
2338 
2339  using Name = __sycl_reduction_kernel<reduction::MainKrn, KernelName,
2340  reduction::strategy::multi,
2341  decltype(KernelTag)>;
2342 
2343  CGH.parallel_for<Name>(Range, Properties, [=](nd_item<Dims> NDIt) {
2344  // We can deduce IsOneWG from the tag type.
2345  constexpr bool IsOneWG =
2346  std::is_same_v<std::remove_reference_t<decltype(KernelTag)>,
2347  KernelOneWGTag>;
2348 
2349  // Pass all reductions to user's lambda in the same order as supplied
2350  // Each reducer initializes its own storage
2351  auto ReduIndices = std::index_sequence_for<Reductions...>();
2352  auto ReducerTokensTuple =
2353  std::tuple{typename Reductions::reducer_token_type{
2354  std::get<Is>(IdentitiesTuple), std::get<Is>(BOPsTuple)}...};
2355  auto ReducersTuple = std::tuple<typename Reductions::reducer_type...>{
2356  std::get<Is>(ReducerTokensTuple)...};
2357  std::apply([&](auto &...Reducers) { KernelFunc(NDIt, Reducers...); },
2358  ReducersTuple);
2359 
2360  // Combine and write-back the results of any scalar reductions
2361  // reduCGFuncImplScalar<Reductions...>(NDIt, LocalAccsTuple, OutAccsTuple,
2362  // ReducersTuple, IdentitiesTuple, BOPsTuple, InitToIdentityProps,
2363  // ReduIndices);
2364  reduCGFuncImplScalar<IsOneWG, Reductions...>(
2365  NDIt, LocalAccsTuple, OutAccsTuple, ReducersTuple, IdentitiesTuple,
2366  BOPsTuple, InitToIdentityProps, ScalarIs);
2367 
2368  // Combine and write-back the results of any array reductions
2369  // These are handled separately to minimize temporary storage and account
2370  // for the fact that each array reduction may have a different number of
2371  // elements to reduce (i.e. a different extent).
2372  reduCGFuncImplArray<IsOneWG, Reductions...>(
2373  NDIt, LocalAccsTuple, OutAccsTuple, ReducersTuple, BOPsTuple,
2374  InitToIdentityProps, ArrayIs);
2375  });
2376  };
2377 
2378  size_t NWorkGroups = Range.get_group_range().size();
2379  if (NWorkGroups == 1)
2380  Rest(KernelOneWGTag{},
2381  createReduOutAccs<true>(NWorkGroups, CGH, ReduTuple, ReduIndices));
2382  else
2383  Rest(KernelMultipleWGTag{},
2384  createReduOutAccs<false>(NWorkGroups, CGH, ReduTuple, ReduIndices));
2385 }
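// Illustrative usage (editor's sketch) of the multi-reduction path generated
// above: one parallel_for carrying two reductions. Names `Q`, `Sum` and `Mx`
// are hypothetical; assumes <sycl/sycl.hpp>.
//
//   sycl::queue Q;
//   int *Sum = sycl::malloc_shared<int>(1, Q);
//   float *Mx = sycl::malloc_shared<float>(1, Q);
//   *Sum = 0; *Mx = 0.0f;
//   Q.parallel_for(sycl::nd_range<1>{1024, 128},
//                  sycl::reduction(Sum, sycl::plus<int>()),
//                  sycl::reduction(Mx, sycl::maximum<float>()),
//                  [=](sycl::nd_item<1> It, auto &S, auto &M) {
//                    S += 1;
//                    M.combine(static_cast<float>(It.get_global_linear_id()));
//                  }).wait();
//   // *Sum == 1024, *Mx == 1023.0f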
2386 
2387 // TODO: Is this still needed?
2388 template <typename... Reductions, size_t... Is>
2389 void associateReduAccsWithHandler(handler &CGH,
2390  std::tuple<Reductions...> &ReduTuple,
2391  std::index_sequence<Is...>) {
2392  auto ProcessOne = [&CGH](auto Redu) {
2393  if constexpr (!decltype(Redu)::is_usm)
2394  Redu.getUserRedVarAccess(CGH);
2395  };
2396  (ProcessOne(std::get<Is>(ReduTuple)), ...);
2397 }
2398 
2399 /// All scalar reductions are processed together; there is one loop of log2(N)
2400 /// steps, and each reduction uses its own storage.
2401 template <bool IsOneWG, typename... Reductions, int Dims, typename... LocalAccT,
2402  typename... InAccT, typename... OutAccT, typename... Ts,
2403  typename... BOPsT, size_t... Is>
2404 void reduAuxCGFuncImplScalar(
2405  nd_item<Dims> NDIt, size_t LID, size_t GID, size_t RemainingWorkSize,
2406  ReduTupleT<LocalAccT...> LocalAccsTuple, ReduTupleT<InAccT...> InAccsTuple,
2407  ReduTupleT<OutAccT...> OutAccsTuple, ReduTupleT<Ts...> IdentitiesTuple,
2408  ReduTupleT<BOPsT...> BOPsTuple,
2409  std::array<bool, sizeof...(Reductions)> InitToIdentityProps,
2410  std::index_sequence<Is...> ReduIndices) {
2411  // The last work-group may have less work than the rest, so we only need to
2412  // read the values of the elements that still have work left.
2413  if (LID < RemainingWorkSize)
2414  ((std::get<Is>(LocalAccsTuple)[LID] = std::get<Is>(InAccsTuple)[GID]), ...);
2415 
2416  // We apply tree-reduction on reducer elements so we adjust the operations
2417  // to combine these.
2418  auto AdjustedBOPsTuple = makeAdjustedBOPs<Reductions...>(BOPsTuple);
2419 
2420  doTreeReductionOnTuple(RemainingWorkSize, LID, LocalAccsTuple,
2421  AdjustedBOPsTuple, ReduIndices);
2422 
2423  // Compute the partial sum/reduction for the work-group.
2424  if (LID == 0) {
2425  size_t GrID = NDIt.get_group_linear_id();
2426  writeReduSumsToOutAccs<IsOneWG, Reductions...>(
2427  GrID, OutAccsTuple, LocalAccsTuple, AdjustedBOPsTuple, IdentitiesTuple,
2428  InitToIdentityProps, ReduIndices);
2429  }
2430 }
2431 
2432 template <bool IsOneWG, typename Reduction, int Dims, typename LocalAccT,
2433  typename InAccT, typename OutAccT, typename T, typename BOPT>
2434 void reduAuxCGFuncImplArrayHelper(nd_item<Dims> NDIt, size_t LID, size_t GID,
2435  size_t RemainingWorkSize, LocalAccT LocalReds,
2436  InAccT In, OutAccT Out, T IdentityContainer,
2437  BOPT BOp, bool IsInitializeToIdentity) {
2438  using element_type = typename Reduction::reducer_element_type;
2439  auto ElementCombiner = [&](element_type &LHS, const element_type &RHS) {
2440  return LHS.combine(BOp, RHS);
2441  };
2442 
2443  // If there are multiple values, reduce each separately
2444  // This prevents local memory from scaling with elements
2445  auto NElements = Reduction::num_elements;
2446  for (size_t E = 0; E < NElements; ++E) {
2447  doTreeReduction<WorkSizeGuarantees::LessOrEqual>(
2448  RemainingWorkSize, NDIt, LocalReds, ElementCombiner,
2449  [&](size_t) { return In[GID * NElements + E]; });
2450 
2451  // Add the initial value of user's variable to the final result.
2452  if (LID == 0) {
2453  size_t GrID = NDIt.get_group_linear_id();
2454  size_t OutIdx = GrID * NElements + E;
2455  if constexpr (IsOneWG) {
2456  // If there is only a single work-group, the output will be an actual
2457  // value rather than a potentially optional value.
2458  if constexpr (Reduction::has_identity) {
2459  Out[OutIdx] = *ElementCombiner(LocalReds[0],
2460  IsInitializeToIdentity
2461  ? IdentityContainer.getIdentity()
2462  : Out[E]);
2463  } else {
2464  Out[OutIdx] = *LocalReds[0];
2465  }
2466  } else {
2467  // Otherwise we propagate a potentially optional value.
2468  Out[OutIdx] = LocalReds[0];
2469  }
2470  }
2471 
2472  // Ensure item 0 is finished with LocalReds before next iteration
2473  if (E != NElements - 1) {
2474  NDIt.barrier();
2475  }
2476  }
2477 }
2478 
2479 template <bool IsOneWG, typename... Reductions, int Dims, typename... LocalAccT,
2480  typename... InAccT, typename... OutAccT, typename... Ts,
2481  typename... BOPsT, size_t... Is>
2482 void reduAuxCGFuncImplArray(
2483  nd_item<Dims> NDIt, size_t LID, size_t GID, size_t RemainingWorkSize,
2484  ReduTupleT<LocalAccT...> LocalAccsTuple, ReduTupleT<InAccT...> InAccsTuple,
2485  ReduTupleT<OutAccT...> OutAccsTuple, ReduTupleT<Ts...> IdentitiesTuple,
2486  ReduTupleT<BOPsT...> BOPsTuple,
2487  std::array<bool, sizeof...(Reductions)> InitToIdentityProps,
2488  std::index_sequence<Is...>) {
2489  using ReductionPack = std::tuple<Reductions...>;
2490  (reduAuxCGFuncImplArrayHelper<IsOneWG,
2491  std::tuple_element_t<Is, ReductionPack>>(
2492  NDIt, LID, GID, RemainingWorkSize, std::get<Is>(LocalAccsTuple),
2493  std::get<Is>(InAccsTuple), std::get<Is>(OutAccsTuple),
2494  std::get<Is>(IdentitiesTuple), std::get<Is>(BOPsTuple),
2495  InitToIdentityProps[Is]),
2496  ...);
2497 }
2498 
2499 namespace reduction::aux_krn {
2500 template <class KernelName, class Predicate> struct Multi;
2501 } // namespace reduction::aux_krn
2502 template <typename KernelName, typename KernelType, typename... Reductions,
2503  size_t... Is>
2504 size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
2505  std::tuple<Reductions...> &ReduTuple,
2506  std::index_sequence<Is...> ReduIndices) {
2507  size_t NWorkGroups;
2508  size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups);
2509 
2510  bool Pow2WG = (WGSize & (WGSize - 1)) == 0;
2511  bool HasUniformWG = Pow2WG && (NWorkGroups * WGSize == NWorkItems);
2512 
2513  // Like reduCGFuncImpl, we also have to split out scalar and array reductions
2514  IsScalarReduction ScalarPredicate;
2515  auto ScalarIs = filterSequence<Reductions...>(ScalarPredicate, ReduIndices);
2516 
2517  IsArrayReduction ArrayPredicate;
2518  auto ArrayIs = filterSequence<Reductions...>(ArrayPredicate, ReduIndices);
2519 
2520  auto LocalAccsTuple = makeReduTupleT(
2521  local_accessor<typename Reductions::reducer_element_type, 1>{WGSize,
2522  CGH}...);
2523  auto InAccsTuple = makeReduTupleT(
2524  std::get<Is>(ReduTuple).getReadAccToPreviousPartialReds(CGH)...);
2525 
2526  auto IdentitiesTuple =
2527  makeReduTupleT(std::get<Is>(ReduTuple).getIdentityContainer()...);
2528  auto BOPsTuple =
2529  makeReduTupleT(std::get<Is>(ReduTuple).getBinaryOperation()...);
2530  std::array InitToIdentityProps{
2531  std::get<Is>(ReduTuple).initializeToIdentity()...};
2532 
2533  // Predicate/OutAccsTuple below have different types depending on whether
2534  // there is a single WG or multiple WGs. Use this lambda to avoid code
2535  // duplication.
2536  auto Rest = [&](auto Predicate, auto OutAccsTuple) {
2537  auto AccReduIndices = filterSequence<Reductions...>(Predicate, ReduIndices);
2538  associateReduAccsWithHandler(CGH, ReduTuple, AccReduIndices);
2539  using Name = __sycl_reduction_kernel<reduction::AuxKrn, KernelName,
2540  reduction::strategy::multi,
2541  decltype(Predicate)>;
2542  // TODO: Opportunity to parallelize across number of elements
2543  range<1> GlobalRange = {HasUniformWG ? NWorkItems : NWorkGroups * WGSize};
2544  nd_range<1> Range{GlobalRange, range<1>(WGSize)};
2545  CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
2546  // We can deduce IsOneWG from the predicate type.
2547  constexpr bool IsOneWG =
2548  std::is_same_v<std::remove_reference_t<decltype(Predicate)>,
2549  IsNonUsmReductionPredicate>;
2550 
2550 
2551  size_t WGSize = NDIt.get_local_range().size();
2552  size_t RemainingWorkSize =
2553  sycl::min(WGSize, NWorkItems - WGSize * NDIt.get_group_linear_id());
2554  size_t LID = NDIt.get_local_linear_id();
2555  size_t GID = NDIt.get_global_linear_id();
2556 
2557  // Handle scalar and array reductions
2558  reduAuxCGFuncImplScalar<IsOneWG, Reductions...>(
2559  NDIt, LID, GID, RemainingWorkSize, LocalAccsTuple, InAccsTuple,
2560  OutAccsTuple, IdentitiesTuple, BOPsTuple, InitToIdentityProps,
2561  ScalarIs);
2562  reduAuxCGFuncImplArray<IsOneWG, Reductions...>(
2563  NDIt, LID, GID, RemainingWorkSize, LocalAccsTuple, InAccsTuple,
2564  OutAccsTuple, IdentitiesTuple, BOPsTuple, InitToIdentityProps,
2565  ArrayIs);
2566  });
2567  };
2568  if (NWorkGroups == 1)
2569  Rest(IsNonUsmReductionPredicate{},
2570  createReduOutAccs<true>(NWorkGroups, CGH, ReduTuple, ReduIndices));
2571  else
2572  Rest(EmptyReductionPredicate{},
2573  createReduOutAccs<false>(NWorkGroups, CGH, ReduTuple, ReduIndices));
2574 
2575  return NWorkGroups;
2576 }
2577 
2578 template <typename Reduction> size_t reduGetMemPerWorkItemHelper(Reduction &) {
2579  return sizeof(typename Reduction::result_type);
2580 }
2581 
2582 template <typename Reduction, typename... RestT>
2583 size_t reduGetMemPerWorkItemHelper(Reduction &, RestT... Rest) {
2584  return sizeof(typename Reduction::result_type) +
2585  reduGetMemPerWorkItemHelper(Rest...);
2586 }
2587 
2588 template <typename... ReductionT, size_t... Is>
2589 size_t reduGetMemPerWorkItem(std::tuple<ReductionT...> &ReduTuple,
2590  std::index_sequence<Is...>) {
2591  return reduGetMemPerWorkItemHelper(std::get<Is>(ReduTuple)...);
2592 }
2593 
2594 /// Creates a new tuple consisting of the elements of the given tuple that
2595 /// are selected by the given index sequence.
2596 template <typename TupleT, std::size_t... Is>
2597 std::tuple<std::tuple_element_t<Is, TupleT>...>
2598 tuple_select_elements(TupleT Tuple, std::index_sequence<Is...>) {
2599  return {std::get<Is>(std::move(Tuple))...};
2600 }
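// Example (editor's annotation): selecting tuple elements by index.
//
//   std::tuple<int, char, double> T{1, 'x', 2.5};
//   auto Sub = tuple_select_elements(T, std::index_sequence<0, 2>{});
//   static_assert(std::is_same_v<decltype(Sub), std::tuple<int, double>>);
//   assert(Sub == std::tuple<int, double>(1, 2.5));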
2601 
2602 template <> struct NDRangeReduction<reduction::strategy::multi> {
2603  template <typename KernelName, int Dims, typename PropertiesT,
2604  typename... RestT>
2605  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
2606  nd_range<Dims> NDRange, PropertiesT &Properties,
2607  RestT... Rest) {
2608  std::tuple<RestT...> ArgsTuple(Rest...);
2609  constexpr size_t NumArgs = sizeof...(RestT);
2610  auto KernelFunc = std::get<NumArgs - 1>(ArgsTuple);
2611  auto ReduIndices = std::make_index_sequence<NumArgs - 1>();
2612  auto ReduTuple = detail::tuple_select_elements(ArgsTuple, ReduIndices);
2613 
2614  size_t LocalMemPerWorkItem = reduGetMemPerWorkItem(ReduTuple, ReduIndices);
2615  // TODO: currently the maximal work group size is determined for the given
2616  // queue/device, while it is safer to use queries to the kernel compiled
2617  // for the device.
2618  size_t MaxWGSize = reduGetMaxWGSize(Queue, LocalMemPerWorkItem);
2619  if (NDRange.get_local_range().size() > MaxWGSize)
2620  throw sycl::exception(make_error_code(errc::nd_range),
2621  "The implementation handling parallel_for with"
2622  " reduction requires work group size not bigger"
2623  " than " +
2624  std::to_string(MaxWGSize));
2625 
2626  reduCGFuncMulti<KernelName>(CGH, KernelFunc, NDRange, Properties, ReduTuple,
2627  ReduIndices);
2628  reduction::finalizeHandler(CGH);
2629 
2630  size_t NWorkItems = NDRange.get_group_range().size();
2631  while (NWorkItems > 1) {
2632  reduction::withAuxHandler(CGH, [&](handler &AuxHandler) {
2633  NWorkItems = reduAuxCGFunc<KernelName, decltype(KernelFunc)>(
2634  AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
2635  });
2636  } // end while (NWorkItems > 1)
2637  }
2638 };
2639 
2640 // Auto-dispatch. Must be the last one.
2641 template <> struct NDRangeReduction<reduction::strategy::auto_select> {
2642  // Some readability aliases, to increase signal/noise ratio below.
2643  template <reduction::strategy Strategy>
2644  using Impl = NDRangeReduction<Strategy>;
2645  using Strat = reduction::strategy;
2646 
2647  template <typename KernelName, int Dims, typename PropertiesT,
2648  typename KernelType, typename Reduction>
2649  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
2650  nd_range<Dims> NDRange, PropertiesT &Properties,
2651  Reduction &Redu, KernelType &KernelFunc) {
2652  auto Delegate = [&](auto Impl) {
2653  Impl.template run<KernelName>(CGH, Queue, NDRange, Properties, Redu,
2654  KernelFunc);
2655  };
2656 
2657  if constexpr (Reduction::has_float64_atomics) {
2658  if (getDeviceFromHandler(CGH).has(aspect::atomic64))
2659  return Delegate(Impl<Strat::group_reduce_and_atomic_cross_wg>{});
2660 
2661  if constexpr (Reduction::has_fast_reduce)
2662  return Delegate(Impl<Strat::group_reduce_and_multiple_kernels>{});
2663  else
2664  return Delegate(Impl<Strat::basic>{});
2665  } else if constexpr (Reduction::has_fast_atomics) {
2666  if constexpr (sizeof(typename Reduction::result_type) == 8) {
2667  // Both group_reduce_and_atomic_cross_wg and
2668  // local_mem_tree_and_atomic_cross_wg implicitly require
2669  // aspect::atomic64 if the result type of the reduction is 64-bit. If
2670  // the device does not support this, we need to fall back to more
2671  // reliable strategies.
2672  if (!getDeviceFromHandler(CGH).has(aspect::atomic64)) {
2673  if constexpr (Reduction::has_fast_reduce)
2674  return Delegate(Impl<Strat::group_reduce_and_last_wg_detection>{});
2675  else
2676  return Delegate(Impl<Strat::basic>{});
2677  }
2678  }
2679 
2680  if constexpr (Reduction::has_fast_reduce) {
2681  return Delegate(Impl<Strat::group_reduce_and_atomic_cross_wg>{});
2682  } else {
2683  return Delegate(Impl<Strat::local_mem_tree_and_atomic_cross_wg>{});
2684  }
2685  } else {
2686  if constexpr (Reduction::has_fast_reduce)
2688  else
2689  return Delegate(Impl<Strat::basic>{});
2690  }
2691 
2692  assert(false && "Must be unreachable!");
2693  }
2694  template <typename KernelName, int Dims, typename PropertiesT,
2695  typename... RestT>
2696  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
2697  nd_range<Dims> NDRange, PropertiesT &Properties,
2698  RestT... Rest) {
2699  return Impl<Strat::multi>::run<KernelName>(CGH, Queue, NDRange, Properties,
2700  Rest...);
2701  }
2702 };
2703 
2704 template <typename KernelName, reduction::strategy Strategy, int Dims,
2705  typename PropertiesT, typename... RestT>
2706 void reduction_parallel_for(handler &CGH, nd_range<Dims> NDRange,
2707  PropertiesT Properties, RestT... Rest) {
2708  NDRangeReduction<Strategy>::template run<KernelName>(CGH, CGH.MQueue, NDRange,
2709  Properties, Rest...);
2710 }
2711 
2712 __SYCL_EXPORT uint32_t
2713 reduGetMaxNumConcurrentWorkGroups(std::shared_ptr<queue_impl> Queue);
2714 
2715 template <typename KernelName, reduction::strategy Strategy, int Dims,
2716  typename PropertiesT, typename... RestT>
2717 void reduction_parallel_for(handler &CGH, range<Dims> Range,
2718  PropertiesT Properties, RestT... Rest) {
2719  std::tuple<RestT...> ArgsTuple(Rest...);
2720  constexpr size_t NumArgs = sizeof...(RestT);
2721  static_assert(NumArgs > 1, "No reduction!");
2722  auto KernelFunc = std::get<NumArgs - 1>(ArgsTuple);
2723  auto ReduIndices = std::make_index_sequence<NumArgs - 1>();
2724  auto ReduTuple = detail::tuple_select_elements(ArgsTuple, ReduIndices);
2725 
2726  // Before running the kernels, check that the device has enough local memory
2727  // to hold the local arrays required for the tree-reduction algorithm.
2728  size_t OneElemSize = [&]() {
2729  // Can't use outlined NumArgs due to a bug in gcc 8.4.
2730  if constexpr (sizeof...(RestT) == 2) {
2731  using Reduction = std::tuple_element_t<0, decltype(ReduTuple)>;
2732  constexpr bool IsTreeReduction =
2733  !Reduction::has_fast_reduce && !Reduction::has_fast_atomics;
2734  return IsTreeReduction ? sizeof(typename Reduction::result_type) : 0;
2735  } else {
2736  return reduGetMemPerWorkItem(ReduTuple, ReduIndices);
2737  }
2738  }();
2739 
2740  uint32_t NumConcurrentWorkGroups =
2741 #ifdef __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS
2742  __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS;
2743 #else
2744  reduGetMaxNumConcurrentWorkGroups(CGH.MQueue);
2745 #endif
2746 
2747  // TODO: currently the preferred work group size is determined for the given
2748  // queue/device, while it is safer to use queries to the kernel pre-compiled
2749  // for the device.
2750  size_t PrefWGSize = reduGetPreferredWGSize(CGH.MQueue, OneElemSize);
2751 
2752  size_t NWorkItems = Range.size();
2753  size_t WGSize = std::min(NWorkItems, PrefWGSize);
2754  size_t NWorkGroups = NWorkItems / WGSize;
2755  if (NWorkItems % WGSize)
2756  NWorkGroups++;
2757  size_t MaxNWorkGroups = NumConcurrentWorkGroups;
2758  NWorkGroups = std::min(NWorkGroups, MaxNWorkGroups);
2759  size_t NDRItems = NWorkGroups * WGSize;
2760  nd_range<1> NDRange{range<1>{NDRItems}, range<1>{WGSize}};
2761 
2762  size_t PerGroup = Range.size() / NWorkGroups;
2763  // Iterate through the index space by assigning contiguous chunks to each
2764  // work-group, then iterating through each chunk using a stride equal to the
2765  // work-group's local range, which gives much better performance than using
2766  // stride equal to 1. For each index in the chunk the original KernelFunc
2767  // is called and the reduction value held in \p Reducer is accumulated
2768  // across those calls.
2769  auto UpdatedKernelFunc = [=](auto NDId, auto &...Reducers) {
2770  // Divide into contiguous chunks and assign each chunk to a Group
2771  // Rely on precomputed division to avoid repeating expensive operations
2772  // TODO: Some devices may prefer alternative remainder handling
2773  auto Group = NDId.get_group();
2774  size_t GroupId = Group.get_group_linear_id();
2775  size_t NumGroups = Group.get_group_linear_range();
2776  bool LastGroup = (GroupId == NumGroups - 1);
2777  size_t GroupStart = GroupId * PerGroup;
2778  size_t GroupEnd = LastGroup ? Range.size() : (GroupStart + PerGroup);
2779 
2780  // Loop over the contiguous chunk
2781  size_t Start = GroupStart + NDId.get_local_id(0);
2782  size_t End = GroupEnd;
2783  size_t Stride = NDId.get_local_range(0);
2784  auto GetDelinearized = [&](size_t I) {
2785  auto Id = getDelinearizedId(Range, I);
2786  if constexpr (std::is_invocable_v<decltype(KernelFunc), id<Dims>,
2787  decltype(Reducers)...>)
2788  return Id;
2789  else
2790  // SYCL doesn't provide parallel_for accepting offset in presence of
2791  // reductions, so use with_offset==false.
2792  return reduction::getDelinearizedItem(Range, Id);
2793  };
2794  for (size_t I = Start; I < End; I += Stride)
2795  KernelFunc(GetDelinearized(I), Reducers...);
2796  };
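  // Worked example (editor's annotation, illustrative numbers): for
  // Range.size() == 1000 with NWorkGroups == 4 and a local range of 128,
  // PerGroup == 250, so group 0 covers indices [0, 250), group 1 [250, 500),
  // group 2 [500, 750) and the last group [750, 1000). Each work-item starts
  // at GroupStart + its local id and advances by Stride == 128.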
2797  if constexpr (NumArgs == 2) {
2798  using Reduction = std::tuple_element_t<0, decltype(ReduTuple)>;
2799  auto &Redu = std::get<0>(ReduTuple);
2800 
2801  constexpr auto StrategyToUse = [&]() {
2802  if constexpr (Strategy != reduction::strategy::auto_select)
2803  return Strategy;
2804 
2805  // TODO: Both group_reduce_and_last_wg_detection and range_basic require
2806  // memory_order::acq_rel support that isn't guaranteed by the
2807  // specification. However, implementing run-time check for that would
2808  // result in an extra kernel compilation(s). We probably need to
2809  // investigate if the usage of kernel_bundles can mitigate that.
2810  // TODO: local_atomic_and_atomic_cross_wg uses atomics on the partial
2811  // results, which may add an implicit requirement on aspect::atomic64. As
2812  // a temporary work-around we do not pick this if the result type is
2813  // 64-bit. In the future this selection should be done at runtime based
2814  // on the device.
2815  // Note: Identityless reductions cannot use group reductions.
2816  if constexpr (Reduction::has_fast_reduce && Reduction::has_identity)
2817  return reduction::strategy::group_reduce_and_last_wg_detection;
2818  else if constexpr (Reduction::has_fast_atomics &&
2819  sizeof(typename Reduction::result_type) != 8)
2820  return reduction::strategy::local_atomic_and_atomic_cross_wg;
2821  else
2822  return reduction::strategy::range_basic;
2823  }();
2824 
2825  reduction_parallel_for<KernelName, StrategyToUse>(CGH, NDRange, Properties,
2826  Redu, UpdatedKernelFunc);
2827  } else {
2828  return std::apply(
2829  [&](auto &...Reds) {
2830  return reduction_parallel_for<KernelName, Strategy>(
2831  CGH, NDRange, Properties, Reds..., UpdatedKernelFunc);
2832  },
2833  ReduTuple);
2834  }
2835 }
2836 } // namespace detail
2837 
2838 /// Constructs a reduction object using the given buffer \p Var, handler \p CGH,
2839 /// reduction operation \p Combiner, and optional reduction properties.
2840 /// The reduction algorithm may be less efficient for this variant as the
2841 /// reduction identity is not known statically and it is not provided by user.
2842 template <typename T, typename AllocatorT, typename BinaryOperation>
2843 auto reduction(buffer<T, 1, AllocatorT> Var, handler &CGH,
2844  BinaryOperation Combiner, const property_list &PropList = {}) {
2845  std::ignore = CGH;
2846  bool InitializeToIdentity =
2847  PropList.has_property<property::reduction::initialize_to_identity>();
2848  return detail::make_reduction<BinaryOperation, 0, 1, false>(
2849  Var, Combiner, InitializeToIdentity);
2850 }
2851 
2852 /// Constructs a reduction object using the reduction variable referenced by
2853 /// the given USM pointer \p Var, handler \p CGH, reduction operation
2854 /// \p Combiner, and optional reduction properties.
2855 /// The reduction algorithm may be less efficient for this variant as the
2856 /// reduction identity is not known statically and it is not provided by user.
2857 template <typename T, typename BinaryOperation>
2858 auto reduction(T *Var, BinaryOperation Combiner,
2859  const property_list &PropList = {}) {
2860  bool InitializeToIdentity =
2861  PropList.has_property<property::reduction::initialize_to_identity>();
2862  return detail::make_reduction<BinaryOperation, 0, 1, false>(
2863  Var, Combiner, InitializeToIdentity);
2864 }
2865 
2866 /// Constructs a reduction object using the given buffer \p Var, handler \p CGH,
2867 /// reduction identity value \p Identity, reduction operation \p Combiner, and
2868 /// optional reduction properties.
2869 template <typename T, typename AllocatorT, typename BinaryOperation>
2870 auto reduction(buffer<T, 1, AllocatorT> Var, handler &CGH, const T &Identity,
2871  BinaryOperation Combiner, const property_list &PropList = {}) {
2872  std::ignore = CGH;
2873  bool InitializeToIdentity =
2874  PropList.has_property<property::reduction::initialize_to_identity>();
2875  return detail::make_reduction<BinaryOperation, 0, 1, true>(
2876  Var, Identity, Combiner, InitializeToIdentity);
2877 }
2878 
2879 /// Constructs a reduction object using the reduction variable referenced by
2880 /// the given USM pointer \p Var, reduction identity value \p Identity,
2881 /// reduction operation \p Combiner, and optional reduction properties.
2882 template <typename T, typename BinaryOperation>
2883 auto reduction(T *Var, const T &Identity, BinaryOperation Combiner,
2884  const property_list &PropList = {}) {
2885  bool InitializeToIdentity =
2886  PropList.has_property<property::reduction::initialize_to_identity>();
2887  return detail::make_reduction<BinaryOperation, 0, 1, true>(
2888  Var, Identity, Combiner, InitializeToIdentity);
2889 }
2890 
2891 /// Constructs a reduction object using the reduction variable referenced by
2892 /// the given sycl::span \p Span, reduction operation \p Combiner, and
2893 /// optional reduction properties.
2894 /// The reduction algorithm may be less efficient for this variant as the
2895 /// reduction identity is not known statically and it is not provided by user.
2896 template <typename T, size_t Extent, typename BinaryOperation,
2897  typename = std::enable_if_t<Extent != dynamic_extent>>
2898 auto reduction(span<T, Extent> Span, BinaryOperation Combiner,
2899  const property_list &PropList = {}) {
2900  bool InitializeToIdentity =
2901  PropList.has_property<property::reduction::initialize_to_identity>();
2902  return detail::make_reduction<BinaryOperation, 1, Extent, false>(
2903  Span.data(), Combiner, InitializeToIdentity);
2904 }
2905 
2906 /// Constructs a reduction object using the reduction variable referenced by
2907 /// the given sycl::span \p Span, reduction identity value \p Identity,
2908 /// reduction operation \p Combiner, and optional reduction properties.
2909 template <typename T, size_t Extent, typename BinaryOperation,
2910  typename = std::enable_if_t<Extent != dynamic_extent>>
2911 auto reduction(span<T, Extent> Span, const T &Identity,
2912  BinaryOperation Combiner, const property_list &PropList = {}) {
2913  bool InitializeToIdentity =
2914  PropList.has_property<property::reduction::initialize_to_identity>();
2915  return detail::make_reduction<BinaryOperation, 1, Extent, true>(
2916  Span.data(), Identity, Combiner, InitializeToIdentity);
2917 }
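// Illustrative usage (editor's sketch) of a span reduction: each of the
// Extent elements is reduced independently. Names `Q` and `Hist` are
// hypothetical; assumes <sycl/sycl.hpp> and <algorithm>.
//
//   constexpr size_t N = 4;
//   sycl::queue Q;
//   int *Hist = sycl::malloc_shared<int>(N, Q);
//   std::fill(Hist, Hist + N, 0);
//   Q.parallel_for(sycl::range<1>{1024},
//                  sycl::reduction(sycl::span<int, N>(Hist, N),
//                                  sycl::plus<int>()),
//                  [=](sycl::id<1> I, auto &H) {
//                    H[I[0] % N] += 1; // subscript yields a per-element reducer
//                  }).wait();
//   // Hist[k] == 256 for each k in [0, N)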
2918 } // namespace _V1
2919 } // namespace sycl
The file contains implementations of accessor class.
Defines a shared array that can be used by kernels in queues.
Definition: buffer.hpp:173
Helper class for accessing internal reducer member functions.
Definition: reduction.hpp:178
static constexpr auto getIdentityStatic()
Definition: reduction.hpp:196
ReducerAccess(ReducerT &ReducerRef)
Definition: reduction.hpp:180
constexpr T & operator*() noexcept
Definition: reduction.hpp:485
ReducerElement & combine(BinaryOperation BinOp, const T &OtherValue)
Definition: reduction.hpp:466
ReducerElement & combine(BinaryOperation BinOp, const ReducerElement &Other)
Definition: reduction.hpp:474
ReducerElement(const ReductionIdentityContainer< T, BinaryOperation, ExplicitIdentity > &IdentityContainer)
Definition: reduction.hpp:461
constexpr const T & operator*() const noexcept
Definition: reduction.hpp:491
Templated class for common functionality of all reduction implementation classes.
Definition: reduction.hpp:386
This class is the default KernelName template parameter type for kernel invocation APIs such as singl...
Definition: kernel.hpp:44
Use CRTP to avoid redefining shorthand operators in terms of combine.
Definition: reduction.hpp:225
std::enable_if_t< BasicCheck< _T, Space, _BinaryOperation > &&(IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value||IsReduOptForAtomic64Op< _T, _BinaryOperation >::value) &&IsMaximum< _T, _BinaryOperation >::value > atomic_combine(_T *ReduVarPtr) const
Atomic MAX operation: *ReduVarPtr = sycl::maximum(*ReduVarPtr, MValue);.
Definition: reduction.hpp:376
std::enable_if_t< BasicCheck< _T, Space, _BinaryOperation > &&(IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value||IsReduOptForAtomic64Op< _T, _BinaryOperation >::value) &&IsPlus< _T, _BinaryOperation >::value > atomic_combine(_T *ReduVarPtr) const
Atomic ADD operation: *ReduVarPtr += MValue;.
Definition: reduction.hpp:317
std::enable_if_t< BasicCheck< _T, Space, _BinaryOperation > &&IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value &&IsBitOR< _T, _BinaryOperation >::value > atomic_combine(_T *ReduVarPtr) const
Atomic BITWISE OR operation: *ReduVarPtr |= MValue;.
Definition: reduction.hpp:328
std::enable_if_t< std::is_same_v< remove_decoration_t< _T >, _T > &&IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value &&IsBitAND< _T, _BinaryOperation >::value &&(Space==access::address_space::global_space||Space==access::address_space::local_space)> atomic_combine(_T *ReduVarPtr) const
Atomic BITWISE AND operation: *ReduVarPtr &= MValue;.
Definition: reduction.hpp:352
std::enable_if_t< BasicCheck< _T, Space, _BinaryOperation > &&(IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value||IsReduOptForAtomic64Op< _T, _BinaryOperation >::value) &&IsMinimum< _T, _BinaryOperation >::value > atomic_combine(_T *ReduVarPtr) const
Atomic MIN operation: *ReduVarPtr = sycl::minimum(*ReduVarPtr, MValue);.
Definition: reduction.hpp:364
std::enable_if_t< BasicCheck< _T, Space, _BinaryOperation > &&IsReduOptForFastAtomicFetch< _T, _BinaryOperation >::value &&IsBitXOR< _T, _BinaryOperation >::value > atomic_combine(_T *ReduVarPtr) const
Atomic BITWISE XOR operation: *ReduVarPtr ^= MValue;.
Definition: reduction.hpp:339
static constexpr int dimensions
Definition: reduction.hpp:525
const identity_container_type & getIdentityContainer()
Definition: reduction.hpp:1051
auto & getTempBuffer(size_t Size, handler &CGH)
Definition: reduction.hpp:934
BinaryOperation getBinaryOperation() const
Returns the binary operation associated with the reduction.
Definition: reduction.hpp:1082
static constexpr bool has_fast_reduce
Definition: reduction.hpp:884
typename ReducerTraits< reducer_type >::element_type reducer_element_type
Definition: reduction.hpp:875
std::enable_if_t<!HasIdentity > withInitializedMem(handler &CGH, FuncTy Func)
Definition: reduction.hpp:1044
static constexpr bool has_fast_atomics
Definition: reduction.hpp:882
auto getWriteMemForPartialReds(size_t Size, handler &CGH)
Definition: reduction.hpp:921
static constexpr bool has_float64_atomics
Definition: reduction.hpp:880
static constexpr size_t num_elements
Definition: reduction.hpp:889
std::enable_if_t< HasIdentity > withInitializedMem(handler &CGH, FuncTy Func)
Provide Func with a properly initialized memory to write the reduction result to.
Definition: reduction.hpp:979
static constexpr bool is_known_identity
Definition: reduction.hpp:864
reduction_impl_algo(BinaryOperation BinaryOp, bool Init, RedOutVar RedOut, std::enable_if_t<!IsKnownIdentityOp< RelayT, RelayBinaryOperation >::value, int >=0)
Definition: reduction.hpp:908
reduction_impl_algo(const T &Identity, BinaryOperation BinaryOp, bool Init, RedOutVar RedOut)
Definition: reduction.hpp:891
reduction_impl_algo(BinaryOperation BinaryOp, bool Init, RedOutVar RedOut, std::enable_if_t< IsKnownIdentityOp< RelayT, RelayBinaryOperation >::value, int >=0)
Definition: reduction.hpp:898
auto getWriteAccForPartialReds(size_t Size, handler &CGH)
Returns an accessor accessing the memory that will hold the reduction partial sums.
Definition: reduction.hpp:946
auto getReadAccToPreviousPartialReds(handler &CGH) const
Definition: reduction.hpp:915
static constexpr bool has_identity
Definition: reduction.hpp:866
accessor< int, 1, access::mode::read_write, access::target::device, access::placeholder::false_t > getReadWriteAccessorToInitializedGroupsCounter(handler &CGH)
Definition: reduction.hpp:1057
ReductionIdentityContainer< T, BinaryOperation, ExplicitIdentity > identity_container_type
Definition: reduction.hpp:869
auto getGroupsCounterAccDiscrete(handler &CGH)
Definition: reduction.hpp:1068
Base non-template class which is a base class for all reduction implementation classes.
This class encapsulates the reduction variable/accessor, the reduction operator and an optional opera...
Definition: reduction.hpp:1115
reduction_impl(RedOutVar Var, BinaryOperation BOp, bool InitializeToIdentity=false)
Constructs reduction_impl when no identity is specified.
Definition: reduction.hpp:1133
static constexpr bool is_known_identity
Definition: reduction.hpp:864
static constexpr bool is_usm
Definition: reduction.hpp:887
reduction_impl(RedOutVar &Var, const T &Identity, BinaryOperation BOp, bool InitializeToIdentity)
Constructs reduction_impl with an explicit identity value.
Definition: reduction.hpp:1151
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Definition: device.hpp:64
detail::is_device_info_desc< Param >::return_type get_info() const
Queries this SYCL device for information requested by the template parameter param.
Definition: device.hpp:215
bool has(aspect Aspect) const __SYCL_WARN_IMAGE_ASPECT(Aspect)
Indicates if the SYCL device has the given feature.
Definition: device.cpp:207
Command group handler class.
Definition: handler.hpp:467
void depends_on(event Event)
Registers event dependencies on this command group.
Definition: handler.cpp:1529
void single_task(_KERNELFUNCPARAM(KernelFunc))
Defines and invokes a SYCL kernel function as a function object type.
Definition: handler.hpp:2053
void parallel_for(range< 1 > NumWorkItems, _KERNELFUNCPARAM(KernelFunc))
Definition: handler.hpp:2059
void copy(accessor< T_Src, Dims, AccessMode, AccessTarget, IsPlaceholder > Src, std::shared_ptr< T_Dst > Dst)
Copies the content of memory object accessed by Src into the memory pointed by Dst.
Definition: handler.hpp:2647
Provides a cross-platform math array class template that works on SYCL devices as well as in host C++...
Definition: marray.hpp:49
Identifies an instance of the function object executing at each point in an nd_range.
Definition: nd_item.hpp:48
size_t get_local_linear_id() const
Definition: nd_item.hpp:97
size_t __SYCL_ALWAYS_INLINE get_group_linear_id() const
Definition: nd_item.hpp:128
range< Dimensions > get_local_range() const
Definition: nd_item.hpp:172
size_t __SYCL_ALWAYS_INLINE get_global_linear_id() const
Definition: nd_item.hpp:66
void barrier(access::fence_space accessSpace=access::fence_space::global_and_local) const
Definition: nd_item.hpp:200
Defines the iteration domain of both the work-groups and the overall dispatch.
Definition: nd_range.hpp:22
range< Dimensions > get_local_range() const
Definition: nd_range.hpp:45
range< Dimensions > get_group_range() const
Definition: nd_range.hpp:47
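Taken together, a sketch of an nd_range dispatch using the nd_item queries above; the sizes are arbitrary, and the local accessor is only there to motivate the barrier:

#include <sycl/sycl.hpp>

int main() {
  sycl::queue Q;
  // 256 work-items overall, split into work-groups of 64.
  sycl::nd_range<1> NDR{sycl::range<1>{256}, sycl::range<1>{64}};
  Q.submit([&](sycl::handler &CGH) {
    sycl::local_accessor<int, 1> Scratch{sycl::range<1>{64}, CGH};
    CGH.parallel_for(NDR, [=](sycl::nd_item<1> It) {
      size_t LID = It.get_local_linear_id(); // position within the work-group
      Scratch[LID] = static_cast<int>(It.get_global_linear_id());
      It.barrier(sycl::access::fence_space::local_space); // group-wide sync
    });
  });
  Q.wait();
}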
Objects of the property_list class are containers for SYCL properties.
Encapsulates a single SYCL queue which schedules kernels on a SYCL device.
Definition: queue.hpp:110
device get_device() const
Definition: queue.cpp:77
size_t size() const
Definition: range.hpp:56
Class used to represent objects that are passed to the user's lambda functions, representing the u...
Definition: reduction.hpp:78
constexpr _SYCL_SPAN_INLINE_VISIBILITY pointer data() const noexcept
Definition: sycl_span.hpp:378
constexpr tuple_element< I, tuple< Types... > >::type & get(sycl::detail::tuple< Types... > &Arg) noexcept
Definition: tuple.hpp:198
item< Dims, false > getDelinearizedItem(range< Dims > Range, id< Dims > Id)
void withAuxHandler(handler &CGH, FunctorTy Func)
Definition: reduction.hpp:1171
void finalizeHandler(handler &CGH)
Definition: reduction.hpp:1170
void writeReduSumsToOutAccs(size_t OutAccIndex, ReduTupleT< OutAccT... > OutAccs, ReduTupleT< LocalAccT... > LocalAccs, ReduTupleT< BOPsT... > BOPs, ReduTupleT< Ts... > IdentityVals, std::array< bool, sizeof...(Reductions)> IsInitializeToIdentity, std::index_sequence< Is... >)
Definition: reduction.hpp:2072
size_t reduGetMemPerWorkItem(std::tuple< ReductionT... > &ReduTuple, std::index_sequence< Is... >)
Definition: reduction.hpp:2589
device getDeviceFromHandler(handler &CommandGroupHandlerRef)
Definition: accessor.cpp:17
void doTreeReductionHelper(size_t WorkSize, size_t LID, FuncTy Func)
Definition: reduction.hpp:1400
void reduCGFuncImplScalar(nd_item< Dims > NDIt, ReduTupleT< LocalAccT... > LocalAccsTuple, ReduTupleT< OutAccT... > OutAccsTuple, std::tuple< ReducerT... > &ReducersTuple, ReduTupleT< Ts... > IdentitiesTuple, ReduTupleT< BOPsT... > BOPsTuple, std::array< bool, sizeof...(Reductions)> InitToIdentityProps, std::index_sequence< Is... > ReduIndices)
All scalar reductions are processed together; there is one loop of log2(N) steps, and each reduction ...
Definition: reduction.hpp:2203
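The log2(N)-step loop shared by these tree-reduction helpers can be sketched as follows; the function name, the power-of-two local-size assumption, and the simplifications are illustrative, not the code at the cited locations:

#include <sycl/sycl.hpp>

// Combine one value per work-item, staged in local memory, into Scratch[0].
// Assumes the work-group size is a power of two.
template <typename T, typename BinOp>
void treeReduceSketch(sycl::nd_item<1> It, sycl::local_accessor<T, 1> Scratch,
                      BinOp BOp) {
  size_t LID = It.get_local_linear_id();
  for (size_t Step = It.get_local_range(0) / 2; Step > 0; Step /= 2) {
    if (LID < Step) // half of the remaining work-items combine pairs
      Scratch[LID] = BOp(Scratch[LID], Scratch[LID + Step]);
    It.barrier(sycl::access::fence_space::local_space);
  }
  // After log2(local size) iterations, Scratch[0] holds the partial result.
}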
uint32_t reduGetMaxNumConcurrentWorkGroups(std::shared_ptr< queue_impl > Queue)
size_t reduGetMaxWGSize(std::shared_ptr< queue_impl > Queue, size_t LocalMemBytesPerWorkItem)
void reduAuxCGFuncImplArray(nd_item< Dims > NDIt, size_t LID, size_t GID, size_t RemainingWorkSize, ReduTupleT< LocalAccT... > LocalAccsTuple, ReduTupleT< InAccT... > InAccsTuple, ReduTupleT< OutAccT... > OutAccsTuple, ReduTupleT< Ts... > IdentitiesTuple, ReduTupleT< BOPsT... > BOPsTuple, std::array< bool, sizeof...(Reductions)> InitToIdentityProps, std::index_sequence< Is... >)
Definition: reduction.hpp:2482
std::bool_constant< std::is_same_v< BinaryOperation, sycl::maximum< T > >||std::is_same_v< BinaryOperation, sycl::maximum< void > >> IsMaximum
void reduCGFuncImplArrayHelper(nd_item< Dims > NDIt, LocalAccT LocalReds, OutAccT Out, ReducerT &Reducer, BOPT BOp, bool IsInitializeToIdentity)
Each array reduction is processed separately.
Definition: reduction.hpp:2235
std::bool_constant< std::is_same_v< BinaryOperation, sycl::bit_or< T > >||std::is_same_v< BinaryOperation, sycl::bit_or< void > >> IsBitOR
std::bool_constant< IsZeroIdentityOp< T, BinaryOperation >::value||IsOneIdentityOp< T, BinaryOperation >::value||IsOnesIdentityOp< T, BinaryOperation >::value||IsMinimumIdentityOp< T, BinaryOperation >::value||IsMaximumIdentityOp< T, BinaryOperation >::value||IsFalseIdentityOp< T, BinaryOperation >::value||IsTrueIdentityOp< T, BinaryOperation >::value > IsKnownIdentityOp
id< 1 > getDelinearizedId(const range< 1 > &, size_t Index)
Definition: id.hpp:313
void reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu)
Copies the final reduction result kept in a read-write accessor to the user's USM memory.
Definition: reduction.hpp:1189
void reduAuxCGFuncImplArrayHelper(nd_item< Dims > NDIt, size_t LID, size_t GID, size_t RemainingWorkSize, LocalAccT LocalReds, InAccT In, OutAccT Out, T IdentityContainer, BOPT BOp, bool IsInitializeToIdentity)
Definition: reduction.hpp:2434
void doTreeReductionOnTuple(size_t WorkSize, size_t LID, ReduTupleT< LocalAccT... > &LocalAccs, ReduTupleT< BOPsT... > &BOPs, std::index_sequence< Is... >)
Definition: reduction.hpp:1469
void associateReduAccsWithHandler(handler &CGH, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
Definition: reduction.hpp:2389
size_t GreatestPowerOfTwo(size_t N)
Computes the greatest power-of-two less than or equal to N.
Definition: reduction.hpp:1389
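One straightforward way to compute this, as a sketch rather than the implementation at reduction.hpp:1389:

#include <cstddef>

size_t greatestPowerOfTwo(size_t N) {
  if (N == 0)
    return 0;
  size_t P = 1;
  while (N >>= 1) // one doubling per bit above the lowest position
    P <<= 1;
  return P; // e.g. N == 100 yields 64
}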
void doTreeReduction(size_t WorkSize, nd_item< Dim > NDIt, LocalRedsTy &LocalReds, BinOpTy &BOp, AccessFuncTy AccessFunc)
Definition: reduction.hpp:1434
std::conditional_t< std::is_same_v< KernelName, auto_name >, auto_name, reduction::InitMemKrn< KernelName > > __sycl_init_mem_for
A helper to pass undefined (sycl::detail::auto_name) names unmodified.
Definition: reduction.hpp:830
std::bool_constant< std::is_same_v< BinaryOperation, sycl::multiplies< T > >||std::is_same_v< BinaryOperation, sycl::multiplies< void > >> IsMultiplies
size_t reduGetMemPerWorkItemHelper(Reduction &)
Definition: reduction.hpp:2578
auto getReducerAccess(ReducerT &Reducer)
Definition: reduction.hpp:210
std::bool_constant<(IsPlus< T, BinaryOperation >::value||IsMinimum< T, BinaryOperation >::value||IsMaximum< T, BinaryOperation >::value) &&is_sgenfloat_v< T > &&sizeof(T)==8 > IsReduOptForAtomic64Op
Definition: reduction.hpp:121
constexpr auto filterSequenceHelper(FunctorT, std::index_sequence< Is... >)
For each index 'I' from the given indices pack 'Is', this function initially creates a number of short...
Definition: reduction.hpp:2142
void reduAuxCGFuncImplScalar(nd_item< Dims > NDIt, size_t LID, size_t GID, size_t RemainingWorkSize, ReduTupleT< LocalAccT... > LocalAccsTuple, ReduTupleT< InAccT... > InAccsTuple, ReduTupleT< OutAccT... > OutAccsTuple, ReduTupleT< Ts... > IdentitiesTuple, ReduTupleT< BOPsT... > BOPsTuple, std::array< bool, sizeof...(Reductions)> InitToIdentityProps, std::index_sequence< Is... > ReduIndices)
All scalar reductions are processed together; there is one loop of log2(N) steps, and each reduction ...
Definition: reduction.hpp:2404
std::bool_constant<((is_sgenfloat_v< T > &&sizeof(T)==4)||is_sgeninteger_v< T >) &&IsValidAtomicType< T >::value &&(IsPlus< T, BinaryOperation >::value||IsMinimum< T, BinaryOperation >::value||IsMaximum< T, BinaryOperation >::value||IsBitOR< T, BinaryOperation >::value||IsBitXOR< T, BinaryOperation >::value||IsBitAND< T, BinaryOperation >::value)> IsReduOptForFastAtomicFetch
Definition: reduction.hpp:102
std::tuple< std::tuple_element_t< Is, TupleT >... > tuple_select_elements(TupleT Tuple, std::index_sequence< Is... >)
Utility function: for the given tuple, returns a new tuple consisting only of the elements selected by the given index sequence.
Definition: reduction.hpp:2598
constexpr tuple< Ts... > make_tuple(Ts... Args)
Definition: tuple.hpp:35
size_t reduGetPreferredWGSize(std::shared_ptr< queue_impl > &Queue, size_t LocalMemBytesPerWorkItem)
Definition: reduction.cpp:116
std::bool_constant< std::is_same_v< BinaryOperation, sycl::plus< T > >||std::is_same_v< BinaryOperation, sycl::plus< void > >> IsPlus
size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... > ReduIndices)
Definition: reduction.hpp:2504
constexpr auto filterSequence(FunctorT F, std::index_sequence< Is... > Indices)
For each index 'I' from the given indices pack 'Is', this function returns an index sequence consistin...
Definition: reduction.hpp:2158
std::bool_constant<((is_sgeninteger_v< T > &&(sizeof(T)==4||sizeof(T)==8))||is_sgenfloat_v< T >) &&(IsPlus< T, BinaryOperation >::value||IsMinimum< T, BinaryOperation >::value||IsMaximum< T, BinaryOperation >::value)> IsReduOptForFastReduce
Definition: reduction.hpp:139
void reduCGFuncImplArray(nd_item< Dims > NDIt, ReduTupleT< LocalAccT... > LocalAccsTuple, ReduTupleT< OutAccT... > OutAccsTuple, std::tuple< ReducerT... > &ReducersTuple, ReduTupleT< BOPsT... > BOPsTuple, std::array< bool, sizeof...(Reductions)> InitToIdentityProps, std::index_sequence< Is... >)
Definition: reduction.hpp:2285
void workGroupBarrier()
Definition: group.hpp:42
ReduTupleT< Ts... > makeReduTupleT(Ts... Elements)
Definition: reduction.hpp:147
std::conditional_t< std::is_same_v< KernelName, auto_name >, auto_name, MainOrAux< KernelName, Strategy, Ts... > > __sycl_reduction_kernel
A helper to pass undefined (sycl::detail::auto_name) names unmodified.
Definition: reduction.hpp:1224
std::bool_constant< std::is_same_v< BinaryOperation, sycl::minimum< T > >||std::is_same_v< BinaryOperation, sycl::minimum< void > >> IsMinimum
void reduction_parallel_for(handler &CGH, range< Dims > NDRange, PropertiesT Properties, RestT... Rest)
Definition: reduction.hpp:2717
void addCounterInit(handler &CGH, std::shared_ptr< sycl::detail::queue_impl > &Queue, std::shared_ptr< int > &Counter)
Definition: reduction.cpp:170
size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize, size_t &NWorkGroups)
Definition: reduction.cpp:20
constexpr std::index_sequence concat_sequences(std::index_sequence<>)
Definition: reduction.hpp:2094
std::bool_constant< std::is_same_v< BinaryOperation, sycl::bit_and< T > >||std::is_same_v< BinaryOperation, sycl::bit_and< void > >> IsBitAND
auto createReduOutAccs(size_t NWorkGroups, handler &CGH, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
For the given 'Reductions' types pack and indices enumerating them, this function either creates new t...
Definition: reduction.hpp:2045
auto getLastCombine(OutAccT OutAcc, LocalAccT LocalAcc, BOPT BOP, IdentityContainerT IdentityContainer, bool IsInitializeToIdentity)
Definition: reduction.hpp:2057
std::bool_constant< std::is_same_v< BinaryOperation, sycl::bit_xor< T > >||std::is_same_v< BinaryOperation, sycl::bit_xor< void > >> IsBitXOR
typename tuple_element< I, T >::type tuple_element_t
Definition: tuple.hpp:55
constexpr auto makeAdjustedBOPs(ReduTupleT< BOPsT... > &BOPsTuple, std::index_sequence< Is... >)
Definition: reduction.hpp:2184
void reduCGFuncMulti(handler &CGH, KernelType KernelFunc, const nd_range< Dims > &Range, PropertiesT Properties, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... > ReduIndices)
Definition: reduction.hpp:2304
constexpr auto makeAdjustedBOP(BOPT &BOP)
Definition: reduction.hpp:2177
auto make_reduction(RedOutVar RedVar, RestTy &&...Rest)
Definition: reduction.hpp:1163
sycl::detail::tuple< Ts... > ReduTupleT
Definition: reduction.hpp:146
constexpr mode_tag_t< access_mode::read > read_only
Definition: access.hpp:84
constexpr property::no_init no_init
constexpr mode_tag_t< access_mode::read_write > read_write
Definition: access.hpp:85
std::enable_if_t<(is_group_v< std::decay_t< Group >> &&(detail::is_scalar_arithmetic< T >::value||(detail::is_complex< T >::value &&detail::is_multiplies< T, BinaryOperation >::value)) &&detail::is_native_op< T, BinaryOperation >::value), T > reduce_over_group(Group g, T x, BinaryOperation binary_op)
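A usage sketch of reduce_over_group; the group count and the contributed operand are arbitrary:

#include <sycl/sycl.hpp>

int main() {
  sycl::queue Q;
  int *Out = sycl::malloc_shared<int>(4, Q);
  Q.parallel_for(sycl::nd_range<1>{sycl::range<1>{256}, sycl::range<1>{64}},
                 [=](sycl::nd_item<1> It) {
    // Every work-item contributes one value; the whole group gets the sum.
    int Partial = sycl::reduce_over_group(
        It.get_group(), static_cast<int>(It.get_global_linear_id()),
        sycl::plus<>());
    if (It.get_local_linear_id() == 0)
      Out[It.get_group_linear_id()] = Partial; // one partial per work-group
  }).wait();
  sycl::free(Out, Q);
}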
auto reduction(buffer< T, 1, AllocatorT > Var, handler &CGH, BinaryOperation Combiner, const property_list &PropList={})
Constructs a reduction object using the given buffer Var, handler CGH, reduction operation Combiner,...
Definition: reduction.hpp:2843
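A minimal end-to-end sketch of this overload; the range, element type, and property choice are illustrative:

#include <sycl/sycl.hpp>

int main() {
  sycl::queue Q;
  sycl::buffer<int, 1> SumBuf{sycl::range<1>{1}};
  Q.submit([&](sycl::handler &CGH) {
    auto Red = sycl::reduction(
        SumBuf, CGH, sycl::plus<>(),
        sycl::property_list{sycl::property::reduction::initialize_to_identity{}});
    CGH.parallel_for(sycl::range<1>{1024}, Red,
                     [=](sycl::id<1> I, auto &Sum) { Sum += static_cast<int>(I[0]); });
  });
  sycl::host_accessor Result{SumBuf, sycl::read_only};
  // Result[0] == 1023 * 1024 / 2
}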
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using E and sycl_category().
Definition: exception.cpp:65
const void value_type
Definition: multi_ptr.hpp:457
void free(void *ptr, const context &ctxt, const detail::code_location &CodeLoc=detail::code_location::current())
Definition: usm_impl.cpp:343
Definition: access.hpp:18
_Abi const simd< _Tp, _Abi > & noexcept
Definition: simd.hpp:1324
std::conditional_t< Cond, std::index_sequence< I >, std::index_sequence<> > type
Definition: reduction.hpp:2132
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
Definition: reduction.hpp:2649
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, RestT... Rest)
Definition: reduction.hpp:2696
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
Definition: reduction.hpp:1830
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
Definition: reduction.hpp:1595
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
Definition: reduction.hpp:1285
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
Definition: reduction.hpp:1692
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
Definition: reduction.hpp:1234
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
Definition: reduction.hpp:1631
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, RestT... Rest)
Definition: reduction.hpp:2605
static void run(handler &CGH, std::shared_ptr< detail::queue_impl > &Queue, nd_range< Dims > NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc)
Definition: reduction.hpp:1484
const IdentityContainerT & IdentityContainer
Definition: reduction.hpp:531
const BinaryOperation BOp
Definition: reduction.hpp:532
Helper class for accessing reducer-defined types in CRTP May prove to be useful for other things late...
Definition: reduction.hpp:163
static constexpr int value
Definition: reduction.hpp:799
is_device_copyable is a user-specializable class template to indicate that a type T is device copyabl...