DPC++ Runtime
Runtime libraries for oneAPI DPC++
non_uniform_algorithms.hpp
Go to the documentation of this file.
1 //==----- non_uniform_algorithms.hpp - cuda masked subgroup algorithms -----==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #pragma once
10 
11 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
12 #include "masked_shuffles.hpp"
13 
14 namespace sycl {
15 inline namespace _V1 {
16 namespace detail {
17 
18 template <typename T, class BinaryOperation>
19 using IsRedux = std::bool_constant<
20  std::is_integral<T>::value && IsBitAND<T, BinaryOperation>::value ||
21  IsBitOR<T, BinaryOperation>::value || IsBitXOR<T, BinaryOperation>::value ||
22  IsPlus<T, BinaryOperation>::value || IsMinimum<T, BinaryOperation>::value ||
23  IsMaximum<T, BinaryOperation>::value>;
24 
26 
27 template <typename Group, typename T, class BinaryOperation>
28 std::enable_if_t<is_sugeninteger_v<T> && IsMinimum<T, BinaryOperation>::value,
29  T>
30 masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
31  const uint32_t MemberMask) {
32  return __nvvm_redux_sync_umin(x, MemberMask);
33 }
34 
35 template <typename Group, typename T, class BinaryOperation>
36 std::enable_if_t<is_sigeninteger_v<T> && IsMinimum<T, BinaryOperation>::value,
37  T>
38 masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
39  const uint32_t MemberMask) {
40  return __nvvm_redux_sync_min(x, MemberMask);
41 }
42 
43 template <typename Group, typename T, class BinaryOperation>
44 std::enable_if_t<is_sugeninteger_v<T> && IsMaximum<T, BinaryOperation>::value,
45  T>
46 masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
47  const uint32_t MemberMask) {
48  return __nvvm_redux_sync_umax(x, MemberMask);
49 }
50 
51 template <typename Group, typename T, class BinaryOperation>
52 std::enable_if_t<is_sigeninteger_v<T> && IsMaximum<T, BinaryOperation>::value,
53  T>
54 masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
55  const uint32_t MemberMask) {
56  return __nvvm_redux_sync_max(x, MemberMask);
57 }
58 
59 template <typename Group, typename T, class BinaryOperation>
60 std::enable_if_t<(is_sugeninteger_v<T> ||
61  is_sigeninteger_v<T>)&&IsPlus<T, BinaryOperation>::value,
62  T>
63 masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
64  const uint32_t MemberMask) {
65  return __nvvm_redux_sync_add(x, MemberMask);
66 }
67 
68 template <typename Group, typename T, class BinaryOperation>
69 std::enable_if_t<(is_sugeninteger_v<T> ||
70  is_sigeninteger_v<T>)&&IsBitAND<T, BinaryOperation>::value,
71  T>
72 masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
73  const uint32_t MemberMask) {
74  return __nvvm_redux_sync_and(x, MemberMask);
75 }
76 
77 template <typename Group, typename T, class BinaryOperation>
78 std::enable_if_t<(is_sugeninteger_v<T> ||
79  is_sigeninteger_v<T>)&&IsBitOR<T, BinaryOperation>::value,
80  T>
81 masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
82  const uint32_t MemberMask) {
83  return __nvvm_redux_sync_or(x, MemberMask);
84 }
85 
86 template <typename Group, typename T, class BinaryOperation>
87 std::enable_if_t<(is_sugeninteger_v<T> ||
88  is_sigeninteger_v<T>)&&IsBitXOR<T, BinaryOperation>::value,
89  T>
90 masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
91  const uint32_t MemberMask) {
92  return __nvvm_redux_sync_xor(x, MemberMask);
93 }
95 
97 
98 // fixed_size_group group reduction using shfls
99 template <typename Group, typename T, class BinaryOperation>
100 inline __SYCL_ALWAYS_INLINE std::enable_if_t<is_fixed_size_group_v<Group>, T>
101 masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
102  const uint32_t MemberMask) {
103  for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) {
104  T tmp = cuda_shfl_sync_bfly_i32(MemberMask, x, i, 0x1f);
105  x = binary_op(x, tmp);
106  }
107  return x;
108 }
109 
110 // Opportunistic/Ballot group reduction using shfls
111 template <typename Group, typename T, class BinaryOperation>
112 inline __SYCL_ALWAYS_INLINE std::enable_if_t<
113  ext::oneapi::experimental::is_user_constructed_group_v<Group> &&
114  !is_fixed_size_group_v<Group>,
115  T>
116 masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
117  const uint32_t MemberMask) {
118 
119  unsigned localSetBit = g.get_local_id()[0] + 1;
120 
121  // number of elements requiring binary operations each loop iteration
122  auto opRange = g.get_local_range()[0];
123 
124  // stride between local_ids forming a binary op
125  unsigned stride = opRange / 2;
126  while (stride >= 1) {
127 
128  // if (remainder == 1), there is a WI without a binary op partner
129  unsigned remainder = opRange % 2;
130 
131  // unfolded position of set bit in mask of shfl src lane
132  int unfoldedSrcSetBit = localSetBit + stride;
133 
134  // __nvvm_fns automatically wraps around to the correct bit position.
135  // There is no performance impact on src_set_bit position wrt localSetBit
136  T tmp = cuda_shfl_sync_idx_i32(
137  MemberMask, x, __nvvm_fns(MemberMask, 0, unfoldedSrcSetBit), 31);
138 
139  if (!(localSetBit == 1 && remainder != 0)) {
140  x = binary_op(x, tmp);
141  }
142 
143  opRange = stride + remainder;
144  stride = opRange / 2;
145  }
146  unsigned broadID;
147  asm volatile(".reg .u32 rev;\n\t"
148  "brev.b32 rev, %1;\n\t" // reverse mask bits
149  "clz.b32 %0, rev;"
150  : "=r"(broadID)
151  : "r"(MemberMask));
152 
153  x = cuda_shfl_sync_idx_i32(MemberMask, x, broadID, 31);
154  return x;
155 }
156 
157 // Non Redux types must fall back to shfl based implementations.
158 template <typename Group, typename T, class BinaryOperation>
159 std::enable_if_t<
160  std::is_same<IsRedux<T, BinaryOperation>, std::false_type>::value &&
161  ext::oneapi::experimental::is_user_constructed_group_v<Group>,
162  T>
163 masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
164  const uint32_t MemberMask) {
165  return masked_reduction_cuda_shfls(g, x, binary_op, MemberMask);
166 }
167 
168 // get_identity is only currently used in this cuda specific header. If in the
169 // future it has more general use it should be moved to a more appropriate
170 // header.
171 template <typename T, class BinaryOperation>
173  std::enable_if_t<IsPlus<T, BinaryOperation>::value ||
174  IsBitOR<T, BinaryOperation>::value ||
175  IsBitXOR<T, BinaryOperation>::value,
176  T>
177  get_identity() {
178  return 0;
179 }
180 
181 template <typename T, class BinaryOperation>
183  std::enable_if_t<IsMultiplies<T, BinaryOperation>::value, T>
184  get_identity() {
185  return 1;
186 }
187 
188 template <typename T, class BinaryOperation>
190  std::enable_if_t<IsBitAND<T, BinaryOperation>::value, T>
191  get_identity() {
192  return ~0;
193 }
194 
195 template <typename T, class BinaryOperation>
197  std::enable_if_t<IsMinimum<T, BinaryOperation>::value, T>
198  get_identity() {
200 }
201 
202 template <typename T, class BinaryOperation>
204  std::enable_if_t<IsMaximum<T, BinaryOperation>::value, T>
205  get_identity() {
207 }
208 
210 
211 // fixed_size_group group scan using shfls
212 template <__spv::GroupOperation Op, typename Group, typename T,
213  class BinaryOperation>
214 inline __SYCL_ALWAYS_INLINE std::enable_if_t<is_fixed_size_group_v<Group>, T>
215 masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op,
216  const uint32_t MemberMask) {
217  unsigned localIdVal = g.get_local_id()[0];
218  for (int i = 1; i < g.get_local_range()[0]; i *= 2) {
219  T tmp = cuda_shfl_sync_up_i32(MemberMask, x, i, 0);
220  if (localIdVal >= i)
221  x = binary_op(x, tmp);
222  }
223  if constexpr (Op == __spv::GroupOperation::ExclusiveScan) {
224  x = cuda_shfl_sync_up_i32(MemberMask, x, 1, 0);
225  if (localIdVal == 0) {
226  return get_identity<T, BinaryOperation>();
227  }
228  }
229  return x;
230 }
231 
232 template <__spv::GroupOperation Op, typename Group, typename T,
233  class BinaryOperation>
234 inline __SYCL_ALWAYS_INLINE std::enable_if_t<
235  ext::oneapi::experimental::is_user_constructed_group_v<Group> &&
236  !is_fixed_size_group_v<Group>,
237  T>
238 masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op,
239  const uint32_t MemberMask) {
240  unsigned localIdVal = g.get_local_id()[0];
241  unsigned localSetBit = localIdVal + 1;
242 
243  for (int i = 1; i < g.get_local_range()[0]; i *= 2) {
244  int unfoldedSrcSetBit = localSetBit - i;
245 
246  T tmp = cuda_shfl_sync_idx_i32(
247  MemberMask, x, __nvvm_fns(MemberMask, 0, unfoldedSrcSetBit), 31);
248 
249  if (localIdVal >= i)
250  x = binary_op(x, tmp);
251  }
252  if constexpr (Op == __spv::GroupOperation::ExclusiveScan) {
253  x = cuda_shfl_sync_idx_i32(MemberMask, x,
254  __nvvm_fns(MemberMask, 0, localSetBit - 1), 31);
255  if (localIdVal == 0) {
256  return get_identity<T, BinaryOperation>();
257  }
258  }
259  return x;
260 }
261 
262 } // namespace detail
263 } // namespace _V1
264 } // namespace sycl
265 
266 #endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
#define __SYCL_ALWAYS_INLINE
autodecltype(x) x
Definition: access.hpp:18