DPC++ Runtime
Runtime libraries for oneAPI DPC++
spirv.hpp
1 //===-- spirv.hpp - Helpers to generate SPIR-V instructions ----*- C++ -*--===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #pragma once
10 
11 #ifdef __SYCL_DEVICE_ONLY__
12 
13 #include <sycl/ext/oneapi/experimental/non_uniform_groups.hpp> // for IdToMaskPosition
14 
15 #if defined(__NVPTX__)
16 #include <sycl/ext/oneapi/experimental/cuda/masked_shuffles.hpp> // cuda_shfl_sync_* helpers
17 #endif
18 
19 #include <sycl/detail/memcpy.hpp> // sycl::detail::memcpy
20 
21 namespace sycl {
22 inline namespace _V1 {
23 struct sub_group;
24 namespace ext {
25 namespace oneapi {
26 struct sub_group;
27 namespace experimental {
28 template <typename ParentGroup> class ballot_group;
29 template <size_t PartitionSize, typename ParentGroup> class fixed_size_group;
30 template <int Dimensions> class root_group;
31 template <typename ParentGroup> class tangle_group;
32 class opportunistic_group;
33 } // namespace experimental
34 } // namespace oneapi
35 } // namespace ext
36 
37 namespace detail {
38 
39 // Helper for reinterpret casting the decorated pointer inside a multi_ptr
40 // without losing the decorations.
41 template <typename ToT, typename FromT, access::address_space Space,
42  access::decorated IsDecorated>
43 inline typename multi_ptr<ToT, Space, IsDecorated>::pointer
44 GetMultiPtrDecoratedAs(multi_ptr<FromT, Space, IsDecorated> MPtr) {
45   if constexpr (IsDecorated == access::decorated::legacy)
46     return reinterpret_cast<
47         typename multi_ptr<ToT, Space, IsDecorated>::pointer>(
48         MPtr.get());
49   else
50     return reinterpret_cast<
51         typename multi_ptr<ToT, Space, IsDecorated>::pointer>(
52         MPtr.get_decorated());
53 }
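// Example (sketch): the floating-point atomics later in this file use this
// helper to view a decorated float pointer as a same-width unsigned integer
// pointer, e.g.
//
//   multi_ptr<float, access::address_space::global_space,
//             access::decorated::yes> FPtr = /* ... */;
//   auto *UPtr = GetMultiPtrDecoratedAs<uint32_t>(FPtr);
//
// UPtr keeps the address-space decoration of FPtr but points to uint32_t,
// which is what the integer __spirv_Atomic* builtins expect.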
54 
55 namespace spirv {
56 
57 template <typename Group>
58 struct is_tangle_or_opportunistic_group : std::false_type {};
59 
60 template <typename ParentGroup>
61 struct is_tangle_or_opportunistic_group<
62  sycl::ext::oneapi::experimental::tangle_group<ParentGroup>>
63  : std::true_type {};
64 
65 template <>
66 struct is_tangle_or_opportunistic_group<
67  sycl::ext::oneapi::experimental::opportunistic_group> : std::true_type {};
68 
69 template <typename Group> struct is_ballot_group : std::false_type {};
70 
71 template <typename ParentGroup>
72 struct is_ballot_group<
73  sycl::ext::oneapi::experimental::ballot_group<ParentGroup>>
74  : std::true_type {};
75 
76 template <typename Group> struct is_fixed_size_group : std::false_type {};
77 
78 template <size_t PartitionSize, typename ParentGroup>
79 struct is_fixed_size_group<sycl::ext::oneapi::experimental::fixed_size_group<
80  PartitionSize, ParentGroup>> : std::true_type {};
81 
82 template <typename Group> struct group_scope {};
83 
84 template <int Dimensions>
85 struct group_scope<sycl::ext::oneapi::experimental::root_group<Dimensions>> {
86  static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Device;
87 };
88 
89 template <int Dimensions> struct group_scope<group<Dimensions>> {
90  static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Workgroup;
91 };
92 
93 template <> struct group_scope<::sycl::ext::oneapi::sub_group> {
94  static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Subgroup;
95 };
96 template <> struct group_scope<::sycl::sub_group> {
97  static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Subgroup;
98 };
99 
100 template <typename ParentGroup>
101 struct group_scope<sycl::ext::oneapi::experimental::ballot_group<ParentGroup>> {
102  static constexpr __spv::Scope::Flag value = group_scope<ParentGroup>::value;
103 };
104 
105 template <size_t PartitionSize, typename ParentGroup>
106 struct group_scope<sycl::ext::oneapi::experimental::fixed_size_group<
107  PartitionSize, ParentGroup>> {
108  static constexpr __spv::Scope::Flag value = group_scope<ParentGroup>::value;
109 };
110 
111 template <typename ParentGroup>
112 struct group_scope<sycl::ext::oneapi::experimental::tangle_group<ParentGroup>> {
113  static constexpr __spv::Scope::Flag value = group_scope<ParentGroup>::value;
114 };
115 
116 template <>
117 struct group_scope<sycl::ext::oneapi::experimental::opportunistic_group> {
118   static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Subgroup;
119 };
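// Illustrative compile-time checks of the mapping above (a sketch, using only
// the specializations defined in this file):
static_assert(group_scope<::sycl::sub_group>::value ==
                  __spv::Scope::Flag::Subgroup,
              "sub-groups map to SPIR-V Subgroup scope");
static_assert(
    group_scope<sycl::ext::oneapi::experimental::root_group<1>>::value ==
        __spv::Scope::Flag::Device,
    "the root group maps to SPIR-V Device scope");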
120 
121 // Generic shuffles and broadcasts may require multiple calls to
122 // intrinsics, and should use the fewest calls possible:
123 // - Loop over chunks until remaining bytes < chunk size
124 // - At most one 32-bit, 16-bit and 8-bit chunk left over
125 #ifndef __NVPTX__
126 using ShuffleChunkT = uint64_t;
127 #else
128 using ShuffleChunkT = uint32_t;
129 #endif
130 template <typename T, typename Functor>
131 void GenericCall(const Functor &ApplyToBytes) {
132  if (sizeof(T) >= sizeof(ShuffleChunkT)) {
133 #pragma unroll
134  for (size_t Offset = 0; Offset + sizeof(ShuffleChunkT) <= sizeof(T);
135  Offset += sizeof(ShuffleChunkT)) {
136  ApplyToBytes(Offset, sizeof(ShuffleChunkT));
137  }
138  }
139  if (sizeof(ShuffleChunkT) >= sizeof(uint64_t)) {
140  if (sizeof(T) % sizeof(uint64_t) >= sizeof(uint32_t)) {
141  size_t Offset = sizeof(T) / sizeof(uint64_t) * sizeof(uint64_t);
142  ApplyToBytes(Offset, sizeof(uint32_t));
143  }
144  }
145  if (sizeof(ShuffleChunkT) >= sizeof(uint32_t)) {
146  if (sizeof(T) % sizeof(uint32_t) >= sizeof(uint16_t)) {
147  size_t Offset = sizeof(T) / sizeof(uint32_t) * sizeof(uint32_t);
148  ApplyToBytes(Offset, sizeof(uint16_t));
149  }
150  }
151  if (sizeof(ShuffleChunkT) >= sizeof(uint16_t)) {
152  if (sizeof(T) % sizeof(uint16_t) >= sizeof(uint8_t)) {
153  size_t Offset = sizeof(T) / sizeof(uint16_t) * sizeof(uint16_t);
154  ApplyToBytes(Offset, sizeof(uint8_t));
155  }
156  }
157 }
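// Worked example (sketch): for a 15-byte trivially copyable T on a SPIR-V
// target (ShuffleChunkT == uint64_t), ApplyToBytes is invoked with
//   (Offset, Size) = (0, 8), (8, 4), (12, 2), (14, 1)
// i.e. one full 64-bit chunk followed by 32-bit, 16-bit and 8-bit remainders,
// covering all 15 bytes with the fewest calls.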
158 
159 template <typename Group> bool GroupAll(Group, bool pred) {
160  return __spirv_GroupAll(group_scope<Group>::value, pred);
161 }
162 template <typename ParentGroup>
163 bool GroupAll(ext::oneapi::experimental::ballot_group<ParentGroup> g,
164  bool pred) {
165  // ballot_group partitions its parent into two groups (0 and 1)
166  // We have to force each group down different control flow
167  // Work-items in the "false" group (0) may still be active
168  if (g.get_group_id() == 1) {
169  return __spirv_GroupNonUniformAll(group_scope<ParentGroup>::value, pred);
170  } else {
171  return __spirv_GroupNonUniformAll(group_scope<ParentGroup>::value, pred);
172  }
173 }
174 template <size_t PartitionSize, typename ParentGroup>
175 bool GroupAll(
176  ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup>,
177  bool pred) {
178  // GroupNonUniformAll doesn't support cluster size, so use a reduction
179  return __spirv_GroupNonUniformBitwiseAnd(
180  group_scope<ParentGroup>::value,
181  static_cast<uint32_t>(__spv::GroupOperation::ClusteredReduce),
182  static_cast<uint32_t>(pred), PartitionSize);
183 }
184 template <typename ParentGroup>
185 bool GroupAll(ext::oneapi::experimental::tangle_group<ParentGroup>, bool pred) {
186  return __spirv_GroupNonUniformAll(group_scope<ParentGroup>::value, pred);
187 }
188 
189 bool GroupAll(const ext::oneapi::experimental::opportunistic_group &,
190  bool pred) {
191  return __spirv_GroupNonUniformAll(
192  group_scope<ext::oneapi::experimental::opportunistic_group>::value, pred);
193 }
194 
195 template <typename Group> bool GroupAny(Group, bool pred) {
196  return __spirv_GroupAny(group_scope<Group>::value, pred);
197 }
198 template <typename ParentGroup>
199 bool GroupAny(ext::oneapi::experimental::ballot_group<ParentGroup> g,
200  bool pred) {
201  // ballot_group partitions its parent into two groups (0 and 1)
202  // We have to force each group down different control flow
203  // Work-items in the "false" group (0) may still be active
204  if (g.get_group_id() == 1) {
205  return __spirv_GroupNonUniformAny(group_scope<ParentGroup>::value, pred);
206  } else {
207  return __spirv_GroupNonUniformAny(group_scope<ParentGroup>::value, pred);
208  }
209 }
210 template <size_t PartitionSize, typename ParentGroup>
211 bool GroupAny(
212  ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup>,
213  bool pred) {
214  // GroupNonUniformAny doesn't support cluster size, so use a reduction
215  return __spirv_GroupNonUniformBitwiseOr(
216  group_scope<ParentGroup>::value,
217  static_cast<uint32_t>(__spv::GroupOperation::ClusteredReduce),
218  static_cast<uint32_t>(pred), PartitionSize);
219 }
220 template <typename ParentGroup>
221 bool GroupAny(ext::oneapi::experimental::tangle_group<ParentGroup>, bool pred) {
222  return __spirv_GroupNonUniformAny(group_scope<ParentGroup>::value, pred);
223 }
224 bool GroupAny(const ext::oneapi::experimental::opportunistic_group &,
225  bool pred) {
226  return __spirv_GroupNonUniformAny(
227  group_scope<ext::oneapi::experimental::opportunistic_group>::value, pred);
228 }
229 
230 // Native broadcasts map directly to a SPIR-V GroupBroadcast intrinsic
231 // FIXME: Do not special-case for half or vec once all backends support all data
232 // types.
233 template <typename T>
234 using is_native_broadcast =
235  std::bool_constant<detail::is_arithmetic<T>::value &&
236  !std::is_same<T, half>::value &&
237  !detail::is_vec<T>::value>;
238 
239 template <typename T, typename IdT = size_t>
240 using EnableIfNativeBroadcast = std::enable_if_t<
241  is_native_broadcast<T>::value && std::is_integral<IdT>::value, T>;
242 
243 // Bitcast broadcasts can be implemented using a single SPIR-V GroupBroadcast
244 // intrinsic, but require type-punning via an appropriate integer type
245 template <typename T>
246 using is_bitcast_broadcast = std::bool_constant<
247  !is_native_broadcast<T>::value && std::is_trivially_copyable<T>::value &&
248  (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8)>;
249 
250 template <typename T, typename IdT = size_t>
251 using EnableIfBitcastBroadcast = std::enable_if_t<
252  is_bitcast_broadcast<T>::value && std::is_integral<IdT>::value, T>;
253 
254 template <typename T>
255 using ConvertToNativeBroadcastType_t = select_cl_scalar_integral_unsigned_t<T>;
256 
257 // Generic broadcasts may require multiple calls to SPIR-V GroupBroadcast
258 // intrinsics, and should use the fewest broadcasts possible
259 // - Loop over 64-bit chunks until remaining bytes < 64-bit
260 // - At most one 32-bit, 16-bit and 8-bit chunk left over
261 template <typename T>
262 using is_generic_broadcast =
263  std::bool_constant<!is_native_broadcast<T>::value &&
264  !is_bitcast_broadcast<T>::value &&
265  std::is_trivially_copyable<T>::value>;
266 
267 template <typename T, typename IdT = size_t>
268 using EnableIfGenericBroadcast = std::enable_if_t<
269  is_generic_broadcast<T>::value && std::is_integral<IdT>::value, T>;
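// Illustrative classification under the traits above (a sketch; ExampleRGB is
// a hypothetical 3-byte POD used only for this example):
static_assert(is_native_broadcast<int>::value,
              "plain arithmetic types use a single native GroupBroadcast");
static_assert(is_bitcast_broadcast<half>::value,
              "half is type-punned to a 16-bit integer and broadcast once");
struct ExampleRGB {
  char r, g, b;
};
static_assert(is_generic_broadcast<ExampleRGB>::value,
              "a 3-byte POD takes the chunked generic broadcast path");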
270 
271 // FIXME: Disable widening once all backends support all data types.
272 template <typename T>
273 using WidenOpenCLTypeTo32_t = std::conditional_t<
274     std::is_same<T, opencl::cl_char>() || std::is_same<T, opencl::cl_short>(),
275     opencl::cl_int,
276     std::conditional_t<std::is_same<T, opencl::cl_uchar>() ||
277  std::is_same<T, opencl::cl_ushort>(),
278  opencl::cl_uint, T>>;
279 
280 // Broadcast with scalar local index
281 // Work-group supports any integral type
282 // Sub-group currently supports only uint32_t
283 template <typename Group> struct GroupId {
284  using type = size_t;
285 };
286 template <> struct GroupId<::sycl::ext::oneapi::sub_group> {
287  using type = uint32_t;
288 };
289 template <> struct GroupId<::sycl::sub_group> {
290  using type = uint32_t;
291 };
292 
293 // Consolidated function for converting group arguments to OpenCL types.
294 template <typename Group, typename T, typename IdT>
295 EnableIfNativeBroadcast<T, IdT> GroupBroadcast(Group, T x, IdT local_id) {
296  auto GroupLocalId = static_cast<typename GroupId<Group>::type>(local_id);
297  auto OCLX = detail::convertToOpenCLType(x);
298  WidenOpenCLTypeTo32_t<decltype(OCLX)> WideOCLX = OCLX;
299  auto OCLId = detail::convertToOpenCLType(GroupLocalId);
300  return __spirv_GroupBroadcast(group_scope<Group>::value, WideOCLX, OCLId);
301 }
302 
303 template <typename ParentGroup, typename T, typename IdT>
304 EnableIfNativeBroadcast<T, IdT>
305 GroupBroadcast(ext::oneapi::experimental::ballot_group<ParentGroup> g,
306                T x, IdT local_id) {
307  // Remap local_id to its original numbering in ParentGroup.
308  auto LocalId = detail::IdToMaskPosition(g, local_id);
309 
310  // TODO: Refactor to avoid duplication after design settles.
311  auto GroupLocalId = static_cast<typename GroupId<ParentGroup>::type>(LocalId);
312  auto OCLX = detail::convertToOpenCLType(x);
313  WidenOpenCLTypeTo32_t<decltype(OCLX)> WideOCLX = OCLX;
314  auto OCLId = detail::convertToOpenCLType(GroupLocalId);
315 
316  // ballot_group partitions its parent into two groups (0 and 1)
317  // We have to force each group down different control flow
318  // Work-items in the "false" group (0) may still be active
319  if (g.get_group_id() == 1) {
320  return __spirv_GroupNonUniformBroadcast(group_scope<ParentGroup>::value,
321  WideOCLX, OCLId);
322  } else {
323  return __spirv_GroupNonUniformBroadcast(group_scope<ParentGroup>::value,
324  WideOCLX, OCLId);
325  }
326 }
327 template <size_t PartitionSize, typename ParentGroup, typename T, typename IdT>
328 EnableIfNativeBroadcast<T, IdT> GroupBroadcast(
329  ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup> g,
330  T x, IdT local_id) {
331  // Remap local_id to its original numbering in ParentGroup
332  auto LocalId = g.get_group_linear_id() * PartitionSize + local_id;
333 
334  // TODO: Refactor to avoid duplication after design settles.
335  auto GroupLocalId = static_cast<typename GroupId<ParentGroup>::type>(LocalId);
336  auto OCLX = detail::convertToOpenCLType(x);
337  WidenOpenCLTypeTo32_t<decltype(OCLX)> WideOCLX = OCLX;
338  auto OCLId = detail::convertToOpenCLType(GroupLocalId);
339 
340  // NonUniformBroadcast requires Id to be dynamically uniform, which does not
341  // hold here; each partition is broadcasting a separate index. We could
342   // fall back to either NonUniformShuffle or a NonUniformBroadcast per
343  // partition, and it's unclear which will be faster in practice.
344  return __spirv_GroupNonUniformShuffle(group_scope<ParentGroup>::value,
345  WideOCLX, OCLId);
346 }
347 template <typename ParentGroup, typename T, typename IdT>
348 EnableIfNativeBroadcast<T, IdT>
349 GroupBroadcast(ext::oneapi::experimental::tangle_group<ParentGroup> g, T x,
350  IdT local_id) {
351  // Remap local_id to its original numbering in ParentGroup.
352  auto LocalId = detail::IdToMaskPosition(g, local_id);
353 
354  // TODO: Refactor to avoid duplication after design settles.
355  auto GroupLocalId = static_cast<typename GroupId<ParentGroup>::type>(LocalId);
356  auto OCLX = detail::convertToOpenCLType(x);
357  WidenOpenCLTypeTo32_t<decltype(OCLX)> WideOCLX = OCLX;
358  auto OCLId = detail::convertToOpenCLType(GroupLocalId);
359 
360  return __spirv_GroupNonUniformBroadcast(group_scope<ParentGroup>::value,
361  WideOCLX, OCLId);
362 }
363 template <typename T, typename IdT>
364 EnableIfNativeBroadcast<T, IdT>
365 GroupBroadcast(const ext::oneapi::experimental::opportunistic_group &g, T x,
366  IdT local_id) {
367  // Remap local_id to its original numbering in sub-group
368  auto LocalId = detail::IdToMaskPosition(g, local_id);
369 
370  // TODO: Refactor to avoid duplication after design settles.
371  auto GroupLocalId =
372  static_cast<typename GroupId<::sycl::sub_group>::type>(LocalId);
373  auto OCLX = detail::convertToOpenCLType(x);
374  WidenOpenCLTypeTo32_t<decltype(OCLX)> WideOCLX = OCLX;
375  auto OCLId = detail::convertToOpenCLType(GroupLocalId);
376 
377  return __spirv_GroupNonUniformBroadcast(
378  group_scope<ext::oneapi::experimental::opportunistic_group>::value,
379  WideOCLX, OCLId);
380 }
381 
382 template <typename Group, typename T, typename IdT>
383 EnableIfBitcastBroadcast<T, IdT> GroupBroadcast(Group g, T x, IdT local_id) {
384  using BroadcastT = ConvertToNativeBroadcastType_t<T>;
385  auto BroadcastX = sycl::bit_cast<BroadcastT>(x);
386  BroadcastT Result = GroupBroadcast(g, BroadcastX, local_id);
387  return sycl::bit_cast<T>(Result);
388 }
389 template <typename Group, typename T, typename IdT>
390 EnableIfGenericBroadcast<T, IdT> GroupBroadcast(Group g, T x, IdT local_id) {
391  // Initialize with x to support type T without default constructor
392  T Result = x;
393  char *XBytes = reinterpret_cast<char *>(&x);
394  char *ResultBytes = reinterpret_cast<char *>(&Result);
395  auto BroadcastBytes = [=](size_t Offset, size_t Size) {
396  uint64_t BroadcastX, BroadcastResult;
397  detail::memcpy(&BroadcastX, XBytes + Offset, Size);
398  BroadcastResult = GroupBroadcast(g, BroadcastX, local_id);
399  detail::memcpy(ResultBytes + Offset, &BroadcastResult, Size);
400  };
401  GenericCall<T>(BroadcastBytes);
402  return Result;
403 }
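// Example (sketch): broadcasting a hypothetical 12-byte POD (say three ints)
// resolves to the generic overload above and lowers to two GroupBroadcast
// calls, one for the first 8 bytes and one for the remaining 4 bytes, as
// driven by GenericCall.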
404 
405 // Broadcast with vector local index
406 template <typename Group, typename T, int Dimensions>
407 EnableIfNativeBroadcast<T> GroupBroadcast(Group g, T x,
408  id<Dimensions> local_id) {
409  if (Dimensions == 1) {
410  return GroupBroadcast(g, x, local_id[0]);
411  }
412  using IdT = vec<size_t, Dimensions>;
413  IdT VecId;
414  for (int i = 0; i < Dimensions; ++i) {
415  VecId[i] = local_id[Dimensions - i - 1];
416  }
417  auto OCLX = detail::convertToOpenCLType(x);
418  WidenOpenCLTypeTo32_t<decltype(OCLX)> WideOCLX = OCLX;
419  auto OCLId = detail::convertToOpenCLType(VecId);
420  return __spirv_GroupBroadcast(group_scope<Group>::value, WideOCLX, OCLId);
421 }
422 template <typename ParentGroup, typename T>
423 EnableIfNativeBroadcast<T>
424 GroupBroadcast(ext::oneapi::experimental::ballot_group<ParentGroup> g,
425                T x, id<1> local_id) {
426  // Limited to 1D indices for now because ParentGroup must be sub-group.
427  return GroupBroadcast(g, x, local_id[0]);
428 }
429 template <typename Group, typename T, int Dimensions>
430 EnableIfBitcastBroadcast<T> GroupBroadcast(Group g, T x,
431  id<Dimensions> local_id) {
432  using BroadcastT = ConvertToNativeBroadcastType_t<T>;
433  auto BroadcastX = sycl::bit_cast<BroadcastT>(x);
434  BroadcastT Result = GroupBroadcast(g, BroadcastX, local_id);
435  return sycl::bit_cast<T>(Result);
436 }
437 template <typename Group, typename T, int Dimensions>
438 EnableIfGenericBroadcast<T> GroupBroadcast(Group g, T x,
439  id<Dimensions> local_id) {
440  if (Dimensions == 1) {
441  return GroupBroadcast(g, x, local_id[0]);
442  }
443  // Initialize with x to support type T without default constructor
444  T Result = x;
445  char *XBytes = reinterpret_cast<char *>(&x);
446  char *ResultBytes = reinterpret_cast<char *>(&Result);
447  auto BroadcastBytes = [=](size_t Offset, size_t Size) {
448  uint64_t BroadcastX, BroadcastResult;
449  detail::memcpy(&BroadcastX, XBytes + Offset, Size);
450  BroadcastResult = GroupBroadcast(g, BroadcastX, local_id);
451  detail::memcpy(ResultBytes + Offset, &BroadcastResult, Size);
452  };
453  GenericCall<T>(BroadcastBytes);
454  return Result;
455 }
456 
457 // Single happens-before means semantics should always apply to all spaces
458 // Although consume is unsupported, forwarding to acquire is valid
459 template <typename T>
460 static constexpr
461  typename std::enable_if<std::is_same<T, sycl::memory_order>::value,
462                             __spv::MemorySemanticsMask::Flag>::type
463     getMemorySemanticsMask(T Order) {
464   __spv::MemorySemanticsMask::Flag SpvOrder = __spv::MemorySemanticsMask::None;
465   switch (Order) {
466   case T::relaxed:
467     SpvOrder = __spv::MemorySemanticsMask::None;
468     break;
469   case T::__consume_unsupported:
470   case T::acquire:
471     SpvOrder = __spv::MemorySemanticsMask::Acquire;
472     break;
473   case T::release:
474     SpvOrder = __spv::MemorySemanticsMask::Release;
475     break;
476   case T::acq_rel:
477     SpvOrder = __spv::MemorySemanticsMask::AcquireRelease;
478     break;
479   case T::seq_cst:
480     SpvOrder = __spv::MemorySemanticsMask::SequentiallyConsistent;
481     break;
482   }
483   return static_cast<__spv::MemorySemanticsMask::Flag>(
484       SpvOrder | __spv::MemorySemanticsMask::SubgroupMemory |
485       __spv::MemorySemanticsMask::WorkgroupMemory |
486       __spv::MemorySemanticsMask::CrossWorkgroupMemory);
487 }
488 
489 static constexpr __spv::Scope::Flag getScope(memory_scope Scope) {
490   switch (Scope) {
491   case memory_scope::work_item:
492     return __spv::Scope::Invocation;
493   case memory_scope::sub_group:
494     return __spv::Scope::Subgroup;
495   case memory_scope::work_group:
496     return __spv::Scope::Workgroup;
497   case memory_scope::device:
498     return __spv::Scope::Device;
499   case memory_scope::system:
500     return __spv::Scope::CrossDevice;
501   }
502 }
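// Worked example (sketch, given the mappings above): a relaxed, device-scoped
// atomic is lowered with
//   getMemorySemanticsMask(memory_order::relaxed)
//       == None | SubgroupMemory | WorkgroupMemory | CrossWorkgroupMemory
//   getScope(memory_scope::device) == __spv::Scope::Device
// which is what the Atomic* wrappers below pass to the __spirv_Atomic*
// builtins.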
503 
504 template <typename T, access::address_space AddressSpace,
505  access::decorated IsDecorated>
506 inline typename std::enable_if_t<std::is_integral<T>::value, T>
507 AtomicCompareExchange(multi_ptr<T, AddressSpace, IsDecorated> MPtr,
508  memory_scope Scope, memory_order Success,
509  memory_order Failure, T Desired, T Expected) {
510  auto SPIRVSuccess = getMemorySemanticsMask(Success);
511  auto SPIRVFailure = getMemorySemanticsMask(Failure);
512  auto SPIRVScope = getScope(Scope);
513  auto *Ptr = GetMultiPtrDecoratedAs<T>(MPtr);
514  return __spirv_AtomicCompareExchange(Ptr, SPIRVScope, SPIRVSuccess,
515  SPIRVFailure, Desired, Expected);
516 }
517 
518 template <typename T, access::address_space AddressSpace,
519  access::decorated IsDecorated>
520 inline typename std::enable_if_t<std::is_floating_point<T>::value, T>
521 AtomicCompareExchange(multi_ptr<T, AddressSpace, IsDecorated> MPtr,
522  memory_scope Scope, memory_order Success,
523  memory_order Failure, T Desired, T Expected) {
524  using I = detail::make_unsinged_integer_t<T>;
525  auto SPIRVSuccess = getMemorySemanticsMask(Success);
526  auto SPIRVFailure = getMemorySemanticsMask(Failure);
527  auto SPIRVScope = getScope(Scope);
528  auto *PtrInt = GetMultiPtrDecoratedAs<I>(MPtr);
529  I DesiredInt = sycl::bit_cast<I>(Desired);
530  I ExpectedInt = sycl::bit_cast<I>(Expected);
531  I ResultInt = __spirv_AtomicCompareExchange(
532  PtrInt, SPIRVScope, SPIRVSuccess, SPIRVFailure, DesiredInt, ExpectedInt);
533  return sycl::bit_cast<T>(ResultInt);
534 }
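// Usage sketch: from device code, a compare-exchange on a global float could
// be issued through the wrappers above roughly as
//
//   multi_ptr<float, access::address_space::global_space,
//             access::decorated::yes> P = /* ... */;
//   float Expected = 0.0f, Desired = 1.0f;
//   float Old = AtomicCompareExchange(P, memory_scope::device,
//                                     memory_order::acq_rel,
//                                     memory_order::acquire, Desired,
//                                     Expected);
//
// The floating-point overload bit-casts to a same-width unsigned integer so
// the integer __spirv_AtomicCompareExchange builtin can be reused.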
535 
536 template <typename T, access::address_space AddressSpace,
537  access::decorated IsDecorated>
538 inline typename std::enable_if_t<std::is_integral<T>::value, T>
539 AtomicLoad(multi_ptr<T, AddressSpace, IsDecorated> MPtr, memory_scope Scope,
540  memory_order Order) {
541  auto *Ptr = GetMultiPtrDecoratedAs<T>(MPtr);
542  auto SPIRVOrder = getMemorySemanticsMask(Order);
543  auto SPIRVScope = getScope(Scope);
544  return __spirv_AtomicLoad(Ptr, SPIRVScope, SPIRVOrder);
545 }
546 
547 template <typename T, access::address_space AddressSpace,
548  access::decorated IsDecorated>
549 inline typename std::enable_if_t<std::is_floating_point<T>::value, T>
550 AtomicLoad(multi_ptr<T, AddressSpace, IsDecorated> MPtr, memory_scope Scope,
551  memory_order Order) {
552  using I = detail::make_unsinged_integer_t<T>;
553  auto *PtrInt = GetMultiPtrDecoratedAs<I>(MPtr);
554  auto SPIRVOrder = getMemorySemanticsMask(Order);
555  auto SPIRVScope = getScope(Scope);
556  I ResultInt = __spirv_AtomicLoad(PtrInt, SPIRVScope, SPIRVOrder);
557  return sycl::bit_cast<T>(ResultInt);
558 }
559 
560 template <typename T, access::address_space AddressSpace,
561  access::decorated IsDecorated>
562 inline typename std::enable_if_t<std::is_integral<T>::value>
563 AtomicStore(multi_ptr<T, AddressSpace, IsDecorated> MPtr, memory_scope Scope,
564  memory_order Order, T Value) {
565  auto *Ptr = GetMultiPtrDecoratedAs<T>(MPtr);
566  auto SPIRVOrder = getMemorySemanticsMask(Order);
567  auto SPIRVScope = getScope(Scope);
568  __spirv_AtomicStore(Ptr, SPIRVScope, SPIRVOrder, Value);
569 }
570 
571 template <typename T, access::address_space AddressSpace,
572  access::decorated IsDecorated>
573 inline typename std::enable_if_t<std::is_floating_point<T>::value>
574 AtomicStore(multi_ptr<T, AddressSpace, IsDecorated> MPtr, memory_scope Scope,
575  memory_order Order, T Value) {
576  using I = detail::make_unsinged_integer_t<T>;
577  auto *PtrInt = GetMultiPtrDecoratedAs<I>(MPtr);
578  auto SPIRVOrder = getMemorySemanticsMask(Order);
579  auto SPIRVScope = getScope(Scope);
580  I ValueInt = sycl::bit_cast<I>(Value);
581  __spirv_AtomicStore(PtrInt, SPIRVScope, SPIRVOrder, ValueInt);
582 }
583 
584 template <typename T, access::address_space AddressSpace,
585  access::decorated IsDecorated>
586 inline typename std::enable_if_t<std::is_integral<T>::value, T>
587 AtomicExchange(multi_ptr<T, AddressSpace, IsDecorated> MPtr, memory_scope Scope,
588  memory_order Order, T Value) {
589  auto *Ptr = GetMultiPtrDecoratedAs<T>(MPtr);
590  auto SPIRVOrder = getMemorySemanticsMask(Order);
591  auto SPIRVScope = getScope(Scope);
592  return __spirv_AtomicExchange(Ptr, SPIRVScope, SPIRVOrder, Value);
593 }
594 
595 template <typename T, access::address_space AddressSpace,
596  access::decorated IsDecorated>
597 inline typename std::enable_if_t<std::is_floating_point<T>::value, T>
598 AtomicExchange(multi_ptr<T, AddressSpace, IsDecorated> MPtr, memory_scope Scope,
599  memory_order Order, T Value) {
600  using I = detail::make_unsinged_integer_t<T>;
601  auto *PtrInt = GetMultiPtrDecoratedAs<I>(MPtr);
602  auto SPIRVOrder = getMemorySemanticsMask(Order);
603  auto SPIRVScope = getScope(Scope);
604  I ValueInt = sycl::bit_cast<I>(Value);
605  I ResultInt =
606  __spirv_AtomicExchange(PtrInt, SPIRVScope, SPIRVOrder, ValueInt);
607  return sycl::bit_cast<T>(ResultInt);
608 }
609 
610 template <typename T, access::address_space AddressSpace,
611  access::decorated IsDecorated>
612 inline typename std::enable_if_t<std::is_integral<T>::value, T>
613 AtomicIAdd(multi_ptr<T, AddressSpace, IsDecorated> MPtr, memory_scope Scope,
614  memory_order Order, T Value) {
615  auto *Ptr = GetMultiPtrDecoratedAs<T>(MPtr);
616  auto SPIRVOrder = getMemorySemanticsMask(Order);
617  auto SPIRVScope = getScope(Scope);
618  return __spirv_AtomicIAdd(Ptr, SPIRVScope, SPIRVOrder, Value);
619 }
620 
621 template <typename T, access::address_space AddressSpace,
622  access::decorated IsDecorated>
623 inline typename std::enable_if_t<std::is_integral<T>::value, T>
624 AtomicISub(multi_ptr<T, AddressSpace, IsDecorated> MPtr, memory_scope Scope,
625  memory_order Order, T Value) {
626  auto *Ptr = GetMultiPtrDecoratedAs<T>(MPtr);
627  auto SPIRVOrder = getMemorySemanticsMask(Order);
628  auto SPIRVScope = getScope(Scope);
629  return __spirv_AtomicISub(Ptr, SPIRVScope, SPIRVOrder, Value);
630 }
631 
632 template <typename T, access::address_space AddressSpace,
633  access::decorated IsDecorated>
634 inline typename std::enable_if_t<std::is_floating_point<T>::value, T>
635 AtomicFAdd(multi_ptr<T, AddressSpace, IsDecorated> MPtr, memory_scope Scope,
636  memory_order Order, T Value) {
637  auto *Ptr = GetMultiPtrDecoratedAs<T>(MPtr);
638  auto SPIRVOrder = getMemorySemanticsMask(Order);
639  auto SPIRVScope = getScope(Scope);
640  return __spirv_AtomicFAddEXT(Ptr, SPIRVScope, SPIRVOrder, Value);
641 }
642 
643 template <typename T, access::address_space AddressSpace,
644  access::decorated IsDecorated>
645 inline typename std::enable_if_t<std::is_integral<T>::value, T>
646 AtomicAnd(multi_ptr<T, AddressSpace, IsDecorated> MPtr, memory_scope Scope,
647  memory_order Order, T Value) {
648  auto *Ptr = GetMultiPtrDecoratedAs<T>(MPtr);
649  auto SPIRVOrder = getMemorySemanticsMask(Order);
650  auto SPIRVScope = getScope(Scope);
651  return __spirv_AtomicAnd(Ptr, SPIRVScope, SPIRVOrder, Value);
652 }
653 
654 template <typename T, access::address_space AddressSpace,
655  access::decorated IsDecorated>
656 inline typename std::enable_if_t<std::is_integral<T>::value, T>
657 AtomicOr(multi_ptr<T, AddressSpace, IsDecorated> MPtr, memory_scope Scope,
658  memory_order Order, T Value) {
659  auto *Ptr = GetMultiPtrDecoratedAs<T>(MPtr);
660  auto SPIRVOrder = getMemorySemanticsMask(Order);
661  auto SPIRVScope = getScope(Scope);
662  return __spirv_AtomicOr(Ptr, SPIRVScope, SPIRVOrder, Value);
663 }
664 
665 template <typename T, access::address_space AddressSpace,
666  access::decorated IsDecorated>
667 inline typename std::enable_if_t<std::is_integral<T>::value, T>
668 AtomicXor(multi_ptr<T, AddressSpace, IsDecorated> MPtr, memory_scope Scope,
669  memory_order Order, T Value) {
670  auto *Ptr = GetMultiPtrDecoratedAs<T>(MPtr);
671  auto SPIRVOrder = getMemorySemanticsMask(Order);
672  auto SPIRVScope = getScope(Scope);
673  return __spirv_AtomicXor(Ptr, SPIRVScope, SPIRVOrder, Value);
674 }
675 
676 template <typename T, access::address_space AddressSpace,
677  access::decorated IsDecorated>
678 inline typename std::enable_if_t<std::is_integral<T>::value, T>
679 AtomicMin(multi_ptr<T, AddressSpace, IsDecorated> MPtr, memory_scope Scope,
680  memory_order Order, T Value) {
681  auto *Ptr = GetMultiPtrDecoratedAs<T>(MPtr);
682  auto SPIRVOrder = getMemorySemanticsMask(Order);
683  auto SPIRVScope = getScope(Scope);
684  return __spirv_AtomicMin(Ptr, SPIRVScope, SPIRVOrder, Value);
685 }
686 
687 template <typename T, access::address_space AddressSpace,
688  access::decorated IsDecorated>
689 inline typename std::enable_if_t<std::is_floating_point<T>::value, T>
690 AtomicMin(multi_ptr<T, AddressSpace, IsDecorated> MPtr, memory_scope Scope,
691  memory_order Order, T Value) {
692  auto *Ptr = GetMultiPtrDecoratedAs<T>(MPtr);
693  auto SPIRVOrder = getMemorySemanticsMask(Order);
694  auto SPIRVScope = getScope(Scope);
695  return __spirv_AtomicMin(Ptr, SPIRVScope, SPIRVOrder, Value);
696 }
697 
698 template <typename T, access::address_space AddressSpace,
699  access::decorated IsDecorated>
700 inline typename std::enable_if_t<std::is_integral<T>::value, T>
701 AtomicMax(multi_ptr<T, AddressSpace, IsDecorated> MPtr, memory_scope Scope,
702  memory_order Order, T Value) {
703  auto *Ptr = GetMultiPtrDecoratedAs<T>(MPtr);
704  auto SPIRVOrder = getMemorySemanticsMask(Order);
705  auto SPIRVScope = getScope(Scope);
706  return __spirv_AtomicMax(Ptr, SPIRVScope, SPIRVOrder, Value);
707 }
708 
709 template <typename T, access::address_space AddressSpace,
710  access::decorated IsDecorated>
711 inline typename std::enable_if_t<std::is_floating_point<T>::value, T>
712 AtomicMax(multi_ptr<T, AddressSpace, IsDecorated> MPtr, memory_scope Scope,
713  memory_order Order, T Value) {
714  auto *Ptr = GetMultiPtrDecoratedAs<T>(MPtr);
715  auto SPIRVOrder = getMemorySemanticsMask(Order);
716  auto SPIRVScope = getScope(Scope);
717  return __spirv_AtomicMax(Ptr, SPIRVScope, SPIRVOrder, Value);
718 }
719 
720 // Native shuffles map directly to a shuffle intrinsic:
721 // - The Intel SPIR-V extension natively supports all arithmetic types.
722 // However, the OpenCL extension natively supports only float vectors,
723 // integer vectors, half scalars and double scalars.
724 // For double, long, long long, unsigned long, unsigned long long
725 // and half vectors we emulate the shuffle with the scalar version.
726 // - The CUDA shfl intrinsics do not support vectors, and we use the _i32
727 // variants for all scalar types
728 #ifndef __NVPTX__
729 
730 using ProhibitedTypesForShuffleEmulation =
731  type_list<double, long, long long, unsigned long, unsigned long long, half>;
732 
733 template <typename T>
734 struct TypeIsProhibitedForShuffleEmulation
735  : std::bool_constant<is_contained<
736  vector_element_t<T>, ProhibitedTypesForShuffleEmulation>::value> {};
737 
738 template <typename T>
739 struct VecTypeIsProhibitedForShuffleEmulation
740  : std::bool_constant<
741  (detail::get_vec_size<T>::size > 1) &&
742  TypeIsProhibitedForShuffleEmulation<vector_element_t<T>>::value> {};
743 
744 template <typename T>
745 using EnableIfNativeShuffle =
746  std::enable_if_t<detail::is_arithmetic<T>::value &&
747  !VecTypeIsProhibitedForShuffleEmulation<T>::value,
748  T>;
749 
750 template <typename T>
751 using EnableIfVectorShuffle =
752  std::enable_if_t<VecTypeIsProhibitedForShuffleEmulation<T>::value, T>;
753 
754 #else // ifndef __NVPTX__
755 
756 template <typename T>
757 using EnableIfNativeShuffle = std::enable_if_t<
758  std::is_integral<T>::value && (sizeof(T) <= sizeof(int32_t)), T>;
759 
760 template <typename T>
761 using EnableIfVectorShuffle =
762  std::enable_if_t<detail::is_vector_arithmetic<T>::value, T>;
763 #endif // ifndef __NVPTX__
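// Illustrative checks of the dispatch above (sketch): double vectors are
// routed to element-wise emulation, while float vectors can use the native
// vector shuffle.
#ifndef __NVPTX__
static_assert(VecTypeIsProhibitedForShuffleEmulation<vec<double, 2>>::value,
              "vec<double, 2> is shuffled element by element");
static_assert(!VecTypeIsProhibitedForShuffleEmulation<vec<float, 4>>::value,
              "vec<float, 4> can use the native vector shuffle");
#endif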
764 
765 // Bitcast shuffles can be implemented using a single SubgroupShuffle
766 // intrinsic, but require type-punning via an appropriate integer type
767 #ifndef __NVPTX__
768 template <typename T>
769 using EnableIfBitcastShuffle =
770  std::enable_if_t<!detail::is_arithmetic<T>::value &&
771  (std::is_trivially_copyable_v<T> &&
772  (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 ||
773  sizeof(T) == 8)),
774  T>;
775 #else
776 template <typename T>
777 using EnableIfBitcastShuffle =
778  std::enable_if_t<!(std::is_integral_v<T> &&
779  (sizeof(T) <= sizeof(int32_t))) &&
780  !detail::is_vector_arithmetic<T>::value &&
781  (std::is_trivially_copyable_v<T> &&
782  (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4)),
783  T>;
784 #endif // ifndef __NVPTX__
785 
786 // Generic shuffles may require multiple calls to SubgroupShuffle
787 // intrinsics, and should use the fewest shuffles possible:
788 // - Loop over 64-bit chunks until remaining bytes < 64-bit
789 // - At most one 32-bit, 16-bit and 8-bit chunk left over
790 #ifndef __NVPTX__
791 template <typename T>
792 using EnableIfGenericShuffle =
793  std::enable_if_t<!detail::is_arithmetic<T>::value &&
794  !(std::is_trivially_copyable_v<T> &&
795  (sizeof(T) == 1 || sizeof(T) == 2 ||
796  sizeof(T) == 4 || sizeof(T) == 8)),
797  T>;
798 #else
799 template <typename T>
800 using EnableIfGenericShuffle = std::enable_if_t<
801  !(std::is_integral<T>::value && (sizeof(T) <= sizeof(int32_t))) &&
802  !detail::is_vector_arithmetic<T>::value &&
803  !(std::is_trivially_copyable_v<T> &&
804  (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4)),
805  T>;
806 #endif
807 
808 #ifdef __NVPTX__
809 inline uint32_t membermask() {
810  // use a full mask as sync operations are required to be convergent and exited
811  // threads can safely be in the mask
812  return 0xFFFFFFFF;
813 }
814 #endif
815 
816 template <typename GroupT>
817 inline uint32_t MapShuffleID(GroupT g, id<1> local_id) {
818  if constexpr (is_tangle_or_opportunistic_group<GroupT>::value ||
819  is_ballot_group<GroupT>::value)
820  return detail::IdToMaskPosition(g, local_id);
821  else if constexpr (is_fixed_size_group<GroupT>::value)
822  return g.get_group_linear_id() * g.get_local_range().size() + local_id;
823  else
824  return local_id.get(0);
825 }
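// Example (sketch): for a fixed_size_group with PartitionSize == 8, the
// work-item with local_id 3 in partition 2 maps to lane 2 * 8 + 3 == 19 of
// the parent sub-group, which is the index the shuffle builtins expect.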
826 
827 // Forward declarations for template overloads
828 template <typename GroupT, typename T>
829 EnableIfBitcastShuffle<T> Shuffle(GroupT g, T x, id<1> local_id);
830 
831 template <typename GroupT, typename T>
832 EnableIfBitcastShuffle<T> ShuffleXor(GroupT g, T x, id<1> local_id);
833 
834 template <typename GroupT, typename T>
835 EnableIfBitcastShuffle<T> ShuffleDown(GroupT g, T x, uint32_t delta);
836 
837 template <typename GroupT, typename T>
838 EnableIfBitcastShuffle<T> ShuffleUp(GroupT g, T x, uint32_t delta);
839 
840 template <typename GroupT, typename T>
841 EnableIfGenericShuffle<T> Shuffle(GroupT g, T x, id<1> local_id);
842 
843 template <typename GroupT, typename T>
844 EnableIfGenericShuffle<T> ShuffleXor(GroupT g, T x, id<1> local_id);
845 
846 template <typename GroupT, typename T>
847 EnableIfGenericShuffle<T> ShuffleDown(GroupT g, T x, uint32_t delta);
848 
849 template <typename GroupT, typename T>
850 EnableIfGenericShuffle<T> ShuffleUp(GroupT g, T x, uint32_t delta);
851 
852 template <typename GroupT, typename T>
853 EnableIfNativeShuffle<T> Shuffle(GroupT g, T x, id<1> local_id) {
854  uint32_t LocalId = MapShuffleID(g, local_id);
855 #ifndef __NVPTX__
856   std::ignore = g;
857   if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<
858                     GroupT> &&
859  detail::is_vec<T>::value) {
860  // Temporary work-around due to a bug in IGC.
861  // TODO: Remove when IGC bug is fixed.
862  T result;
863  for (int s = 0; s < x.size(); ++s)
864  result[s] = Shuffle(g, x[s], local_id);
865     return result;
866   } else if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<
867                            GroupT>) {
868  return __spirv_GroupNonUniformShuffle(group_scope<GroupT>::value,
869  convertToOpenCLType(x), LocalId);
870  } else {
871  // Subgroup.
872  return __spirv_SubgroupShuffleINTEL(convertToOpenCLType(x), LocalId);
873  }
874 #else
875   if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<
876                     GroupT>) {
877  return cuda_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0], x,
878  LocalId, 31);
879  } else {
880  return cuda_shfl_sync_idx_i32(membermask(), x, LocalId, 31);
881  }
882 #endif
883 }
884 
885 template <typename GroupT, typename T>
886 EnableIfNativeShuffle<T> ShuffleXor(GroupT g, T x, id<1> mask) {
887 #ifndef __NVPTX__
888   std::ignore = g;
889   if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<
890                     GroupT> &&
891  detail::is_vec<T>::value) {
892  // Temporary work-around due to a bug in IGC.
893  // TODO: Remove when IGC bug is fixed.
894  T result;
895  for (int s = 0; s < x.size(); ++s)
896  result[s] = ShuffleXor(g, x[s], mask);
897     return result;
898   } else if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<
899                            GroupT>) {
900  // Since the masks are relative to the groups, we could either try to adjust
901   // the mask or simply do the xor ourselves. The latter option is efficient,
902   // general, and simple, so we go with that.
903  id<1> TargetLocalId = g.get_local_id() ^ mask;
904  uint32_t TargetId = MapShuffleID(g, TargetLocalId);
905  return __spirv_GroupNonUniformShuffle(group_scope<GroupT>::value,
906  convertToOpenCLType(x), TargetId);
907  } else {
908  // Subgroup.
909  return __spirv_SubgroupShuffleXorINTEL(convertToOpenCLType(x),
910  static_cast<uint32_t>(mask.get(0)));
911  }
912 #else
913   if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<
914                     GroupT>) {
915  auto MemberMask = detail::ExtractMask(detail::GetMask(g))[0];
916  if constexpr (is_fixed_size_group_v<GroupT>) {
917  return cuda_shfl_sync_bfly_i32(MemberMask, x,
918  static_cast<uint32_t>(mask.get(0)), 0x1f);
919 
920  } else {
921  int unfoldedSrcSetBit =
922  (g.get_local_id()[0] ^ static_cast<uint32_t>(mask.get(0))) + 1;
923  return cuda_shfl_sync_idx_i32(
924  MemberMask, x, __nvvm_fns(MemberMask, 0, unfoldedSrcSetBit), 31);
925  }
926  } else {
927  return cuda_shfl_sync_bfly_i32(membermask(), x,
928  static_cast<uint32_t>(mask.get(0)), 0x1f);
929  }
930 #endif
931 }
932 
933 template <typename GroupT, typename T>
934 EnableIfNativeShuffle<T> ShuffleDown(GroupT g, T x, uint32_t delta) {
935 #ifndef __NVPTX__
936   if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<
937                     GroupT> &&
938  detail::is_vec<T>::value) {
939  // Temporary work-around due to a bug in IGC.
940  // TODO: Remove when IGC bug is fixed.
941  T result;
942  for (int s = 0; s < x.size(); ++s)
943  result[s] = ShuffleDown(g, x[s], delta);
944     return result;
945   } else if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<
946                            GroupT>) {
947  id<1> TargetLocalId = g.get_local_id();
948  // ID outside the group range is UB, so we just keep the current item ID
949  // unchanged.
950  if (TargetLocalId[0] + delta < g.get_local_linear_range())
951  TargetLocalId[0] += delta;
952  uint32_t TargetId = MapShuffleID(g, TargetLocalId);
953  return __spirv_GroupNonUniformShuffle(group_scope<GroupT>::value,
954  convertToOpenCLType(x), TargetId);
955  } else {
956  // Subgroup.
957  return __spirv_SubgroupShuffleDownINTEL(convertToOpenCLType(x),
958  convertToOpenCLType(x), delta);
959  }
960 #else
961   if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<
962                     GroupT>) {
963  auto MemberMask = detail::ExtractMask(detail::GetMask(g))[0];
964  if constexpr (is_fixed_size_group_v<GroupT>) {
965  return cuda_shfl_sync_down_i32(MemberMask, x, delta, 31);
966  } else {
967  unsigned localSetBit = g.get_local_id()[0] + 1;
968  int unfoldedSrcSetBit = localSetBit + delta;
969  return cuda_shfl_sync_idx_i32(
970  MemberMask, x, __nvvm_fns(MemberMask, 0, unfoldedSrcSetBit), 31);
971  }
972  } else {
973  return cuda_shfl_sync_down_i32(membermask(), x, delta, 31);
974  }
975 #endif
976 }
977 
978 template <typename GroupT, typename T>
979 EnableIfNativeShuffle<T> ShuffleUp(GroupT g, T x, uint32_t delta) {
980 #ifndef __NVPTX__
981   if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<
982                     GroupT> &&
983  detail::is_vec<T>::value) {
984  // Temporary work-around due to a bug in IGC.
985  // TODO: Remove when IGC bug is fixed.
986  T result;
987  for (int s = 0; s < x.size(); ++s)
988  result[s] = ShuffleUp(g, x[s], delta);
989     return result;
990   } else if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<
991                            GroupT>) {
992  id<1> TargetLocalId = g.get_local_id();
993  // Underflow is UB, so we just keep the current item ID unchanged.
994  if (TargetLocalId[0] >= delta)
995  TargetLocalId[0] -= delta;
996  uint32_t TargetId = MapShuffleID(g, TargetLocalId);
997  return __spirv_GroupNonUniformShuffle(group_scope<GroupT>::value,
998  convertToOpenCLType(x), TargetId);
999  } else {
1000  // Subgroup.
1001  return __spirv_SubgroupShuffleUpINTEL(convertToOpenCLType(x),
1002  convertToOpenCLType(x), delta);
1003  }
1004 #else
1005   if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<
1006                     GroupT>) {
1007  auto MemberMask = detail::ExtractMask(detail::GetMask(g))[0];
1008  if constexpr (is_fixed_size_group_v<GroupT>) {
1009  return cuda_shfl_sync_up_i32(MemberMask, x, delta, 0);
1010  } else {
1011  unsigned localSetBit = g.get_local_id()[0] + 1;
1012  int unfoldedSrcSetBit = localSetBit - delta;
1013 
1014  return cuda_shfl_sync_idx_i32(
1015  MemberMask, x, __nvvm_fns(MemberMask, 0, unfoldedSrcSetBit), 31);
1016  }
1017  } else {
1018  return cuda_shfl_sync_up_i32(membermask(), x, delta, 0);
1019  }
1020 #endif
1021 }
1022 
1023 template <typename GroupT, typename T>
1024 EnableIfVectorShuffle<T> Shuffle(GroupT g, T x, id<1> local_id) {
1025  T result;
1026  for (int s = 0; s < x.size(); ++s) {
1027  result[s] = Shuffle(g, x[s], local_id);
1028  }
1029  return result;
1030 }
1031 
1032 template <typename GroupT, typename T>
1033 EnableIfVectorShuffle<T> ShuffleXor(GroupT g, T x, id<1> local_id) {
1034  T result;
1035  for (int s = 0; s < x.size(); ++s) {
1036  result[s] = ShuffleXor(g, x[s], local_id);
1037  }
1038  return result;
1039 }
1040 
1041 template <typename GroupT, typename T>
1042 EnableIfVectorShuffle<T> ShuffleDown(GroupT g, T x, uint32_t delta) {
1043  T result;
1044  for (int s = 0; s < x.size(); ++s) {
1045  result[s] = ShuffleDown(g, x[s], delta);
1046  }
1047  return result;
1048 }
1049 
1050 template <typename GroupT, typename T>
1051 EnableIfVectorShuffle<T> ShuffleUp(GroupT g, T x, uint32_t delta) {
1052  T result;
1053  for (int s = 0; s < x.size(); ++s) {
1054  result[s] = ShuffleUp(g, x[s], delta);
1055  }
1056  return result;
1057 }
1058 
1059 template <typename T>
1060 using ConvertToNativeShuffleType_t = select_cl_scalar_integral_unsigned_t<T>;
1061 
1062 template <typename GroupT, typename T>
1063 EnableIfBitcastShuffle<T> Shuffle(GroupT g, T x, id<1> local_id) {
1064  using ShuffleT = ConvertToNativeShuffleType_t<T>;
1065  auto ShuffleX = sycl::bit_cast<ShuffleT>(x);
1066  ShuffleT Result = Shuffle(g, ShuffleX, local_id);
1067  return sycl::bit_cast<T>(Result);
1068 }
1069 
1070 template <typename GroupT, typename T>
1071 EnableIfBitcastShuffle<T> ShuffleXor(GroupT g, T x, id<1> local_id) {
1072  using ShuffleT = ConvertToNativeShuffleType_t<T>;
1073  auto ShuffleX = sycl::bit_cast<ShuffleT>(x);
1074  ShuffleT Result = ShuffleXor(g, ShuffleX, local_id);
1075  return sycl::bit_cast<T>(Result);
1076 }
1077 
1078 template <typename GroupT, typename T>
1079 EnableIfBitcastShuffle<T> ShuffleDown(GroupT g, T x, uint32_t delta) {
1080  using ShuffleT = ConvertToNativeShuffleType_t<T>;
1081  auto ShuffleX = sycl::bit_cast<ShuffleT>(x);
1082  ShuffleT Result = ShuffleDown(g, ShuffleX, delta);
1083  return sycl::bit_cast<T>(Result);
1084 }
1085 
1086 template <typename GroupT, typename T>
1087 EnableIfBitcastShuffle<T> ShuffleUp(GroupT g, T x, uint32_t delta) {
1088  using ShuffleT = ConvertToNativeShuffleType_t<T>;
1089  auto ShuffleX = sycl::bit_cast<ShuffleT>(x);
1090  ShuffleT Result = ShuffleUp(g, ShuffleX, delta);
1091  return sycl::bit_cast<T>(Result);
1092 }
1093 
1094 template <typename GroupT, typename T>
1095 EnableIfGenericShuffle<T> Shuffle(GroupT g, T x, id<1> local_id) {
1096  T Result;
1097  char *XBytes = reinterpret_cast<char *>(&x);
1098  char *ResultBytes = reinterpret_cast<char *>(&Result);
1099  auto ShuffleBytes = [=](size_t Offset, size_t Size) {
1100  ShuffleChunkT ShuffleX, ShuffleResult;
1101  detail::memcpy(&ShuffleX, XBytes + Offset, Size);
1102  ShuffleResult = Shuffle(g, ShuffleX, local_id);
1103  detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
1104  };
1105  GenericCall<T>(ShuffleBytes);
1106  return Result;
1107 }
1108 
1109 template <typename GroupT, typename T>
1110 EnableIfGenericShuffle<T> ShuffleXor(GroupT g, T x, id<1> local_id) {
1111  T Result;
1112  char *XBytes = reinterpret_cast<char *>(&x);
1113  char *ResultBytes = reinterpret_cast<char *>(&Result);
1114  auto ShuffleBytes = [=](size_t Offset, size_t Size) {
1115  ShuffleChunkT ShuffleX, ShuffleResult;
1116  detail::memcpy(&ShuffleX, XBytes + Offset, Size);
1117  ShuffleResult = ShuffleXor(g, ShuffleX, local_id);
1118  detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
1119  };
1120  GenericCall<T>(ShuffleBytes);
1121  return Result;
1122 }
1123 
1124 template <typename GroupT, typename T>
1125 EnableIfGenericShuffle<T> ShuffleDown(GroupT g, T x, uint32_t delta) {
1126  T Result;
1127  char *XBytes = reinterpret_cast<char *>(&x);
1128  char *ResultBytes = reinterpret_cast<char *>(&Result);
1129  auto ShuffleBytes = [=](size_t Offset, size_t Size) {
1130  ShuffleChunkT ShuffleX, ShuffleResult;
1131  detail::memcpy(&ShuffleX, XBytes + Offset, Size);
1132  ShuffleResult = ShuffleDown(g, ShuffleX, delta);
1133  detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
1134  };
1135  GenericCall<T>(ShuffleBytes);
1136  return Result;
1137 }
1138 
1139 template <typename GroupT, typename T>
1140 EnableIfGenericShuffle<T> ShuffleUp(GroupT g, T x, uint32_t delta) {
1141  T Result;
1142  char *XBytes = reinterpret_cast<char *>(&x);
1143  char *ResultBytes = reinterpret_cast<char *>(&Result);
1144  auto ShuffleBytes = [=](size_t Offset, size_t Size) {
1145  ShuffleChunkT ShuffleX, ShuffleResult;
1146  detail::memcpy(&ShuffleX, XBytes + Offset, Size);
1147  ShuffleResult = ShuffleUp(g, ShuffleX, delta);
1148  detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
1149  };
1150  GenericCall<T>(ShuffleBytes);
1151  return Result;
1152 }
1153 
1154 template <typename Group>
1155 typename std::enable_if_t<
1156  ext::oneapi::experimental::is_fixed_topology_group_v<Group>>
1157 ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
1158   __spirv_ControlBarrier(group_scope<Group>::value, getScope(FenceScope),
1159                          getMemorySemanticsMask(Order) |
1160                              __spv::MemorySemanticsMask::SubgroupMemory |
1161                              __spv::MemorySemanticsMask::WorkgroupMemory |
1162                              __spv::MemorySemanticsMask::CrossWorkgroupMemory);
1163 }
1164 
1165 template <typename Group>
1166 typename std::enable_if_t<
1167  ext::oneapi::experimental::is_user_constructed_group_v<Group>>
1168 ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) {
1169 #if defined(__NVPTX__)
1170  __nvvm_bar_warp_sync(detail::ExtractMask(detail::GetMask(g))[0]);
1171 #else
1172  (void)g;
1173  // SPIR-V does not define an instruction to synchronize partial groups.
1174  // However, most (possibly all?) of the current SPIR-V targets execute
1175  // work-items in lockstep, so we can probably get away with a MemoryBarrier.
1176  // TODO: Replace this if SPIR-V defines a NonUniformControlBarrier
1177   __spirv_MemoryBarrier(getScope(FenceScope),
1178                         getMemorySemanticsMask(Order) |
1179                             __spv::MemorySemanticsMask::SubgroupMemory |
1180                             __spv::MemorySemanticsMask::WorkgroupMemory |
1181                             __spv::MemorySemanticsMask::CrossWorkgroupMemory);
1182 #endif
1183 }
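// Usage sketch (names from the non-uniform groups extension): synchronizing a
// ballot_group could look like
//
//   auto Ballot = ext::oneapi::experimental::get_ballot_group(sg, Pred);
//   ControlBarrier(Ballot, memory_scope::sub_group, memory_order::acq_rel);
//
// On SPIR-V targets this takes the MemoryBarrier path above; on CUDA it issues
// a masked warp synchronization.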
1184 
1185 // TODO: Refactor to avoid duplication after design settles
1186 #define __SYCL_GROUP_COLLECTIVE_OVERLOAD(Instruction) \
1187  template <__spv::GroupOperation Op, typename Group, typename T> \
1188  inline typename std::enable_if_t< \
1189  ext::oneapi::experimental::is_fixed_topology_group_v<Group>, T> \
1190  Group##Instruction(Group, T x) { \
1191  using ConvertedT = detail::ConvertToOpenCLType_t<T>; \
1192  \
1193  using OCLT = std::conditional_t< \
1194  std::is_same<ConvertedT, opencl::cl_char>() || \
1195  std::is_same<ConvertedT, opencl::cl_short>(), \
1196  opencl::cl_int, \
1197  std::conditional_t<std::is_same<ConvertedT, opencl::cl_uchar>() || \
1198  std::is_same<ConvertedT, opencl::cl_ushort>(), \
1199  opencl::cl_uint, ConvertedT>>; \
1200  OCLT Arg = x; \
1201  OCLT Ret = __spirv_Group##Instruction(group_scope<Group>::value, \
1202  static_cast<unsigned int>(Op), Arg); \
1203  return Ret; \
1204  } \
1205  \
1206  template <__spv::GroupOperation Op, typename ParentGroup, typename T> \
1207  inline T Group##Instruction( \
1208  ext::oneapi::experimental::ballot_group<ParentGroup> g, T x) { \
1209  using ConvertedT = detail::ConvertToOpenCLType_t<T>; \
1210  \
1211  using OCLT = std::conditional_t< \
1212  std::is_same<ConvertedT, opencl::cl_char>() || \
1213  std::is_same<ConvertedT, opencl::cl_short>(), \
1214  opencl::cl_int, \
1215  std::conditional_t<std::is_same<ConvertedT, opencl::cl_uchar>() || \
1216  std::is_same<ConvertedT, opencl::cl_ushort>(), \
1217  opencl::cl_uint, ConvertedT>>; \
1218  OCLT Arg = x; \
1219  /* ballot_group partitions its parent into two groups (0 and 1) */ \
1220  /* We have to force each group down different control flow */ \
1221  /* Work-items in the "false" group (0) may still be active */ \
1222  constexpr auto Scope = group_scope<ParentGroup>::value; \
1223  constexpr auto OpInt = static_cast<unsigned int>(Op); \
1224  if (g.get_group_id() == 1) { \
1225  return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg); \
1226  } else { \
1227  return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg); \
1228  } \
1229  } \
1230  \
1231  template <__spv::GroupOperation Op, size_t PartitionSize, \
1232  typename ParentGroup, typename T> \
1233  inline T Group##Instruction( \
1234  ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup> \
1235  g, \
1236  T x) { \
1237  using ConvertedT = detail::ConvertToOpenCLType_t<T>; \
1238  \
1239  using OCLT = std::conditional_t< \
1240  std::is_same<ConvertedT, opencl::cl_char>() || \
1241  std::is_same<ConvertedT, opencl::cl_short>(), \
1242  opencl::cl_int, \
1243  std::conditional_t<std::is_same<ConvertedT, opencl::cl_uchar>() || \
1244  std::is_same<ConvertedT, opencl::cl_ushort>(), \
1245  opencl::cl_uint, ConvertedT>>; \
1246  OCLT Arg = x; \
1247  constexpr auto Scope = group_scope<ParentGroup>::value; \
1248  /* SPIR-V only defines a ClusteredReduce, with no equivalents for scan. */ \
1249  /* Emulate Clustered*Scan using control flow to separate clusters. */ \
1250  if constexpr (Op == __spv::GroupOperation::Reduce) { \
1251  constexpr auto OpInt = \
1252  static_cast<unsigned int>(__spv::GroupOperation::ClusteredReduce); \
1253  return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg, \
1254  PartitionSize); \
1255  } else { \
1256  T tmp; \
1257  for (size_t Cluster = 0; Cluster < g.get_group_linear_range(); \
1258  ++Cluster) { \
1259  if (Cluster == g.get_group_linear_id()) { \
1260  constexpr auto OpInt = static_cast<unsigned int>(Op); \
1261  tmp = __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg); \
1262  } \
1263  } \
1264  return tmp; \
1265  } \
1266  } \
1267  template <__spv::GroupOperation Op, typename Group, typename T> \
1268  inline typename std::enable_if_t< \
1269  is_tangle_or_opportunistic_group<Group>::value, T> \
1270  Group##Instruction(Group, T x) { \
1271  using ConvertedT = detail::ConvertToOpenCLType_t<T>; \
1272  \
1273  using OCLT = std::conditional_t< \
1274  std::is_same<ConvertedT, opencl::cl_char>() || \
1275  std::is_same<ConvertedT, opencl::cl_short>(), \
1276  opencl::cl_int, \
1277  std::conditional_t<std::is_same<ConvertedT, opencl::cl_uchar>() || \
1278  std::is_same<ConvertedT, opencl::cl_ushort>(), \
1279  opencl::cl_uint, ConvertedT>>; \
1280  OCLT Arg = x; \
1281  OCLT Ret = __spirv_GroupNonUniform##Instruction( \
1282  group_scope<Group>::value, static_cast<unsigned int>(Op), Arg); \
1283  return Ret; \
1284  }
1285 
1286 __SYCL_GROUP_COLLECTIVE_OVERLOAD(SMin)
1287 __SYCL_GROUP_COLLECTIVE_OVERLOAD(UMin)
1288 __SYCL_GROUP_COLLECTIVE_OVERLOAD(FMin)
1289 
1290 __SYCL_GROUP_COLLECTIVE_OVERLOAD(SMax)
1291 __SYCL_GROUP_COLLECTIVE_OVERLOAD(UMax)
1292 __SYCL_GROUP_COLLECTIVE_OVERLOAD(FMax)
1293 
1294 __SYCL_GROUP_COLLECTIVE_OVERLOAD(IAdd)
1295 __SYCL_GROUP_COLLECTIVE_OVERLOAD(FAdd)
1296 
1297 __SYCL_GROUP_COLLECTIVE_OVERLOAD(IMulKHR)
1298 __SYCL_GROUP_COLLECTIVE_OVERLOAD(FMulKHR)
1299 __SYCL_GROUP_COLLECTIVE_OVERLOAD(CMulINTEL)
1300 
1301 __SYCL_GROUP_COLLECTIVE_OVERLOAD(BitwiseOrKHR)
1302 __SYCL_GROUP_COLLECTIVE_OVERLOAD(BitwiseXorKHR)
1303 __SYCL_GROUP_COLLECTIVE_OVERLOAD(BitwiseAndKHR)
1304 
1305 __SYCL_GROUP_COLLECTIVE_OVERLOAD(LogicalAndKHR)
1306 __SYCL_GROUP_COLLECTIVE_OVERLOAD(LogicalOrKHR)
1307 
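// Example (sketch): __SYCL_GROUP_COLLECTIVE_OVERLOAD(SMin) above generates the
// GroupSMin overloads, so a sub-group reduction over signed integers can be
// written as
//
//   int Low = GroupSMin<__spv::GroupOperation::Reduce>(sg, x);
//
// which lowers to __spirv_GroupSMin for fixed-topology groups and to
// __spirv_GroupNonUniformSMin for the user-constructed groups handled above.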
1308 template <access::address_space Space, typename T>
1309 auto GenericCastToPtr(T *Ptr) ->
1310     typename detail::DecoratedType<T, Space>::type * {
1311   if constexpr (Space == access::address_space::global_space) {
1312  return __SYCL_GenericCastToPtr_ToGlobal<T>(Ptr);
1313  } else if constexpr (Space == access::address_space::local_space) {
1314  return __SYCL_GenericCastToPtr_ToLocal<T>(Ptr);
1315  } else if constexpr (Space == access::address_space::private_space) {
1316  return __SYCL_GenericCastToPtr_ToPrivate<T>(Ptr);
1317  }
1318 }
1319 
1320 template <access::address_space Space, typename T>
1321 auto GenericCastToPtrExplicit(T *Ptr) ->
1322     typename detail::DecoratedType<T, Space>::type * {
1323   if constexpr (Space == access::address_space::global_space) {
1324  return __SYCL_GenericCastToPtrExplicit_ToGlobal<T>(Ptr);
1325  } else if constexpr (Space == access::address_space::local_space) {
1326  return __SYCL_GenericCastToPtrExplicit_ToLocal<T>(Ptr);
1327  } else if constexpr (Space == access::address_space::private_space) {
1328  return __SYCL_GenericCastToPtrExplicit_ToPrivate<T>(Ptr);
1329  }
1330 }
1331 
1332 } // namespace spirv
1333 } // namespace detail
1334 } // namespace _V1
1335 } // namespace sycl
1336 #endif // __SYCL_DEVICE_ONLY__