11 #ifdef __SYCL_DEVICE_ONLY__
15 #if defined(__NVPTX__)
22 inline namespace _V1 {
27 namespace experimental {
// Forward declarations of the non-uniform / fixed-topology group types so the
// traits and overloads later in this file can name them without pulling in
// their full definitions.
template <typename ParentGroup> class ballot_group;
template <size_t PartitionSize, typename ParentGroup> class fixed_size_group;
template <int Dimensions> class root_group;
class opportunistic_group;
44 GetMultiPtrDecoratedAs(multi_ptr<FromT, Space, IsDecorated> MPtr) {
45 if constexpr (IsDecorated == access::decorated::legacy)
46 return reinterpret_cast<
50 return reinterpret_cast<
52 MPtr.get_decorated());
// Primary template: an arbitrary group type is neither a tangle_group nor an
// opportunistic_group; matching specializations override this to true.
template <typename Group>
struct is_tangle_or_opportunistic_group : std::false_type {};
60 template <
typename ParentGroup>
61 struct is_tangle_or_opportunistic_group<
62 sycl::ext::oneapi::experimental::tangle_group<ParentGroup>>
66 struct is_tangle_or_opportunistic_group<
67 sycl::ext::oneapi::experimental::opportunistic_group> : std::true_type {};
// Primary template: false for every type; specialized to true for
// ballot_group instantiations.
template <typename Group>
struct is_ballot_group : std::false_type {};
71 template <
typename ParentGroup>
72 struct is_ballot_group<
73 sycl::ext::oneapi::experimental::ballot_group<ParentGroup>>
// Primary template: false for every type; specialized to true for
// fixed_size_group instantiations.
template <typename Group>
struct is_fixed_size_group : std::false_type {};
78 template <
size_t PartitionSize,
typename ParentGroup>
79 struct is_fixed_size_group<
sycl::ext::oneapi::experimental::fixed_size_group<
80 PartitionSize, ParentGroup>> : std::true_type {};
// Maps a group type to its SPIR-V execution scope. The primary template is
// deliberately empty: only the specializations below provide a `value`, so
// using an unsupported group type fails at compile time.
template <typename Group>
struct group_scope {};
84 template <
int Dimensions>
85 struct group_scope<
sycl::ext::oneapi::experimental::root_group<Dimensions>> {
93 template <>
struct group_scope<::sycl::ext::oneapi::sub_group> {
100 template <
typename ParentGroup>
101 struct group_scope<
sycl::ext::oneapi::experimental::ballot_group<ParentGroup>> {
105 template <
size_t PartitionSize,
typename ParentGroup>
106 struct group_scope<
sycl::ext::oneapi::experimental::fixed_size_group<
107 PartitionSize, ParentGroup>> {
111 template <
typename ParentGroup>
112 struct group_scope<
sycl::ext::oneapi::experimental::tangle_group<ParentGroup>> {
126 using ShuffleChunkT = uint64_t;
128 using ShuffleChunkT = uint32_t;
130 template <
typename T,
typename Functor>
131 void GenericCall(
const Functor &ApplyToBytes) {
132 if (
sizeof(T) >=
sizeof(ShuffleChunkT)) {
134 for (
size_t Offset = 0; Offset +
sizeof(ShuffleChunkT) <=
sizeof(T);
135 Offset +=
sizeof(ShuffleChunkT)) {
136 ApplyToBytes(Offset,
sizeof(ShuffleChunkT));
139 if (
sizeof(ShuffleChunkT) >=
sizeof(uint64_t)) {
140 if (
sizeof(T) %
sizeof(uint64_t) >=
sizeof(uint32_t)) {
141 size_t Offset =
sizeof(T) /
sizeof(uint64_t) *
sizeof(uint64_t);
142 ApplyToBytes(Offset,
sizeof(uint32_t));
145 if (
sizeof(ShuffleChunkT) >=
sizeof(uint32_t)) {
146 if (
sizeof(T) %
sizeof(uint32_t) >=
sizeof(uint16_t)) {
147 size_t Offset =
sizeof(T) /
sizeof(uint32_t) *
sizeof(uint32_t);
148 ApplyToBytes(Offset,
sizeof(uint16_t));
151 if (
sizeof(ShuffleChunkT) >=
sizeof(uint16_t)) {
152 if (
sizeof(T) %
sizeof(uint16_t) >=
sizeof(uint8_t)) {
153 size_t Offset =
sizeof(T) /
sizeof(uint16_t) *
sizeof(uint16_t);
154 ApplyToBytes(Offset,
sizeof(uint8_t));
159 template <
typename Group>
bool GroupAll(Group,
bool pred) {
160 return __spirv_GroupAll(group_scope<Group>::value, pred);
162 template <
typename ParentGroup>
163 bool GroupAll(ext::oneapi::experimental::ballot_group<ParentGroup> g,
168 if (g.get_group_id() == 1) {
169 return __spirv_GroupNonUniformAll(group_scope<ParentGroup>::value, pred);
171 return __spirv_GroupNonUniformAll(group_scope<ParentGroup>::value, pred);
174 template <
size_t PartitionSize,
typename ParentGroup>
176 ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup>,
179 return __spirv_GroupNonUniformBitwiseAnd(
180 group_scope<ParentGroup>::value,
182 static_cast<uint32_t
>(pred), PartitionSize);
184 template <
typename ParentGroup>
185 bool GroupAll(ext::oneapi::experimental::tangle_group<ParentGroup>,
bool pred) {
186 return __spirv_GroupNonUniformAll(group_scope<ParentGroup>::value, pred);
189 bool GroupAll(
const ext::oneapi::experimental::opportunistic_group &,
191 return __spirv_GroupNonUniformAll(
192 group_scope<ext::oneapi::experimental::opportunistic_group>::value, pred);
195 template <
typename Group>
bool GroupAny(Group,
bool pred) {
196 return __spirv_GroupAny(group_scope<Group>::value, pred);
198 template <
typename ParentGroup>
199 bool GroupAny(ext::oneapi::experimental::ballot_group<ParentGroup> g,
204 if (g.get_group_id() == 1) {
205 return __spirv_GroupNonUniformAny(group_scope<ParentGroup>::value, pred);
207 return __spirv_GroupNonUniformAny(group_scope<ParentGroup>::value, pred);
210 template <
size_t PartitionSize,
typename ParentGroup>
212 ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup>,
215 return __spirv_GroupNonUniformBitwiseOr(
216 group_scope<ParentGroup>::value,
218 static_cast<uint32_t
>(pred), PartitionSize);
220 template <
typename ParentGroup>
221 bool GroupAny(ext::oneapi::experimental::tangle_group<ParentGroup>,
bool pred) {
222 return __spirv_GroupNonUniformAny(group_scope<ParentGroup>::value, pred);
224 bool GroupAny(
const ext::oneapi::experimental::opportunistic_group &,
226 return __spirv_GroupNonUniformAny(
227 group_scope<ext::oneapi::experimental::opportunistic_group>::value, pred);
233 template <
typename T>
234 using is_native_broadcast =
235 std::bool_constant<detail::is_arithmetic<T>::value &&
236 !std::is_same<T, half>::value &&
237 !detail::is_vec<T>::value>;
239 template <
typename T,
typename IdT =
size_t>
240 using EnableIfNativeBroadcast = std::enable_if_t<
241 is_native_broadcast<T>::value && std::is_integral<IdT>::value, T>;
245 template <
typename T>
246 using is_bitcast_broadcast = std::bool_constant<
247 !is_native_broadcast<T>::value && std::is_trivially_copyable<T>::value &&
248 (
sizeof(T) == 1 ||
sizeof(T) == 2 ||
sizeof(T) == 4 ||
sizeof(T) == 8)>;
250 template <
typename T,
typename IdT =
size_t>
251 using EnableIfBitcastBroadcast = std::enable_if_t<
252 is_bitcast_broadcast<T>::value && std::is_integral<IdT>::value, T>;
254 template <
typename T>
255 using ConvertToNativeBroadcastType_t = select_cl_scalar_integral_unsigned_t<T>;
261 template <
typename T>
262 using is_generic_broadcast =
263 std::bool_constant<!is_native_broadcast<T>::value &&
264 !is_bitcast_broadcast<T>::value &&
265 std::is_trivially_copyable<T>::value>;
267 template <
typename T,
typename IdT =
size_t>
268 using EnableIfGenericBroadcast = std::enable_if_t<
269 is_generic_broadcast<T>::value && std::is_integral<IdT>::value, T>;
272 template <
typename T>
273 using WidenOpenCLTypeTo32_t = std::conditional_t<
274 std::is_same<T, opencl::cl_char>() || std::is_same<T, opencl::cl_short>(),
276 std::conditional_t<std::is_same<T, opencl::cl_uchar>() ||
277 std::is_same<T, opencl::cl_ushort>(),
283 template <
typename Group>
struct GroupId {
286 template <>
struct GroupId<::sycl::ext::oneapi::sub_group> {
287 using type = uint32_t;
290 using type = uint32_t;
294 template <
typename Group,
typename T,
typename IdT>
295 EnableIfNativeBroadcast<T, IdT> GroupBroadcast(Group, T x, IdT local_id) {
296 auto GroupLocalId =
static_cast<typename GroupId<Group>::type
>(local_id);
298 WidenOpenCLTypeTo32_t<decltype(OCLX)> WideOCLX = OCLX;
300 return __spirv_GroupBroadcast(group_scope<Group>::value, WideOCLX, OCLId);
303 template <
typename ParentGroup,
typename T,
typename IdT>
304 EnableIfNativeBroadcast<T, IdT>
311 auto GroupLocalId =
static_cast<typename GroupId<ParentGroup>::type
>(LocalId);
313 WidenOpenCLTypeTo32_t<decltype(OCLX)> WideOCLX = OCLX;
320 return __spirv_GroupNonUniformBroadcast(group_scope<ParentGroup>::value,
323 return __spirv_GroupNonUniformBroadcast(group_scope<ParentGroup>::value,
327 template <
size_t PartitionSize,
typename ParentGroup,
typename T,
typename IdT>
328 EnableIfNativeBroadcast<T, IdT> GroupBroadcast(
329 ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup> g,
332 auto LocalId = g.get_group_linear_id() * PartitionSize + local_id;
335 auto GroupLocalId =
static_cast<typename GroupId<ParentGroup>::type
>(LocalId);
337 WidenOpenCLTypeTo32_t<decltype(OCLX)> WideOCLX = OCLX;
344 return __spirv_GroupNonUniformShuffle(group_scope<ParentGroup>::value,
347 template <
typename ParentGroup,
typename T,
typename IdT>
348 EnableIfNativeBroadcast<T, IdT>
349 GroupBroadcast(ext::oneapi::experimental::tangle_group<ParentGroup> g, T x,
355 auto GroupLocalId =
static_cast<typename GroupId<ParentGroup>::type
>(LocalId);
357 WidenOpenCLTypeTo32_t<decltype(OCLX)> WideOCLX = OCLX;
360 return __spirv_GroupNonUniformBroadcast(group_scope<ParentGroup>::value,
363 template <
typename T,
typename IdT>
364 EnableIfNativeBroadcast<T, IdT>
365 GroupBroadcast(
const ext::oneapi::experimental::opportunistic_group &g, T x,
372 static_cast<typename GroupId<::sycl::sub_group>::type
>(LocalId);
374 WidenOpenCLTypeTo32_t<decltype(OCLX)> WideOCLX = OCLX;
377 return __spirv_GroupNonUniformBroadcast(
378 group_scope<ext::oneapi::experimental::opportunistic_group>::value,
382 template <
typename Group,
typename T,
typename IdT>
383 EnableIfBitcastBroadcast<T, IdT> GroupBroadcast(Group g, T x, IdT local_id) {
384 using BroadcastT = ConvertToNativeBroadcastType_t<T>;
385 auto BroadcastX = sycl::bit_cast<BroadcastT>(x);
386 BroadcastT Result = GroupBroadcast(g, BroadcastX, local_id);
387 return sycl::bit_cast<T>(Result);
389 template <
typename Group,
typename T,
typename IdT>
390 EnableIfGenericBroadcast<T, IdT> GroupBroadcast(Group g, T x, IdT local_id) {
393 char *XBytes =
reinterpret_cast<char *
>(&
x);
394 char *ResultBytes =
reinterpret_cast<char *
>(&Result);
395 auto BroadcastBytes = [=](
size_t Offset,
size_t Size) {
396 uint64_t BroadcastX, BroadcastResult;
398 BroadcastResult = GroupBroadcast(g, BroadcastX, local_id);
401 GenericCall<T>(BroadcastBytes);
406 template <
typename Group,
typename T,
int Dimensions>
407 EnableIfNativeBroadcast<T> GroupBroadcast(Group g, T x,
408 id<Dimensions> local_id) {
410 return GroupBroadcast(g, x, local_id[0]);
412 using IdT = vec<size_t, Dimensions>;
418 WidenOpenCLTypeTo32_t<decltype(OCLX)> WideOCLX = OCLX;
420 return __spirv_GroupBroadcast(group_scope<Group>::value, WideOCLX, OCLId);
422 template <
typename ParentGroup,
typename T>
423 EnableIfNativeBroadcast<T>
425 T x, id<1> local_id) {
427 return GroupBroadcast(g, x, local_id[0]);
429 template <
typename Group,
typename T,
int Dimensions>
430 EnableIfBitcastBroadcast<T> GroupBroadcast(Group g, T x,
431 id<Dimensions> local_id) {
432 using BroadcastT = ConvertToNativeBroadcastType_t<T>;
433 auto BroadcastX = sycl::bit_cast<BroadcastT>(x);
434 BroadcastT Result = GroupBroadcast(g, BroadcastX, local_id);
435 return sycl::bit_cast<T>(Result);
437 template <
typename Group,
typename T,
int Dimensions>
438 EnableIfGenericBroadcast<T> GroupBroadcast(Group g, T x,
439 id<Dimensions> local_id) {
441 return GroupBroadcast(g, x, local_id[0]);
445 char *XBytes =
reinterpret_cast<char *
>(&
x);
446 char *ResultBytes =
reinterpret_cast<char *
>(&Result);
447 auto BroadcastBytes = [=](
size_t Offset,
size_t Size) {
448 uint64_t BroadcastX, BroadcastResult;
450 BroadcastResult = GroupBroadcast(g, BroadcastX, local_id);
453 GenericCall<T>(BroadcastBytes);
459 template <
typename T>
461 typename std::enable_if<std::is_same<T, sycl::memory_order>::value,
463 getMemorySemanticsMask(T Order) {
469 case T::__consume_unsupported:
506 inline typename std::enable_if_t<std::is_integral<T>::value, T>
507 AtomicCompareExchange(multi_ptr<T, AddressSpace, IsDecorated> MPtr,
510 auto SPIRVSuccess = getMemorySemanticsMask(Success);
511 auto SPIRVFailure = getMemorySemanticsMask(Failure);
512 auto SPIRVScope = getScope(Scope);
513 auto *Ptr = GetMultiPtrDecoratedAs<T>(MPtr);
514 return __spirv_AtomicCompareExchange(Ptr, SPIRVScope, SPIRVSuccess,
515 SPIRVFailure, Desired, Expected);
520 inline typename std::enable_if_t<std::is_floating_point<T>::value, T>
521 AtomicCompareExchange(multi_ptr<T, AddressSpace, IsDecorated> MPtr,
524 using I = detail::make_unsinged_integer_t<T>;
525 auto SPIRVSuccess = getMemorySemanticsMask(Success);
526 auto SPIRVFailure = getMemorySemanticsMask(Failure);
527 auto SPIRVScope = getScope(Scope);
528 auto *PtrInt = GetMultiPtrDecoratedAs<I>(MPtr);
529 I DesiredInt = sycl::bit_cast<I>(Desired);
530 I ExpectedInt = sycl::bit_cast<I>(Expected);
531 I ResultInt = __spirv_AtomicCompareExchange(
532 PtrInt, SPIRVScope, SPIRVSuccess, SPIRVFailure, DesiredInt, ExpectedInt);
533 return sycl::bit_cast<T>(ResultInt);
538 inline typename std::enable_if_t<std::is_integral<T>::value, T>
539 AtomicLoad(multi_ptr<T, AddressSpace, IsDecorated> MPtr,
memory_scope Scope,
541 auto *Ptr = GetMultiPtrDecoratedAs<T>(MPtr);
542 auto SPIRVOrder = getMemorySemanticsMask(Order);
543 auto SPIRVScope = getScope(Scope);
549 inline typename std::enable_if_t<std::is_floating_point<T>::value, T>
550 AtomicLoad(multi_ptr<T, AddressSpace, IsDecorated> MPtr,
memory_scope Scope,
552 using I = detail::make_unsinged_integer_t<T>;
553 auto *PtrInt = GetMultiPtrDecoratedAs<I>(MPtr);
554 auto SPIRVOrder = getMemorySemanticsMask(Order);
555 auto SPIRVScope = getScope(Scope);
557 return sycl::bit_cast<T>(ResultInt);
562 inline typename std::enable_if_t<std::is_integral<T>::value>
563 AtomicStore(multi_ptr<T, AddressSpace, IsDecorated> MPtr,
memory_scope Scope,
565 auto *Ptr = GetMultiPtrDecoratedAs<T>(MPtr);
566 auto SPIRVOrder = getMemorySemanticsMask(Order);
567 auto SPIRVScope = getScope(Scope);
573 inline typename std::enable_if_t<std::is_floating_point<T>::value>
574 AtomicStore(multi_ptr<T, AddressSpace, IsDecorated> MPtr,
memory_scope Scope,
576 using I = detail::make_unsinged_integer_t<T>;
577 auto *PtrInt = GetMultiPtrDecoratedAs<I>(MPtr);
578 auto SPIRVOrder = getMemorySemanticsMask(Order);
579 auto SPIRVScope = getScope(Scope);
580 I ValueInt = sycl::bit_cast<I>(Value);
586 inline typename std::enable_if_t<std::is_integral<T>::value, T>
587 AtomicExchange(multi_ptr<T, AddressSpace, IsDecorated> MPtr,
memory_scope Scope,
589 auto *Ptr = GetMultiPtrDecoratedAs<T>(MPtr);
590 auto SPIRVOrder = getMemorySemanticsMask(Order);
591 auto SPIRVScope = getScope(Scope);
597 inline typename std::enable_if_t<std::is_floating_point<T>::value, T>
598 AtomicExchange(multi_ptr<T, AddressSpace, IsDecorated> MPtr,
memory_scope Scope,
600 using I = detail::make_unsinged_integer_t<T>;
601 auto *PtrInt = GetMultiPtrDecoratedAs<I>(MPtr);
602 auto SPIRVOrder = getMemorySemanticsMask(Order);
603 auto SPIRVScope = getScope(Scope);
604 I ValueInt = sycl::bit_cast<I>(Value);
607 return sycl::bit_cast<T>(ResultInt);
612 inline typename std::enable_if_t<std::is_integral<T>::value, T>
613 AtomicIAdd(multi_ptr<T, AddressSpace, IsDecorated> MPtr,
memory_scope Scope,
615 auto *Ptr = GetMultiPtrDecoratedAs<T>(MPtr);
616 auto SPIRVOrder = getMemorySemanticsMask(Order);
617 auto SPIRVScope = getScope(Scope);
623 inline typename std::enable_if_t<std::is_integral<T>::value, T>
624 AtomicISub(multi_ptr<T, AddressSpace, IsDecorated> MPtr,
memory_scope Scope,
626 auto *Ptr = GetMultiPtrDecoratedAs<T>(MPtr);
627 auto SPIRVOrder = getMemorySemanticsMask(Order);
628 auto SPIRVScope = getScope(Scope);
634 inline typename std::enable_if_t<std::is_floating_point<T>::value, T>
635 AtomicFAdd(multi_ptr<T, AddressSpace, IsDecorated> MPtr,
memory_scope Scope,
637 auto *Ptr = GetMultiPtrDecoratedAs<T>(MPtr);
638 auto SPIRVOrder = getMemorySemanticsMask(Order);
639 auto SPIRVScope = getScope(Scope);
640 return __spirv_AtomicFAddEXT(Ptr, SPIRVScope, SPIRVOrder, Value);
645 inline typename std::enable_if_t<std::is_integral<T>::value, T>
646 AtomicAnd(multi_ptr<T, AddressSpace, IsDecorated> MPtr,
memory_scope Scope,
648 auto *Ptr = GetMultiPtrDecoratedAs<T>(MPtr);
649 auto SPIRVOrder = getMemorySemanticsMask(Order);
650 auto SPIRVScope = getScope(Scope);
656 inline typename std::enable_if_t<std::is_integral<T>::value, T>
657 AtomicOr(multi_ptr<T, AddressSpace, IsDecorated> MPtr,
memory_scope Scope,
659 auto *Ptr = GetMultiPtrDecoratedAs<T>(MPtr);
660 auto SPIRVOrder = getMemorySemanticsMask(Order);
661 auto SPIRVScope = getScope(Scope);
667 inline typename std::enable_if_t<std::is_integral<T>::value, T>
668 AtomicXor(multi_ptr<T, AddressSpace, IsDecorated> MPtr,
memory_scope Scope,
670 auto *Ptr = GetMultiPtrDecoratedAs<T>(MPtr);
671 auto SPIRVOrder = getMemorySemanticsMask(Order);
672 auto SPIRVScope = getScope(Scope);
678 inline typename std::enable_if_t<std::is_integral<T>::value, T>
679 AtomicMin(multi_ptr<T, AddressSpace, IsDecorated> MPtr,
memory_scope Scope,
681 auto *Ptr = GetMultiPtrDecoratedAs<T>(MPtr);
682 auto SPIRVOrder = getMemorySemanticsMask(Order);
683 auto SPIRVScope = getScope(Scope);
689 inline typename std::enable_if_t<std::is_floating_point<T>::value, T>
690 AtomicMin(multi_ptr<T, AddressSpace, IsDecorated> MPtr,
memory_scope Scope,
692 auto *Ptr = GetMultiPtrDecoratedAs<T>(MPtr);
693 auto SPIRVOrder = getMemorySemanticsMask(Order);
694 auto SPIRVScope = getScope(Scope);
700 inline typename std::enable_if_t<std::is_integral<T>::value, T>
701 AtomicMax(multi_ptr<T, AddressSpace, IsDecorated> MPtr,
memory_scope Scope,
703 auto *Ptr = GetMultiPtrDecoratedAs<T>(MPtr);
704 auto SPIRVOrder = getMemorySemanticsMask(Order);
705 auto SPIRVScope = getScope(Scope);
711 inline typename std::enable_if_t<std::is_floating_point<T>::value, T>
712 AtomicMax(multi_ptr<T, AddressSpace, IsDecorated> MPtr,
memory_scope Scope,
714 auto *Ptr = GetMultiPtrDecoratedAs<T>(MPtr);
715 auto SPIRVOrder = getMemorySemanticsMask(Order);
716 auto SPIRVScope = getScope(Scope);
730 using ProhibitedTypesForShuffleEmulation =
731 type_list<double, long, long long, unsigned long, unsigned long long, half>;
733 template <
typename T>
734 struct TypeIsProhibitedForShuffleEmulation
735 : std::bool_constant<is_contained<
736 vector_element_t<T>, ProhibitedTypesForShuffleEmulation>::value> {};
738 template <
typename T>
739 struct VecTypeIsProhibitedForShuffleEmulation
740 : std::bool_constant<
741 (detail::get_vec_size<T>::size > 1) &&
742 TypeIsProhibitedForShuffleEmulation<vector_element_t<T>>::value> {};
744 template <
typename T>
745 using EnableIfNativeShuffle =
746 std::enable_if_t<detail::is_arithmetic<T>::value &&
747 !VecTypeIsProhibitedForShuffleEmulation<T>::value,
750 template <
typename T>
751 using EnableIfVectorShuffle =
752 std::enable_if_t<VecTypeIsProhibitedForShuffleEmulation<T>::value, T>;
// NVPTX: native CUDA shuffles cover integral types up to 32 bits wide.
template <typename T>
using EnableIfNativeShuffle = std::enable_if_t<
    std::is_integral<T>::value && (sizeof(T) <= sizeof(int32_t)), T>;
760 template <
typename T>
761 using EnableIfVectorShuffle =
762 std::enable_if_t<detail::is_vector_arithmetic<T>::value, T>;
768 template <
typename T>
769 using EnableIfBitcastShuffle =
770 std::enable_if_t<!detail::is_arithmetic<T>::value &&
771 (std::is_trivially_copyable_v<T> &&
772 (
sizeof(T) == 1 ||
sizeof(T) == 2 ||
sizeof(T) == 4 ||
776 template <
typename T>
777 using EnableIfBitcastShuffle =
778 std::enable_if_t<!(std::is_integral_v<T> &&
779 (
sizeof(T) <=
sizeof(int32_t))) &&
780 !detail::is_vector_arithmetic<T>::value &&
781 (std::is_trivially_copyable_v<T> &&
782 (
sizeof(T) == 1 ||
sizeof(T) == 2 ||
sizeof(T) == 4)),
791 template <
typename T>
792 using EnableIfGenericShuffle =
793 std::enable_if_t<!detail::is_arithmetic<T>::value &&
794 !(std::is_trivially_copyable_v<T> &&
795 (
sizeof(T) == 1 ||
sizeof(T) == 2 ||
796 sizeof(T) == 4 ||
sizeof(T) == 8)),
799 template <
typename T>
800 using EnableIfGenericShuffle = std::enable_if_t<
801 !(std::is_integral<T>::value && (
sizeof(T) <=
sizeof(int32_t))) &&
802 !detail::is_vector_arithmetic<T>::value &&
803 !(std::is_trivially_copyable_v<T> &&
804 (
sizeof(T) == 1 ||
sizeof(T) == 2 ||
sizeof(T) == 4)),
809 inline uint32_t membermask() {
816 template <
typename GroupT>
817 inline uint32_t MapShuffleID(GroupT g, id<1> local_id) {
818 if constexpr (is_tangle_or_opportunistic_group<GroupT>::value ||
819 is_ballot_group<GroupT>::value)
821 else if constexpr (is_fixed_size_group<GroupT>::value)
822 return g.get_group_linear_id() * g.get_local_range().size() + local_id;
824 return local_id.get(0);
828 template <
typename GroupT,
typename T>
829 EnableIfBitcastShuffle<T> Shuffle(GroupT g, T x, id<1> local_id);
831 template <
typename GroupT,
typename T>
832 EnableIfBitcastShuffle<T> ShuffleXor(GroupT g, T x, id<1> local_id);
834 template <
typename GroupT,
typename T>
835 EnableIfBitcastShuffle<T> ShuffleDown(GroupT g, T x, uint32_t delta);
837 template <
typename GroupT,
typename T>
838 EnableIfBitcastShuffle<T> ShuffleUp(GroupT g, T x, uint32_t delta);
840 template <
typename GroupT,
typename T>
841 EnableIfGenericShuffle<T> Shuffle(GroupT g, T x, id<1> local_id);
843 template <
typename GroupT,
typename T>
844 EnableIfGenericShuffle<T> ShuffleXor(GroupT g, T x, id<1> local_id);
846 template <
typename GroupT,
typename T>
847 EnableIfGenericShuffle<T> ShuffleDown(GroupT g, T x, uint32_t delta);
849 template <
typename GroupT,
typename T>
850 EnableIfGenericShuffle<T> ShuffleUp(GroupT g, T x, uint32_t delta);
852 template <
typename GroupT,
typename T>
853 EnableIfNativeShuffle<T> Shuffle(GroupT g, T x, id<1> local_id) {
854 uint32_t LocalId = MapShuffleID(g, local_id);
859 detail::is_vec<T>::value) {
863 for (
int s = 0; s <
x.size(); ++s)
864 result[s] = Shuffle(g, x[s], local_id);
868 return __spirv_GroupNonUniformShuffle(group_scope<GroupT>::value,
880 return cuda_shfl_sync_idx_i32(membermask(), x, LocalId, 31);
885 template <
typename GroupT,
typename T>
886 EnableIfNativeShuffle<T> ShuffleXor(GroupT g, T x, id<1> mask) {
891 detail::is_vec<T>::value) {
895 for (
int s = 0; s <
x.size(); ++s)
896 result[s] = ShuffleXor(g, x[s], mask);
903 id<1> TargetLocalId = g.get_local_id() ^ mask;
904 uint32_t TargetId = MapShuffleID(g, TargetLocalId);
905 return __spirv_GroupNonUniformShuffle(group_scope<GroupT>::value,
910 static_cast<uint32_t
>(mask.get(0)));
916 if constexpr (is_fixed_size_group_v<GroupT>) {
917 return cuda_shfl_sync_bfly_i32(MemberMask, x,
918 static_cast<uint32_t
>(mask.get(0)), 0x1f);
921 int unfoldedSrcSetBit =
922 (g.get_local_id()[0] ^
static_cast<uint32_t
>(mask.get(0))) + 1;
923 return cuda_shfl_sync_idx_i32(
924 MemberMask, x, __nvvm_fns(MemberMask, 0, unfoldedSrcSetBit), 31);
927 return cuda_shfl_sync_bfly_i32(membermask(), x,
928 static_cast<uint32_t
>(mask.get(0)), 0x1f);
933 template <
typename GroupT,
typename T>
934 EnableIfNativeShuffle<T> ShuffleDown(GroupT g, T x, uint32_t delta) {
938 detail::is_vec<T>::value) {
942 for (
int s = 0; s <
x.size(); ++s)
943 result[s] = ShuffleDown(g, x[s], delta);
947 id<1> TargetLocalId = g.get_local_id();
950 if (TargetLocalId[0] + delta < g.get_local_linear_range())
951 TargetLocalId[0] += delta;
952 uint32_t TargetId = MapShuffleID(g, TargetLocalId);
953 return __spirv_GroupNonUniformShuffle(group_scope<GroupT>::value,
964 if constexpr (is_fixed_size_group_v<GroupT>) {
965 return cuda_shfl_sync_down_i32(MemberMask, x, delta, 31);
967 unsigned localSetBit = g.get_local_id()[0] + 1;
968 int unfoldedSrcSetBit = localSetBit + delta;
969 return cuda_shfl_sync_idx_i32(
970 MemberMask, x, __nvvm_fns(MemberMask, 0, unfoldedSrcSetBit), 31);
973 return cuda_shfl_sync_down_i32(membermask(), x, delta, 31);
978 template <
typename GroupT,
typename T>
979 EnableIfNativeShuffle<T> ShuffleUp(GroupT g, T x, uint32_t delta) {
983 detail::is_vec<T>::value) {
987 for (
int s = 0; s <
x.size(); ++s)
988 result[s] = ShuffleUp(g, x[s], delta);
992 id<1> TargetLocalId = g.get_local_id();
994 if (TargetLocalId[0] >= delta)
995 TargetLocalId[0] -= delta;
996 uint32_t TargetId = MapShuffleID(g, TargetLocalId);
997 return __spirv_GroupNonUniformShuffle(group_scope<GroupT>::value,
1008 if constexpr (is_fixed_size_group_v<GroupT>) {
1009 return cuda_shfl_sync_up_i32(MemberMask, x, delta, 0);
1011 unsigned localSetBit = g.get_local_id()[0] + 1;
1012 int unfoldedSrcSetBit = localSetBit - delta;
1014 return cuda_shfl_sync_idx_i32(
1015 MemberMask, x, __nvvm_fns(MemberMask, 0, unfoldedSrcSetBit), 31);
1018 return cuda_shfl_sync_up_i32(membermask(), x, delta, 0);
1023 template <
typename GroupT,
typename T>
1024 EnableIfVectorShuffle<T> Shuffle(GroupT g, T x, id<1> local_id) {
1026 for (
int s = 0; s <
x.size(); ++s) {
1027 result[s] = Shuffle(g, x[s], local_id);
1032 template <
typename GroupT,
typename T>
1033 EnableIfVectorShuffle<T> ShuffleXor(GroupT g, T x, id<1> local_id) {
1035 for (
int s = 0; s <
x.size(); ++s) {
1036 result[s] = ShuffleXor(g, x[s], local_id);
1041 template <
typename GroupT,
typename T>
1042 EnableIfVectorShuffle<T> ShuffleDown(GroupT g, T x, uint32_t delta) {
1044 for (
int s = 0; s <
x.size(); ++s) {
1045 result[s] = ShuffleDown(g, x[s], delta);
1050 template <
typename GroupT,
typename T>
1051 EnableIfVectorShuffle<T> ShuffleUp(GroupT g, T x, uint32_t delta) {
1053 for (
int s = 0; s <
x.size(); ++s) {
1054 result[s] = ShuffleUp(g, x[s], delta);
1059 template <
typename T>
1060 using ConvertToNativeShuffleType_t = select_cl_scalar_integral_unsigned_t<T>;
1062 template <
typename GroupT,
typename T>
1063 EnableIfBitcastShuffle<T> Shuffle(GroupT g, T x, id<1> local_id) {
1064 using ShuffleT = ConvertToNativeShuffleType_t<T>;
1065 auto ShuffleX = sycl::bit_cast<ShuffleT>(x);
1066 ShuffleT Result = Shuffle(g, ShuffleX, local_id);
1067 return sycl::bit_cast<T>(Result);
1070 template <
typename GroupT,
typename T>
1071 EnableIfBitcastShuffle<T> ShuffleXor(GroupT g, T x, id<1> local_id) {
1072 using ShuffleT = ConvertToNativeShuffleType_t<T>;
1073 auto ShuffleX = sycl::bit_cast<ShuffleT>(x);
1074 ShuffleT Result = ShuffleXor(g, ShuffleX, local_id);
1075 return sycl::bit_cast<T>(Result);
1078 template <
typename GroupT,
typename T>
1079 EnableIfBitcastShuffle<T> ShuffleDown(GroupT g, T x, uint32_t delta) {
1080 using ShuffleT = ConvertToNativeShuffleType_t<T>;
1081 auto ShuffleX = sycl::bit_cast<ShuffleT>(x);
1082 ShuffleT Result = ShuffleDown(g, ShuffleX, delta);
1083 return sycl::bit_cast<T>(Result);
1086 template <
typename GroupT,
typename T>
1087 EnableIfBitcastShuffle<T> ShuffleUp(GroupT g, T x, uint32_t delta) {
1088 using ShuffleT = ConvertToNativeShuffleType_t<T>;
1089 auto ShuffleX = sycl::bit_cast<ShuffleT>(x);
1090 ShuffleT Result = ShuffleUp(g, ShuffleX, delta);
1091 return sycl::bit_cast<T>(Result);
1094 template <
typename GroupT,
typename T>
1095 EnableIfGenericShuffle<T> Shuffle(GroupT g, T x, id<1> local_id) {
1097 char *XBytes =
reinterpret_cast<char *
>(&
x);
1098 char *ResultBytes =
reinterpret_cast<char *
>(&Result);
1099 auto ShuffleBytes = [=](
size_t Offset,
size_t Size) {
1100 ShuffleChunkT ShuffleX, ShuffleResult;
1102 ShuffleResult = Shuffle(g, ShuffleX, local_id);
1105 GenericCall<T>(ShuffleBytes);
1109 template <
typename GroupT,
typename T>
1110 EnableIfGenericShuffle<T> ShuffleXor(GroupT g, T x, id<1> local_id) {
1112 char *XBytes =
reinterpret_cast<char *
>(&
x);
1113 char *ResultBytes =
reinterpret_cast<char *
>(&Result);
1114 auto ShuffleBytes = [=](
size_t Offset,
size_t Size) {
1115 ShuffleChunkT ShuffleX, ShuffleResult;
1117 ShuffleResult = ShuffleXor(g, ShuffleX, local_id);
1120 GenericCall<T>(ShuffleBytes);
1124 template <
typename GroupT,
typename T>
1125 EnableIfGenericShuffle<T> ShuffleDown(GroupT g, T x, uint32_t delta) {
1127 char *XBytes =
reinterpret_cast<char *
>(&
x);
1128 char *ResultBytes =
reinterpret_cast<char *
>(&Result);
1129 auto ShuffleBytes = [=](
size_t Offset,
size_t Size) {
1130 ShuffleChunkT ShuffleX, ShuffleResult;
1132 ShuffleResult = ShuffleDown(g, ShuffleX, delta);
1135 GenericCall<T>(ShuffleBytes);
1139 template <
typename GroupT,
typename T>
1140 EnableIfGenericShuffle<T> ShuffleUp(GroupT g, T x, uint32_t delta) {
1142 char *XBytes =
reinterpret_cast<char *
>(&
x);
1143 char *ResultBytes =
reinterpret_cast<char *
>(&Result);
1144 auto ShuffleBytes = [=](
size_t Offset,
size_t Size) {
1145 ShuffleChunkT ShuffleX, ShuffleResult;
1147 ShuffleResult = ShuffleUp(g, ShuffleX, delta);
1150 GenericCall<T>(ShuffleBytes);
1154 template <
typename Group>
1155 typename std::enable_if_t<
1156 ext::oneapi::experimental::is_fixed_topology_group_v<Group>>
1159 getMemorySemanticsMask(Order) |
1165 template <
typename Group>
1166 typename std::enable_if_t<
1167 ext::oneapi::experimental::is_user_constructed_group_v<Group>>
1169 #if defined(__NVPTX__)
1178 getMemorySemanticsMask(Order) |
1186 #define __SYCL_GROUP_COLLECTIVE_OVERLOAD(Instruction) \
1187 template <__spv::GroupOperation Op, typename Group, typename T> \
1188 inline typename std::enable_if_t< \
1189 ext::oneapi::experimental::is_fixed_topology_group_v<Group>, T> \
1190 Group##Instruction(Group, T x) { \
1191 using ConvertedT = detail::ConvertToOpenCLType_t<T>; \
1193 using OCLT = std::conditional_t< \
1194 std::is_same<ConvertedT, opencl::cl_char>() || \
1195 std::is_same<ConvertedT, opencl::cl_short>(), \
1197 std::conditional_t<std::is_same<ConvertedT, opencl::cl_uchar>() || \
1198 std::is_same<ConvertedT, opencl::cl_ushort>(), \
1199 opencl::cl_uint, ConvertedT>>; \
1201 OCLT Ret = __spirv_Group##Instruction(group_scope<Group>::value, \
1202 static_cast<unsigned int>(Op), Arg); \
1206 template <__spv::GroupOperation Op, typename ParentGroup, typename T> \
1207 inline T Group##Instruction( \
1208 ext::oneapi::experimental::ballot_group<ParentGroup> g, T x) { \
1209 using ConvertedT = detail::ConvertToOpenCLType_t<T>; \
1211 using OCLT = std::conditional_t< \
1212 std::is_same<ConvertedT, opencl::cl_char>() || \
1213 std::is_same<ConvertedT, opencl::cl_short>(), \
1215 std::conditional_t<std::is_same<ConvertedT, opencl::cl_uchar>() || \
1216 std::is_same<ConvertedT, opencl::cl_ushort>(), \
1217 opencl::cl_uint, ConvertedT>>; \
1222 constexpr auto Scope = group_scope<ParentGroup>::value; \
1223 constexpr auto OpInt = static_cast<unsigned int>(Op); \
1224 if (g.get_group_id() == 1) { \
1225 return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg); \
1227 return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg); \
1231 template <__spv::GroupOperation Op, size_t PartitionSize, \
1232 typename ParentGroup, typename T> \
1233 inline T Group##Instruction( \
1234 ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup> \
1237 using ConvertedT = detail::ConvertToOpenCLType_t<T>; \
1239 using OCLT = std::conditional_t< \
1240 std::is_same<ConvertedT, opencl::cl_char>() || \
1241 std::is_same<ConvertedT, opencl::cl_short>(), \
1243 std::conditional_t<std::is_same<ConvertedT, opencl::cl_uchar>() || \
1244 std::is_same<ConvertedT, opencl::cl_ushort>(), \
1245 opencl::cl_uint, ConvertedT>>; \
1247 constexpr auto Scope = group_scope<ParentGroup>::value; \
1250 if constexpr (Op == __spv::GroupOperation::Reduce) { \
1251 constexpr auto OpInt = \
1252 static_cast<unsigned int>(__spv::GroupOperation::ClusteredReduce); \
1253 return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg, \
1257 for (size_t Cluster = 0; Cluster < g.get_group_linear_range(); \
1259 if (Cluster == g.get_group_linear_id()) { \
1260 constexpr auto OpInt = static_cast<unsigned int>(Op); \
1261 tmp = __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg); \
1267 template <__spv::GroupOperation Op, typename Group, typename T> \
1268 inline typename std::enable_if_t< \
1269 is_tangle_or_opportunistic_group<Group>::value, T> \
1270 Group##Instruction(Group, T x) { \
1271 using ConvertedT = detail::ConvertToOpenCLType_t<T>; \
1273 using OCLT = std::conditional_t< \
1274 std::is_same<ConvertedT, opencl::cl_char>() || \
1275 std::is_same<ConvertedT, opencl::cl_short>(), \
1277 std::conditional_t<std::is_same<ConvertedT, opencl::cl_uchar>() || \
1278 std::is_same<ConvertedT, opencl::cl_ushort>(), \
1279 opencl::cl_uint, ConvertedT>>; \
1281 OCLT Ret = __spirv_GroupNonUniform##Instruction( \
1282 group_scope<Group>::value, static_cast<unsigned int>(Op), Arg); \
1286 __SYCL_GROUP_COLLECTIVE_OVERLOAD(SMin)
1287 __SYCL_GROUP_COLLECTIVE_OVERLOAD(UMin)
1288 __SYCL_GROUP_COLLECTIVE_OVERLOAD(FMin)
1290 __SYCL_GROUP_COLLECTIVE_OVERLOAD(SMax)
1291 __SYCL_GROUP_COLLECTIVE_OVERLOAD(UMax)
1292 __SYCL_GROUP_COLLECTIVE_OVERLOAD(FMax)
1294 __SYCL_GROUP_COLLECTIVE_OVERLOAD(IAdd)
1295 __SYCL_GROUP_COLLECTIVE_OVERLOAD(FAdd)
1297 __SYCL_GROUP_COLLECTIVE_OVERLOAD(IMulKHR)
1298 __SYCL_GROUP_COLLECTIVE_OVERLOAD(FMulKHR)
1299 __SYCL_GROUP_COLLECTIVE_OVERLOAD(CMulINTEL)
1301 __SYCL_GROUP_COLLECTIVE_OVERLOAD(BitwiseOrKHR)
1302 __SYCL_GROUP_COLLECTIVE_OVERLOAD(BitwiseXorKHR)
1303 __SYCL_GROUP_COLLECTIVE_OVERLOAD(BitwiseAndKHR)
1305 __SYCL_GROUP_COLLECTIVE_OVERLOAD(LogicalAndKHR)
1306 __SYCL_GROUP_COLLECTIVE_OVERLOAD(LogicalOrKHR)
1308 template <access::address_space Space,
typename T>
1309 auto GenericCastToPtr(T *Ptr) ->
1312 return __SYCL_GenericCastToPtr_ToGlobal<T>(Ptr);
1314 return __SYCL_GenericCastToPtr_ToLocal<T>(Ptr);
1316 return __SYCL_GenericCastToPtr_ToPrivate<T>(Ptr);
1320 template <access::address_space Space,
typename T>
1321 auto GenericCastToPtrExplicit(T *Ptr) ->
1324 return __SYCL_GenericCastToPtrExplicit_ToGlobal<T>(Ptr);
1326 return __SYCL_GenericCastToPtrExplicit_ToLocal<T>(Ptr);
1328 return __SYCL_GenericCastToPtrExplicit_ToPrivate<T>(Ptr);
id_type get_group_id() const
auto convertToOpenCLType(T &&x)
sycl::memory_order memory_order
sycl::memory_scope memory_scope
uint32_t IdToMaskPosition(NonUniformGroup Group, uint32_t Id)
ext::oneapi::sub_group_mask GetMask(NonUniformGroup Group)
void memcpy(void *Dst, const void *Src, size_t Size)
sycl::vec< unsigned, 4 > ExtractMask(ext::oneapi::sub_group_mask Mask)
@ group
Wait until all previous memory transactions from this thread are observed within the local thread-gro...
constexpr bool is_user_constructed_group_v
class __SYCL_EBO __SYCL_SPECIAL_CLASS Dimensions
std::conditional_t< is_decorated, decorated_type *, std::add_pointer_t< value_type > > pointer
__SYCL_CONVERGENT__ __DPCPP_SYCL_EXTERNAL void __spirv_ControlBarrier(__spv::Scope Execution, __spv::Scope Memory, uint32_t Semantics) noexcept
__SYCL_CONVERGENT__ __DPCPP_SYCL_EXTERNAL void __spirv_MemoryBarrier(__spv::Scope Memory, uint32_t Semantics) noexcept
T __spirv_AtomicOr(std::atomic< T > *Ptr, __spv::Scope::Flag, __spv::MemorySemanticsMask::Flag MS, T V)
void __spirv_AtomicStore(std::atomic< T > *Ptr, __spv::Scope::Flag, __spv::MemorySemanticsMask::Flag MS, T V)
T __spirv_AtomicExchange(std::atomic< T > *Ptr, __spv::Scope::Flag, __spv::MemorySemanticsMask::Flag MS, T V)
T __spirv_AtomicLoad(const std::atomic< T > *Ptr, __spv::Scope::Flag, __spv::MemorySemanticsMask::Flag MS)
T __spirv_AtomicMax(std::atomic< T > *Ptr, __spv::Scope::Flag, __spv::MemorySemanticsMask::Flag MS, T V)
T __spirv_AtomicIAdd(std::atomic< T > *Ptr, __spv::Scope::Flag, __spv::MemorySemanticsMask::Flag MS, T V)
T __spirv_AtomicAnd(std::atomic< T > *Ptr, __spv::Scope::Flag, __spv::MemorySemanticsMask::Flag MS, T V)
T __spirv_AtomicXor(std::atomic< T > *Ptr, __spv::Scope::Flag, __spv::MemorySemanticsMask::Flag MS, T V)
T __spirv_AtomicISub(std::atomic< T > *Ptr, __spv::Scope::Flag, __spv::MemorySemanticsMask::Flag MS, T V)
T __spirv_AtomicMin(std::atomic< T > *Ptr, __spv::Scope::Flag, __spv::MemorySemanticsMask::Flag MS, T V)