20 namespace ext::intel {
21 namespace experimental::esimd {
// NOTE(review): fragments only — the enclosing function signatures (presumably
// split-barrier signal/wait helpers) were dropped by this extraction; the fused
// numeric prefixes are original line numbers. Left byte-identical; verify
// against a pristine copy of the header.
29 __esimd_sbarrier(flag);
34 __esimd_sbarrier(flag);
41 #ifndef __ESIMD_FORCE_STATELESS_MEM
// raw_sends: raw send with two message sources and a destination payload;
// validates operand widths, normalizes element types, packs EOT/sendc bits
// into a modifier, and forwards to the __esimd_raw_sends2 intrinsic.
// NOTE(review): garbled extraction — original line numbers are fused into the
// text and some lines are missing (e.g. the "int N = 16" template parameter and
// the closing brace). Code left byte-identical; restore from a clean header.
70 template <
typename T1,
int n1,
typename T2,
int n2,
typename T3,
int n3,
72 __ESIMD_API __ESIMD_NS::simd<T1, n1>
73 raw_sends(__ESIMD_NS::simd<T1, n1> msgDst, __ESIMD_NS::simd<T2, n2> msgSrc0,
74 __ESIMD_NS::simd<T3, n3> msgSrc1, uint32_t exDesc, uint32_t msgDesc,
75 uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t numSrc1,
76 uint8_t numDst, uint8_t isEOT = 0, uint8_t isSendc = 0,
77 __ESIMD_NS::simd_mask<N> mask = 1) {
// Each operand must be a whole number of 32-byte registers.
78 constexpr
unsigned _Width1 = n1 *
sizeof(T1);
79 static_assert(_Width1 % 32 == 0,
"Invalid size for raw send rspVar");
80 constexpr
unsigned _Width2 = n2 *
sizeof(T2);
81 static_assert(_Width2 % 32 == 0,
"Invalid size for raw send msgSrc0");
82 constexpr
unsigned _Width3 = n3 *
sizeof(T3);
83 static_assert(_Width3 % 32 == 0,
"Invalid size for raw send msgSrc1");
85 using ElemT1 = __ESIMD_DNS::__raw_t<T1>;
86 using ElemT2 = __ESIMD_DNS::__raw_t<T2>;
87 using ElemT3 = __ESIMD_DNS::__raw_t<T3>;
// Bit 1 = end-of-thread, bit 0 = sendc (conditional send).
89 uint8_t modifier = ((isEOT & 0x1) << 1) | (isSendc & 0x1);
90 return __esimd_raw_sends2<ElemT1, n1, ElemT2, n2, ElemT3, n3, N>(
91 modifier, execSize, mask.data(), numSrc0, numSrc1, numDst, sfid, exDesc,
92 msgDesc, msgSrc0.data(), msgSrc1.data(), msgDst.data());
// Convenience overload that forwards to raw_sends above with a default mask
// (presumably a deprecated signature variant — its declaration line was
// dropped by this extraction).
// NOTE(review): garbled extraction — fused line numbers, missing lines
// (including the function-name line). Left byte-identical.
95 template <
typename T1,
int n1,
typename T2,
int n2,
typename T3,
int n3,
99 __ESIMD_NS::
simd<T1, n1> msgDst, __ESIMD_NS::
simd<T2, n2> msgSrc0,
100 __ESIMD_NS::
simd<T3, n3> msgSrc1, uint32_t exDesc, uint32_t msgDesc,
101 uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t numSrc1,
102 uint8_t numDst, uint8_t isEOT = 0, uint8_t isSendc = 0,
104 return raw_sends(msgDst, msgSrc0, msgSrc1, exDesc, msgDesc, execSize, sfid,
105 numSrc0, numSrc1, numDst, isEOT, isSendc);
// raw_send: raw send with one message source and a destination payload;
// checks register-multiple widths, then calls the __esimd_raw_send2 intrinsic.
// NOTE(review): garbled extraction — fused original line numbers, missing
// closing brace. Left byte-identical.
129 template <
typename T1,
int n1,
typename T2,
int n2,
int N = 16>
130 __ESIMD_API __ESIMD_NS::simd<T1, n1>
131 raw_send(__ESIMD_NS::simd<T1, n1> msgDst, __ESIMD_NS::simd<T2, n2> msgSrc0,
132 uint32_t exDesc, uint32_t msgDesc, uint8_t execSize, uint8_t sfid,
133 uint8_t numSrc0, uint8_t numDst, uint8_t isEOT = 0,
134 uint8_t isSendc = 0, __ESIMD_NS::simd_mask<N> mask = 1) {
// Operands must span whole 32-byte registers.
135 constexpr
unsigned _Width1 = n1 *
sizeof(T1);
136 static_assert(_Width1 % 32 == 0,
"Invalid size for raw send rspVar");
137 constexpr
unsigned _Width2 = n2 *
sizeof(T2);
138 static_assert(_Width2 % 32 == 0,
"Invalid size for raw send msgSrc0");
140 using ElemT1 = __ESIMD_DNS::__raw_t<T1>;
141 using ElemT2 = __ESIMD_DNS::__raw_t<T2>;
// Bit 1 = end-of-thread, bit 0 = sendc.
143 uint8_t modifier = ((isEOT & 0x1) << 1) | (isSendc & 0x1);
144 return __esimd_raw_send2<ElemT1, n1, ElemT2, n2, N>(
145 modifier, execSize, mask.data(), numSrc0, numDst, sfid, exDesc, msgDesc,
146 msgSrc0.data(), msgDst.data());
// Forwarding overload of raw_send (declaration line dropped by extraction).
// NOTE(review): garbled extraction — fused line numbers, missing lines.
// Left byte-identical.
149 template <
typename T1,
int n1,
typename T2,
int n2,
int N = 16>
152 __ESIMD_NS::
simd<T1, n1> msgDst, __ESIMD_NS::
simd<T2, n2> msgSrc0,
153 uint32_t exDesc, uint32_t msgDesc, uint8_t execSize, uint8_t sfid,
154 uint8_t numSrc0, uint8_t numDst, uint8_t isEOT = 0, uint8_t isSendc = 0,
156 return raw_send(msgDst, msgSrc0, exDesc, msgDesc, execSize, sfid, numSrc0,
157 numDst, isEOT, isSendc, mask);
// raw_sends (no-result form): raw send with two sources and no destination;
// calls the __esimd_raw_sends2_noresult intrinsic.
// NOTE(review): garbled extraction — fused line numbers, missing lines
// (e.g. return type / closing brace). Left byte-identical.
180 template <
typename T1,
int n1,
typename T2,
int n2,
int N = 16>
182 raw_sends(__ESIMD_NS::simd<T1, n1> msgSrc0, __ESIMD_NS::simd<T2, n2> msgSrc1,
183 uint32_t exDesc, uint32_t msgDesc, uint8_t execSize, uint8_t sfid,
184 uint8_t numSrc0, uint8_t numSrc1, uint8_t isEOT = 0,
185 uint8_t isSendc = 0, __ESIMD_NS::simd_mask<N> mask = 1) {
186 constexpr
unsigned _Width1 = n1 *
sizeof(T1);
187 static_assert(_Width1 % 32 == 0,
"Invalid size for raw send msgSrc0");
188 constexpr
unsigned _Width2 = n2 *
sizeof(T2);
189 static_assert(_Width2 % 32 == 0,
"Invalid size for raw send msgSrc1");
191 using ElemT1 = __ESIMD_DNS::__raw_t<T1>;
192 using ElemT2 = __ESIMD_DNS::__raw_t<T2>;
194 uint8_t modifier = ((isEOT & 0x1) << 1) | (isSendc & 0x1);
195 __esimd_raw_sends2_noresult<ElemT1, n1, ElemT2, n2, N>(
196 modifier, execSize, mask.data(), numSrc0, numSrc1, sfid, exDesc, msgDesc,
197 msgSrc0.data(), msgSrc1.data());
// Forwarding overload of the no-result raw_sends (declaration line dropped).
// NOTE(review): garbled extraction — fused line numbers, missing lines.
// Left byte-identical.
200 template <
typename T1,
int n1,
typename T2,
int n2,
int N = 16>
204 __ESIMD_NS::
simd<T2, n2> msgSrc1, uint32_t exDesc,
205 uint32_t msgDesc, uint8_t execSize, uint8_t sfid,
206 uint8_t numSrc0, uint8_t numSrc1, uint8_t isEOT = 0,
209 raw_sends(msgSrc0, msgSrc1, exDesc, msgDesc, execSize, sfid, numSrc0, numSrc1,
210 isEOT, isSendc, mask);
// raw_send (no-result form): raw send with one source and no destination;
// calls the __esimd_raw_send2_noresult intrinsic.
// NOTE(review): garbled extraction — fused line numbers; the intrinsic call's
// final argument line and closing brace are missing. Left byte-identical.
231 template <
typename T1,
int n1,
int N = 16>
233 raw_send(__ESIMD_NS::simd<T1, n1> msgSrc0, uint32_t exDesc, uint32_t msgDesc,
234 uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t isEOT = 0,
235 uint8_t isSendc = 0, __ESIMD_NS::simd_mask<N> mask = 1) {
236 constexpr
unsigned _Width1 = n1 *
sizeof(T1);
237 static_assert(_Width1 % 32 == 0,
"Invalid size for raw send msgSrc0");
238 using ElemT1 = __ESIMD_DNS::__raw_t<T1>;
239 uint8_t modifier = ((isEOT & 0x1) << 1) | (isSendc & 0x1);
240 __esimd_raw_send2_noresult<ElemT1, n1, N>(modifier, execSize, mask.data(),
241 numSrc0, sfid, exDesc, msgDesc,
// Forwarding overload of the no-result raw_send (declaration line dropped).
// NOTE(review): garbled extraction — fused line numbers, missing lines.
// Left byte-identical.
245 template <
typename T1,
int n1,
int N = 16>
249 uint32_t msgDesc, uint8_t execSize, uint8_t sfid,
250 uint8_t numSrc0, uint8_t isEOT = 0, uint8_t isSendc = 0,
252 raw_send(msgSrc0, exDesc, msgDesc, execSize, sfid, numSrc0, isEOT, isSendc,
258 #endif // !__ESIMD_FORCE_STATELESS_MEM
// Named-barrier helper fragments (enclosing signatures dropped by extraction):
// a wait call (__esimd_nbarrier), an init call (__esimd_nbarrier_init), and a
// signal routine that builds a gateway message payload — consumers/producers
// counts and the 5-bit barrier id are packed into payload word 2 — then issues
// __esimd_raw_send_nbarrier_signal.
// NOTE(review): fused line numbers; some lines missing. Left byte-identical.
271 __esimd_nbarrier(0 ,
id, 0 );
279 __esimd_nbarrier_init(NbarCount);
295 uint8_t producer_consumer_mode,
296 uint32_t num_producers,
297 uint32_t num_consumers) {
298 constexpr uint32_t gateway = 3;
299 constexpr uint32_t
barrier = 4;
300 constexpr uint32_t descriptor = 1 << 25 |
304 __ESIMD_DNS::vector_type_t<uint32_t, 8> payload = 0;
305 payload[2] = (num_consumers & 0xff) << 24 | (num_producers & 0xff) << 16 |
306 producer_consumer_mode << 14 | (barrier_id & 0b11111) << 0;
308 __esimd_raw_send_nbarrier_signal<uint32_t, 8>(
309 0 , gateway, descriptor, payload, 1 );
// wait(simd): device-only helper that reads the first 16-bit word of the value
// to create a data dependency (the consuming intrinsic call on original line
// 320 was dropped by this extraction). Enabled only when the vector is at
// least 2 bytes.
315 template <
typename T,
int N>
316 __ESIMD_API std::enable_if_t<(
sizeof(T) * N >= 2)>
317 wait(__ESIMD_NS::simd<T, N> value) {
318 #ifdef __SYCL_DEVICE_ONLY__
319 uint16_t Word = value.template bit_cast_view<uint16_t>()[0];
321 #endif // __SYCL_DEVICE_ONLY__
// wait(simd_view): same as the simd overload above but for views; reads the
// first 16-bit word (the line consuming Word was dropped by this extraction).
327 template <
typename T,
typename RegionT>
328 __ESIMD_API std::enable_if_t<
329 (RegionT::length *
sizeof(
typename RegionT::element_type) >= 2)>
330 wait(__ESIMD_NS::simd_view<T, RegionT> value) {
331 #ifdef __SYCL_DEVICE_ONLY__
332 uint16_t Word = value.template bit_cast_view<uint16_t>()[0];
334 #endif // __SYCL_DEVICE_ONLY__
// Fragment: appears to compute the register footprint of a 2D block message —
// one branch rounds Height up and the Width up to a power of two, the other
// returns the plain Width*Height*NBlocks product. The function name and the
// branch condition lines were dropped by this extraction; verify against a
// clean copy before relying on the described behavior.
347 template <
typename T,
int NBlocks,
int Height,
int Width,
bool Transposed,
351 return detail::roundUpNextMultiple<Height, 4 /
sizeof(T)>() *
352 __ESIMD_DNS::getNextPowerOf2<Width>() * NBlocks;
353 return Width * Height * NBlocks;
// Fragment: reinterprets a loaded vector back to the user element width —
// 1-byte types via uint8_t, 2-byte via uint16_t, otherwise the raw RT type.
// The function-name line (presumably an lsc return-formatting helper) was
// dropped by this extraction. Left byte-identical.
357 template <
typename RT,
typename T,
int N>
358 ESIMD_INLINE __ESIMD_NS::simd<RT, N>
360 if constexpr (
sizeof(T) == 1) {
362 return Vals.template bit_cast_view<uint8_t>();
363 }
else if constexpr (
sizeof(T) == 2) {
365 return Vals.template bit_cast_view<uint16_t>();
367 return Vals.template bit_cast_view<RT>();
// Fragment: reinterprets an input vector to the message element type and, when
// the element sizes differ, selects every Stride-th element of the formatted
// view. Function-name line and the same-size early-return line were dropped by
// this extraction. Left byte-identical.
372 template <
typename T,
typename T1,
int N>
373 ESIMD_INLINE __ESIMD_NS::simd<T, N>
375 auto Formatted = Vals.template bit_cast_view<T>();
376 if constexpr (
sizeof(T) ==
sizeof(T1)) {
379 constexpr
int Stride = Formatted.length / N;
380 return Formatted.template select<N, Stride>(0);
390 "Execution size 1, 2, 4, 8, 16, 32 are supported");
392 if constexpr (NumSrc != __ESIMD_DNS::get_num_args<Op>()) {
393 static_assert(NumSrc == __ESIMD_DNS::get_num_args<Op>(),
394 "wrong number of operands");
396 if constexpr (Op == __ESIMD_NS::native::lsc::atomic_op::fcmpxchg) {
397 static_assert(__ESIMD_DNS::is_type<T, float, sycl::half, double>(),
398 "float, double or sycl::half type is expected");
400 __ESIMD_DNS::check_atomic<__ESIMD_DNS::to_atomic_op<Op>(), T, N, NumSrc>();
404 template <cache_h
int L1H = cache_h
int::none, cache_h
int L3H = cache_h
int::none>
406 if constexpr (L1H == cache_hint::read_invalidate &&
407 L3H == cache_hint::cached) {
410 if constexpr (L1H == cache_hint::streaming && L3H == cache_hint::cached) {
413 if constexpr (L1H == cache_hint::streaming && L3H == cache_hint::uncached) {
416 if constexpr (L1H == cache_hint::cached && L3H == cache_hint::cached) {
419 if constexpr (L1H == cache_hint::cached && L3H == cache_hint::uncached) {
422 if constexpr (L1H == cache_hint::uncached && L3H == cache_hint::cached) {
425 if constexpr (L1H == cache_hint::uncached && L3H == cache_hint::uncached) {
431 template <cache_h
int L1H = cache_h
int::none, cache_h
int L3H = cache_h
int::none>
433 if constexpr (L1H == cache_hint::write_back && L3H == cache_hint::cached) {
436 if constexpr (L1H == cache_hint::streaming && L3H == cache_hint::cached) {
439 if constexpr (L1H == cache_hint::streaming && L3H == cache_hint::uncached) {
442 if constexpr (L1H == cache_hint::write_through && L3H == cache_hint::cached) {
445 if constexpr (L1H == cache_hint::write_through &&
446 L3H == cache_hint::uncached) {
449 if constexpr (L1H == cache_hint::uncached && L3H == cache_hint::cached) {
452 if constexpr (L1H == cache_hint::uncached && L3H == cache_hint::uncached) {
// SLM gather: validates vector/data size, then loads per-lane elements from
// shared local memory via __esimd_lsc_load_slm (no cache hints apply to SLM)
// and reformats the raw result to the user type. The function-name line and
// several constexpr setup lines (_DS, _VS) were dropped by this extraction.
475 template <
typename T,
int NElts = 1,
477 __ESIMD_API __ESIMD_NS::simd<T, N * NElts>
479 __ESIMD_NS::simd_mask<N> pred = 1) {
480 detail::check_lsc_vector_size<NElts>();
481 detail::check_lsc_data_size<T, DS>();
482 constexpr uint16_t _AddressScale = 1;
483 constexpr
int _ImmOffset = 0;
487 constexpr
auto _Transposed = detail::lsc_data_order::nontranspose;
489 __ESIMD_NS::simd<MsgT, N * NElts> Tmp =
490 __esimd_lsc_load_slm<MsgT, cache_hint::none, cache_hint::none,
491 _AddressScale, _ImmOffset, _DS, _VS, _Transposed, N>(
492 pred.data(), offsets.data());
493 return detail::lsc_format_ret<T>(Tmp);
// SLM gather with merge semantics: masked-off lanes take values from
// old_values (expanded to the message type first); uses
// __esimd_lsc_load_merge_slm. Name line and some constexpr lines were dropped
// by this extraction.
513 template <
typename T,
int NElts = 1,
515 __ESIMD_API __ESIMD_NS::simd<T, N * NElts>
517 __ESIMD_NS::simd_mask<N> pred,
518 __ESIMD_NS::simd<T, N * NElts> old_values) {
519 detail::check_lsc_vector_size<NElts>();
520 detail::check_lsc_data_size<T, DS>();
521 constexpr uint16_t _AddressScale = 1;
522 constexpr
int _ImmOffset = 0;
527 detail::lsc_data_order::nontranspose;
529 __ESIMD_NS::simd<MsgT, N * NElts> OldValuesExpanded =
530 detail::lsc_format_input<MsgT>(old_values);
531 __ESIMD_NS::simd<MsgT, N * NElts> Result =
532 __esimd_lsc_load_merge_slm<MsgT, cache_hint::none, cache_hint::none,
533 _AddressScale, _ImmOffset, _DS, _VS,
534 _Transposed, N>(pred.data(), offsets.data(),
535 OldValuesExpanded.data());
536 return detail::lsc_format_ret<T>(Result);
// SLM transposed (contiguous) block load of NElts elements starting at a
// single scalar offset; only u32/u64 data sizes are legal for transposed
// loads. Name line and some setup lines were dropped by this extraction.
554 template <
typename T,
int NElts, lsc_data_size DS = lsc_data_size::default_size>
555 __ESIMD_API __ESIMD_NS::simd<T, NElts>
557 detail::check_lsc_vector_size<NElts>();
558 detail::check_lsc_data_size<T, DS>();
559 constexpr uint16_t AddressScale = 1;
560 constexpr
int ImmOffset = 0;
561 constexpr
lsc_data_size FDS = detail::finalize_data_size<T, DS>();
562 static_assert(FDS == lsc_data_size::u32 || FDS == lsc_data_size::u64,
563 "Transposed load is supported only for data size u32 or u64");
566 constexpr
auto Transposed = detail::lsc_data_order::transpose;
568 __ESIMD_NS::simd<uint32_t, N> offsets = offset;
569 return __esimd_lsc_load_slm<T, cache_hint::none, cache_hint::none,
570 AddressScale, ImmOffset, FDS, VS, Transposed, N>(
571 pred.data(), offsets.data());
// SLM transposed block load with merge semantics (old_values fill predicated-
// off lanes) via __esimd_lsc_load_merge_slm. Name line, some setup lines, and
// the final argument line were dropped by this extraction.
591 template <
typename T,
int NElts, lsc_data_size DS = lsc_data_size::default_size>
592 __ESIMD_API __ESIMD_NS::simd<T, NElts>
594 __ESIMD_NS::simd<T, NElts> old_values) {
595 detail::check_lsc_vector_size<NElts>();
596 detail::check_lsc_data_size<T, DS>();
597 constexpr uint16_t AddressScale = 1;
598 constexpr
int ImmOffset = 0;
599 constexpr
lsc_data_size FDS = detail::finalize_data_size<T, DS>();
600 static_assert(FDS == lsc_data_size::u32 || FDS == lsc_data_size::u64,
601 "Transposed load is supported only for data size u32 or u64");
603 constexpr
auto Transposed = detail::lsc_data_order::transpose;
605 __ESIMD_NS::simd<uint32_t, N> offsets = offset;
606 return __esimd_lsc_load_merge_slm<T, cache_hint::none, cache_hint::none,
607 AddressScale, ImmOffset, FDS, VS,
608 Transposed, N>(pred.data(), offsets.data(),
// USM gather: per-lane load from p + offsets with L1/L3 cache hints via
// __esimd_lsc_load_stateless; result reformatted to the user type. Name line
// and some constexpr lines dropped by this extraction.
630 template <
typename T,
int NElts = 1,
633 int N,
typename Toffset>
634 __ESIMD_API __ESIMD_NS::simd<T, N * NElts>
636 __ESIMD_NS::simd_mask<N> pred = 1) {
637 static_assert(std::is_integral_v<Toffset>,
"Unsupported offset type");
638 detail::check_lsc_vector_size<NElts>();
639 detail::check_lsc_data_size<T, DS>();
640 detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
641 constexpr uint16_t _AddressScale = 1;
642 constexpr
int _ImmOffset = 0;
646 constexpr
auto _Transposed = detail::lsc_data_order::nontranspose;
// Base pointer broadcast to all lanes, then per-lane byte offsets added.
648 __ESIMD_NS::simd<uintptr_t, N> addrs =
reinterpret_cast<uintptr_t
>(p);
649 addrs += convert<uintptr_t>(offsets);
650 __ESIMD_NS::simd<MsgT, N * NElts> Tmp =
651 __esimd_lsc_load_stateless<MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS,
652 _VS, _Transposed, N>(pred.data(),
654 return detail::lsc_format_ret<T>(Tmp);
// USM gather with merge semantics: predicated-off lanes take old_values;
// uses __esimd_lsc_load_merge_stateless. Name line and some constexpr lines
// dropped by this extraction.
677 template <
typename T,
int NElts = 1,
680 int N,
typename Toffset>
681 __ESIMD_API __ESIMD_NS::simd<T, N * NElts>
683 __ESIMD_NS::simd_mask<N> pred,
684 __ESIMD_NS::simd<T, N * NElts> old_values) {
685 static_assert(std::is_integral_v<Toffset>,
"Unsupported offset type");
686 detail::check_lsc_vector_size<NElts>();
687 detail::check_lsc_data_size<T, DS>();
688 detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
689 constexpr uint16_t _AddressScale = 1;
690 constexpr
int _ImmOffset = 0;
694 constexpr
auto _Transposed = detail::lsc_data_order::nontranspose;
696 __ESIMD_NS::simd<uintptr_t, N> Addrs =
reinterpret_cast<uintptr_t
>(p);
697 Addrs += convert<uintptr_t>(offsets);
698 __ESIMD_NS::simd<MsgT, N * NElts> OldValuesExpanded =
699 detail::lsc_format_input<MsgT>(old_values);
700 __ESIMD_NS::simd<MsgT, N * NElts> Result =
701 __esimd_lsc_load_merge_stateless<MsgT, L1H, L3H, _AddressScale,
702 _ImmOffset, _DS, _VS, _Transposed, N>(
703 pred.data(), Addrs.data(), OldValuesExpanded.data());
704 return detail::lsc_format_ret<T>(Result);
// simd_view overload: materializes the offsets view and forwards to the
// simd-offset lsc_gather. The leading "template <" line was dropped by this
// extraction.
708 typename T,
int NElts = 1,
lsc_data_size DS = lsc_data_size::default_size,
710 typename Toffset,
typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>>
711 __ESIMD_API __ESIMD_NS::simd<T, N * NElts>
712 lsc_gather(
const T *p, __ESIMD_NS::simd_view<Toffset, RegionTy> offsets,
713 __ESIMD_NS::simd_mask<N> pred = 1) {
714 return lsc_gather<T, NElts, DS, L1H, L3H, N>(p, offsets.read(), pred);
// simd_view + merge overload: forwards to the simd-offset merge lsc_gather.
// Leading "template <" line and the trailing old_values argument line were
// dropped by this extraction.
718 typename T,
int NElts = 1,
lsc_data_size DS = lsc_data_size::default_size,
720 typename Toffset,
typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>>
721 __ESIMD_API __ESIMD_NS::simd<T, N * NElts>
722 lsc_gather(
const T *p, __ESIMD_NS::simd_view<Toffset, RegionTy> offsets,
723 __ESIMD_NS::simd_mask<N> pred,
724 __ESIMD_NS::simd<T, N * NElts> old_values) {
725 return lsc_gather<T, NElts, DS, L1H, L3H, N>(p, offsets.read(), pred,
// Scalar-offset overload: broadcasts the single integral offset into a simd
// and forwards to the vector-offset lsc_gather.
729 template <
typename T,
int NElts = 1,
732 int N,
typename Toffset>
733 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>,
734 __ESIMD_NS::simd<T, N * NElts>>
735 lsc_gather(
const T *p, Toffset offset, __ESIMD_NS::simd_mask<N> pred = 1) {
736 return lsc_gather<T, NElts, DS, L1H, L3H, N>(
737 p, __ESIMD_NS::simd<Toffset, N>(offset), pred);
// Scalar-offset + merge overload: broadcasts the offset and forwards,
// passing old_values for predicated-off lanes.
740 template <
typename T,
int NElts = 1,
743 int N,
typename Toffset>
744 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>,
745 __ESIMD_NS::simd<T, N * NElts>>
746 lsc_gather(
const T *p, Toffset offset, __ESIMD_NS::simd_mask<N> pred,
747 __ESIMD_NS::simd<T, N * NElts> old_values) {
748 return lsc_gather<T, NElts, DS, L1H, L3H, N>(
749 p, __ESIMD_NS::simd<Toffset, N>(offset), pred, old_values);
// Accessor-based gather. With __ESIMD_FORCE_STATELESS_MEM it degenerates to
// the USM path via acc.get_pointer(); otherwise it issues a BTI (surface-
// indexed) load via __esimd_lsc_load_bti with 32-bit offsets. The function
// name line, #else/#endif lines, and some setup (_DS, _VS, si) were dropped by
// this extraction.
771 template <
typename T,
int NElts = 1,
774 int N,
typename AccessorTy>
775 __ESIMD_API std::enable_if_t<!std::is_pointer_v<AccessorTy>,
776 __ESIMD_NS::simd<T, N * NElts>>
778 #ifdef __ESIMD_FORCE_STATELESS_MEM
779 __ESIMD_NS::simd<uint64_t, N> offsets,
781 __ESIMD_NS::simd<uint32_t, N> offsets,
783 __ESIMD_NS::simd_mask<N> pred = 1) {
784 #ifdef __ESIMD_FORCE_STATELESS_MEM
785 return lsc_gather<T, NElts, DS, L1H, L3H>(acc.get_pointer(), offsets, pred);
787 detail::check_lsc_vector_size<NElts>();
788 detail::check_lsc_data_size<T, DS>();
789 detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
790 constexpr uint16_t _AddressScale = 1;
791 constexpr
int _ImmOffset = 0;
796 detail::lsc_data_order::nontranspose;
799 auto loc_offsets = convert<uint32_t>(offsets);
800 __ESIMD_NS::simd<MsgT, N * NElts> Tmp =
801 __esimd_lsc_load_bti<MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
802 _Transposed, N>(pred.data(), loc_offsets.data(), si);
803 return detail::lsc_format_ret<T>(Tmp);
// Stateless-only shim: accepts any non-uint64 integral offset type, widens the
// offsets to uint64_t and forwards to the accessor gather above.
807 #ifdef __ESIMD_FORCE_STATELESS_MEM
808 template <
typename T,
int NElts = 1,
811 int N,
typename AccessorTy,
typename Toffset>
812 __ESIMD_API std::enable_if_t<!std::is_pointer_v<AccessorTy> &&
813 std::is_integral_v<Toffset> &&
814 !std::is_same_v<Toffset, uint64_t>,
815 __ESIMD_NS::simd<T, N * NElts>>
816 lsc_gather(AccessorTy acc, __ESIMD_NS::simd<Toffset, N> offsets,
817 __ESIMD_NS::simd_mask<N> pred = 1) {
818 return lsc_gather<T, NElts, DS, L1H, L3H, N, AccessorTy>(
819 acc, convert<uint64_t>(offsets), pred);
// Accessor-based gather with merge semantics: stateless path forwards to the
// USM merge overload; BTI path expands old_values to the message type and
// calls __esimd_lsc_load_merge_bti. Name line, #else/#endif lines, and some
// setup lines (_DS, _VS, SI) were dropped by this extraction.
844 template <
typename T,
int NElts = 1,
847 int N,
typename AccessorTy>
848 __ESIMD_API std::enable_if_t<!std::is_pointer_v<AccessorTy>,
849 __ESIMD_NS::simd<T, N * NElts>>
851 #ifdef __ESIMD_FORCE_STATELESS_MEM
852 __ESIMD_NS::simd<uint64_t, N> offsets,
854 __ESIMD_NS::simd<uint32_t, N> offsets,
856 __ESIMD_NS::simd_mask<N> pred,
857 __ESIMD_NS::simd<T, N * NElts> old_values) {
858 #ifdef __ESIMD_FORCE_STATELESS_MEM
859 return lsc_gather<T, NElts, DS, L1H, L3H>(acc.get_pointer(), offsets, pred,
862 detail::check_lsc_vector_size<NElts>();
863 detail::check_lsc_data_size<T, DS>();
864 detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
865 constexpr uint16_t _AddressScale = 1;
866 constexpr
int _ImmOffset = 0;
870 constexpr
auto _Transposed = detail::lsc_data_order::nontranspose;
873 auto loc_offsets = convert<uint32_t>(offsets);
874 __ESIMD_NS::simd<MsgT, N * NElts> OldValuesExpanded =
875 detail::lsc_format_input<MsgT>(old_values);
876 __ESIMD_NS::simd<MsgT, N * NElts> Result =
877 __esimd_lsc_load_merge_bti<MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS,
878 _VS, _Transposed, N>(
879 pred.data(), loc_offsets.data(), SI, OldValuesExpanded.data());
880 return detail::lsc_format_ret<T>(Result);
// Stateless-only merge shim: widens non-uint64 integral offsets to uint64_t
// and forwards, carrying old_values through.
884 #ifdef __ESIMD_FORCE_STATELESS_MEM
885 template <
typename T,
int NElts = 1,
888 int N,
typename AccessorTy,
typename Toffset>
889 __ESIMD_API std::enable_if_t<!std::is_pointer_v<AccessorTy> &&
890 std::is_integral_v<Toffset> &&
891 !std::is_same_v<Toffset, uint64_t>,
892 __ESIMD_NS::simd<T, N * NElts>>
893 lsc_gather(AccessorTy acc, __ESIMD_NS::simd<Toffset, N> offsets,
894 __ESIMD_NS::simd_mask<N> pred,
895 __ESIMD_NS::simd<T, N * NElts> old_values) {
896 return lsc_gather<T, NElts, DS, L1H, L3H, N, AccessorTy>(
897 acc, convert<uint64_t>(offsets), pred, old_values);
// USM transposed block load: validates data size/alignment, packs small
// element types into 32- or 64-bit load units (SmallIntFactor logic) so wider
// NElts counts fit LSC vector-size limits, issues one transposed
// __esimd_lsc_load_stateless, and bit-casts back to T. Name line, several
// static_assert headers, and some setup lines were dropped by this extraction.
938 template <
typename T,
int NElts,
lsc_data_size DS = lsc_data_size::default_size,
940 typename FlagsT = __ESIMD_DNS::dqword_element_aligned_tag>
941 __ESIMD_API std::enable_if_t<__ESIMD_NS::is_simd_flag_type_v<FlagsT>,
942 __ESIMD_NS::simd<T, NElts>>
944 FlagsT flags = FlagsT{}) {
946 detail::check_lsc_data_size<T, DS>();
947 detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
948 constexpr
lsc_data_size FDS = detail::finalize_data_size<T, DS>();
950 static_assert(FDS == lsc_data_size::u16 || FDS == lsc_data_size::u8 ||
951 FDS == lsc_data_size::u32 || FDS == lsc_data_size::u64,
952 "Conversion data types are not supported");
// Alignment requirement: DWORD for <=4-byte elements, QWORD for 8-byte.
954 FlagsT::template alignment<__ESIMD_DNS::__raw_t<T>>;
956 (
Alignment >= __ESIMD_DNS::OperandSize::DWORD &&
sizeof(T) <= 4) ||
957 (
Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
sizeof(T) > 4),
958 "Incorrect alignment for the data type");
// How many sub-dword elements pack into one 32-bit load unit.
960 constexpr
int SmallIntFactor32Bit =
961 (FDS == lsc_data_size::u16) ? 2 : (FDS == lsc_data_size::u8 ? 4 : 1);
962 static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
963 "Number of elements is not supported by Transposed load");
965 constexpr
bool Use64BitData =
966 Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
968 (DS == lsc_data_size::default_size && NElts / SmallIntFactor32Bit > 64 &&
969 (NElts *
sizeof(T)) % 8 == 0));
970 constexpr
int SmallIntFactor64Bit =
971 (FDS == lsc_data_size::u16)
973 : (FDS == lsc_data_size::u8 ? 8
974 : (FDS == lsc_data_size::u32 ? 2 : 1));
975 constexpr
int SmallIntFactor =
976 Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
977 constexpr
int FactoredNElts = NElts / SmallIntFactor;
978 detail::check_lsc_vector_size<FactoredNElts>();
982 ? __ESIMD_ENS::lsc_data_size::u64
983 : __ESIMD_ENS::lsc_data_size::u32;
986 detail::to_lsc_vector_size<FactoredNElts>();
987 using LoadElemT = __ESIMD_DNS::__raw_t<
988 std::conditional_t<SmallIntFactor == 1, T,
989 std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
990 constexpr uint16_t _AddressScale = 1;
991 constexpr
int _ImmOffset = 0;
993 constexpr
auto _Transposed = detail::lsc_data_order::transpose;
996 __ESIMD_NS::simd<uintptr_t, N> Addrs =
reinterpret_cast<uintptr_t
>(p);
998 __ESIMD_NS::simd<LoadElemT, FactoredNElts> Result =
999 __esimd_lsc_load_stateless<LoadElemT, L1H, L3H, _AddressScale, _ImmOffset,
1000 ActualDS, _VS, _Transposed, N>(pred.data(),
1002 return Result.template bit_cast_view<T>();
// Convenience overload without a predicate: forwards with an all-enabled
// simd_mask<1>(1). Trailing argument line was dropped by this extraction.
1038 template <
typename T,
int NElts,
lsc_data_size DS = lsc_data_size::default_size,
1040 typename FlagsT = __ESIMD_DNS::dqword_element_aligned_tag>
1041 __ESIMD_API std::enable_if_t<__ESIMD_NS::is_simd_flag_type_v<FlagsT>,
1042 __ESIMD_NS::simd<T, NElts>>
1044 return lsc_block_load<T, NElts, DS, L1H, L3H>(p, __ESIMD_NS::simd_mask<1>(1),
// USM transposed block load with merge semantics: same packing logic as the
// plain overload, but old_values is bit-cast to the packed load element type
// and passed to __esimd_lsc_load_merge_stateless so a disabled predicate
// yields the old data. Name line and several lines were dropped by this
// extraction.
1084 template <
typename T,
int NElts,
lsc_data_size DS = lsc_data_size::default_size,
1086 typename FlagsT = __ESIMD_DNS::dqword_element_aligned_tag>
1087 __ESIMD_API std::enable_if_t<__ESIMD_NS::is_simd_flag_type_v<FlagsT>,
1088 __ESIMD_NS::simd<T, NElts>>
1090 __ESIMD_NS::simd<T, NElts> old_values, FlagsT flags = FlagsT{}) {
1092 detail::check_lsc_data_size<T, DS>();
1093 detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
1094 constexpr
lsc_data_size FDS = detail::finalize_data_size<T, DS>();
1096 FlagsT::template alignment<__ESIMD_DNS::__raw_t<T>>;
1098 (
Alignment >= __ESIMD_DNS::OperandSize::DWORD &&
sizeof(T) <= 4) ||
1099 (
Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
sizeof(T) > 4),
1100 "Incorrect alignment for the data type");
1101 static_assert(FDS == lsc_data_size::u16 || FDS == lsc_data_size::u8 ||
1102 FDS == lsc_data_size::u32 || FDS == lsc_data_size::u64,
1103 "Conversion data types are not supported");
1104 constexpr
int SmallIntFactor32Bit =
1105 (FDS == lsc_data_size::u16) ? 2 : (FDS == lsc_data_size::u8 ? 4 : 1);
1106 static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
1107 "Number of elements is not supported by Transposed load");
1109 constexpr
bool Use64BitData =
1110 Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
1112 (DS == lsc_data_size::default_size && NElts / SmallIntFactor32Bit > 64 &&
1113 (NElts *
sizeof(T)) % 8 == 0));
1114 constexpr
int SmallIntFactor64Bit =
1115 (FDS == lsc_data_size::u16)
1117 : (FDS == lsc_data_size::u8 ? 8
1118 : (FDS == lsc_data_size::u32 ? 2 : 1));
1119 constexpr
int SmallIntFactor =
1120 Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
1121 constexpr
int FactoredNElts = NElts / SmallIntFactor;
1122 detail::check_lsc_vector_size<FactoredNElts>();
1126 ? __ESIMD_ENS::lsc_data_size::u64
1127 : __ESIMD_ENS::lsc_data_size::u32;
1130 detail::to_lsc_vector_size<FactoredNElts>();
1131 using LoadElemT = __ESIMD_DNS::__raw_t<
1132 std::conditional_t<SmallIntFactor == 1, T,
1133 std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
1135 constexpr uint16_t _AddressScale = 1;
1136 constexpr
int _ImmOffset = 0;
1138 constexpr
auto _Transposed = detail::lsc_data_order::transpose;
1139 constexpr
int N = 1;
1141 __ESIMD_NS::simd<uintptr_t, N> Addrs =
reinterpret_cast<uintptr_t
>(p);
1142 __ESIMD_NS::simd<LoadElemT, FactoredNElts> OldVals =
1143 old_values.template bit_cast_view<LoadElemT>();
1144 __ESIMD_NS::simd<LoadElemT, FactoredNElts> Result =
1145 __esimd_lsc_load_merge_stateless<LoadElemT, L1H, L3H, _AddressScale,
1146 _ImmOffset, ActualDS, _VS, _Transposed,
1147 N>(pred.data(), Addrs.data(),
1149 return Result.template bit_cast_view<T>();
// Accessor-based transposed block load: stateless builds degrade to the USM
// overload via accessorToPointer; otherwise the same small-int packing logic
// as the USM path, ending in a BTI load (__esimd_lsc_load_bti). Name line and
// several lines were dropped by this extraction.
1188 template <
typename T,
int NElts,
lsc_data_size DS = lsc_data_size::default_size,
1190 typename AccessorTy,
1191 typename FlagsT = __ESIMD_DNS::dqword_element_aligned_tag>
1192 __ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value &&
1193 __ESIMD_NS::is_simd_flag_type_v<FlagsT>,
1194 __ESIMD_NS::simd<T, NElts>>
1196 #ifdef __ESIMD_FORCE_STATELESS_MEM
1201 __ESIMD_NS::simd_mask<1> pred = 1, FlagsT flags = FlagsT{}) {
1202 #ifdef __ESIMD_FORCE_STATELESS_MEM
1203 return lsc_block_load<T, NElts, DS, L1H, L3H>(
1204 __ESIMD_DNS::accessorToPointer<T>(acc, offset), pred, flags);
1205 #else // !__ESIMD_FORCE_STATELESS_MEM
1207 detail::check_lsc_data_size<T, DS>();
1208 detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
1211 FlagsT::template alignment<__ESIMD_DNS::__raw_t<T>>;
1213 (
Alignment >= __ESIMD_DNS::OperandSize::DWORD &&
sizeof(T) <= 4) ||
1214 (
Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
sizeof(T) > 4),
1215 "Incorrect alignment for the data type");
1217 constexpr
lsc_data_size FDS = detail::finalize_data_size<T, DS>();
1218 static_assert(FDS == lsc_data_size::u16 || FDS == lsc_data_size::u8 ||
1219 FDS == lsc_data_size::u32 || FDS == lsc_data_size::u64,
1220 "Conversion data types are not supported");
1221 constexpr
int SmallIntFactor32Bit =
1222 (FDS == lsc_data_size::u16) ? 2 : (FDS == lsc_data_size::u8 ? 4 : 1);
1223 static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
1224 "Number of elements is not supported by Transposed load");
1225 constexpr
bool Use64BitData =
1226 Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
1228 (DS == lsc_data_size::default_size && NElts / SmallIntFactor32Bit > 64 &&
1229 (NElts *
sizeof(T)) % 8 == 0));
1230 constexpr
int SmallIntFactor64Bit =
1231 (FDS == lsc_data_size::u16)
1233 : (FDS == lsc_data_size::u8 ? 8
1234 : (FDS == lsc_data_size::u32 ? 2 : 1));
1235 constexpr
int SmallIntFactor =
1236 Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
1237 constexpr
int FactoredNElts = NElts / SmallIntFactor;
1239 ? __ESIMD_ENS::lsc_data_size::u64
1240 : __ESIMD_ENS::lsc_data_size::u32;
1242 detail::check_lsc_vector_size<FactoredNElts>();
1245 using LoadElemT = __ESIMD_DNS::__raw_t<
1246 std::conditional_t<SmallIntFactor == 1, T,
1247 std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
1249 constexpr uint16_t _AddressScale = 1;
1250 constexpr
int _ImmOffset = 0;
1251 constexpr
auto _VS = detail::to_lsc_vector_size<FactoredNElts>();
1252 constexpr
auto _Transposed = detail::lsc_data_order::transpose;
1253 constexpr
int N = 1;
1255 __ESIMD_NS::simd<uint32_t, N> Offsets = offset;
1257 __ESIMD_NS::simd<LoadElemT, FactoredNElts> Result =
1258 __esimd_lsc_load_bti<LoadElemT, L1H, L3H, _AddressScale, _ImmOffset,
1259 ActualDS, _VS, _Transposed, N>(pred.data(),
1260 Offsets.data(), SI);
1261 return Result.template bit_cast_view<T>();
1262 #endif // !__ESIMD_FORCE_STATELESS_MEM
// Accessor overload without a predicate: forwards with simd_mask<1>(1).
// Name/parameter lines were dropped by this extraction.
1297 template <
typename T,
int NElts,
lsc_data_size DS = lsc_data_size::default_size,
1299 typename AccessorTy,
1300 typename FlagsT = __ESIMD_DNS::dqword_element_aligned_tag>
1301 __ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value &&
1302 __ESIMD_NS::is_simd_flag_type_v<FlagsT>,
1303 __ESIMD_NS::simd<T, NElts>>
1305 #ifdef __ESIMD_FORCE_STATELESS_MEM
1311 return lsc_block_load<T, NElts, DS, L1H, L3H>(
1312 acc, offset, __ESIMD_NS::simd_mask<1>(1), flags);
// Accessor-based transposed block load with merge semantics: stateless path
// forwards to the USM merge overload; BTI path bit-casts old_values to the
// packed element type and calls __esimd_lsc_load_merge_bti. Name line and
// several lines were dropped by this extraction.
1352 template <
typename T,
int NElts,
lsc_data_size DS = lsc_data_size::default_size,
1354 typename AccessorTy,
1355 typename FlagsT = __ESIMD_DNS::dqword_element_aligned_tag>
1356 __ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value &&
1357 __ESIMD_NS::is_simd_flag_type_v<FlagsT>,
1358 __ESIMD_NS::simd<T, NElts>>
1360 #ifdef __ESIMD_FORCE_STATELESS_MEM
1365 __ESIMD_NS::simd_mask<1> pred,
1366 __ESIMD_NS::simd<T, NElts> old_values, FlagsT flags = FlagsT{}) {
1367 #ifdef __ESIMD_FORCE_STATELESS_MEM
1368 return lsc_block_load<T, NElts, DS, L1H, L3H>(
1369 __ESIMD_DNS::accessorToPointer<T>(acc, offset), pred, old_values, flags);
1370 #else // !__ESIMD_FORCE_STATELESS_MEM
1372 detail::check_lsc_data_size<T, DS>();
1373 detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
1374 constexpr
lsc_data_size FDS = detail::finalize_data_size<T, DS>();
1376 FlagsT::template alignment<__ESIMD_DNS::__raw_t<T>>;
1378 (
Alignment >= __ESIMD_DNS::OperandSize::DWORD &&
sizeof(T) <= 4) ||
1379 (
Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
sizeof(T) > 4),
1380 "Incorrect alignment for the data type");
1381 static_assert(FDS == lsc_data_size::u16 || FDS == lsc_data_size::u8 ||
1382 FDS == lsc_data_size::u32 || FDS == lsc_data_size::u64,
1383 "Conversion data types are not supported");
1384 constexpr
int SmallIntFactor32Bit =
1385 (FDS == lsc_data_size::u16) ? 2 : (FDS == lsc_data_size::u8 ? 4 : 1);
1386 static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
1387 "Number of elements is not supported by Transposed load");
1388 constexpr
bool Use64BitData =
1389 Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
1391 (DS == lsc_data_size::default_size && NElts / SmallIntFactor32Bit > 64 &&
1392 (NElts *
sizeof(T)) % 8 == 0));
1393 constexpr
int SmallIntFactor64Bit =
1394 (FDS == lsc_data_size::u16)
1396 : (FDS == lsc_data_size::u8 ? 8
1397 : (FDS == lsc_data_size::u32 ? 2 : 1));
1398 constexpr
int SmallIntFactor =
1399 Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
1400 constexpr
int FactoredNElts = NElts / SmallIntFactor;
1402 ? __ESIMD_ENS::lsc_data_size::u64
1403 : __ESIMD_ENS::lsc_data_size::u32;
1405 detail::check_lsc_vector_size<FactoredNElts>();
1408 using LoadElemT = __ESIMD_DNS::__raw_t<
1409 std::conditional_t<SmallIntFactor == 1, T,
1410 std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
1411 constexpr uint16_t _AddressScale = 1;
1412 constexpr
int _ImmOffset = 0;
1413 constexpr
auto _VS = detail::to_lsc_vector_size<FactoredNElts>();
1414 constexpr
auto _Transposed = detail::lsc_data_order::transpose;
1415 constexpr
int N = 1;
1417 __ESIMD_NS::simd<uint32_t, N> Offsets = offset;
1419 __ESIMD_NS::simd<LoadElemT, FactoredNElts> OldVals =
1420 old_values.template bit_cast_view<LoadElemT>();
1421 __ESIMD_NS::simd<LoadElemT, FactoredNElts> Result =
1422 __esimd_lsc_load_merge_bti<LoadElemT, L1H, L3H, _AddressScale, _ImmOffset,
1423 ActualDS, _VS, _Transposed, N>(
1424 pred.data(), Offsets.data(), SI, OldVals.data());
1425 return Result.template bit_cast_view<T>();
1426 #endif // !__ESIMD_FORCE_STATELESS_MEM
// USM prefetch: per-lane prefetch of p + offsets with L1/L3 hints via
// __esimd_lsc_prefetch_stateless; no data is returned. Some setup lines and
// the final argument line were dropped by this extraction.
1445 template <
typename T,
int NElts = 1,
1448 int N,
typename Toffset>
1449 __ESIMD_API
void lsc_prefetch(
const T *p, __ESIMD_NS::simd<Toffset, N> offsets,
1450 __ESIMD_NS::simd_mask<N> pred = 1) {
1451 static_assert(std::is_integral_v<Toffset>,
"Unsupported offset type");
1452 detail::check_lsc_vector_size<NElts>();
1453 detail::check_lsc_data_size<T, DS>();
1454 detail::check_lsc_cache_hint<detail::lsc_action::prefetch, L1H, L3H>();
1455 constexpr uint16_t _AddressScale = 1;
1456 constexpr
int _ImmOffset = 0;
1461 detail::lsc_data_order::nontranspose;
1463 __ESIMD_NS::simd<uintptr_t, N> addrs =
reinterpret_cast<uintptr_t
>(p);
1464 addrs += convert<uintptr_t>(offsets);
1465 __esimd_lsc_prefetch_stateless<MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS,
1466 _VS, _Transposed, N>(pred.data(),
// simd_view overload: materializes the offsets view and forwards to the
// simd-offset lsc_prefetch. Leading "template <" and declaration lines were
// dropped by this extraction.
1471 typename T,
int NElts = 1,
lsc_data_size DS = lsc_data_size::default_size,
1473 typename Toffset,
typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>>
1475 __ESIMD_NS::simd_view<Toffset, RegionTy> offsets,
1476 __ESIMD_NS::simd_mask<N> pred = 1) {
1477 lsc_prefetch<T, NElts, DS, L1H, L3H, N>(p, offsets.read(), pred);
// Scalar-offset overload: broadcasts the integral offset into a simd and
// forwards to the vector-offset lsc_prefetch.
1480 template <
typename T,
int NElts = 1,
1483 int N,
typename Toffset>
1484 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>>
1485 lsc_prefetch(
const T *p, Toffset offset, __ESIMD_NS::simd_mask<N> pred = 1) {
1486 lsc_prefetch<T, NElts, DS, L1H, L3H, N>(
1487 p, __ESIMD_NS::simd<Toffset, N>(offset), pred);
1503 template <
typename T,
int NElts = 1,
1507 detail::check_lsc_vector_size<NElts>();
1508 detail::check_lsc_data_size<T, DS>();
1509 detail::check_lsc_cache_hint<detail::lsc_action::prefetch, L1H, L3H>();
1510 constexpr uint16_t _AddressScale = 1;
1511 constexpr
int _ImmOffset = 0;
1512 constexpr
lsc_data_size _DS = detail::finalize_data_size<T, DS>();
1515 _DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
1516 "Transposed prefetch is supported only for data size u32 or u64");
1519 detail::lsc_data_order::transpose;
1520 constexpr
int N = 1;
1521 __ESIMD_NS::simd_mask<N> pred = 1;
1523 __ESIMD_NS::simd<uintptr_t, N> addrs =
reinterpret_cast<uintptr_t
>(p);
1524 __esimd_lsc_prefetch_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
1525 _VS, _Transposed, N>(pred.data(),
1546 template <
typename T,
int NElts = 1,
1549 int N,
typename AccessorTy>
1550 __ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value>
1552 #ifdef __ESIMD_FORCE_STATELESS_MEM
1553 __ESIMD_NS::simd<uint64_t, N> offsets,
1555 __ESIMD_NS::simd<uint32_t, N> offsets,
1557 __ESIMD_NS::simd_mask<N> pred = 1) {
1558 #ifdef __ESIMD_FORCE_STATELESS_MEM
1559 return lsc_prefetch<T, NElts, DS, L1H, L3H>(
1560 __ESIMD_DNS::accessorToPointer<T>(acc), offsets, pred);
1562 detail::check_lsc_vector_size<NElts>();
1563 detail::check_lsc_data_size<T, DS>();
1564 detail::check_lsc_cache_hint<detail::lsc_action::prefetch, L1H, L3H>();
1565 constexpr uint16_t _AddressScale = 1;
1566 constexpr
int _ImmOffset = 0;
1571 detail::lsc_data_order::nontranspose;
1574 auto loc_offsets = convert<uint32_t>(offsets);
1575 __esimd_lsc_prefetch_bti<MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
1576 _Transposed, N>(pred.data(), loc_offsets.data(), si);
1580 #ifdef __ESIMD_FORCE_STATELESS_MEM
1581 template <
typename T,
int NElts = 1,
1584 int N,
typename AccessorTy,
typename Toffset>
1585 __ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value &&
1586 std::is_integral_v<Toffset> &&
1587 !std::is_same_v<Toffset, uint64_t>>
1588 lsc_prefetch(AccessorTy acc, __ESIMD_NS::simd<Toffset, N> offsets,
1589 __ESIMD_NS::simd_mask<N> pred = 1) {
1590 lsc_prefetch<T, NElts, DS, L1H, L3H, N, AccessorTy>(
1591 acc, convert<uint64_t>(offsets), pred);
1610 template <
typename T,
int NElts = 1,
1613 typename AccessorTy>
1614 __ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value>
1616 #ifdef __ESIMD_FORCE_STATELESS_MEM
1622 #ifdef __ESIMD_FORCE_STATELESS_MEM
1623 lsc_prefetch<T, NElts, DS, L1H, L3H>(
1624 __ESIMD_DNS::accessorToPointer<T>(acc, offset));
1626 detail::check_lsc_vector_size<NElts>();
1627 detail::check_lsc_data_size<T, DS>();
1628 detail::check_lsc_cache_hint<detail::lsc_action::prefetch, L1H, L3H>();
1629 constexpr uint16_t _AddressScale = 1;
1630 constexpr
int _ImmOffset = 0;
1631 constexpr
lsc_data_size _DS = detail::finalize_data_size<T, DS>();
1633 _DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
1634 "Transposed prefetch is supported only for data size u32 or u64");
1637 detail::lsc_data_order::transpose;
1638 constexpr
int N = 1;
1639 __ESIMD_NS::simd_mask<N> pred = 1;
1640 __ESIMD_NS::simd<uint32_t, N> offsets = offset;
1642 __esimd_lsc_prefetch_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
1643 _Transposed, N>(pred.data(), offsets.data(), si);
// Fragment of an SLM (shared local memory) scatter store — presumably
// lsc_slm_scatter; the function-name/signature lines were dropped by the
// extraction (TODO confirm against the full header).
// Grounded in the visible code: validates NElts and the data size, then
// stores `vals` (reinterpreted as the message type) to SLM via the
// __esimd_lsc_store_slm intrinsic. SLM accesses take no cache hints,
// hence cache_hint::none twice.
1661 template <
typename T,
int NElts = 1,
1664 __ESIMD_NS::simd<T, N * NElts> vals,
1665 __ESIMD_NS::simd_mask<N> pred = 1) {
1666 detail::check_lsc_vector_size<NElts>();
1667 detail::check_lsc_data_size<T, DS>();
// Fixed message-field values: no address scaling, no immediate offset.
1668 constexpr uint16_t _AddressScale = 1;
1669 constexpr
int _ImmOffset = 0;
1674 detail::lsc_data_order::nontranspose;
// Reinterpret the user data as the wire/message element type before the call.
1677 __ESIMD_NS::simd<MsgT, N * NElts> Tmp = vals.template bit_cast_view<CstT>();
1678 __esimd_lsc_store_slm<MsgT, cache_hint::none, cache_hint::none, _AddressScale,
1679 _ImmOffset, _DS, _VS, _Transposed, N>(
1680 pred.data(), offsets.data(), Tmp.data());
// Fragment of an SLM transposed block store (presumably lsc_slm_block_store;
// name/offset-parameter lines dropped by the extraction — TODO confirm).
// Stores NElts contiguous elements to SLM in one transposed LSC message:
// a single channel (N == 1) with an all-on predicate.
1695 template <
typename T,
int NElts, lsc_data_size DS = lsc_data_size::default_size>
1697 __ESIMD_NS::simd<T, NElts> vals) {
1698 detail::check_lsc_vector_size<NElts>();
1699 detail::check_lsc_data_size<T, DS>();
1700 constexpr uint16_t _AddressScale = 1;
1701 constexpr
int _ImmOffset = 0;
1702 constexpr
lsc_data_size _DS = detail::finalize_data_size<T, DS>();
// Transposed (block) messages only exist for 32- and 64-bit data.
1703 static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
1704 "Transposed store is supported only for data size u32 or u64");
1707 detail::lsc_data_order::transpose;
// One channel carrying the whole block; predicate is unconditionally on.
1708 constexpr
int N = 1;
1709 __ESIMD_NS::simd_mask<N> pred = 1;
1710 __ESIMD_NS::simd<uint32_t, N> offsets = offset;
1711 __esimd_lsc_store_slm<T, cache_hint::none, cache_hint::none, _AddressScale,
1712 _ImmOffset, _DS, _VS, _Transposed, N>(
1713 pred.data(), offsets.data(), vals.data());
// USM scatter store: writes N*NElts elements of `vals` to per-lane addresses
// p + offsets[i], masked by `pred`.
// NOTE(review): lossy extraction — template lines for DS/L1H/L3H, the
// _DS/_VS/MsgT/_CstT declarations, and the call's final argument plus the
// closing brace were dropped; code left byte-identical.
1733 template <
typename T,
int NElts = 1,
1736 int N,
typename Toffset>
1737 __ESIMD_API
void lsc_scatter(T *p, __ESIMD_NS::simd<Toffset, N> offsets,
1738 __ESIMD_NS::simd<T, N * NElts> vals,
1739 __ESIMD_NS::simd_mask<N> pred = 1) {
1740 static_assert(std::is_integral_v<Toffset>,
"Unsupported offset type");
// Compile-time validation of vector size, data size and cache hints.
1741 detail::check_lsc_vector_size<NElts>();
1742 detail::check_lsc_data_size<T, DS>();
1743 detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
1744 constexpr uint16_t _AddressScale = 1;
1745 constexpr
int _ImmOffset = 0;
1750 detail::lsc_data_order::nontranspose;
// Reinterpret payload as the message element type, then form absolute
// per-lane addresses: base pointer plus byte offsets.
1753 __ESIMD_NS::simd<MsgT, N * NElts> Tmp = vals.template bit_cast_view<_CstT>();
1754 __ESIMD_NS::simd<uintptr_t, N> addrs =
reinterpret_cast<uintptr_t
>(p);
1755 addrs += convert<uintptr_t>(offsets);
1756 __esimd_lsc_store_stateless<MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS,
1757 _VS, _Transposed, N>(pred.data(), addrs.data(),
// simd_view-offset overload of lsc_scatter: materializes the view with
// .read() and forwards to the simd-offset overload. (The leading
// `template <` line was dropped by the extraction.)
1762 typename T,
int NElts = 1,
lsc_data_size DS = lsc_data_size::default_size,
1764 typename Toffset,
typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>>
1766 __ESIMD_NS::simd_view<Toffset, RegionTy> offsets,
1767 __ESIMD_NS::simd<T, N * NElts> vals,
1768 __ESIMD_NS::simd_mask<N> pred = 1) {
1769 lsc_scatter<T, NElts, DS, L1H, L3H, N>(p, offsets.read(), vals, pred);
// Scalar-offset overload of lsc_scatter (enabled only for N == 1):
// splats the integral offset into a 1-wide simd and forwards to the
// vector-offset overload. DS/L1H/L3H template lines and the closing
// brace were dropped by the extraction; code left byte-identical.
1772 template <
typename T,
int NElts = 1,
1775 int N,
typename Toffset>
1776 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> && N == 1>
1777 lsc_scatter(T *p, Toffset offset, __ESIMD_NS::simd<T, N * NElts> vals,
1778 __ESIMD_NS::simd_mask<N> pred = 1) {
1779 lsc_scatter<T, NElts, DS, L1H, L3H, N>(
1780 p, __ESIMD_NS::simd<Toffset, N>(offset), vals, pred);
1801 template <
typename T,
int NElts = 1,
1804 int N,
typename AccessorTy>
1805 __ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value>
1807 #ifdef __ESIMD_FORCE_STATELESS_MEM
1808 __ESIMD_NS::simd<uint64_t, N> offsets,
1810 __ESIMD_NS::simd<uint32_t, N> offsets,
1812 __ESIMD_NS::simd<T, N * NElts> vals,
1813 __ESIMD_NS::simd_mask<N> pred = 1) {
1814 #ifdef __ESIMD_FORCE_STATELESS_MEM
1815 lsc_scatter<T, NElts, DS, L1H, L3H>(__ESIMD_DNS::accessorToPointer<T>(acc),
1816 offsets, vals, pred);
1818 detail::check_lsc_vector_size<NElts>();
1819 detail::check_lsc_data_size<T, DS>();
1820 detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
1821 constexpr uint16_t _AddressScale = 1;
1822 constexpr
int _ImmOffset = 0;
1827 detail::lsc_data_order::nontranspose;
1830 __ESIMD_NS::simd<MsgT, N * NElts> Tmp = vals.template bit_cast_view<_CstT>();
1832 auto loc_offsets = convert<uint32_t>(offsets);
1833 __esimd_lsc_store_bti<MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
1834 _Transposed, N>(pred.data(), loc_offsets.data(),
1839 #ifdef __ESIMD_FORCE_STATELESS_MEM
1840 template <
typename T,
int NElts = 1,
1843 int N,
typename AccessorTy,
typename Toffset>
1844 __ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value &&
1845 std::is_integral_v<Toffset> &&
1846 !std::is_same_v<Toffset, uint64_t>>
1847 lsc_scatter(AccessorTy acc, __ESIMD_NS::simd<Toffset, N> offsets,
1848 __ESIMD_NS::simd<T, N * NElts> vals,
1849 __ESIMD_NS::simd_mask<N> pred = 1) {
1850 lsc_scatter<T, NElts, DS, L1H, L3H, N, AccessorTy>(
1851 acc, convert<uint64_t>(offsets), vals, pred);
1854 template <
typename T,
int NElts,
lsc_data_size DS = lsc_data_size::default_size,
1888 typename FlagsT = __ESIMD_DNS::dqword_element_aligned_tag>
1889 __ESIMD_API std::enable_if_t<__ESIMD_NS::is_simd_flag_type_v<FlagsT>>
1891 __ESIMD_NS::simd_mask<1> pred = 1, FlagsT flags = FlagsT{}) {
1892 detail::check_lsc_data_size<T, DS>();
1893 detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
1895 FlagsT::template alignment<__ESIMD_DNS::__raw_t<T>>;
1897 (
Alignment >= __ESIMD_DNS::OperandSize::DWORD &&
sizeof(T) <= 4) ||
1898 (
Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
sizeof(T) > 4),
1899 "Incorrect alignment for the data type");
1902 constexpr uint16_t _AddressScale = 1;
1903 constexpr
int _ImmOffset = 0;
1904 constexpr
lsc_data_size _DS = detail::finalize_data_size<T, DS>();
1905 static_assert(_DS == lsc_data_size::u16 || _DS == lsc_data_size::u8 ||
1906 _DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
1907 "Conversion data types are not supported");
1909 detail::lsc_data_order::transpose;
1910 constexpr
int N = 1;
1911 __ESIMD_NS::simd<uintptr_t, N> Addrs =
reinterpret_cast<uintptr_t
>(p);
1913 constexpr
int SmallIntFactor32Bit =
1914 (_DS == lsc_data_size::u16) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1);
1915 static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
1916 "Number of elements is not supported by Transposed store");
1918 constexpr
bool Use64BitData =
1919 Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
1921 (DS == lsc_data_size::default_size && NElts / SmallIntFactor32Bit > 64 &&
1922 (NElts *
sizeof(T)) % 8 == 0));
1923 constexpr
int SmallIntFactor64Bit =
1924 (_DS == lsc_data_size::u16)
1926 : (_DS == lsc_data_size::u8 ? 8
1927 : (_DS == lsc_data_size::u32 ? 2 : 1));
1928 constexpr
int SmallIntFactor =
1929 Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
1930 constexpr
int FactoredNElts = NElts / SmallIntFactor;
1932 ? __ESIMD_ENS::lsc_data_size::u64
1933 : __ESIMD_ENS::lsc_data_size::u32;
1935 detail::check_lsc_vector_size<FactoredNElts>();
1937 detail::to_lsc_vector_size<FactoredNElts>();
1939 using StoreType = __ESIMD_DNS::__raw_t<
1940 std::conditional_t<SmallIntFactor == 1, T,
1941 std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
1943 __esimd_lsc_store_stateless<StoreType, L1H, L3H, _AddressScale, _ImmOffset,
1944 ActualDS, _VS, _Transposed, N>(
1945 pred.data(), Addrs.data(),
1946 sycl::bit_cast<__ESIMD_DNS::vector_type_t<StoreType, FactoredNElts>>(
// Predicate-less convenience overload of lsc_block_store (USM pointer):
// forwards with an all-on simd_mask<1>. The signature line and trailing
// `flags)` / closing brace were dropped by the extraction.
1978 template <
typename T,
int NElts,
lsc_data_size DS = lsc_data_size::default_size,
1980 typename FlagsT = __ESIMD_DNS::dqword_element_aligned_tag>
1981 __ESIMD_API std::enable_if_t<__ESIMD_NS::is_simd_flag_type_v<FlagsT>>
1983 lsc_block_store<T, NElts, DS, L1H, L3H>(p, vals, __ESIMD_NS::simd_mask<1>(1),
2021 template <
typename T,
int NElts,
lsc_data_size DS = lsc_data_size::default_size,
2023 typename AccessorTy,
2024 typename FlagsT = __ESIMD_DNS::dqword_element_aligned_tag>
2025 __ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value &&
2026 __ESIMD_NS::is_simd_flag_type_v<FlagsT>>
2028 #ifdef __ESIMD_FORCE_STATELESS_MEM
2033 __ESIMD_NS::simd<T, NElts> vals,
2034 __ESIMD_NS::simd_mask<1> pred = 1, FlagsT flags = FlagsT{}) {
2035 #ifdef __ESIMD_FORCE_STATELESS_MEM
2036 lsc_block_store<T, NElts, DS, L1H, L3H>(
2037 __ESIMD_DNS::accessorToPointer<T>(acc, offset), vals, pred, flags);
2039 detail::check_lsc_data_size<T, DS>();
2040 detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
2042 FlagsT::template alignment<__ESIMD_DNS::__raw_t<T>>;
2044 (
Alignment >= __ESIMD_DNS::OperandSize::DWORD &&
sizeof(T) <= 4) ||
2045 (
Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
sizeof(T) > 4),
2046 "Incorrect alignment for the data type");
2048 constexpr uint16_t _AddressScale = 1;
2049 constexpr
int _ImmOffset = 0;
2050 constexpr
lsc_data_size _DS = detail::finalize_data_size<T, DS>();
2051 static_assert(_DS == lsc_data_size::u16 || _DS == lsc_data_size::u8 ||
2052 _DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
2053 "Conversion data types are not supported");
2055 detail::lsc_data_order::transpose;
2056 constexpr
int N = 1;
2058 __ESIMD_NS::simd<uint32_t, N> Offsets = offset;
2061 constexpr
int SmallIntFactor32Bit =
2062 (_DS == lsc_data_size::u16) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1);
2063 static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
2064 "Number of elements is not supported by Transposed store");
2066 constexpr
bool Use64BitData =
2067 Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
2069 (DS == lsc_data_size::default_size && NElts / SmallIntFactor32Bit > 64 &&
2070 (NElts *
sizeof(T)) % 8 == 0));
2071 constexpr
int SmallIntFactor64Bit =
2072 (_DS == lsc_data_size::u16)
2074 : (_DS == lsc_data_size::u8 ? 8
2075 : (_DS == lsc_data_size::u32 ? 2 : 1));
2076 constexpr
int SmallIntFactor =
2077 Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
2078 constexpr
int FactoredNElts = NElts / SmallIntFactor;
2080 ? __ESIMD_ENS::lsc_data_size::u64
2081 : __ESIMD_ENS::lsc_data_size::u32;
2083 detail::check_lsc_vector_size<FactoredNElts>();
2085 detail::to_lsc_vector_size<FactoredNElts>();
2087 using StoreType = __ESIMD_DNS::__raw_t<
2088 std::conditional_t<SmallIntFactor == 1, T,
2089 std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
2091 __esimd_lsc_store_bti<StoreType, L1H, L3H, _AddressScale, _ImmOffset,
2092 ActualDS, _VS, _Transposed, N>(
2093 pred.data(), Offsets.data(),
2094 sycl::bit_cast<__ESIMD_DNS::vector_type_t<StoreType, FactoredNElts>>(
// Accessor-based lsc_block_store overload taking explicit `flags` but no
// predicate: forwards with an all-on simd_mask<1>. Only the common
// (accessor, offset) parameter lines survive the extraction; the
// stateless/stateful offset declarations under the #ifdef were dropped.
2130 template <
typename T,
int NElts,
lsc_data_size DS = lsc_data_size::default_size,
2132 typename AccessorTy,
2133 typename FlagsT = __ESIMD_DNS::dqword_element_aligned_tag>
2134 __ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value &&
2135 __ESIMD_NS::is_simd_flag_type_v<FlagsT>>
2137 #ifdef __ESIMD_FORCE_STATELESS_MEM
2142 __ESIMD_NS::simd<T, NElts> vals, FlagsT flags) {
2143 lsc_block_store<T, NElts, DS, L1H, L3H>(acc, offset, vals,
2144 __ESIMD_NS::simd_mask<1>(1), flags);
2149 template <
typename T,
int BlockWidth,
int BlockHeight,
int NBlocks,
2150 bool Transposed,
bool Transformed,
bool IsStore =
false>
2152 constexpr
int GRFByteSize = BlockWidth * BlockHeight * NBlocks *
sizeof(T);
2153 static_assert(BlockWidth > 0,
"Block width must be positive");
2154 static_assert(BlockHeight > 0,
"Block height must be positive");
2156 static_assert(!IsStore || GRFByteSize <= 512,
2157 "2D store supports 512 bytes max");
2158 static_assert(IsStore || GRFByteSize <= 2048,
2159 "2D load supports 2048 bytes max");
2160 static_assert(!Transposed || !Transformed,
2161 "Transposed and transformed is not supported");
2162 static_assert((
sizeof(T) * BlockWidth) % 4 == 0,
2163 "Block width must be aligned by DW");
2164 if constexpr (Transposed) {
2165 static_assert(NBlocks == 1,
"Transposed expected to be 1 block only");
2166 static_assert(
sizeof(T) == 4 ||
sizeof(T) == 8,
2167 "Transposed load is supported only for data size u32 or u64");
2168 static_assert(
sizeof(T) == 8 ? BlockHeight == 8
2169 : BlockHeight >= 1 && BlockHeight <= 32,
2170 "Unsupported block height");
2172 : BlockWidth >= 1 && BlockWidth <= 8,
2173 "Unsupported block width");
2174 }
else if constexpr (Transformed) {
2175 static_assert(
sizeof(T) == 1 ||
sizeof(T) == 2,
2176 "VNNI transform is supported only for data size u8 or u16");
2178 "Unsupported number of blocks");
2179 static_assert(BlockHeight *
sizeof(T) >= 4 && BlockHeight <= 32,
2180 "Unsupported block height");
2181 static_assert(BlockWidth *
sizeof(T) >= 4 && BlockWidth <= 16 &&
2182 BlockWidth * NBlocks *
sizeof(T) <= 64,
2183 "Unsupported block width");
2187 "Unsupported number of blocks");
2188 if constexpr (IsStore)
2189 static_assert(BlockHeight <= 8,
"Unsupported block height for store");
2191 static_assert(BlockHeight <= 32,
"Unsupported block height for load");
2192 static_assert(BlockWidth *
sizeof(T) >= 4 &&
2193 BlockWidth * NBlocks *
sizeof(T) <= 64,
2194 "Unsupported block width");
2230 template <
typename T,
int BlockWidth,
int BlockHeight = 1,
int NBlocks = 1,
2231 bool Transposed =
false,
bool Transformed =
false,
2234 T, NBlocks, BlockHeight, BlockWidth, Transposed, Transformed>()>
2235 __ESIMD_API __ESIMD_NS::simd<T, N>
2236 lsc_load_2d(
const T *Ptr,
unsigned SurfaceWidth,
unsigned SurfaceHeight,
2237 unsigned SurfacePitch,
int X,
int Y) {
2238 detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
2240 Transposed, Transformed>();
2246 constexpr
int ElemsPerDword = 4 /
sizeof(T);
2247 constexpr
int GRFRowSize = Transposed ? BlockHeight
2248 : Transformed ? BlockWidth * ElemsPerDword
2250 constexpr
int GRFRowPitch = __ESIMD_DNS::getNextPowerOf2<GRFRowSize>();
2251 constexpr
int GRFColSize =
2254 : (Transformed ? (BlockHeight + ElemsPerDword - 1) / ElemsPerDword
2256 constexpr
int GRFBlockSize = GRFRowPitch * GRFColSize;
2257 constexpr
int GRFBlockPitch =
2258 detail::roundUpNextMultiple<64 /
sizeof(T), GRFBlockSize>();
2259 constexpr
int ActualN = NBlocks * GRFBlockPitch;
2261 constexpr
int DstBlockElements = GRFColSize * GRFRowSize;
2262 constexpr
int DstElements = DstBlockElements * NBlocks;
2264 static_assert(N == ActualN || N == DstElements,
"Incorrect element count");
2267 detail::finalize_data_size<T, lsc_data_size::default_size>();
2268 __ESIMD_NS::simd_mask<ActualN> pred = 1;
2269 uintptr_t surf_addr =
reinterpret_cast<uintptr_t
>(Ptr);
2271 Transposed ? detail::lsc_data_order::transpose
2272 : detail::lsc_data_order::nontranspose;
2273 __ESIMD_NS::simd<T, ActualN> Raw =
2274 __esimd_lsc_load2d_stateless<T, L1H, L3H, DS, _Transposed, NBlocks,
2275 BlockWidth, BlockHeight, Transformed,
2276 ActualN>(pred.data(), surf_addr,
2277 SurfaceWidth, SurfaceHeight,
2278 SurfacePitch, X, Y);
2280 if constexpr (ActualN == N) {
2300 __ESIMD_NS::simd<T, DstElements> Dst;
2302 for (
auto i = 0; i < NBlocks; i++) {
2304 Dst.template select<DstBlockElements, 1>(i * DstBlockElements);
2306 auto RawBlock = Raw.template select<GRFBlockSize, 1>(i * GRFBlockPitch);
2307 DstBlock = RawBlock.template bit_cast_view<T, GRFColSize, GRFRowPitch>()
2308 .template select<GRFColSize, 1, GRFRowSize, 1>(0, 0)
2309 .template bit_cast_view<T>();
2316 template <
typename T,
int BlockWidth,
int BlockHeight = 1,
int NBlocks = 1,
2317 bool Transposed =
false,
bool Transformed =
false,
2320 T, NBlocks, BlockHeight, BlockWidth, Transposed, Transformed>()>
2323 unsigned SurfaceWidth,
2324 unsigned SurfaceHeight,
2325 unsigned SurfacePitch,
int X,
2327 return lsc_load_2d<T, BlockWidth, BlockHeight, NBlocks, Transposed,
2328 Transformed, L1H, L3H>(Ptr, SurfaceWidth, SurfaceHeight,
2329 SurfacePitch, X, Y);
2354 template <
typename T,
int BlockWidth,
int BlockHeight = 1,
int NBlocks = 1,
2357 T, NBlocks, BlockHeight, BlockWidth,
false,
false>()>
2359 unsigned SurfaceHeight,
unsigned SurfacePitch,
2361 detail::check_lsc_cache_hint<detail::lsc_action::prefetch, L1H, L3H>();
2365 detail::finalize_data_size<T, lsc_data_size::default_size>();
2366 __ESIMD_NS::simd_mask<N> pred = 1;
2367 uintptr_t surf_addr =
reinterpret_cast<uintptr_t
>(Ptr);
2369 detail::lsc_data_order::nontranspose;
2370 __esimd_lsc_prefetch2d_stateless<T, L1H, L3H, DS, _Transposed, NBlocks,
2371 BlockWidth, BlockHeight,
false, N>(
2372 pred.data(), surf_addr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
// Thin forwarding overload of lsc_prefetch_2d: delegates to the primary
// 2D-prefetch implementation with the same block geometry and cache hints.
// NOTE(review): extraction dropped the cache-hint template parameters, the
// N = get_lsc_block_2d_data_size<...> default (its argument list is the
// `T, NBlocks, ...` fragment below), and the function signature line.
2375 template <
typename T,
int BlockWidth,
int BlockHeight = 1,
int NBlocks = 1,
2378 T, NBlocks, BlockHeight, BlockWidth,
false,
false>()>
2381 unsigned SurfaceHeight,
unsigned SurfacePitch,
2383 lsc_prefetch_2d<T, BlockWidth, BlockHeight, NBlocks, L1H, L3H>(
2384 Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
2410 template <
typename T,
int BlockWidth,
int BlockHeight = 1,
2413 T, 1u, BlockHeight, BlockWidth,
false,
false>()>
2415 unsigned SurfaceHeight,
unsigned SurfacePitch,
2416 int X,
int Y, __ESIMD_NS::simd<T, N> Vals) {
2417 detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
2421 detail::finalize_data_size<T, lsc_data_size::default_size>();
2422 uintptr_t surf_addr =
reinterpret_cast<uintptr_t
>(Ptr);
2424 detail::lsc_data_order::nontranspose;
2426 constexpr
int Pitch = __ESIMD_DNS::getNextPowerOf2<BlockWidth>();
2427 __ESIMD_NS::simd<T, BlockHeight * Pitch> Raw;
2429 if constexpr (BlockHeight * Pitch == N) {
2434 auto Data2D = Vals.template bit_cast_view<T, BlockHeight, BlockWidth>();
2435 auto Raw2D = Raw.template bit_cast_view<T, BlockHeight, Pitch>();
2436 Raw2D.template select<BlockHeight, 1, BlockWidth, 1>(0, 0) = Data2D;
2439 __ESIMD_NS::simd_mask<BlockHeight * Pitch> pred = 1;
2440 __esimd_lsc_store2d_stateless<T, L1H, L3H, DS, _Transposed, 1u, BlockWidth,
2441 BlockHeight,
false, BlockHeight * Pitch>(
2442 pred.data(), surf_addr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y,
// Thin forwarding overload of lsc_store_2d: delegates to the primary 2D
// store with the same block geometry (single block — note the `1u` in the
// dropped get_lsc_block_2d_data_size<...> default) and cache hints.
// Signature line was dropped by the extraction; code left byte-identical.
2446 template <
typename T,
int BlockWidth,
int BlockHeight = 1,
2449 T, 1u, BlockHeight, BlockWidth,
false,
false>()>
2452 unsigned SurfaceHeight,
unsigned SurfacePitch,
2453 int X,
int Y, __ESIMD_NS::simd<T, N> Vals) {
2454 lsc_store_2d<T, BlockWidth, BlockHeight, L1H, L3H>(
2455 Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y, Vals);
2465 template <
typename T,
int BlockW
idth,
int BlockHeight,
int NBlocks>
2472 payload_data.template select<1, 1>(7) =
2473 ((NBlocks - 1) << 16) | ((BlockHeight - 1) << 8) | (BlockWidth - 1);
2480 : payload_data(other.payload) {}
2494 uint32_t SurfaceHeight, uint32_t SurfacePitch, int32_t X,
2497 payload_data.template bit_cast_view<uint64_t>().template select<1, 1>(0) =
2499 payload_data.template select<1, 1>(2) = SurfaceWidth;
2500 payload_data.template select<1, 1>(3) = SurfaceHeight;
2501 payload_data.template select<1, 1>(4) = SurfacePitch;
2502 payload_data.template select<1, 1>(5) = X;
2503 payload_data.template select<1, 1>(6) = Y;
2513 ->payload_data.template bit_cast_view<uint64_t>()[0]));
2522 ->payload_data.template select<1, 1>(2);
2531 ->payload_data.template select<1, 1>(3);
2540 ->payload_data.template select<1, 1>(4);
2549 ->payload_data.template select<1, 1>(5);
2558 ->payload_data.template select<1, 1>(6);
2585 payload_data.template bit_cast_view<uint64_t>().template select<1, 1>(0) =
2596 payload_data.template select<1, 1>(2) = SurfaceWidth;
2606 payload_data.template select<1, 1>(3) = SurfaceHeight;
2616 payload_data.template select<1, 1>(4) = SurfacePitch;
2626 payload_data.template select<1, 1>(5) = X;
2636 payload_data.template select<1, 1>(6) = Y;
2641 __ESIMD_NS::simd<uint32_t, 16> get_raw_data() {
return payload_data; }
2642 __ESIMD_NS::simd<uint32_t, 16> payload_data;
2644 template <
typename T1,
int BlockWidth1,
int BlockHeight1,
int NBlocks1,
2647 friend ESIMD_INLINE SYCL_ESIMD_FUNCTION __ESIMD_NS::simd<T1, N>
lsc_load_2d(
2648 config_2d_mem_access<T1, BlockWidth1, BlockHeight1, NBlocks1> &payload);
2650 template <
typename T1,
int BlockWidth1,
int BlockHeight1,
int NBlocks1,
2652 friend ESIMD_INLINE SYCL_ESIMD_FUNCTION
void lsc_store_2d(
2653 config_2d_mem_access<T1, BlockWidth1, BlockHeight1, NBlocks1> &payload,
2654 __ESIMD_NS::simd<T1, N> Data);
2656 template <
typename T1,
int BlockWidth1,
int BlockHeight1,
int NBlocks1,
2660 config_2d_mem_access<T1, BlockWidth1, BlockHeight1, NBlocks1> &payload);
2682 template <
typename T,
int BlockWidth,
int BlockHeight = 1,
int NBlocks = 1,
2683 bool Transposed =
false,
bool Transformed =
false,
2686 T, NBlocks, BlockHeight, BlockWidth, Transposed, Transformed>()>
2690 Transposed, Transformed,
false>();
2691 detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
2692 constexpr
int ElemsPerDword = 4 /
sizeof(T);
2693 constexpr
int GRFRowSize = Transposed ? BlockHeight
2694 : Transformed ? BlockWidth * ElemsPerDword
2696 constexpr
int GRFRowPitch = __ESIMD_DNS::getNextPowerOf2<GRFRowSize>();
2697 constexpr
int GRFColSize =
2700 : (Transformed ? (BlockHeight + ElemsPerDword - 1) / ElemsPerDword
2702 constexpr
int GRFBlockSize = GRFRowPitch * GRFColSize;
2703 constexpr
int GRFBlockPitch =
2704 detail::roundUpNextMultiple<64 /
sizeof(T), GRFBlockSize>();
2705 constexpr
int ActualN = NBlocks * GRFBlockPitch;
2707 constexpr
int DstBlockElements = GRFColSize * GRFRowSize;
2708 constexpr
int DstElements = DstBlockElements * NBlocks;
2710 static_assert(N == ActualN || N == DstElements,
"Incorrect element count");
2712 constexpr uint32_t cache_mask = detail::get_lsc_load_cache_mask<L1H, L3H>()
2714 constexpr uint32_t base_desc = 0x2800403;
2715 constexpr uint32_t transformMask = Transformed ? 1 << 7 : 0;
2716 constexpr uint32_t transposeMask = Transposed ? 1 << 15 : 0;
2717 __ESIMD_NS::simd<T, N> oldDst;
2718 constexpr uint32_t exDesc = 0x0;
2719 constexpr uint32_t desc =
2720 base_desc | cache_mask | transformMask | transposeMask;
2721 constexpr uint8_t execSize = 0x0;
2722 constexpr uint8_t sfid = 0xF;
2723 constexpr uint8_t numSrc0 = 0x1;
2724 constexpr uint8_t numDst = (N *
sizeof(T)) / 64;
2725 __ESIMD_NS::simd<T, ActualN> Raw =
2726 raw_send(oldDst, payload.get_raw_data(), exDesc, desc, execSize, sfid,
2729 if constexpr (ActualN == N) {
2735 __ESIMD_NS::simd<T, DstElements> Dst;
2737 for (
auto i = 0; i < NBlocks; i++) {
2739 Dst.template select<DstBlockElements, 1>(i * DstBlockElements);
2741 auto RawBlock = Raw.template select<GRFBlockSize, 1>(i * GRFBlockPitch);
2742 DstBlock = RawBlock.template bit_cast_view<T, GRFColSize, GRFRowPitch>()
2743 .template select<GRFColSize, 1, GRFRowSize, 1>(0, 0)
2744 .template bit_cast_view<T>();
2767 template <
typename T,
int BlockWidth,
int BlockHeight = 1,
int NBlocks = 1,
2768 bool Transposed =
false,
bool Transformed =
false,
2771 T, NBlocks, BlockHeight, BlockWidth, Transposed, Transformed>()>
2774 detail::check_lsc_cache_hint<detail::lsc_action::prefetch, L1H, L3H>();
2776 Transposed, Transformed,
false>();
2777 static_assert(!Transposed || !Transformed,
2778 "Transposed and transformed is not supported");
2779 constexpr uint32_t cache_mask = detail::get_lsc_load_cache_mask<L1H, L3H>()
2781 constexpr uint32_t base_desc = 0x2000403;
2782 constexpr uint32_t transformMask = Transformed ? 1 << 7 : 0;
2783 constexpr uint32_t transposeMask = Transposed ? 1 << 15 : 0;
2784 constexpr uint32_t exDesc = 0x0;
2785 constexpr uint32_t desc =
2786 base_desc | cache_mask | transformMask | transposeMask;
2787 constexpr uint8_t execSize = 0x0;
2788 constexpr uint8_t sfid = 0xF;
2789 constexpr uint8_t numDst = (N *
sizeof(T)) / 64;
2790 raw_send(payload.get_raw_data(), exDesc, desc, execSize, sfid, numDst);
// Payload-based 2D block store: builds a raw LSC send descriptor by hand
// (base_desc 0x2000407 OR'ed with the store cache-hint mask) and issues it
// via raw_sends() with the config_2d_mem_access payload as src0 and `Data`
// as src1. sfid 0xF and numSrc1 = total-bytes/64 (GRF count) are visible in
// the code; the payload-parameter line and trailing arguments/brace were
// dropped by the extraction — TODO confirm against the full header.
2808 template <
typename T,
int BlockWidth,
int BlockHeight = 1,
int NBlocks = 1,
2811 T, NBlocks, BlockHeight, BlockWidth,
false,
false>()>
2812 ESIMD_INLINE SYCL_ESIMD_FUNCTION
void
2814 __ESIMD_NS::simd<T, N> Data) {
// 2D-block restriction check: IsStore == true (the trailing `true` below).
2816 false,
false,
true>();
2817 detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
2819 constexpr uint32_t cache_mask = detail::get_lsc_store_cache_mask<L1H, L3H>()
2821 constexpr uint32_t base_desc = 0x2000407;
2823 constexpr uint32_t exDesc = 0x0;
2824 constexpr uint32_t desc = base_desc | cache_mask;
2825 constexpr uint8_t execSize = 0x0;
2826 constexpr uint8_t sfid = 0xF;
2827 constexpr uint8_t numSrc0 = 0x1;
// Number of 64-byte registers occupied by the data payload.
2828 constexpr uint8_t numSrc1 = (N *
sizeof(T)) / 64;
2830 raw_sends(payload.get_raw_data(), Data, exDesc, desc, execSize, sfid, numSrc0,
// Fragment of the zero-source SLM atomic update (presumably
// lsc_slm_atomic_update for ops like inc/dec — the template header and
// name line were dropped by the extraction; TODO confirm).
// Grounded in the visible code: restricted to 16/32-bit element types,
// vector size 1 per lane, atomic-op arity checked as 0, no cache hints
// (SLM), result converted back to T via lsc_format_ret.
2849 __ESIMD_API __ESIMD_NS::simd<T, N>
2851 __ESIMD_NS::simd_mask<N> pred) {
2852 static_assert(
sizeof(T) == 2 ||
sizeof(T) == 4,
"Unsupported data type");
2853 __ESIMD_EDNS::check_lsc_vector_size<1>();
2854 __ESIMD_EDNS::check_lsc_data_size<T, DS>();
2856 __ESIMD_DNS::to_lsc_atomic_op<Op>();
// Arity 0: this overload carries no source operands.
2857 __ESIMD_EDNS::check_lsc_atomic<_Op, T, N, 0>();
2858 constexpr uint16_t _AddressScale = 1;
2859 constexpr
int _ImmOffset = 0;
2864 detail::lsc_data_order::nontranspose;
2866 __ESIMD_NS::simd<MsgT, N> Tmp =
2867 __esimd_lsc_xatomic_slm_0<MsgT, _Op, cache_hint::none, cache_hint::none,
2868 _AddressScale, _ImmOffset, _DS, _VS,
2869 _Transposed, N>(pred.data(), offsets.data());
2870 return detail::lsc_format_ret<T>(Tmp);
2889 __ESIMD_API __ESIMD_NS::simd<T, N>
2891 __ESIMD_NS::simd<T, N> src0,
2892 __ESIMD_NS::simd_mask<N> pred) {
2893 static_assert(Op != __ESIMD_NS::atomic_op::fadd &&
2894 Op != __ESIMD_NS::atomic_op::fsub,
2895 "fadd and fsub are not supported for slm.");
2896 static_assert(
sizeof(T) == 2 ||
sizeof(T) == 4,
"Unsupported data type");
2897 detail::check_lsc_vector_size<1>();
2898 detail::check_lsc_data_size<T, DS>();
2900 __ESIMD_DNS::to_lsc_atomic_op<Op>();
2901 __ESIMD_EDNS::check_lsc_atomic<_Op, T, N, 1>();
2902 constexpr uint16_t _AddressScale = 1;
2903 constexpr
int _ImmOffset = 0;
2908 detail::lsc_data_order::nontranspose;
2910 __ESIMD_NS::simd<MsgT, N> Msg_data = detail::lsc_format_input<MsgT>(src0);
2911 __ESIMD_NS::simd<MsgT, N> Tmp =
2912 __esimd_lsc_xatomic_slm_1<MsgT, _Op, cache_hint::none, cache_hint::none,
2913 _AddressScale, _ImmOffset, _DS, _VS,
2914 _Transposed, N>(pred.data(), offsets.data(),
2916 return detail::lsc_format_ret<T>(Tmp);
2936 __ESIMD_API __ESIMD_NS::simd<T, N>
2938 __ESIMD_NS::simd<T, N> src0, __ESIMD_NS::simd<T, N> src1,
2939 __ESIMD_NS::simd_mask<N> pred) {
2940 static_assert(
sizeof(T) == 2 ||
sizeof(T) == 4 ||
2941 (Op == __ESIMD_NS::atomic_op::cmpxchg &&
sizeof(T) == 8),
2942 "Unsupported data type");
2943 detail::check_lsc_vector_size<1>();
2944 detail::check_lsc_data_size<T, DS>();
2946 __ESIMD_DNS::to_lsc_atomic_op<Op>();
2947 __ESIMD_EDNS::check_lsc_atomic<_Op, T, N, 2>();
2948 constexpr uint16_t _AddressScale = 1;
2949 constexpr
int _ImmOffset = 0;
2954 detail::lsc_data_order::nontranspose;
2956 __ESIMD_NS::simd<MsgT, N> Msg_data0 = detail::lsc_format_input<MsgT>(src0);
2957 __ESIMD_NS::simd<MsgT, N> Msg_data1 = detail::lsc_format_input<MsgT>(src1);
2958 __ESIMD_NS::simd<MsgT, N> Tmp =
2959 __esimd_lsc_xatomic_slm_2<MsgT, _Op, cache_hint::none, cache_hint::none,
2960 _AddressScale, _ImmOffset, _DS, _VS,
2962 pred.data(), offsets.data(), Msg_data0.data(), Msg_data1.data());
2963 return detail::lsc_format_ret<T>(Tmp);
// Zero-source USM atomic update (lsc_atomic_update, arity 0 — e.g.
// increment-style ops, per the get_num_args == 0 constraint below).
// Computes per-lane absolute addresses p + offsets[i], performs the
// atomic via the stateless intrinsic, and converts the returned message
// data back to T. Template header lines (Op, T, N, DS, L1H, L3H, Toffset)
// and the _DS/_VS/MsgT declarations were dropped by the extraction.
2984 __ESIMD_API std::enable_if_t<
2985 __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op<Op>()>() == 0,
2986 __ESIMD_NS::simd<T, N>>
2988 __ESIMD_NS::simd_mask<N> pred) {
2989 static_assert(std::is_integral_v<Toffset>,
"Unsupported offset type");
// 8-bit element types are not supported by LSC atomics.
2990 static_assert(
sizeof(T) > 1,
"Unsupported data type");
2991 detail::check_lsc_vector_size<1>();
2992 detail::check_lsc_data_size<T, DS>();
2994 __ESIMD_DNS::to_lsc_atomic_op<Op>();
2995 __ESIMD_EDNS::check_lsc_atomic<_Op, T, N, 0>();
2996 detail::check_lsc_cache_hint<detail::lsc_action::atomic, L1H, L3H>();
2997 constexpr uint16_t _AddressScale = 1;
2998 constexpr
int _ImmOffset = 0;
3003 detail::lsc_data_order::nontranspose;
// Absolute per-lane addresses: base pointer plus byte offsets.
3005 __ESIMD_NS::simd<uintptr_t, N> addrs =
reinterpret_cast<uintptr_t
>(p);
3006 addrs += convert<uintptr_t>(offsets);
3007 __ESIMD_NS::simd<MsgT, N> Tmp =
3008 __esimd_lsc_xatomic_stateless_0<MsgT, _Op, L1H, L3H, _AddressScale,
3009 _ImmOffset, _DS, _VS, _Transposed, N>(
3010 pred.data(), addrs.data());
3011 return detail::lsc_format_ret<T>(Tmp);
// simd_view-offset overload of the zero-source lsc_atomic_update:
// materializes the view with .read() and forwards. (The leading
// `template <` and earlier parameter lines were dropped by the extraction.)
3018 typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>>
3019 __ESIMD_API std::enable_if_t<
3020 __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op<Op>()>() == 0,
3021 __ESIMD_NS::simd<T, N>>
3023 __ESIMD_NS::simd_mask<N> pred = 1) {
3024 return lsc_atomic_update<Op, T, N, DS, L1H, L3H>(p, offsets.read(), pred);
// Scalar-offset overload of the zero-source lsc_atomic_update: splats the
// integral offset into an N-wide simd and forwards. (Template header,
// signature line, and closing brace were dropped by the extraction.)
3031 __ESIMD_API std::enable_if_t<
3032 std::is_integral_v<Toffset> &&
3033 __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op<Op>()>() == 0,
3034 __ESIMD_NS::simd<T, N>>
3036 return lsc_atomic_update<Op, T, N, DS, L1H, L3H>(
3037 p, __ESIMD_NS::simd<Toffset, N>(offset), pred);
3059 __ESIMD_API std::enable_if_t<
3060 __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op<Op>()>() == 1,
3061 __ESIMD_NS::simd<T, N>>
3063 __ESIMD_NS::simd<T, N> src0, __ESIMD_NS::simd_mask<N> pred) {
3064 static_assert(std::is_integral_v<Toffset>,
"Unsupported offset type");
3065 static_assert(
sizeof(T) > 1,
"Unsupported data type");
3066 detail::check_lsc_vector_size<1>();
3067 detail::check_lsc_data_size<T, DS>();
3069 __ESIMD_DNS::to_lsc_atomic_op<Op>();
3070 __ESIMD_EDNS::check_lsc_atomic<_Op, T, N, 1>();
3071 detail::check_lsc_cache_hint<detail::lsc_action::atomic, L1H, L3H>();
3072 constexpr uint16_t _AddressScale = 1;
3073 constexpr
int _ImmOffset = 0;
3078 detail::lsc_data_order::nontranspose;
3080 __ESIMD_NS::simd<MsgT, N> Msg_data = detail::lsc_format_input<MsgT>(src0);
3081 __ESIMD_NS::simd<uintptr_t, N> addrs =
reinterpret_cast<uintptr_t
>(p);
3082 addrs += convert<uintptr_t>(offsets);
3083 __ESIMD_NS::simd<MsgT, N> Tmp =
3084 __esimd_lsc_xatomic_stateless_1<MsgT, _Op, L1H, L3H, _AddressScale,
3085 _ImmOffset, _DS, _VS, _Transposed, N>(
3086 pred.data(), addrs.data(), Msg_data.data());
3087 return detail::lsc_format_ret<T>(Tmp);
3094 typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>>
3095 __ESIMD_API std::enable_if_t<
3096 __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op<Op>()>() == 1,
3097 __ESIMD_NS::simd<T, N>>
3099 __ESIMD_NS::simd<T, N> src0,
3100 __ESIMD_NS::simd_mask<N> pred = 1) {
3101 return lsc_atomic_update<Op, T, N, DS, L1H, L3H>(p, offsets.read(), src0,
3109 __ESIMD_API std::enable_if_t<
3110 std::is_integral_v<Toffset> &&
3111 __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op<Op>()>() == 1 &&
3112 ((Op != __ESIMD_NS::atomic_op::store &&
3113 Op != __ESIMD_NS::atomic_op::xchg) ||
3115 __ESIMD_NS::simd<T, N>>
3117 __ESIMD_NS::simd_mask<N> pred = 1) {
3118 return lsc_atomic_update<Op, T, N, DS, L1H, L3H>(
3119 p, __ESIMD_NS::simd<Toffset, N>(offset), src0, pred);
3141 __ESIMD_API std::enable_if_t<
3142 __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op<Op>()>() == 2,
3143 __ESIMD_NS::simd<T, N>>
3145 __ESIMD_NS::simd<T, N> src0, __ESIMD_NS::simd<T, N> src1,
3146 __ESIMD_NS::simd_mask<N> pred) {
3147 static_assert(std::is_integral_v<Toffset>,
"Unsupported offset type");
3148 static_assert(
sizeof(T) > 1,
"Unsupported data type");
3149 detail::check_lsc_vector_size<1>();
3150 detail::check_lsc_data_size<T, DS>();
3152 __ESIMD_DNS::to_lsc_atomic_op<Op>();
3153 __ESIMD_EDNS::check_lsc_atomic<_Op, T, N, 2>();
3154 detail::check_lsc_cache_hint<detail::lsc_action::atomic, L1H, L3H>();
3155 constexpr uint16_t _AddressScale = 1;
3156 constexpr
int _ImmOffset = 0;
3161 detail::lsc_data_order::nontranspose;
3163 __ESIMD_NS::simd<MsgT, N> Msg_data0 = detail::lsc_format_input<MsgT>(src0);
3164 __ESIMD_NS::simd<MsgT, N> Msg_data1 = detail::lsc_format_input<MsgT>(src1);
3165 __ESIMD_NS::simd<uintptr_t, N> addrs =
reinterpret_cast<uintptr_t
>(p);
3166 addrs += convert<uintptr_t>(offsets);
3167 __ESIMD_NS::simd<MsgT, N> Tmp =
3168 __esimd_lsc_xatomic_stateless_2<MsgT, _Op, L1H, L3H, _AddressScale,
3169 _ImmOffset, _DS, _VS, _Transposed, N>(
3170 pred.data(), addrs.data(), Msg_data0.data(), Msg_data1.data());
3171 return detail::lsc_format_ret<T>(Tmp);
3178 typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>>
3179 __ESIMD_API std::enable_if_t<
3180 __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op<Op>()>() == 2,
3181 __ESIMD_NS::simd<T, N>>
3183 __ESIMD_NS::simd<T, N> src0, __ESIMD_NS::simd<T, N> src1,
3184 __ESIMD_NS::simd_mask<N> pred = 1) {
3185 return lsc_atomic_update<Op, T, N, DS, L1H, L3H>(p, offsets.read(), src0,
3193 __ESIMD_API std::enable_if_t<
3194 std::is_integral_v<Toffset> &&
3195 __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op<Op>()>() == 2,
3196 __ESIMD_NS::simd<T, N>>
3198 __ESIMD_NS::simd<T, N> src1,
3199 __ESIMD_NS::simd_mask<N> pred = 1) {
3200 return lsc_atomic_update<Op, T, N, DS, L1H, L3H>(
3201 p, __ESIMD_NS::simd<Toffset, N>(offset), src0, src1, pred);
3224 typename AccessorTy,
typename Toffset>
3225 __ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value,
3226 __ESIMD_NS::simd<T, N>>
3228 __ESIMD_NS::simd_mask<N> pred) {
3229 #ifdef __ESIMD_FORCE_STATELESS_MEM
3230 return lsc_atomic_update<Op, T, N, DS, L1H, L3H>(
3231 __ESIMD_DNS::accessorToPointer<T>(acc), offsets, pred);
3233 static_assert(
sizeof(T) > 1,
"Unsupported data type");
3234 static_assert(std::is_integral_v<Toffset> &&
sizeof(Toffset) == 4,
3235 "Unsupported offset type");
3236 detail::check_lsc_vector_size<1>();
3237 detail::check_lsc_data_size<T, DS>();
3239 __ESIMD_DNS::to_lsc_atomic_op<Op>();
3240 __ESIMD_EDNS::check_lsc_atomic<_Op, T, N, 0>();
3241 detail::check_lsc_cache_hint<detail::lsc_action::atomic, L1H, L3H>();
3242 constexpr uint16_t _AddressScale = 1;
3243 constexpr
int _ImmOffset = 0;
3248 detail::lsc_data_order::nontranspose;
3251 __ESIMD_NS::simd<MsgT, N> Tmp =
3252 __esimd_lsc_xatomic_bti_0<MsgT, _Op, L1H, L3H, _AddressScale, _ImmOffset,
3253 _DS, _VS, _Transposed, N>(pred.data(),
3254 offsets.data(), si);
3255 return detail::lsc_format_ret<T>(Tmp);
3280 typename AccessorTy,
typename Toffset>
3281 __ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value,
3282 __ESIMD_NS::simd<T, N>>
3284 __ESIMD_NS::simd<T, N> src0, __ESIMD_NS::simd_mask<N> pred) {
3285 #ifdef __ESIMD_FORCE_STATELESS_MEM
3286 return lsc_atomic_update<Op, T, N, DS, L1H, L3H>(
3287 __ESIMD_DNS::accessorToPointer<T>(acc), offsets, src0, pred);
3289 static_assert(
sizeof(T) > 1,
"Unsupported data type");
3290 static_assert(std::is_integral_v<Toffset> &&
sizeof(Toffset) == 4,
3291 "Unsupported offset type");
3292 detail::check_lsc_vector_size<1>();
3293 detail::check_lsc_data_size<T, DS>();
3295 __ESIMD_DNS::to_lsc_atomic_op<Op>();
3296 __ESIMD_EDNS::check_lsc_atomic<_Op, T, N, 1>();
3297 detail::check_lsc_cache_hint<detail::lsc_action::atomic, L1H, L3H>();
3298 constexpr uint16_t _AddressScale = 1;
3299 constexpr
int _ImmOffset = 0;
3304 detail::lsc_data_order::nontranspose;
3306 __ESIMD_NS::simd<MsgT, N> Msg_data = detail::lsc_format_input<MsgT>(src0);
3308 __ESIMD_NS::simd<MsgT, N> Tmp =
3309 __esimd_lsc_xatomic_bti_1<MsgT, _Op, L1H, L3H, _AddressScale, _ImmOffset,
3310 _DS, _VS, _Transposed, N>(
3311 pred.data(), offsets.data(), Msg_data.data(), si);
3312 return detail::lsc_format_ret<T>(Tmp);
3338 typename AccessorTy,
typename Toffset>
3339 __ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value,
3340 __ESIMD_NS::simd<T, N>>
3342 __ESIMD_NS::simd<T, N> src0, __ESIMD_NS::simd<T, N> src1,
3343 __ESIMD_NS::simd_mask<N> pred) {
3344 #ifdef __ESIMD_FORCE_STATELESS_MEM
3345 return lsc_atomic_update<Op, T, N, DS, L1H, L3H>(
3346 __ESIMD_DNS::accessorToPointer<T>(acc), offsets, src0, src1, pred);
3348 static_assert(std::is_integral_v<Toffset> &&
sizeof(Toffset) == 4,
3349 "Unsupported offset type");
3350 detail::check_lsc_vector_size<1>();
3351 detail::check_lsc_data_size<T, DS>();
3353 __ESIMD_DNS::to_lsc_atomic_op<Op>();
3354 __ESIMD_EDNS::check_lsc_atomic<_Op, T, N, 2>();
3355 detail::check_lsc_cache_hint<detail::lsc_action::atomic, L1H, L3H>();
3356 constexpr uint16_t _AddressScale = 1;
3357 constexpr
int _ImmOffset = 0;
3362 detail::lsc_data_order::nontranspose;
3364 __ESIMD_NS::simd<MsgT, N> Msg_data0 = detail::lsc_format_input<MsgT>(src0);
3365 __ESIMD_NS::simd<MsgT, N> Msg_data1 = detail::lsc_format_input<MsgT>(src1);
3367 __ESIMD_NS::simd<MsgT, N> Tmp =
3368 __esimd_lsc_xatomic_bti_2<MsgT, _Op, L1H, L3H, _AddressScale, _ImmOffset,
3369 _DS, _VS, _Transposed, N>(
3370 pred.data(), offsets.data(), Msg_data0.data(), Msg_data1.data(), si);
3371 return detail::lsc_format_ret<T>(Tmp);
3385 lsc_scope Scope = lsc_scope::group,
int N = 16>
3386 __ESIMD_API
void lsc_fence(__ESIMD_NS::simd_mask<N> pred = 1) {
3388 Kind != lsc_memory_kind::shared_local ||
3389 (FenceOp == lsc_fence_op::none && Scope == lsc_scope::group),
3390 "SLM fence must have 'none' lsc_fence_op and 'group' scope");
3391 __esimd_lsc_fence<Kind, FenceOp, Scope, N>(pred.data());
3404 #ifdef __SYCL_DEVICE_ONLY__
3405 return __spirv_BuiltInGlobalHWThreadIDINTEL();
3408 #endif // __SYCL_DEVICE_ONLY__
3412 #ifdef __SYCL_DEVICE_ONLY__
3413 return __spirv_BuiltInSubDeviceIDINTEL();
3428 template <native::lsc::atomic_op Op,
typename T,
int N,
typename Toffset>
3429 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3430 __ESIMD_DNS::get_num_args<Op>() == 0,
3433 return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3438 typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>>
3439 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3440 __ESIMD_DNS::get_num_args<Op>() == 0,
3444 return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3448 template <native::lsc::atomic_op Op,
typename T,
int N,
typename Toffset>
3449 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3450 __ESIMD_DNS::get_num_args<Op>() == 0,
3453 return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3458 template <native::lsc::atomic_op Op,
typename T,
int N,
typename Toffset>
3460 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3461 __ESIMD_DNS::get_num_args<Op>() == 1,
3465 return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3466 p, offset, src0, mask);
3470 typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>>
3472 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3473 __ESIMD_DNS::get_num_args<Op>() == 1,
3477 return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3478 p, offsets, src0, mask);
3481 template <native::lsc::atomic_op Op,
typename T,
int N,
typename Toffset>
3482 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3483 __ESIMD_DNS::get_num_args<Op>() == 1,
3486 return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3487 p, offset, src0, mask);
3491 template <native::lsc::atomic_op Op,
typename T,
int N,
typename Toffset>
3492 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3493 __ESIMD_DNS::get_num_args<Op>() == 2,
3500 return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3501 p, offset, src1, src0, mask);
3505 typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>>
3506 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3507 __ESIMD_DNS::get_num_args<Op>() == 2,
3511 return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3512 p, offsets, src1, src0, mask);
3515 template <native::lsc::atomic_op Op,
typename T,
int N,
typename Toffset>
3516 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3517 __ESIMD_DNS::get_num_args<Op>() == 2,
3518 __ESIMD_NS::simd<T, N>>
3521 return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3522 p, offset, src1, src0, mask);
3526 typename AccessorTy>
3527 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3528 __ESIMD_DNS::get_num_args<Op>() == 0 &&
3529 !std::is_pointer<AccessorTy>::value,
3532 return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3537 typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>,
3538 typename AccessorTy>
3539 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3540 __ESIMD_DNS::get_num_args<Op>() == 0 &&
3541 !std::is_pointer<AccessorTy>::value,
3545 return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3546 acc, offsets, mask);
3550 typename AccessorTy>
3551 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3552 __ESIMD_DNS::get_num_args<Op>() == 0 &&
3553 !std::is_pointer<AccessorTy>::value,
3556 return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3562 typename AccessorTy>
3564 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3565 __ESIMD_DNS::get_num_args<Op>() == 1 &&
3566 !std::is_pointer<AccessorTy>::value,
3570 return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3571 acc, offset, src0, mask);
3575 typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>,
3576 typename AccessorTy>
3578 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3579 __ESIMD_DNS::get_num_args<Op>() == 1 &&
3580 !std::is_pointer<AccessorTy>::value,
3584 return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3585 acc, offsets, src0, mask);
3589 typename AccessorTy>
3590 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3591 __ESIMD_DNS::get_num_args<Op>() == 1 &&
3592 !std::is_pointer<AccessorTy>::value,
3596 return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3597 acc, offset, src0, mask);
3602 typename AccessorTy>
3603 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3604 __ESIMD_DNS::get_num_args<Op>() == 2 &&
3605 !std::is_pointer<AccessorTy>::value,
3612 return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3613 acc, offset, src1, src0, mask);
3617 typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>,
3618 typename AccessorTy>
3619 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3620 __ESIMD_DNS::get_num_args<Op>() == 2 &&
3621 !std::is_pointer<AccessorTy>::value,
3625 return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3626 acc, offsets, src1, src0, mask);
3630 typename AccessorTy>
3631 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3632 __ESIMD_DNS::get_num_args<Op>() == 2 &&
3633 !std::is_pointer<AccessorTy>::value,
3634 __ESIMD_NS::simd<T, N>>
3637 return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3638 acc, offset, src1, src0, mask);