DPC++ Runtime
Runtime libraries for oneAPI DPC++
memory.hpp
//==-------------- memory.hpp - DPC++ Explicit SIMD API --------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Implement experimental Explicit SIMD memory-access APIs.
//===----------------------------------------------------------------------===//

#pragma once

#include <sycl/ext/intel/experimental/esimd/common.hpp>
#include <sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp>
#include <sycl/ext/intel/experimental/esimd/detail/util.hpp>

__SYCL_INLINE_NAMESPACE(cl) {
namespace __ESIMD_ENS {

#define __ESIMD_GET_SURF_HANDLE(acc) __ESIMD_NS::get_surface_index(acc)

template <split_barrier_action flag> __ESIMD_API void split_barrier() {
  __esimd_sbarrier(flag);
}

__SYCL_DEPRECATED("use split_barrier<split_barrier_action>()")
__ESIMD_API void split_barrier(split_barrier_action flag) {
  __esimd_sbarrier(flag);
}
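
// A minimal usage sketch for the split barrier: signal arrival first, overlap
// some independent work, and block on the wait only when needed.
//   split_barrier<split_barrier_action::signal>();
//   /* ... independent work ... */
//   split_barrier<split_barrier_action::wait>();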

// sycl_esimd_raw_send intrinsics are not available when stateless memory
// accesses are enforced.
#ifndef __ESIMD_FORCE_STATELESS_MEM

template <typename T1, int n1, typename T2, int n2, typename T3, int n3,
          int N = 16>
__ESIMD_API __ESIMD_NS::simd<T1, n1> raw_sends_load(
    __ESIMD_NS::simd<T1, n1> msgDst, __ESIMD_NS::simd<T2, n2> msgSrc0,
    __ESIMD_NS::simd<T3, n3> msgSrc1, uint32_t exDesc, uint32_t msgDesc,
    uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t numSrc1,
    uint8_t numDst, uint8_t isEOT = 0, uint8_t isSendc = 0,
    __ESIMD_NS::simd_mask<N> mask = 1) {
  constexpr unsigned _Width1 = n1 * sizeof(T1);
  static_assert(_Width1 % 32 == 0, "Invalid size for raw send rspVar");
  constexpr unsigned _Width2 = n2 * sizeof(T2);
  static_assert(_Width2 % 32 == 0, "Invalid size for raw send msgSrc0");
  constexpr unsigned _Width3 = n3 * sizeof(T3);
  static_assert(_Width3 % 32 == 0, "Invalid size for raw send msgSrc1");

  uint8_t modifier = ((isEOT & 0x1) << 1) | (isSendc & 0x1);
  return __esimd_raw_sends2<T1, n1, T2, n2, T3, n3, N>(
      modifier, execSize, mask.data(), numSrc0, numSrc1, numDst, sfid, exDesc,
      msgDesc, msgSrc0.data(), msgSrc1.data(), msgDst.data());
}

template <typename T1, int n1, typename T2, int n2, int N = 16>
__ESIMD_API __ESIMD_NS::simd<T1, n1>
raw_send_load(__ESIMD_NS::simd<T1, n1> msgDst, __ESIMD_NS::simd<T2, n2> msgSrc0,
              uint32_t exDesc, uint32_t msgDesc, uint8_t execSize, uint8_t sfid,
              uint8_t numSrc0, uint8_t numDst, uint8_t isEOT = 0,
              uint8_t isSendc = 0, __ESIMD_NS::simd_mask<N> mask = 1) {
  constexpr unsigned _Width1 = n1 * sizeof(T1);
  static_assert(_Width1 % 32 == 0, "Invalid size for raw send rspVar");
  constexpr unsigned _Width2 = n2 * sizeof(T2);
  static_assert(_Width2 % 32 == 0, "Invalid size for raw send msgSrc0");

  uint8_t modifier = ((isEOT & 0x1) << 1) | (isSendc & 0x1);
  return __esimd_raw_send2<T1, n1, T2, n2, N>(
      modifier, execSize, mask.data(), numSrc0, numDst, sfid, exDesc, msgDesc,
      msgSrc0.data(), msgDst.data());
}

template <typename T1, int n1, typename T2, int n2, int N = 16>
__ESIMD_API void
raw_sends_store(__ESIMD_NS::simd<T1, n1> msgSrc0,
                __ESIMD_NS::simd<T2, n2> msgSrc1, uint32_t exDesc,
                uint32_t msgDesc, uint8_t execSize, uint8_t sfid,
                uint8_t numSrc0, uint8_t numSrc1, uint8_t isEOT = 0,
                uint8_t isSendc = 0, __ESIMD_NS::simd_mask<N> mask = 1) {
  constexpr unsigned _Width1 = n1 * sizeof(T1);
  static_assert(_Width1 % 32 == 0, "Invalid size for raw send msgSrc0");
  constexpr unsigned _Width2 = n2 * sizeof(T2);
  static_assert(_Width2 % 32 == 0, "Invalid size for raw send msgSrc1");

  uint8_t modifier = ((isEOT & 0x1) << 1) | (isSendc & 0x1);
  __esimd_raw_sends2_noresult<T1, n1, T2, n2, N>(
      modifier, execSize, mask.data(), numSrc0, numSrc1, sfid, exDesc, msgDesc,
      msgSrc0.data(), msgSrc1.data());
}

template <typename T1, int n1, int N = 16>
__ESIMD_API void raw_send_store(__ESIMD_NS::simd<T1, n1> msgSrc0,
                                uint32_t exDesc, uint32_t msgDesc,
                                uint8_t execSize, uint8_t sfid, uint8_t numSrc0,
                                uint8_t isEOT = 0, uint8_t isSendc = 0,
                                __ESIMD_NS::simd_mask<N> mask = 1) {
  constexpr unsigned _Width1 = n1 * sizeof(T1);
  static_assert(_Width1 % 32 == 0, "Invalid size for raw send msgSrc0");

  uint8_t modifier = ((isEOT & 0x1) << 1) | (isSendc & 0x1);
  __esimd_raw_send2_noresult<T1, n1, N>(modifier, execSize, mask.data(),
                                        numSrc0, sfid, exDesc, msgDesc,
                                        msgSrc0.data());
}
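
// In all four raw-send wrappers above, the EOT and sendc flags are packed into
// one modifier byte as ((isEOT & 1) << 1) | (isSendc & 1); e.g. isEOT = 1,
// isSendc = 0 gives modifier 0b10. The exDesc/msgDesc descriptors are passed
// to the hardware untouched, so their encoding is entirely the caller's
// responsibility.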

#endif // !__ESIMD_FORCE_STATELESS_MEM

__ESIMD_API void named_barrier_wait(uint8_t id) {
  __esimd_nbarrier(0 /*wait*/, id, 0 /*thread count*/);
}

template <uint8_t NbarCount> __ESIMD_API void named_barrier_init() {
  __esimd_nbarrier_init(NbarCount);
}

__ESIMD_API void named_barrier_signal(uint8_t barrier_id,
                                      uint8_t producer_consumer_mode,
                                      uint32_t num_producers,
                                      uint32_t num_consumers) {
  constexpr uint32_t gateway = 3;
  constexpr uint32_t barrier = 4;
  constexpr uint32_t descriptor = 1 << 25 | // Message length: 1 register
                                  0 << 12 | // Fence Data Ports: No fence
                                  barrier;  // Barrier subfunction

  __ESIMD_DNS::vector_type_t<uint32_t, 8> payload = 0;
  payload[2] = (num_consumers & 0xff) << 24 | (num_producers & 0xff) << 16 |
               producer_consumer_mode << 14 | (barrier_id & 0b11111) << 0;

  __esimd_raw_send_nbarrier_signal<uint32_t, 8>(
      0 /*sendc*/, gateway, descriptor, payload, 1 /*pred*/);
}
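
// Usage sketch, assuming mode 0 means the calling thread is both producer and
// consumer and 'NumThreads' threads participate (hypothetical name):
//   named_barrier_init<1>();                        // reserve one barrier
//   named_barrier_signal(0, 0, NumThreads, NumThreads);
//   named_barrier_wait(0);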

namespace detail {
// Compute the data size for 2d block load or store.
template <typename T, int NBlocks, int Height, int Width, bool Transposed,
          bool Transformed>
constexpr int get_lsc_block_2d_data_size() {
  if (Transformed)
    return detail::roundUpNextMultiple<Height, 4 / sizeof(T)>() *
           __ESIMD_DNS::getNextPowerOf2<Width>() * NBlocks;
  return Width * Height * NBlocks;
}
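
// E.g. T = uint8_t, NBlocks = 2, Height = 7, Width = 5, Transformed = true:
// Height rounds up to a multiple of 4 / sizeof(T) = 4, giving 8; Width rounds
// up to the next power of two, giving 8; total is 8 * 8 * 2 = 128 elements.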

// Format u8u32 and u16u32 back to u8 and u16.
template <typename T, typename T1, int N>
ESIMD_INLINE __ESIMD_NS::simd<T, N>
lsc_format_ret(__ESIMD_NS::simd<T1, N> Vals) {
  auto Formatted = Vals.template bit_cast_view<T>();
  constexpr int Stride = Formatted.length / N;
  return Formatted.template select<N, Stride>(0);
}
} // namespace detail

template <typename T, uint8_t NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size, int N>
__ESIMD_API __ESIMD_NS::simd<T, N * NElts>
lsc_slm_gather(__ESIMD_NS::simd<uint32_t, N> offsets,
               __ESIMD_NS::simd_mask<N> pred = 1) {
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  using _MsgT = typename detail::lsc_expand_type<T>::type;
  __ESIMD_NS::simd<_MsgT, N * NElts> Tmp =
      __esimd_lsc_load_slm<_MsgT, cache_hint::none, cache_hint::none,
                           _AddressScale, _ImmOffset, _DS, _VS, _Transposed, N>(
          pred.data(), offsets.data());
  return detail::lsc_format_ret<T>(Tmp);
}
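
// Usage sketch (kernel code, SLM assumed initialized): gather one dword per
// lane for 32 lanes, where 'offsets' holds per-lane byte offsets into SLM.
//   __ESIMD_NS::simd<uint32_t, 32> offsets(0, sizeof(float));
//   __ESIMD_NS::simd<float, 32> v = lsc_slm_gather<float>(offsets);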

template <typename T, uint8_t NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size>
__ESIMD_API __ESIMD_NS::simd<T, NElts> lsc_slm_block_load(uint32_t offset) {
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
  static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
                "Transposed load is supported only for data size u32 or u64");
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::transpose;
  constexpr int N = 1;
  __ESIMD_NS::simd_mask<N> pred = 1;
  __ESIMD_NS::simd<uint32_t, N> offsets = offset;
  return __esimd_lsc_load_slm<T, cache_hint::none, cache_hint::none,
                              _AddressScale, _ImmOffset, _DS, _VS, _Transposed,
                              N>(pred.data(), offsets.data());
}
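
// Usage sketch: load 16 consecutive dwords starting at SLM byte offset 0.
//   __ESIMD_NS::simd<float, 16> blk = lsc_slm_block_load<float, 16>(0);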

template <typename T, uint8_t NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          int N, typename AccessorTy>
__ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value,
                             __ESIMD_NS::simd<T, N * NElts>>
lsc_gather(AccessorTy acc, __ESIMD_NS::simd<uint32_t, N> offsets,
           __ESIMD_NS::simd_mask<N> pred = 1) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  return lsc_gather<T, NElts, DS, L1H, L3H>(
      __ESIMD_DNS::accessorToPointer<T>(acc), offsets, pred);
#else
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  using _MsgT = typename detail::lsc_expand_type<T>::type;
  auto si = __ESIMD_GET_SURF_HANDLE(acc);
  __ESIMD_NS::simd<_MsgT, N * NElts> Tmp =
      __esimd_lsc_load_bti<_MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
                           _Transposed, N>(pred.data(), offsets.data(), si);
  return detail::lsc_format_ret<T>(Tmp);
#endif
}
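
// Usage sketch ('acc' assumed to be a global buffer accessor): gather 4
// elements per lane over 16 lanes, so each lane reads 16 bytes.
//   __ESIMD_NS::simd<uint32_t, 16> offsets(0, 4 * sizeof(int));
//   __ESIMD_NS::simd<int, 64> v = lsc_gather<int, 4>(acc, offsets);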

template <typename T, uint8_t NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          typename AccessorTy>
__ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value,
                             __ESIMD_NS::simd<T, NElts>>
lsc_block_load(AccessorTy acc, uint32_t offset) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  return lsc_block_load<T, NElts, DS, L1H, L3H>(
      __ESIMD_DNS::accessorToPointer<T>(acc, offset));
#else
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
  static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
                "Transposed load is supported only for data size u32 or u64");
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::transpose;
  constexpr int N = 1;
  __ESIMD_NS::simd_mask<N> pred = 1;
  __ESIMD_NS::simd<uint32_t, N> offsets = offset;
  auto si = __ESIMD_GET_SURF_HANDLE(acc);
  return __esimd_lsc_load_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
                              _Transposed, N>(pred.data(), offsets.data(), si);
#endif
}

template <typename T, uint8_t NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          int N>
__ESIMD_API __ESIMD_NS::simd<T, N * NElts>
lsc_gather(const T *p, __ESIMD_NS::simd<uint32_t, N> offsets,
           __ESIMD_NS::simd_mask<N> pred = 1) {
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  using _MsgT = typename detail::lsc_expand_type<T>::type;
  __ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
  addrs += convert<uintptr_t>(offsets);
  __ESIMD_NS::simd<_MsgT, N * NElts> Tmp =
      __esimd_lsc_load_stateless<_MsgT, L1H, L3H, _AddressScale, _ImmOffset,
                                 _DS, _VS, _Transposed, N>(pred.data(),
                                                           addrs.data());
  return detail::lsc_format_ret<T>(Tmp);
}
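
// Usage sketch ('ptr' is an assumed USM allocation): same shape as the
// accessor overload, but addressed via a raw pointer plus byte offsets.
//   __ESIMD_NS::simd<uint32_t, 16> offsets(0, sizeof(float));
//   __ESIMD_NS::simd<float, 16> v = lsc_gather<float>(ptr, offsets);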

template <typename T, uint8_t NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
__ESIMD_API __ESIMD_NS::simd<T, NElts> lsc_block_load(const T *p) {
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
  static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
                "Transposed load is supported only for data size u32 or u64");
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::transpose;
  constexpr int N = 1;
  __ESIMD_NS::simd_mask<N> pred = 1;
  __ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
  return __esimd_lsc_load_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
                                    _VS, _Transposed, N>(pred.data(),
                                                         addrs.data());
}
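
// Usage sketch: load 8 consecutive dwords from an assumed USM pointer 'ptr',
// asking for caching in both L1 and L3.
//   auto blk = lsc_block_load<float, 8, lsc_data_size::default_size,
//                             cache_hint::cached, cache_hint::cached>(ptr);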

template <typename T, uint8_t NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          int N, typename AccessorTy>
__ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value>
lsc_prefetch(AccessorTy acc, __ESIMD_NS::simd<uint32_t, N> offsets,
             __ESIMD_NS::simd_mask<N> pred = 1) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  return lsc_prefetch<T, NElts, DS, L1H, L3H>(
      __ESIMD_DNS::accessorToPointer<T>(acc), offsets, pred);
#else
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_cache_hint<detail::lsc_action::prefetch, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  using _MsgT = typename detail::lsc_expand_type<T>::type;
  auto si = __ESIMD_GET_SURF_HANDLE(acc);
  __esimd_lsc_prefetch_bti<_MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
                           _Transposed, N>(pred.data(), offsets.data(), si);
#endif
}

template <typename T, uint8_t NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          typename AccessorTy>
__ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value>
lsc_prefetch(AccessorTy acc, uint32_t offset) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  lsc_prefetch<T, NElts, DS, L1H, L3H>(
      __ESIMD_DNS::accessorToPointer<T>(acc, offset));
#else
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_cache_hint<detail::lsc_action::prefetch, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
  static_assert(
      _DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
      "Transposed prefetch is supported only for data size u32 or u64");
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::transpose;
  constexpr int N = 1;
  __ESIMD_NS::simd_mask<N> pred = 1;
  __ESIMD_NS::simd<uint32_t, N> offsets = offset;
  auto si = __ESIMD_GET_SURF_HANDLE(acc);
  __esimd_lsc_prefetch_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
                           _Transposed, N>(pred.data(), offsets.data(), si);
#endif
}

template <typename T, uint8_t NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          int N>
__ESIMD_API void lsc_prefetch(const T *p, __ESIMD_NS::simd<uint32_t, N> offsets,
                              __ESIMD_NS::simd_mask<N> pred = 1) {
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_cache_hint<detail::lsc_action::prefetch, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  using _MsgT = typename detail::lsc_expand_type<T>::type;
  __ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
  addrs += convert<uintptr_t>(offsets);
  __esimd_lsc_prefetch_stateless<_MsgT, L1H, L3H, _AddressScale, _ImmOffset,
                                 _DS, _VS, _Transposed, N>(pred.data(),
                                                           addrs.data());
}

template <typename T, uint8_t NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
__ESIMD_API void lsc_prefetch(const T *p) {
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_cache_hint<detail::lsc_action::prefetch, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
  static_assert(
      _DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
      "Transposed prefetch is supported only for data size u32 or u64");
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::transpose;
  constexpr int N = 1;
  __ESIMD_NS::simd_mask<N> pred = 1;
  __ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
  __esimd_lsc_prefetch_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
                                 _VS, _Transposed, N>(pred.data(),
                                                      addrs.data());
}
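
// Usage sketch: warm the caches ahead of a later lsc_block_load from the same
// assumed pointer; hints are spelled out since prefetching with no caching
// would be pointless.
//   lsc_prefetch<float, 8, lsc_data_size::default_size, cache_hint::cached,
//                cache_hint::cached>(ptr);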

template <typename T, uint8_t NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size, int N>
__ESIMD_API void lsc_slm_scatter(__ESIMD_NS::simd<uint32_t, N> offsets,
                                 __ESIMD_NS::simd<T, N * NElts> vals,
                                 __ESIMD_NS::simd_mask<N> pred = 1) {
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  using _MsgT = typename detail::lsc_expand_type<T>::type;
  using _CstT = typename detail::lsc_bitcast_type<T>::type;
  __ESIMD_NS::simd<_MsgT, N * NElts> Tmp = vals.template bit_cast_view<_CstT>();
  __esimd_lsc_store_slm<_MsgT, cache_hint::none, cache_hint::none,
                        _AddressScale, _ImmOffset, _DS, _VS, _Transposed, N>(
      pred.data(), offsets.data(), Tmp.data());
}
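
// Usage sketch: the store-side counterpart of lsc_slm_gather, one dword per
// lane at the given SLM byte offsets.
//   __ESIMD_NS::simd<uint32_t, 16> offsets(0, sizeof(int));
//   __ESIMD_NS::simd<int, 16> vals(0, 1); // 0, 1, 2, ...
//   lsc_slm_scatter<int>(offsets, vals);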

template <typename T, uint8_t NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size>
__ESIMD_API void lsc_slm_block_store(uint32_t offset,
                                     __ESIMD_NS::simd<T, NElts> vals) {
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
  static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
                "Transposed store is supported only for data size u32 or u64");
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::transpose;
  constexpr int N = 1;
  __ESIMD_NS::simd_mask<N> pred = 1;
  __ESIMD_NS::simd<uint32_t, N> offsets = offset;
  __esimd_lsc_store_slm<T, cache_hint::none, cache_hint::none, _AddressScale,
                        _ImmOffset, _DS, _VS, _Transposed, N>(
      pred.data(), offsets.data(), vals.data());
}

template <typename T, uint8_t NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          int N, typename AccessorTy>
__ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value>
lsc_scatter(AccessorTy acc, __ESIMD_NS::simd<uint32_t, N> offsets,
            __ESIMD_NS::simd<T, N * NElts> vals,
            __ESIMD_NS::simd_mask<N> pred = 1) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  lsc_scatter<T, NElts, DS, L1H, L3H>(__ESIMD_DNS::accessorToPointer<T>(acc),
                                      offsets, vals, pred);
#else
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  using _MsgT = typename detail::lsc_expand_type<T>::type;
  using _CstT = typename detail::lsc_bitcast_type<T>::type;
  __ESIMD_NS::simd<_MsgT, N * NElts> Tmp = vals.template bit_cast_view<_CstT>();
  auto si = __ESIMD_GET_SURF_HANDLE(acc);
  __esimd_lsc_store_bti<_MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
                        _Transposed, N>(pred.data(), offsets.data(), Tmp.data(),
                                        si);
#endif
}

template <typename T, uint8_t NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          typename AccessorTy>
__ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value>
lsc_block_store(AccessorTy acc, uint32_t offset,
                __ESIMD_NS::simd<T, NElts> vals) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  lsc_block_store<T, NElts, DS, L1H, L3H>(
      __ESIMD_DNS::accessorToPointer<T>(acc, offset), vals);
#else
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
  static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
                "Transposed store is supported only for data size u32 or u64");
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::transpose;
  constexpr int N = 1;
  __ESIMD_NS::simd_mask<N> pred = 1;
  __ESIMD_NS::simd<uint32_t, N> offsets = offset;
  auto si = __ESIMD_GET_SURF_HANDLE(acc);
  __esimd_lsc_store_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
                        _Transposed, N>(pred.data(), offsets.data(),
                                        vals.data(), si);
#endif
}

template <typename T, uint8_t NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          int N>
__ESIMD_API void lsc_scatter(T *p, __ESIMD_NS::simd<uint32_t, N> offsets,
                             __ESIMD_NS::simd<T, N * NElts> vals,
                             __ESIMD_NS::simd_mask<N> pred = 1) {
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  using _MsgT = typename detail::lsc_expand_type<T>::type;
  using _CstT = typename detail::lsc_bitcast_type<T>::type;
  __ESIMD_NS::simd<_MsgT, N * NElts> Tmp = vals.template bit_cast_view<_CstT>();
  __ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
  addrs += convert<uintptr_t>(offsets);
  __esimd_lsc_store_stateless<_MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS,
                              _VS, _Transposed, N>(pred.data(), addrs.data(),
                                                   Tmp.data());
}
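
// Usage sketch (assumed USM pointer 'ptr'): scatter one dword per lane.
//   __ESIMD_NS::simd<uint32_t, 16> offsets(0, sizeof(float));
//   __ESIMD_NS::simd<float, 16> vals(0.f, 1.f);
//   lsc_scatter<float>(ptr, offsets, vals);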

template <typename T, uint8_t NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
__ESIMD_API void lsc_block_store(T *p, __ESIMD_NS::simd<T, NElts> vals) {
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
  static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
                "Transposed store is supported only for data size u32 or u64");
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::transpose;
  constexpr int N = 1;
  __ESIMD_NS::simd_mask<N> pred = 1;
  __ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
  __esimd_lsc_store_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
                              _Transposed, N>(pred.data(), addrs.data(),
                                              vals.data());
}

namespace detail {
// Compile-time checks for lsc_load2d/store2d restrictions.
template <typename T, int BlockWidth, int BlockHeight, int NBlocks,
          bool Transposed, bool Transformed, bool IsStore = false>
constexpr void check_lsc_block_2d_restrictions() {
  constexpr int GRFByteSize = BlockWidth * BlockHeight * NBlocks * sizeof(T);
  static_assert(!IsStore || GRFByteSize <= 512,
                "2D store supports 512 bytes max");
  static_assert(IsStore || GRFByteSize <= 2048,
                "2D load supports 2048 bytes max");
  static_assert(!Transposed || !Transformed,
                "Transposed and transformed is not supported");
  if constexpr (Transposed) {
    static_assert(NBlocks == 1, "Transposed expected to be 1 block only");
    static_assert(sizeof(T) == 4 || sizeof(T) == 8,
                  "Transposed load is supported only for data size u32 or u64");
    static_assert(sizeof(T) == 8 ? BlockHeight == 8
                                 : BlockHeight >= 1 && BlockHeight <= 32,
                  "Unsupported block height");
    static_assert(sizeof(T) == 8 ? __ESIMD_DNS::isPowerOf2(BlockWidth, 4)
                                 : BlockWidth >= 1 && BlockWidth <= 8,
                  "Unsupported block width");
  } else if constexpr (Transformed) {
    static_assert(sizeof(T) == 1 || sizeof(T) == 2,
                  "VNNI transform is supported only for data size u8 or u16");
    static_assert(__ESIMD_DNS::isPowerOf2(NBlocks, 4),
                  "Unsupported number of blocks");
    static_assert(BlockHeight * sizeof(T) >= 4 && BlockHeight <= 32,
                  "Unsupported block height");
    static_assert(BlockWidth * sizeof(T) >= 4 &&
                      BlockWidth * NBlocks * sizeof(T) <= 64,
                  "Unsupported block width");
  } else {
    static_assert(
        __ESIMD_DNS::isPowerOf2(NBlocks, sizeof(T) == 1 ? 4 : 8 / sizeof(T)),
        "Unsupported number of blocks");
    static_assert(BlockHeight >= 1 && BlockHeight <= 32,
                  "Unsupported block height");
    static_assert(BlockWidth * sizeof(T) >= 4 &&
                      BlockWidth * NBlocks * sizeof(T) <= 64,
                  "Unsupported block width");
  }
}
} // namespace detail

template <typename T, int BlockWidth, int BlockHeight = 1, int NBlocks = 1,
          bool Transposed = false, bool Transformed = false,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          int N = detail::get_lsc_block_2d_data_size<
              T, NBlocks, BlockHeight, BlockWidth, Transposed, Transformed>()>
__ESIMD_API __ESIMD_NS::simd<T, N>
lsc_load2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight,
           unsigned SurfacePitch, int X, int Y) {
  detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
  detail::check_lsc_block_2d_restrictions<T, BlockWidth, BlockHeight, NBlocks,
                                          Transposed, Transformed>();
  constexpr int ElemsPerDword = 4 / sizeof(T);
  constexpr int GRFRowSize = Transposed ? BlockHeight : BlockWidth;
  constexpr int GRFRowPitch = __ESIMD_DNS::getNextPowerOf2<GRFRowSize>();
  constexpr int GRFBlockSize =
      GRFRowPitch * (Transposed ? BlockWidth : BlockHeight);
  constexpr int GRFBlockPitch =
      detail::roundUpNextMultiple<64 / sizeof(T), GRFBlockSize>();
  constexpr int ActualN = NBlocks * GRFBlockPitch;
  static_assert(
      ActualN == N,
      "These parameters require unpadding. It is not implemented yet");
  constexpr lsc_data_size DS =
      detail::finalize_data_size<T, lsc_data_size::default_size>();
  __ESIMD_NS::simd_mask<N> pred = 1;
  uintptr_t surf_addr = reinterpret_cast<uintptr_t>(Ptr);
  constexpr detail::lsc_data_order _Transposed =
      Transposed ? detail::lsc_data_order::transpose
                 : detail::lsc_data_order::nontranspose;
  return __esimd_lsc_load2d_stateless<T, L1H, L3H, DS, _Transposed, NBlocks,
                                      BlockWidth, BlockHeight, Transformed, N>(
      pred.data(), surf_addr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
}
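
// Usage sketch (assumed row-major surface described by a base pointer plus
// width/height/pitch): load a 16x8 block of dwords at element position (X, Y);
// N defaults to get_lsc_block_2d_data_size for the chosen shape.
//   auto tile = lsc_load2d<float, 16 /*BlockWidth*/, 8 /*BlockHeight*/>(
//       ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);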

template <typename T, int BlockWidth, int BlockHeight = 1, int NBlocks = 1,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          int N = detail::get_lsc_block_2d_data_size<
              T, NBlocks, BlockHeight, BlockWidth, false, false>()>
__ESIMD_API void lsc_prefetch2d(const T *Ptr, unsigned SurfaceWidth,
                                unsigned SurfaceHeight, unsigned SurfacePitch,
                                int X, int Y) {
  detail::check_lsc_cache_hint<detail::lsc_action::prefetch, L1H, L3H>();
  detail::check_lsc_block_2d_restrictions<T, BlockWidth, BlockHeight, NBlocks,
                                          false, false>();
  constexpr lsc_data_size DS =
      detail::finalize_data_size<T, lsc_data_size::default_size>();
  __ESIMD_NS::simd_mask<N> pred = 1;
  uintptr_t surf_addr = reinterpret_cast<uintptr_t>(Ptr);
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  __esimd_lsc_prefetch2d_stateless<T, L1H, L3H, DS, _Transposed, NBlocks,
                                   BlockWidth, BlockHeight, false, N>(
      pred.data(), surf_addr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
}

template <typename T, int BlockWidth, int BlockHeight = 1,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          int N = detail::get_lsc_block_2d_data_size<
              T, 1u, BlockHeight, BlockWidth, false, false>()>
__ESIMD_API void lsc_store2d(T *Ptr, unsigned SurfaceWidth,
                             unsigned SurfaceHeight, unsigned SurfacePitch,
                             int X, int Y, __ESIMD_NS::simd<T, N> Vals) {
  detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
  detail::check_lsc_block_2d_restrictions<T, BlockWidth, BlockHeight, 1, false,
                                          false, true /*IsStore*/>();
  constexpr lsc_data_size DS =
      detail::finalize_data_size<T, lsc_data_size::default_size>();
  __ESIMD_NS::simd_mask<N> pred = 1;
  uintptr_t surf_addr = reinterpret_cast<uintptr_t>(Ptr);
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  __esimd_lsc_store2d_stateless<T, L1H, L3H, DS, _Transposed, 1u, BlockWidth,
                                BlockHeight, false, N>(
      pred.data(), surf_addr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y,
      Vals.data());
}
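
// Usage sketch: store a tile of the same shape back to the assumed surface.
//   lsc_store2d<float, 16, 8>(ptr, SurfaceWidth, SurfaceHeight, SurfacePitch,
//                             X, Y, tile);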

template <__ESIMD_NS::atomic_op Op, typename T, int N,
          lsc_data_size DS = lsc_data_size::default_size>
__ESIMD_API __ESIMD_NS::simd<T, N>
lsc_slm_atomic_update(__ESIMD_NS::simd<uint32_t, N> offsets,
                      __ESIMD_NS::simd_mask<N> pred) {
  detail::check_lsc_vector_size<1>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_atomic<Op, 0>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<1>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  constexpr detail::lsc_atomic_op _Op = detail::to_lsc_atomic_op<Op>();
  using _MsgT = typename detail::lsc_expand_type<T>::type;
  __ESIMD_NS::simd<_MsgT, N> Tmp =
      __esimd_lsc_xatomic_slm_0<_MsgT, _Op, cache_hint::none, cache_hint::none,
                                _AddressScale, _ImmOffset, _DS, _VS,
                                _Transposed, N>(pred.data(), offsets.data());
  return detail::lsc_format_ret<T>(Tmp);
}

template <__ESIMD_NS::atomic_op Op, typename T, int N,
          lsc_data_size DS = lsc_data_size::default_size>
__ESIMD_API __ESIMD_NS::simd<T, N>
lsc_slm_atomic_update(__ESIMD_NS::simd<uint32_t, N> offsets,
                      __ESIMD_NS::simd<T, N> src0,
                      __ESIMD_NS::simd_mask<N> pred) {
  detail::check_lsc_vector_size<1>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_atomic<Op, 1>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<1>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  constexpr detail::lsc_atomic_op _Op = detail::to_lsc_atomic_op<Op>();
  using _MsgT = typename detail::lsc_expand_type<T>::type;
  __ESIMD_NS::simd<_MsgT, N> Tmp =
      __esimd_lsc_xatomic_slm_1<_MsgT, _Op, cache_hint::none, cache_hint::none,
                                _AddressScale, _ImmOffset, _DS, _VS,
                                _Transposed, N>(pred.data(), offsets.data(),
                                                src0.data());
  return detail::lsc_format_ret<T>(Tmp);
}

template <__ESIMD_NS::atomic_op Op, typename T, int N,
          lsc_data_size DS = lsc_data_size::default_size>
__ESIMD_API __ESIMD_NS::simd<T, N>
lsc_slm_atomic_update(__ESIMD_NS::simd<uint32_t, N> offsets,
                      __ESIMD_NS::simd<T, N> src0, __ESIMD_NS::simd<T, N> src1,
                      __ESIMD_NS::simd_mask<N> pred) {
  detail::check_lsc_vector_size<1>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_atomic<Op, 2>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<1>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  constexpr detail::lsc_atomic_op _Op = detail::to_lsc_atomic_op<Op>();
  using _MsgT = typename detail::lsc_expand_type<T>::type;
  __ESIMD_NS::simd<_MsgT, N> Tmp =
      __esimd_lsc_xatomic_slm_2<_MsgT, _Op, cache_hint::none, cache_hint::none,
                                _AddressScale, _ImmOffset, _DS, _VS,
                                _Transposed, N>(pred.data(), offsets.data(),
                                                src0.data(), src1.data());
  return detail::lsc_format_ret<T>(Tmp);
}

template <__ESIMD_NS::atomic_op Op, typename T, int N,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          typename AccessorTy>
__ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value,
                             __ESIMD_NS::simd<T, N>>
lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd<uint32_t, N> offsets,
                  __ESIMD_NS::simd_mask<N> pred) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  return lsc_atomic_update<Op, T, N, DS, L1H, L3H>(
      __ESIMD_DNS::accessorToPointer<T>(acc), offsets, pred);
#else
  detail::check_lsc_vector_size<1>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_atomic<Op, 0>();
  detail::check_lsc_cache_hint<detail::lsc_action::atomic, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<1>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  constexpr detail::lsc_atomic_op _Op = detail::to_lsc_atomic_op<Op>();
  using _MsgT = typename detail::lsc_expand_type<T>::type;
  auto si = __ESIMD_GET_SURF_HANDLE(acc);
  __ESIMD_NS::simd<_MsgT, N> Tmp =
      __esimd_lsc_xatomic_bti_0<_MsgT, _Op, L1H, L3H, _AddressScale, _ImmOffset,
                                _DS, _VS, _Transposed, N>(pred.data(),
                                                          offsets.data(), si);
  return detail::lsc_format_ret<T>(Tmp);
#endif
}

template <__ESIMD_NS::atomic_op Op, typename T, int N,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          typename AccessorTy>
__ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value,
                             __ESIMD_NS::simd<T, N>>
lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd<uint32_t, N> offsets,
                  __ESIMD_NS::simd<T, N> src0, __ESIMD_NS::simd_mask<N> pred) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  return lsc_atomic_update<Op, T, N, DS, L1H, L3H>(
      __ESIMD_DNS::accessorToPointer<T>(acc), offsets, src0, pred);
#else
  detail::check_lsc_vector_size<1>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_atomic<Op, 1>();
  detail::check_lsc_cache_hint<detail::lsc_action::atomic, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<1>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  constexpr detail::lsc_atomic_op _Op = detail::to_lsc_atomic_op<Op>();
  using _MsgT = typename detail::lsc_expand_type<T>::type;
  auto si = __ESIMD_GET_SURF_HANDLE(acc);
  __ESIMD_NS::simd<_MsgT, N> Tmp =
      __esimd_lsc_xatomic_bti_1<_MsgT, _Op, L1H, L3H, _AddressScale, _ImmOffset,
                                _DS, _VS, _Transposed, N>(
          pred.data(), offsets.data(), src0.data(), si);
  return detail::lsc_format_ret<T>(Tmp);
#endif
}

template <__ESIMD_NS::atomic_op Op, typename T, int N,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          typename AccessorTy>
__ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value,
                             __ESIMD_NS::simd<T, N>>
lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd<uint32_t, N> offsets,
                  __ESIMD_NS::simd<T, N> src0, __ESIMD_NS::simd<T, N> src1,
                  __ESIMD_NS::simd_mask<N> pred) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  return lsc_atomic_update<Op, T, N, DS, L1H, L3H>(
      __ESIMD_DNS::accessorToPointer<T>(acc), offsets, src0, src1, pred);
#else
  detail::check_lsc_vector_size<1>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_atomic<Op, 2>();
  detail::check_lsc_cache_hint<detail::lsc_action::atomic, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<1>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  constexpr detail::lsc_atomic_op _Op = detail::to_lsc_atomic_op<Op>();
  using _MsgT = typename detail::lsc_expand_type<T>::type;
  auto si = __ESIMD_GET_SURF_HANDLE(acc);
  __ESIMD_NS::simd<_MsgT, N> Tmp =
      __esimd_lsc_xatomic_bti_2<_MsgT, _Op, L1H, L3H, _AddressScale, _ImmOffset,
                                _DS, _VS, _Transposed, N>(
          pred.data(), offsets.data(), src0.data(), src1.data(), si);
  return detail::lsc_format_ret<T>(Tmp);
#endif
}

template <__ESIMD_NS::atomic_op Op, typename T, int N,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
__ESIMD_API __ESIMD_NS::simd<T, N>
lsc_atomic_update(T *p, __ESIMD_NS::simd<uint32_t, N> offsets,
                  __ESIMD_NS::simd_mask<N> pred) {
  detail::check_lsc_vector_size<1>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_atomic<Op, 0>();
  detail::check_lsc_cache_hint<detail::lsc_action::atomic, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<1>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  constexpr detail::lsc_atomic_op _Op = detail::to_lsc_atomic_op<Op>();
  using _MsgT = typename detail::lsc_expand_type<T>::type;
  __ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
  addrs += convert<uintptr_t>(offsets);
  __ESIMD_NS::simd<_MsgT, N> Tmp =
      __esimd_lsc_xatomic_stateless_0<_MsgT, _Op, L1H, L3H, _AddressScale,
                                      _ImmOffset, _DS, _VS, _Transposed, N>(
          pred.data(), addrs.data());
  return detail::lsc_format_ret<T>(Tmp);
}

template <__ESIMD_NS::atomic_op Op, typename T, int N,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
__ESIMD_API __ESIMD_NS::simd<T, N>
lsc_atomic_update(T *p, __ESIMD_NS::simd<uint32_t, N> offsets,
                  __ESIMD_NS::simd<T, N> src0, __ESIMD_NS::simd_mask<N> pred) {
  detail::check_lsc_vector_size<1>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_atomic<Op, 1>();
  detail::check_lsc_cache_hint<detail::lsc_action::atomic, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<1>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  constexpr detail::lsc_atomic_op _Op = detail::to_lsc_atomic_op<Op>();
  using _MsgT = typename detail::lsc_expand_type<T>::type;
  __ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
  addrs += convert<uintptr_t>(offsets);
  __ESIMD_NS::simd<_MsgT, N> Tmp =
      __esimd_lsc_xatomic_stateless_1<_MsgT, _Op, L1H, L3H, _AddressScale,
                                      _ImmOffset, _DS, _VS, _Transposed, N>(
          pred.data(), addrs.data(), src0.data());
  return detail::lsc_format_ret<T>(Tmp);
}
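
// Usage sketch: per-lane atomic add of 1 to 16 uint32_t counters at an assumed
// USM pointer 'counters'; the old values are returned.
//   __ESIMD_NS::simd<uint32_t, 16> offsets(0, sizeof(uint32_t));
//   __ESIMD_NS::simd<uint32_t, 16> ones(1);
//   __ESIMD_NS::simd_mask<16> pred = 1;
//   auto old = lsc_atomic_update<__ESIMD_NS::atomic_op::add, uint32_t>(
//       counters, offsets, ones, pred);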

template <__ESIMD_NS::atomic_op Op, typename T, int N,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
__ESIMD_API __ESIMD_NS::simd<T, N>
lsc_atomic_update(T *p, __ESIMD_NS::simd<uint32_t, N> offsets,
                  __ESIMD_NS::simd<T, N> src0, __ESIMD_NS::simd<T, N> src1,
                  __ESIMD_NS::simd_mask<N> pred) {
  detail::check_lsc_vector_size<1>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_atomic<Op, 2>();
  detail::check_lsc_cache_hint<detail::lsc_action::atomic, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<1>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  constexpr detail::lsc_atomic_op _Op = detail::to_lsc_atomic_op<Op>();
  using _MsgT = typename detail::lsc_expand_type<T>::type;
  __ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
  addrs += convert<uintptr_t>(offsets);
  __ESIMD_NS::simd<_MsgT, N> Tmp =
      __esimd_lsc_xatomic_stateless_2<_MsgT, _Op, L1H, L3H, _AddressScale,
                                      _ImmOffset, _DS, _VS, _Transposed, N>(
          pred.data(), addrs.data(), src0.data(), src1.data());
  return detail::lsc_format_ret<T>(Tmp);
}

template <lsc_memory_kind Kind = lsc_memory_kind::untyped_global,
          lsc_fence_op FenceOp = lsc_fence_op::none,
          lsc_scope Scope = lsc_scope::group, int N = 16>
__ESIMD_API void lsc_fence(__ESIMD_NS::simd_mask<N> pred = 1) {
  static_assert(
      Kind != lsc_memory_kind::shared_local ||
          (FenceOp == lsc_fence_op::none && Scope == lsc_scope::group),
      "SLM fence must have 'none' lsc_fence_op and 'group' scope");
  __esimd_lsc_fence<Kind, FenceOp, Scope, N>(pred.data());
}
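
// Usage sketch: the defaults give a group-scope fence of untyped global
// memory, and they are also the only combination the static_assert allows for
// SLM.
//   lsc_fence();                                // global memory, group scope
//   lsc_fence<lsc_memory_kind::shared_local>(); // SLM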

#undef __ESIMD_GET_SURF_HANDLE

} // namespace __ESIMD_ENS
} // __SYCL_INLINE_NAMESPACE(cl)