DPC++ Runtime
Runtime libraries for oneAPI DPC++
memory.hpp
Go to the documentation of this file.
1 //==-------------- memory.hpp - DPC++ Explicit SIMD API --------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // Implement experimental Explicit SIMD memory-access APIs.
9 //===----------------------------------------------------------------------===//
10 
11 #pragma once
12 
17 
18 namespace sycl {
20 namespace ext::intel {
21 namespace experimental::esimd {
22 
25 
/// Generic work-group split barrier.
/// @tparam flag - split barrier action (compile-time constant) passed directly
/// to the \c __esimd_sbarrier intrinsic.
template <split_barrier_action flag> __ESIMD_API void split_barrier() {
  __esimd_sbarrier(flag);
}
31 
/// Generic work-group split barrier - deprecated run-time-flag overload.
/// @param flag - split barrier action.
/// @deprecated Use the template form split_barrier<split_barrier_action>().
__SYCL_DEPRECATED("use split_barrier<split_barrier_action>()")
__ESIMD_API void split_barrier(split_barrier_action flag) {
  __esimd_sbarrier(flag);
}
36 
38 
39 // sycl_esimd_raw_send intrinsics are not available when stateless memory
40 // accesses are enforced.
41 #ifndef __ESIMD_FORCE_STATELESS_MEM
42 
45 
/// Raw sends: issue a two-source "send" message and return the response.
///
/// @tparam T1/n1 - element type / count of the destination (response) vector.
/// @tparam T2/n2 - element type / count of the first source operand.
/// @tparam T3/n3 - element type / count of the second source operand.
/// @tparam N - execution-mask width (default 16).
/// @param msgDst - destination vector; its old contents are merged with the
///   response by the hardware (it is passed into the intrinsic).
/// @param msgSrc0, msgSrc1 - source payload vectors.
/// @param exDesc, msgDesc - extended and primary message descriptors.
/// @param execSize - execution size encoding of the send.
/// @param sfid - shared function ID (message target).
/// @param numSrc0, numSrc1, numDst - payload/response lengths in registers.
/// @param isEOT - set to mark this as an end-of-thread message.
/// @param isSendc - set to use the conditional (sendc) form.
/// @param mask - per-lane predicate; defaults to all-enabled.
/// @return the response vector.
template <typename T1, int n1, typename T2, int n2, typename T3, int n3,
          int N = 16>
__ESIMD_API __ESIMD_NS::simd<T1, n1>
raw_sends(__ESIMD_NS::simd<T1, n1> msgDst, __ESIMD_NS::simd<T2, n2> msgSrc0,
          __ESIMD_NS::simd<T3, n3> msgSrc1, uint32_t exDesc, uint32_t msgDesc,
          uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t numSrc1,
          uint8_t numDst, uint8_t isEOT = 0, uint8_t isSendc = 0,
          __ESIMD_NS::simd_mask<N> mask = 1) {
  // Each operand must occupy a whole number of 32-byte registers.
  constexpr unsigned _Width1 = n1 * sizeof(T1);
  static_assert(_Width1 % 32 == 0, "Invalid size for raw send rspVar");
  constexpr unsigned _Width2 = n2 * sizeof(T2);
  static_assert(_Width2 % 32 == 0, "Invalid size for raw send msgSrc0");
  constexpr unsigned _Width3 = n3 * sizeof(T3);
  static_assert(_Width3 % 32 == 0, "Invalid size for raw send msgSrc1");

  using ElemT1 = __ESIMD_DNS::__raw_t<T1>;
  using ElemT2 = __ESIMD_DNS::__raw_t<T2>;
  using ElemT3 = __ESIMD_DNS::__raw_t<T3>;

  // Pack EOT (bit 1) and sendc (bit 0) flags into the modifier byte.
  uint8_t modifier = ((isEOT & 0x1) << 1) | (isSendc & 0x1);
  return __esimd_raw_sends2<ElemT1, n1, ElemT2, n2, ElemT3, n3, N>(
      modifier, execSize, mask.data(), numSrc0, numSrc1, numDst, sfid, exDesc,
      msgDesc, msgSrc0.data(), msgSrc1.data(), msgDst.data());
}
94 
95 template <typename T1, int n1, typename T2, int n2, typename T3, int n3,
96  int N = 16>
97 __SYCL_DEPRECATED("raw_sends_load is deprecated. Use raw_sends")
98 __ESIMD_API __ESIMD_NS::simd<T1, n1> raw_sends_load(
99  __ESIMD_NS::simd<T1, n1> msgDst, __ESIMD_NS::simd<T2, n2> msgSrc0,
100  __ESIMD_NS::simd<T3, n3> msgSrc1, uint32_t exDesc, uint32_t msgDesc,
101  uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t numSrc1,
102  uint8_t numDst, uint8_t isEOT = 0, uint8_t isSendc = 0,
103  __ESIMD_NS::simd_mask<N> mask = 1) {
104  return raw_sends(msgDst, msgSrc0, msgSrc1, exDesc, msgDesc, execSize, sfid,
105  numSrc0, numSrc1, numDst, isEOT, isSendc);
106 }
107 
/// Raw send: issue a single-source "send" message and return the response.
///
/// @tparam T1/n1 - element type / count of the destination (response) vector.
/// @tparam T2/n2 - element type / count of the source operand.
/// @tparam N - execution-mask width (default 16).
/// @param msgDst - destination vector passed into the intrinsic.
/// @param msgSrc0 - source payload vector.
/// @param exDesc, msgDesc - extended and primary message descriptors.
/// @param execSize - execution size encoding of the send.
/// @param sfid - shared function ID (message target).
/// @param numSrc0, numDst - payload/response lengths in registers.
/// @param isEOT, isSendc - end-of-thread / conditional-send flags.
/// @param mask - per-lane predicate; defaults to all-enabled.
/// @return the response vector.
template <typename T1, int n1, typename T2, int n2, int N = 16>
__ESIMD_API __ESIMD_NS::simd<T1, n1>
raw_send(__ESIMD_NS::simd<T1, n1> msgDst, __ESIMD_NS::simd<T2, n2> msgSrc0,
         uint32_t exDesc, uint32_t msgDesc, uint8_t execSize, uint8_t sfid,
         uint8_t numSrc0, uint8_t numDst, uint8_t isEOT = 0,
         uint8_t isSendc = 0, __ESIMD_NS::simd_mask<N> mask = 1) {
  // Each operand must occupy a whole number of 32-byte registers.
  constexpr unsigned _Width1 = n1 * sizeof(T1);
  static_assert(_Width1 % 32 == 0, "Invalid size for raw send rspVar");
  constexpr unsigned _Width2 = n2 * sizeof(T2);
  static_assert(_Width2 % 32 == 0, "Invalid size for raw send msgSrc0");

  using ElemT1 = __ESIMD_DNS::__raw_t<T1>;
  using ElemT2 = __ESIMD_DNS::__raw_t<T2>;

  // Pack EOT (bit 1) and sendc (bit 0) flags into the modifier byte.
  uint8_t modifier = ((isEOT & 0x1) << 1) | (isSendc & 0x1);
  return __esimd_raw_send2<ElemT1, n1, ElemT2, n2, N>(
      modifier, execSize, mask.data(), numSrc0, numDst, sfid, exDesc, msgDesc,
      msgSrc0.data(), msgDst.data());
}
148 
/// Deprecated wrapper: raw single-source send returning a response.
/// Forwards all arguments (including the lane predicate) to raw_send.
/// @deprecated Use raw_send.
template <typename T1, int n1, typename T2, int n2, int N = 16>
__SYCL_DEPRECATED("raw_send_load is deprecated. Use raw_send")
__ESIMD_API __ESIMD_NS::simd<T1, n1> raw_send_load(
    __ESIMD_NS::simd<T1, n1> msgDst, __ESIMD_NS::simd<T2, n2> msgSrc0,
    uint32_t exDesc, uint32_t msgDesc, uint8_t execSize, uint8_t sfid,
    uint8_t numSrc0, uint8_t numDst, uint8_t isEOT = 0, uint8_t isSendc = 0,
    __ESIMD_NS::simd_mask<N> mask = 1) {
  return raw_send(msgDst, msgSrc0, exDesc, msgDesc, execSize, sfid, numSrc0,
                  numDst, isEOT, isSendc, mask);
}
159 
/// Raw sends: issue a two-source "send" message with no response.
///
/// @tparam T1/n1, T2/n2 - element types / counts of the two source operands.
/// @tparam N - execution-mask width (default 16).
/// @param msgSrc0, msgSrc1 - source payload vectors.
/// @param exDesc, msgDesc - extended and primary message descriptors.
/// @param execSize - execution size encoding of the send.
/// @param sfid - shared function ID (message target).
/// @param numSrc0, numSrc1 - payload lengths in registers.
/// @param isEOT, isSendc - end-of-thread / conditional-send flags.
/// @param mask - per-lane predicate; defaults to all-enabled.
template <typename T1, int n1, typename T2, int n2, int N = 16>
__ESIMD_API void
raw_sends(__ESIMD_NS::simd<T1, n1> msgSrc0, __ESIMD_NS::simd<T2, n2> msgSrc1,
          uint32_t exDesc, uint32_t msgDesc, uint8_t execSize, uint8_t sfid,
          uint8_t numSrc0, uint8_t numSrc1, uint8_t isEOT = 0,
          uint8_t isSendc = 0, __ESIMD_NS::simd_mask<N> mask = 1) {
  // Each operand must occupy a whole number of 32-byte registers.
  constexpr unsigned _Width1 = n1 * sizeof(T1);
  static_assert(_Width1 % 32 == 0, "Invalid size for raw send msgSrc0");
  constexpr unsigned _Width2 = n2 * sizeof(T2);
  static_assert(_Width2 % 32 == 0, "Invalid size for raw send msgSrc1");

  using ElemT1 = __ESIMD_DNS::__raw_t<T1>;
  using ElemT2 = __ESIMD_DNS::__raw_t<T2>;

  // Pack EOT (bit 1) and sendc (bit 0) flags into the modifier byte.
  uint8_t modifier = ((isEOT & 0x1) << 1) | (isSendc & 0x1);
  __esimd_raw_sends2_noresult<ElemT1, n1, ElemT2, n2, N>(
      modifier, execSize, mask.data(), numSrc0, numSrc1, sfid, exDesc, msgDesc,
      msgSrc0.data(), msgSrc1.data());
}
199 
/// Deprecated wrapper: raw two-source send with no response.
/// Forwards all arguments (including the lane predicate) to raw_sends.
/// @deprecated Use raw_sends.
template <typename T1, int n1, typename T2, int n2, int N = 16>
__SYCL_DEPRECATED("raw_sends_store is deprecated. Use raw_sends")
__ESIMD_API
    void raw_sends_store(__ESIMD_NS::simd<T1, n1> msgSrc0,
                         __ESIMD_NS::simd<T2, n2> msgSrc1, uint32_t exDesc,
                         uint32_t msgDesc, uint8_t execSize, uint8_t sfid,
                         uint8_t numSrc0, uint8_t numSrc1, uint8_t isEOT = 0,
                         uint8_t isSendc = 0,
                         __ESIMD_NS::simd_mask<N> mask = 1) {
  raw_sends(msgSrc0, msgSrc1, exDesc, msgDesc, execSize, sfid, numSrc0, numSrc1,
            isEOT, isSendc, mask);
}
212 
/// Raw send: issue a single-source "send" message with no response.
///
/// @tparam T1/n1 - element type / count of the source operand.
/// @tparam N - execution-mask width (default 16).
/// @param msgSrc0 - source payload vector.
/// @param exDesc, msgDesc - extended and primary message descriptors.
/// @param execSize - execution size encoding of the send.
/// @param sfid - shared function ID (message target).
/// @param numSrc0 - payload length in registers.
/// @param isEOT, isSendc - end-of-thread / conditional-send flags.
/// @param mask - per-lane predicate; defaults to all-enabled.
template <typename T1, int n1, int N = 16>
__ESIMD_API void
raw_send(__ESIMD_NS::simd<T1, n1> msgSrc0, uint32_t exDesc, uint32_t msgDesc,
         uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t isEOT = 0,
         uint8_t isSendc = 0, __ESIMD_NS::simd_mask<N> mask = 1) {
  // The payload must occupy a whole number of 32-byte registers.
  constexpr unsigned _Width1 = n1 * sizeof(T1);
  static_assert(_Width1 % 32 == 0, "Invalid size for raw send msgSrc0");
  using ElemT1 = __ESIMD_DNS::__raw_t<T1>;
  // Pack EOT (bit 1) and sendc (bit 0) flags into the modifier byte.
  uint8_t modifier = ((isEOT & 0x1) << 1) | (isSendc & 0x1);
  __esimd_raw_send2_noresult<ElemT1, n1, N>(modifier, execSize, mask.data(),
                                            numSrc0, sfid, exDesc, msgDesc,
                                            msgSrc0.data());
}
244 
/// Deprecated wrapper: raw single-source send with no response.
/// Forwards all arguments (including the lane predicate) to raw_send.
/// @deprecated Use raw_send.
template <typename T1, int n1, int N = 16>
__SYCL_DEPRECATED("raw_send_store is deprecated. Use raw_send")
__ESIMD_API
    void raw_send_store(__ESIMD_NS::simd<T1, n1> msgSrc0, uint32_t exDesc,
                        uint32_t msgDesc, uint8_t execSize, uint8_t sfid,
                        uint8_t numSrc0, uint8_t isEOT = 0, uint8_t isSendc = 0,
                        __ESIMD_NS::simd_mask<N> mask = 1) {
  raw_send(msgSrc0, exDesc, msgDesc, execSize, sfid, numSrc0, isEOT, isSendc,
           mask);
}
255 
257 
258 #endif // !__ESIMD_FORCE_STATELESS_MEM
259 
262 
265 
/// Wait on a named barrier.
/// @param id - named barrier id to wait on. The first intrinsic argument
/// selects the "wait" operation; the thread-count argument is unused (0) in
/// wait mode.
__ESIMD_API void named_barrier_wait(uint8_t id) {
  __esimd_nbarrier(0 /*wait*/, id, 0 /*thread count*/);
}
273 
/// Initialize the number of named barriers available to the thread group.
/// @tparam NbarCount - number of named barriers to reserve (compile-time
/// constant, forwarded to the \c __esimd_nbarrier_init intrinsic).
template <uint8_t NbarCount> __ESIMD_API void named_barrier_init() {
  __esimd_nbarrier_init(NbarCount);
}
281 
/// Perform a signal operation on a named barrier by sending a message to the
/// gateway shared function.
/// @param barrier_id - named barrier id (low 5 bits are used).
/// @param producer_consumer_mode - role of this thread at the barrier
/// (encoding is hardware-defined; packed into payload bits 15:14).
/// @param num_producers - number of producer threads (low 8 bits used).
/// @param num_consumers - number of consumer threads (low 8 bits used).
__ESIMD_API void named_barrier_signal(uint8_t barrier_id,
                                      uint8_t producer_consumer_mode,
                                      uint32_t num_producers,
                                      uint32_t num_consumers) {
  constexpr uint32_t gateway = 3; // SFID of the gateway unit.
  constexpr uint32_t barrier = 4; // Gateway "barrier" subfunction code.
  constexpr uint32_t descriptor = 1 << 25 | // Message length: 1 register
                                  0 << 12 | // Fence Data Ports: No fence
                                  barrier;  // Barrier subfunction

  // Only DWord 2 of the 8-DWord payload carries data; the rest stays zero.
  __ESIMD_DNS::vector_type_t<uint32_t, 8> payload = 0;
  payload[2] = (num_consumers & 0xff) << 24 | (num_producers & 0xff) << 16 |
               producer_consumer_mode << 14 | (barrier_id & 0b11111) << 0;

  __esimd_raw_send_nbarrier_signal<uint32_t, 8>(
      0 /*sendc*/, gateway, descriptor, payload, 1 /*pred*/);
}
311 
/// Create a software dependency on the given vector value: passes its first
/// 16-bit word to the \c __esimd_wait intrinsic so the compiler cannot
/// reorder/eliminate the computation producing \p value. Enabled only when
/// the vector is at least 2 bytes (one uint16_t word must exist).
/// Host compilation is a no-op (device-only intrinsic).
template <typename T, int N>
__ESIMD_API std::enable_if_t<(sizeof(T) * N >= 2)>
wait(__ESIMD_NS::simd<T, N> value) {
#ifdef __SYCL_DEVICE_ONLY__
  uint16_t Word = value.template bit_cast_view<uint16_t>()[0];
  __esimd_wait(Word);
#endif // __SYCL_DEVICE_ONLY__
}
323 
/// simd_view overload of wait(): same software-dependency semantics as the
/// simd overload, enabled when the viewed region is at least 2 bytes.
/// Host compilation is a no-op (device-only intrinsic).
template <typename T, typename RegionT>
__ESIMD_API std::enable_if_t<
    (RegionT::length * sizeof(typename RegionT::element_type) >= 2)>
wait(__ESIMD_NS::simd_view<T, RegionT> value) {
#ifdef __SYCL_DEVICE_ONLY__
  uint16_t Word = value.template bit_cast_view<uint16_t>()[0];
  __esimd_wait(Word);
#endif // __SYCL_DEVICE_ONLY__
}
336 
338 
341 
344 
namespace detail {
// Compute the data size for 2d block load or store.
// For transformed (VNNI) accesses the height is rounded up to a multiple of
// 4/sizeof(T) and the width to the next power of two; otherwise the size is
// simply Width * Height * NBlocks.
template <typename T, int NBlocks, int Height, int Width, bool Transposed,
          bool Transformed>
constexpr int get_lsc_block_2d_data_size() {
  if (Transformed)
    return detail::roundUpNextMultiple<Height, 4 / sizeof(T)>() *
           __ESIMD_DNS::getNextPowerOf2<Width>() * NBlocks;
  return Width * Height * NBlocks;
}

// Format u8 and u16 to u8u32 and u16u32 by doing garbage-extension.
// Reinterprets the input as unsigned 8/16-bit lanes and widens each lane to
// RT on return (element-wise conversion); 4-byte types are just bit-cast.
template <typename RT, typename T, int N>
ESIMD_INLINE __ESIMD_NS::simd<RT, N>
lsc_format_input(__ESIMD_NS::simd<T, N> Vals) {
  if constexpr (sizeof(T) == 1) {
    // Extend bytes to RT.
    return Vals.template bit_cast_view<uint8_t>();
  } else if constexpr (sizeof(T) == 2) {
    // Extend words to RT.
    return Vals.template bit_cast_view<uint16_t>();
  } else {
    return Vals.template bit_cast_view<RT>();
  }
}

// Format u8u32 and u16u32 back to u8 and u16.
// Views the widened message vector as T; when T is narrower than T1, every
// widened lane spans several T lanes, so take a strided select keeping the
// low (first) T of each group.
template <typename T, typename T1, int N>
ESIMD_INLINE __ESIMD_NS::simd<T, N>
lsc_format_ret(__ESIMD_NS::simd<T1, N> Vals) {
  auto Formatted = Vals.template bit_cast_view<T>();
  if constexpr (sizeof(T) == sizeof(T1)) {
    return Formatted;
  } else {
    constexpr int Stride = Formatted.length / N;
    return Formatted.template select<N, Stride>(0);
  }
}

// Validate template arguments of an LSC atomic operation.
// The `if constexpr` wrappers keep each static_assert from firing unless its
// condition is actually violated, yielding a single focused diagnostic.
template <__ESIMD_NS::native::lsc::atomic_op Op, typename T, int N,
          unsigned NumSrc>
constexpr void check_lsc_atomic() {
  if constexpr (!__ESIMD_DNS::isPowerOf2(N, 32)) {
    static_assert((__ESIMD_DNS::isPowerOf2(N, 32)),
                  "Execution size 1, 2, 4, 8, 16, 32 are supported");
  }
  if constexpr (NumSrc != __ESIMD_DNS::get_num_args<Op>()) {
    static_assert(NumSrc == __ESIMD_DNS::get_num_args<Op>(),
                  "wrong number of operands");
  }
  if constexpr (Op == __ESIMD_NS::native::lsc::atomic_op::fcmpxchg) {
    static_assert(__ESIMD_DNS::is_type<T, float, sycl::half, double>(),
                  "float, double or sycl::half type is expected");
  } else {
    __ESIMD_DNS::check_atomic<__ESIMD_DNS::to_atomic_op<Op>(), T, N, NumSrc>();
  }
}

// Map an (L1, L3) load cache-hint pair to its 3-bit hardware mask.
// Unsupported combinations fall through to 0 (default caching).
template <cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
constexpr uint32_t get_lsc_load_cache_mask() {
  if constexpr (L1H == cache_hint::read_invalidate &&
                L3H == cache_hint::cached) {
    return 7;
  }
  if constexpr (L1H == cache_hint::streaming && L3H == cache_hint::cached) {
    return 6;
  }
  if constexpr (L1H == cache_hint::streaming && L3H == cache_hint::uncached) {
    return 5;
  }
  if constexpr (L1H == cache_hint::cached && L3H == cache_hint::cached) {
    return 4;
  }
  if constexpr (L1H == cache_hint::cached && L3H == cache_hint::uncached) {
    return 3;
  }
  if constexpr (L1H == cache_hint::uncached && L3H == cache_hint::cached) {
    return 2;
  }
  if constexpr (L1H == cache_hint::uncached && L3H == cache_hint::uncached) {
    return 1;
  }
  return 0;
}

// Map an (L1, L3) store cache-hint pair to its 3-bit hardware mask.
// Unsupported combinations fall through to 0 (default caching).
template <cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
constexpr uint32_t get_lsc_store_cache_mask() {
  if constexpr (L1H == cache_hint::write_back && L3H == cache_hint::cached) {
    return 7;
  }
  if constexpr (L1H == cache_hint::streaming && L3H == cache_hint::cached) {
    return 6;
  }
  if constexpr (L1H == cache_hint::streaming && L3H == cache_hint::uncached) {
    return 5;
  }
  if constexpr (L1H == cache_hint::write_through && L3H == cache_hint::cached) {
    return 4;
  }
  if constexpr (L1H == cache_hint::write_through &&
                L3H == cache_hint::uncached) {
    return 3;
  }
  if constexpr (L1H == cache_hint::uncached && L3H == cache_hint::cached) {
    return 2;
  }
  if constexpr (L1H == cache_hint::uncached && L3H == cache_hint::uncached) {
    return 1;
  }
  return 0;
}

} // namespace detail
459 
/// SLM gather: load NElts elements per lane from shared local memory at the
/// given per-lane byte offsets.
/// @tparam T - element type.
/// @tparam NElts - elements loaded per address (vector size).
/// @tparam DS - data size; default derives from T.
/// @tparam N - number of lanes / offsets.
/// @param offsets - per-lane SLM byte offsets.
/// @param pred - per-lane predicate; defaults to all-enabled. Values for
/// disabled lanes are whatever the intrinsic returns for them (no merge
/// source is supplied in this overload).
/// @return gathered vector of N * NElts elements.
template <typename T, int NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size, int N>
__ESIMD_API __ESIMD_NS::simd<T, N * NElts>
lsc_slm_gather(__ESIMD_NS::simd<uint32_t, N> offsets,
               __ESIMD_NS::simd_mask<N> pred = 1) {
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  // 8/16-bit data is widened to 32-bit message lanes (u8u32/u16u32).
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr auto _Transposed = detail::lsc_data_order::nontranspose;
  using MsgT = typename detail::lsc_expand_type<T>::type;
  __ESIMD_NS::simd<MsgT, N * NElts> Tmp =
      __esimd_lsc_load_slm<MsgT, cache_hint::none, cache_hint::none,
                           _AddressScale, _ImmOffset, _DS, _VS, _Transposed, N>(
          pred.data(), offsets.data());
  // Narrow the widened message lanes back to T.
  return detail::lsc_format_ret<T>(Tmp);
}
495 
/// SLM gather with merge semantics: disabled lanes receive the corresponding
/// elements of \p old_values instead of undefined data.
/// @tparam T - element type.
/// @tparam NElts - elements loaded per address (vector size).
/// @tparam DS - data size; default derives from T.
/// @tparam N - number of lanes / offsets.
/// @param offsets - per-lane SLM byte offsets.
/// @param pred - per-lane predicate.
/// @param old_values - values copied through for predicated-off lanes.
/// @return gathered vector of N * NElts elements.
template <typename T, int NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size, int N>
__ESIMD_API __ESIMD_NS::simd<T, N * NElts>
lsc_slm_gather(__ESIMD_NS::simd<uint32_t, N> offsets,
               __ESIMD_NS::simd_mask<N> pred,
               __ESIMD_NS::simd<T, N * NElts> old_values) {
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  // 8/16-bit data is widened to 32-bit message lanes (u8u32/u16u32).
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  using MsgT = typename detail::lsc_expand_type<T>::type;
  // Widen the merge source to the message element type as well.
  __ESIMD_NS::simd<MsgT, N * NElts> OldValuesExpanded =
      detail::lsc_format_input<MsgT>(old_values);
  __ESIMD_NS::simd<MsgT, N * NElts> Result =
      __esimd_lsc_load_merge_slm<MsgT, cache_hint::none, cache_hint::none,
                                 _AddressScale, _ImmOffset, _DS, _VS,
                                 _Transposed, N>(pred.data(), offsets.data(),
                                                 OldValuesExpanded.data());
  return detail::lsc_format_ret<T>(Result);
}
538 
/// SLM transposed (block) load: load NElts contiguous elements from shared
/// local memory starting at \p offset, as a single-lane transposed LSC load.
/// Only u32/u64 data sizes are supported for transposed loads.
/// @param offset - SLM byte offset of the block.
/// @param pred - single-element predicate; if 0, the result content for the
/// block is whatever the intrinsic returns (no merge source here).
template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size>
__ESIMD_API __ESIMD_NS::simd<T, NElts>
lsc_slm_block_load(uint32_t offset, __ESIMD_NS::simd_mask<1> pred = 1) {
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  constexpr uint16_t AddressScale = 1;
  constexpr int ImmOffset = 0;
  constexpr lsc_data_size FDS = detail::finalize_data_size<T, DS>();
  static_assert(FDS == lsc_data_size::u32 || FDS == lsc_data_size::u64,
                "Transposed load is supported only for data size u32 or u64");
  constexpr detail::lsc_vector_size VS = detail::to_lsc_vector_size<NElts>();

  constexpr auto Transposed = detail::lsc_data_order::transpose;
  constexpr int N = 1; // Transposed loads use a single address lane.
  __ESIMD_NS::simd<uint32_t, N> offsets = offset;
  return __esimd_lsc_load_slm<T, cache_hint::none, cache_hint::none,
                              AddressScale, ImmOffset, FDS, VS, Transposed, N>(
      pred.data(), offsets.data());
}
573 
/// SLM transposed (block) load with merge semantics: when \p pred is 0 the
/// returned vector is \p old_values. Only u32/u64 data sizes are supported.
/// @param offset - SLM byte offset of the block.
/// @param pred - single-element predicate.
/// @param old_values - values returned when the load is predicated off.
template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size>
__ESIMD_API __ESIMD_NS::simd<T, NElts>
lsc_slm_block_load(uint32_t offset, __ESIMD_NS::simd_mask<1> pred,
                   __ESIMD_NS::simd<T, NElts> old_values) {
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  constexpr uint16_t AddressScale = 1;
  constexpr int ImmOffset = 0;
  constexpr lsc_data_size FDS = detail::finalize_data_size<T, DS>();
  static_assert(FDS == lsc_data_size::u32 || FDS == lsc_data_size::u64,
                "Transposed load is supported only for data size u32 or u64");
  constexpr detail::lsc_vector_size VS = detail::to_lsc_vector_size<NElts>();
  constexpr auto Transposed = detail::lsc_data_order::transpose;
  constexpr int N = 1; // Transposed loads use a single address lane.
  __ESIMD_NS::simd<uint32_t, N> offsets = offset;
  return __esimd_lsc_load_merge_slm<T, cache_hint::none, cache_hint::none,
                                    AddressScale, ImmOffset, FDS, VS,
                                    Transposed, N>(pred.data(), offsets.data(),
                                                   old_values.data());
}
611 
/// USM gather: load NElts elements per lane from addresses p + offsets[i].
/// @tparam T - element type.
/// @tparam NElts - elements loaded per address.
/// @tparam DS - data size; default derives from T.
/// @tparam L1H, L3H - L1/L3 cache hints.
/// @tparam N - number of lanes.
/// @param p - base USM pointer.
/// @param offsets - per-lane byte offsets (any integral type).
/// @param pred - per-lane predicate; disabled lanes' results are not merged
/// with anything in this overload.
/// @return gathered vector of N * NElts elements.
template <typename T, int NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          int N, typename Toffset>
__ESIMD_API __ESIMD_NS::simd<T, N * NElts>
lsc_gather(const T *p, __ESIMD_NS::simd<Toffset, N> offsets,
           __ESIMD_NS::simd_mask<N> pred = 1) {
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  // 8/16-bit data is widened to 32-bit message lanes (u8u32/u16u32).
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr auto _Transposed = detail::lsc_data_order::nontranspose;
  using MsgT = typename detail::lsc_expand_type<T>::type;
  // Build full 64-bit addresses: base pointer plus per-lane offsets.
  __ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
  addrs += convert<uintptr_t>(offsets);
  __ESIMD_NS::simd<MsgT, N * NElts> Tmp =
      __esimd_lsc_load_stateless<MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS,
                                 _VS, _Transposed, N>(pred.data(),
                                                      addrs.data());
  return detail::lsc_format_ret<T>(Tmp);
}
656 
/// USM gather with merge semantics: disabled lanes receive the corresponding
/// elements of \p old_values.
/// @tparam T - element type.
/// @tparam NElts - elements loaded per address.
/// @tparam DS - data size; default derives from T.
/// @tparam L1H, L3H - L1/L3 cache hints.
/// @tparam N - number of lanes.
/// @param p - base USM pointer.
/// @param offsets - per-lane byte offsets (any integral type).
/// @param pred - per-lane predicate.
/// @param old_values - values copied through for predicated-off lanes.
/// @return gathered vector of N * NElts elements.
template <typename T, int NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          int N, typename Toffset>
__ESIMD_API __ESIMD_NS::simd<T, N * NElts>
lsc_gather(const T *p, __ESIMD_NS::simd<Toffset, N> offsets,
           __ESIMD_NS::simd_mask<N> pred,
           __ESIMD_NS::simd<T, N * NElts> old_values) {
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  // 8/16-bit data is widened to 32-bit message lanes (u8u32/u16u32).
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr auto _Transposed = detail::lsc_data_order::nontranspose;
  using MsgT = typename detail::lsc_expand_type<T>::type;
  // Build full 64-bit addresses: base pointer plus per-lane offsets.
  __ESIMD_NS::simd<uintptr_t, N> Addrs = reinterpret_cast<uintptr_t>(p);
  Addrs += convert<uintptr_t>(offsets);
  // Widen the merge source to the message element type as well.
  __ESIMD_NS::simd<MsgT, N * NElts> OldValuesExpanded =
      detail::lsc_format_input<MsgT>(old_values);
  __ESIMD_NS::simd<MsgT, N * NElts> Result =
      __esimd_lsc_load_merge_stateless<MsgT, L1H, L3H, _AddressScale,
                                       _ImmOffset, _DS, _VS, _Transposed, N>(
          pred.data(), Addrs.data(), OldValuesExpanded.data());
  return detail::lsc_format_ret<T>(Result);
}
706 
/// Convenience overload: accepts offsets as a simd_view; materializes the
/// view with .read() and forwards to the simd-offset lsc_gather overload.
template <
    typename T, int NElts = 1, lsc_data_size DS = lsc_data_size::default_size,
    cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, int N,
    typename Toffset, typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>>
__ESIMD_API __ESIMD_NS::simd<T, N * NElts>
lsc_gather(const T *p, __ESIMD_NS::simd_view<Toffset, RegionTy> offsets,
           __ESIMD_NS::simd_mask<N> pred = 1) {
  return lsc_gather<T, NElts, DS, L1H, L3H, N>(p, offsets.read(), pred);
}
716 
/// Convenience overload with merge semantics: accepts offsets as a simd_view
/// and forwards to the simd-offset merging lsc_gather overload.
template <
    typename T, int NElts = 1, lsc_data_size DS = lsc_data_size::default_size,
    cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, int N,
    typename Toffset, typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>>
__ESIMD_API __ESIMD_NS::simd<T, N * NElts>
lsc_gather(const T *p, __ESIMD_NS::simd_view<Toffset, RegionTy> offsets,
           __ESIMD_NS::simd_mask<N> pred,
           __ESIMD_NS::simd<T, N * NElts> old_values) {
  return lsc_gather<T, NElts, DS, L1H, L3H, N>(p, offsets.read(), pred,
                                               old_values);
}
728 
/// Convenience overload: a single scalar offset is broadcast to all N lanes
/// before forwarding to the simd-offset lsc_gather overload.
template <typename T, int NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          int N, typename Toffset>
__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>,
                             __ESIMD_NS::simd<T, N * NElts>>
lsc_gather(const T *p, Toffset offset, __ESIMD_NS::simd_mask<N> pred = 1) {
  return lsc_gather<T, NElts, DS, L1H, L3H, N>(
      p, __ESIMD_NS::simd<Toffset, N>(offset), pred);
}
739 
/// Convenience overload with merge semantics: a single scalar offset is
/// broadcast to all N lanes before forwarding to the merging overload.
template <typename T, int NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          int N, typename Toffset>
__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>,
                             __ESIMD_NS::simd<T, N * NElts>>
lsc_gather(const T *p, Toffset offset, __ESIMD_NS::simd_mask<N> pred,
           __ESIMD_NS::simd<T, N * NElts> old_values) {
  return lsc_gather<T, NElts, DS, L1H, L3H, N>(
      p, __ESIMD_NS::simd<Toffset, N>(offset), pred, old_values);
}
751 
/// Accessor-based gather: load NElts elements per lane from the memory the
/// accessor refers to, at the given per-lane byte offsets.
/// When __ESIMD_FORCE_STATELESS_MEM is defined, the accessor is converted to
/// a raw pointer and the stateless (USM) overload is used; otherwise a
/// binding-table-indexed (stateful) LSC load is issued.
/// @param acc - non-pointer SYCL accessor.
/// @param offsets - per-lane byte offsets (uint64_t when stateless is forced,
/// uint32_t otherwise).
/// @param pred - per-lane predicate; no merge source in this overload.
/// @return gathered vector of N * NElts elements.
template <typename T, int NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          int N, typename AccessorTy>
__ESIMD_API std::enable_if_t<!std::is_pointer_v<AccessorTy>,
                             __ESIMD_NS::simd<T, N * NElts>>
lsc_gather(AccessorTy acc,
#ifdef __ESIMD_FORCE_STATELESS_MEM
           __ESIMD_NS::simd<uint64_t, N> offsets,
#else
           __ESIMD_NS::simd<uint32_t, N> offsets,
#endif
           __ESIMD_NS::simd_mask<N> pred = 1) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  return lsc_gather<T, NElts, DS, L1H, L3H>(acc.get_pointer(), offsets, pred);
#else
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  // 8/16-bit data is widened to 32-bit message lanes (u8u32/u16u32).
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  using MsgT = typename detail::lsc_expand_type<T>::type;
  auto si = __ESIMD_NS::get_surface_index(acc);
  auto loc_offsets = convert<uint32_t>(offsets);
  __ESIMD_NS::simd<MsgT, N * NElts> Tmp =
      __esimd_lsc_load_bti<MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
                           _Transposed, N>(pred.data(), loc_offsets.data(), si);
  return detail::lsc_format_ret<T>(Tmp);
#endif
}
806 
807 #ifdef __ESIMD_FORCE_STATELESS_MEM
/// Stateless-forced helper overload: accepts any integral (non-uint64_t)
/// offset element type and converts the offsets to uint64_t before calling
/// the primary accessor-based lsc_gather overload.
template <typename T, int NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          int N, typename AccessorTy, typename Toffset>
__ESIMD_API std::enable_if_t<!std::is_pointer_v<AccessorTy> &&
                                 std::is_integral_v<Toffset> &&
                                 !std::is_same_v<Toffset, uint64_t>,
                             __ESIMD_NS::simd<T, N * NElts>>
lsc_gather(AccessorTy acc, __ESIMD_NS::simd<Toffset, N> offsets,
           __ESIMD_NS::simd_mask<N> pred = 1) {
  return lsc_gather<T, NElts, DS, L1H, L3H, N, AccessorTy>(
      acc, convert<uint64_t>(offsets), pred);
}
821 #endif
822 
/// Accessor-based gather with merge semantics: disabled lanes receive the
/// corresponding elements of \p old_values.
/// When __ESIMD_FORCE_STATELESS_MEM is defined, the accessor is converted to
/// a raw pointer and the stateless (USM) overload is used; otherwise a
/// binding-table-indexed (stateful) merging LSC load is issued.
/// @param acc - non-pointer SYCL accessor.
/// @param offsets - per-lane byte offsets (uint64_t when stateless is forced,
/// uint32_t otherwise).
/// @param pred - per-lane predicate.
/// @param old_values - values copied through for predicated-off lanes.
/// @return gathered vector of N * NElts elements.
template <typename T, int NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          int N, typename AccessorTy>
__ESIMD_API std::enable_if_t<!std::is_pointer_v<AccessorTy>,
                             __ESIMD_NS::simd<T, N * NElts>>
lsc_gather(AccessorTy acc,
#ifdef __ESIMD_FORCE_STATELESS_MEM
           __ESIMD_NS::simd<uint64_t, N> offsets,
#else
           __ESIMD_NS::simd<uint32_t, N> offsets,
#endif
           __ESIMD_NS::simd_mask<N> pred,
           __ESIMD_NS::simd<T, N * NElts> old_values) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  return lsc_gather<T, NElts, DS, L1H, L3H>(acc.get_pointer(), offsets, pred,
                                            old_values);
#else
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  // 8/16-bit data is widened to 32-bit message lanes (u8u32/u16u32).
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr auto _Transposed = detail::lsc_data_order::nontranspose;
  using MsgT = typename detail::lsc_expand_type<T>::type;
  auto SI = __ESIMD_NS::get_surface_index(acc);
  auto loc_offsets = convert<uint32_t>(offsets);
  // Widen the merge source to the message element type as well.
  __ESIMD_NS::simd<MsgT, N * NElts> OldValuesExpanded =
      detail::lsc_format_input<MsgT>(old_values);
  __ESIMD_NS::simd<MsgT, N * NElts> Result =
      __esimd_lsc_load_merge_bti<MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS,
                                 _VS, _Transposed, N>(
          pred.data(), loc_offsets.data(), SI, OldValuesExpanded.data());
  return detail::lsc_format_ret<T>(Result);
#endif
}
883 
884 #ifdef __ESIMD_FORCE_STATELESS_MEM
/// Stateless-forced helper overload with merge semantics: accepts any
/// integral (non-uint64_t) offset element type and converts the offsets to
/// uint64_t before calling the primary merging accessor-based overload.
template <typename T, int NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          int N, typename AccessorTy, typename Toffset>
__ESIMD_API std::enable_if_t<!std::is_pointer_v<AccessorTy> &&
                                 std::is_integral_v<Toffset> &&
                                 !std::is_same_v<Toffset, uint64_t>,
                             __ESIMD_NS::simd<T, N * NElts>>
lsc_gather(AccessorTy acc, __ESIMD_NS::simd<Toffset, N> offsets,
           __ESIMD_NS::simd_mask<N> pred,
           __ESIMD_NS::simd<T, N * NElts> old_values) {
  return lsc_gather<T, NElts, DS, L1H, L3H, N, AccessorTy>(
      acc, convert<uint64_t>(offsets), pred, old_values);
}
899 #endif
900 
/// USM transposed (block) load: load NElts contiguous elements from \p p as a
/// single-lane transposed LSC load.
///
/// Small element types (u8/u16) cannot be loaded transposed directly, so
/// several of them are packed into one 32- or 64-bit message element
/// (SmallIntFactor) and the result is bit-cast back to T.
///
/// @tparam T - element type.
/// @tparam NElts - number of contiguous elements to load.
/// @tparam DS - data size; default derives from T.
/// @tparam L1H, L3H - L1/L3 cache hints.
/// @tparam FlagsT - alignment tag; default requires dword (<=4-byte T) or
/// qword (8-byte T) alignment, enforced by static_assert below.
/// @param p - base USM pointer.
/// @param pred - single-element predicate; no merge source in this overload.
/// @param flags - alignment tag value (type carries the information).
/// @return the loaded vector, bit-cast back to NElts elements of T.
template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          typename FlagsT = __ESIMD_DNS::dqword_element_aligned_tag>
__ESIMD_API std::enable_if_t<__ESIMD_NS::is_simd_flag_type_v<FlagsT>,
                             __ESIMD_NS::simd<T, NElts>>
lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred = 1,
               FlagsT flags = FlagsT{}) {
  // Verify input template arguments.
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
  constexpr lsc_data_size FDS = detail::finalize_data_size<T, DS>();

  static_assert(FDS == lsc_data_size::u16 || FDS == lsc_data_size::u8 ||
                    FDS == lsc_data_size::u32 || FDS == lsc_data_size::u64,
                "Conversion data types are not supported");
  constexpr auto Alignment =
      FlagsT::template alignment<__ESIMD_DNS::__raw_t<T>>;
  static_assert(
      (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
          (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
      "Incorrect alignment for the data type");

  // How many u8/u16 elements fit in one 32-bit message element.
  constexpr int SmallIntFactor32Bit =
      (FDS == lsc_data_size::u16) ? 2 : (FDS == lsc_data_size::u8 ? 4 : 1);
  static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
                "Number of elements is not supported by Transposed load");

  // Use 64-bit message elements for 8-byte T, or for large default-size loads
  // where the 32-bit element count would exceed the 64-element vector limit.
  constexpr bool Use64BitData =
      Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
      (sizeof(T) == 8 ||
       (DS == lsc_data_size::default_size && NElts / SmallIntFactor32Bit > 64 &&
        (NElts * sizeof(T)) % 8 == 0));
  constexpr int SmallIntFactor64Bit =
      (FDS == lsc_data_size::u16)
          ? 4
          : (FDS == lsc_data_size::u8 ? 8
                                      : (FDS == lsc_data_size::u32 ? 2 : 1));
  constexpr int SmallIntFactor =
      Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
  constexpr int FactoredNElts = NElts / SmallIntFactor;
  detail::check_lsc_vector_size<FactoredNElts>();

  // Prepare template arguments for the call of intrinsic.
  constexpr lsc_data_size ActualDS = Use64BitData
                                         ? __ESIMD_ENS::lsc_data_size::u64
                                         : __ESIMD_ENS::lsc_data_size::u32;

  constexpr detail::lsc_vector_size _VS =
      detail::to_lsc_vector_size<FactoredNElts>();
  using LoadElemT = __ESIMD_DNS::__raw_t<
      std::conditional_t<SmallIntFactor == 1, T,
                         std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;

  constexpr auto _Transposed = detail::lsc_data_order::transpose;
  constexpr int N = 1; // Transposed loads use a single address lane.

  __ESIMD_NS::simd<uintptr_t, N> Addrs = reinterpret_cast<uintptr_t>(p);

  __ESIMD_NS::simd<LoadElemT, FactoredNElts> Result =
      __esimd_lsc_load_stateless<LoadElemT, L1H, L3H, _AddressScale, _ImmOffset,
                                 ActualDS, _VS, _Transposed, N>(pred.data(),
                                                                Addrs.data());
  // Undo the packing: reinterpret the message elements as NElts of T.
  return Result.template bit_cast_view<T>();
}
1004 
/// Convenience overload of lsc_block_load taking only an alignment tag;
/// forwards with an all-enabled single-element predicate.
template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          typename FlagsT = __ESIMD_DNS::dqword_element_aligned_tag>
__ESIMD_API std::enable_if_t<__ESIMD_NS::is_simd_flag_type_v<FlagsT>,
                             __ESIMD_NS::simd<T, NElts>>
lsc_block_load(const T *p, FlagsT flags) {
  return lsc_block_load<T, NElts, DS, L1H, L3H>(p, __ESIMD_NS::simd_mask<1>(1),
                                                flags);
}
1047 
/// USM transposed (block) load with merge semantics: when \p pred is 0 the
/// returned vector is \p old_values.
///
/// Same u8/u16 packing scheme as the non-merging lsc_block_load: several
/// small elements are packed into one 32- or 64-bit message element
/// (SmallIntFactor), and the merge source is bit-cast to the same layout.
///
/// @tparam T - element type.
/// @tparam NElts - number of contiguous elements to load.
/// @tparam DS - data size; default derives from T.
/// @tparam L1H, L3H - L1/L3 cache hints.
/// @tparam FlagsT - alignment tag; dword/qword alignment enforced below.
/// @param p - base USM pointer.
/// @param pred - single-element predicate.
/// @param old_values - values returned when the load is predicated off.
/// @param flags - alignment tag value (type carries the information).
/// @return the loaded (or merged) vector, bit-cast back to NElts of T.
template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          typename FlagsT = __ESIMD_DNS::dqword_element_aligned_tag>
__ESIMD_API std::enable_if_t<__ESIMD_NS::is_simd_flag_type_v<FlagsT>,
                             __ESIMD_NS::simd<T, NElts>>
lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred,
               __ESIMD_NS::simd<T, NElts> old_values, FlagsT flags = FlagsT{}) {
  // Verify input template arguments.
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
  constexpr lsc_data_size FDS = detail::finalize_data_size<T, DS>();
  constexpr auto Alignment =
      FlagsT::template alignment<__ESIMD_DNS::__raw_t<T>>;
  static_assert(
      (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
          (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
      "Incorrect alignment for the data type");
  static_assert(FDS == lsc_data_size::u16 || FDS == lsc_data_size::u8 ||
                    FDS == lsc_data_size::u32 || FDS == lsc_data_size::u64,
                "Conversion data types are not supported");
  // How many u8/u16 elements fit in one 32-bit message element.
  constexpr int SmallIntFactor32Bit =
      (FDS == lsc_data_size::u16) ? 2 : (FDS == lsc_data_size::u8 ? 4 : 1);
  static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
                "Number of elements is not supported by Transposed load");

  // Use 64-bit message elements for 8-byte T, or for large default-size loads
  // where the 32-bit element count would exceed the 64-element vector limit.
  constexpr bool Use64BitData =
      Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
      (sizeof(T) == 8 ||
       (DS == lsc_data_size::default_size && NElts / SmallIntFactor32Bit > 64 &&
        (NElts * sizeof(T)) % 8 == 0));
  constexpr int SmallIntFactor64Bit =
      (FDS == lsc_data_size::u16)
          ? 4
          : (FDS == lsc_data_size::u8 ? 8
                                      : (FDS == lsc_data_size::u32 ? 2 : 1));
  constexpr int SmallIntFactor =
      Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
  constexpr int FactoredNElts = NElts / SmallIntFactor;
  detail::check_lsc_vector_size<FactoredNElts>();

  // Prepare template arguments for the call of intrinsic.
  constexpr lsc_data_size ActualDS = Use64BitData
                                         ? __ESIMD_ENS::lsc_data_size::u64
                                         : __ESIMD_ENS::lsc_data_size::u32;

  constexpr detail::lsc_vector_size _VS =
      detail::to_lsc_vector_size<FactoredNElts>();
  using LoadElemT = __ESIMD_DNS::__raw_t<
      std::conditional_t<SmallIntFactor == 1, T,
                         std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;

  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;

  constexpr auto _Transposed = detail::lsc_data_order::transpose;
  constexpr int N = 1; // Transposed loads use a single address lane.

  __ESIMD_NS::simd<uintptr_t, N> Addrs = reinterpret_cast<uintptr_t>(p);
  // Reinterpret the merge source in the packed message-element layout.
  __ESIMD_NS::simd<LoadElemT, FactoredNElts> OldVals =
      old_values.template bit_cast_view<LoadElemT>();
  __ESIMD_NS::simd<LoadElemT, FactoredNElts> Result =
      __esimd_lsc_load_merge_stateless<LoadElemT, L1H, L3H, _AddressScale,
                                       _ImmOffset, ActualDS, _VS, _Transposed,
                                       N>(pred.data(), Addrs.data(),
                                          OldVals.data());
  // Undo the packing: reinterpret the message elements as NElts of T.
  return Result.template bit_cast_view<T>();
}
1151 
/// Accessor-based transposed (block) load with a predicate.
/// Loads a contiguous block of \c NElts elements of type \c T from the memory
/// referenced by accessor \p acc at byte offset \p offset. If \p pred is 0,
/// the memory is not accessed and the returned values are undefined (this
/// overload has no merge source).
///
/// @tparam T element type.
/// @tparam NElts number of elements to load.
/// @tparam DS data size; used for validation, actual width derived below.
/// @tparam L1H, L3H L1/L3 cache hints.
/// @param acc buffer accessor; when stateless memory is forced it is
///   converted to a raw pointer instead.
/// @param offset byte offset into the accessor's memory.
/// @param pred one-element predicate.
/// @param flags alignment tag (compile-time only).
/// @return simd<T, NElts> of loaded values.
template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          typename AccessorTy,
          typename FlagsT = __ESIMD_DNS::dqword_element_aligned_tag>
__ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value &&
                                 __ESIMD_NS::is_simd_flag_type_v<FlagsT>,
                             __ESIMD_NS::simd<T, NElts>>
lsc_block_load(AccessorTy acc,
#ifdef __ESIMD_FORCE_STATELESS_MEM
               uint64_t offset,
#else
               uint32_t offset,
#endif
               __ESIMD_NS::simd_mask<1> pred = 1, FlagsT flags = FlagsT{}) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  // Stateless mode: lower the accessor to a raw pointer and delegate to the
  // USM-pointer overload.
  return lsc_block_load<T, NElts, DS, L1H, L3H>(
      __ESIMD_DNS::accessorToPointer<T>(acc, offset), pred, flags);
#else // !__ESIMD_FORCE_STATELESS_MEM
  // Verify input template arguments.
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();

  constexpr auto Alignment =
      FlagsT::template alignment<__ESIMD_DNS::__raw_t<T>>;
  // <= 4-byte elements need DWORD alignment; 8-byte elements need QWORD.
  static_assert(
      (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
          (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
      "Incorrect alignment for the data type");

  constexpr lsc_data_size FDS = detail::finalize_data_size<T, DS>();
  static_assert(FDS == lsc_data_size::u16 || FDS == lsc_data_size::u8 ||
                    FDS == lsc_data_size::u32 || FDS == lsc_data_size::u64,
                "Conversion data types are not supported");
  // How many 8/16-bit elements pack into one 32-bit transfer unit.
  constexpr int SmallIntFactor32Bit =
      (FDS == lsc_data_size::u16) ? 2 : (FDS == lsc_data_size::u8 ? 4 : 1);
  static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
                "Number of elements is not supported by Transposed load");
  // Use 64-bit transfer units when alignment permits and either elements are
  // 64-bit or the 32-bit unit count would exceed the LSC limit of 64.
  constexpr bool Use64BitData =
      Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
      (sizeof(T) == 8 ||
       (DS == lsc_data_size::default_size && NElts / SmallIntFactor32Bit > 64 &&
        (NElts * sizeof(T)) % 8 == 0));
  constexpr int SmallIntFactor64Bit =
      (FDS == lsc_data_size::u16)
          ? 4
          : (FDS == lsc_data_size::u8 ? 8
                                      : (FDS == lsc_data_size::u32 ? 2 : 1));
  constexpr int SmallIntFactor =
      Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
  constexpr int FactoredNElts = NElts / SmallIntFactor;
  constexpr lsc_data_size ActualDS = Use64BitData
                                         ? __ESIMD_ENS::lsc_data_size::u64
                                         : __ESIMD_ENS::lsc_data_size::u32;

  detail::check_lsc_vector_size<FactoredNElts>();

  // Prepare template arguments for the call of intrinsic.
  using LoadElemT = __ESIMD_DNS::__raw_t<
      std::conditional_t<SmallIntFactor == 1, T,
                         std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;

  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr auto _VS = detail::to_lsc_vector_size<FactoredNElts>();
  constexpr auto _Transposed = detail::lsc_data_order::transpose;
  constexpr int N = 1;

  // Stateful access: single byte offset plus the accessor's surface index.
  __ESIMD_NS::simd<uint32_t, N> Offsets = offset;
  auto SI = __ESIMD_NS::get_surface_index(acc);
  __ESIMD_NS::simd<LoadElemT, FactoredNElts> Result =
      __esimd_lsc_load_bti<LoadElemT, L1H, L3H, _AddressScale, _ImmOffset,
                           ActualDS, _VS, _Transposed, N>(pred.data(),
                                                          Offsets.data(), SI);
  return Result.template bit_cast_view<T>();
#endif // !__ESIMD_FORCE_STATELESS_MEM
}
1264 
1297 template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
1298  cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
1299  typename AccessorTy,
1300  typename FlagsT = __ESIMD_DNS::dqword_element_aligned_tag>
1301 __ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value &&
1302  __ESIMD_NS::is_simd_flag_type_v<FlagsT>,
1303  __ESIMD_NS::simd<T, NElts>>
1304 lsc_block_load(AccessorTy acc,
1305 #ifdef __ESIMD_FORCE_STATELESS_MEM
1306  uint64_t offset,
1307 #else
1308  uint32_t offset,
1309 #endif
1310  FlagsT flags) {
1311  return lsc_block_load<T, NElts, DS, L1H, L3H>(
1312  acc, offset, __ESIMD_NS::simd_mask<1>(1), flags);
1313 }
1314 
/// Accessor-based transposed (block) load with merging semantics.
/// Loads a contiguous block of \c NElts elements of type \c T from the memory
/// referenced by accessor \p acc at byte offset \p offset. If \p pred is 0,
/// the memory is not accessed and \p old_values is returned instead.
///
/// @tparam T element type.
/// @tparam NElts number of elements to load.
/// @tparam DS data size; used for validation, actual width derived below.
/// @tparam L1H, L3H L1/L3 cache hints.
/// @param acc buffer accessor; lowered to a raw pointer when stateless
///   memory is forced.
/// @param offset byte offset into the accessor's memory.
/// @param pred one-element predicate; 0 yields \p old_values.
/// @param old_values values returned for the disabled case.
/// @param flags alignment tag (compile-time only).
/// @return simd<T, NElts> of loaded (or merged) values.
template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          typename AccessorTy,
          typename FlagsT = __ESIMD_DNS::dqword_element_aligned_tag>
__ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value &&
                                 __ESIMD_NS::is_simd_flag_type_v<FlagsT>,
                             __ESIMD_NS::simd<T, NElts>>
lsc_block_load(AccessorTy acc,
#ifdef __ESIMD_FORCE_STATELESS_MEM
               uint64_t offset,
#else
               uint32_t offset,
#endif
               __ESIMD_NS::simd_mask<1> pred,
               __ESIMD_NS::simd<T, NElts> old_values, FlagsT flags = FlagsT{}) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  // Stateless mode: lower the accessor to a raw pointer and delegate to the
  // USM-pointer merging overload.
  return lsc_block_load<T, NElts, DS, L1H, L3H>(
      __ESIMD_DNS::accessorToPointer<T>(acc, offset), pred, old_values, flags);
#else // !__ESIMD_FORCE_STATELESS_MEM
  // Verify input template arguments.
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
  constexpr lsc_data_size FDS = detail::finalize_data_size<T, DS>();
  constexpr auto Alignment =
      FlagsT::template alignment<__ESIMD_DNS::__raw_t<T>>;
  // <= 4-byte elements need DWORD alignment; 8-byte elements need QWORD.
  static_assert(
      (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
          (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
      "Incorrect alignment for the data type");
  static_assert(FDS == lsc_data_size::u16 || FDS == lsc_data_size::u8 ||
                    FDS == lsc_data_size::u32 || FDS == lsc_data_size::u64,
                "Conversion data types are not supported");
  // How many 8/16-bit elements pack into one 32-bit transfer unit.
  constexpr int SmallIntFactor32Bit =
      (FDS == lsc_data_size::u16) ? 2 : (FDS == lsc_data_size::u8 ? 4 : 1);
  static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
                "Number of elements is not supported by Transposed load");
  // Use 64-bit transfer units when alignment permits and either elements are
  // 64-bit or the 32-bit unit count would exceed the LSC limit of 64.
  constexpr bool Use64BitData =
      Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
      (sizeof(T) == 8 ||
       (DS == lsc_data_size::default_size && NElts / SmallIntFactor32Bit > 64 &&
        (NElts * sizeof(T)) % 8 == 0));
  constexpr int SmallIntFactor64Bit =
      (FDS == lsc_data_size::u16)
          ? 4
          : (FDS == lsc_data_size::u8 ? 8
                                      : (FDS == lsc_data_size::u32 ? 2 : 1));
  constexpr int SmallIntFactor =
      Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
  constexpr int FactoredNElts = NElts / SmallIntFactor;
  constexpr lsc_data_size ActualDS = Use64BitData
                                         ? __ESIMD_ENS::lsc_data_size::u64
                                         : __ESIMD_ENS::lsc_data_size::u32;

  detail::check_lsc_vector_size<FactoredNElts>();

  // Prepare template arguments for the call of intrinsic.
  using LoadElemT = __ESIMD_DNS::__raw_t<
      std::conditional_t<SmallIntFactor == 1, T,
                         std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr auto _VS = detail::to_lsc_vector_size<FactoredNElts>();
  constexpr auto _Transposed = detail::lsc_data_order::transpose;
  constexpr int N = 1;

  // Stateful access: single byte offset plus the accessor's surface index.
  __ESIMD_NS::simd<uint32_t, N> Offsets = offset;
  auto SI = __ESIMD_NS::get_surface_index(acc);
  __ESIMD_NS::simd<LoadElemT, FactoredNElts> OldVals =
      old_values.template bit_cast_view<LoadElemT>();
  __ESIMD_NS::simd<LoadElemT, FactoredNElts> Result =
      __esimd_lsc_load_merge_bti<LoadElemT, L1H, L3H, _AddressScale, _ImmOffset,
                                 ActualDS, _VS, _Transposed, N>(
          pred.data(), Offsets.data(), SI, OldVals.data());
  return Result.template bit_cast_view<T>();
#endif // !__ESIMD_FORCE_STATELESS_MEM
}
1428 
/// USM-pointer gather-style prefetch.
/// For each enabled lane i, prefetches \c NElts elements of type \c T from
/// address p + offsets[i] into the cache levels selected by \c L1H / \c L3H.
/// No data is returned.
///
/// @tparam T element type.
/// @tparam NElts elements per lane.
/// @tparam DS data size.
/// @tparam L1H, L3H L1/L3 cache hints.
/// @param p base address.
/// @param offsets per-lane byte offsets (any integral type).
/// @param pred per-lane predicate; disabled lanes do not prefetch.
template <typename T, int NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          int N, typename Toffset>
__ESIMD_API void lsc_prefetch(const T *p, __ESIMD_NS::simd<Toffset, N> offsets,
                              __ESIMD_NS::simd_mask<N> pred = 1) {
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_cache_hint<detail::lsc_action::prefetch, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  // Sub-DWORD data sizes are widened to 32 bits for the message.
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  using MsgT = typename detail::lsc_expand_type<T>::type;
  // Per-lane absolute addresses: base pointer plus byte offsets.
  __ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
  addrs += convert<uintptr_t>(offsets);
  __esimd_lsc_prefetch_stateless<MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS,
                                 _VS, _Transposed, N>(pred.data(),
                                                      addrs.data());
}
1469 
1470 template <
1471  typename T, int NElts = 1, lsc_data_size DS = lsc_data_size::default_size,
1472  cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, int N,
1473  typename Toffset, typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>>
1474 __ESIMD_API void lsc_prefetch(const T *p,
1475  __ESIMD_NS::simd_view<Toffset, RegionTy> offsets,
1476  __ESIMD_NS::simd_mask<N> pred = 1) {
1477  lsc_prefetch<T, NElts, DS, L1H, L3H, N>(p, offsets.read(), pred);
1478 }
1479 
1480 template <typename T, int NElts = 1,
1481  lsc_data_size DS = lsc_data_size::default_size,
1482  cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
1483  int N, typename Toffset>
1484 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>>
1485 lsc_prefetch(const T *p, Toffset offset, __ESIMD_NS::simd_mask<N> pred = 1) {
1486  lsc_prefetch<T, NElts, DS, L1H, L3H, N>(
1487  p, __ESIMD_NS::simd<Toffset, N>(offset), pred);
1488 }
1489 
/// USM-pointer transposed (block) prefetch.
/// Prefetches a contiguous block of \c NElts elements of type \c T starting
/// at address \p p into the cache levels selected by \c L1H / \c L3H.
///
/// @tparam T element type; effective data size must be u32 or u64.
/// @tparam NElts number of elements to prefetch.
/// @tparam DS data size.
/// @tparam L1H, L3H L1/L3 cache hints.
/// @param p base address.
template <typename T, int NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
__ESIMD_API void lsc_prefetch(const T *p) {
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_cache_hint<detail::lsc_action::prefetch, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();

  static_assert(
      _DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
      "Transposed prefetch is supported only for data size u32 or u64");
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::transpose;
  // Transposed access uses a single channel with an always-on predicate.
  constexpr int N = 1;
  __ESIMD_NS::simd_mask<N> pred = 1;

  __ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
  __esimd_lsc_prefetch_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
                                 _VS, _Transposed, N>(pred.data(),
                                                      addrs.data());
}
1528 
/// Accessor-based gather-style prefetch.
/// For each enabled lane i, prefetches \c NElts elements of type \c T from
/// the accessor's memory at byte offset offsets[i] into the cache levels
/// selected by \c L1H / \c L3H.
///
/// @tparam T element type.
/// @tparam NElts elements per lane.
/// @tparam DS data size.
/// @tparam L1H, L3H L1/L3 cache hints.
/// @param acc buffer accessor; lowered to a raw pointer when stateless
///   memory is forced.
/// @param offsets per-lane byte offsets.
/// @param pred per-lane predicate; disabled lanes do not prefetch.
template <typename T, int NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          int N, typename AccessorTy>
__ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value>
lsc_prefetch(AccessorTy acc,
#ifdef __ESIMD_FORCE_STATELESS_MEM
             __ESIMD_NS::simd<uint64_t, N> offsets,
#else
             __ESIMD_NS::simd<uint32_t, N> offsets,
#endif
             __ESIMD_NS::simd_mask<N> pred = 1) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  // Stateless mode: lower the accessor to a pointer and delegate to the
  // USM-pointer overload.
  return lsc_prefetch<T, NElts, DS, L1H, L3H>(
      __ESIMD_DNS::accessorToPointer<T>(acc), offsets, pred);
#else
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_cache_hint<detail::lsc_action::prefetch, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  // Sub-DWORD data sizes are widened to 32 bits for the message.
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  using MsgT = typename detail::lsc_expand_type<T>::type;
  auto si = __ESIMD_NS::get_surface_index(acc);
  auto loc_offsets = convert<uint32_t>(offsets);
  __esimd_lsc_prefetch_bti<MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
                           _Transposed, N>(pred.data(), loc_offsets.data(), si);
#endif
}
1579 
1580 #ifdef __ESIMD_FORCE_STATELESS_MEM
1581 template <typename T, int NElts = 1,
1582  lsc_data_size DS = lsc_data_size::default_size,
1583  cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
1584  int N, typename AccessorTy, typename Toffset>
1585 __ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value &&
1586  std::is_integral_v<Toffset> &&
1587  !std::is_same_v<Toffset, uint64_t>>
1588 lsc_prefetch(AccessorTy acc, __ESIMD_NS::simd<Toffset, N> offsets,
1589  __ESIMD_NS::simd_mask<N> pred = 1) {
1590  lsc_prefetch<T, NElts, DS, L1H, L3H, N, AccessorTy>(
1591  acc, convert<uint64_t>(offsets), pred);
1592 }
1593 #endif
1594 
/// Accessor-based transposed (block) prefetch.
/// Prefetches a contiguous block of \c NElts elements of type \c T from the
/// accessor's memory at byte offset \p offset into the cache levels selected
/// by \c L1H / \c L3H.
///
/// @tparam T element type; effective data size must be u32 or u64.
/// @tparam NElts number of elements to prefetch.
/// @tparam DS data size.
/// @tparam L1H, L3H L1/L3 cache hints.
/// @param acc buffer accessor; lowered to a raw pointer when stateless
///   memory is forced.
/// @param offset byte offset into the accessor's memory.
template <typename T, int NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          typename AccessorTy>
__ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value>
lsc_prefetch(AccessorTy acc,
#ifdef __ESIMD_FORCE_STATELESS_MEM
             uint64_t offset
#else
             uint32_t offset
#endif
) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  // Stateless mode: lower the accessor to a pointer and delegate to the
  // USM-pointer block prefetch.
  lsc_prefetch<T, NElts, DS, L1H, L3H>(
      __ESIMD_DNS::accessorToPointer<T>(acc, offset));
#else
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_cache_hint<detail::lsc_action::prefetch, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
  static_assert(
      _DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
      "Transposed prefetch is supported only for data size u32 or u64");
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::transpose;
  // Transposed access uses a single channel with an always-on predicate.
  constexpr int N = 1;
  __ESIMD_NS::simd_mask<N> pred = 1;
  __ESIMD_NS::simd<uint32_t, N> offsets = offset;
  auto si = __ESIMD_NS::get_surface_index(acc);
  __esimd_lsc_prefetch_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
                           _Transposed, N>(pred.data(), offsets.data(), si);
#endif
}
1646 
/// SLM (shared local memory) scatter.
/// For each enabled lane i, stores \c NElts elements from \p vals to SLM at
/// byte offset offsets[i]. No cache hints apply to SLM.
///
/// @tparam T element type.
/// @tparam NElts elements per lane.
/// @tparam DS data size.
/// @param offsets per-lane byte offsets into SLM.
/// @param vals data to store, N * NElts elements.
/// @param pred per-lane predicate; disabled lanes do not store.
template <typename T, int NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size, int N>
__ESIMD_API void lsc_slm_scatter(__ESIMD_NS::simd<uint32_t, N> offsets,
                                 __ESIMD_NS::simd<T, N * NElts> vals,
                                 __ESIMD_NS::simd_mask<N> pred = 1) {
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  // Sub-DWORD data sizes are widened to 32 bits for the message.
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  using MsgT = typename detail::lsc_expand_type<T>::type;
  using CstT = typename detail::lsc_bitcast_type<T>::type;
  // Reinterpret the payload as the widened message element type.
  __ESIMD_NS::simd<MsgT, N * NElts> Tmp = vals.template bit_cast_view<CstT>();
  __esimd_lsc_store_slm<MsgT, cache_hint::none, cache_hint::none, _AddressScale,
                        _ImmOffset, _DS, _VS, _Transposed, N>(
      pred.data(), offsets.data(), Tmp.data());
}
1682 
/// SLM (shared local memory) transposed (block) store.
/// Stores a contiguous block of \c NElts elements from \p vals to SLM at
/// byte offset \p offset.
///
/// @tparam T element type; effective data size must be u32 or u64.
/// @tparam NElts number of elements to store.
/// @tparam DS data size.
/// @param offset byte offset into SLM.
/// @param vals data to store.
template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size>
__ESIMD_API void lsc_slm_block_store(uint32_t offset,
                                     __ESIMD_NS::simd<T, NElts> vals) {
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
  static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
                "Transposed store is supported only for data size u32 or u64");
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::transpose;
  // Transposed access uses a single channel with an always-on predicate.
  constexpr int N = 1;
  __ESIMD_NS::simd_mask<N> pred = 1;
  __ESIMD_NS::simd<uint32_t, N> offsets = offset;
  __esimd_lsc_store_slm<T, cache_hint::none, cache_hint::none, _AddressScale,
                        _ImmOffset, _DS, _VS, _Transposed, N>(
      pred.data(), offsets.data(), vals.data());
}
1715 
/// USM-pointer scatter.
/// For each enabled lane i, stores \c NElts elements from \p vals to address
/// p + offsets[i].
///
/// @tparam T element type.
/// @tparam NElts elements per lane.
/// @tparam DS data size.
/// @tparam L1H, L3H L1/L3 cache hints.
/// @param p base address.
/// @param offsets per-lane byte offsets (any integral type).
/// @param vals data to store, N * NElts elements.
/// @param pred per-lane predicate; disabled lanes do not store.
template <typename T, int NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          int N, typename Toffset>
__ESIMD_API void lsc_scatter(T *p, __ESIMD_NS::simd<Toffset, N> offsets,
                             __ESIMD_NS::simd<T, N * NElts> vals,
                             __ESIMD_NS::simd_mask<N> pred = 1) {
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  // Sub-DWORD data sizes are widened to 32 bits for the message.
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  using MsgT = typename detail::lsc_expand_type<T>::type;
  using _CstT = typename detail::lsc_bitcast_type<T>::type;
  // Reinterpret the payload as the widened message element type.
  __ESIMD_NS::simd<MsgT, N * NElts> Tmp = vals.template bit_cast_view<_CstT>();
  // Per-lane absolute addresses: base pointer plus byte offsets.
  __ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
  addrs += convert<uintptr_t>(offsets);
  __esimd_lsc_store_stateless<MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS,
                              _VS, _Transposed, N>(pred.data(), addrs.data(),
                                                   Tmp.data());
}
1760 
1761 template <
1762  typename T, int NElts = 1, lsc_data_size DS = lsc_data_size::default_size,
1763  cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, int N,
1764  typename Toffset, typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>>
1765 __ESIMD_API void lsc_scatter(T *p,
1766  __ESIMD_NS::simd_view<Toffset, RegionTy> offsets,
1767  __ESIMD_NS::simd<T, N * NElts> vals,
1768  __ESIMD_NS::simd_mask<N> pred = 1) {
1769  lsc_scatter<T, NElts, DS, L1H, L3H, N>(p, offsets.read(), vals, pred);
1770 }
1771 
1772 template <typename T, int NElts = 1,
1773  lsc_data_size DS = lsc_data_size::default_size,
1774  cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
1775  int N, typename Toffset>
1776 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> && N == 1>
1777 lsc_scatter(T *p, Toffset offset, __ESIMD_NS::simd<T, N * NElts> vals,
1778  __ESIMD_NS::simd_mask<N> pred = 1) {
1779  lsc_scatter<T, NElts, DS, L1H, L3H, N>(
1780  p, __ESIMD_NS::simd<Toffset, N>(offset), vals, pred);
1781 }
1782 
/// Accessor-based scatter.
/// For each enabled lane i, stores \c NElts elements from \p vals to the
/// accessor's memory at byte offset offsets[i].
///
/// @tparam T element type.
/// @tparam NElts elements per lane.
/// @tparam DS data size.
/// @tparam L1H, L3H L1/L3 cache hints.
/// @param acc buffer accessor; lowered to a raw pointer when stateless
///   memory is forced.
/// @param offsets per-lane byte offsets.
/// @param vals data to store, N * NElts elements.
/// @param pred per-lane predicate; disabled lanes do not store.
template <typename T, int NElts = 1,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          int N, typename AccessorTy>
__ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value>
lsc_scatter(AccessorTy acc,
#ifdef __ESIMD_FORCE_STATELESS_MEM
            __ESIMD_NS::simd<uint64_t, N> offsets,
#else
            __ESIMD_NS::simd<uint32_t, N> offsets,
#endif
            __ESIMD_NS::simd<T, N * NElts> vals,
            __ESIMD_NS::simd_mask<N> pred = 1) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  // Stateless mode: lower the accessor to a pointer and delegate to the
  // USM-pointer overload.
  lsc_scatter<T, NElts, DS, L1H, L3H>(__ESIMD_DNS::accessorToPointer<T>(acc),
                                      offsets, vals, pred);
#else
  detail::check_lsc_vector_size<NElts>();
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  // Sub-DWORD data sizes are widened to 32 bits for the message.
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  using MsgT = typename detail::lsc_expand_type<T>::type;
  using _CstT = typename detail::lsc_bitcast_type<T>::type;
  // Reinterpret the payload as the widened message element type.
  __ESIMD_NS::simd<MsgT, N * NElts> Tmp = vals.template bit_cast_view<_CstT>();
  auto si = __ESIMD_NS::get_surface_index(acc);
  auto loc_offsets = convert<uint32_t>(offsets);
  __esimd_lsc_store_bti<MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
                        _Transposed, N>(pred.data(), loc_offsets.data(),
                                        Tmp.data(), si);
#endif
}
1838 
1839 #ifdef __ESIMD_FORCE_STATELESS_MEM
1840 template <typename T, int NElts = 1,
1841  lsc_data_size DS = lsc_data_size::default_size,
1842  cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
1843  int N, typename AccessorTy, typename Toffset>
1844 __ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value &&
1845  std::is_integral_v<Toffset> &&
1846  !std::is_same_v<Toffset, uint64_t>>
1847 lsc_scatter(AccessorTy acc, __ESIMD_NS::simd<Toffset, N> offsets,
1848  __ESIMD_NS::simd<T, N * NElts> vals,
1849  __ESIMD_NS::simd_mask<N> pred = 1) {
1850  lsc_scatter<T, NElts, DS, L1H, L3H, N, AccessorTy>(
1851  acc, convert<uint64_t>(offsets), vals, pred);
1852 }
1853 #endif
/// USM-pointer transposed (block) store.
/// Stores a contiguous block of \c NElts elements from \p vals to memory
/// starting at address \p p. If \p pred is 0, nothing is stored.
///
/// @tparam T element type.
/// @tparam NElts number of elements to store.
/// @tparam DS data size; used for validation, actual width derived below.
/// @tparam L1H, L3H L1/L3 cache hints.
/// @param p base address; must satisfy the alignment promised by \p FlagsT.
/// @param vals data to store.
/// @param pred one-element predicate.
/// @param flags alignment tag (compile-time only).
template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          typename FlagsT = __ESIMD_DNS::dqword_element_aligned_tag>
__ESIMD_API std::enable_if_t<__ESIMD_NS::is_simd_flag_type_v<FlagsT>>
lsc_block_store(T *p, __ESIMD_NS::simd<T, NElts> vals,
                __ESIMD_NS::simd_mask<1> pred = 1, FlagsT flags = FlagsT{}) {
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
  constexpr auto Alignment =
      FlagsT::template alignment<__ESIMD_DNS::__raw_t<T>>;
  // <= 4-byte elements need DWORD alignment; 8-byte elements need QWORD.
  static_assert(
      (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
          (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
      "Incorrect alignment for the data type");

  // Prepare template arguments for the call of intrinsic.
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
  static_assert(_DS == lsc_data_size::u16 || _DS == lsc_data_size::u8 ||
                    _DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
                "Conversion data types are not supported");
  // Transposed (block) access: a single address, one channel.
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::transpose;
  constexpr int N = 1;
  __ESIMD_NS::simd<uintptr_t, N> Addrs = reinterpret_cast<uintptr_t>(p);

  // How many 8/16-bit elements pack into one 32-bit transfer unit.
  constexpr int SmallIntFactor32Bit =
      (_DS == lsc_data_size::u16) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1);
  static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
                "Number of elements is not supported by Transposed store");

  // Use 64-bit transfer units when alignment permits and either elements are
  // 64-bit or the 32-bit unit count would exceed the LSC limit of 64.
  constexpr bool Use64BitData =
      Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
      (sizeof(T) == 8 ||
       (DS == lsc_data_size::default_size && NElts / SmallIntFactor32Bit > 64 &&
        (NElts * sizeof(T)) % 8 == 0));
  constexpr int SmallIntFactor64Bit =
      (_DS == lsc_data_size::u16)
          ? 4
          : (_DS == lsc_data_size::u8 ? 8
                                      : (_DS == lsc_data_size::u32 ? 2 : 1));
  constexpr int SmallIntFactor =
      Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
  // Number of transfer units actually sent to the LSC.
  constexpr int FactoredNElts = NElts / SmallIntFactor;
  constexpr lsc_data_size ActualDS = Use64BitData
                                         ? __ESIMD_ENS::lsc_data_size::u64
                                         : __ESIMD_ENS::lsc_data_size::u32;

  detail::check_lsc_vector_size<FactoredNElts>();
  constexpr detail::lsc_vector_size _VS =
      detail::to_lsc_vector_size<FactoredNElts>();

  // The raw element type the message operates on (possibly a packed
  // 32/64-bit container for small integer types).
  using StoreType = __ESIMD_DNS::__raw_t<
      std::conditional_t<SmallIntFactor == 1, T,
                         std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;

  __esimd_lsc_store_stateless<StoreType, L1H, L3H, _AddressScale, _ImmOffset,
                              ActualDS, _VS, _Transposed, N>(
      pred.data(), Addrs.data(),
      sycl::bit_cast<__ESIMD_DNS::vector_type_t<StoreType, FactoredNElts>>(
          vals.data()));
}
1949 
1978 template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
1979  cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
1980  typename FlagsT = __ESIMD_DNS::dqword_element_aligned_tag>
1981 __ESIMD_API std::enable_if_t<__ESIMD_NS::is_simd_flag_type_v<FlagsT>>
1982 lsc_block_store(T *p, __ESIMD_NS::simd<T, NElts> vals, FlagsT flags) {
1983  lsc_block_store<T, NElts, DS, L1H, L3H>(p, vals, __ESIMD_NS::simd_mask<1>(1),
1984  flags);
1985 }
1986 
/// Accessor-based transposed (block) store.
/// Stores a contiguous block of \c NElts elements from \p vals to the
/// accessor's memory at byte offset \p offset. If \p pred is 0, nothing is
/// stored.
///
/// @tparam T element type.
/// @tparam NElts number of elements to store.
/// @tparam DS data size; used for validation, actual width derived below.
/// @tparam L1H, L3H L1/L3 cache hints.
/// @param acc buffer accessor; lowered to a raw pointer when stateless
///   memory is forced.
/// @param offset byte offset into the accessor's memory.
/// @param vals data to store.
/// @param pred one-element predicate.
/// @param flags alignment tag (compile-time only).
template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          typename AccessorTy,
          typename FlagsT = __ESIMD_DNS::dqword_element_aligned_tag>
__ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value &&
                                 __ESIMD_NS::is_simd_flag_type_v<FlagsT>>
lsc_block_store(AccessorTy acc,
#ifdef __ESIMD_FORCE_STATELESS_MEM
                uint64_t offset,
#else
                uint32_t offset,
#endif
                __ESIMD_NS::simd<T, NElts> vals,
                __ESIMD_NS::simd_mask<1> pred = 1, FlagsT flags = FlagsT{}) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  // Stateless mode: lower the accessor to a pointer and delegate to the
  // USM-pointer overload.
  lsc_block_store<T, NElts, DS, L1H, L3H>(
      __ESIMD_DNS::accessorToPointer<T>(acc, offset), vals, pred, flags);
#else
  detail::check_lsc_data_size<T, DS>();
  detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
  constexpr auto Alignment =
      FlagsT::template alignment<__ESIMD_DNS::__raw_t<T>>;
  // <= 4-byte elements need DWORD alignment; 8-byte elements need QWORD.
  static_assert(
      (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
          (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
      "Incorrect alignment for the data type");
  // Prepare template arguments for the call of intrinsic.
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
  static_assert(_DS == lsc_data_size::u16 || _DS == lsc_data_size::u8 ||
                    _DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
                "Conversion data types are not supported");
  // Transposed (block) access: a single offset, one channel.
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::transpose;
  constexpr int N = 1;

  __ESIMD_NS::simd<uint32_t, N> Offsets = offset;
  auto si = __ESIMD_NS::get_surface_index(acc);

  // How many 8/16-bit elements pack into one 32-bit transfer unit.
  constexpr int SmallIntFactor32Bit =
      (_DS == lsc_data_size::u16) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1);
  static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
                "Number of elements is not supported by Transposed store");

  // Use 64-bit transfer units when alignment permits and either elements are
  // 64-bit or the 32-bit unit count would exceed the LSC limit of 64.
  constexpr bool Use64BitData =
      Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
      (sizeof(T) == 8 ||
       (DS == lsc_data_size::default_size && NElts / SmallIntFactor32Bit > 64 &&
        (NElts * sizeof(T)) % 8 == 0));
  constexpr int SmallIntFactor64Bit =
      (_DS == lsc_data_size::u16)
          ? 4
          : (_DS == lsc_data_size::u8 ? 8
                                      : (_DS == lsc_data_size::u32 ? 2 : 1));
  constexpr int SmallIntFactor =
      Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
  constexpr int FactoredNElts = NElts / SmallIntFactor;
  constexpr lsc_data_size ActualDS = Use64BitData
                                         ? __ESIMD_ENS::lsc_data_size::u64
                                         : __ESIMD_ENS::lsc_data_size::u32;

  detail::check_lsc_vector_size<FactoredNElts>();
  constexpr detail::lsc_vector_size _VS =
      detail::to_lsc_vector_size<FactoredNElts>();

  // The raw element type the message operates on (possibly a packed
  // 32/64-bit container for small integer types).
  using StoreType = __ESIMD_DNS::__raw_t<
      std::conditional_t<SmallIntFactor == 1, T,
                         std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;

  __esimd_lsc_store_bti<StoreType, L1H, L3H, _AddressScale, _ImmOffset,
                        ActualDS, _VS, _Transposed, N>(
      pred.data(), Offsets.data(),
      sycl::bit_cast<__ESIMD_DNS::vector_type_t<StoreType, FactoredNElts>>(
          vals.data()),
      si);
#endif
}
2099 
2130 template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
2131  cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
2132  typename AccessorTy,
2133  typename FlagsT = __ESIMD_DNS::dqword_element_aligned_tag>
2134 __ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value &&
2135  __ESIMD_NS::is_simd_flag_type_v<FlagsT>>
2136 lsc_block_store(AccessorTy acc,
2137 #ifdef __ESIMD_FORCE_STATELESS_MEM
2138  uint64_t offset,
2139 #else
2140  uint32_t offset,
2141 #endif
2142  __ESIMD_NS::simd<T, NElts> vals, FlagsT flags) {
2143  lsc_block_store<T, NElts, DS, L1H, L3H>(acc, offset, vals,
2144  __ESIMD_NS::simd_mask<1>(1), flags);
2145 }
2146 
2147 namespace detail {
2148 // Compile-time checks for lsc_load_2d/prefetch_2d/store_2d restrictions.
2149 template <typename T, int BlockWidth, int BlockHeight, int NBlocks,
2150  bool Transposed, bool Transformed, bool IsStore = false>
2152  constexpr int GRFByteSize = BlockWidth * BlockHeight * NBlocks * sizeof(T);
2153  static_assert(BlockWidth > 0, "Block width must be positive");
2154  static_assert(BlockHeight > 0, "Block height must be positive");
2155  // Restrictions based on documentation.
2156  static_assert(!IsStore || GRFByteSize <= 512,
2157  "2D store supports 512 bytes max");
2158  static_assert(IsStore || GRFByteSize <= 2048,
2159  "2D load supports 2048 bytes max");
2160  static_assert(!Transposed || !Transformed,
2161  "Transposed and transformed is not supported");
2162  static_assert((sizeof(T) * BlockWidth) % 4 == 0,
2163  "Block width must be aligned by DW");
2164  if constexpr (Transposed) {
2165  static_assert(NBlocks == 1, "Transposed expected to be 1 block only");
2166  static_assert(sizeof(T) == 4 || sizeof(T) == 8,
2167  "Transposed load is supported only for data size u32 or u64");
2168  static_assert(sizeof(T) == 8 ? BlockHeight == 8
2169  : BlockHeight >= 1 && BlockHeight <= 32,
2170  "Unsupported block height");
2171  static_assert(sizeof(T) == 8 ? __ESIMD_DNS::isPowerOf2(BlockWidth, 4)
2172  : BlockWidth >= 1 && BlockWidth <= 8,
2173  "Unsupported block width");
2174  } else if constexpr (Transformed) {
2175  static_assert(sizeof(T) == 1 || sizeof(T) == 2,
2176  "VNNI transform is supported only for data size u8 or u16");
2177  static_assert(__ESIMD_DNS::isPowerOf2(NBlocks, 4),
2178  "Unsupported number of blocks");
2179  static_assert(BlockHeight * sizeof(T) >= 4 && BlockHeight <= 32,
2180  "Unsupported block height");
2181  static_assert(BlockWidth * sizeof(T) >= 4 && BlockWidth <= 16 &&
2182  BlockWidth * NBlocks * sizeof(T) <= 64,
2183  "Unsupported block width");
2184  } else {
2185  static_assert(
2186  __ESIMD_DNS::isPowerOf2(NBlocks, sizeof(T) == 1 ? 4 : 8 / sizeof(T)),
2187  "Unsupported number of blocks");
2188  if constexpr (IsStore)
2189  static_assert(BlockHeight <= 8, "Unsupported block height for store");
2190  else
2191  static_assert(BlockHeight <= 32, "Unsupported block height for load");
2192  static_assert(BlockWidth * sizeof(T) >= 4 &&
2193  BlockWidth * NBlocks * sizeof(T) <= 64,
2194  "Unsupported block width");
2195  }
2196 }
2197 
2198 } // namespace detail
2199 
2230 template <typename T, int BlockWidth, int BlockHeight = 1, int NBlocks = 1,
2231  bool Transposed = false, bool Transformed = false,
2232  cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
2234  T, NBlocks, BlockHeight, BlockWidth, Transposed, Transformed>()>
2235 __ESIMD_API __ESIMD_NS::simd<T, N>
2236 lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight,
2237  unsigned SurfacePitch, int X, int Y) {
2238  detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
2239  detail::check_lsc_block_2d_restrictions<T, BlockWidth, BlockHeight, NBlocks,
2240  Transposed, Transformed>();
2241  // For Load BlockWidth is padded up to the next power-of-two value.
2242  // For Load with Transpose the pre-operation BlockHeight is padded up
2243  // to the next power-of-two value.
2244  // For Load with Transform pre-operation BlockHeight is padded up to
2245  // multiple of K, where K = 4B / sizeof(T).
2246  constexpr int ElemsPerDword = 4 / sizeof(T);
2247  constexpr int GRFRowSize = Transposed ? BlockHeight
2248  : Transformed ? BlockWidth * ElemsPerDword
2249  : BlockWidth;
2250  constexpr int GRFRowPitch = __ESIMD_DNS::getNextPowerOf2<GRFRowSize>();
2251  constexpr int GRFColSize =
2252  Transposed
2253  ? BlockWidth
2254  : (Transformed ? (BlockHeight + ElemsPerDword - 1) / ElemsPerDword
2255  : BlockHeight);
2256  constexpr int GRFBlockSize = GRFRowPitch * GRFColSize;
2257  constexpr int GRFBlockPitch =
2258  detail::roundUpNextMultiple<64 / sizeof(T), GRFBlockSize>();
2259  constexpr int ActualN = NBlocks * GRFBlockPitch;
2260 
2261  constexpr int DstBlockElements = GRFColSize * GRFRowSize;
2262  constexpr int DstElements = DstBlockElements * NBlocks;
2263 
2264  static_assert(N == ActualN || N == DstElements, "Incorrect element count");
2265 
2266  constexpr lsc_data_size DS =
2267  detail::finalize_data_size<T, lsc_data_size::default_size>();
2268  __ESIMD_NS::simd_mask<ActualN> pred = 1;
2269  uintptr_t surf_addr = reinterpret_cast<uintptr_t>(Ptr);
2270  constexpr detail::lsc_data_order _Transposed =
2271  Transposed ? detail::lsc_data_order::transpose
2272  : detail::lsc_data_order::nontranspose;
2273  __ESIMD_NS::simd<T, ActualN> Raw =
2274  __esimd_lsc_load2d_stateless<T, L1H, L3H, DS, _Transposed, NBlocks,
2275  BlockWidth, BlockHeight, Transformed,
2276  ActualN>(pred.data(), surf_addr,
2277  SurfaceWidth, SurfaceHeight,
2278  SurfacePitch, X, Y);
2279 
2280  if constexpr (ActualN == N) {
2281  return Raw;
2282  } else {
2283  // HW restrictions force data which is read to contain padding filled with
2284  // zeros for 2d lsc loads. This code eliminates such padding.
2285 
2286  // For example, 2D block load of 5 elements of 1 byte data type will
2287  // take 8 bytes per row for each block.
2288  //
2289  // +----+----+----+----+----+----+-----+-----+
2290  // | 00 | 01 | 02 | 03 | 04 | 05 | 06* | 07* |
2291  // +----+----+----+----+----+----+-----+-----+
2292  // | 10 | 11 | 12 | 13 | 14 | 15 | 16* | 17* |
2293  // +----+----+----+----+----+----+-----+-----+
2294  // | 20 | 21 | 22 | 23 | 24 | 25 | 26* | 27* |
2295  // +----+----+----+----+----+----+-----+-----+
2296  // | 30 | 31 | 32 | 33 | 34 | 35 | 36* | 37* |
2297  // +----+----+----+----+----+----+-----+-----+
2298  // * signifies the padded element.
2299 
2300  __ESIMD_NS::simd<T, DstElements> Dst;
2301 
2302  for (auto i = 0; i < NBlocks; i++) {
2303  auto DstBlock =
2304  Dst.template select<DstBlockElements, 1>(i * DstBlockElements);
2305 
2306  auto RawBlock = Raw.template select<GRFBlockSize, 1>(i * GRFBlockPitch);
2307  DstBlock = RawBlock.template bit_cast_view<T, GRFColSize, GRFRowPitch>()
2308  .template select<GRFColSize, 1, GRFRowSize, 1>(0, 0)
2309  .template bit_cast_view<T>();
2310  }
2311 
2312  return Dst;
2313  }
2314 }
2315 
2316 template <typename T, int BlockWidth, int BlockHeight = 1, int NBlocks = 1,
2317  bool Transposed = false, bool Transformed = false,
2318  cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
2320  T, NBlocks, BlockHeight, BlockWidth, Transposed, Transformed>()>
2321 __SYCL_DEPRECATED("use lsc_load_2d()")
2322 __ESIMD_API __ESIMD_NS::simd<T, N> lsc_load2d(const T *Ptr,
2323  unsigned SurfaceWidth,
2324  unsigned SurfaceHeight,
2325  unsigned SurfacePitch, int X,
2326  int Y) {
2327  return lsc_load_2d<T, BlockWidth, BlockHeight, NBlocks, Transposed,
2328  Transformed, L1H, L3H>(Ptr, SurfaceWidth, SurfaceHeight,
2329  SurfacePitch, X, Y);
2330 }
2331 
2354 template <typename T, int BlockWidth, int BlockHeight = 1, int NBlocks = 1,
2355  cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
2357  T, NBlocks, BlockHeight, BlockWidth, false, false>()>
2358 __ESIMD_API void lsc_prefetch_2d(const T *Ptr, unsigned SurfaceWidth,
2359  unsigned SurfaceHeight, unsigned SurfacePitch,
2360  int X, int Y) {
2361  detail::check_lsc_cache_hint<detail::lsc_action::prefetch, L1H, L3H>();
2362  detail::check_lsc_block_2d_restrictions<T, BlockWidth, BlockHeight, NBlocks,
2363  false, false>();
2364  constexpr lsc_data_size DS =
2365  detail::finalize_data_size<T, lsc_data_size::default_size>();
2366  __ESIMD_NS::simd_mask<N> pred = 1;
2367  uintptr_t surf_addr = reinterpret_cast<uintptr_t>(Ptr);
2368  constexpr detail::lsc_data_order _Transposed =
2369  detail::lsc_data_order::nontranspose;
2370  __esimd_lsc_prefetch2d_stateless<T, L1H, L3H, DS, _Transposed, NBlocks,
2371  BlockWidth, BlockHeight, false, N>(
2372  pred.data(), surf_addr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
2373 }
2374 
2375 template <typename T, int BlockWidth, int BlockHeight = 1, int NBlocks = 1,
2376  cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
2378  T, NBlocks, BlockHeight, BlockWidth, false, false>()>
2379 __SYCL_DEPRECATED("use lsc_prefetch_2d()")
2380 __ESIMD_API void lsc_prefetch2d(const T *Ptr, unsigned SurfaceWidth,
2381  unsigned SurfaceHeight, unsigned SurfacePitch,
2382  int X, int Y) {
2383  lsc_prefetch_2d<T, BlockWidth, BlockHeight, NBlocks, L1H, L3H>(
2384  Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
2385 }
2410 template <typename T, int BlockWidth, int BlockHeight = 1,
2411  cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
2413  T, 1u, BlockHeight, BlockWidth, false, false>()>
2414 __ESIMD_API void lsc_store_2d(T *Ptr, unsigned SurfaceWidth,
2415  unsigned SurfaceHeight, unsigned SurfacePitch,
2416  int X, int Y, __ESIMD_NS::simd<T, N> Vals) {
2417  detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
2418  detail::check_lsc_block_2d_restrictions<T, BlockWidth, BlockHeight, 1, false,
2419  false, true /*IsStore*/>();
2420  constexpr lsc_data_size DS =
2421  detail::finalize_data_size<T, lsc_data_size::default_size>();
2422  uintptr_t surf_addr = reinterpret_cast<uintptr_t>(Ptr);
2423  constexpr detail::lsc_data_order _Transposed =
2424  detail::lsc_data_order::nontranspose;
2425 
2426  constexpr int Pitch = __ESIMD_DNS::getNextPowerOf2<BlockWidth>();
2427  __ESIMD_NS::simd<T, BlockHeight * Pitch> Raw;
2428 
2429  if constexpr (BlockHeight * Pitch == N) {
2430  Raw = Vals;
2431  } else {
2432  // For store with padding, allocate the block with padding, and place
2433  // original data there.
2434  auto Data2D = Vals.template bit_cast_view<T, BlockHeight, BlockWidth>();
2435  auto Raw2D = Raw.template bit_cast_view<T, BlockHeight, Pitch>();
2436  Raw2D.template select<BlockHeight, 1, BlockWidth, 1>(0, 0) = Data2D;
2437  }
2438 
2439  __ESIMD_NS::simd_mask<BlockHeight * Pitch> pred = 1;
2440  __esimd_lsc_store2d_stateless<T, L1H, L3H, DS, _Transposed, 1u, BlockWidth,
2441  BlockHeight, false, BlockHeight * Pitch>(
2442  pred.data(), surf_addr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y,
2443  Raw.data());
2444 }
2445 
2446 template <typename T, int BlockWidth, int BlockHeight = 1,
2447  cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
2449  T, 1u, BlockHeight, BlockWidth, false, false>()>
2450 __SYCL_DEPRECATED("use lsc_store_2d()")
2451 __ESIMD_API void lsc_store2d(T *Ptr, unsigned SurfaceWidth,
2452  unsigned SurfaceHeight, unsigned SurfacePitch,
2453  int X, int Y, __ESIMD_NS::simd<T, N> Vals) {
2454  lsc_store_2d<T, BlockWidth, BlockHeight, L1H, L3H>(
2455  Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y, Vals);
2456 }
2457 
2465 template <typename T, int BlockWidth, int BlockHeight, int NBlocks>
2467 public:
2471  config_2d_mem_access() : payload_data(0) {
2472  payload_data.template select<1, 1>(7) =
2473  ((NBlocks - 1) << 16) | ((BlockHeight - 1) << 8) | (BlockWidth - 1);
2474  }
2475 
2480  : payload_data(other.payload) {}
2481 
2493  config_2d_mem_access(const T *Ptr, uint32_t SurfaceWidth,
2494  uint32_t SurfaceHeight, uint32_t SurfacePitch, int32_t X,
2495  int32_t Y)
2496  : config_2d_mem_access() {
2497  payload_data.template bit_cast_view<uint64_t>().template select<1, 1>(0) =
2498  (uint64_t)Ptr;
2499  payload_data.template select<1, 1>(2) = SurfaceWidth;
2500  payload_data.template select<1, 1>(3) = SurfaceHeight;
2501  payload_data.template select<1, 1>(4) = SurfacePitch;
2502  payload_data.template select<1, 1>(5) = X;
2503  payload_data.template select<1, 1>(6) = Y;
2504  }
2505 
2510  T *get_data_pointer() const {
2511  return (T *)((
2512  uint64_t)(const_cast<config_2d_mem_access *>(this)
2513  ->payload_data.template bit_cast_view<uint64_t>()[0]));
2514  }
2515 
2520  uint32_t get_surface_width() const {
2521  return const_cast<config_2d_mem_access *>(this)
2522  ->payload_data.template select<1, 1>(2);
2523  }
2524 
2529  uint32_t get_surface_height() const {
2530  return const_cast<config_2d_mem_access *>(this)
2531  ->payload_data.template select<1, 1>(3);
2532  }
2533 
2538  uint32_t get_surface_pitch() const {
2539  return const_cast<config_2d_mem_access *>(this)
2540  ->payload_data.template select<1, 1>(4);
2541  }
2542 
2547  int32_t get_x() const {
2548  return const_cast<config_2d_mem_access *>(this)
2549  ->payload_data.template select<1, 1>(5);
2550  }
2551 
2556  int32_t get_y() const {
2557  return const_cast<config_2d_mem_access *>(this)
2558  ->payload_data.template select<1, 1>(6);
2559  }
2560 
2565  constexpr int32_t get_width() const { return BlockWidth; }
2566 
2571  constexpr int32_t get_height() const { return BlockHeight; }
2572 
2577  constexpr int32_t get_number_of_blocks() const { return NBlocks; }
2578 
2585  payload_data.template bit_cast_view<uint64_t>().template select<1, 1>(0) =
2586  (uint64_t)Ptr;
2587  return *this;
2588  }
2589 
2595  config_2d_mem_access &set_surface_width(uint32_t SurfaceWidth) {
2596  payload_data.template select<1, 1>(2) = SurfaceWidth;
2597  return *this;
2598  }
2599 
2605  config_2d_mem_access &set_surface_height(uint32_t SurfaceHeight) {
2606  payload_data.template select<1, 1>(3) = SurfaceHeight;
2607  return *this;
2608  }
2609 
2615  config_2d_mem_access &set_surface_pitch(uint32_t SurfacePitch) {
2616  payload_data.template select<1, 1>(4) = SurfacePitch;
2617  return *this;
2618  }
2619 
2626  payload_data.template select<1, 1>(5) = X;
2627  return *this;
2628  }
2629 
2636  payload_data.template select<1, 1>(6) = Y;
2637  return *this;
2638  }
2639 
2640 private:
2641  __ESIMD_NS::simd<uint32_t, 16> get_raw_data() { return payload_data; }
2642  __ESIMD_NS::simd<uint32_t, 16> payload_data;
2643 
2644  template <typename T1, int BlockWidth1, int BlockHeight1, int NBlocks1,
2645  bool Transposed1, bool Transformed1, cache_hint L1H, cache_hint L3H,
2646  int N>
2647  friend ESIMD_INLINE SYCL_ESIMD_FUNCTION __ESIMD_NS::simd<T1, N> lsc_load_2d(
2648  config_2d_mem_access<T1, BlockWidth1, BlockHeight1, NBlocks1> &payload);
2649 
2650  template <typename T1, int BlockWidth1, int BlockHeight1, int NBlocks1,
2651  cache_hint L1H, cache_hint L3H, int N>
2652  friend ESIMD_INLINE SYCL_ESIMD_FUNCTION void lsc_store_2d(
2653  config_2d_mem_access<T1, BlockWidth1, BlockHeight1, NBlocks1> &payload,
2654  __ESIMD_NS::simd<T1, N> Data);
2655 
2656  template <typename T1, int BlockWidth1, int BlockHeight1, int NBlocks1,
2657  bool Transposed1, bool Transformed1, cache_hint L1H, cache_hint L3H,
2658  int N>
2659  friend ESIMD_INLINE SYCL_ESIMD_FUNCTION void lsc_prefetch_2d(
2660  config_2d_mem_access<T1, BlockWidth1, BlockHeight1, NBlocks1> &payload);
2661 };
2662 
2682 template <typename T, int BlockWidth, int BlockHeight = 1, int NBlocks = 1,
2683  bool Transposed = false, bool Transformed = false,
2684  cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
2686  T, NBlocks, BlockHeight, BlockWidth, Transposed, Transformed>()>
2687 ESIMD_INLINE SYCL_ESIMD_FUNCTION __ESIMD_NS::simd<T, N> lsc_load_2d(
2689  detail::check_lsc_block_2d_restrictions<T, BlockWidth, BlockHeight, NBlocks,
2690  Transposed, Transformed, false>();
2691  detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
2692  constexpr int ElemsPerDword = 4 / sizeof(T);
2693  constexpr int GRFRowSize = Transposed ? BlockHeight
2694  : Transformed ? BlockWidth * ElemsPerDword
2695  : BlockWidth;
2696  constexpr int GRFRowPitch = __ESIMD_DNS::getNextPowerOf2<GRFRowSize>();
2697  constexpr int GRFColSize =
2698  Transposed
2699  ? BlockWidth
2700  : (Transformed ? (BlockHeight + ElemsPerDword - 1) / ElemsPerDword
2701  : BlockHeight);
2702  constexpr int GRFBlockSize = GRFRowPitch * GRFColSize;
2703  constexpr int GRFBlockPitch =
2704  detail::roundUpNextMultiple<64 / sizeof(T), GRFBlockSize>();
2705  constexpr int ActualN = NBlocks * GRFBlockPitch;
2706 
2707  constexpr int DstBlockElements = GRFColSize * GRFRowSize;
2708  constexpr int DstElements = DstBlockElements * NBlocks;
2709 
2710  static_assert(N == ActualN || N == DstElements, "Incorrect element count");
2711 
2712  constexpr uint32_t cache_mask = detail::get_lsc_load_cache_mask<L1H, L3H>()
2713  << 17;
2714  constexpr uint32_t base_desc = 0x2800403;
2715  constexpr uint32_t transformMask = Transformed ? 1 << 7 : 0;
2716  constexpr uint32_t transposeMask = Transposed ? 1 << 15 : 0;
2717  __ESIMD_NS::simd<T, N> oldDst;
2718  constexpr uint32_t exDesc = 0x0;
2719  constexpr uint32_t desc =
2720  base_desc | cache_mask | transformMask | transposeMask;
2721  constexpr uint8_t execSize = 0x0;
2722  constexpr uint8_t sfid = 0xF;
2723  constexpr uint8_t numSrc0 = 0x1;
2724  constexpr uint8_t numDst = (N * sizeof(T)) / 64;
2725  __ESIMD_NS::simd<T, ActualN> Raw =
2726  raw_send(oldDst, payload.get_raw_data(), exDesc, desc, execSize, sfid,
2727  numSrc0, numDst);
2728 
2729  if constexpr (ActualN == N) {
2730  return Raw;
2731  } else {
2732  // HW restrictions force data which is read to contain padding filled with
2733  // zeros for 2d lsc loads. This code eliminates such padding.
2734 
2735  __ESIMD_NS::simd<T, DstElements> Dst;
2736 
2737  for (auto i = 0; i < NBlocks; i++) {
2738  auto DstBlock =
2739  Dst.template select<DstBlockElements, 1>(i * DstBlockElements);
2740 
2741  auto RawBlock = Raw.template select<GRFBlockSize, 1>(i * GRFBlockPitch);
2742  DstBlock = RawBlock.template bit_cast_view<T, GRFColSize, GRFRowPitch>()
2743  .template select<GRFColSize, 1, GRFRowSize, 1>(0, 0)
2744  .template bit_cast_view<T>();
2745  }
2746 
2747  return Dst;
2748  }
2749 }
2750 
2767 template <typename T, int BlockWidth, int BlockHeight = 1, int NBlocks = 1,
2768  bool Transposed = false, bool Transformed = false,
2769  cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
2771  T, NBlocks, BlockHeight, BlockWidth, Transposed, Transformed>()>
2772 ESIMD_INLINE SYCL_ESIMD_FUNCTION void lsc_prefetch_2d(
2774  detail::check_lsc_cache_hint<detail::lsc_action::prefetch, L1H, L3H>();
2775  detail::check_lsc_block_2d_restrictions<T, BlockWidth, BlockHeight, NBlocks,
2776  Transposed, Transformed, false>();
2777  static_assert(!Transposed || !Transformed,
2778  "Transposed and transformed is not supported");
2779  constexpr uint32_t cache_mask = detail::get_lsc_load_cache_mask<L1H, L3H>()
2780  << 17;
2781  constexpr uint32_t base_desc = 0x2000403;
2782  constexpr uint32_t transformMask = Transformed ? 1 << 7 : 0;
2783  constexpr uint32_t transposeMask = Transposed ? 1 << 15 : 0;
2784  constexpr uint32_t exDesc = 0x0;
2785  constexpr uint32_t desc =
2786  base_desc | cache_mask | transformMask | transposeMask;
2787  constexpr uint8_t execSize = 0x0;
2788  constexpr uint8_t sfid = 0xF;
2789  constexpr uint8_t numDst = (N * sizeof(T)) / 64;
2790  raw_send(payload.get_raw_data(), exDesc, desc, execSize, sfid, numDst);
2791 }
2792 
2808 template <typename T, int BlockWidth, int BlockHeight = 1, int NBlocks = 1,
2809  cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
2811  T, NBlocks, BlockHeight, BlockWidth, false, false>()>
2812 ESIMD_INLINE SYCL_ESIMD_FUNCTION void
2814  __ESIMD_NS::simd<T, N> Data) {
2815  detail::check_lsc_block_2d_restrictions<T, BlockWidth, BlockHeight, NBlocks,
2816  false, false, true>();
2817  detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
2818 
2819  constexpr uint32_t cache_mask = detail::get_lsc_store_cache_mask<L1H, L3H>()
2820  << 17;
2821  constexpr uint32_t base_desc = 0x2000407;
2822 
2823  constexpr uint32_t exDesc = 0x0;
2824  constexpr uint32_t desc = base_desc | cache_mask;
2825  constexpr uint8_t execSize = 0x0;
2826  constexpr uint8_t sfid = 0xF;
2827  constexpr uint8_t numSrc0 = 0x1;
2828  constexpr uint8_t numSrc1 = (N * sizeof(T)) / 64;
2829 
2830  raw_sends(payload.get_raw_data(), Data, exDesc, desc, execSize, sfid, numSrc0,
2831  numSrc1);
2832 }
2833 
/// SLM (shared local memory) gather-style atomic update with no extra source
/// operands (the 0-argument form, e.g. increment/decrement).
/// Cache hints are hardcoded to none — they do not apply to SLM.
/// @tparam Op the atomic operation; T element type (16- or 32-bit only);
///         N number of lanes; DS data size.
/// @param offsets per-lane byte offsets into SLM.
/// @param pred per-lane predicate; disabled lanes do not update memory.
/// @return per API convention, the values at the memory locations before the
///         update (formatted back from the message type).
template <__ESIMD_NS::atomic_op Op, typename T, int N,
          lsc_data_size DS = lsc_data_size::default_size>
__ESIMD_API __ESIMD_NS::simd<T, N>
lsc_slm_atomic_update(__ESIMD_NS::simd<uint32_t, N> offsets,
                      __ESIMD_NS::simd_mask<N> pred) {
  static_assert(sizeof(T) == 2 || sizeof(T) == 4, "Unsupported data type");
  // Atomics operate on one element per lane only.
  __ESIMD_EDNS::check_lsc_vector_size<1>();
  __ESIMD_EDNS::check_lsc_data_size<T, DS>();
  // Map the generic atomic op to its LSC encoding and verify it takes
  // 0 extra source operands.
  constexpr __ESIMD_NS::native::lsc::atomic_op _Op =
      __ESIMD_DNS::to_lsc_atomic_op<Op>();
  __ESIMD_EDNS::check_lsc_atomic<_Op, T, N, 0>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  // Sub-dword element types are expanded to a full dword in the message.
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<1>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  using MsgT = typename detail::lsc_expand_type<T>::type;
  __ESIMD_NS::simd<MsgT, N> Tmp =
      __esimd_lsc_xatomic_slm_0<MsgT, _Op, cache_hint::none, cache_hint::none,
                                _AddressScale, _ImmOffset, _DS, _VS,
                                _Transposed, N>(pred.data(), offsets.data());
  // Narrow the expanded message result back to T.
  return detail::lsc_format_ret<T>(Tmp);
}
2872 
/// SLM gather-style atomic update with one extra source operand (e.g. add,
/// min, max). Floating-point add/sub are rejected — not supported on SLM.
/// @tparam Op the atomic operation; T element type (16- or 32-bit only);
///         N number of lanes; DS data size.
/// @param offsets per-lane byte offsets into SLM.
/// @param src0 per-lane operand of the atomic operation.
/// @param pred per-lane predicate; disabled lanes do not update memory.
/// @return per API convention, the values at the memory locations before the
///         update.
template <__ESIMD_NS::atomic_op Op, typename T, int N,
          lsc_data_size DS = lsc_data_size::default_size>
__ESIMD_API __ESIMD_NS::simd<T, N>
lsc_slm_atomic_update(__ESIMD_NS::simd<uint32_t, N> offsets,
                      __ESIMD_NS::simd<T, N> src0,
                      __ESIMD_NS::simd_mask<N> pred) {
  static_assert(Op != __ESIMD_NS::atomic_op::fadd &&
                    Op != __ESIMD_NS::atomic_op::fsub,
                "fadd and fsub are not supported for slm.");
  static_assert(sizeof(T) == 2 || sizeof(T) == 4, "Unsupported data type");
  detail::check_lsc_vector_size<1>();
  detail::check_lsc_data_size<T, DS>();
  // Map to the LSC encoding and verify the op takes exactly 1 source operand.
  constexpr __ESIMD_NS::native::lsc::atomic_op _Op =
      __ESIMD_DNS::to_lsc_atomic_op<Op>();
  __ESIMD_EDNS::check_lsc_atomic<_Op, T, N, 1>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  // Sub-dword element types are expanded to a full dword in the message.
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<1>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  using MsgT = typename detail::lsc_expand_type<T>::type;
  // Widen the operand to the message element type.
  __ESIMD_NS::simd<MsgT, N> Msg_data = detail::lsc_format_input<MsgT>(src0);
  __ESIMD_NS::simd<MsgT, N> Tmp =
      __esimd_lsc_xatomic_slm_1<MsgT, _Op, cache_hint::none, cache_hint::none,
                                _AddressScale, _ImmOffset, _DS, _VS,
                                _Transposed, N>(pred.data(), offsets.data(),
                                                Msg_data.data());
  return detail::lsc_format_ret<T>(Tmp);
}
2918 
/// SLM gather-style atomic update with two extra source operands
/// (compare-exchange family). 64-bit elements are allowed only for cmpxchg.
/// @tparam Op the atomic operation; T element type; N number of lanes;
///         DS data size.
/// @param offsets per-lane byte offsets into SLM.
/// @param src0 first operand (e.g. expected value for cmpxchg).
/// @param src1 second operand (e.g. replacement value for cmpxchg).
/// @param pred per-lane predicate; disabled lanes do not update memory.
/// @return per API convention, the values at the memory locations before the
///         update.
template <__ESIMD_NS::atomic_op Op, typename T, int N,
          lsc_data_size DS = lsc_data_size::default_size>
__ESIMD_API __ESIMD_NS::simd<T, N>
lsc_slm_atomic_update(__ESIMD_NS::simd<uint32_t, N> offsets,
                      __ESIMD_NS::simd<T, N> src0, __ESIMD_NS::simd<T, N> src1,
                      __ESIMD_NS::simd_mask<N> pred) {
  static_assert(sizeof(T) == 2 || sizeof(T) == 4 ||
                    (Op == __ESIMD_NS::atomic_op::cmpxchg && sizeof(T) == 8),
                "Unsupported data type");
  detail::check_lsc_vector_size<1>();
  detail::check_lsc_data_size<T, DS>();
  // Map to the LSC encoding and verify the op takes exactly 2 source operands.
  constexpr __ESIMD_NS::native::lsc::atomic_op _Op =
      __ESIMD_DNS::to_lsc_atomic_op<Op>();
  __ESIMD_EDNS::check_lsc_atomic<_Op, T, N, 2>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  // Sub-dword element types are expanded to a full dword in the message.
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<1>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  using MsgT = typename detail::lsc_expand_type<T>::type;
  // Widen both operands to the message element type.
  __ESIMD_NS::simd<MsgT, N> Msg_data0 = detail::lsc_format_input<MsgT>(src0);
  __ESIMD_NS::simd<MsgT, N> Msg_data1 = detail::lsc_format_input<MsgT>(src1);
  __ESIMD_NS::simd<MsgT, N> Tmp =
      __esimd_lsc_xatomic_slm_2<MsgT, _Op, cache_hint::none, cache_hint::none,
                                _AddressScale, _ImmOffset, _DS, _VS,
                                _Transposed, N>(
          pred.data(), offsets.data(), Msg_data0.data(), Msg_data1.data());
  return detail::lsc_format_ret<T>(Tmp);
}
2965 
/// USM (flat address) gather-style atomic update with no extra source
/// operands; enabled only for ops whose LSC encoding takes 0 arguments.
/// Per-lane addresses are p + offsets[i].
/// @tparam Op the atomic operation; T element type (must be wider than one
///         byte); N number of lanes; DS data size; L1H/L3H cache hints.
/// @param p base USM pointer.
/// @param offsets per-lane byte offsets added to p.
/// @param pred per-lane predicate; disabled lanes do not update memory.
/// @return per API convention, the values at the memory locations before the
///         update.
template <__ESIMD_NS::atomic_op Op, typename T, int N,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          typename Toffset>
__ESIMD_API std::enable_if_t<
    __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op<Op>()>() == 0,
    __ESIMD_NS::simd<T, N>>
lsc_atomic_update(T *p, __ESIMD_NS::simd<Toffset, N> offsets,
                  __ESIMD_NS::simd_mask<N> pred) {
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
  static_assert(sizeof(T) > 1, "Unsupported data type");
  detail::check_lsc_vector_size<1>();
  detail::check_lsc_data_size<T, DS>();
  constexpr __ESIMD_NS::native::lsc::atomic_op _Op =
      __ESIMD_DNS::to_lsc_atomic_op<Op>();
  __ESIMD_EDNS::check_lsc_atomic<_Op, T, N, 0>();
  detail::check_lsc_cache_hint<detail::lsc_action::atomic, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  // Sub-dword element types are expanded to a full dword in the message.
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<1>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  using MsgT = typename detail::lsc_expand_type<T>::type;
  // Broadcast the base pointer, then add the per-lane byte offsets.
  __ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
  addrs += convert<uintptr_t>(offsets);
  __ESIMD_NS::simd<MsgT, N> Tmp =
      __esimd_lsc_xatomic_stateless_0<MsgT, _Op, L1H, L3H, _AddressScale,
                                      _ImmOffset, _DS, _VS, _Transposed, N>(
          pred.data(), addrs.data());
  return detail::lsc_format_ret<T>(Tmp);
}
3013 
3014 template <__ESIMD_NS::atomic_op Op, typename T, int N,
3015  lsc_data_size DS = lsc_data_size::default_size,
3016  cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
3017  typename Toffset,
3018  typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>>
3019 __ESIMD_API std::enable_if_t<
3020  __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op<Op>()>() == 0,
3021  __ESIMD_NS::simd<T, N>>
3022 lsc_atomic_update(T *p, __ESIMD_NS::simd_view<Toffset, RegionTy> offsets,
3023  __ESIMD_NS::simd_mask<N> pred = 1) {
3024  return lsc_atomic_update<Op, T, N, DS, L1H, L3H>(p, offsets.read(), pred);
3025 }
3026 
3027 template <__ESIMD_NS::atomic_op Op, typename T, int N,
3028  lsc_data_size DS = lsc_data_size::default_size,
3029  cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
3030  typename Toffset>
3031 __ESIMD_API std::enable_if_t<
3032  std::is_integral_v<Toffset> &&
3033  __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op<Op>()>() == 0,
3034  __ESIMD_NS::simd<T, N>>
3035 lsc_atomic_update(T *p, Toffset offset, __ESIMD_NS::simd_mask<N> pred = 1) {
3036  return lsc_atomic_update<Op, T, N, DS, L1H, L3H>(
3037  p, __ESIMD_NS::simd<Toffset, N>(offset), pred);
3038 }
3039 
/// USM (flat address) gather-style atomic update with one extra source
/// operand; enabled only for ops whose LSC encoding takes 1 argument.
/// Per-lane addresses are p + offsets[i].
/// @tparam Op the atomic operation; T element type (must be wider than one
///         byte); N number of lanes; DS data size; L1H/L3H cache hints.
/// @param p base USM pointer.
/// @param offsets per-lane byte offsets added to p.
/// @param src0 per-lane operand of the atomic operation.
/// @param pred per-lane predicate; disabled lanes do not update memory.
/// @return per API convention, the values at the memory locations before the
///         update.
template <__ESIMD_NS::atomic_op Op, typename T, int N,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          typename Toffset>
__ESIMD_API std::enable_if_t<
    __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op<Op>()>() == 1,
    __ESIMD_NS::simd<T, N>>
lsc_atomic_update(T *p, __ESIMD_NS::simd<Toffset, N> offsets,
                  __ESIMD_NS::simd<T, N> src0, __ESIMD_NS::simd_mask<N> pred) {
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
  static_assert(sizeof(T) > 1, "Unsupported data type");
  detail::check_lsc_vector_size<1>();
  detail::check_lsc_data_size<T, DS>();
  constexpr __ESIMD_NS::native::lsc::atomic_op _Op =
      __ESIMD_DNS::to_lsc_atomic_op<Op>();
  __ESIMD_EDNS::check_lsc_atomic<_Op, T, N, 1>();
  detail::check_lsc_cache_hint<detail::lsc_action::atomic, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  // Sub-dword element types are expanded to a full dword in the message.
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<1>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  using MsgT = typename detail::lsc_expand_type<T>::type;
  // Widen the operand to the message element type.
  __ESIMD_NS::simd<MsgT, N> Msg_data = detail::lsc_format_input<MsgT>(src0);
  // Broadcast the base pointer, then add the per-lane byte offsets.
  __ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
  addrs += convert<uintptr_t>(offsets);
  __ESIMD_NS::simd<MsgT, N> Tmp =
      __esimd_lsc_xatomic_stateless_1<MsgT, _Op, L1H, L3H, _AddressScale,
                                      _ImmOffset, _DS, _VS, _Transposed, N>(
          pred.data(), addrs.data(), Msg_data.data());
  return detail::lsc_format_ret<T>(Tmp);
}
3089 
3090 template <__ESIMD_NS::atomic_op Op, typename T, int N,
3091  lsc_data_size DS = lsc_data_size::default_size,
3092  cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
3093  typename Toffset,
3094  typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>>
3095 __ESIMD_API std::enable_if_t<
3096  __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op<Op>()>() == 1,
3097  __ESIMD_NS::simd<T, N>>
3098 lsc_atomic_update(T *p, __ESIMD_NS::simd_view<Toffset, RegionTy> offsets,
3099  __ESIMD_NS::simd<T, N> src0,
3100  __ESIMD_NS::simd_mask<N> pred = 1) {
3101  return lsc_atomic_update<Op, T, N, DS, L1H, L3H>(p, offsets.read(), src0,
3102  pred);
3103 }
3104 
3105 template <__ESIMD_NS::atomic_op Op, typename T, int N,
3106  lsc_data_size DS = lsc_data_size::default_size,
3107  cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
3108  typename Toffset>
3109 __ESIMD_API std::enable_if_t<
3110  std::is_integral_v<Toffset> &&
3111  __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op<Op>()>() == 1 &&
3112  ((Op != __ESIMD_NS::atomic_op::store &&
3113  Op != __ESIMD_NS::atomic_op::xchg) ||
3114  N == 1),
3115  __ESIMD_NS::simd<T, N>>
3116 lsc_atomic_update(T *p, Toffset offset, __ESIMD_NS::simd<T, N> src0,
3117  __ESIMD_NS::simd_mask<N> pred = 1) {
3118  return lsc_atomic_update<Op, T, N, DS, L1H, L3H>(
3119  p, __ESIMD_NS::simd<Toffset, N>(offset), src0, pred);
3120 }
/// USM (flat address) gather-style atomic update with two extra source
/// operands (compare-exchange family); enabled only for ops whose LSC
/// encoding takes 2 arguments. Per-lane addresses are p + offsets[i].
/// @tparam Op the atomic operation; T element type (must be wider than one
///         byte); N number of lanes; DS data size; L1H/L3H cache hints.
/// @param p base USM pointer.
/// @param offsets per-lane byte offsets added to p.
/// @param src0 first operand (e.g. expected value for cmpxchg).
/// @param src1 second operand (e.g. replacement value for cmpxchg).
/// @param pred per-lane predicate; disabled lanes do not update memory.
/// @return per API convention, the values at the memory locations before the
///         update.
template <__ESIMD_NS::atomic_op Op, typename T, int N,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          typename Toffset>
__ESIMD_API std::enable_if_t<
    __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op<Op>()>() == 2,
    __ESIMD_NS::simd<T, N>>
lsc_atomic_update(T *p, __ESIMD_NS::simd<Toffset, N> offsets,
                  __ESIMD_NS::simd<T, N> src0, __ESIMD_NS::simd<T, N> src1,
                  __ESIMD_NS::simd_mask<N> pred) {
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
  static_assert(sizeof(T) > 1, "Unsupported data type");
  detail::check_lsc_vector_size<1>();
  detail::check_lsc_data_size<T, DS>();
  constexpr __ESIMD_NS::native::lsc::atomic_op _Op =
      __ESIMD_DNS::to_lsc_atomic_op<Op>();
  __ESIMD_EDNS::check_lsc_atomic<_Op, T, N, 2>();
  detail::check_lsc_cache_hint<detail::lsc_action::atomic, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  // Sub-dword element types are expanded to a full dword in the message.
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<1>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  using MsgT = typename detail::lsc_expand_type<T>::type;
  // Widen both operands to the message element type.
  __ESIMD_NS::simd<MsgT, N> Msg_data0 = detail::lsc_format_input<MsgT>(src0);
  __ESIMD_NS::simd<MsgT, N> Msg_data1 = detail::lsc_format_input<MsgT>(src1);
  // Broadcast the base pointer, then add the per-lane byte offsets.
  __ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
  addrs += convert<uintptr_t>(offsets);
  __ESIMD_NS::simd<MsgT, N> Tmp =
      __esimd_lsc_xatomic_stateless_2<MsgT, _Op, L1H, L3H, _AddressScale,
                                      _ImmOffset, _DS, _VS, _Transposed, N>(
          pred.data(), addrs.data(), Msg_data0.data(), Msg_data1.data());
  return detail::lsc_format_ret<T>(Tmp);
}
3173 
3174 template <__ESIMD_NS::atomic_op Op, typename T, int N,
3175  lsc_data_size DS = lsc_data_size::default_size,
3176  cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
3177  typename Toffset,
3178  typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>>
3179 __ESIMD_API std::enable_if_t<
3180  __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op<Op>()>() == 2,
3181  __ESIMD_NS::simd<T, N>>
3182 lsc_atomic_update(T *p, __ESIMD_NS::simd_view<Toffset, RegionTy> offsets,
3183  __ESIMD_NS::simd<T, N> src0, __ESIMD_NS::simd<T, N> src1,
3184  __ESIMD_NS::simd_mask<N> pred = 1) {
3185  return lsc_atomic_update<Op, T, N, DS, L1H, L3H>(p, offsets.read(), src0,
3186  src1, pred);
3187 }
3188 
3189 template <__ESIMD_NS::atomic_op Op, typename T, int N,
3190  lsc_data_size DS = lsc_data_size::default_size,
3191  cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
3192  typename Toffset>
3193 __ESIMD_API std::enable_if_t<
3194  std::is_integral_v<Toffset> &&
3195  __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op<Op>()>() == 2,
3196  __ESIMD_NS::simd<T, N>>
3197 lsc_atomic_update(T *p, Toffset offset, __ESIMD_NS::simd<T, N> src0,
3198  __ESIMD_NS::simd<T, N> src1,
3199  __ESIMD_NS::simd_mask<N> pred = 1) {
3200  return lsc_atomic_update<Op, T, N, DS, L1H, L3H>(
3201  p, __ESIMD_NS::simd<Toffset, N>(offset), src0, src1, pred);
3202 }
3203 
/// Accessor-based atomic update with no source operands (e.g.
/// atomic_op::inc). Applies the atomic op for every lane enabled in \p pred
/// at the surface location addressed by the per-lane offset, and returns the
/// vector produced by the LSC atomic message converted back to T.
template <__ESIMD_NS::atomic_op Op, typename T, int N,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          typename AccessorTy, typename Toffset>
__ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value,
                             __ESIMD_NS::simd<T, N>>
lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd<Toffset, N> offsets,
                  __ESIMD_NS::simd_mask<N> pred) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  // Stateless mode: lower the accessor to a USM pointer and reuse the
  // pointer-based overload.
  return lsc_atomic_update<Op, T, N, DS, L1H, L3H>(
      __ESIMD_DNS::accessorToPointer<T>(acc), offsets, pred);
#else
  // 1-byte element types are rejected for LSC atomics.
  static_assert(sizeof(T) > 1, "Unsupported data type");
  // Stateful (binding-table) messages require 32-bit offsets.
  static_assert(std::is_integral_v<Toffset> && sizeof(Toffset) == 4,
                "Unsupported offset type");
  // Atomics operate on a single element per address.
  detail::check_lsc_vector_size<1>();
  detail::check_lsc_data_size<T, DS>();
  // Translate the generic opcode to the native LSC opcode and verify it is a
  // valid 0-source operation for T and N.
  constexpr __ESIMD_NS::native::lsc::atomic_op _Op =
      __ESIMD_DNS::to_lsc_atomic_op<Op>();
  __ESIMD_EDNS::check_lsc_atomic<_Op, T, N, 0>();
  detail::check_lsc_cache_hint<detail::lsc_action::atomic, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  // Sub-dword data sizes are widened for the message payload.
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<1>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  // MsgT is T widened to at least 32 bits; the result is converted back.
  using MsgT = typename detail::lsc_expand_type<T>::type;
  auto si = __ESIMD_NS::get_surface_index(acc);
  __ESIMD_NS::simd<MsgT, N> Tmp =
      __esimd_lsc_xatomic_bti_0<MsgT, _Op, L1H, L3H, _AddressScale, _ImmOffset,
                                _DS, _VS, _Transposed, N>(pred.data(),
                                                          offsets.data(), si);
  return detail::lsc_format_ret<T>(Tmp);
#endif
}
3258 
/// Accessor-based atomic update with one source operand (e.g.
/// atomic_op::add). Applies the atomic op with \p src0 for every lane
/// enabled in \p pred at the surface location addressed by the per-lane
/// offset, and returns the vector produced by the LSC atomic message
/// converted back to T.
template <__ESIMD_NS::atomic_op Op, typename T, int N,
          lsc_data_size DS = lsc_data_size::default_size,
          cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
          typename AccessorTy, typename Toffset>
__ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value,
                             __ESIMD_NS::simd<T, N>>
lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd<Toffset, N> offsets,
                  __ESIMD_NS::simd<T, N> src0, __ESIMD_NS::simd_mask<N> pred) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  // Stateless mode: lower the accessor to a USM pointer and reuse the
  // pointer-based overload.
  return lsc_atomic_update<Op, T, N, DS, L1H, L3H>(
      __ESIMD_DNS::accessorToPointer<T>(acc), offsets, src0, pred);
#else
  // 1-byte element types are rejected for LSC atomics.
  static_assert(sizeof(T) > 1, "Unsupported data type");
  // Stateful (binding-table) messages require 32-bit offsets.
  static_assert(std::is_integral_v<Toffset> && sizeof(Toffset) == 4,
                "Unsupported offset type");
  // Atomics operate on a single element per address.
  detail::check_lsc_vector_size<1>();
  detail::check_lsc_data_size<T, DS>();
  // Translate the generic opcode to the native LSC opcode and verify it is a
  // valid 1-source operation for T and N.
  constexpr __ESIMD_NS::native::lsc::atomic_op _Op =
      __ESIMD_DNS::to_lsc_atomic_op<Op>();
  __ESIMD_EDNS::check_lsc_atomic<_Op, T, N, 1>();
  detail::check_lsc_cache_hint<detail::lsc_action::atomic, L1H, L3H>();
  constexpr uint16_t _AddressScale = 1;
  constexpr int _ImmOffset = 0;
  // Sub-dword data sizes are widened for the message payload.
  constexpr lsc_data_size _DS =
      detail::expand_data_size(detail::finalize_data_size<T, DS>());
  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<1>();
  constexpr detail::lsc_data_order _Transposed =
      detail::lsc_data_order::nontranspose;
  // MsgT is T widened to at least 32 bits; the operand is converted to it
  // and the result converted back.
  using MsgT = typename detail::lsc_expand_type<T>::type;
  __ESIMD_NS::simd<MsgT, N> Msg_data = detail::lsc_format_input<MsgT>(src0);
  auto si = __ESIMD_NS::get_surface_index(acc);
  __ESIMD_NS::simd<MsgT, N> Tmp =
      __esimd_lsc_xatomic_bti_1<MsgT, _Op, L1H, L3H, _AddressScale, _ImmOffset,
                                _DS, _VS, _Transposed, N>(
          pred.data(), offsets.data(), Msg_data.data(), si);
  return detail::lsc_format_ret<T>(Tmp);
#endif
}
3315 
3335 template <__ESIMD_NS::atomic_op Op, typename T, int N,
3336  lsc_data_size DS = lsc_data_size::default_size,
3337  cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
3338  typename AccessorTy, typename Toffset>
3339 __ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value,
3340  __ESIMD_NS::simd<T, N>>
3341 lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd<Toffset, N> offsets,
3342  __ESIMD_NS::simd<T, N> src0, __ESIMD_NS::simd<T, N> src1,
3343  __ESIMD_NS::simd_mask<N> pred) {
3344 #ifdef __ESIMD_FORCE_STATELESS_MEM
3345  return lsc_atomic_update<Op, T, N, DS, L1H, L3H>(
3346  __ESIMD_DNS::accessorToPointer<T>(acc), offsets, src0, src1, pred);
3347 #else
3348  static_assert(std::is_integral_v<Toffset> && sizeof(Toffset) == 4,
3349  "Unsupported offset type");
3350  detail::check_lsc_vector_size<1>();
3351  detail::check_lsc_data_size<T, DS>();
3352  constexpr __ESIMD_NS::native::lsc::atomic_op _Op =
3353  __ESIMD_DNS::to_lsc_atomic_op<Op>();
3354  __ESIMD_EDNS::check_lsc_atomic<_Op, T, N, 2>();
3355  detail::check_lsc_cache_hint<detail::lsc_action::atomic, L1H, L3H>();
3356  constexpr uint16_t _AddressScale = 1;
3357  constexpr int _ImmOffset = 0;
3358  constexpr lsc_data_size _DS =
3359  detail::expand_data_size(detail::finalize_data_size<T, DS>());
3360  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<1>();
3361  constexpr detail::lsc_data_order _Transposed =
3362  detail::lsc_data_order::nontranspose;
3363  using MsgT = typename detail::lsc_expand_type<T>::type;
3364  __ESIMD_NS::simd<MsgT, N> Msg_data0 = detail::lsc_format_input<MsgT>(src0);
3365  __ESIMD_NS::simd<MsgT, N> Msg_data1 = detail::lsc_format_input<MsgT>(src1);
3366  auto si = __ESIMD_NS::get_surface_index(acc);
3367  __ESIMD_NS::simd<MsgT, N> Tmp =
3368  __esimd_lsc_xatomic_bti_2<MsgT, _Op, L1H, L3H, _AddressScale, _ImmOffset,
3369  _DS, _VS, _Transposed, N>(
3370  pred.data(), offsets.data(), Msg_data0.data(), Msg_data1.data(), si);
3371  return detail::lsc_format_ret<T>(Tmp);
3372 #endif
3373 }
3374 
3383 template <lsc_memory_kind Kind = lsc_memory_kind::untyped_global,
3384  lsc_fence_op FenceOp = lsc_fence_op::none,
3385  lsc_scope Scope = lsc_scope::group, int N = 16>
3386 __ESIMD_API void lsc_fence(__ESIMD_NS::simd_mask<N> pred = 1) {
3387  static_assert(
3388  Kind != lsc_memory_kind::shared_local ||
3389  (FenceOp == lsc_fence_op::none && Scope == lsc_scope::group),
3390  "SLM fence must have 'none' lsc_fence_op and 'group' scope");
3391  __esimd_lsc_fence<Kind, FenceOp, Scope, N>(pred.data());
3392 }
3393 
3395 
3398 
3401 
/// Returns the hardware thread ID via the corresponding SPIR-V builtin when
/// compiled for device.
__ESIMD_API int32_t get_hw_thread_id() {
#ifdef __SYCL_DEVICE_ONLY__
  return __spirv_BuiltInGlobalHWThreadIDINTEL();
#else
  // Host fallback: there is no HW thread ID notion, so an arbitrary value is
  // returned. NOTE(review): std::rand() is unseeded here — values are
  // deterministic per process; confirm this is the intended host emulation.
  return std::rand();
#endif // __SYCL_DEVICE_ONLY__
}
/// Returns the sub-device ID via the corresponding SPIR-V builtin when
/// compiled for device; always 0 on host.
__ESIMD_API int32_t get_subdevice_id() {
#ifdef __SYCL_DEVICE_ONLY__
  return __spirv_BuiltInSubDeviceIDINTEL();
#else
  return 0;
#endif
}
3418 
3420 
3421 } // namespace experimental::esimd
3422 
3423 namespace esimd {
3424 
3428 template <native::lsc::atomic_op Op, typename T, int N, typename Toffset>
3429 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3430  __ESIMD_DNS::get_num_args<Op>() == 0,
3431  simd<T, N>>
3433  return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3434  p, offset, mask);
3435 }
3436 
3437 template <native::lsc::atomic_op Op, typename T, int N, typename Toffset,
3438  typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>>
3439 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3440  __ESIMD_DNS::get_num_args<Op>() == 0,
3441  simd<T, N>>
3443  simd_mask<N> mask = 1) {
3444  return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3445  p, offsets, mask);
3446 }
3447 
3448 template <native::lsc::atomic_op Op, typename T, int N, typename Toffset>
3449 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3450  __ESIMD_DNS::get_num_args<Op>() == 0,
3451  simd<T, N>>
3452 atomic_update(T *p, Toffset offset, simd_mask<N> mask = 1) {
3453  return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3454  p, offset, mask);
3455 }
3456 
3458 template <native::lsc::atomic_op Op, typename T, int N, typename Toffset>
3459 __ESIMD_API
3460  __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3461  __ESIMD_DNS::get_num_args<Op>() == 1,
3462  simd<T, N>>
3464  simd_mask<N> mask) {
3465  return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3466  p, offset, src0, mask);
3467 }
3468 
3469 template <native::lsc::atomic_op Op, typename T, int N, typename Toffset,
3470  typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>>
3471 __ESIMD_API
3472  __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3473  __ESIMD_DNS::get_num_args<Op>() == 1,
3474  simd<T, N>>
3476  simd_mask<N> mask = 1) {
3477  return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3478  p, offsets, src0, mask);
3479 }
3480 
3481 template <native::lsc::atomic_op Op, typename T, int N, typename Toffset>
3482 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3483  __ESIMD_DNS::get_num_args<Op>() == 1,
3484  simd<T, N>>
3485 atomic_update(T *p, Toffset offset, simd<T, N> src0, simd_mask<N> mask = 1) {
3486  return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3487  p, offset, src0, mask);
3488 }
3489 
3491 template <native::lsc::atomic_op Op, typename T, int N, typename Toffset>
3492 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3493  __ESIMD_DNS::get_num_args<Op>() == 2,
3494  simd<T, N>>
3496  simd_mask<N> mask) {
3497  // 2-argument lsc_atomic_update arguments order matches the standard one -
3498  // expected value first, then new value. But atomic_update uses reverse
3499  // order, hence the src1/src0 swap.
3500  return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3501  p, offset, src1, src0, mask);
3502 }
3503 
3504 template <native::lsc::atomic_op Op, typename T, int N, typename Toffset,
3505  typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>>
3506 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3507  __ESIMD_DNS::get_num_args<Op>() == 2,
3508  simd<T, N>>
3510  simd<T, N> src1, simd_mask<N> mask = 1) {
3511  return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3512  p, offsets, src1, src0, mask);
3513 }
3514 
3515 template <native::lsc::atomic_op Op, typename T, int N, typename Toffset>
3516 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3517  __ESIMD_DNS::get_num_args<Op>() == 2,
3518  __ESIMD_NS::simd<T, N>>
3519 atomic_update(T *p, Toffset offset, simd<T, N> src0, simd<T, N> src1,
3520  simd_mask<N> mask = 1) {
3521  return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3522  p, offset, src1, src0, mask);
3523 }
3524 
3525 template <native::lsc::atomic_op Op, typename T, int N, typename Toffset,
3526  typename AccessorTy>
3527 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3528  __ESIMD_DNS::get_num_args<Op>() == 0 &&
3529  !std::is_pointer<AccessorTy>::value,
3530  simd<T, N>>
3531 atomic_update(AccessorTy acc, simd<Toffset, N> offset, simd_mask<N> mask) {
3532  return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3533  acc, offset, mask);
3534 }
3535 
3536 template <native::lsc::atomic_op Op, typename T, int N, typename Toffset,
3537  typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>,
3538  typename AccessorTy>
3539 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3540  __ESIMD_DNS::get_num_args<Op>() == 0 &&
3541  !std::is_pointer<AccessorTy>::value,
3542  simd<T, N>>
3544  simd_mask<N> mask) {
3545  return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3546  acc, offsets, mask);
3547 }
3548 
3549 template <native::lsc::atomic_op Op, typename T, int N, typename Toffset,
3550  typename AccessorTy>
3551 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3552  __ESIMD_DNS::get_num_args<Op>() == 0 &&
3553  !std::is_pointer<AccessorTy>::value,
3554  simd<T, N>>
3555 atomic_update(AccessorTy acc, Toffset offset, simd_mask<N> mask) {
3556  return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3557  acc, offset, mask);
3558 }
3559 
3561 template <native::lsc::atomic_op Op, typename T, int N, typename Toffset,
3562  typename AccessorTy>
3563 __ESIMD_API
3564  __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3565  __ESIMD_DNS::get_num_args<Op>() == 1 &&
3566  !std::is_pointer<AccessorTy>::value,
3567  simd<T, N>>
3568  atomic_update(AccessorTy acc, simd<Toffset, N> offset, simd<T, N> src0,
3569  simd_mask<N> mask) {
3570  return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3571  acc, offset, src0, mask);
3572 }
3573 
3574 template <native::lsc::atomic_op Op, typename T, int N, typename Toffset,
3575  typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>,
3576  typename AccessorTy>
3577 __ESIMD_API
3578  __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3579  __ESIMD_DNS::get_num_args<Op>() == 1 &&
3580  !std::is_pointer<AccessorTy>::value,
3581  simd<T, N>>
3583  simd<T, N> src0, simd_mask<N> mask) {
3584  return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3585  acc, offsets, src0, mask);
3586 }
3587 
3588 template <native::lsc::atomic_op Op, typename T, int N, typename Toffset,
3589  typename AccessorTy>
3590 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3591  __ESIMD_DNS::get_num_args<Op>() == 1 &&
3592  !std::is_pointer<AccessorTy>::value,
3593  simd<T, N>>
3594 atomic_update(AccessorTy acc, Toffset offset, simd<T, N> src0,
3595  simd_mask<N> mask) {
3596  return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3597  acc, offset, src0, mask);
3598 }
3599 
3601 template <native::lsc::atomic_op Op, typename T, int N, typename Toffset,
3602  typename AccessorTy>
3603 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3604  __ESIMD_DNS::get_num_args<Op>() == 2 &&
3605  !std::is_pointer<AccessorTy>::value,
3606  simd<T, N>>
3607 atomic_update(AccessorTy acc, simd<Toffset, N> offset, simd<T, N> src0,
3608  simd<T, N> src1, simd_mask<N> mask) {
3609  // 2-argument lsc_atomic_update arguments order matches the standard one -
3610  // expected value first, then new value. But atomic_update uses reverse
3611  // order, hence the src1/src0 swap.
3612  return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3613  acc, offset, src1, src0, mask);
3614 }
3615 
3616 template <native::lsc::atomic_op Op, typename T, int N, typename Toffset,
3617  typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>,
3618  typename AccessorTy>
3619 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3620  __ESIMD_DNS::get_num_args<Op>() == 2 &&
3621  !std::is_pointer<AccessorTy>::value,
3622  simd<T, N>>
3624  simd<T, N> src0, simd<T, N> src1, simd_mask<N> mask) {
3625  return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3626  acc, offsets, src1, src0, mask);
3627 }
3628 
3629 template <native::lsc::atomic_op Op, typename T, int N, typename Toffset,
3630  typename AccessorTy>
3631 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
3632  __ESIMD_DNS::get_num_args<Op>() == 2 &&
3633  !std::is_pointer<AccessorTy>::value,
3634  __ESIMD_NS::simd<T, N>>
3635 atomic_update(AccessorTy acc, Toffset offset, simd<T, N> src0, simd<T, N> src1,
3636  simd_mask<N> mask) {
3637  return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
3638  acc, offset, src1, src0, mask);
3639 }
3640 
/// RAII-style owner of a chunk of shared local memory: the constructor
/// requests SLMAmount units from __esimd_slm_alloc and the destructor
/// releases them.
template <int SLMAmount> class slm_allocator {
  // Offset of the allocated chunk, as returned by __esimd_slm_alloc.
  int offset;

public:
  /// Allocates the chunk and records its offset.
  slm_allocator() { offset = __esimd_slm_alloc(SLMAmount); }

  /// Returns the offset of the allocated SLM chunk.
  ESIMD_INLINE int get_offset() const { return offset; }

  /// Releases the SLM chunk allocated in the constructor.
  // NOTE(review): copy/move special members are implicitly available, so a
  // copied instance would free the same offset twice — confirm whether they
  // should be deleted.
  ~slm_allocator() { __esimd_slm_free(offset); }
};
3671 
3672 } // namespace esimd
3673 } // namespace ext::intel
3674 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
3675 } // namespace sycl
sycl::_V1::ext::intel::experimental::esimd::config_2d_mem_access::get_surface_width
uint32_t get_surface_width() const
Get surface width
Definition: memory.hpp:2520
simd_mask
Definition: simd.hpp:1029
sycl::_V1::ext::intel::experimental::esimd::detail::lsc_expand_type::type
std::conditional_t< sizeof(T)<=4, std::conditional_t< std::is_signed_v< T >, int32_t, uint32_t >, std::conditional_t< std::is_signed_v< T >, int64_t, uint64_t > > type
Definition: common.hpp:203
sycl::_V1::ext::intel::experimental::esimd::config_2d_mem_access::get_surface_pitch
uint32_t get_surface_pitch() const
Get surface pitch
Definition: memory.hpp:2538
sycl::_V1::ext::intel::experimental::esimd::raw_sends_store
__ESIMD_API void raw_sends_store(sycl::ext::intel::esimd::simd< T1, n1 > msgSrc0, sycl::ext::intel::esimd::simd< T2, n2 > msgSrc1, uint32_t exDesc, uint32_t msgDesc, uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t numSrc1, uint8_t isEOT=0, uint8_t isSendc=0, sycl::ext::intel::esimd::simd_mask< N > mask=1)
Definition: memory.hpp:203
sycl::_V1::ext::intel::experimental::esimd::lsc_store2d
__ESIMD_API void lsc_store2d(T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, int X, int Y, sycl::ext::intel::esimd::simd< T, N > Vals)
Definition: memory.hpp:2451
common.hpp
__SYCL_INLINE_VER_NAMESPACE
#define __SYCL_INLINE_VER_NAMESPACE(X)
Definition: defines_elementary.hpp:11
sycl::_V1::ext::intel::experimental::esimd::config_2d_mem_access::get_surface_height
uint32_t get_surface_height() const
Get surface height
Definition: memory.hpp:2529
sycl::_V1::ext::intel::experimental::esimd::get_subdevice_id
__ESIMD_API int32_t get_subdevice_id()
Get subdevice ID.
Definition: memory.hpp:3411
sycl::_V1::ext::intel::esimd::atomic_update
__ESIMD_API std::enable_if_t< std::is_integral_v< Toffset > &&__ESIMD_DNS::get_num_args< Op >)==2 &&!std::is_pointer< AccessorTy >::value, sycl::ext::intel::esimd::simd< T, N > > atomic_update(AccessorTy acc, Toffset offset, simd< T, N > src0, simd< T, N > src1, simd_mask< N > mask)
Definition: memory.hpp:3635
sycl::_V1::ext::intel::esimd::slm_allocator
RAII-style class used to implement "semi-dynamic" SLM allocation.
Definition: memory.hpp:3658
sycl::_V1::ext::intel::experimental::esimd::lsc_slm_scatter
__ESIMD_API void lsc_slm_scatter(sycl::ext::intel::esimd::simd< uint32_t, N > offsets, sycl::ext::intel::esimd::simd< T, N *NElts > vals, sycl::ext::intel::esimd::simd_mask< N > pred=1)
SLM scatter.
Definition: memory.hpp:1663
sycl::_V1::ext::intel::experimental::esimd::detail::lsc_format_input
ESIMD_INLINE sycl::ext::intel::esimd::simd< RT, N > lsc_format_input(sycl::ext::intel::esimd::simd< T, N > Vals)
Definition: memory.hpp:359
memory.hpp
sycl::_V1::ext::intel::experimental::esimd::lsc_slm_gather
__ESIMD_API sycl::ext::intel::esimd::simd< T, N *NElts > lsc_slm_gather(sycl::ext::intel::esimd::simd< uint32_t, N > offsets, sycl::ext::intel::esimd::simd_mask< N > pred, sycl::ext::intel::esimd::simd< T, N *NElts > old_values)
SLM gather.
Definition: memory.hpp:516
sycl::_V1::ext::intel::experimental::esimd::config_2d_mem_access::config_2d_mem_access
config_2d_mem_access(const T *Ptr, uint32_t SurfaceWidth, uint32_t SurfaceHeight, uint32_t SurfacePitch, int32_t X, int32_t Y)
Constructor
Definition: memory.hpp:2493
sycl::_V1::ext::intel::experimental::esimd::config_2d_mem_access::set_surface_width
config_2d_mem_access & set_surface_width(uint32_t SurfaceWidth)
Sets surface width
Definition: memory.hpp:2595
sycl::_V1::ext::intel::esimd::simd
The main simd vector class.
Definition: types.hpp:34
sycl::_V1::ext::intel::experimental::esimd::config_2d_mem_access::get_x
int32_t get_x() const
Get top left corner X coordinate of the block
Definition: memory.hpp:2547
sycl::_V1::ext::intel::experimental::esimd::raw_send
__ESIMD_API void raw_send(sycl::ext::intel::esimd::simd< T1, n1 > msgSrc0, uint32_t exDesc, uint32_t msgDesc, uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t isEOT=0, uint8_t isSendc=0, sycl::ext::intel::esimd::simd_mask< N > mask=1)
Raw send.
Definition: memory.hpp:233
sycl::_V1::ext::intel::experimental::esimd::lsc_store_2d
ESIMD_INLINE SYCL_ESIMD_FUNCTION void lsc_store_2d(config_2d_mem_access< T, BlockWidth, BlockHeight, NBlocks > &payload, sycl::ext::intel::esimd::simd< T, N > Data)
A variation of 2D stateless block store with parameters passed as config_2d_mem_access object Note: C...
Definition: memory.hpp:2813
sycl
---— Error handling, matching OpenCL plugin semantics.
Definition: access.hpp:14
sycl::_V1::ext::intel::esimd::simd_view
This class represents a reference to a sub-region of a base simd object.
Definition: types.hpp:35
sycl::_V1::ext::intel::experimental::esimd::lsc_scatter
__ESIMD_API std::enable_if_t<!std::is_pointer< AccessorTy >::value > lsc_scatter(AccessorTy acc, sycl::ext::intel::esimd::simd< uint32_t, N > offsets, sycl::ext::intel::esimd::simd< T, N *NElts > vals, sycl::ext::intel::esimd::simd_mask< N > pred=1)
Accessor-based scatter.
Definition: memory.hpp:1806
util.hpp
sycl::_V1::ext::intel::experimental::esimd::config_2d_mem_access::config_2d_mem_access
config_2d_mem_access(const config_2d_mem_access &other)
Copy constructor
Definition: memory.hpp:2479
sycl::_V1::ext::intel::experimental::esimd::lsc_gather
__ESIMD_API std::enable_if_t<!std::is_pointer_v< AccessorTy >, sycl::ext::intel::esimd::simd< T, N *NElts > > lsc_gather(AccessorTy acc, sycl::ext::intel::esimd::simd< uint32_t, N > offsets, sycl::ext::intel::esimd::simd_mask< N > pred, sycl::ext::intel::esimd::simd< T, N *NElts > old_values)
Accessor-based gather.
Definition: memory.hpp:850
sycl::_V1::ext::intel::experimental::esimd::raw_send_store
__ESIMD_API void raw_send_store(sycl::ext::intel::esimd::simd< T1, n1 > msgSrc0, uint32_t exDesc, uint32_t msgDesc, uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t isEOT=0, uint8_t isSendc=0, sycl::ext::intel::esimd::simd_mask< N > mask=1)
Definition: memory.hpp:248
sycl::_V1::ext::intel::experimental::esimd::named_barrier_init
__ESIMD_API void named_barrier_init()
Initialize number of named barriers for a kernel Available only on PVC.
Definition: memory.hpp:278
sycl::_V1::ext::intel::experimental::esimd::config_2d_mem_access::get_width
constexpr int32_t get_width() const
Get width of the block
Definition: memory.hpp:2565
sycl::_V1::ext::intel::experimental::esimd::detail::expand_data_size
constexpr lsc_data_size expand_data_size(lsc_data_size DS)
Definition: common.hpp:191
sycl::_V1::ext::intel::experimental::esimd::named_barrier_signal
__ESIMD_API void named_barrier_signal(uint8_t barrier_id, uint8_t producer_consumer_mode, uint32_t num_producers, uint32_t num_consumers)
Perform signal operation for the given named barrier Available only on PVC.
Definition: memory.hpp:294
sycl::_V1::ext::intel::experimental::esimd::config_2d_mem_access
Container class to hold parameters for load2d/store2d functions
Definition: memory.hpp:2466
__SYCL_DEPRECATED
#define __SYCL_DEPRECATED(message)
Definition: defines_elementary.hpp:46
sycl::_V1::ext::intel::experimental::esimd::lsc_slm_atomic_update
__ESIMD_API sycl::ext::intel::esimd::simd< T, N > lsc_slm_atomic_update(sycl::ext::intel::esimd::simd< uint32_t, N > offsets, sycl::ext::intel::esimd::simd< T, N > src0, sycl::ext::intel::esimd::simd< T, N > src1, sycl::ext::intel::esimd::simd_mask< N > pred)
SLM atomic.
Definition: memory.hpp:2937
sycl::_V1::ext::intel::esimd::slm_allocator::~slm_allocator
~slm_allocator()
Releases the SLM chunk allocated in the constructor.
Definition: memory.hpp:3669
sycl::_V1::ext::intel::experimental::esimd::lsc_load_2d
ESIMD_INLINE SYCL_ESIMD_FUNCTION sycl::ext::intel::esimd::simd< T, N > lsc_load_2d(config_2d_mem_access< T, BlockWidth, BlockHeight, NBlocks > &payload)
A variation of 2D stateless block load with parameters passed as config_2d_mem_access object Note: Co...
Definition: memory.hpp:2687
sycl::_V1::ext::intel::experimental::esimd::lsc_memory_kind
lsc_memory_kind
The specific LSC shared function to fence with lsc_fence Supported platforms: DG2,...
Definition: common.hpp:57
sycl::_V1::ext::intel::experimental::esimd::lsc_slm_block_store
__ESIMD_API void lsc_slm_block_store(uint32_t offset, sycl::ext::intel::esimd::simd< T, NElts > vals)
Transposed SLM scatter with 1 channel.
Definition: memory.hpp:1696
sycl::_V1::ext::intel::esimd::barrier
__ESIMD_API void barrier()
Generic work-group barrier.
Definition: memory.hpp:1716
sycl::_V1::ext::intel::experimental::esimd::lsc_fence_op
lsc_fence_op
The lsc_fence operation to apply to caches Supported platforms: DG2, PVC.
Definition: common.hpp:45
sycl::_V1::ext::intel::experimental::esimd::config_2d_mem_access::get_number_of_blocks
constexpr int32_t get_number_of_blocks() const
Get number of blocks
Definition: memory.hpp:2577
sycl::_V1::ext::intel::experimental::esimd::config_2d_mem_access::set_x
config_2d_mem_access & set_x(int32_t X)
Sets top left corner X coordinate of the block
Definition: memory.hpp:2625
sycl::_V1::ext::intel::experimental::esimd::lsc_block_load
__ESIMD_API std::enable_if_t<!std::is_pointer< AccessorTy >::value &&sycl::ext::intel::esimd::is_simd_flag_type_v< FlagsT >, sycl::ext::intel::esimd::simd< T, NElts > > lsc_block_load(AccessorTy acc, uint32_t offset, sycl::ext::intel::esimd::simd_mask< 1 > pred, sycl::ext::intel::esimd::simd< T, NElts > old_values, FlagsT flags=FlagsT{})
Accessor-based transposed gather with 1 channel.
Definition: memory.hpp:1359
sycl::_V1::ext::intel::experimental::esimd::wait
__ESIMD_API std::enable_if_t<(sizeof(T) *N >=2)> wait(sycl::ext::intel::esimd::simd< T, N > value)
Create explicit scoreboard dependency to avoid device code motion across this call and preserve the v...
Definition: memory.hpp:317
sycl::_V1::ext::intel::experimental::esimd::detail::lsc_data_order
lsc_data_order
Definition: common.hpp:89
sycl::_V1::ext::intel::esimd::detail::isPowerOf2
constexpr ESIMD_INLINE bool isPowerOf2(unsigned int n)
Check if a given 32 bit positive integer is a power of 2 at compile time.
Definition: common.hpp:79
sycl::_V1::ext::intel::experimental::esimd::config_2d_mem_access::set_surface_height
config_2d_mem_access & set_surface_height(uint32_t SurfaceHeight)
Sets surface height
Definition: memory.hpp:2605
sycl::_V1::ext::intel::experimental::esimd::detail::check_lsc_block_2d_restrictions
constexpr void check_lsc_block_2d_restrictions()
Definition: memory.hpp:2151
sycl::_V1::ext::intel::esimd::get_surface_index
__ESIMD_API SurfaceIndex get_surface_index(AccessorTy acc)
Get surface index corresponding to a SYCL accessor.
Definition: memory.hpp:62
sycl::_V1::ext::intel::experimental::esimd::config_2d_mem_access::get_data_pointer
T * get_data_pointer() const
Get a surface base address
Definition: memory.hpp:2510
sycl::_V1::ext::intel::experimental::esimd::raw_sends
__ESIMD_API void raw_sends(sycl::ext::intel::esimd::simd< T1, n1 > msgSrc0, sycl::ext::intel::esimd::simd< T2, n2 > msgSrc1, uint32_t exDesc, uint32_t msgDesc, uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t numSrc1, uint8_t isEOT=0, uint8_t isSendc=0, sycl::ext::intel::esimd::simd_mask< N > mask=1)
Raw sends.
Definition: memory.hpp:182
sycl::_V1::ext::intel::experimental::esimd::detail::lsc_vector_size
lsc_vector_size
Definition: common.hpp:78
sycl::_V1::ext::intel::experimental::esimd::raw_send_load
__ESIMD_API sycl::ext::intel::esimd::simd< T1, n1 > raw_send_load(sycl::ext::intel::esimd::simd< T1, n1 > msgDst, sycl::ext::intel::esimd::simd< T2, n2 > msgSrc0, uint32_t exDesc, uint32_t msgDesc, uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t numDst, uint8_t isEOT=0, uint8_t isSendc=0, sycl::ext::intel::esimd::simd_mask< N > mask=1)
Definition: memory.hpp:151
sycl::_V1::ext::intel::experimental::esimd::named_barrier_wait
__ESIMD_API void named_barrier_wait(uint8_t id)
Wait on a named barrier Available only on PVC.
Definition: memory.hpp:270
sycl::_V1::ext::oneapi::experimental::detail::Alignment
@ Alignment
Definition: property.hpp:189
sycl::_V1::ext::intel::experimental::esimd::detail::get_lsc_load_cache_mask
constexpr uint32_t get_lsc_load_cache_mask()
Definition: memory.hpp:405
sycl::_V1::ext::intel::experimental::esimd::config_2d_mem_access::get_y
int32_t get_y() const
Get top left corner Y coordinate of the block
Definition: memory.hpp:2556
sycl::_V1::ext::intel::experimental::esimd::split_barrier
__ESIMD_API void split_barrier(split_barrier_action flag)
Definition: memory.hpp:33
sycl::_V1::ext::intel::experimental::esimd::raw_sends_load
__ESIMD_API sycl::ext::intel::esimd::simd< T1, n1 > raw_sends_load(sycl::ext::intel::esimd::simd< T1, n1 > msgDst, sycl::ext::intel::esimd::simd< T2, n2 > msgSrc0, sycl::ext::intel::esimd::simd< T3, n3 > msgSrc1, uint32_t exDesc, uint32_t msgDesc, uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t numSrc1, uint8_t numDst, uint8_t isEOT=0, uint8_t isSendc=0, sycl::ext::intel::esimd::simd_mask< N > mask=1)
Definition: memory.hpp:98
sycl::_V1::ext::intel::experimental::esimd::lsc_block_store
__ESIMD_API std::enable_if_t<!std::is_pointer< AccessorTy >::value &&sycl::ext::intel::esimd::is_simd_flag_type_v< FlagsT > > lsc_block_store(AccessorTy acc, uint32_t offset, sycl::ext::intel::esimd::simd< T, NElts > vals, FlagsT flags)
A variation of lsc_block_store without predicate parameter to simplify use of alignment parameter.
Definition: memory.hpp:2136
sycl::_V1::ext::intel::experimental::esimd::detail::get_lsc_block_2d_data_size
constexpr int get_lsc_block_2d_data_size()
Definition: memory.hpp:349
sycl::_V1::ext::intel::experimental::esimd::config_2d_mem_access::set_surface_pitch
config_2d_mem_access & set_surface_pitch(uint32_t SurfacePitch)
Sets surface pitch
Definition: memory.hpp:2615
sycl::_V1::ext::intel::experimental::esimd::config_2d_mem_access::config_2d_mem_access
config_2d_mem_access()
Default constructor
Definition: memory.hpp:2471
sycl::_V1::ext::intel::experimental::esimd::lsc_scope
lsc_scope
The scope that lsc_fence operation should apply to Supported platforms: DG2, PVC.
Definition: common.hpp:33
sycl::_V1::ext::intel::esimd::slm_allocator::get_offset
ESIMD_INLINE int get_offset() const
Definition: memory.hpp:3666
simd
Definition: simd.hpp:1027
sycl::_V1::ext::intel::experimental::esimd::split_barrier_action
split_barrier_action
Represents a split barrier action.
Definition: common.hpp:296
sycl::_V1::ext::intel::experimental::esimd::lsc_prefetch2d
__ESIMD_API void lsc_prefetch2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, int X, int Y)
Definition: memory.hpp:2380
sycl::_V1::ext::intel::experimental::esimd::lsc_prefetch
__ESIMD_API std::enable_if_t<!std::is_pointer< AccessorTy >::value > lsc_prefetch(AccessorTy acc, uint32_t offset)
Accessor-based transposed prefetch gather with 1 channel.
Definition: memory.hpp:1615
sycl::_V1::ext::intel::experimental::esimd::cache_hint
cache_hint
L1 or L3 cache hint kinds.
Definition: common.hpp:219
sycl::_V1::ext::intel::experimental::esimd::lsc_prefetch_2d
ESIMD_INLINE SYCL_ESIMD_FUNCTION void lsc_prefetch_2d(config_2d_mem_access< T, BlockWidth, BlockHeight, NBlocks > &payload)
A variation of 2D stateless block prefetch with parameters passed as config_2d_mem_access object Note...
Definition: memory.hpp:2772
sycl::_V1::ext::intel::experimental::esimd::get_hw_thread_id
__ESIMD_API int32_t get_hw_thread_id()
Get HW Thread ID.
Definition: memory.hpp:3403
sycl::_V1::ext::intel::experimental::esimd::lsc_fence
__ESIMD_API void lsc_fence(sycl::ext::intel::esimd::simd_mask< N > pred=1)
Memory fence.
Definition: memory.hpp:3386
sycl::_V1::ext::intel::experimental::esimd::detail::get_lsc_store_cache_mask
constexpr uint32_t get_lsc_store_cache_mask()
Definition: memory.hpp:432
sycl::_V1::ext::intel::esimd::slm_allocator::slm_allocator
slm_allocator()
Allocates the amount of SLM which is class' template parameter.
Definition: memory.hpp:3663
sycl::_V1::ext::intel::experimental::esimd::detail::lsc_format_ret
ESIMD_INLINE sycl::ext::intel::esimd::simd< T, N > lsc_format_ret(sycl::ext::intel::esimd::simd< T1, N > Vals)
Definition: memory.hpp:374
sycl::_V1::ext::intel::experimental::esimd::detail::check_lsc_atomic
constexpr void check_lsc_atomic()
Check the legality of lsc atomic call in terms of size and type.
Definition: memory.hpp:387
sycl::_V1::ext::intel::experimental::esimd::lsc_load2d
__ESIMD_API sycl::ext::intel::esimd::simd< T, N > lsc_load2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, int X, int Y)
Definition: memory.hpp:2322
sycl::_V1::ext::intel::experimental::esimd::config_2d_mem_access::get_height
constexpr int32_t get_height() const
Get height of the block
Definition: memory.hpp:2571
sycl::_V1::ext::intel::experimental::esimd::lsc_atomic_update
__ESIMD_API std::enable_if_t<!std::is_pointer< AccessorTy >::value, sycl::ext::intel::esimd::simd< T, N > > lsc_atomic_update(AccessorTy acc, sycl::ext::intel::esimd::simd< Toffset, N > offsets, sycl::ext::intel::esimd::simd< T, N > src0, sycl::ext::intel::esimd::simd< T, N > src1, sycl::ext::intel::esimd::simd_mask< N > pred)
Accessor-based atomic.
Definition: memory.hpp:3341
sycl::_V1::ext::intel::experimental::esimd::lsc_slm_block_load
__ESIMD_API sycl::ext::intel::esimd::simd< T, NElts > lsc_slm_block_load(uint32_t offset, sycl::ext::intel::esimd::simd_mask< 1 > pred, sycl::ext::intel::esimd::simd< T, NElts > old_values)
Transposed SLM gather with 1 channel.
Definition: memory.hpp:593
sycl::_V1::ext::intel::experimental::esimd::lsc_data_size
lsc_data_size
Data size or format to read or store.
Definition: common.hpp:65
sycl::_V1::ext::intel::experimental::esimd::detail::lsc_bitcast_type::type
std::conditional_t< sizeof(T)==1, uint8_t, std::conditional_t< sizeof(T)==2, uint16_t, std::conditional_t< sizeof(T)==4, uint32_t, std::conditional_t< sizeof(T)==8, uint64_t, T > >> > type
Definition: common.hpp:213
sycl::_V1::ext::intel::experimental::esimd::config_2d_mem_access::set_data_pointer
config_2d_mem_access & set_data_pointer(T *Ptr)
Sets surface base address
Definition: memory.hpp:2584
sycl::_V1::ext::intel::experimental::esimd::config_2d_mem_access::set_y
config_2d_mem_access & set_y(int32_t Y)
Sets top left corner Y coordinate of the block
Definition: memory.hpp:2635
sycl::_V1::ext::intel::esimd::atomic_op
atomic_op
Represents an atomic operation.
Definition: common.hpp:145
memory_intrin.hpp