1 //==-------------- memory.hpp - DPC++ Explicit SIMD API --------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // Implement Explicit SIMD memory-access APIs.
9 //===----------------------------------------------------------------------===//
10 
11 #pragma once
12 
13 #include <sycl/ext/intel/esimd/common.hpp>
14 #include <sycl/ext/intel/esimd/detail/memory_intrin.hpp>
15 #include <sycl/ext/intel/esimd/detail/types.hpp>
16 #include <sycl/ext/intel/esimd/detail/util.hpp>
17 #include <sycl/ext/intel/esimd/memory_properties.hpp>
18 #include <sycl/ext/intel/esimd/simd.hpp>
19 #include <sycl/ext/intel/esimd/simd_view.hpp>
20 #include <sycl/half_type.hpp>
21 
22 #include <algorithm>
23 #include <cstdint>
24 
25 namespace sycl {
26 inline namespace _V1 {
27 namespace ext::intel::esimd {
28 
52 template <typename AccessorTy>
53 __ESIMD_API SurfaceIndex get_surface_index(AccessorTy acc) {
54  if constexpr (std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
55  sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>) {
56  return detail::SLM_BTI;
57  } else {
58 #ifdef __ESIMD_FORCE_STATELESS_MEM
59  static_assert(sycl::detail::acc_properties::is_image_accessor_v<AccessorTy>,
60  "The function get_surface_index() is available only for "
61  "image- and local-accessors in stateless-only memory mode. "
62  "Consider using "
63  "-fno-sycl-esimd-force-stateless-mem compilation switch.");
64 #endif // __ESIMD_FORCE_STATELESS_MEM
65  return __esimd_get_surface_index(
66  detail::AccessorPrivateProxy::getQualifiedPtrOrImageObj(acc));
67  }
68 }
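// Usage sketch (illustrative, not part of the original source): obtain the
// binding table index of an accessor inside an ESIMD kernel; 'acc' is assumed
// to be a device or local accessor captured by the kernel lambda.
//   SurfaceIndex SI = get_surface_index(acc);
//   // SI can then be passed to surface-addressed ("stateful") operations.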
69 
70 namespace detail {
71 
72 // Format u8 and u16 to u8u32 and u16u32 by doing garbage-extension.
73 template <typename RT, typename T, int N>
74 ESIMD_INLINE simd<RT, N> lsc_format_input(simd<T, N> Vals) {
75  if constexpr (sizeof(T) == 1) {
76  // Extend bytes to RT.
77  return Vals.template bit_cast_view<uint8_t>();
78  } else if constexpr (sizeof(T) == 2) {
79  // Extend words to RT.
80  return Vals.template bit_cast_view<uint16_t>();
81  } else {
82  return Vals.template bit_cast_view<RT>();
83  }
84 }
85 
86 // Format u8u32 and u16u32 back to u8 and u16.
87 template <typename T, typename T1, int N>
88 ESIMD_INLINE simd<T, N> lsc_format_ret(simd<T1, N> Vals) {
89  auto Formatted = Vals.template bit_cast_view<T>();
90  if constexpr (sizeof(T) == sizeof(T1)) {
91  return Formatted;
92  } else {
93  constexpr int Stride = Formatted.length / N;
94  return Formatted.template select<N, Stride>(0);
95  }
96 }
97 
115 template <typename T, int NElts, lsc_data_size DS, cache_hint L1H,
116  cache_hint L2H, int N, typename OffsetT>
117 __ESIMD_API simd<T, N * NElts> gather_impl(const T *p, simd<OffsetT, N> offsets,
118  simd_mask<N> pred) {
119  static_assert(std::is_integral_v<OffsetT>, "Unsupported offset type");
120  check_lsc_vector_size<NElts>();
121  check_lsc_data_size<T, DS>();
122  check_cache_hint<cache_action::load, L1H, L2H>();
123  constexpr uint16_t AddressScale = 1;
124  constexpr int ImmOffset = 0;
125  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
126  constexpr lsc_vector_size VS = to_lsc_vector_size<NElts>();
127  constexpr auto Transposed = lsc_data_order::nontranspose;
128  using MsgT = typename lsc_expand_type<T>::type;
129  simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
130  addrs += convert<uintptr_t>(offsets);
131  simd<MsgT, N * NElts> Tmp =
132  __esimd_lsc_load_stateless<MsgT, L1H, L2H, AddressScale, ImmOffset, EDS,
133  VS, Transposed, N>(pred.data(), addrs.data());
134  return lsc_format_ret<T>(Tmp);
135 }
136 
157 template <typename T, int NElts, lsc_data_size DS, cache_hint L1H,
158  cache_hint L2H, int N, typename OffsetT>
159 __ESIMD_API simd<T, N * NElts> gather_impl(const T *p, simd<OffsetT, N> offsets,
160  simd_mask<N> pred,
161  simd<T, N * NElts> pass_thru) {
162  static_assert(std::is_integral_v<OffsetT>, "Unsupported offset type");
163  check_lsc_vector_size<NElts>();
164  check_lsc_data_size<T, DS>();
165  check_cache_hint<cache_action::load, L1H, L2H>();
166  constexpr uint16_t AddressScale = 1;
167  constexpr int ImmOffset = 0;
168  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
169  constexpr lsc_vector_size VS = to_lsc_vector_size<NElts>();
170  constexpr auto Transposed = lsc_data_order::nontranspose;
171  using MsgT = typename lsc_expand_type<T>::type;
172  simd<uintptr_t, N> Addrs = reinterpret_cast<uintptr_t>(p);
173  Addrs += convert<uintptr_t>(offsets);
174  simd<MsgT, N * NElts> PassThruExpanded = lsc_format_input<MsgT>(pass_thru);
175  simd<MsgT, N * NElts> Result =
176  __esimd_lsc_load_merge_stateless<MsgT, L1H, L2H, AddressScale, ImmOffset,
177  EDS, VS, Transposed, N>(
178  pred.data(), Addrs.data(), PassThruExpanded.data());
179  return lsc_format_ret<T>(Result);
180 }
181 
199 template <typename T, int NElts, lsc_data_size DS, cache_hint L1H,
200  cache_hint L2H, int N, typename Toffset>
201 __ESIMD_API void scatter_impl(T *p, simd<Toffset, N> offsets,
202  simd<T, N * NElts> vals, simd_mask<N> pred) {
203  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
204  check_lsc_vector_size<NElts>();
205  check_lsc_data_size<T, DS>();
206  check_cache_hint<cache_action::store, L1H, L2H>();
207  constexpr uint16_t AddressScale = 1;
208  constexpr int ImmOffset = 0;
209  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
210  constexpr lsc_vector_size VS = to_lsc_vector_size<NElts>();
211  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
212  using MsgT = typename lsc_expand_type<T>::type;
213  simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
214  addrs += convert<uintptr_t>(offsets);
215  simd<MsgT, N * NElts> Tmp = lsc_format_input<MsgT, T>(vals);
216  __esimd_lsc_store_stateless<MsgT, L1H, L2H, AddressScale, ImmOffset, EDS, VS,
217  Transposed, N>(pred.data(), addrs.data(),
218  Tmp.data());
219 }
220 
221 // Returns true iff it is OK to use llvm.masked.gather and llvm.masked.scatter.
222 // By default (without the user defining __ESIMD_GATHER_SCATTER_LLVM_IR) they
223 // are not used because of an issue in the GPU driver, which does not
224 // recognize those operations in SPIR-V when they are used in mixed
225 // (scalar and vector) kernels that use the invoke_simd() API.
226 constexpr bool isMaskedGatherScatterLLVMAvailable() {
227 #ifdef __ESIMD_GATHER_SCATTER_LLVM_IR
228  return true;
229 #else
230  return false;
231 #endif
232 }
233 
234 } // namespace detail
235 
281 #ifndef __ESIMD_GATHER_SCATTER_LLVM_IR
284 #endif // __ESIMD_GATHER_SCATTER_LLVM_IR
307 template <typename T, int N, int VS, typename OffsetT,
308  typename PropertyListT =
309  ext::oneapi::experimental::empty_properties_t>
310 __ESIMD_API std::enable_if_t<
311  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
312 gather(const T *p, simd<OffsetT, N / VS> byte_offsets, simd_mask<N / VS> mask,
313  simd<T, N> pass_thru, PropertyListT props = {}) {
314  static_assert(std::is_integral_v<OffsetT>, "Unsupported offset type");
315  static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS");
316 
317  constexpr size_t Alignment =
318  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
319  static_assert(Alignment >= sizeof(T),
320  "gather() requires at least element-size alignment");
321  constexpr auto L1Hint =
322  detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
323  cache_hint::none);
324  constexpr auto L2Hint =
325  detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
326  cache_hint::none);
327 
328  // Use LSC lowering if L1/L2 or VS > 1. Also, if masked gather is
329  // not available, then LSC is the only lowering option.
330  if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none ||
331  VS > 1 || !detail::isMaskedGatherScatterLLVMAvailable()) {
332  static_assert(VS == 1 || sizeof(T) >= 4,
333  "VS > 1 is supported only for 4- and 8-byte elements");
334  return detail::gather_impl<T, VS, detail::lsc_data_size::default_size,
335  L1Hint, L2Hint>(p, byte_offsets, mask,
336  pass_thru);
337  } else {
338  simd<uint64_t, N> Addrs(reinterpret_cast<uint64_t>(p));
339  Addrs = Addrs + convert<uint64_t>(byte_offsets);
340 
341  using MsgT = detail::__raw_t<T>;
342  return __esimd_gather_ld<MsgT, N, Alignment>(
343  Addrs.data(), mask.data(),
344  sycl::bit_cast<__ESIMD_DNS::vector_type_t<MsgT, N>>(pass_thru.data()));
345  }
346 }
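// Usage sketch (illustrative; 'Ptr' is an assumed USM pointer): gather 32
// floats at consecutive element offsets with cache hints; lanes switched off
// in the mask return the matching pass-thru elements instead.
//   simd<uint32_t, 32> ByteOffsets(0, sizeof(float)); // 0, 4, 8, ...
//   simd_mask<32> Mask = 1;
//   simd<float, 32> PassThru = 0.0f;
//   properties Props{cache_hint_L1<cache_hint::cached>,
//                    cache_hint_L2<cache_hint::cached>};
//   simd<float, 32> Vals =
//       gather<float, 32, 1>(Ptr, ByteOffsets, Mask, PassThru, Props);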
347 
373 template <typename T, int N, int VS, typename OffsetT,
374  typename PropertyListT =
375  ext::oneapi::experimental::empty_properties_t>
376 __ESIMD_API std::enable_if_t<
377  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
378 gather(const T *p, simd<OffsetT, N / VS> byte_offsets, simd_mask<N / VS> mask,
379  PropertyListT props = {}) {
380  constexpr size_t Alignment =
381  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
382  static_assert(Alignment >= sizeof(T),
383  "gather() requires at least element-size alignment");
384  constexpr auto L1Hint =
385  detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
386  cache_hint::none);
387  constexpr auto L2Hint =
388  detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
389  cache_hint::none);
390 
391  if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none ||
392  VS > 1 || detail::isMaskedGatherScatterLLVMAvailable()) {
393  simd<T, N> PassThru; // it is intentionally undefined
394  return gather<T, N, VS>(p, byte_offsets, mask, PassThru, props);
395  } else {
396  static_assert(detail::isPowerOf2(N, 32), "Unsupported value of N");
397  simd<uintptr_t, N> Addrs = reinterpret_cast<uintptr_t>(p);
398  Addrs += convert<uintptr_t>(byte_offsets);
399  using MsgT = detail::__raw_t<T>;
400  if constexpr (sizeof(T) == 1) {
401  auto Ret = __esimd_svm_gather<MsgT, N, detail::ElemsPerAddrEncoding<4>(),
402  detail::ElemsPerAddrEncoding<1>()>(
403  Addrs.data(), mask.data());
404  return __esimd_rdregion<MsgT, N * 4, N, /*VS*/ 0, N, 4>(Ret, 0);
405  } else if constexpr (sizeof(T) == 2) {
406  auto Ret = __esimd_svm_gather<MsgT, N, detail::ElemsPerAddrEncoding<2>(),
407  detail::ElemsPerAddrEncoding<2>()>(
408  Addrs.data(), mask.data());
409  return __esimd_rdregion<MsgT, N * 2, N, /*VS*/ 0, N, 2>(Ret, 0);
410  } else {
411  return __esimd_svm_gather<MsgT, N, detail::ElemsPerAddrEncoding<1>(),
412  detail::ElemsPerAddrEncoding<1>()>(Addrs.data(),
413  mask.data());
414  }
415  }
416 }
417 
436 template <typename T, int N, int VS, typename OffsetT,
437  typename PropertyListT =
438  ext::oneapi::experimental::empty_properties_t>
439 __ESIMD_API std::enable_if_t<
440  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
441 gather(const T *p, simd<OffsetT, N / VS> byte_offsets,
442  PropertyListT props = {}) {
443  simd_mask<N / VS> Mask = 1;
444  return gather<T, N, VS>(p, byte_offsets, Mask, props);
445 }
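// Usage sketch (illustrative; 'Ptr' is an assumed USM pointer): the unmasked
// form simply forwards to the masked overload with an all-ones mask.
//   simd<uint32_t, 16> ByteOffsets(0, sizeof(int));
//   simd<int, 16> Vals = gather<int, 16, 1>(Ptr, ByteOffsets);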
446 
471 template <typename T, int N, typename OffsetT,
472  typename PropertyListT =
473  ext::oneapi::experimental::empty_properties_t>
474 __ESIMD_API std::enable_if_t<
475  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
476 gather(const T *p, simd<OffsetT, N> byte_offsets, simd_mask<N> mask,
477  simd<T, N> pass_thru, PropertyListT props = {}) {
478  constexpr int VS = 1;
479  return gather<T, N, VS>(p, byte_offsets, mask, pass_thru, props);
480 }
481 
503 template <typename T, int N, typename OffsetT,
504  typename PropertyListT =
505  ext::oneapi::experimental::empty_properties_t>
506 __ESIMD_API std::enable_if_t<
507  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
508 gather(const T *p, simd<OffsetT, N> byte_offsets, simd_mask<N> mask,
509  PropertyListT props = {}) {
510  constexpr int VS = 1;
511  return gather<T, N, VS>(p, byte_offsets, mask, props);
512 }
513 
529 template <typename T, int N, typename OffsetT,
530  typename PropertyListT =
531  ext::oneapi::experimental::empty_properties_t>
532 __ESIMD_API std::enable_if_t<
533  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
534 gather(const T *p, simd<OffsetT, N> byte_offsets, PropertyListT props = {}) {
535  constexpr int VS = 1;
536  return gather<T, N, VS>(p, byte_offsets, props);
537 }
538 
567 template <typename T, int N, int VS = 1, typename OffsetObjT,
568  typename OffsetRegionT,
569  typename PropertyListT =
570  ext::oneapi::experimental::empty_properties_t>
571 __ESIMD_API std::enable_if_t<
572  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
573 gather(const T *p, simd_view<OffsetObjT, OffsetRegionT> byte_offsets,
574  simd_mask<N / VS> mask, simd<T, N> pass_thru, PropertyListT props = {}) {
575  return gather<T, N, VS>(p, byte_offsets.read(), mask, pass_thru, props);
576 }
577 
601 template <typename T, int N, int VS = 1, typename OffsetObjT,
602  typename OffsetRegionT,
603  typename PropertyListT =
604  ext::oneapi::experimental::empty_properties_t>
605 __ESIMD_API std::enable_if_t<
606  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
607 gather(const T *p, simd_view<OffsetObjT, OffsetRegionT> byte_offsets,
608  simd_mask<N / VS> mask, PropertyListT props = {}) {
609  return gather<T, N, VS>(p, byte_offsets.read(), mask, props);
610 }
611 
629 template <typename T, int N, int VS = 1, typename OffsetObjT,
630  typename OffsetRegionT,
631  typename PropertyListT =
632  ext::oneapi::experimental::empty_properties_t>
633 __ESIMD_API std::enable_if_t<
634  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
635 gather(const T *p, simd_view<OffsetObjT, OffsetRegionT> byte_offsets,
636  PropertyListT props = {}) {
637  return gather<T, N, VS>(p, byte_offsets.read(), props);
638 }
639 
652 template <typename Tx, int N, typename Toffset>
653 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>, simd<Tx, N>>
654 gather(const Tx *p, Toffset offset, simd_mask<N> mask = 1) {
655  return gather<Tx, N>(p, simd<Toffset, N>(offset), mask);
656 }
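// Usage sketch (illustrative; 'Ptr' is an assumed USM pointer): the scalar
// offset is broadcast, so every enabled lane loads from the same address.
//   simd<float, 8> Same = gather<float, 8>(Ptr, 16); // all lanes read Ptr+16B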
657 
704 template <typename T, int N, int VS = 1, typename OffsetT,
705  typename PropertyListT =
706  ext::oneapi::experimental::empty_properties_t>
707 __ESIMD_API std::enable_if_t<
708  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
709 scatter(T *p, simd<OffsetT, N / VS> byte_offsets, simd<T, N> vals,
710  simd_mask<N / VS> mask, PropertyListT props = {}) {
711  static_assert(std::is_integral_v<OffsetT>, "Unsupported offset type");
712  static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS");
713 
714  constexpr size_t Alignment =
715  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
716  static_assert(Alignment >= sizeof(T),
717  "scatter() requires at least element-size alignment");
718  constexpr auto L1Hint =
719  detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
720  cache_hint::none);
721  constexpr auto L2Hint =
722  detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
723  cache_hint::none);
724 
725  // Use LSC lowering if L1/L2 or VS > 1.
726  if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none ||
727  VS > 1 ||
728  (!__ESIMD_DNS::isPowerOf2(N, 32) &&
729  !detail::isMaskedGatherScatterLLVMAvailable())) {
730  static_assert(VS == 1 || sizeof(T) >= 4,
731  "VS > 1 is supported only for 4- and 8-byte elements");
732  detail::scatter_impl<T, VS, detail::lsc_data_size::default_size,
733  L1Hint, L2Hint>(p, byte_offsets, vals, mask);
734  } else if constexpr (detail::isMaskedGatherScatterLLVMAvailable()) {
735  simd<uint64_t, N> Addrs(reinterpret_cast<uint64_t>(p));
736  Addrs = Addrs + convert<uint64_t>(byte_offsets);
737  using MsgT = detail::__raw_t<T>;
738  __esimd_scatter_st<MsgT, N, Alignment>(
739  sycl::bit_cast<__ESIMD_DNS::vector_type_t<MsgT, N>>(vals.data()),
740  Addrs.data(), mask.data());
741  } else {
742  using Tx = detail::__raw_t<T>;
743  simd<uint64_t, N> byte_offsets_i = convert<uint64_t>(byte_offsets);
744  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
745  addrs = addrs + byte_offsets_i;
746  if constexpr (sizeof(T) == 1) {
747  simd<T, N * 4> D = __esimd_wrregion<Tx, N * 4, N, /*VS*/ 0, N, 4>(
748  D.data(), vals.data(), 0);
749  __esimd_svm_scatter<Tx, N, detail::ElemsPerAddrEncoding<4>(),
750  detail::ElemsPerAddrEncoding<1>()>(
751  addrs.data(), D.data(), mask.data());
752  } else if constexpr (sizeof(T) == 2) {
753  simd<Tx, N * 2> D = __esimd_wrregion<Tx, N * 2, N, /*VS*/ 0, N, 2>(
754  D.data(), vals.data(), 0);
755  __esimd_svm_scatter<Tx, N, detail::ElemsPerAddrEncoding<2>(),
756  detail::ElemsPerAddrEncoding<2>()>(
757  addrs.data(), D.data(), mask.data());
758  } else
759  __esimd_svm_scatter<Tx, N, detail::ElemsPerAddrEncoding<1>(),
760  detail::ElemsPerAddrEncoding<1>()>(
761  addrs.data(), vals.data(), mask.data());
762  }
763 }
764 
765 // template <typename T, int N, int VS = 1, typename OffsetT,
766 // typename PropertyListT = empty_properties_t>
767 // void scatter(T *p, simd<OffsetT, N / VS> byte_offsets, simd<T, N> vals,
768 // PropertyListT props = {}); // (usm-sc-2)
786 template <typename T, int N, int VS = 1, typename OffsetT,
787  typename PropertyListT =
788  ext::oneapi::experimental::empty_properties_t>
789 __ESIMD_API std::enable_if_t<
790  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
791 scatter(T *p, simd<OffsetT, N / VS> byte_offsets, simd<T, N> vals,
792  PropertyListT props = {}) {
793  simd_mask<N / VS> Mask = 1;
794  scatter<T, N, VS>(p, byte_offsets, vals, Mask, props);
795 }
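// Usage sketch (illustrative; 'Ptr' is an assumed USM pointer): unmasked
// scatter of 16 floats to a two-element stride.
//   simd<uint32_t, 16> ByteOffsets(0, 2 * sizeof(float));
//   simd<float, 16> Vals = 1.0f;
//   scatter<float, 16>(Ptr, ByteOffsets, Vals);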
796 
797 // template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
798 // typename PropertyListT = empty_properties_t>
799 // void scatter(T *p, OffsetSimdViewT byte_offsets, simd<T, N> vals,
800 // simd_mask<N / VS> mask, PropertyListT props = {}); // (usm-sc-3)
821 template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
822  typename PropertyListT =
823  ext::oneapi::experimental::empty_properties_t>
824 __ESIMD_API std::enable_if_t<
825  detail::is_simd_view_type_v<OffsetSimdViewT> &&
826  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
827 scatter(T *p, OffsetSimdViewT byte_offsets, simd<T, N> vals,
828  simd_mask<N / VS> mask, PropertyListT props = {}) {
829  scatter<T, N, VS>(p, byte_offsets.read(), vals, mask, props);
830 }
831 
854 template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
855  typename PropertyListT =
856  ext::oneapi::experimental::empty_properties_t>
857 __ESIMD_API std::enable_if_t<
858  detail::is_simd_view_type_v<OffsetSimdViewT> &&
859  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
860 scatter(T *p, OffsetSimdViewT byte_offsets, simd<T, N> vals,
861  PropertyListT props = {}) {
862  simd_mask<N / VS> Mask = 1;
863  scatter<T, N, VS>(p, byte_offsets.read(), vals, Mask, props);
864 }
865 
877 template <typename Tx, int N, typename Toffset>
878 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> && N == 1>
879 scatter(Tx *p, Toffset offset, simd<Tx, N> vals, simd_mask<N> mask = 1) {
880  scatter<Tx, N, 1>(p, simd<Toffset, N>(offset), vals, mask);
881 }
882 
883 namespace detail {
884 // Accessors may get either 32-bit offset or 64-bit depending on
885 // the -fsycl-esimd-force-stateless-mem mode setting.
886 #ifdef __ESIMD_FORCE_STATELESS_MEM
887 using DeviceAccessorOffsetT = uint64_t;
888 #else
889 using DeviceAccessorOffsetT = uint32_t;
890 #endif
891 
892 template <typename T, int NElts, cache_hint L1H, cache_hint L2H,
893  typename FlagsT>
894 __ESIMD_API std::enable_if_t<is_simd_flag_type_v<FlagsT>, simd<T, NElts>>
895 block_load_impl(const T *p, simd_mask<1> pred, FlagsT flags) {
896  // Verify input template arguments.
897  check_cache_hint<cache_action::load, L1H, L2H>();
898  constexpr auto Alignment =
899  FlagsT::template alignment<__ESIMD_DNS::__raw_t<T>>;
900  static_assert(
901  (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
902  (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
903  "Incorrect alignment for the data type");
904 
905  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
906  constexpr int SmallIntFactor32Bit =
907  sizeof(uint32_t) / sizeof(T) > 1 ? sizeof(uint32_t) / sizeof(T) : 1;
908  static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
909  "Number of elements is not supported by Transposed load");
910 
911  // If alignment >= 8 and (NElts * sizeof(T)) % 8 == 0, we can load QWORDs.
912  // Don't do it for 4-byte vectors (unless it is greater than 256-bytes),
913  // because it would require a bit-cast, which is supposed to be NO-OP, but
914  // might confuse GPU BE sometimes. 1- and 2-byte vectors are casted anyways.
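 // Illustrative arithmetic (added example, not in the original comment):
 // T=uint16_t, NElts=16 gives 32 bytes; with alignment >= 8 it is loaded
 // as FactoredNElts=4 QWORDs. T=float, NElts=32 gives 128 bytes <= 256,
 // so it stays as 32 DWORDs and the bit-cast is avoided.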
915  constexpr bool Use64BitData =
916  Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
917  (NElts * sizeof(T)) % sizeof(uint64_t) == 0 &&
918  (sizeof(T) != sizeof(uint32_t) || NElts * sizeof(T) > 256);
919  constexpr int SmallIntFactor =
920  Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
921  constexpr int FactoredNElts = NElts / SmallIntFactor;
922  check_lsc_vector_size<FactoredNElts>();
923 
924  // Prepare template arguments for the call of intrinsic.
925  using LoadElemT = __ESIMD_DNS::__raw_t<
926  std::conditional_t<SmallIntFactor == 1, T,
927  std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
928  constexpr uint16_t AddressScale = 1;
929  constexpr int ImmOffset = 0;
930  constexpr lsc_data_size ActualDS =
931  Use64BitData ? lsc_data_size::u64 : lsc_data_size::u32;
932  constexpr lsc_vector_size VS = to_lsc_vector_size<FactoredNElts>();
933  constexpr auto Transposed = lsc_data_order::transpose;
934  constexpr int N = 1;
935 
936  // Prepare non-template arguments and call the intrinsic.
937  simd<uintptr_t, N> Addrs = reinterpret_cast<uintptr_t>(p);
938  simd<LoadElemT, FactoredNElts> Result =
939  __esimd_lsc_load_stateless<LoadElemT, L1H, L2H, AddressScale, ImmOffset,
940  ActualDS, VS, Transposed, N>(pred.data(),
941  Addrs.data());
942  return Result.template bit_cast_view<T>();
943 }
944 
977 template <typename T, int NElts, cache_hint L1H, cache_hint L2H,
978  typename FlagsT>
979 __ESIMD_API std::enable_if_t<is_simd_flag_type_v<FlagsT>, simd<T, NElts>>
980 block_load_impl(const T *p, simd_mask<1> pred, simd<T, NElts> pass_thru,
981  FlagsT flags) {
982  // Verify input template arguments.
983  check_cache_hint<cache_action::load, L1H, L2H>();
984  constexpr auto Alignment =
985  FlagsT::template alignment<__ESIMD_DNS::__raw_t<T>>;
986  static_assert(
987  (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
988  (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
989  "Incorrect alignment for the data type");
990 
991  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
992  constexpr int SmallIntFactor32Bit =
993  sizeof(uint32_t) / sizeof(T) > 1 ? sizeof(uint32_t) / sizeof(T) : 1;
994  static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
995  "Number of elements is not supported by Transposed load");
996 
997  // If alignment >= 8 and (NElts * sizeof(T)) % 8 == 0, we can load QWORDs.
998  // Don't do it for 4-byte vectors (unless it is greater than 256-bytes),
999  // because it would require a bit-cast, which is supposed to be NO-OP, but
1000  // might confuse GPU BE sometimes. 1- and 2-byte vectors are casted anyways.
1001  constexpr bool Use64BitData =
1002  Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
1003  (NElts * sizeof(T)) % sizeof(uint64_t) == 0 &&
1004  (sizeof(T) != sizeof(uint32_t) || NElts * sizeof(T) > 256);
1005  constexpr int SmallIntFactor =
1006  Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
1007  constexpr int FactoredNElts = NElts / SmallIntFactor;
1008  check_lsc_vector_size<FactoredNElts>();
1009 
1010  // Prepare template arguments for the call of intrinsic.
1011  using LoadElemT = __ESIMD_DNS::__raw_t<
1012  std::conditional_t<SmallIntFactor == 1, T,
1013  std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
1014  constexpr uint16_t AddressScale = 1;
1015  constexpr int ImmOffset = 0;
1016  constexpr lsc_data_size ActualDS =
1017  Use64BitData ? lsc_data_size::u64 : lsc_data_size::u32;
1018  constexpr lsc_vector_size VS = to_lsc_vector_size<FactoredNElts>();
1019  constexpr auto Transposed = lsc_data_order::transpose;
1020  constexpr int N = 1;
1021 
1022  // Prepare non-template arguments and call the intrinsic.
1023  simd<uintptr_t, N> Addrs = reinterpret_cast<uintptr_t>(p);
1024  simd<LoadElemT, FactoredNElts> PassThru =
1025  pass_thru.template bit_cast_view<LoadElemT>();
1026  simd<LoadElemT, FactoredNElts> Result =
1027  __esimd_lsc_load_merge_stateless<LoadElemT, L1H, L2H, AddressScale,
1028  ImmOffset, ActualDS, VS, Transposed, N>(
1029  pred.data(), Addrs.data(), PassThru.data());
1030  return Result.template bit_cast_view<T>();
1031 }
1032 
1066 template <typename T, int NElts, cache_hint L1H, cache_hint L2H,
1067  typename AccessorT, typename FlagsT>
1068 __ESIMD_API
1069  std::enable_if_t<detail::is_device_accessor_with_v<
1070  AccessorT, detail::accessor_mode_cap::can_read> &&
1071  is_simd_flag_type_v<FlagsT>,
1072  simd<T, NElts>>
1073 block_load_impl(AccessorT acc, DeviceAccessorOffsetT offset,
1074  simd_mask<1> pred, FlagsT flags) {
1075 #ifdef __ESIMD_FORCE_STATELESS_MEM
1076  return block_load_impl<T, NElts, L1H, L2H>(accessorToPointer<T>(acc, offset),
1077  pred, flags);
1078 #else // !__ESIMD_FORCE_STATELESS_MEM
1079  // Verify input template arguments.
1080  check_cache_hint<cache_action::load, L1H, L2H>();
1081  constexpr auto Alignment =
1082  FlagsT::template alignment<__ESIMD_DNS::__raw_t<T>>;
1083  static_assert(
1084  (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
1085  (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
1086  "Incorrect alignment for the data type");
1087 
1088  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
1089  constexpr int SmallIntFactor32Bit =
1090  sizeof(uint32_t) / sizeof(T) > 1 ? sizeof(uint32_t) / sizeof(T) : 1;
1091  static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
1092  "Number of elements is not supported by Transposed load");
1093 
1094  // If alignment >= 8 and (NElts * sizeof(T)) % 8 == 0, we can load QWORDs.
1095  // Don't do it for 4-byte vectors (unless it is greater than 256-bytes),
1096  // because it would require a bit-cast, which is supposed to be NO-OP, but
1097  // might confuse GPU BE sometimes. 1- and 2-byte vectors are casted anyways.
1098  constexpr bool Use64BitData =
1099  Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
1100  (NElts * sizeof(T)) % sizeof(uint64_t) == 0 &&
1101  (sizeof(T) != sizeof(uint32_t) || NElts * sizeof(T) > 256);
1102  constexpr int SmallIntFactor =
1103  Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
1104  constexpr int FactoredNElts = NElts / SmallIntFactor;
1105  check_lsc_vector_size<FactoredNElts>();
1106 
1107  // Prepare template arguments for the call of intrinsic.
1108  using LoadElemT = __ESIMD_DNS::__raw_t<
1109  std::conditional_t<SmallIntFactor == 1, T,
1110  std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
1111 
1112  constexpr uint16_t AddressScale = 1;
1113  constexpr int ImmOffset = 0;
1114  constexpr lsc_data_size ActualDS =
1115  Use64BitData ? lsc_data_size::u64 : lsc_data_size::u32;
1116  constexpr auto VS = to_lsc_vector_size<FactoredNElts>();
1117  constexpr auto Transposed = lsc_data_order::transpose;
1118  constexpr int N = 1;
1119 
1120  // Prepare non-template arguments and call the intrinsic.
1121  simd<uint32_t, N> Offsets = offset;
1122  auto SI = get_surface_index(acc);
1123  simd<LoadElemT, FactoredNElts> Result =
1124  __esimd_lsc_load_bti<LoadElemT, L1H, L2H, AddressScale, ImmOffset,
1125  ActualDS, VS, Transposed, N>(pred.data(),
1126  Offsets.data(), SI);
1127  return Result.template bit_cast_view<T>();
1128 #endif // !__ESIMD_FORCE_STATELESS_MEM
1129 }
1130 
1165 template <typename T, int NElts, cache_hint L1H, cache_hint L2H,
1166  typename AccessorT, typename FlagsT>
1167 __ESIMD_API
1168  std::enable_if_t<detail::is_device_accessor_with_v<
1169  AccessorT, detail::accessor_mode_cap::can_read> &&
1170  is_simd_flag_type_v<FlagsT>,
1173  simd_mask<1> pred, simd<T, NElts> pass_thru, FlagsT flags) {
1174 #ifdef __ESIMD_FORCE_STATELESS_MEM
1175  return block_load_impl<T, NElts, L1H, L2H>(accessorToPointer<T>(acc, offset),
1176  pred, pass_thru, flags);
1177 #else // !__ESIMD_FORCE_STATELESS_MEM
1178  // Verify input template arguments.
1179  check_cache_hint<cache_action::load, L1H, L2H>();
1180  constexpr auto Alignment =
1181  FlagsT::template alignment<__ESIMD_DNS::__raw_t<T>>;
1182  static_assert(
1183  (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
1184  (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
1185  "Incorrect alignment for the data type");
1186 
1187  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
1188  constexpr int SmallIntFactor32Bit =
1189  sizeof(uint32_t) / sizeof(T) > 1 ? sizeof(uint32_t) / sizeof(T) : 1;
1190  static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
1191  "Number of elements is not supported by Transposed load");
1192 
1193  // If alignment >= 8 and (NElts * sizeof(T)) % 8 == 0, we can load QWORDs.
1194  // Don't do it for 4-byte vectors (unless it is greater than 256-bytes),
1195  // because it would require a bit-cast, which is supposed to be NO-OP, but
1196  // might confuse GPU BE sometimes. 1- and 2-byte vectors are casted anyways.
1197  constexpr bool Use64BitData =
1198  Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
1199  (NElts * sizeof(T)) % sizeof(uint64_t) == 0 &&
1200  (sizeof(T) != sizeof(uint32_t) || NElts * sizeof(T) > 256);
1201  constexpr int SmallIntFactor =
1202  Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
1203  constexpr int FactoredNElts = NElts / SmallIntFactor;
1204  check_lsc_vector_size<FactoredNElts>();
1205 
1206  // Prepare template arguments for the call of intrinsic.
1207  using LoadElemT = __ESIMD_DNS::__raw_t<
1208  std::conditional_t<SmallIntFactor == 1, T,
1209  std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
1210 
1211  constexpr uint16_t AddressScale = 1;
1212  constexpr int ImmOffset = 0;
1213  constexpr lsc_data_size ActualDS =
1214  Use64BitData ? lsc_data_size::u64 : lsc_data_size::u32;
1215  constexpr auto VS = to_lsc_vector_size<FactoredNElts>();
1216  constexpr auto Transposed = lsc_data_order::transpose;
1217  constexpr int N = 1;
1218 
1219  // Prepare non-template arguments and call the intrinsic.
1220  simd<uint32_t, N> Offsets = offset;
1221  auto SI = get_surface_index(acc);
1222  simd<LoadElemT, FactoredNElts> PassThru =
1223  pass_thru.template bit_cast_view<LoadElemT>();
1224  simd<LoadElemT, FactoredNElts> Result =
1225  __esimd_lsc_load_merge_bti<LoadElemT, L1H, L2H, AddressScale, ImmOffset,
1226  ActualDS, VS, Transposed, N>(
1227  pred.data(), Offsets.data(), SI, PassThru.data());
1228  return Result.template bit_cast_view<T>();
1229 #endif // !__ESIMD_FORCE_STATELESS_MEM
1230 }
1231 
1232 template <typename T, int NElts, cache_hint L1H, cache_hint L2H,
1233  typename FlagsT>
1234 __ESIMD_API std::enable_if_t<is_simd_flag_type_v<FlagsT>>
1235 block_store_impl(T *p, simd<T, NElts> vals, simd_mask<1> pred, FlagsT flags) {
1236  detail::check_cache_hint<cache_action::store, L1H, L2H>();
1237  constexpr auto Alignment =
1238  FlagsT::template alignment<__ESIMD_DNS::__raw_t<T>>;
1239  static_assert(
1240  (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
1241  (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
1242  "Incorrect alignment for the data type");
1243 
1244  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
1245  constexpr int SmallIntFactor32Bit =
1246  sizeof(uint32_t) / sizeof(T) > 1 ? sizeof(uint32_t) / sizeof(T) : 1;
1247  static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
1248  "Number of elements is not supported by Transposed store");
1249 
1250  // If alignment >= 8 and (NElts * sizeof(T)) % 8 == 0, we can store QWORDs.
1251  // Don't do it for 4-byte vectors (unless it is greater than 256-bytes),
1252  // because it would require a bit-cast, which is supposed to be NO-OP, but
1253  // might confuse GPU BE sometimes. 1- and 2-byte vectors are casted anyways.
1254  constexpr bool Use64BitData =
1255  Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
1256  (NElts * sizeof(T)) % sizeof(uint64_t) == 0 &&
1257  (sizeof(T) != sizeof(uint32_t) || NElts * sizeof(T) > 256);
1258 
1259  constexpr int SmallIntFactor =
1260  Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
1261  constexpr int FactoredNElts = NElts / SmallIntFactor;
1262 
1263  check_lsc_vector_size<FactoredNElts>();
1264 
1265  using StoreType = __ESIMD_DNS::__raw_t<
1266  std::conditional_t<SmallIntFactor == 1, T,
1267  std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
1268  constexpr uint16_t AddressScale = 1;
1269  constexpr int ImmOffset = 0;
1270  constexpr lsc_data_size ActualDS =
1271  Use64BitData ? lsc_data_size::u64 : lsc_data_size::u32;
1272  constexpr lsc_vector_size VS = to_lsc_vector_size<FactoredNElts>();
1273  constexpr auto Transposed = lsc_data_order::transpose;
1274  constexpr int N = 1;
1275  simd<uintptr_t, N> Addrs = reinterpret_cast<uintptr_t>(p);
1276 
1277  __esimd_lsc_store_stateless<StoreType, L1H, L2H, AddressScale, ImmOffset,
1278  ActualDS, VS, Transposed, N>(
1279  pred.data(), Addrs.data(),
1280  sycl::bit_cast<__ESIMD_DNS::vector_type_t<StoreType, FactoredNElts>>(
1281  vals.data()));
1282 }
1283 
1284 template <typename T, int NElts, cache_hint L1H, cache_hint L2H,
1285  typename AccessorT, typename FlagsT>
1286 __ESIMD_API
1287  std::enable_if_t<detail::is_device_accessor_with_v<
1288  AccessorT, detail::accessor_mode_cap::can_write> &&
1289  is_simd_flag_type_v<FlagsT>>
1290 block_store_impl(AccessorT acc, DeviceAccessorOffsetT offset,
1291  simd<T, NElts> vals, simd_mask<1> pred, FlagsT flags) {
1292 #ifdef __ESIMD_FORCE_STATELESS_MEM
1293  block_store_impl<T, NElts, L1H, L2H>(accessorToPointer<T>(acc, offset), vals,
1294  pred, flags);
1295 #else
1296  // Verify input template arguments.
1297  check_cache_hint<cache_action::store, L1H, L2H>();
1298  constexpr auto Alignment =
1299  FlagsT::template alignment<__ESIMD_DNS::__raw_t<T>>;
1300  static_assert(
1301  (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
1302  (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
1303  "Incorrect alignment for the data type");
1304 
1305  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
1306  constexpr int SmallIntFactor32Bit =
1307  sizeof(uint32_t) / sizeof(T) > static_cast<size_t>(1)
1308  ? sizeof(uint32_t) / sizeof(T)
1309  : static_cast<size_t>(1);
1310  static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
1311  "Number of elements is not supported by Transposed store");
1312 
1313  // If alignment >= 8 and (NElts * sizeof(T)) % 8 == 0, we can store QWORDs.
1314  // Don't do it for 4-byte vectors (unless it is greater than 256-bytes),
1315  // because it would require a bit-cast, which is supposed to be NO-OP, but
1316  // might confuse GPU BE sometimes. 1- and 2-byte vectors are casted anyways.
1317  constexpr bool Use64BitData =
1318  Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
1319  (NElts * sizeof(T)) % sizeof(uint64_t) == 0 &&
1320  (sizeof(T) != sizeof(uint32_t) || NElts * sizeof(T) > 256);
1321  constexpr int SmallIntFactor =
1322  Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
1323  constexpr int FactoredNElts = NElts / SmallIntFactor;
1324  check_lsc_vector_size<FactoredNElts>();
1325 
1326  // Prepare template arguments for the call of intrinsic.
1327  using StoreElemT = __ESIMD_DNS::__raw_t<
1328  std::conditional_t<SmallIntFactor == 1, T,
1329  std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
1330 
1331  constexpr uint16_t AddressScale = 1;
1332  constexpr int ImmOffset = 0;
1333  constexpr lsc_data_size ActualDS =
1334  Use64BitData ? lsc_data_size::u64 : lsc_data_size::u32;
1335  constexpr auto VS = to_lsc_vector_size<FactoredNElts>();
1336  constexpr auto Transposed = lsc_data_order::transpose;
1337  constexpr int N = 1;
1338 
1339  // Prepare non-template arguments and call the intrinsic.
1340  simd<uint32_t, N> Offsets = offset;
1341  auto SI = get_surface_index(acc);
1342 
1343  __esimd_lsc_store_bti<StoreElemT, L1H, L2H, AddressScale, ImmOffset, ActualDS,
1344  VS, Transposed, N>(
1345  pred.data(), Offsets.data(),
1346  sycl::bit_cast<__ESIMD_DNS::vector_type_t<StoreElemT, FactoredNElts>>(
1347  vals.data()),
1348  SI);
1349 #endif
1350 }
1351 
1352 } // namespace detail
1353 
1368 template <typename Tx, int N,
1369  typename Flags = vector_aligned_tag>
1370 __ESIMD_API std::enable_if_t<is_simd_flag_type_v<Flags>>
1371 block_store(Tx *addr, simd<Tx, N> vals, Flags) {
1372  using T = typename detail::__raw_t<Tx>;
1373  using VecT = typename simd<T, N>::raw_vector_type;
1374  constexpr size_t Align = Flags::template alignment<simd<T, N>>;
1375  __esimd_svm_block_st<T, N, Align>(reinterpret_cast<VecT *>(addr),
1376  vals.data());
1377 }
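// Usage sketch (illustrative; 'Ptr' is an assumed USM pointer): store 8
// floats (32 bytes) through the flags-based overload above, promising
// 16-byte alignment via the overaligned tag.
//   simd<float, 8> Vals = 0.0f;
//   block_store(Ptr, Vals, overaligned_tag<16>{});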
1378 
1441 template <typename T, int N,
1442  typename PropertyListT =
1443  ext::oneapi::experimental::empty_properties_t>
1444 __ESIMD_API std::enable_if_t<
1445  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
1446 block_load(const T *ptr, PropertyListT props = {}) {
1447  constexpr auto L1Hint =
1448  detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
1449  cache_hint::none);
1450  constexpr auto L2Hint =
1451  detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
1452  cache_hint::none);
1453  static_assert(!PropertyListT::template has_property<cache_hint_L3_key>(),
1454  "L3 cache hint is reserved. The old/experimental L3 LSC cache "
1455  "hint is cache_level::L2 now.");
1456 
1457  constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
1458  constexpr size_t Alignment =
1459  detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
1460  if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none) {
1461  detail::check_cache_hint<detail::cache_action::load, L1Hint, L2Hint>();
1462 
1463  simd_mask<1> Mask = 1;
1464  return detail::block_load_impl<T, N, L1Hint, L2Hint>(
1465  ptr, Mask, overaligned_tag<Alignment>{});
1466  } else {
1467  return block_load<T, N>(ptr, overaligned_tag<Alignment>{});
1468  }
1469 }
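// Usage sketch (illustrative; 'Ptr' is an assumed USM pointer): passing cache
// hints selects the LSC lowering above; without hints the legacy path is used.
//   properties Props{cache_hint_L1<cache_hint::streaming>,
//                    cache_hint_L2<cache_hint::cached>};
//   simd<float, 16> Vals = block_load<float, 16>(Ptr, Props);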
1470 
1506 template <typename T, int N,
1507  typename PropertyListT =
1508  ext::oneapi::experimental::empty_properties_t>
1509 __ESIMD_API std::enable_if_t<
1510  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
1511 block_load(const T *ptr, size_t byte_offset, PropertyListT props = {}) {
1512  const T *AdjustedPtr = reinterpret_cast<const T *>(
1513  reinterpret_cast<const int8_t *>(ptr) + byte_offset);
1514  return block_load<T, N>(AdjustedPtr, props);
1515 }
1516 
1549 template <typename T, int N,
1550  typename PropertyListT =
1551  ext::oneapi::experimental::empty_properties_t>
1552 __ESIMD_API std::enable_if_t<
1553  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
1554 block_load(const T *ptr, simd_mask<1> pred, PropertyListT props = {}) {
1555  constexpr auto L1Hint =
1556  detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
1557  cache_hint::none);
1558  constexpr auto L2Hint =
1559  detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
1560  cache_hint::none);
1561  static_assert(!PropertyListT::template has_property<cache_hint_L3_key>(),
1562  "L3 cache hint is reserved. The old/experimental L3 LSC cache "
1563  "hint is cache_level::L2 now.");
1564 
1565  detail::check_cache_hint<detail::cache_action::load, L1Hint, L2Hint>();
1566  constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
1567  constexpr size_t Alignment =
1568  detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
1569 
1570  return detail::block_load_impl<T, N, L1Hint, L2Hint>(
1571  ptr, pred, overaligned_tag<Alignment>{});
1572 }
1573 
1607 template <typename T, int N,
1608  typename PropertyListT =
1609  ext::oneapi::experimental::empty_properties_t>
1610 __ESIMD_API std::enable_if_t<
1611  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
1612 block_load(const T *ptr, size_t byte_offset, simd_mask<1> pred,
1613  PropertyListT props = {}) {
1614  const T *AdjustedPtr = reinterpret_cast<const T *>(
1615  reinterpret_cast<const int8_t *>(ptr) + byte_offset);
1616  return block_load<T, N>(AdjustedPtr, pred, props);
1617 }
1618 
1651 template <typename T, int N,
1652  typename PropertyListT =
1653  ext::oneapi::experimental::empty_properties_t>
1654 __ESIMD_API std::enable_if_t<
1655  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
1656 block_load(const T *ptr, simd_mask<1> pred, simd<T, N> pass_thru,
1657  PropertyListT props = {}) {
1658  constexpr auto L1Hint =
1659  detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
1660  cache_hint::none);
1661  constexpr auto L2Hint =
1662  detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
1663  cache_hint::none);
1664  static_assert(!PropertyListT::template has_property<cache_hint_L3_key>(),
1665  "L3 cache hint is reserved. The old/experimental L3 LSC cache "
1666  "hint is cache_level::L2 now.");
1667 
1668  detail::check_cache_hint<detail::cache_action::load, L1Hint, L2Hint>();
1669  constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
1670  constexpr size_t Alignment =
1671  detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
1672 
1673  return detail::block_load_impl<T, N, L1Hint, L2Hint>(
1674  ptr, pred, pass_thru, overaligned_tag<Alignment>{});
1675 }
1676 
1711 template <typename T, int N,
1712  typename PropertyListT =
1713  ext::oneapi::experimental::empty_properties_t>
1714 __ESIMD_API std::enable_if_t<
1715  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
1716 block_load(const T *ptr, size_t byte_offset, simd_mask<1> pred,
1717  simd<T, N> pass_thru, PropertyListT props = {}) {
1718  const T *AdjustedPtr = reinterpret_cast<const T *>(
1719  reinterpret_cast<const int8_t *>(ptr) + byte_offset);
1720  return block_load<T, N>(AdjustedPtr, pred, pass_thru, props);
1721 }
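// Usage sketch (illustrative; 'Ptr' and 'DoLoad' are assumed): a disabled
// predicate skips the memory access and returns the pass-thru vector.
//   simd<int, 32> PassThru = -1;
//   simd<int, 32> Vals = block_load<int, 32>(Ptr, /*byte_offset=*/256,
//                                            simd_mask<1>(DoLoad), PassThru);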
1722 
1738 template <typename Tx, int N,
1739  typename Flags = vector_aligned_tag>
1740 __ESIMD_API std::enable_if_t<is_simd_flag_type_v<Flags>, simd<Tx, N>>
1741 block_load(const Tx *addr, Flags) {
1742  using T = typename detail::__raw_t<Tx>;
1743  using VecT = typename simd<T, N>::raw_vector_type;
1744  constexpr size_t Align = Flags::template alignment<simd<T, N>>;
1745  return __esimd_svm_block_ld<T, N, Align>(
1746  reinterpret_cast<const VecT *>(addr));
1747 }
1748 
1764 template <typename Tx, int N, typename AccessorTy,
1765  typename Flags = vector_aligned_tag,
1766  typename = std::enable_if_t<
1767  is_simd_flag_type_v<Flags> &&
1768  detail::is_device_accessor_with_v<
1769  AccessorTy, detail::accessor_mode_cap::can_read>>,
1770  class T = detail::__raw_t<Tx>>
1771 __ESIMD_API simd<Tx, N> block_load(AccessorTy acc,
1772  detail::DeviceAccessorOffsetT byte_offset,
1773  Flags flags) {
1774 #ifdef __ESIMD_FORCE_STATELESS_MEM
1775  return block_load<Tx, N>(__ESIMD_DNS::accessorToPointer<Tx>(acc, byte_offset),
1776  flags);
1777 #else
1778  std::ignore = flags;
1779  constexpr unsigned Sz = sizeof(T) * N;
1780  static_assert(Sz >= detail::OperandSize::OWORD,
1781  "block size must be at least 1 oword");
1782  static_assert(Sz % detail::OperandSize::OWORD == 0,
1783  "block size must be whole number of owords");
1784  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
1785  "block must be 1, 2, 4 or 8 owords long");
1786  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
1787  "block size must be at most 8 owords");
1788 
1789  auto surf_ind = __esimd_get_surface_index(
1790  detail::AccessorPrivateProxy::getQualifiedPtrOrImageObj(acc));
1791 
1792  if constexpr (Flags::template alignment<simd<T, N>> >=
1793  detail::OperandSize::OWORD) {
1794  return __esimd_oword_ld<T, N>(surf_ind, byte_offset >> 4);
1795  } else {
1796  return __esimd_oword_ld_unaligned<T, N>(surf_ind, byte_offset);
1797  }
1798 #endif
1799 }
1800 
1865 template <typename T, int N, typename AccessorT,
1866  typename PropertyListT =
1867  ext::oneapi::experimental::empty_properties_t>
1868 __ESIMD_API std::enable_if_t<
1869  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
1870  detail::is_device_accessor_with_v<AccessorT,
1871  detail::accessor_mode_cap::can_read>,
1872  simd<T, N>>
1873 block_load(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset,
1874  PropertyListT props = {}) {
1875 #ifdef __ESIMD_FORCE_STATELESS_MEM
1876  return block_load<T, N>(detail::accessorToPointer<T>(acc, byte_offset),
1877  props);
1878 #else // !__ESIMD_FORCE_STATELESS_MEM
1879  constexpr auto L1Hint =
1880  detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
1881  cache_hint::none);
1882  constexpr auto L2Hint =
1883  detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
1884  cache_hint::none);
1885  static_assert(!PropertyListT::template has_property<cache_hint_L3_key>(),
1886  "L3 cache hint is reserved. The old/experimental L3 LSC cache "
1887  "hint is cache_level::L2 now.");
1888 
1889  // If the alignment property is not passed, then assume the pointer
1890  // is element-aligned.
1891  constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
1892  constexpr size_t Alignment =
1893  detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
1894 
1895  // Legacy surface index loads must be 1, 2, 4 or 8 owords long.
1896  constexpr size_t Size = sizeof(T) * N;
1897  constexpr size_t OWord = detail::OperandSize::OWORD;
1898  constexpr bool IsLegacySize = Size == OWord || Size == 2 * OWord ||
1899  Size == 4 * OWord || Size == 8 * OWord;
1900 
1901  if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none ||
1902  !IsLegacySize) {
1903  return detail::block_load_impl<T, N, L1Hint, L2Hint>(
1904  acc, byte_offset, simd_mask<1>(1), overaligned_tag<Alignment>{});
1905  } else {
1906  return block_load<T, N>(acc, byte_offset, overaligned_tag<Alignment>{});
1907  }
1908 #endif // !__ESIMD_FORCE_STATELESS_MEM
1909 }
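// Usage sketch (illustrative; 'Acc' is an assumed read-capable device
// accessor): load two owords (8 floats) starting at byte offset 0.
//   simd<float, 8> Vals = block_load<float, 8>(Acc, 0);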
1910 
1940 template <typename T, int N, typename AccessorT,
1941  typename PropertyListT =
1942  ext::oneapi::experimental::empty_properties_t>
1943 __ESIMD_API std::enable_if_t<
1944  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
1945  detail::is_device_accessor_with_v<AccessorT,
1946  detail::accessor_mode_cap::can_read>,
1947  simd<T, N>>
1948 block_load(AccessorT acc, PropertyListT /* props */ = {}) {
1949  // Create new properties without the alignment property passed in 'props',
1950  // and add alignment<16> as it is usable and most favourable in this case.
1951  constexpr auto L1Hint =
1952  detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
1953  cache_hint::none);
1954  constexpr auto L2Hint =
1955  detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
1956  cache_hint::none);
1957  static_assert(!PropertyListT::template has_property<cache_hint_L3_key>(),
1958  "L3 cache hint is reserved. The old/experimental L3 LSC cache "
1959  "hint is cache_level::L2 now.");
1960  properties Props{cache_hint_L1<L1Hint>, cache_hint_L2<L2Hint>, alignment<16>};
1961  return block_load<T, N>(acc, 0, Props);
1962 }
1963 
1995 template <typename T, int N, typename AccessorT,
1996  typename PropertyListT =
1997  ext::oneapi::experimental::empty_properties_t>
1998 __ESIMD_API std::enable_if_t<
1999  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
2000  detail::is_device_accessor_with_v<AccessorT,
2001  detail::accessor_mode_cap::can_read>,
2002  simd<T, N>>
2003 block_load(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset,
2004  simd_mask<1> pred, simd<T, N> pass_thru,
2005  PropertyListT /* props */ = {}) {
2006  constexpr auto L1Hint =
2007  detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
2008  cache_hint::none);
2009  constexpr auto L2Hint =
2010  detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
2011  cache_hint::none);
2012  static_assert(!PropertyListT::template has_property<cache_hint_L3_key>(),
2013  "L3 cache hint is reserved. The old/experimental L3 LSC cache "
2014  "hint is cache_level::L2 now.");
2015 
2016  // If the alignment property is not passed, then assume the byte_offset
2017  // is element-aligned and is at least 4 bytes.
2018  constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
2019  constexpr size_t Alignment =
2020  detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
2021  return detail::block_load_impl<T, N, L1Hint, L2Hint>(
2022  acc, byte_offset, pred, pass_thru, overaligned_tag<Alignment>{});
2023 }
2024 
2056 template <typename T, int N, typename AccessorT,
2057  typename PropertyListT =
2058  ext::oneapi::experimental::empty_properties_t>
2059 __ESIMD_API std::enable_if_t<
2060  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
2061  detail::is_device_accessor_with_v<AccessorT,
2062  detail::accessor_mode_cap::can_read>,
2063  simd<T, N>>
2064 block_load(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset,
2065  simd_mask<1> pred, PropertyListT props = {}) {
2066  simd<T, N> PassThru; // Intentionally uninitialized.
2067  return block_load<T, N>(acc, byte_offset, pred, PassThru, props);
2068 }
2069 
2097 template <typename T, int N, typename AccessorT,
2098  typename PropertyListT =
2099  ext::oneapi::experimental::empty_properties_t>
2100 __ESIMD_API std::enable_if_t<
2101  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
2102  detail::is_device_accessor_with_v<AccessorT,
2103  detail::accessor_mode_cap::can_read>,
2104  simd<T, N>>
2105 block_load(AccessorT acc, simd_mask<1> pred, simd<T, N> pass_thru,
2106  PropertyListT /* props */ = {}) {
2107  // Create new properties without the alignment property passed in 'props',
2108  // and add alignment<16> as it is usable and most favourable in this case.
2109  constexpr auto L1Hint =
2110  detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
2111  cache_hint::none);
2112  constexpr auto L2Hint =
2113  detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
2114  cache_hint::none);
2115  static_assert(!PropertyListT::template has_property<cache_hint_L3_key>(),
2116  "L3 cache hint is reserved. The old/experimental L3 LSC cache "
2117  "hint is cache_level::L2 now.");
2118  properties Props{cache_hint_L1<L1Hint>, cache_hint_L2<L2Hint>, alignment<16>};
2119  return block_load<T, N>(acc, 0, pred, pass_thru, Props);
2120 }
2121 
2148 template <typename T, int N, typename AccessorT,
2149  typename PropertyListT =
2150  ext::oneapi::experimental::empty_properties_t>
2151 __ESIMD_API std::enable_if_t<
2152  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
2153  detail::is_device_accessor_with_v<AccessorT,
2154  detail::accessor_mode_cap::can_read>,
2155  simd<T, N>>
2156 block_load(AccessorT acc, simd_mask<1> pred, PropertyListT /* props */ = {}) {
2157  // Create new properties without the alignment property passed in 'props',
2158  // and add alignment<16> as it is usable and most favourable in this case.
2159  constexpr auto L1Hint =
2160  detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
2161  cache_hint::none);
2162  constexpr auto L2Hint =
2163  detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
2164  cache_hint::none);
2165  static_assert(!PropertyListT::template has_property<cache_hint_L3_key>(),
2166  "L3 cache hint is reserved. The old/experimental L3 LSC cache "
2167  "hint is cache_level::L2 now.");
2168  properties Props{cache_hint_L1<L1Hint>, cache_hint_L2<L2Hint>, alignment<16>};
2169 
2170  simd<T, N> PassThru; // Intentionally uninitialized.
2171  return block_load<T, N>(acc, 0, pred, PassThru, Props);
2172 }
2173 
2227 template <typename T, int N,
2228  typename PropertyListT =
2229  ext::oneapi::experimental::empty_properties_t>
2230 __ESIMD_API std::enable_if_t<
2231  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
2232 block_store(T *ptr, simd<T, N> vals, PropertyListT props = {}) {
2233  constexpr auto L1Hint =
2234  detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
2235  cache_hint::none);
2236  constexpr auto L2Hint =
2237  detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
2238  cache_hint::none);
2239  static_assert(!PropertyListT::template has_property<cache_hint_L3_key>(),
2240  "L3 cache hint is reserved. The old/experimental L3 LSC cache "
2241  "hint is cache_level::L2 now.");
2242  if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none) {
2243  detail::check_cache_hint<detail::cache_action::store, L1Hint, L2Hint>();
2244  constexpr int DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
2245  constexpr size_t Alignment =
2246  detail::getPropertyValue<PropertyListT, alignment_key>(
2247  DefaultAlignment);
2248 
2249  simd_mask<1> Mask = 1;
2250  detail::block_store_impl<T, N, L1Hint, L2Hint>(
2251  ptr, vals, Mask, overaligned_tag<Alignment>{});
2252  } else {
2253  // If the alignment property is not passed, then assume the pointer
2254  // is OWORD-aligned.
2255  constexpr size_t Alignment =
2256  detail::getPropertyValue<PropertyListT, alignment_key>(
2257  detail::OperandSize::OWORD);
2258  block_store<T, N>(ptr, vals, overaligned_tag<Alignment>{});
2259  }
2260 }
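// Usage sketch (illustrative; 'Ptr' is an assumed USM pointer): with no
// alignment property and no cache hints, the pointer is assumed to be
// OWORD (16-byte) aligned and the legacy store path above is taken.
//   simd<float, 16> Vals = 2.0f;
//   block_store<float, 16>(Ptr, Vals);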
2261 
2296 template <typename T, int N,
2297  typename PropertyListT =
2298  ext::oneapi::experimental::empty_properties_t>
2299 __ESIMD_API std::enable_if_t<
2300  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
2301 block_store(T *ptr, size_t byte_offset, simd<T, N> vals,
2302  PropertyListT props = {}) {
2303  T *AdjustedPtr =
2304  reinterpret_cast<T *>(reinterpret_cast<int8_t *>(ptr) + byte_offset);
2305  block_store<T, N>(AdjustedPtr, vals, props);
2306 }
2307 
2340 template <typename T, int N,
2341  typename PropertyListT =
2342  ext::oneapi::experimental::empty_properties_t>
2343 __ESIMD_API std::enable_if_t<
2344  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
2345 block_store(T *ptr, simd<T, N> vals, simd_mask<1> pred,
2346  PropertyListT props = {}) {
2347  constexpr auto L1Hint =
2348  detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
2349  cache_hint::none);
2350  constexpr auto L2Hint =
2351  detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
2352  cache_hint::none);
2353  static_assert(!PropertyListT::template has_property<cache_hint_L3_key>(),
2354  "L3 cache hint is reserved. The old/experimental L3 LSC cache "
2355  "hint is cache_level::L2 now.");
2356 
2357  constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
2358  constexpr size_t Alignment =
2359  detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
2360 
2361  detail::block_store_impl<T, N, L1Hint, L2Hint>(ptr, vals, pred,
2362  overaligned_tag<Alignment>{});
2363 }
2364 
2401 template <typename T, int N,
2402  typename PropertyListT =
2403  ext::oneapi::experimental::empty_properties_t>
2404 __ESIMD_API std::enable_if_t<
2405  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
2406 block_store(T *ptr, size_t byte_offset, simd<T, N> vals, simd_mask<1> pred,
2407  PropertyListT props = {}) {
2408  T *AdjustedPtr =
2409  reinterpret_cast<T *>(reinterpret_cast<int8_t *>(ptr) + byte_offset);
2410  block_store<T, N>(AdjustedPtr, vals, pred, props);
2411 }
2412 
2475 template <typename T, int N, typename AccessorT,
2476  typename PropertyListT =
2477  ext::oneapi::experimental::empty_properties_t>
2478 __ESIMD_API std::enable_if_t<
2479  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
2480  detail::is_device_accessor_with_v<AccessorT,
2481  detail::accessor_mode_cap::can_write>>
2482 block_store(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset,
2483  simd<T, N> vals, PropertyListT props = {}) {
2484 #ifdef __ESIMD_FORCE_STATELESS_MEM
2485  block_store<T, N>(detail::accessorToPointer<T>(acc, byte_offset), vals,
2486  props);
2487 #else
2488  constexpr auto L1Hint =
2489  detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
2490  cache_hint::none);
2491  constexpr auto L2Hint =
2492  detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
2493  cache_hint::none);
2494  static_assert(!PropertyListT::template has_property<cache_hint_L3_key>(),
2495  "L3 cache hint is reserved. The old/experimental L3 LSC cache "
2496  "hint is cache_level::L2 now.");
2497  constexpr int DefaultLSCAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
2498  constexpr size_t Alignment =
2499  detail::getPropertyValue<PropertyListT, alignment_key>(
2500  DefaultLSCAlignment);
2501  constexpr bool AlignmentRequiresLSC =
2502  PropertyListT::template has_property<alignment_key>() && Alignment < 16;
2503  if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none ||
2504  AlignmentRequiresLSC) {
2505  detail::check_cache_hint<detail::cache_action::store, L1Hint, L2Hint>();
2506  simd_mask<1> Mask = 1;
2507  detail::block_store_impl<T, N, L1Hint, L2Hint>(
2508  acc, byte_offset, vals, Mask, overaligned_tag<Alignment>{});
2509  } else {
2510  using Tx = detail::__raw_t<T>;
2511  constexpr unsigned Sz = sizeof(Tx) * N;
2512  static_assert(Sz >= detail::OperandSize::OWORD,
2513  "block size must be at least 1 oword");
2514  static_assert(Sz % detail::OperandSize::OWORD == 0,
2515  "block size must be whole number of owords");
2516  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
2517  "block must be 1, 2, 4 or 8 owords long");
2518  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
2519  "block size must be at most 8 owords");
2520 
2521  auto surf_ind = __esimd_get_surface_index(
2522  detail::AccessorPrivateProxy::getQualifiedPtrOrImageObj(acc));
2523  __esimd_oword_st<Tx, N>(surf_ind, byte_offset >> 4, vals.data());
2524  }
2525 #endif
2526 }
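// Usage sketch (illustrative; 'Acc' is an assumed write-capable device
// accessor): store 4 owords (16 ints) at a 16-byte-aligned byte offset.
//   simd<int, 16> Vals = 0;
//   block_store<int, 16>(Acc, 64, Vals);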
2527 
2557 template <typename T, int N, typename AccessorT,
2558  typename PropertyListT =
2559  ext::oneapi::experimental::empty_properties_t>
2560 __ESIMD_API std::enable_if_t<
2561  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
2562  detail::is_device_accessor_with_v<AccessorT,
2563  detail::accessor_mode_cap::can_write>>
2564 block_store(AccessorT acc, simd<T, N> vals, PropertyListT props = {}) {
2565  // Create new properties without the alignment property passed in 'props',
2566  // and add alignment<16> as it is usable and most favourable in this case.
2567  constexpr auto L1Hint =
2568  detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
2569  cache_hint::none);
2570  constexpr auto L2Hint =
2571  detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
2572  cache_hint::none);
2573  static_assert(!PropertyListT::template has_property<cache_hint_L3_key>(),
2574  "L3 cache hint is reserved. The old/experimental L3 LSC cache "
2575  "hint is cache_level::L2 now.");
2576  properties Props{cache_hint_L1<L1Hint>, cache_hint_L2<L2Hint>, alignment<16>};
2577 
2578  block_store<T, N>(acc, 0, vals, Props);
2579 }
2580 
2612 template <typename T, int N, typename AccessorT,
2613  typename PropertyListT =
2614  ext::oneapi::experimental::empty_properties_t>
2615 __ESIMD_API std::enable_if_t<
2616  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
2617  detail::is_device_accessor_with_v<AccessorT,
2618  detail::accessor_mode_cap::can_write>>
2619 block_store(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset,
2620  simd<T, N> vals, simd_mask<1> pred, PropertyListT props = {}) {
2621  constexpr auto L1Hint =
2622  detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
2623  cache_hint::none);
2624  constexpr auto L2Hint =
2625  detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
2626  cache_hint::none);
2627  static_assert(!PropertyListT::template has_property<cache_hint_L3_key>(),
2628  "L3 cache hint is reserved. The old/experimental L3 LSC cache "
2629  "hint is cache_level::L2 now.");
2630 
2631  constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
2632  constexpr size_t Alignment =
2633  detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
2634 
2635  detail::block_store_impl<T, N, L1Hint, L2Hint>(acc, byte_offset, vals, pred,
2636  overaligned_tag<Alignment>{});
2637 }
2638 
2663 template <typename T, int N, typename AccessorT,
2664  typename PropertyListT =
2665  ext::oneapi::experimental::empty_properties_t>
2666 __ESIMD_API std::enable_if_t<
2667  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
2668  detail::is_device_accessor_with_v<AccessorT,
2669  detail::accessor_mode_cap::can_write>>
2670 block_store(AccessorT acc, simd<T, N> vals, simd_mask<1> pred,
2671  PropertyListT props = {}) {
2672  // Create new properties without the alignment property passed in 'props',
2673  // and add alignment<16> as it is usable and most favourable in this case.
2674  constexpr auto L1Hint =
2675  detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
2676  cache_hint::none);
2677  constexpr auto L2Hint =
2678  detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
2679  cache_hint::none);
2680  static_assert(!PropertyListT::template has_property<cache_hint_L3_key>(),
2681  "L3 cache hint is reserved. The old/experimental L3 LSC cache "
2682  "hint is cache_level::L2 now.");
2683  properties Props{cache_hint_L1<L1Hint>, cache_hint_L2<L2Hint>, alignment<16>};
2684  block_store<T, N>(acc, 0, vals, pred, Props);
2685 }
2686 
2688 
2690 
2692 
2693 // Implementations of accessor-based gather and scatter functions
2694 namespace detail {
2695 template <typename T, int N, typename AccessorTy>
2696 ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t<
2697  std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
2698  is_accessor_with_v<AccessorTy, detail::accessor_mode_cap::can_write>>
2699 scatter_impl(AccessorTy acc, simd<T, N> vals, simd<uint32_t, N> offsets,
2700  uint32_t glob_offset, simd_mask<N> mask) {
2701 
2702  static_assert(detail::isPowerOf2(N, 32), "Unexpected vector length");
2703  if constexpr (sizeof(T) == 8) {
2704  scatter_impl<uint32_t, N>(
2705  acc, vals.template bit_cast_view<uint32_t>().template select<N, 2>(0),
2706  offsets, glob_offset, mask);
2707  scatter_impl<uint32_t, N>(
2708  acc, vals.template bit_cast_view<uint32_t>().template select<N, 2>(1),
2709  offsets, glob_offset + sizeof(uint32_t), mask);
2710  } else {
2711  constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
2712  // TODO (performance) use hardware-supported scale once BE supports it
2713  constexpr int16_t scale = 0;
2714  const auto si = __ESIMD_NS::get_surface_index(acc);
2715 
2716  if constexpr (sizeof(T) < 4) {
2717  using Tint = std::conditional_t<std::is_integral_v<T>, T,
2718  detail::uint_type_t<sizeof(T)>>;
2719  using Treal = __raw_t<T>;
2720  simd<Tint, N> vals_int = bitcast<Tint, Treal, N>(std::move(vals).data());
2721  using PromoT = typename std::conditional_t<std::is_signed<Tint>::value,
2722  int32_t, uint32_t>;
2723  const simd<PromoT, N> promo_vals = convert<PromoT>(std::move(vals_int));
2724  __esimd_scatter_scaled<PromoT, N, decltype(si), TypeSizeLog2, scale>(
2725  mask.data(), si, glob_offset, offsets.data(), promo_vals.data());
2726  } else {
2727  using Treal = __raw_t<T>;
2728  if constexpr (!std::is_same_v<Treal, T>) {
2729  simd<Treal, N> Values = vals.template bit_cast_view<Treal>();
2730  __esimd_scatter_scaled<Treal, N, decltype(si), TypeSizeLog2, scale>(
2731  mask.data(), si, glob_offset, offsets.data(), Values.data());
2732  } else {
2733  __esimd_scatter_scaled<T, N, decltype(si), TypeSizeLog2, scale>(
2734  mask.data(), si, glob_offset, offsets.data(), vals.data());
2735  }
2736  }
2737  }
2738 }
2739 
2740 #ifndef __ESIMD_FORCE_STATELESS_MEM
2759 template <typename T, int NElts, lsc_data_size DS, cache_hint L1H,
2760  cache_hint L2H, int N, typename AccessorTy, typename OffsetT>
2761 __ESIMD_API std::enable_if_t<
2762  is_device_accessor_with_v<AccessorTy, accessor_mode_cap::can_write>>
2763 scatter_impl(AccessorTy acc, simd<OffsetT, N> offsets, simd<T, N * NElts> vals,
2764  simd_mask<N> pred) {
2765  static_assert(std::is_integral_v<OffsetT>,
2766  "Scatter must have integral byte_offset type");
2767  static_assert(sizeof(OffsetT) <= 4,
2768  "Implicit truncation of 64-bit byte_offset to 32-bit is "
2769  "disabled. Use -fsycl-esimd-force-stateless-mem or explicitly "
2770  "convert offsets to a 32-bit vector");
2771  check_lsc_vector_size<NElts>();
2772  check_lsc_data_size<T, DS>();
2773  check_cache_hint<cache_action::store, L1H, L2H>();
2774  constexpr uint16_t AddressScale = 1;
2775  constexpr int ImmOffset = 0;
2776  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
2777  constexpr lsc_vector_size LSCNElts = to_lsc_vector_size<NElts>();
2778  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
2779  using MsgT = typename lsc_expand_type<T>::type;
2780  simd<MsgT, N * NElts> Tmp = lsc_format_input<MsgT, T>(vals);
2781  simd<uint32_t, N> ByteOffsets32 = convert<uint32_t>(offsets);
2782  auto si = get_surface_index(acc);
2783  __esimd_lsc_store_bti<MsgT, L1H, L2H, AddressScale, ImmOffset, EDS, LSCNElts,
2784  Transposed, N>(pred.data(), ByteOffsets32.data(),
2785  Tmp.data(), si);
2786 }
2787 #endif // __ESIMD_FORCE_STATELESS_MEM
2788 
2789 template <typename T, int N, typename AccessorTy>
2790 __ESIMD_API std::enable_if_t<
2791  (std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
2792  is_accessor_with_v<AccessorTy, detail::accessor_mode_cap::can_read>),
2793  simd<T, N>>
2794 gather_impl(AccessorTy acc, simd<uint32_t, N> offsets, uint32_t glob_offset,
2795  simd_mask<N> mask) {
2796  static_assert(detail::isPowerOf2(N, 32), "Unexpected vector length");
2797 
2798  if constexpr (sizeof(T) == 8) {
2799  simd<T, N> Res;
2800  Res.template bit_cast_view<uint32_t>().template select<N, 2>(0) =
2801  gather_impl<uint32_t, N>(acc, offsets, glob_offset, mask);
2802  Res.template bit_cast_view<uint32_t>().template select<N, 2>(1) =
2803  gather_impl<uint32_t, N>(acc, offsets, glob_offset + sizeof(uint32_t),
2804  mask);
2805  return Res;
2806  } else {
2807  using Treal = __raw_t<T>;
2808  constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
2809  // TODO (performance) use hardware-supported scale once BE supports it
2810  constexpr uint32_t scale = 0;
2811  const auto si = get_surface_index(acc);
2812  if constexpr (sizeof(T) < 4) {
2813  using Tint = std::conditional_t<std::is_integral_v<T>, T,
2814  detail::uint_type_t<sizeof(T)>>;
2815 
2816  static_assert(std::is_integral<Tint>::value,
2817  "only integral 1- & 2-byte types are supported");
2818  using PromoT = typename std::conditional_t<std::is_signed<Tint>::value,
2819  int32_t, uint32_t>;
2820  simd<PromoT, N> promo_vals =
2821  __esimd_gather_masked_scaled2<PromoT, N, decltype(si), TypeSizeLog2,
2822  scale>(si, glob_offset, offsets.data(),
2823  mask.data());
2824  auto Res = convert<Tint>(promo_vals);
2825 
2826  if constexpr (!std::is_same_v<Tint, T>) {
2827  return detail::bitcast<Treal, Tint, N>(Res.data());
2828  } else {
2829  return Res;
2830  }
2831  } else {
2832  simd<Treal, N> Res = __esimd_gather_masked_scaled2<Treal, N, decltype(si),
2833  TypeSizeLog2, scale>(
2834  si, glob_offset, offsets.data(), mask.data());
2835  if constexpr (!std::is_same_v<Treal, T>) {
2836  return Res.template bit_cast_view<T>();
2837  } else {
2838  return Res;
2839  }
2840  }
2841  }
2842 }
2843 
2844 #ifndef __ESIMD_FORCE_STATELESS_MEM
2845 template <typename T, int N, int VS, cache_hint L1H, cache_hint L2H,
2846  lsc_data_size DS, typename OffsetT, typename AccessorT>
2847 __ESIMD_API std::enable_if_t<
2848  is_device_accessor_with_v<AccessorT, accessor_mode_cap::can_read>,
2849  simd<T, N>>
2850 gather_impl(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
2851  simd_mask<N / VS> pred, simd<T, N> pass_thru) {
2852  static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS");
2853  static_assert(std::is_integral_v<OffsetT>,
2854  "Gather must have integral byte_offset type");
2855  static_assert(sizeof(OffsetT) <= 4,
2856  "Implicit truncation of 64-bit byte_offset to 32-bit is "
2857  "disabled. Use -fsycl-esimd-force-stateless-mem or explicitly "
2858  "convert offsets to a 32-bit vector");
2859  static_assert(VS == 1 || sizeof(T) >= 4,
2860  "VS > 1 is supported only for 4- and 8-byte elements");
2861  check_lsc_vector_size<VS>();
2862  check_lsc_data_size<T, DS>();
2863  check_cache_hint<cache_action::load, L1H, L2H>();
2864  constexpr uint16_t AddressScale = 1;
2865  constexpr int ImmOffset = 0;
2866  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
2867  constexpr lsc_vector_size LSCVS = to_lsc_vector_size<VS>();
2868  constexpr auto Transposed = lsc_data_order::nontranspose;
2869  using MsgT = typename lsc_expand_type<T>::type;
2870  auto SI = get_surface_index(acc);
2871  simd<uint32_t, N / VS> ByteOffsets32 = convert<uint32_t>(byte_offsets);
2872  simd<MsgT, N> PassThruExpanded = lsc_format_input<MsgT>(pass_thru);
2873  simd<MsgT, N> Result =
2874  __esimd_lsc_load_merge_bti<MsgT, L1H, L2H, AddressScale, ImmOffset, EDS,
2875  LSCVS, Transposed, N / VS>(
2876  pred.data(), ByteOffsets32.data(), SI, PassThruExpanded.data());
2877  return lsc_format_ret<T>(Result);
2878 }
2879 #endif // __ESIMD_FORCE_STATELESS_MEM
2880 
2898 template <typename T, int NElts, lsc_data_size DS, int N>
2899 __ESIMD_API simd<T, N * NElts> slm_gather_impl(simd<uint32_t, N> offsets,
2900  simd_mask<N> pred,
2901  simd<T, N * NElts> pass_thru) {
2902  check_lsc_vector_size<NElts>();
2903  check_lsc_data_size<T, DS>();
2904  constexpr uint16_t AddressScale = 1;
2905  constexpr int ImmOffset = 0;
2906  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
2907  constexpr lsc_vector_size LSCVS = to_lsc_vector_size<NElts>();
2908  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
2909  using MsgT = typename lsc_expand_type<T>::type;
2910  simd<MsgT, N * NElts> PassThruExpanded = lsc_format_input<MsgT>(pass_thru);
2911  simd<MsgT, N * NElts> Result =
2912  __esimd_lsc_load_merge_slm<MsgT, cache_hint::none, cache_hint::none,
2913  AddressScale, ImmOffset, EDS, LSCVS,
2914  Transposed, N>(pred.data(), offsets.data(),
2915  PassThruExpanded.data());
2916  return lsc_format_ret<T>(Result);
2917 }
2918 
2933 template <typename T, int NElts, lsc_data_size DS, int N>
2934 __ESIMD_API void slm_scatter_impl(simd<uint32_t, N> offsets,
2935  simd<T, N * NElts> vals, simd_mask<N> pred) {
2936  check_lsc_vector_size<NElts>();
2937  check_lsc_data_size<T, DS>();
2938  constexpr uint16_t AddressScale = 1;
2939  constexpr int ImmOffset = 0;
2940  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
2941  constexpr lsc_vector_size LSCVS = to_lsc_vector_size<NElts>();
2942  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
2943  using MsgT = typename lsc_expand_type<T>::type;
2944  simd<MsgT, N * NElts> Tmp = lsc_format_input<MsgT, T>(vals);
2945  __esimd_lsc_store_slm<MsgT, cache_hint::none, cache_hint::none, AddressScale,
2946  ImmOffset, EDS, LSCVS, Transposed, N>(
2947  pred.data(), offsets.data(), Tmp.data());
2948 }
2949 
2966 template <typename T, int NElts, lsc_data_size DS, cache_hint L1H,
2967  cache_hint L2H, int N, typename Toffset>
2968 __ESIMD_API void prefetch_impl(const T *p, simd<Toffset, N> byte_offsets,
2969  simd_mask<N> pred) {
2970  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
2971  check_lsc_vector_size<NElts>();
2972  check_lsc_data_size<T, DS>();
2973  check_cache_hint<cache_action::prefetch, L1H, L2H>();
2974  constexpr uint16_t AddressScale = 1;
2975  constexpr int ImmOffset = 0;
2976  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
2977  constexpr lsc_vector_size LSCVS = to_lsc_vector_size<NElts>();
2978  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
2979  using MsgT = typename lsc_expand_type<T>::type;
2980  simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
2981  addrs += convert<uintptr_t>(byte_offsets);
2982  __esimd_lsc_prefetch_stateless<MsgT, L1H, L2H, AddressScale, ImmOffset, EDS,
2983  LSCVS, Transposed, N>(pred.data(),
2984  addrs.data());
2985 }
2986 
2987 template <typename T, int NElts, lsc_data_size DS, cache_hint L1H,
2988  cache_hint L2H, typename Toffset>
2989 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>>
2990 prefetch_impl(const T *p, Toffset offset, simd_mask<1> pred) {
2991  check_lsc_vector_size<NElts>();
2992  check_lsc_data_size<T, DS>();
2993  check_cache_hint<cache_action::prefetch, L1H, L2H>();
2994  constexpr uint16_t AddressScale = 1;
2995  constexpr int ImmOffset = 0;
2996  constexpr lsc_data_size EDS = finalize_data_size<T, DS>();
2997 
2998  static_assert(
2999  EDS == lsc_data_size::u32 || EDS == lsc_data_size::u64,
3000  "Transposed prefetch is supported only for data size u32 or u64");
3001  constexpr lsc_vector_size LSCVS = to_lsc_vector_size<NElts>();
3002  constexpr lsc_data_order Transposed = lsc_data_order::transpose;
3003  constexpr int N = 1;
3004 
3005  simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p) + offset;
3006  __esimd_lsc_prefetch_stateless<T, L1H, L2H, AddressScale, ImmOffset, EDS,
3007  LSCVS, Transposed, N>(pred.data(),
3008  addrs.data());
3009 }
3010 
3011 } // namespace detail
3012 
3014 
3017 
3039 // Dev note: the argument \p glob_offset of this function does not have
3040 // a default value, so as not to conflict with the more generic variant
3041 // (acc-ga-3) defined below. This restriction, though, requires an additional
3042 // variant: simd<T, N> gather(acc, glob_offset) to support calls that require
3043 // implicit conversion of a scalar offset to a vector of offsets, e.g.
3044 // 'res = gather<T, N>(acc, 0);'
3045 template <typename T, int N, typename AccessorT>
3046 __ESIMD_API
3047  std::enable_if_t<detail::is_device_accessor_with_v<
3048  AccessorT, detail::accessor_mode_cap::can_read>,
3049  simd<T, N>>
3050  gather(AccessorT acc, simd<detail::DeviceAccessorOffsetT, N> byte_offsets,
3051  detail::DeviceAccessorOffsetT glob_offset, simd_mask<N> mask = 1) {
3052 #ifdef __ESIMD_FORCE_STATELESS_MEM
3053  return gather<T, N>(__ESIMD_DNS::accessorToPointer<T>(acc, glob_offset),
3054  byte_offsets, mask);
3055 #else
3056  if constexpr (!detail::isPowerOf2(N, 32)) {
3057  // Requires DG2 or PVC.
3058  simd<T, N> PassThru; // Intentionally undefined
3059  byte_offsets += glob_offset;
3060  return detail::gather_impl<T, N, 1, cache_hint::none, cache_hint::none,
3061  detail::lsc_data_size::default_size>(
3062  acc, byte_offsets, mask, PassThru);
3063  } else {
3064  return detail::gather_impl<T, N>(acc, byte_offsets, glob_offset, mask);
3065  }
3066 #endif // __ESIMD_FORCE_STATELESS_MEM
3067 }
3068 
3082 template <typename T, int N, typename AccessorT>
3083 __ESIMD_API
3084  std::enable_if_t<detail::is_device_accessor_with_v<
3085  AccessorT, detail::accessor_mode_cap::can_read>,
3086  simd<T, N>>
3087  gather(AccessorT acc, detail::DeviceAccessorOffsetT glob_offset) {
3088  simd<detail::DeviceAccessorOffsetT, N> ByteOffsets = 0;
3089  return gather<T, N>(acc, ByteOffsets, glob_offset);
3090 }
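// Usage sketch (illustrative; assumes 'acc' is a readable device accessor
// over float):
//   simd<detail::DeviceAccessorOffsetT, 8> Offs(0, sizeof(float));
//   simd<float, 8> V = gather<float, 8>(acc, Offs, 0); // elements 0..7
//   simd<float, 8> S = gather<float, 8>(acc, 0); // every lane reads offset 0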
3091 
3092 #ifdef __ESIMD_FORCE_STATELESS_MEM
3093 template <typename T, int N, typename AccessorTy, typename Toffset>
3094 __ESIMD_API std::enable_if_t<
3095  detail::is_device_accessor_with_v<AccessorTy,
3096  detail::accessor_mode_cap::can_read> &&
3097  std::is_integral_v<Toffset> && !std::is_same_v<Toffset, uint64_t>,
3098  simd<T, N>>
3099 gather(AccessorTy acc, simd<Toffset, N> offsets, uint64_t glob_offset,
3100  simd_mask<N> mask = 1) {
3101  return gather<T, N>(acc, convert<uint64_t>(offsets), glob_offset, mask);
3102 }
3103 #endif
3104 
3134 // typename PropertyListT = empty_properties_t>
3143 
3177 template <typename T, int N, int VS, typename AccessorT, typename OffsetT,
3178  typename PropertyListT =
3179  ext::oneapi::experimental::empty_properties_t>
3180 __ESIMD_API std::enable_if_t<
3181  (detail::is_device_accessor_with_v<AccessorT,
3182  detail::accessor_mode_cap::can_read> &&
3183  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
3184  simd<T, N>>
3185 gather(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
3186  simd_mask<N / VS> mask, simd<T, N> pass_thru, PropertyListT props = {}) {
3187 #ifdef __ESIMD_FORCE_STATELESS_MEM
3188  return gather<T, N, VS>(detail::accessorToPointer<T>(acc), byte_offsets, mask,
3189  pass_thru, props);
3190 #else
3191  constexpr auto L1Hint =
3192  detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
3193  cache_hint::none);
3194  constexpr auto L2Hint =
3195  detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
3196  cache_hint::none);
3197  static_assert(!PropertyListT::template has_property<cache_hint_L3_key>(),
3198  "L3 cache hint is reserved. The old/experimental L3 LSC cache "
3199  "hint is cache_level::L2 now.");
3200 
3201  return detail::gather_impl<T, N, VS, L1Hint, L2Hint,
3202  detail::lsc_data_size::default_size>(
3203  acc, byte_offsets, mask, pass_thru);
3204 #endif // __ESIMD_FORCE_STATELESS_MEM
3205 }
3206 
3236 template <typename T, int N, int VS, typename AccessorT, typename OffsetT,
3237  typename PropertyListT =
3238  ext::oneapi::experimental::empty_properties_t>
3239 __ESIMD_API std::enable_if_t<
3240  (detail::is_device_accessor_with_v<AccessorT,
3241  detail::accessor_mode_cap::can_read> &&
3242  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
3243  simd<T, N>>
3244 gather(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
3245  simd_mask<N / VS> mask, PropertyListT props = {}) {
3246 #ifdef __ESIMD_FORCE_STATELESS_MEM
3247  return gather<T, N, VS>(detail::accessorToPointer<T>(acc), byte_offsets, mask,
3248  props);
3249 #else
3250  constexpr size_t Alignment =
3251  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
3252  static_assert(Alignment >= sizeof(T),
3253  "gather() requires at least element-size alignment");
3254  constexpr auto L1Hint =
3255  detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
3256  cache_hint::none);
3257  constexpr auto L2Hint =
3258  detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
3259  cache_hint::none);
3260  static_assert(!PropertyListT::template has_property<cache_hint_L3_key>(),
3261  "L3 cache hint is reserved. The old/experimental L3 LSC cache "
3262  "hint is cache_level::L2 now.");
3263 
3264  if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none ||
3265  VS > 1 || !(detail::isPowerOf2(N, 32))) {
3266  simd<T, N> PassThru; // Intentionally undefined
3267  return detail::gather_impl<T, N, VS, L1Hint, L2Hint,
3268  detail::lsc_data_size::default_size>(
3269  acc, byte_offsets, mask, PassThru);
3270  } else {
3271  return detail::gather_impl<T, N>(acc, byte_offsets, 0, mask);
3272  }
3273 #endif // __ESIMD_FORCE_STATELESS_MEM
3274 }
3275 
3299 template <typename T, int N, int VS, typename AccessorT, typename OffsetT,
3300  typename PropertyListT =
3301  ext::oneapi::experimental::empty_properties_t>
3302 __ESIMD_API std::enable_if_t<
3303  (detail::is_device_accessor_with_v<AccessorT,
3304  detail::accessor_mode_cap::can_read> &&
3305  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
3306  simd<T, N>>
3307 gather(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
3308  PropertyListT props = {}) {
3309  simd_mask<N / VS> Mask = 1;
3310  return gather<T, N, VS>(acc, byte_offsets, Mask, props);
3311 }
3312 
3322 // Dev note: the mask type was turned into the template parameter `MaskT` to
3323 // avoid conflicts between this prototype and the old gather() function
3324 // accepting a 'global_offset' parameter, and to avoid 'ambiguous call'
3325 // errors for calls like this: gather(acc, byte_offsets_simd, 0, mask);
3326 template <typename T, int N, typename AccessorT, typename OffsetT,
3327  typename MaskT,
3328  typename PropertyListT =
3329  ext::oneapi::experimental::empty_properties_t>
3330 __ESIMD_API std::enable_if_t<
3331  (detail::is_device_accessor_with_v<AccessorT,
3332  detail::accessor_mode_cap::can_read> &&
3333  std::is_same_v<MaskT, simd_mask<N>> &&
3334  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
3335  simd<T, N>>
3336 gather(AccessorT acc, simd<OffsetT, N> byte_offsets, MaskT mask,
3337  simd<T, N> pass_thru, PropertyListT props = {}) {
3338  return gather<T, N, 1>(acc, byte_offsets, mask, pass_thru, props);
3339 }
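// Usage sketch (illustrative; assumes 'acc' is a readable device accessor
// over float). Lanes whose mask bit is 0 skip memory and return the
// corresponding 'pass_thru' element:
//   simd<uint32_t, 16> Offs(0, sizeof(float));
//   simd<float, 16> Defaults = 0.0f;
//   simd_mask<16> M = 1;
//   M[0] = 0; // lane 0 yields Defaults[0]
//   simd<float, 16> V = gather<float, 16>(acc, Offs, M, Defaults);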
3340 
3348 // Dev note: the mask type was turned into the template parameter `MaskT` to
3349 // avoid conflicts between this prototype and the old gather() function
3350 // accepting a 'global_offset' parameter, and to avoid 'ambiguous call'
3351 // errors for calls like this: gather(acc, byte_offsets_simd, 0);
3352 template <typename T, int N, typename AccessorT, typename OffsetT,
3353  typename MaskT,
3354  typename PropertyListT =
3355  ext::oneapi::experimental::empty_properties_t>
3356 __ESIMD_API std::enable_if_t<
3357  (detail::is_device_accessor_with_v<AccessorT,
3358  detail::accessor_mode_cap::can_read> &&
3359  std::is_same_v<MaskT, simd_mask<N>> &&
3360  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
3361  simd<T, N>>
3362 gather(AccessorT acc, simd<OffsetT, N> byte_offsets, MaskT mask,
3363  PropertyListT props = {}) {
3364  return gather<T, N, 1>(acc, byte_offsets, mask, props);
3365 }
3366 
3374 template <typename T, int N, typename AccessorT, typename OffsetT,
3375  typename PropertyListT =
3376  ext::oneapi::experimental::empty_properties_t>
3377 __ESIMD_API std::enable_if_t<
3378  (detail::is_device_accessor_with_v<AccessorT,
3379  detail::accessor_mode_cap::can_read> &&
3380  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
3381  simd<T, N>>
3382 gather(AccessorT acc, simd<OffsetT, N> byte_offsets, PropertyListT props = {}) {
3383  return gather<T, N, 1>(acc, byte_offsets, props);
3384 }
3385 
3388 // typename PropertyListT = empty_properties_t>
3394 template <typename T, int N, int VS = 1, typename AccessorT,
3395  typename OffsetSimdViewT,
3396  typename PropertyListT =
3397  ext::oneapi::experimental::empty_properties_t>
3398 __ESIMD_API std::enable_if_t<
3399  (detail::is_device_accessor_with_v<AccessorT,
3400  detail::accessor_mode_cap::can_read> &&
3401  detail::is_simd_view_type_v<OffsetSimdViewT> &&
3402  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
3403  simd<T, N>>
3404 gather(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
3405  simd<T, N> pass_thru, PropertyListT props = {}) {
3406  return gather<T, N, VS>(acc, byte_offsets.read(), mask, pass_thru, props);
3407 }
3408 
3411 // typename PropertyListT = empty_properties_t>
3417 template <typename T, int N, int VS = 1, typename AccessorT,
3418  typename OffsetSimdViewT,
3419  typename PropertyListT =
3420  ext::oneapi::experimental::empty_properties_t>
3421 __ESIMD_API std::enable_if_t<
3422  (detail::is_device_accessor_with_v<AccessorT,
3423  detail::accessor_mode_cap::can_read> &&
3424  detail::is_simd_view_type_v<OffsetSimdViewT> &&
3425  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
3426  simd<T, N>>
3427 gather(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
3428  PropertyListT props = {}) {
3429  return gather<T, N, VS>(acc, byte_offsets.read(), mask, props);
3430 }
3431 
3434 // typename PropertyListT = empty_properties_t>
3439 template <typename T, int N, int VS = 1, typename AccessorT,
3440  typename OffsetSimdViewT,
3441  typename PropertyListT =
3442  ext::oneapi::experimental::empty_properties_t>
3443 __ESIMD_API std::enable_if_t<
3444  (detail::is_device_accessor_with_v<AccessorT,
3445  detail::accessor_mode_cap::can_read> &&
3446  detail::is_simd_view_type_v<OffsetSimdViewT> &&
3447  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
3448  simd<T, N>>
3449 gather(AccessorT acc, OffsetSimdViewT byte_offsets, PropertyListT props = {}) {
3450  return gather<T, N, VS>(acc, byte_offsets.read(), props);
3451 }
3452 
3466 
3469 
3504 template <typename T, int N, int VS = 1, typename AccessorTy, typename OffsetT,
3505  typename PropertyListT =
3506  ext::oneapi::experimental::empty_properties_t>
3507 __ESIMD_API std::enable_if_t<
3508  detail::is_device_accessor_with_v<AccessorTy,
3509  detail::accessor_mode_cap::can_write> &&
3510  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
3511 scatter(AccessorTy acc, simd<OffsetT, N / VS> byte_offsets, simd<T, N> vals,
3512  simd_mask<N / VS> mask, PropertyListT props = {}) {
3513 #ifdef __ESIMD_FORCE_STATELESS_MEM
3514  scatter<T, N, VS>(__ESIMD_DNS::accessorToPointer<T>(acc), byte_offsets, vals,
3515  mask, props);
3516 #else
3517  constexpr size_t Alignment =
3518  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
3519  static_assert(Alignment >= sizeof(T),
3520  "scatter() requires at least element-size alignment");
3521  constexpr auto L1Hint =
3522  detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
3523  cache_hint::none);
3524  constexpr auto L2Hint =
3525  detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
3526  cache_hint::none);
3527  static_assert(!PropertyListT::template has_property<cache_hint_L3_key>(),
3528  "L3 cache hint is reserved. The old/experimental L3 LSC cache "
3529  "hint is cache_level::L2 now.");
3530 
3531  if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none ||
3532  VS > 1 || !detail::isPowerOf2(N, 32)) {
3533  detail::scatter_impl<T, VS, detail::lsc_data_size::default_size, L1Hint,
3534  L2Hint>(acc, byte_offsets, vals, mask);
3535  } else {
3536  detail::scatter_impl<T, N, AccessorTy>(acc, vals, byte_offsets, 0, mask);
3537  }
3538 
3539 #endif // __ESIMD_FORCE_STATELESS_MEM
3540 }
3560 template <typename T, int N, int VS = 1, typename AccessorTy, typename OffsetT,
3561  typename PropertyListT =
3562  ext::oneapi::experimental::empty_properties_t>
3563 __ESIMD_API std::enable_if_t<
3564  detail::is_device_accessor_with_v<AccessorTy,
3565  detail::accessor_mode_cap::can_write> &&
3566  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
3567 scatter(AccessorTy acc, simd<OffsetT, N / VS> byte_offsets, simd<T, N> vals,
3568  PropertyListT props = {}) {
3569  simd_mask<N / VS> Mask = 1;
3570  scatter<T, N, VS>(acc, byte_offsets, vals, Mask, props);
3571 }
3572 
3598 template <typename T, int N, int VS = 1, typename AccessorTy,
3599  typename OffsetSimdViewT,
3600  typename PropertyListT =
3601  ext::oneapi::experimental::empty_properties_t>
3602 __ESIMD_API std::enable_if_t<
3603  detail::is_device_accessor_with_v<AccessorTy,
3604  detail::accessor_mode_cap::can_write> &&
3605  detail::is_simd_view_type_v<OffsetSimdViewT> &&
3606  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
3607 scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
3608  simd_mask<N / VS> mask, PropertyListT props = {}) {
3609  scatter<T, N, VS>(acc, byte_offsets.read(), vals, mask, props);
3610 }
3611 
3632 template <typename T, int N, int VS = 1, typename AccessorTy,
3633  typename OffsetSimdViewT,
3634  typename PropertyListT =
3635  ext::oneapi::experimental::empty_properties_t>
3636 __ESIMD_API std::enable_if_t<
3637  detail::is_device_accessor_with_v<AccessorTy,
3638  detail::accessor_mode_cap::can_write> &&
3639  detail::is_simd_view_type_v<OffsetSimdViewT> &&
3640  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
3641 scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
3642  PropertyListT props = {}) {
3643  simd_mask<N / VS> Mask = 1;
3644  scatter<T, N, VS>(acc, byte_offsets.read(), vals, Mask, props);
3645 }
3646 
3663 template <typename T, int N, typename AccessorTy>
3664 __ESIMD_API
3665  std::enable_if_t<(detail::isPowerOf2(N, 32)) &&
3666  detail::is_device_accessor_with_v<
3667  AccessorTy, detail::accessor_mode_cap::can_write>>
3668  scatter(AccessorTy acc, simd<detail::DeviceAccessorOffsetT, N> offsets,
3669  simd<T, N> vals, detail::DeviceAccessorOffsetT glob_offset,
3670  simd_mask<N> mask = 1) {
3671  offsets += glob_offset;
3672  scatter<T, N>(acc, offsets, vals, mask);
3673 }
3674 
3675 template <typename T, int N, typename AccessorTy>
3676 __ESIMD_API
3677  std::enable_if_t<(detail::isPowerOf2(N, 32)) &&
3678  detail::is_device_accessor_with_v<
3679  AccessorTy, detail::accessor_mode_cap::can_write>>
3680  scatter(AccessorTy acc, detail::DeviceAccessorOffsetT glob_offset,
3681  simd<T, N> vals, simd_mask<N> mask = 1) {
3682  simd<detail::DeviceAccessorOffsetT, N> ByteOffsets = 0;
3683  scatter<T, N>(acc, ByteOffsets, vals, glob_offset, mask);
3684 }
3685 
3686 #ifdef __ESIMD_FORCE_STATELESS_MEM
3687 template <typename T, int N, typename AccessorTy, typename Toffset>
3688 __ESIMD_API std::enable_if_t<
3689  detail::is_device_accessor_with_v<AccessorTy,
3690  detail::accessor_mode_cap::can_write> &&
3691  std::is_integral_v<Toffset> && !std::is_same_v<Toffset, uint64_t>>
3692 scatter(AccessorTy acc, simd<Toffset, N> offsets, simd<T, N> vals,
3693  uint64_t glob_offset, simd_mask<N> mask = 1) {
3694  scatter<T, N, AccessorTy>(acc, convert<uint64_t>(offsets), vals, glob_offset,
3695  mask);
3696 }
3697 #endif
3698 
3706 template <typename T, typename AccessorTy>
3707 __ESIMD_API T scalar_load(AccessorTy acc,
3708  detail::DeviceAccessorOffsetT offset) {
3709  const simd<T, 1> Res =
3710  gather<T, 1, AccessorTy>(acc, simd<decltype(offset), 1>(offset));
3711  return Res[0];
3712 }
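// Usage sketch (illustrative): reads a single int from byte offset 64 of the
// accessor 'acc':
//   int X = scalar_load<int>(acc, 64);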
3713 
3721 template <typename T, typename AccessorTy>
3722 __ESIMD_API void scalar_store(AccessorTy acc,
3723  detail::DeviceAccessorOffsetT offset, T val) {
3724  scatter<T, 1, AccessorTy>(acc, simd<decltype(offset), 1>(offset),
3725  simd<T, 1>(val));
3726 }
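// Usage sketch (illustrative): writes a single float to byte offset 16 of the
// accessor 'acc':
//   scalar_store<float>(acc, 16, 3.14f);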
3727 
3761 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
3762  int N, typename Toffset>
3763 __ESIMD_API simd<T, N * get_num_channels_enabled(RGBAMask)>
3764 gather_rgba(const T *p, simd<Toffset, N> offsets, simd_mask<N> mask = 1) {
3765  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
3766  static_assert((N == 8 || N == 16 || N == 32), "Unsupported value of N");
3767  static_assert(sizeof(T) == 4, "Unsupported size of type T");
3768  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
3769  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
3770  addrs = addrs + offsets_i;
3771  return __esimd_svm_gather4_scaled<detail::__raw_t<T>, N, RGBAMask>(
3772  addrs.data(), mask.data());
3773 }
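// Usage sketch (illustrative; assumes 'p' points to float RGBA pixel data,
// 16 bytes per pixel). With the default ABGR mask, 8 pixel offsets yield
// 8 * 4 == 32 values, grouped per channel in the result:
//   simd<uint32_t, 8> PixelOffs(0, 16); // byte offsets of 8 pixels
//   simd<float, 32> Rgba = gather_rgba(p, PixelOffs);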
3774 
3790 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
3791  int N, typename OffsetObjT, typename RegionTy>
3792 __ESIMD_API simd<T, N * get_num_channels_enabled(RGBAMask)>
3793 gather_rgba(const T *p, simd_view<OffsetObjT, RegionTy> offsets,
3794  simd_mask<N> mask = 1) {
3795  return gather_rgba<RGBAMask, T, N>(p, offsets.read(), mask);
3796 }
3797 
3813 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
3814  int N, typename Toffset>
3815 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>,
3816  simd<T, N * get_num_channels_enabled(RGBAMask)>>
3817 gather_rgba(const T *p, Toffset offset, simd_mask<N> mask = 1) {
3818  return gather_rgba<RGBAMask, T, N>(p, simd<Toffset, N>(offset), mask);
3819 }
3820 
3821 namespace detail {
3822 template <rgba_channel_mask M> static void validate_rgba_write_channel_mask() {
3823  using CM = rgba_channel_mask;
3824  static_assert(
3825  (M == CM::ABGR || M == CM::BGR || M == CM::GR || M == CM::R) &&
3826  "Only ABGR, BGR, GR, R channel masks are valid in write operations");
3827 }
3828 } // namespace detail
3829 
3851 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
3852  int N, typename Toffset>
3853 __ESIMD_API void
3854 scatter_rgba(T *p, simd<Toffset, N> offsets,
3855  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
3856  simd_mask<N> mask = 1) {
3857  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
3858  static_assert((N == 8 || N == 16 || N == 32), "Unsupported value of N");
3859  static_assert(sizeof(T) == 4, "Unsupported size of type T");
3860  detail::validate_rgba_write_channel_mask<RGBAMask>();
3861  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
3862  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
3863  addrs = addrs + offsets_i;
3864  __esimd_svm_scatter4_scaled<detail::__raw_t<T>, N, RGBAMask>(
3865  addrs.data(), vals.data(), mask.data());
3866 }
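// Usage sketch (illustrative): updates only the R and G channels of 8 pixels;
// write operations accept only the ABGR, BGR, GR and R channel masks (see
// validate_rgba_write_channel_mask above):
//   simd<uint32_t, 8> PixelOffs(0, 16);
//   simd<float, 16> RG = 1.0f; // get_num_channels_enabled(GR) == 2
//   scatter_rgba<rgba_channel_mask::GR>(p, PixelOffs, RG);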
3867 
3883 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
3884  int N, typename OffsetObjT, typename RegionTy>
3885 __ESIMD_API void
3886 scatter_rgba(T *p, simd_view<OffsetObjT, RegionTy> offsets,
3887  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
3888  simd_mask<N> mask = 1) {
3889  scatter_rgba<RGBAMask, T, N>(p, offsets.read(), vals, mask);
3890 }
3891 
3907 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
3908  int N, typename Toffset>
3909 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> && N == 1>
3910 scatter_rgba(T *p, Toffset offset,
3911  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
3912  simd_mask<N> mask = 1) {
3913  scatter_rgba<RGBAMask, T, N>(p, simd<Toffset, N>(offset), vals, mask);
3914 }
3915 
3916 template <typename T, int N, rgba_channel_mask RGBAMask>
3917 __SYCL_DEPRECATED("use scatter_rgba<rgba_channel_mask>()")
3918 __ESIMD_API std::
3919  enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4> scatter_rgba(
3920  T *p, simd<uint32_t, N> offsets,
3921  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
3922  simd_mask<N> mask = 1) {
3923  scatter_rgba<RGBAMask>(p, offsets, vals, mask);
3924 }
3925 
3948 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
3949  typename AccessorT, int N,
3950  typename T = typename AccessorT::value_type>
3951 __ESIMD_API
3952  std::enable_if_t<((N == 8 || N == 16 || N == 32) && sizeof(T) == 4 &&
3953  detail::is_device_accessor_with_v<
3954  AccessorT, detail::accessor_mode_cap::can_read>),
3955  simd<T, N * get_num_channels_enabled(RGBAMask)>>
3956 gather_rgba(AccessorT acc, simd<detail::DeviceAccessorOffsetT, N> offsets,
3957  detail::DeviceAccessorOffsetT global_offset = 0,
3958  simd_mask<N> mask = 1) {
3959 #ifdef __ESIMD_FORCE_STATELESS_MEM
3960  return gather_rgba<RGBAMask>(
3961  __ESIMD_DNS::accessorToPointer<T>(acc, global_offset), offsets, mask);
3962 #else
3963  // TODO (performance) use hardware-supported scale once BE supports it
3964  constexpr uint32_t Scale = 0;
3965  const auto SI = get_surface_index(acc);
3966  return __esimd_gather4_masked_scaled2<detail::__raw_t<T>, N, RGBAMask,
3967  decltype(SI), Scale>(
3968  SI, global_offset, offsets.data(), mask.data());
3969 #endif
3970 }
3971 
3972 #ifdef __ESIMD_FORCE_STATELESS_MEM
3973 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
3974  typename AccessorT, int N,
3975  typename T = typename AccessorT::value_type, typename Toffset>
3976 __ESIMD_API std::enable_if_t<
3977  ((N == 8 || N == 16 || N == 32) && sizeof(T) == 4 &&
3978  detail::is_device_accessor_with_v<AccessorT,
3979  detail::accessor_mode_cap::can_read> &&
3980  std::is_integral_v<Toffset> && !std::is_same_v<Toffset, uint64_t>),
3981  simd<T, N * get_num_channels_enabled(RGBAMask)>>
3982 gather_rgba(AccessorT acc, simd<Toffset, N> offsets, uint64_t global_offset = 0,
3983  simd_mask<N> mask = 1) {
3984  return gather_rgba<RGBAMask, AccessorT, N, T>(acc, convert<uint64_t>(offsets),
3985  global_offset, mask);
3986 }
3987 #endif
3988 
4003 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
4004  typename AccessorT, int N,
4005  typename T = typename AccessorT::value_type>
4006 __ESIMD_API
4007  std::enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4 &&
4008  detail::is_device_accessor_with_v<
4009  AccessorT, detail::accessor_mode_cap::can_write>>
4010 scatter_rgba(AccessorT acc, simd<detail::DeviceAccessorOffsetT, N> offsets,
4011  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
4012  detail::DeviceAccessorOffsetT global_offset = 0,
4013  simd_mask<N> mask = 1) {
4014  detail::validate_rgba_write_channel_mask<RGBAMask>();
4015 #ifdef __ESIMD_FORCE_STATELESS_MEM
4016  scatter_rgba<RGBAMask>(__ESIMD_DNS::accessorToPointer<T>(acc, global_offset),
4017  offsets, vals, mask);
4018 #else
4019  // TODO (performance) use hardware-supported scale once BE supports it
4020  constexpr uint32_t Scale = 0;
4021  const auto SI = get_surface_index(acc);
4022  __esimd_scatter4_scaled<T, N, decltype(SI), RGBAMask, Scale>(
4023  mask.data(), SI, global_offset, offsets.data(), vals.data());
4024 #endif
4025 }
4026 
4027 #ifdef __ESIMD_FORCE_STATELESS_MEM
4028 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
4029  typename AccessorT, int N,
4030  typename T = typename AccessorT::value_type, typename Toffset>
4031 __ESIMD_API std::enable_if_t<
4032  (N == 8 || N == 16 || N == 32) && sizeof(T) == 4 &&
4033  detail::is_device_accessor_with_v<AccessorT,
4034  detail::accessor_mode_cap::can_write> &&
4035  std::is_integral_v<Toffset> && !std::is_same_v<Toffset, uint64_t>>
4036 scatter_rgba(AccessorT acc, simd<Toffset, N> offsets,
4037  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
4038  uint64_t global_offset = 0, simd_mask<N> mask = 1) {
4039  scatter_rgba<RGBAMask, AccessorT, N, T>(acc, convert<uint64_t>(offsets), vals,
4040  global_offset, mask);
4041 }
4042 #endif
4044 
4045 namespace detail {
4046 
4047 #ifndef __ESIMD_FP_ATOMIC_OP_TYPE_CHECK
4048 #define __ESIMD_FP_ATOMIC_OP_TYPE_CHECK(T) \
4049  static_assert(is_type<T, float, sycl::half, double>(), \
4050  "float, double or sycl::half type is expected");
4051 #endif // __ESIMD_FP_ATOMIC_OP_TYPE_CHECK
4052 
4055 template <__ESIMD_NS::atomic_op Op, typename T, int N, unsigned NumSrc,
4056  bool IsLSC = false>
4057 constexpr void check_atomic() {
4058 
4059  static_assert(sizeof(T) > 1, "Unsupported data type");
4060 
4061  // LSC atomic operation is supported for any width.
4062  if constexpr (!IsLSC)
4063  static_assert((detail::isPowerOf2(N, 32)),
4064  "Execution sizes 1, 2, 4, 8, 16 and 32 are supported");
4065 
4066  static_assert(NumSrc == __ESIMD_DNS::get_num_args<Op>(),
4067  "Wrong number of operands");
4068  constexpr bool IsInt2BytePlus =
4069  std::is_integral_v<T> && (sizeof(T) >= sizeof(uint16_t));
4070 
4071  if constexpr (Op == __ESIMD_NS::atomic_op::xchg ||
4072  Op == __ESIMD_NS::atomic_op::cmpxchg ||
4073  Op == __ESIMD_NS::atomic_op::predec ||
4074  Op == __ESIMD_NS::atomic_op::inc ||
4075  Op == __ESIMD_NS::atomic_op::dec) {
4076 
4077  static_assert(IsInt2BytePlus, "Integral 16-bit or wider type is expected");
4078  }
4079  // FP ops (are always delegated to native::lsc::<Op>)
4080  if constexpr (Op == __ESIMD_NS::atomic_op::fmax ||
4081  Op == __ESIMD_NS::atomic_op::fmin ||
4082  Op == __ESIMD_NS::atomic_op::fadd ||
4083  Op == __ESIMD_NS::atomic_op::fsub ||
4084  Op == __ESIMD_NS::atomic_op::fcmpxchg) {
4085  __ESIMD_FP_ATOMIC_OP_TYPE_CHECK(T);
4086  }
4087  if constexpr (Op == __ESIMD_NS::atomic_op::add ||
4088  Op == __ESIMD_NS::atomic_op::sub ||
4089  Op == __ESIMD_NS::atomic_op::umin ||
4090  Op == __ESIMD_NS::atomic_op::umax ||
4091  Op == __ESIMD_NS::atomic_op::bit_and ||
4092  Op == __ESIMD_NS::atomic_op::bit_or ||
4093  Op == __ESIMD_NS::atomic_op::bit_xor ||
4094  Op == __ESIMD_NS::atomic_op::smin ||
4095  Op == __ESIMD_NS::atomic_op::smax) {
4096  static_assert(IsInt2BytePlus, "Integral 16-bit or wider type is expected");
4097  constexpr bool IsSignedMinmax = (Op == __ESIMD_NS::atomic_op::smin) ||
4098  (Op == __ESIMD_NS::atomic_op::smax);
4099  constexpr bool IsUnsignedMinmax = (Op == __ESIMD_NS::atomic_op::umin) ||
4100  (Op == __ESIMD_NS::atomic_op::umax);
4101 
4102  if constexpr (IsSignedMinmax || IsUnsignedMinmax) {
4103  constexpr bool SignOK = std::is_signed_v<T> == IsSignedMinmax;
4104  static_assert(SignOK, "Signed/unsigned integer type expected for "
4105  "signed/unsigned min/max operation");
4106  }
4107  }
4108 }
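// Usage sketch (illustrative): compile-time validation of a 16-lane integer
// 'add' atomic, which takes one source operand:
//   check_atomic<__ESIMD_NS::atomic_op::add, int, 16, 1>();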
4109 #undef __ESIMD_FP_ATOMIC_OP_TYPE_CHECK
4110 } // namespace detail
4111 
4114 
4127 template <uint32_t SLMSize> __ESIMD_API void slm_init() {
4128  __esimd_slm_init(SLMSize);
4129 }
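// Usage sketch (illustrative): reserves 4096 bytes of SLM for the work-group;
// this must happen before any SLM accesses in the kernel:
//   slm_init<4096>();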
4130 
4136 // with esimd::slm_allocator() class.
4139 __ESIMD_API void slm_init(uint32_t size) { __esimd_slm_init(size); }
4140 
4177 
4183 #ifndef __ESIMD_GATHER_SCATTER_LLVM_IR
4186 #endif // __ESIMD_GATHER_SCATTER_LLVM_IR
4207 template <typename T, int N, int VS,
4208  typename PropertyListT =
4209  ext::oneapi::experimental::empty_properties_t>
4210 __ESIMD_API std::enable_if_t<
4211  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
4212 slm_gather(simd<uint32_t, N / VS> byte_offsets, simd_mask<N / VS> mask,
4213  simd<T, N> pass_thru, PropertyListT props = {}) {
4214  static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS");
4215 
4216  constexpr size_t Alignment =
4217  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
4218  static_assert(Alignment >= sizeof(T),
4219  "slm_gather() requires at least element-size alignment");
4220 
4221  // Use LSC lowering if VS > 1. Also, if masked gather is
4222  // not available, then LSC is the only lowering option.
4223  if constexpr (VS > 1 || !detail::isMaskedGatherScatterLLVMAvailable()) {
4224  return __ESIMD_DNS::slm_gather_impl<T, VS,
4225  detail::lsc_data_size::default_size>(
4226  byte_offsets, mask, pass_thru);
4227  } else {
4228  if constexpr (sizeof(T) == 8) {
4229  simd<T, N> Res;
4230  Res.template bit_cast_view<uint32_t>().template select<N, 2>(0) =
4231  __esimd_slm_gather_ld<uint32_t, N, Alignment>(
4232  byte_offsets.data(), mask.data(),
4233  (pass_thru.template bit_cast_view<uint32_t>()
4234  .template select<N, 2>(0))
4235  .data());
4236  simd<uint32_t, N / VS> Offset = byte_offsets + sizeof(uint32_t);
4237  Res.template bit_cast_view<uint32_t>().template select<N, 2>(1) =
4238  __esimd_slm_gather_ld<uint32_t, N, sizeof(uint32_t)>(
4239  Offset.data(), mask.data(),
4240  (pass_thru.template bit_cast_view<uint32_t>()
4241  .template select<N, 2>(1))
4242  .data());
4243  return Res;
4244  } else {
4245  using MsgT = detail::__raw_t<T>;
4246  return __esimd_slm_gather_ld<MsgT, N, Alignment>(
4247  byte_offsets.data(), mask.data(), pass_thru.data());
4248  }
4249  }
4250 }
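// Usage sketch (illustrative): with VS == 2, each of the 8 byte offsets loads
// two elements (16 in total); offsets whose mask bit is 0 contribute the
// corresponding 'pass_thru' elements instead:
//   simd<uint32_t, 8> Offs(0, 2 * sizeof(float));
//   simd<float, 16> Defaults = 0.0f;
//   simd_mask<8> M = 1;
//   simd<float, 16> V = slm_gather<float, 16, 2>(Offs, M, Defaults);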
4251 
4275 template <typename T, int N, int VS,
4276  typename PropertyListT =
4277  ext::oneapi::experimental::empty_properties_t>
4278 __ESIMD_API std::enable_if_t<
4279  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
4280 slm_gather(simd<uint32_t, N / VS> byte_offsets, simd_mask<N / VS> mask,
4281  PropertyListT props = {}) {
4282  constexpr size_t Alignment =
4283  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
4284  static_assert(Alignment >= sizeof(T),
4285  "slm_gather() requires at least element-size alignment");
4286 
4287  if constexpr (VS > 1 || (!detail::isPowerOf2(N, 32) &&
4288  !detail::isMaskedGatherScatterLLVMAvailable())) {
4289  simd<T, N> PassThru; // Intentionally undefined
4290  return detail::slm_gather_impl<T, VS, detail::lsc_data_size::default_size>(
4291  byte_offsets, mask, PassThru);
4292  } else if constexpr (detail::isMaskedGatherScatterLLVMAvailable()) {
4293  if constexpr (sizeof(T) == 8) {
4294  simd<T, N> Res;
4295  simd<uint32_t, N> PassThru; // it is intentionally undefined
4296 
4297  Res.template bit_cast_view<uint32_t>().template select<N, 2>(0) =
4298  __esimd_slm_gather_ld<uint32_t, N, Alignment>(
4299  byte_offsets.data(), mask.data(), PassThru.data());
4300  simd<uint32_t, N / VS> Offset = byte_offsets + sizeof(uint32_t);
4301  Res.template bit_cast_view<uint32_t>().template select<N, 2>(1) =
4302  __esimd_slm_gather_ld<uint32_t, N, sizeof(uint32_t)>(
4303  Offset.data(), mask.data(), PassThru.data());
4304  return Res;
4305  } else {
4306  using MsgT = detail::__raw_t<T>;
4307  simd<MsgT, N> PassThru; // it is intentionally undefined
4308  return __esimd_slm_gather_ld<MsgT, N, Alignment>(
4309  byte_offsets.data(), mask.data(), PassThru.data());
4310  }
4311  } else {
4312  detail::LocalAccessorMarker acc;
4313  return detail::gather_impl<T, N>(acc, byte_offsets, 0, mask);
4314  }
4315 }
4316 
4334 template <typename T, int N, int VS,
4335  typename PropertyListT =
4336  ext::oneapi::experimental::empty_properties_t>
4337 __ESIMD_API std::enable_if_t<
4338  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
4339 slm_gather(simd<uint32_t, N / VS> byte_offsets, PropertyListT props = {}) {
4340  simd_mask<N / VS> Mask = 1;
4341  return slm_gather<T, N, VS>(byte_offsets, Mask, props);
4342 }
4343 
4366 template <typename T, int N,
4367  typename PropertyListT =
4368  ext::oneapi::experimental::empty_properties_t>
4369 __ESIMD_API std::enable_if_t<
4370  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
4371 slm_gather(simd<uint32_t, N> byte_offsets, simd_mask<N> mask,
4372  simd<T, N> pass_thru, PropertyListT props = {}) {
4373  constexpr int VS = 1;
4374  return slm_gather<T, N, VS>(byte_offsets, mask, pass_thru, props);
4375 }
4376 
4396 template <typename T, int N,
4397  typename PropertyListT =
4398  ext::oneapi::experimental::empty_properties_t>
4399 __ESIMD_API std::enable_if_t<
4400  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
4401 slm_gather(simd<uint32_t, N> byte_offsets, simd_mask<N> mask,
4402  PropertyListT props = {}) {
4403  constexpr int VS = 1;
4404  return slm_gather<T, N, VS>(byte_offsets, mask, props);
4405 }
4406 
4421 template <typename T, int N,
4422  typename PropertyListT =
4423  ext::oneapi::experimental::empty_properties_t>
4424 __ESIMD_API std::enable_if_t<
4425  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
4426 slm_gather(simd<uint32_t, N> byte_offsets, PropertyListT props = {}) {
4427  constexpr int VS = 1;
4428  return slm_gather<T, N, VS>(byte_offsets, props);
4429 }
4430 
4457 template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
4458  typename PropertyListT =
4459  ext::oneapi::experimental::empty_properties_t>
4460 __ESIMD_API std::enable_if_t<
4461  detail::is_simd_view_type_v<OffsetSimdViewT> &&
4462  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
4463  simd<T, N>>
4464 slm_gather(OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
4465  simd<T, N> pass_thru, PropertyListT props = {}) {
4466  return slm_gather<T, N, VS>(byte_offsets.read(), mask, pass_thru, props);
4467 }
4468 
4490 template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
4491  typename PropertyListT =
4492  ext::oneapi::experimental::empty_properties_t>
4493 __ESIMD_API std::enable_if_t<
4494  detail::is_simd_view_type_v<OffsetSimdViewT> &&
4495  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
4496  simd<T, N>>
4497 slm_gather(OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
4498  PropertyListT props = {}) {
4499  return slm_gather<T, N, VS>(byte_offsets.read(), mask, props);
4500 }
4501 
4518 template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
4519  typename PropertyListT =
4520  ext::oneapi::experimental::empty_properties_t>
4521 __ESIMD_API std::enable_if_t<
4522  detail::is_simd_view_type_v<OffsetSimdViewT> &&
4523  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
4524  simd<T, N>>
4525 slm_gather(OffsetSimdViewT byte_offsets, PropertyListT props = {}) {
4526  return slm_gather<T, N, VS>(byte_offsets.read(), props);
4527 }
4528 
4534 template <typename T> __ESIMD_API T slm_scalar_load(uint32_t offset) {
4535  const simd<T, 1> Res = slm_gather<T, 1>(simd<uint32_t, 1>(offset));
4536  return Res[0];
4537 }
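// Usage sketch (illustrative): reads one float from SLM byte offset 8:
//   float F = slm_scalar_load<float>(8);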
4538 
4556 
4579 template <typename T, int N, int VS = 1,
4580  typename PropertyListT =
4581  ext::oneapi::experimental::empty_properties_t>
4582 __ESIMD_API std::enable_if_t<
4583  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
4584 slm_scatter(simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
4585  simd_mask<N / VS> mask, PropertyListT props = {}) {
4586  static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS");
4587 
4588  constexpr size_t Alignment =
4589  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
4590  static_assert(Alignment >= sizeof(T),
4591  "slm_scatter() requires at least element-size alignment");
4592 
4593  // Use LSC lowering if VS > 1 or the masked LLVM IR lowering is unavailable.
4594  if constexpr (VS > 1 || (!detail::isPowerOf2(N, 32) &&
4595  !detail::isMaskedGatherScatterLLVMAvailable())) {
4596  __ESIMD_DNS::slm_scatter_impl<T, VS, detail::lsc_data_size::default_size>(
4597  byte_offsets, vals, mask);
4598  } else if constexpr (detail::isMaskedGatherScatterLLVMAvailable()) {
4599  if constexpr (sizeof(T) == 8) {
4600  __esimd_slm_scatter_st<uint32_t, N, Alignment>(
4601  vals.template bit_cast_view<uint32_t>()
4602  .template select<N, 2>(0)
4603  .data(),
4604  byte_offsets.data(), mask.data());
4605  simd<uint32_t, N / VS> Offset = byte_offsets + sizeof(uint32_t);
4606  __esimd_slm_scatter_st<uint32_t, N, sizeof(uint32_t)>(
4607  vals.template bit_cast_view<uint32_t>()
4608  .template select<N, 2>(1)
4609  .data(),
4610  Offset.data(), mask.data());
4611 
4612  } else {
4613  using MsgT = detail::__raw_t<T>;
4614  __esimd_slm_scatter_st<MsgT, N, Alignment>(
4615  sycl::bit_cast<__ESIMD_DNS::vector_type_t<MsgT, N>>(vals.data()),
4616  byte_offsets.data(), mask.data());
4617  }
4618  } else {
4619  detail::LocalAccessorMarker acc;
4620  detail::scatter_impl<T, N>(acc, vals, byte_offsets, 0, mask);
4621  }
4622 }
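// Usage sketch (illustrative): scatters 16 floats to consecutive SLM
// locations starting at byte offset 0:
//   simd<uint32_t, 16> Offs(0, sizeof(float));
//   simd<float, 16> Vals = 1.0f;
//   slm_scatter<float, 16>(Offs, Vals);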
4623 
4640 template <typename T, int N, int VS = 1,
4641  typename PropertyListT =
4642  ext::oneapi::experimental::empty_properties_t>
4643 __ESIMD_API std::enable_if_t<
4644  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
4645 slm_scatter(simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
4646  PropertyListT props = {}) {
4647  simd_mask<N / VS> Mask = 1;
4648  slm_scatter<T, N, VS>(byte_offsets, vals, Mask, props);
4649 }
4650 
4674 template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
4675  typename PropertyListT =
4676  ext::oneapi::experimental::empty_properties_t>
4677 __ESIMD_API std::enable_if_t<
4678  detail::is_simd_view_type_v<OffsetSimdViewT> &&
4679  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
4680 slm_scatter(OffsetSimdViewT byte_offsets, simd<T, N> vals,
4681  simd_mask<N / VS> mask, PropertyListT props = {}) {
4682  slm_scatter<T, N, VS>(byte_offsets.read(), vals, mask, props);
4683 }
4684 
4700 template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
4701  typename PropertyListT =
4702  ext::oneapi::experimental::empty_properties_t>
4703 __ESIMD_API std::enable_if_t<
4704  detail::is_simd_view_type_v<OffsetSimdViewT> &&
4705  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
4706 slm_scatter(OffsetSimdViewT byte_offsets, simd<T, N> vals,
4707  PropertyListT props = {}) {
4708  return slm_scatter<T, N, VS>(byte_offsets.read(), vals, props);
4709 }
4710 
4716 template <typename T>
4717 __ESIMD_API void slm_scalar_store(uint32_t offset, T val) {
4718  slm_scatter<T, 1>(simd<uint32_t, 1>(offset), simd<T, 1>(val), 1);
4719 }
4720 
4731 template <typename T, int N, rgba_channel_mask RGBAMask>
4732 __ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && (sizeof(T) == 4),
4733  simd<T, N * get_num_channels_enabled(RGBAMask)>>
4734 slm_gather_rgba(simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
4735  const auto SI = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
4736  return __esimd_gather4_masked_scaled2<T, N, RGBAMask>(
4737  SI, 0 /*global_offset*/, offsets.data(), mask.data());
4738 }
4739 
4750 template <typename T, int N, rgba_channel_mask Mask>
4751 __ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && (sizeof(T) == 4)>
4752 slm_scatter_rgba(simd<uint32_t, N> offsets,
4753  simd<T, N * get_num_channels_enabled(Mask)> vals,
4754  simd_mask<N> mask = 1) {
4755  detail::validate_rgba_write_channel_mask<Mask>();
4756  const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
4757  constexpr int16_t Scale = 0;
4758  constexpr int global_offset = 0;
4759  __esimd_scatter4_scaled<T, N, decltype(si), Mask, Scale>(
4760  mask.data(), si, global_offset, offsets.data(), vals.data());
4761 }
4762 
4778 template <typename T, int N,
4779  typename Flags>
4780 __ESIMD_API std::enable_if_t<is_simd_flag_type_v<Flags>, simd<T, N>>
4781 slm_block_load(uint32_t byte_offset, Flags) {
4782  constexpr size_t Align = Flags::template alignment<simd<T, N>>;
4783  return __esimd_slm_block_ld<detail::__raw_t<T>, N, Align>(byte_offset);
4784 }
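// Usage sketch (illustrative): loads 16 contiguous floats from SLM byte
// offset 0, promising 16-byte alignment via the flags tag:
//   simd<float, 16> V = slm_block_load<float, 16>(0, overaligned<16>);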
4785 
4794 
4801 
4819 
4834 template <typename T, int N,
4835  typename PropertyListT =
4836  ext::oneapi::experimental::empty_properties_t>
4837 __ESIMD_API std::enable_if_t<
4838  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
4839 slm_block_load(uint32_t byte_offset, PropertyListT props = {}) {
4840  constexpr size_t DefaultAlignment = detail::OperandSize::OWORD;
4841  constexpr size_t Alignment =
4842  detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
4843  return __esimd_slm_block_ld<detail::__raw_t<T>, N, Alignment>(byte_offset);
4844 }
4845 
4872 template <typename T, int N,
4873  typename PropertyListT =
4874  ext::oneapi::experimental::empty_properties_t>
4875 __ESIMD_API std::enable_if_t<
4876  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
4877 slm_block_load(uint32_t byte_offset, simd_mask<1> pred,
4878  PropertyListT props = {}) {
4879  // Verify input template arguments.
4880  constexpr size_t DefaultAlignment = sizeof(T) <= 4 ? 4 : sizeof(T);
4881  constexpr size_t Alignment =
4882  detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
4883  static_assert(
4884  (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
4885  (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
4886  "Incorrect alignment for the data type");
4887 
4888  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
4889  constexpr int SmallIntFactor32Bit =
4890  sizeof(uint32_t) / sizeof(T) > 1 ? sizeof(uint32_t) / sizeof(T) : 1;
4891  static_assert(N > 0 && N % SmallIntFactor32Bit == 0,
4892  "Number of elements is not supported by Transposed load");
4893 
4894  // If alignment >= 8 and (N * sizeof(T)) % 8 == 0, we can load QWORDs.
4895  // Don't do that for 4-byte vectors (unless larger than 256 bytes in
4896  // total), because it would require a bit-cast, which should be a NO-OP but
4897  // may sometimes confuse the GPU BE. 1- and 2-byte vectors are cast anyway.
4898  constexpr bool Use64BitData =
4899  Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
4900  (N * sizeof(T)) % sizeof(uint64_t) == 0 &&
4901  (sizeof(T) != sizeof(uint32_t) || N * sizeof(T) > 256);
4902  constexpr int SmallIntFactor =
4903  Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
4904  constexpr int FactoredN = N / SmallIntFactor;
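 // Worked example (illustrative): for T = uint16_t, N = 32 and Alignment = 8,
 // N * sizeof(T) == 64 is a multiple of 8 and T is not a 4-byte type, so
 // Use64BitData is true, SmallIntFactor == 4 and FactoredN == 8: the load is
 // performed as eight uint64_t values and bit-cast back to 32 uint16_t.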
4905  detail::check_lsc_vector_size<FactoredN>();
4906 
4907  // Prepare template arguments for the call of intrinsic.
4908  using LoadElemT = __ESIMD_DNS::__raw_t<
4909  std::conditional_t<SmallIntFactor == 1, T,
4910  std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
4911 
4912  constexpr uint16_t AddressScale = 1;
4913  constexpr int ImmOffset = 0;
4914  constexpr detail::lsc_data_size DS =
4915  detail::finalize_data_size<LoadElemT, detail::lsc_data_size::default_size>();
4916  constexpr auto VS = detail::to_lsc_vector_size<FactoredN>();
4917  constexpr auto Transposed = detail::lsc_data_order::transpose;
4918  constexpr int NLanes = 1;
4919 
4920  // Prepare non-template arguments and call the intrinsic.
4921  simd<uint32_t, NLanes> Offsets = byte_offset;
4922  simd<LoadElemT, FactoredN> Result =
4923  __esimd_lsc_load_slm<LoadElemT, cache_hint::none, cache_hint::none,
4924  AddressScale, ImmOffset, DS, VS, Transposed, NLanes>(
4925  pred.data(), Offsets.data());
4926  return Result.template bit_cast_view<T>();
4927 }
4928 
4958 template <typename T, int N,
4959  typename PropertyListT =
4960  ext::oneapi::experimental::empty_properties_t>
4961 __ESIMD_API std::enable_if_t<
4962  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
4963 slm_block_load(uint32_t offset, simd_mask<1> pred, simd<T, N> pass_thru,
4964  PropertyListT props = {}) {
4965  // Verify input template arguments.
4966  constexpr size_t DefaultAlignment = sizeof(T) <= 4 ? 4 : sizeof(T);
4967  constexpr size_t Alignment =
4968  detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
4969  static_assert(
4970  (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
4971  (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
4972  "Incorrect alignment for the data type");
4973 
4974  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
4975  constexpr int SmallIntFactor32Bit =
4976  sizeof(uint32_t) / sizeof(T) > 1 ? sizeof(uint32_t) / sizeof(T) : 1;
4977  static_assert(N > 0 && N % SmallIntFactor32Bit == 0,
4978  "Number of elements is not supported by Transposed load");
4979 
4980  // If alignment >= 8 and (N * sizeof(T)) % 8 == 0, we can load QWORDs.
4981  // Don't do that for 4-byte vectors (unless larger than 256 bytes in
4982  // total), because it would require a bit-cast, which should be a NO-OP but
4983  // may sometimes confuse the GPU BE. 1- and 2-byte vectors are cast anyway.
4984  constexpr bool Use64BitData =
4985  Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
4986  (N * sizeof(T)) % sizeof(uint64_t) == 0 &&
4987  (sizeof(T) != sizeof(uint32_t) || N * sizeof(T) > 256);
4988  constexpr int SmallIntFactor =
4989  Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
4990  constexpr int FactoredN = N / SmallIntFactor;
4991  detail::check_lsc_vector_size<FactoredN>();
4992 
4993  // Prepare template arguments for the call of intrinsic.
4994  using LoadElemT = __ESIMD_DNS::__raw_t<
4995  std::conditional_t<SmallIntFactor == 1, T,
4996  std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
4997 
4998  constexpr uint16_t AddressScale = 1;
4999  constexpr int ImmOffset = 0;
5000  constexpr detail::lsc_data_size DS =
5001  detail::finalize_data_size<LoadElemT, detail::lsc_data_size::default_size>();
5002  constexpr auto VS = detail::to_lsc_vector_size<FactoredN>();
5003  constexpr auto Transposed = detail::lsc_data_order::transpose;
5004  constexpr int NLanes = 1;
5005 
5006  // Prepare non-template arguments and call the intrinsic.
5007  simd<uint32_t, NLanes> Offsets = offset;
5008  simd<LoadElemT, FactoredN> PassThru =
5009  pass_thru.template bit_cast_view<LoadElemT>();
5010  simd<LoadElemT, FactoredN> Result =
5011  __esimd_lsc_load_merge_slm<LoadElemT, cache_hint::none, cache_hint::none,
5012  AddressScale, ImmOffset, DS, VS, Transposed,
5013  NLanes>(pred.data(), Offsets.data(),
5014  PassThru.data());
5015  return Result.template bit_cast_view<T>();
5016 }
5017 
5041 template <typename T, int N, typename AccessorT,
5042  typename PropertyListT =
5043  ext::oneapi::experimental::empty_properties_t>
5044 __ESIMD_API std::enable_if_t<
5045  detail::is_local_accessor_with_v<AccessorT,
5046  detail::accessor_mode_cap::can_read> &&
5047  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
5048  simd<T, N>>
5049 block_load(AccessorT lacc, uint32_t byte_offset, PropertyListT props = {}) {
5050  byte_offset += detail::localAccessorToOffset(lacc);
5051  return slm_block_load<T, N>(byte_offset, props);
5052 }
5053 
5076 template <typename T, int N, typename AccessorT,
5077  typename PropertyListT =
5078  ext::oneapi::experimental::empty_properties_t>
5079 __ESIMD_API std::enable_if_t<
5080  detail::is_local_accessor_with_v<AccessorT,
5081  detail::accessor_mode_cap::can_read> &&
5082  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
5083  simd<T, N>>
5084 block_load(AccessorT lacc, PropertyListT props = {}) {
5085  return slm_block_load<T, N>(detail::localAccessorToOffset(lacc), props);
5086 }
5087 
5115 template <typename T, int N, typename AccessorT,
5116  typename PropertyListT =
5117  ext::oneapi::experimental::empty_properties_t>
5118 __ESIMD_API std::enable_if_t<
5119  detail::is_local_accessor_with_v<AccessorT,
5120  detail::accessor_mode_cap::can_read> &&
5121  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
5122  simd<T, N>>
5123 block_load(AccessorT lacc, uint32_t byte_offset, simd_mask<1> pred,
5124  PropertyListT props = {}) {
5125  byte_offset += detail::localAccessorToOffset(lacc);
5126  return slm_block_load<T, N>(byte_offset, pred, props);
5127 }
5128 
5154 template <typename T, int N, typename AccessorT,
5155  typename PropertyListT =
5156  ext::oneapi::experimental::empty_properties_t>
5157 __ESIMD_API std::enable_if_t<
5158  detail::is_local_accessor_with_v<AccessorT,
5159  detail::accessor_mode_cap::can_read> &&
5160  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
5161  simd<T, N>>
5162 block_load(AccessorT lacc, simd_mask<1> pred, PropertyListT props = {}) {
5163  return slm_block_load<T, N>(detail::localAccessorToOffset(lacc), pred, props);
5164 }
5165 
5193 template <typename T, int N, typename AccessorT,
5194  typename PropertyListT =
5195  ext::oneapi::experimental::empty_properties_t>
5196 __ESIMD_API std::enable_if_t<
5197  detail::is_local_accessor_with_v<AccessorT,
5198  detail::accessor_mode_cap::can_read> &&
5199  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
5200  simd<T, N>>
5201 block_load(AccessorT lacc, uint32_t byte_offset, simd_mask<1> pred,
5202  simd<T, N> pass_thru, PropertyListT props = {}) {
5203  byte_offset += __ESIMD_DNS::localAccessorToOffset(lacc);
5204  return slm_block_load<T, N>(byte_offset, pred, pass_thru, props);
5205 }
5206 
5233 template <typename T, int N, typename AccessorT,
5234  typename PropertyListT =
5235  ext::oneapi::experimental::empty_properties_t>
5236 __ESIMD_API std::enable_if_t<
5237  detail::is_local_accessor_with_v<AccessorT,
5238  detail::accessor_mode_cap::can_read> &&
5239  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
5240  simd<T, N>>
5241 block_load(AccessorT lacc, simd_mask<1> pred, simd<T, N> pass_thru,
5242  PropertyListT props = {}) {
5243  return slm_block_load<T, N>(__ESIMD_DNS::localAccessorToOffset(lacc), pred,
5244  pass_thru, props);
5245 }
5246 
5262 template <typename T, int N, typename Flags>
5263 __ESIMD_API std::enable_if_t<is_simd_flag_type_v<Flags>>
5264 slm_block_store(uint32_t offset, simd<T, N> vals, Flags) {
5265  constexpr size_t Align = Flags::template alignment<simd<T, N>>;
5266  __esimd_slm_block_st<detail::__raw_t<T>, N, Align>(offset, vals.data());
5267 }
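// [Editorial usage sketch - not part of memory.hpp] The flags-based overload
// above in use; `vector_aligned` asserts that the offset is suitably aligned
// (the 32-byte offset is illustrative):
//
//   simd<int, 8> Vals(0, 1); // 0, 1, ..., 7
//   slm_block_store<int, 8>(/*offset=*/32, Vals, vector_aligned);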
5268 
5276 
5324 template <typename T, int N,
5325  typename PropertyListT =
5326  ext::oneapi::experimental::empty_properties_t>
5327 __ESIMD_API std::enable_if_t<
5328  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
5329 slm_block_store(uint32_t byte_offset, simd<T, N> vals, simd_mask<1> pred,
5330  PropertyListT props = {}) {
5331  // Verify input template arguments.
5332  constexpr size_t DefaultAlignment = sizeof(T) <= 4 ? 4 : sizeof(T);
5333  constexpr size_t Alignment =
5334  detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
5335  static_assert(
5336  (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
5337  (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
5338  "Incorrect alignment for the data type");
5339 
5340  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
5341  constexpr int SmallIntFactor32Bit =
5342  sizeof(uint32_t) / sizeof(T) > 1 ? sizeof(uint32_t) / sizeof(T) : 1;
5343 
5344  static_assert(N > 0 && N % SmallIntFactor32Bit == 0,
5345  "Number of elements is not supported by Transposed store");
5346 
5347  // If alignment >= 8 and (N * sizeof(T)) % 8 == 0, we can store QWORDs.
5348  // Don't do it for 4-byte vectors (unless the vector is larger than 256
5349  // bytes), because that would require a bit-cast, which is supposed to be a
5350  // NO-OP, but may sometimes confuse the GPU BE. 1- and 2-byte vectors are cast anyway.
5351  constexpr bool Use64BitData =
5352  Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
5353  (N * sizeof(T)) % sizeof(uint64_t) == 0 &&
5354  (sizeof(T) != sizeof(uint32_t) || N * sizeof(T) > 256);
5355  constexpr int SmallIntFactor =
5356  Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
5357  constexpr int FactoredN = N / SmallIntFactor;
5358  detail::check_lsc_vector_size<FactoredN>();
5359 
5360  // Prepare template arguments for the intrinsic call.
5361  using StoreElemT = __ESIMD_DNS::__raw_t<
5362  std::conditional_t<SmallIntFactor == 1, T,
5363  std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
5364 
5365  constexpr uint16_t AddressScale = 1;
5366  constexpr int ImmOffset = 0;
5367  constexpr detail::lsc_data_size DS =
5368  detail::finalize_data_size<StoreElemT, detail::lsc_data_size::default_size>();
5369  constexpr auto VS = detail::to_lsc_vector_size<FactoredN>();
5370  constexpr auto Transposed = detail::lsc_data_order::transpose;
5371  constexpr int NLanes = 1;
5372 
5373  // Prepare non-template arguments and call the intrinsic.
5374  simd<uint32_t, NLanes> Offsets = byte_offset;
5375  __esimd_lsc_store_slm<StoreElemT, cache_hint::none, cache_hint::none,
5376  AddressScale, ImmOffset, DS, VS, Transposed, NLanes>(
5377  pred.data(), Offsets.data(),
5378  sycl::bit_cast<__ESIMD_DNS::vector_type_t<StoreElemT, FactoredN>>(
5379  vals.data()));
5380 }
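// [Editorial usage sketch - not part of memory.hpp] The predicated overload
// above takes a 1-element mask guarding the whole block; `DoStore`, `ByteOff`
// and `Vals` are hypothetical:
//
//   simd_mask<1> Pred = DoStore ? 1 : 0;
//   slm_block_store<float, 16>(ByteOff, Vals, Pred); // no-op if Pred[0] == 0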
5381 
5397 template <typename T, int N,
5398  typename PropertyListT =
5399  ext::oneapi::experimental::empty_properties_t>
5400 __ESIMD_API std::enable_if_t<
5401  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
5402 slm_block_store(uint32_t byte_offset, simd<T, N> vals,
5403  PropertyListT props = {}) {
5404  constexpr size_t DefaultAlignment = detail::OperandSize::OWORD;
5405  constexpr size_t Alignment =
5406  detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
5407  using StoreElemT = detail::__raw_t<T>;
5408  __esimd_slm_block_st<StoreElemT, N, Alignment>(
5409  byte_offset,
5410  sycl::bit_cast<__ESIMD_DNS::vector_type_t<StoreElemT, N>>(vals.data()));
5411 }
5412 
5429 template <typename T, int N, typename AccessorT,
5430  typename PropertyListT =
5431  ext::oneapi::experimental::empty_properties_t>
5432 __ESIMD_API std::enable_if_t<
5433  detail::is_local_accessor_with_v<AccessorT,
5434  detail::accessor_mode_cap::can_write> &&
5435  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
5436 block_store(AccessorT lacc, uint32_t byte_offset, simd<T, N> vals,
5437  PropertyListT props = {}) {
5438  byte_offset += detail::localAccessorToOffset(lacc);
5439  slm_block_store<T, N>(byte_offset, vals, props);
5440 }
5441 
5457 template <typename T, int N, typename AccessorT,
5458  typename PropertyListT =
5459  ext::oneapi::experimental::empty_properties_t>
5460 __ESIMD_API std::enable_if_t<
5461  detail::is_local_accessor_with_v<AccessorT,
5462  detail::accessor_mode_cap::can_write> &&
5463  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
5464 block_store(AccessorT lacc, simd<T, N> vals, PropertyListT props = {}) {
5465  slm_block_store<T, N>(detail::localAccessorToOffset(lacc), vals, props);
5466 }
5467 
5495 template <typename T, int N, typename AccessorT,
5496  typename PropertyListT =
5497  ext::oneapi::experimental::empty_properties_t>
5498 __ESIMD_API std::enable_if_t<
5499  detail::is_local_accessor_with_v<AccessorT,
5500  detail::accessor_mode_cap::can_write> &&
5501  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
5502 block_store(AccessorT lacc, uint32_t byte_offset, simd<T, N> vals,
5503  simd_mask<1> pred, PropertyListT props = {}) {
5504  byte_offset += detail::localAccessorToOffset(lacc);
5505  slm_block_store<T, N>(byte_offset, vals, pred, props);
5506 }
5507 
5533 template <typename T, int N, typename AccessorT,
5534  typename PropertyListT =
5535  ext::oneapi::experimental::empty_properties_t>
5536 __ESIMD_API std::enable_if_t<
5537  detail::is_local_accessor_with_v<AccessorT,
5538  detail::accessor_mode_cap::can_write> &&
5539  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
5540 block_store(AccessorT lacc, simd<T, N> vals, simd_mask<1> pred,
5541  PropertyListT props = {}) {
5542  slm_block_store<T, N>(detail::localAccessorToOffset(lacc), vals, pred, props);
5543 }
5544 namespace detail {
5545 
5546 // lsc_atomic_update() operations may share atomic_op values for data types
5547 // of the same (fp vs integral) class for convenience (e.g. re-use 'fmax' for
5548 // all FP types). In fact, those data types may require different internal
5549 // opcodes. This function returns the corresponding internal opcode for
5550 // the input type 'T' and operation 'Op'.
5551 template <typename T, __ESIMD_NS::atomic_op Op>
5552 constexpr int lsc_to_internal_atomic_op() {
5553  constexpr __ESIMD_NS::native::lsc::atomic_op LSCOp =
5554  __ESIMD_DNS::to_lsc_atomic_op<Op>();
5555  return static_cast<int>(LSCOp);
5556 }
5557 
5571 
5572 template <atomic_op Op, typename T, int N, lsc_data_size DS>
5573 __ESIMD_API std::enable_if_t<get_num_args<Op>() == 0, simd<T, N>>
5574 slm_atomic_update_impl(simd<uint32_t, N> offsets, simd_mask<N> pred) {
5575  check_lsc_data_size<T, DS>();
5576  check_atomic<Op, T, N, 0, /*IsLSC*/ true>();
5577  constexpr uint16_t AddressScale = 1;
5578  constexpr int ImmOffset = 0;
5579  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
5580  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
5581  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
5582  using MsgT = typename lsc_expand_type<T>::type;
5583  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
5584  simd<MsgT, N> Tmp =
5585  __esimd_lsc_xatomic_slm_0<MsgT, IOp, cache_hint::none, cache_hint::none,
5586  AddressScale, ImmOffset, EDS, VS, Transposed,
5587  N>(pred.data(), offsets.data());
5588  return lsc_format_ret<T>(Tmp);
5589 }
5590 
5605 template <atomic_op Op, typename T, int N, lsc_data_size DS>
5606 __ESIMD_API std::enable_if_t<get_num_args<Op>() == 1, simd<T, N>>
5607 slm_atomic_update_impl(simd<uint32_t, N> offsets, simd<T, N> src0,
5608  simd_mask<N> pred) {
5609  check_lsc_data_size<T, DS>();
5610  check_atomic<Op, T, N, 1, /*IsLSC*/ true>();
5611  constexpr uint16_t AddressScale = 1;
5612  constexpr int ImmOffset = 0;
5613  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
5614  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
5615  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
5616  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
5617  if constexpr (std::is_same_v<T, double>) {
5618  return __esimd_lsc_xatomic_slm_1<T, IOp, cache_hint::none, cache_hint::none,
5619  AddressScale, ImmOffset, EDS, VS,
5620  Transposed, N>(pred.data(), offsets.data(),
5621  src0.data());
5622  } else {
5623  using MsgT = typename lsc_expand_type<T>::type;
5624  simd<MsgT, N> Msg_data = lsc_format_input<MsgT>(src0);
5625  simd<MsgT, N> Tmp =
5626  __esimd_lsc_xatomic_slm_1<MsgT, IOp, cache_hint::none, cache_hint::none,
5627  AddressScale, ImmOffset, EDS, VS, Transposed,
5628  N>(pred.data(), offsets.data(),
5629  Msg_data.data());
5630  return lsc_format_ret<T>(Tmp);
5631  }
5632 }
5633 
5649 template <atomic_op Op, typename T, int N, lsc_data_size DS>
5650 __ESIMD_API std::enable_if_t<get_num_args<Op>() == 2, simd<T, N>>
5651 slm_atomic_update_impl(simd<uint32_t, N> offsets, simd<T, N> src0, simd<T, N> src1,
5652  simd_mask<N> pred) {
5653  check_lsc_data_size<T, DS>();
5654  check_atomic<Op, T, N, 2, /*IsLSC*/ true>();
5655  constexpr uint16_t AddressScale = 1;
5656  constexpr int ImmOffset = 0;
5657  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
5658  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
5659  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
5660  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
5661  if constexpr (std::is_same_v<T, double>) {
5662  return __esimd_lsc_xatomic_slm_2<T, IOp, cache_hint::none, cache_hint::none,
5663  AddressScale, ImmOffset, EDS, VS,
5664  Transposed, N>(pred.data(), offsets.data(),
5665  src0.data(), src1.data());
5666  } else {
5667  using MsgT = typename lsc_expand_type<T>::type;
5668  simd<MsgT, N> Msg_data0 = lsc_format_input<MsgT>(src0);
5669  simd<MsgT, N> Msg_data1 = lsc_format_input<MsgT>(src1);
5670  simd<MsgT, N> Tmp =
5671  __esimd_lsc_xatomic_slm_2<MsgT, IOp, cache_hint::none, cache_hint::none,
5672  AddressScale, ImmOffset, EDS, VS, Transposed,
5673  N>(pred.data(), offsets.data(),
5674  Msg_data0.data(), Msg_data1.data());
5675  return lsc_format_ret<T>(Tmp);
5676  }
5677 }
5678 
5679 } // namespace detail
5680 
5684 
5688 
5693 
5698 
5700 
5718 template <atomic_op Op, typename T, int N>
5719 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 0, simd<T, N>>
5720 slm_atomic_update(simd<uint32_t, N> byte_offset, simd_mask<N> mask = 1) {
5721  // 2-byte and 8-byte types, non-power-of-two vector lengths, and operations
5722  // wider than 32 lanes are supported only by LSC.
5723  if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
5724  !__ESIMD_DNS::isPowerOf2(N, 32)) {
5725  return slm_atomic_update_impl<Op, T, N,
5726  detail::lsc_data_size::default_size>(
5727  byte_offset, mask);
5728  } else if constexpr (Op == atomic_op::load) {
5729  if constexpr (std::is_integral_v<T>) {
5730  return slm_atomic_update<atomic_op::bit_or, T, N>(byte_offset,
5731  simd<T, N>(0), mask);
5732  } else {
5733  using Tint = detail::uint_type_t<sizeof(T)>;
5734  simd<Tint, N> Res = slm_atomic_update<atomic_op::bit_or, Tint, N>(
5735  byte_offset, simd<Tint, N>(0), mask);
5736  return Res.template bit_cast_view<T>();
5737  }
5738  } else {
5739  detail::check_atomic<Op, T, N, 0>();
5740  const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
5741  return __esimd_dword_atomic0<Op, T, N>(mask.data(), si, byte_offset.data());
5742  }
5743 }
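// [Editorial usage sketch - not part of memory.hpp] The 0-argument form above
// applied to 16 dword counters in SLM; 4-byte data with a power-of-two N takes
// the __esimd_dword_atomic0 branch:
//
//   simd<uint32_t, 16> Offsets(0, sizeof(uint32_t)); // 0, 4, 8, ...
//   simd<uint32_t, 16> Old =
//       slm_atomic_update<atomic_op::inc, uint32_t, 16>(Offsets, 1);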
5744 
5753 template <atomic_op Op, typename T, int N, typename AccessorT>
5754 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 0 &&
5755  __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
5756  simd<T, N>>
5757 atomic_update(AccessorT lacc, simd<uint32_t, N> byte_offset,
5758  simd_mask<N> mask = 1) {
5759  byte_offset += detail::localAccessorToOffset(lacc);
5760  return slm_atomic_update<Op, T, N>(byte_offset, mask);
5761 }
5762 
5764 
5770 
5777 
5779 
5797 template <atomic_op Op, typename T, int N>
5798 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1, simd<T, N>>
5799 slm_atomic_update(simd<uint32_t, N> byte_offset, simd<T, N> src0,
5800  simd_mask<N> mask = 1) {
5801  // 2-byte and 8-byte types, non-power-of-two vector lengths, and operations
5802  // wider than 32 lanes are supported only by LSC.
5803  if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
5804  !__ESIMD_DNS::isPowerOf2(N, 32)) {
5805  // half and short are supported in LSC.
5806  return slm_atomic_update_impl<Op, T, N,
5807  detail::lsc_data_size::default_size>(
5808  byte_offset, src0, mask);
5809  } else if constexpr (Op == atomic_op::store) {
5810  if constexpr (std::is_integral_v<T>) {
5811  return slm_atomic_update<atomic_op::xchg, T, N>(byte_offset, src0, mask);
5812  } else {
5813  using Tint = detail::uint_type_t<sizeof(T)>;
5814  simd<Tint, N> Res = slm_atomic_update<atomic_op::xchg, Tint, N>(
5815  byte_offset, src0.template bit_cast_view<Tint>(), mask);
5816  return Res.template bit_cast_view<T>();
5817  }
5818  } else {
5819  detail::check_atomic<Op, T, N, 1>();
5820  const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
5821  return __esimd_dword_atomic1<Op, T, N>(mask.data(), si, byte_offset.data(),
5822  src0.data());
5823  }
5824 }
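// [Editorial usage sketch - not part of memory.hpp] The 1-argument form
// above; a 4-byte integer add with a power-of-two N goes through the dword
// branch, while e.g. sizeof(T) == 2 would take the LSC branch:
//
//   simd<uint32_t, 16> AddVals(1u);
//   simd<uint32_t, 16> Offsets(0, sizeof(uint32_t));
//   auto Old = slm_atomic_update<atomic_op::add, uint32_t, 16>(Offsets, AddVals, 1);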
5825 
5844 template <atomic_op Op, typename T, int N, typename AccessorT>
5845 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1 &&
5846  __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
5847  simd<T, N>>
5848 atomic_update(AccessorT lacc, simd<uint32_t, N> byte_offset, simd<T, N> src0,
5849  simd_mask<N> mask = 1) {
5850  byte_offset += detail::localAccessorToOffset(lacc);
5851  return slm_atomic_update<Op, T, N>(byte_offset, src0, mask);
5852 }
5853 
5855 
5860 
5868 
5886 template <atomic_op Op, typename T, int N>
5887 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2, simd<T, N>>
5888 slm_atomic_update(simd<uint32_t, N> byte_offset, simd<T, N> src0,
5889  simd<T, N> src1, simd_mask<N> mask = 1) {
5890  // 2-byte and 8-byte types, non-power-of-two vector lengths, and operations
5891  // wider than 32 lanes are supported only by LSC.
5892  if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
5893  !__ESIMD_DNS::isPowerOf2(N, 32)) {
5894  // The 2-argument lsc_atomic_update argument order matches the standard one -
5895  // expected value first, then new value - but atomic_update uses the reverse
5896  // order, hence the src1/src0 swap.
5897  return detail::slm_atomic_update_impl<Op, T, N,
5898  detail::lsc_data_size::default_size>(
5899  byte_offset, src1, src0, mask);
5900  } else {
5901  detail::check_atomic<Op, T, N, 2>();
5902  const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
5903  return __esimd_dword_atomic2<Op, T, N>(mask.data(), si, byte_offset.data(),
5904  src0.data(), src1.data());
5905  }
5906 }
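// [Editorial usage sketch - not part of memory.hpp] The 2-argument form
// above; per the src1/src0 swap comment, for cmpxchg src0 carries the new
// value and src1 the expected value:
//
//   simd<uint32_t, 16> NewVals(1u), Expected(0u);
//   auto Old = slm_atomic_update<atomic_op::cmpxchg, uint32_t, 16>(
//       Offsets, NewVals, Expected, 1);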
5907 
5914 template <atomic_op Op, typename T, int N, typename AccessorT>
5915 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
5916  __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
5917  simd<T, N>>
5918 atomic_update(AccessorT lacc, simd<uint32_t, N> byte_offset, simd<T, N> src0,
5919  simd<T, N> src1, simd_mask<N> mask = 1) {
5920  byte_offset += detail::localAccessorToOffset(lacc);
5921  return slm_atomic_update<Op, T, N>(byte_offset, src0, src1, mask);
5922 }
5923 
5925 
5926 namespace detail {
5927 
5942 template <atomic_op Op, typename T, int N, lsc_data_size DS, cache_hint L1H,
5943  cache_hint L2H, typename Toffset>
5944 __ESIMD_API std::enable_if_t<get_num_args<Op>() == 0, simd<T, N>>
5945 atomic_update_impl(T *p, simd<Toffset, N> offsets, simd_mask<N> pred) {
5946  static_assert(sizeof(T) > 1, "Unsupported data type");
5947  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
5948  check_atomic<Op, T, N, 0, /*IsLSC*/ true>();
5949  check_lsc_data_size<T, DS>();
5950  check_cache_hint<cache_action::atomic, L1H, L2H>();
5951  constexpr uint16_t AddressScale = 1;
5952  constexpr int ImmOffset = 0;
5953  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
5954  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
5955  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
5956  using MsgT = typename lsc_expand_type<T>::type;
5957  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
5958  simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
5959  addrs += convert<uintptr_t>(offsets);
5960  simd<MsgT, N> Tmp =
5961  __esimd_lsc_xatomic_stateless_0<MsgT, IOp, L1H, L2H, AddressScale,
5962  ImmOffset, EDS, VS, Transposed, N>(
5963  pred.data(), addrs.data());
5964  return lsc_format_ret<T>(Tmp);
5965 }
5966 
5982 template <atomic_op Op, typename T, int N, lsc_data_size DS, cache_hint L1H,
5983  cache_hint L2H, typename Toffset>
5984 __ESIMD_API std::enable_if_t<get_num_args<Op>() == 1, simd<T, N>>
5985 atomic_update_impl(T *p, simd<Toffset, N> offsets, simd<T, N> src0,
5986  simd_mask<N> pred) {
5987  static_assert(sizeof(T) > 1, "Unsupported data type");
5988  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
5989  check_lsc_data_size<T, DS>();
5990  check_atomic<Op, T, N, 1, /*IsLSC*/ true>();
5991  check_cache_hint<cache_action::atomic, L1H, L2H>();
5992  constexpr uint16_t AddressScale = 1;
5993  constexpr int ImmOffset = 0;
5994  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
5995  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
5996  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
5997  using MsgT = typename lsc_expand_type<T>::type;
5998  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
5999  simd<MsgT, N> Msg_data = lsc_format_input<MsgT>(src0);
6000  simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
6001  addrs += convert<uintptr_t>(offsets);
6002  simd<MsgT, N> Tmp =
6003  __esimd_lsc_xatomic_stateless_1<MsgT, IOp, L1H, L2H, AddressScale,
6004  ImmOffset, EDS, VS, Transposed, N>(
6005  pred.data(), addrs.data(), Msg_data.data());
6006  return lsc_format_ret<T>(Tmp);
6007 }
6008 
6025 template <atomic_op Op, typename T, int N, lsc_data_size DS, cache_hint L1H,
6026  cache_hint L2H, typename Toffset>
6027 __ESIMD_API std::enable_if_t<get_num_args<Op>() == 2, simd<T, N>>
6028 atomic_update_impl(T *p, simd<Toffset, N> offsets, simd<T, N> src0,
6029  simd<T, N> src1, simd_mask<N> pred) {
6030  static_assert(sizeof(T) > 1, "Unsupported data type");
6031  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
6032  check_lsc_data_size<T, DS>();
6033  check_atomic<Op, T, N, 2, /*IsLSC*/ true>();
6034  check_cache_hint<cache_action::atomic, L1H, L2H>();
6035  constexpr uint16_t AddressScale = 1;
6036  constexpr int ImmOffset = 0;
6037  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
6038  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
6039  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
6040  using MsgT = typename lsc_expand_type<T>::type;
6041  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
6042  simd<MsgT, N> Msg_data0 = lsc_format_input<MsgT>(src0);
6043  simd<MsgT, N> Msg_data1 = lsc_format_input<MsgT>(src1);
6044  simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
6045  addrs += convert<uintptr_t>(offsets);
6046  simd<MsgT, N> Tmp =
6047  __esimd_lsc_xatomic_stateless_2<MsgT, IOp, L1H, L2H, AddressScale,
6048  ImmOffset, EDS, VS, Transposed, N>(
6049  pred.data(), addrs.data(), Msg_data0.data(), Msg_data1.data());
6050  return lsc_format_ret<T>(Tmp);
6051 }
6052 
6069 template <atomic_op Op, typename T, int N,
6070  lsc_data_size DS = lsc_data_size::default_size,
6071  cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none,
6072  typename AccessorTy, typename Toffset>
6073 __ESIMD_API
6074  std::enable_if_t<get_num_args<Op>() == 0 &&
6075  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy>,
6076  simd<T, N>>
6077  atomic_update_impl(AccessorTy acc, simd<Toffset, N> byte_offsets,
6078  simd_mask<N> pred) {
6079 #ifdef __ESIMD_FORCE_STATELESS_MEM
6080  return atomic_update_impl<Op, T, N, DS, L1H, L2H>(accessorToPointer<T>(acc),
6081  byte_offsets, pred);
6082 #else
6083  static_assert(sizeof(T) > 1, "Unsupported data type");
6084  static_assert(std::is_integral_v<Toffset> && sizeof(Toffset) == 4,
6085  "Unsupported offset type");
6086  check_lsc_data_size<T, DS>();
6087  check_atomic<Op, T, N, 0, /*IsLSC*/ true>();
6088  check_cache_hint<cache_action::atomic, L1H, L2H>();
6089  constexpr uint16_t AddressScale = 1;
6090  constexpr int ImmOffset = 0;
6091  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
6092  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
6093  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
6094  using MsgT = typename lsc_expand_type<T>::type;
6095  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
6096  auto si = get_surface_index(acc);
6097  simd<MsgT, N> Tmp =
6098  __esimd_lsc_xatomic_bti_0<MsgT, IOp, L1H, L2H, AddressScale, ImmOffset,
6099  EDS, VS, Transposed, N>(
6100  pred.data(), byte_offsets.data(), si);
6101  return lsc_format_ret<T>(Tmp);
6102 #endif
6103 }
6104 
6123 template <atomic_op Op, typename T, int N, lsc_data_size DS, cache_hint L1H,
6124  cache_hint L2H, typename AccessorTy, typename Toffset>
6125 __ESIMD_API
6126  std::enable_if_t<get_num_args<Op>() == 1 &&
6127  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy>,
6128  simd<T, N>>
6129  atomic_update_impl(AccessorTy acc, simd<Toffset, N> byte_offset,
6130  simd<T, N> src0, simd_mask<N> pred) {
6131 #ifdef __ESIMD_FORCE_STATELESS_MEM
6132  return atomic_update_impl<Op, T, N, DS, L1H, L2H>(accessorToPointer<T>(acc),
6133  byte_offset, src0, pred);
6134 #else
6135  static_assert(sizeof(T) > 1, "Unsupported data type");
6136  static_assert(std::is_integral_v<Toffset> && sizeof(Toffset) == 4,
6137  "Unsupported offset type");
6138  check_lsc_data_size<T, DS>();
6139  check_atomic<Op, T, N, 1, /*IsLSC*/ true>();
6140  check_cache_hint<cache_action::atomic, L1H, L2H>();
6141  constexpr uint16_t AddressScale = 1;
6142  constexpr int ImmOffset = 0;
6143  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
6144  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
6145  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
6146  using MsgT = typename lsc_expand_type<T>::type;
6147  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
6148  simd<MsgT, N> Src0Msg = lsc_format_input<MsgT>(src0);
6149  auto si = get_surface_index(acc);
6150  simd<MsgT, N> Tmp =
6151  __esimd_lsc_xatomic_bti_1<MsgT, IOp, L1H, L2H, AddressScale, ImmOffset,
6152  EDS, VS, Transposed, N>(
6153  pred.data(), byte_offset.data(), Src0Msg.data(), si);
6154  return lsc_format_ret<T>(Tmp);
6155 #endif
6156 }
6157 
6177 template <atomic_op Op, typename T, int N, lsc_data_size DS, cache_hint L1H,
6178  cache_hint L2H, typename AccessorTy, typename Toffset>
6179 __ESIMD_API
6180  std::enable_if_t<get_num_args<Op>() == 2 &&
6181  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy>,
6182  simd<T, N>>
6183  atomic_update_impl(AccessorTy acc, simd<Toffset, N> byte_offset,
6184  simd<T, N> src0, simd<T, N> src1, simd_mask<N> pred) {
6185 #ifdef __ESIMD_FORCE_STATELESS_MEM
6186  return atomic_update_impl<Op, T, N, DS, L1H, L2H>(
6187  __ESIMD_DNS::accessorToPointer<T>(acc), byte_offset, src0, src1, pred);
6188 #else
6189  static_assert(std::is_integral_v<Toffset> && sizeof(Toffset) == 4,
6190  "Unsupported offset type");
6191  check_lsc_vector_size<1>();
6192  check_lsc_data_size<T, DS>();
6193  check_atomic<Op, T, N, 2, /*IsLSC*/ true>();
6194  check_cache_hint<cache_action::atomic, L1H, L2H>();
6195  constexpr uint16_t AddressScale = 1;
6196  constexpr int ImmOffset = 0;
6197  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
6198  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
6199  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
6200  using MsgT = typename lsc_expand_type<T>::type;
6201  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
6202  simd<MsgT, N> Msg_data0 = lsc_format_input<MsgT>(src0);
6203  simd<MsgT, N> Msg_data1 = lsc_format_input<MsgT>(src1);
6204  auto si = get_surface_index(acc);
6205  simd<MsgT, N> Tmp =
6206  __esimd_lsc_xatomic_bti_2<MsgT, IOp, L1H, L2H, AddressScale, ImmOffset,
6207  EDS, VS, Transposed, N>(
6208  pred.data(), byte_offset.data(), Msg_data0.data(), Msg_data1.data(),
6209  si);
6210  return lsc_format_ret<T>(Tmp);
6211 #endif
6212 }
6213 } // namespace detail
6214 
6217 
6255 // Other properties are ignored.
6259 template <atomic_op Op, typename T, int N, typename Toffset,
6260  typename PropertyListT =
6261  ext::oneapi::experimental::empty_properties_t>
6262 __ESIMD_API std::enable_if_t<
6263  __ESIMD_DNS::get_num_args<Op>() == 0 &&
6264  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
6265  simd<T, N>>
6266 atomic_update(T *p, simd<Toffset, N> byte_offset, simd_mask<N> mask,
6267  PropertyListT props = {}) {
6268  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
6269 
6270  constexpr auto L1Hint =
6271  detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
6272  cache_hint::none);
6273 
6274  constexpr auto L2Hint =
6275  detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
6276  cache_hint::none);
6277 
6278  static_assert(!PropertyListT::template has_property<cache_hint_L3_key>(),
6279  "L3 cache hint is reserved. The old/experimental L3 LSC cache "
6280  "hint is cache_level::L2 now.");
6281 
6282  if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none ||
6283  !__ESIMD_DNS::isPowerOf2(N, 32)) {
6284  return detail::atomic_update_impl<
6285  Op, T, N, detail::lsc_data_size::default_size, L1Hint, L2Hint, Toffset>(
6286  p, byte_offset, mask);
6287  } else if constexpr (N == 16 || N == 32) {
6288  // TODO: In fact, the GPU BE supports legalization for any N, even
6289  // non-power-of-2, but that support is currently implemented incorrectly.
6290  // For example, N=17 is emulated as 2 calls (N=16 and N=1), while it must be
6291  // 3 calls: (N=8, N=8, N=1). I.e. the Gen12 atomic instruction supports only
6292  // N up to 8, but the GPU BE currently assumes N up to 16.
6293  // Thus we emulate N=16 with 2 calls of N=8 each,
6294  // and N=32 with 4 calls of N=8 each.
6295  // Task1: Remove the special-case emulation for N=16 and N=32 below when
6296  // the GPU driver fixes the error.
6297  // Task2: Remove the condition "!__ESIMD_DNS::isPowerOf2(N, 32)" above
6298  // and let svm.atomic handle any N.
6299 
6300  simd<T, N> Res;
6301  for (int I = 0; I < N; I += 8) {
6302  simd_mask<8> Mask8 = mask.template select<8, 1>(I);
6303  simd<Toffset, 8> ByteOffset8 = byte_offset.template select<8, 1>(I);
6304  Res.template select<8, 1>(I) =
6305  atomic_update<Op, T, 8>(p, ByteOffset8, Mask8, props);
6306  }
6307  return Res;
6308  } else if constexpr (Op == atomic_op::load) {
6309  if constexpr (std::is_integral_v<T>) {
6310  return atomic_update<atomic_op::bit_or, T, N>(p, byte_offset,
6311  simd<T, N>(0), mask, props);
6312  } else {
6313  using Tint = detail::uint_type_t<sizeof(T)>;
6314  simd<Tint, N> Res = atomic_update<atomic_op::bit_or, Tint, N>(
6315  reinterpret_cast<Tint *>(p), byte_offset, simd<Tint, N>(0), mask,
6316  props);
6317  return Res.template bit_cast_view<T>();
6318  }
6319  } else {
6320  detail::check_atomic<Op, T, N, 0>();
6321  simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
6322  simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(byte_offset);
6323  vAddr += offset_i1;
6324  using Tx = typename detail::__raw_t<T>;
6325  return __esimd_svm_atomic0<Op, Tx, N>(vAddr.data(), mask.data());
6326  }
6327 }
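// [Editorial usage sketch - not part of memory.hpp] Passing cache-hint
// properties to the USM overload above forces the LSC branch; the hint
// values shown are assumptions, not the only valid combination. `Ptr` is a
// hypothetical int*:
//
//   properties Props{cache_hint_L1<cache_hint::uncached>,
//                    cache_hint_L2<cache_hint::uncached>};
//   simd<uint32_t, 8> Offsets(0, sizeof(int));
//   simd<int, 8> Old =
//       atomic_update<atomic_op::inc, int, 8>(Ptr, Offsets, 1, Props);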
6328 
6347 template <atomic_op Op, typename T, int N, typename Toffset,
6348  typename PropertyListT =
6349  ext::oneapi::experimental::empty_properties_t>
6350 __ESIMD_API std::enable_if_t<
6351  __ESIMD_DNS::get_num_args<Op>() == 0 &&
6352  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
6353  simd<T, N>>
6354 atomic_update(T *p, simd<Toffset, N> byte_offset, PropertyListT props = {}) {
6355  simd_mask<N> mask = 1;
6356  return atomic_update<Op, T, N>(p, byte_offset, mask, props);
6357 }
6358 
6379 template <atomic_op Op, typename T, int N, typename OffsetObjT,
6380  typename RegionTy,
6381  typename PropertyListT =
6382  ext::oneapi::experimental::empty_properties_t>
6383 __ESIMD_API std::enable_if_t<
6384  __ESIMD_DNS::get_num_args<Op>() == 0 &&
6385  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
6386  simd<T, N>>
6387 atomic_update(T *p, simd_view<OffsetObjT, RegionTy> offsets, simd_mask<N> mask,
6388  PropertyListT props = {}) {
6389  return atomic_update<Op, T, N>(p, offsets.read(), mask, props);
6390 }
6391 
6410 template <atomic_op Op, typename T, int N, typename OffsetObjT,
6411  typename RegionTy,
6412  typename PropertyListT =
6413  ext::oneapi::experimental::empty_properties_t>
6414 __ESIMD_API std::enable_if_t<
6415  __ESIMD_DNS::get_num_args<Op>() == 0 &&
6416  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
6417  simd<T, N>>
6418 atomic_update(T *p, simd_view<OffsetObjT, RegionTy> byte_offset,
6419  PropertyListT props = {}) {
6420  simd_mask<N> mask = 1;
6421  return atomic_update<Op, T, N>(p, byte_offset.read(), mask, props);
6422 }
6423 
6438 template <atomic_op Op, typename T, int N, typename Toffset>
6439 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>, simd<T, N>>
6440 atomic_update(T *p, Toffset byte_offset, simd_mask<N> mask = 1) {
6441  return atomic_update<Op, T, N>(p, simd<Toffset, N>(byte_offset), mask);
6442 }
6443 
6463 
6492 template <atomic_op Op, typename T, int N, typename Toffset,
6493  typename PropertyListT =
6494  ext::oneapi::experimental::empty_properties_t>
6495 __ESIMD_API std::enable_if_t<
6496  __ESIMD_DNS::get_num_args<Op>() == 1 &&
6497  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
6498  simd<T, N>>
6499 atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
6500  simd_mask<N> mask, PropertyListT props = {}) {
6501  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
6502 
6503  constexpr auto L1Hint =
6504  detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
6505  cache_hint::none);
6506 
6507  constexpr auto L2Hint =
6508  detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
6509  cache_hint::none);
6510 
6511  static_assert(!PropertyListT::template has_property<cache_hint_L3_key>(),
6512  "L3 cache hint is reserved. The old/experimental L3 LSC cache "
6513  "hint is cache_level::L2 now.");
6514 
6515  // Auto-convert FP atomics to LSC version.
6516  if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none ||
6517  (Op == atomic_op::fmin) || (Op == atomic_op::fmax) ||
6518  (Op == atomic_op::fadd) || (Op == atomic_op::fsub) ||
6519  !__ESIMD_DNS::isPowerOf2(N, 32)) {
6520  return detail::atomic_update_impl<
6521  Op, T, N, detail::lsc_data_size::default_size, L1Hint, L2Hint, Toffset>(
6522  p, byte_offset, src0, mask);
6523  } else if constexpr (N == 16 || N == 32) {
6524  // TODO: In fact, the GPU BE supports legalization for any N, even
6525  // non-power-of-2, but that support is currently implemented incorrectly.
6526  // For example, N=17 is emulated as 2 calls (N=16 and N=1), while it must be
6527  // 3 calls: (N=8, N=8, N=1). I.e. the Gen12 atomic instruction supports only
6528  // N up to 8, but the GPU BE currently assumes N up to 16.
6529  // Thus we emulate N=16 with 2 calls of N=8 each,
6530  // and N=32 with 4 calls of N=8 each.
6531  // Task1: Remove the special-case emulation for N=16 and N=32 below when
6532  // the GPU driver fixes the error.
6533  // Task2: Remove the condition "!__ESIMD_DNS::isPowerOf2(N, 32)" above
6534  // and let svm.atomic handle any N.
6535  simd<T, N> Res;
6536  for (int I = 0; I < N; I += 8) {
6537  simd_mask<8> Mask8 = mask.template select<8, 1>(I);
6538  simd<Toffset, 8> ByteOffset8 = byte_offset.template select<8, 1>(I);
6539  simd<T, 8> Src08 = src0.template select<8, 1>(I);
6540  Res.template select<8, 1>(I) =
6541  atomic_update<Op, T, 8>(p, ByteOffset8, Src08, Mask8, props);
6542  }
6543  return Res;
6544  } else if constexpr (Op == atomic_op::store) {
6545  if constexpr (std::is_integral_v<T>) {
6546  return atomic_update<atomic_op::xchg, T, N>(p, byte_offset, src0, mask,
6547  props);
6548  } else {
6549  using Tint = detail::uint_type_t<sizeof(T)>;
6550  simd<Tint, N> Res = atomic_update<atomic_op::xchg, Tint, N>(
6551  reinterpret_cast<Tint *>(p), byte_offset,
6552  src0.template bit_cast_view<Tint>(), mask, props);
6553  return Res.template bit_cast_view<T>();
6554  }
6555  } else {
6556  detail::check_atomic<Op, T, N, 1>();
6557  simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
6558  simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(byte_offset);
6559  vAddr += offset_i1;
6560 
6561  using Tx = typename detail::__raw_t<T>;
6562  return __esimd_svm_atomic1<Op, Tx, N>(vAddr.data(), src0.data(),
6563  mask.data());
6564  }
6565 }
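// [Editorial usage sketch - not part of memory.hpp] Per the auto-conversion
// branch above, an fp32 fadd is routed to the LSC atomic even without cache
// hints; `FPtr` is a hypothetical float*:
//
//   simd<float, 16> AddVals(1.0f);
//   simd<uint32_t, 16> Offsets(0, sizeof(float));
//   simd<float, 16> Old =
//       atomic_update<atomic_op::fadd, float, 16>(FPtr, Offsets, AddVals, 1);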
6566 
6570 
6572 
6590 template <atomic_op Op, typename T, int N, typename Toffset,
6591  typename PropertyListT =
6592  ext::oneapi::experimental::empty_properties_t>
6593 __ESIMD_API std::enable_if_t<
6594  __ESIMD_DNS::get_num_args<Op>() == 1 &&
6595  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
6596  simd<T, N>>
6597 atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
6598  PropertyListT props = {}) {
6599  simd_mask<N> mask = 1;
6600  return atomic_update<Op, T, N>(p, byte_offset, src0, mask, props);
6601 }
6602 
6630 template <atomic_op Op, typename T, int N, typename OffsetObjT,
6631  typename RegionTy,
6632  typename PropertyListT =
6633  ext::oneapi::experimental::empty_properties_t>
6634 __ESIMD_API std::enable_if_t<
6635  __ESIMD_DNS::get_num_args<Op>() == 1 &&
6636  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
6637  simd<T, N>>
6638 atomic_update(T *p, simd_view<OffsetObjT, RegionTy> offsets, simd<T, N> src0,
6639  simd_mask<N> mask, PropertyListT props = {}) {
6640  return atomic_update<Op, T, N>(p, offsets.read(), src0, mask, props);
6641 }
6642 
6668 template <atomic_op Op, typename T, int N, typename OffsetObjT,
6669  typename RegionTy,
6670  typename PropertyListT =
6671  ext::oneapi::experimental::empty_properties_t>
6672 __ESIMD_API std::enable_if_t<
6673  __ESIMD_DNS::get_num_args<Op>() == 1 &&
6674  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
6675  simd<T, N>>
6676 atomic_update(T *p, simd_view<OffsetObjT, RegionTy> offsets, simd<T, N> src0,
6677  PropertyListT props = {}) {
6678  simd_mask<N> mask = 1;
6679  return atomic_update<Op, T, N>(p, offsets.read(), src0, mask, props);
6680 }
6681 
6700 template <atomic_op Op, typename Tx, int N, typename Toffset>
6701 __ESIMD_API std::enable_if_t<
6702  std::is_integral_v<Toffset> &&
6703  ((Op != atomic_op::store && Op != atomic_op::xchg) || N == 1),
6704  simd<Tx, N>>
6705 atomic_update(Tx *p, Toffset byte_offset, simd<Tx, N> src0, simd_mask<N> mask) {
6706  return atomic_update<Op, Tx, N>(p, simd<Toffset, N>(byte_offset), src0, mask);
6707 }
6708 
6733 
6751 // Other properties are ignored.
6755 template <atomic_op Op, typename T, int N, typename Toffset,
6756  typename PropertyListT =
6757  ext::oneapi::experimental::empty_properties_t>
6758 __ESIMD_API std::enable_if_t<
6759  __ESIMD_DNS::get_num_args<Op>() == 2 &&
6760  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
6761  simd<T, N>>
6762 atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
6763  simd<T, N> src1, simd_mask<N> mask, PropertyListT props = {}) {
6764  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
6765 
6766  constexpr auto L1Hint =
6767  detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
6768  cache_hint::none);
6769 
6770  constexpr auto L2Hint =
6771  detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
6772  cache_hint::none);
6773 
6774  static_assert(!PropertyListT::template has_property<cache_hint_L3_key>(),
6775  "L3 cache hint is reserved. The old/experimental L3 LSC cache "
6776  "hint is cache_level::L2 now.");
6777 
6778  // Use the LSC atomic when cache hints are present, an FP atomic is used, a
6779  // non-power-of-two length is used, or the operation width exceeds 32.
6780  if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none ||
6781  Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32)) {
6782  // The 2-argument lsc_atomic_update argument order matches the standard one -
6783  // expected value first, then new value - but atomic_update uses the reverse
6784  // order, hence the src1/src0 swap.
6785  return detail::atomic_update_impl<
6786  Op, T, N, detail::lsc_data_size::default_size, L1Hint, L2Hint, Toffset>(
6787  p, byte_offset, src1, src0, mask);
6788  } else if constexpr (N == 16 || N == 32) {
6789  // TODO: In fact, the GPU BE supports legalization for any N, even
6790  // non-power-of-2, but that support is currently implemented incorrectly.
6791  // For example, N=17 is emulated as 2 calls (N=16 and N=1), while it must be
6792  // 3 calls: (N=8, N=8, N=1). I.e. the Gen12 atomic instruction supports only
6793  // N up to 8, but the GPU BE currently assumes N up to 16.
6794  // Thus we emulate N=16 with 2 calls of N=8 each,
6795  // and N=32 with 4 calls of N=8 each.
6796  // Task1: Remove the special-case emulation for N=16 and N=32 below when
6797  // the GPU driver fixes the error.
6798  // Task2: Remove the condition "!__ESIMD_DNS::isPowerOf2(N, 32)" above
6799  // and let svm.atomic handle any N.
6800  simd<T, N> Res;
6801  for (int I = 0; I < N; I += 8) {
6802  simd_mask<8> Mask8 = mask.template select<8, 1>(I);
6803  simd<Toffset, 8> ByteOffset8 = byte_offset.template select<8, 1>(I);
6804  simd<T, 8> Src08 = src0.template select<8, 1>(I);
6805  simd<T, 8> Src18 = src1.template select<8, 1>(I);
6806  Res.template select<8, 1>(I) =
6807  atomic_update<Op, T, 8>(p, ByteOffset8, Src08, Src18, Mask8, props);
6808  }
6809  return Res;
6810  } else {
6811  detail::check_atomic<Op, T, N, 2>();
6812  simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
6813  simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(byte_offset);
6814  vAddr += offset_i1;
6815  using Tx = typename detail::__raw_t<T>;
6816  return __esimd_svm_atomic2<Op, Tx, N>(vAddr.data(), src0.data(),
6817  src1.data(), mask.data());
6818  }
6819 }
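// [Editorial usage sketch - not part of memory.hpp] A cmpxchg through the
// 2-argument USM overload above; as with the SLM variant, src0 is the new
// value and src1 the expected value. `IPtr` and `Offsets` are hypothetical:
//
//   simd<int, 8> NewVals(1), Expected(0);
//   simd<int, 8> Old = atomic_update<atomic_op::cmpxchg, int, 8>(
//       IPtr, Offsets, NewVals, Expected, 1);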
6820 
6825 //
6836 // Other properties are ignored.
6840 template <atomic_op Op, typename T, int N, typename Toffset,
6841  typename PropertyListT =
6842  ext::oneapi::experimental::empty_properties_t>
6843 __ESIMD_API std::enable_if_t<
6844  __ESIMD_DNS::get_num_args<Op>() == 2 &&
6845  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
6846  simd<T, N>>
6847 atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
6848  simd<T, N> src1, PropertyListT props = {}) {
6849  simd_mask<N> mask = 1;
6850  return atomic_update<Op, T, N>(p, byte_offset, src0, src1, mask, props);
6851 }
6852 
6870 // Other properties are ignored.
6873 template <atomic_op Op, typename T, int N, typename OffsetObjT,
6874  typename OffsetRegionTy,
6875  typename PropertyListT =
6876  ext::oneapi::experimental::empty_properties_t>
6877 __ESIMD_API std::enable_if_t<
6878  __ESIMD_DNS::get_num_args<Op>() == 2 &&
6879  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
6880  simd<T, N>>
6881 atomic_update(T *p, simd_view<OffsetObjT, OffsetRegionTy> byte_offset,
6882  simd<T, N> src0, simd<T, N> src1, simd_mask<N> mask,
6883  PropertyListT props = {}) {
6884  return atomic_update<Op, T, N>(p, byte_offset.read(), src0, src1, mask,
6885  props);
6886 }
6887 
6903 // Other properties are ignored.
6906 template <atomic_op Op, typename T, int N, typename OffsetObjT,
6907  typename OffsetRegionTy,
6908  typename PropertyListT =
6909  ext::oneapi::experimental::empty_properties_t>
6910 __ESIMD_API std::enable_if_t<
6911  __ESIMD_DNS::get_num_args<Op>() == 2 &&
6912  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
6913  simd<T, N>>
6914 atomic_update(T *p, simd_view<OffsetObjT, OffsetRegionTy> byte_offset,
6915  simd<T, N> src0, simd<T, N> src1, PropertyListT props = {}) {
6916  simd_mask<N> mask = 1;
6917  return atomic_update<Op, T, N>(p, byte_offset.read(), src0, src1, mask,
6918  props);
6919 }
6920 
6937 template <atomic_op Op, typename Tx, int N, typename Toffset>
6938 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>, simd<Tx, N>>
6939 atomic_update(Tx *p, Toffset byte_offset, simd<Tx, N> src0, simd<Tx, N> src1,
6940  simd_mask<N> mask) {
6941  return atomic_update<Op, Tx, N>(p, simd<Toffset, N>(byte_offset), src0, src1,
6942  mask);
6943 }
6944 
6961 
6986 // Other properties are ignored.
6990 template <atomic_op Op, typename T, int N, typename Toffset,
6991  typename AccessorTy,
6992  typename PropertyListT =
6993  ext::oneapi::experimental::empty_properties_t>
6994 __ESIMD_API std::enable_if_t<
6995  __ESIMD_DNS::get_num_args<Op>() == 0 &&
6996  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
6997  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
6998  simd<T, N>>
6999 atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd_mask<N> mask,
7000  PropertyListT props = {}) {
7001 #ifdef __ESIMD_FORCE_STATELESS_MEM
7002  return atomic_update<Op, T, N>(__ESIMD_DNS::accessorToPointer<T>(acc),
7003  byte_offset, mask, props);
7004 #else
7005  constexpr auto L1Hint =
7006  detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
7007  cache_hint::none);
7008  constexpr auto L2Hint =
7009  detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
7010  cache_hint::none);
7011  static_assert(!PropertyListT::template has_property<cache_hint_L3_key>(),
7012  "L3 cache hint is reserved. The old/experimental L3 LSC cache "
7013  "hint is cache_level::L2 now.");
7014 
7015  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
7016 
7017  if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none ||
7018  !detail::isPowerOf2(N, 32)) {
7019  return detail::atomic_update_impl<
7020  Op, T, N, detail::lsc_data_size::default_size, L1Hint, L2Hint>(
7021  acc, byte_offset, mask);
7022  } else {
7023  if constexpr (Op == atomic_op::load) {
7024  if constexpr (std::is_integral_v<T>) {
7025  return atomic_update<atomic_op::bit_or, T, N>(
7026  acc, byte_offset, simd<T, N>(0), mask, props);
7027  } else {
7028  using Tint = detail::uint_type_t<sizeof(T)>;
7029  simd<Tint, N> Res = atomic_update<atomic_op::bit_or, Tint, N>(
7030  acc, byte_offset, simd<Tint, N>(0), mask, props);
7031  return Res.template bit_cast_view<T>();
7032  }
7033  } else {
7034  detail::check_atomic<Op, T, N, 0>();
7035  static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported");
7036 
7037  static_assert(sizeof(T) == 4, "Only 32 bit data is supported");
7038  const auto si = get_surface_index(acc);
7039  using Tx = typename detail::__raw_t<T>;
7040  return __esimd_dword_atomic0<Op, Tx, N>(mask.data(), si,
7041  byte_offset.data());
7042  }
7043  }
7044 #endif
7045 }
7046 
7063 // Other properties are ignored.
7067 template <atomic_op Op, typename T, int N, typename Toffset,
7068  typename AccessorTy,
7069  typename PropertyListT =
7070  ext::oneapi::experimental::empty_properties_t>
7071 __ESIMD_API std::enable_if_t<
7072  __ESIMD_DNS::get_num_args<Op>() == 0 &&
7073  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7074  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7075  simd<T, N>>
7076 atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset,
7077  PropertyListT props = {}) {
7078  simd_mask<N> mask = 1;
7079  return atomic_update<Op, T, N>(acc, byte_offset, mask, props);
7080 }
7081 
7102 // Other properties are ignored.
7106 template <atomic_op Op, typename T, int N, typename OffsetObjT,
7107  typename AccessorTy, typename RegionTy,
7108  typename PropertyListT =
7109  ext::oneapi::experimental::empty_properties_t>
7110 __ESIMD_API std::enable_if_t<
7111  __ESIMD_DNS::get_num_args<Op>() == 0 &&
7112  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7113  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7114  simd<T, N>>
7115 atomic_update(AccessorTy acc, simd_view<OffsetObjT, RegionTy> byte_offset,
7116  simd_mask<N> mask, PropertyListT props = {}) {
7117  return atomic_update<Op, T, N>(acc, byte_offset.read(), mask, props);
7118 }
7119 
7139 template <atomic_op Op, typename T, int N, typename OffsetObjT,
7140  typename AccessorTy, typename RegionTy,
7141  typename PropertyListT =
7142  ext::oneapi::experimental::empty_properties_t>
7143 __ESIMD_API std::enable_if_t<
7144  __ESIMD_DNS::get_num_args<Op>() == 0 &&
7145  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7146  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7147  simd<T, N>>
7148 atomic_update(AccessorTy acc, simd_view<OffsetObjT, RegionTy> byte_offset,
7149  PropertyListT props = {}) {
7150  simd_mask<N> mask = 1;
7151  return atomic_update<Op, T, N>(acc, byte_offset.read(), mask, props);
7152 }
7153 
7172 template <atomic_op Op, typename T, int N, typename Toffset,
7173  typename AccessorTy>
7174 __ESIMD_API
7175  std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 0 &&
7176  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy>,
7177  simd<T, N>>
7178  atomic_update(AccessorTy acc, Toffset byte_offset, simd_mask<N> mask) {
7179  return atomic_update<Op, T, N>(acc, simd<Toffset, N>(byte_offset), mask);
7180 }
7181 
7200 template <atomic_op Op, typename T, int N, typename AccessorTy>
7201 __ESIMD_API
7202  std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 0 &&
7203  __ESIMD_DNS::is_rw_local_accessor_v<AccessorTy>,
7204  simd<T, N>>
7205  atomic_update(AccessorTy acc, uint32_t byte_offset, simd_mask<N> mask) {
7206  return atomic_update<Op, T, N>(acc, simd<uint32_t, N>(byte_offset), mask);
7207 }
7208 
7230 
7262 
7263 template <atomic_op Op, typename T, int N, typename Toffset,
7264  typename AccessorTy,
7265  typename PropertyListT =
7266  ext::oneapi::experimental::empty_properties_t>
7267 __ESIMD_API std::enable_if_t<
7268  __ESIMD_DNS::get_num_args<Op>() == 1 &&
7269  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7270  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7271  simd<T, N>>
7272 atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
7273  simd_mask<N> mask, PropertyListT props = {}) {
7274 #ifdef __ESIMD_FORCE_STATELESS_MEM
7275  return atomic_update<Op, T, N>(__ESIMD_DNS::accessorToPointer<T>(acc),
7276  byte_offset, src0, mask, props);
7277 #else
7278  constexpr auto L1Hint =
7279  detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
7280  cache_hint::none);
7281 
7282  constexpr auto L2Hint =
7283  detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
7284  cache_hint::none);
7285 
7286  static_assert(!PropertyListT::template has_property<cache_hint_L3_key>(),
7287  "L3 cache hint is reserved. The old/experimental L3 LSC cache "
7288  "hint is cache_level::L2 now.");
7289  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
7290  static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported");
7291  // Auto-convert FP atomics to LSC version.
7292  if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none ||
7293  Op == atomic_op::fmin || Op == atomic_op::fmax ||
7294  Op == atomic_op::fadd || Op == atomic_op::fsub ||
7295  !__ESIMD_DNS::isPowerOf2(N, 32)) {
7296  return detail::atomic_update_impl<
7297  Op, T, N, detail::lsc_data_size::default_size, L1Hint, L2Hint>(
7298  acc, byte_offset, src0, mask);
7299  } else if constexpr (Op == atomic_op::store) {
7300  if constexpr (std::is_integral_v<T>) {
7301  return atomic_update<atomic_op::xchg, T, N>(acc, byte_offset, src0, mask,
7302  props);
7303  } else {
7304  using Tint = detail::uint_type_t<sizeof(T)>;
7305  simd<Tint, N> Res = atomic_update<atomic_op::xchg, Tint, N>(
7306  acc, byte_offset, src0.template bit_cast_view<Tint>(), mask, props);
7307  return Res.template bit_cast_view<T>();
7308  }
7309  } else {
7310  detail::check_atomic<Op, T, N, 1>();
7311  static_assert(sizeof(T) == 4, "Only 32 bit data is supported");
7312  const auto si = __ESIMD_NS::get_surface_index(acc);
7313  using Tx = typename detail::__raw_t<T>;
7314  return __esimd_dword_atomic1<Op, Tx, N>(
7315  mask.data(), si, byte_offset.data(),
7316  sycl::bit_cast<__ESIMD_DNS::vector_type_t<Tx, N>>(src0.data()));
7317  }
7318 #endif
7319 }
7320 
7352 template <atomic_op Op, typename T, int N, typename Toffset,
7353  typename AccessorTy,
7354  typename PropertyListT =
7355  ext::oneapi::experimental::empty_properties_t>
7356 __ESIMD_API std::enable_if_t<
7357  __ESIMD_DNS::get_num_args<Op>() == 1 &&
7358  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7359  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7360  simd<T, N>>
7361 atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
7362  PropertyListT props = {}) {
7363  simd_mask<N> mask = 1;
7364  return atomic_update<Op, T, N>(acc, byte_offset, src0, mask, props);
7365 }
7366 
7399 template <atomic_op Op, typename T, int N, typename OffsetObjT,
7400  typename AccessorTy, typename RegionTy,
7401  typename PropertyListT =
7402  ext::oneapi::experimental::empty_properties_t>
7403 __ESIMD_API std::enable_if_t<
7404  __ESIMD_DNS::get_num_args<Op>() == 1 &&
7405  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7406  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7407  simd<T, N>>
7408 atomic_update(AccessorTy acc, simd_view<OffsetObjT, RegionTy> byte_offset,
7409  simd<T, N> src0, simd_mask<N> mask, PropertyListT props = {}) {
7410  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0, mask, props);
7411 }
7412 
7443 template <atomic_op Op, typename T, int N, typename OffsetObjT,
7444  typename AccessorTy, typename RegionTy,
7445  typename PropertyListT =
7446  ext::oneapi::experimental::empty_properties_t>
7447 __ESIMD_API std::enable_if_t<
7448  __ESIMD_DNS::get_num_args<Op>() == 1 &&
7449  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7450  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7451  simd<T, N>>
7452 atomic_update(AccessorTy acc, simd_view<OffsetObjT, RegionTy> byte_offset,
7453  simd<T, N> src0, PropertyListT props = {}) {
7454  simd_mask<N> mask = 1;
7455  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0, mask, props);
7456 }
7457 
7479 template <atomic_op Op, typename T, int N, typename Toffset,
7480  typename AccessorTy>
7481 __ESIMD_API std::enable_if_t<
7482  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7483  ((Op != atomic_op::store && Op != atomic_op::xchg) || N == 1),
7484  simd<T, N>>
7485 atomic_update(AccessorTy acc, Toffset offset, simd<T, N> src0,
7486  simd_mask<N> mask) {
7487  return atomic_update<Op, T, N>(acc, simd<Toffset, N>(offset), src0, mask);
7488 }
7489 
7509 template <atomic_op Op, typename Tx, int N, typename AccessorTy>
7510 __ESIMD_API std::enable_if_t<
7511  __ESIMD_DNS::is_rw_local_accessor_v<AccessorTy> &&
7512  ((Op != atomic_op::store && Op != atomic_op::xchg) || N == 1),
7513  simd<Tx, N>>
7514 atomic_update(AccessorTy acc, uint32_t offset, simd<Tx, N> src0,
7515  simd_mask<N> mask) {
7516  return atomic_update<Op, Tx, N>(acc, simd<uint32_t, N>(offset), src0, mask);
7517 }
7518 
7525 // simd_mask<N> mask, props = {}); // (acc-au2-1)
7541 
7545 // simd_mask<N> mask, props = {}); // (acc-au2-1)
7568 // Other properties are ignored.
7572 template <atomic_op Op, typename T, int N, typename Toffset,
7573  typename AccessorTy,
7574  typename PropertyListT =
7575  ext::oneapi::experimental::empty_properties_t>
7576 __ESIMD_API std::enable_if_t<
7577  __ESIMD_DNS::get_num_args<Op>() == 2 && std::is_integral_v<Toffset> &&
7578  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7579  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7580  simd<T, N>>
7581 atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
7582  simd<T, N> src1, simd_mask<N> mask, PropertyListT props = {}) {
7583 #ifdef __ESIMD_FORCE_STATELESS_MEM
7584  return atomic_update<Op, T, N>(__ESIMD_DNS::accessorToPointer<T>(acc),
7585  byte_offset, src0, src1, mask, props);
7586 #else
7587  constexpr auto L1Hint =
7588  detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
7589  cache_hint::none);
7590 
7591  constexpr auto L2Hint =
7592  detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
7593  cache_hint::none);
7594 
7595  static_assert(!PropertyListT::template has_property<cache_hint_L3_key>(),
7596  "L3 cache hint is reserved. The old/experimental L3 LSC cache "
7597  "hint is cache_level::L2 now.");
7598  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
7599  static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported");
7600  // Use the LSC atomic when cache hints are present, an FP atomic is used, a
7601  // non-power-of-two length is used, or the operation width exceeds 32.
7602  if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none ||
7603  Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32)) {
7604  // The 2-argument lsc_atomic_update argument order matches the standard one -
7605  // expected value first, then new value - but atomic_update uses the reverse
7606  // order, hence the src1/src0 swap.
7607  return detail::atomic_update_impl<
7608  Op, T, N, detail::lsc_data_size::default_size, L1Hint, L2Hint>(
7609  acc, byte_offset, src1, src0, mask);
7610  } else {
7611  detail::check_atomic<Op, T, N, 2>();
7612  static_assert(sizeof(T) == 4, "Only 32 bit data is supported");
7613  const auto si = __ESIMD_NS::get_surface_index(acc);
7614  using Tx = typename detail::__raw_t<T>;
7615  return __esimd_dword_atomic2<Op, Tx, N>(
7616  mask.data(), si, byte_offset.data(),
7617  sycl::bit_cast<__ESIMD_DNS::vector_type_t<Tx, N>>(src0.data()),
7618  sycl::bit_cast<__ESIMD_DNS::vector_type_t<Tx, N>>(src1.data()));
7619  }
7620 #endif
7621 }
7622 
7640 // Other properties are ignored.
7644 template <atomic_op Op, typename T, int N, typename Toffset,
7645  typename AccessorTy,
7646  typename PropertyListT =
7647  ext::oneapi::experimental::empty_properties_t>
7648 __ESIMD_API std::enable_if_t<
7649  __ESIMD_DNS::get_num_args<Op>() == 2 &&
7650  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7651  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7652  simd<T, N>>
7653 atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
7654  simd<T, N> src1, PropertyListT props = {}) {
7655  simd_mask<N> mask = 1;
7656  return atomic_update<Op, T, N>(acc, byte_offset, src0, src1, mask, props);
7657 }
7658 
7679 // Other properties are ignored.
7682 template <atomic_op Op, typename T, int N, typename OffsetObjT,
7683  typename AccessorTy, typename OffsetRegionTy,
7684  typename PropertyListT =
7685  ext::oneapi::experimental::empty_properties_t>
7686 __ESIMD_API std::enable_if_t<
7687  __ESIMD_DNS::get_num_args<Op>() == 2 &&
7688  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7689  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7690  simd<T, N>>
7691 atomic_update(AccessorTy acc, simd_view<OffsetObjT, OffsetRegionTy> byte_offset,
7692  simd<T, N> src0, simd<T, N> src1, simd_mask<N> mask,
7693  PropertyListT props = {}) {
7694  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0, src1, mask,
7695  props);
7696 }
7697 
7716 // Other properties are ignored.
7719 template <atomic_op Op, typename T, int N, typename OffsetObjT,
7720  typename AccessorTy, typename OffsetRegionTy,
7721  typename PropertyListT =
7722  ext::oneapi::experimental::empty_properties_t>
7723 __ESIMD_API std::enable_if_t<
7724  __ESIMD_DNS::get_num_args<Op>() == 2 &&
7725  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7726  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7727  simd<T, N>>
7728 atomic_update(AccessorTy acc, simd_view<OffsetObjT, OffsetRegionTy> byte_offset,
7729  simd<T, N> src0, simd<T, N> src1, PropertyListT props = {}) {
7730  simd_mask<N> mask = 1;
7731  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0, src1, mask,
7732  props);
7733 }
7734 
7755 template <atomic_op Op, typename Tx, int N, typename Toffset,
7756  typename AccessorTy>
7757 __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_rw_device_accessor_v<AccessorTy>,
7758  simd<Tx, N>>
7759 atomic_update(AccessorTy acc, Toffset offset, simd<Tx, N> src0,
7760  simd<Tx, N> src1, simd_mask<N> mask) {
7761  return atomic_update<Op, Tx, N>(acc, simd<Toffset, N>(offset), src0, src1,
7762  mask);
7763 }
7764 
7782 template <atomic_op Op, typename Tx, int N, typename AccessorTy>
7783 __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_rw_local_accessor_v<AccessorTy>,
7784  simd<Tx, N>>
7785 atomic_update(AccessorTy acc, uint32_t offset, simd<Tx, N> src0,
7786  simd<Tx, N> src1, simd_mask<N> mask) {
7787  return atomic_update<Op, Tx, N>(acc, simd<uint32_t, N>(offset), src0, src1,
7788  mask);
7789 }
7790 
7792 
7795 
7798 enum fence_mask : uint8_t {
7816  sw_barrier = 0x80
7817 };
7818 
7822 template <uint8_t cntl> __ESIMD_API void fence() { __esimd_fence(cntl); }
7823 
7824 __SYCL_DEPRECATED("use fence<fence_mask>()")
7825 __ESIMD_API void fence(fence_mask cntl) { __esimd_fence(cntl); }
7826 
7833 template <memory_kind Kind = memory_kind::global,
7834  fence_flush_op FenceOp = fence_flush_op::none,
7835  fence_scope Scope = fence_scope::group>
7836 __ESIMD_API void fence() {
7837  static_assert(
7838  Kind != memory_kind::local ||
7839  (FenceOp == fence_flush_op::none && Scope == fence_scope::group),
7840  "SLM fence must have 'none' lsc_fence_op and 'group' scope");
7841  constexpr int N = 16;
7842  simd_mask<N> Mask = 1;
7843  __esimd_lsc_fence<static_cast<uint8_t>(Kind), static_cast<uint8_t>(FenceOp),
7844  static_cast<uint8_t>(Scope), N>(Mask.data());
7845 }
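// [Editorial usage sketch - not part of memory.hpp] The templated fence
// above; the SLM form relies on the defaults permitted by the static_assert:
//
//   fence<memory_kind::global, fence_flush_op::none, fence_scope::group>();
//   fence<memory_kind::local>(); // SLM fence: 'none' op and 'group' scope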
7846 
7855 __ESIMD_API void barrier() {
7856  __esimd_fence(fence_mask::global_coherent_fence | fence_mask::local_barrier);
7857  __esimd_barrier();
7858 }
7860 
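// [Editorial usage sketch - not part of memory.hpp] A typical SLM
// producer/consumer pattern around the barrier above; `WriteOff`, `ReadOff`
// and `Vals` are hypothetical:
//
//   slm_block_store<int, 16>(WriteOff, Vals); // each work-item writes a slice
//   barrier();                                // SLM writes now visible group-wide
//   simd<int, 16> Peer = slm_block_load<int, 16>(ReadOff);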
7863 
7876 template <typename T, int m, int N, typename AccessorTy, unsigned plane = 0>
7877 __ESIMD_API simd<T, m * N> media_block_load(AccessorTy acc, unsigned x,
7878  unsigned y) {
7879  constexpr unsigned Width = N * sizeof(T);
7880  static_assert(Width * m <= 256u,
7881  "data does not fit into a single dataport transaction");
7882  static_assert(Width <= 64u, "valid block width is in range [1, 64]");
7883  static_assert(m <= 64u, "valid block height is in range [1, 64]");
7884  static_assert(plane <= 3u, "valid plane index is in range [0, 3]");
7885 
7886  const auto si = __ESIMD_NS::get_surface_index(acc);
7887  using SurfIndTy = decltype(si);
7888  constexpr unsigned int RoundedWidth =
7889  Width < 4 ? 4 : detail::getNextPowerOf2<Width>();
7890  constexpr int BlockWidth = sizeof(T) * N;
7891  constexpr int Mod = 0;
7892 
7893  if constexpr (Width < RoundedWidth) {
7894  constexpr unsigned int n1 = RoundedWidth / sizeof(T);
7895  simd<T, m * n1> temp =
7896  __esimd_media_ld<T, m, n1, Mod, SurfIndTy, (int)plane, BlockWidth>(
7897  si, x, y);
7898  return temp.template select<m, 1, N, 1>(0, 0);
7899  } else {
7900  return __esimd_media_ld<T, m, N, Mod, SurfIndTy, (int)plane, BlockWidth>(
7901  si, x, y);
7902  }
7903 }
7904 
7917 template <typename T, int m, int N, typename AccessorTy, unsigned plane = 0>
7918 __ESIMD_API void media_block_store(AccessorTy acc, unsigned x, unsigned y,
7919  simd<T, m * N> vals) {
7920  constexpr unsigned Width = N * sizeof(T);
7921  static_assert(Width * m <= 256u,
7922  "data does not fit into a single dataport transaction");
7923  static_assert(Width <= 64u, "valid block width is in range [1, 64]");
7924  static_assert(m <= 64u, "valid block height is in range [1, 64]");
7925  static_assert(plane <= 3u, "valid plane index is in range [0, 3]");
7926  const auto si = __ESIMD_NS::get_surface_index(acc);
7927  using SurfIndTy = decltype(si);
7928  constexpr unsigned int RoundedWidth =
7929  Width < 4 ? 4 : detail::getNextPowerOf2<Width>();
7930  constexpr unsigned int n1 = RoundedWidth / sizeof(T);
7931  constexpr int BlockWidth = sizeof(T) * N;
7932  constexpr int Mod = 0;
7933 
7934  if constexpr (Width < RoundedWidth) {
7935  simd<T, m * n1> temp;
7936  auto temp_ref = temp.template bit_cast_view<T, m, n1>();
7937  auto vals_ref = vals.template bit_cast_view<T, m, N>();
7938  temp_ref.template select<m, 1, N, 1>() = vals_ref;
7939  __esimd_media_st<T, m, n1, Mod, SurfIndTy, plane, BlockWidth>(si, x, y,
7940  temp.data());
7941  } else {
7942  __esimd_media_st<T, m, N, Mod, SurfIndTy, plane, BlockWidth>(si, x, y,
7943  vals.data());
7944  }
7945 }
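// Usage sketch (illustrative; same hypothetical `img` as in the
// media_block_load sketch above):
//
//   simd<float, 4 * 8> blk(0.f);                     // a 4x8 block of zeros
//   media_block_store<float, 4, 8>(img, 16, 2, blk); // overwrite that region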
7946 
7965 template <typename T, int N, typename AccessorTy,
7966           typename Flags>
7967 __ESIMD_API
7968  std::enable_if_t<detail::is_local_accessor_with_v<
7969  AccessorTy, detail::accessor_mode_cap::can_read> &&
7970  is_simd_flag_type_v<Flags>,
7971  simd<T, N>>
7972  block_load(AccessorTy acc, uint32_t byte_offset, Flags flags) {
7973  return slm_block_load<T, N>(byte_offset + detail::localAccessorToOffset(acc),
7974  flags);
7975 }
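// Usage sketch (illustrative; `lacc` is a hypothetical local_accessor and
// `byte_off` a hypothetical byte offset into its SLM region):
//
//   // Load 16 consecutive ints from SLM; element_aligned only promises
//   // element-size alignment of the offset.
//   simd<int, 16> v = block_load<int, 16>(lacc, byte_off, element_aligned);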
7976 
7994 template <typename T, int N, typename AccessorT, typename Flags>
7995 __ESIMD_API
7996  std::enable_if_t<detail::is_local_accessor_with_v<
7997  AccessorT, detail::accessor_mode_cap::can_write> &&
7998  is_simd_flag_type_v<Flags>>
7999  block_store(AccessorT acc, uint32_t offset, simd<T, N> vals, Flags flags) {
8000  slm_block_store<T, N>(offset + __ESIMD_DNS::localAccessorToOffset(acc), vals,
8001  flags);
8002 }
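// Usage sketch (illustrative; continues the block_load example above):
//
//   v += 1;
//   block_store<int, 16>(lacc, byte_off, v, element_aligned);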
8003 
8034 // gather() overloads for local accessors follow; PropertyListT defaults to empty_properties_t in all of them.
8043 
8075 template <typename T, int N, int VS, typename AccessorT,
8076  typename PropertyListT =
8077      ext::oneapi::experimental::empty_properties_t>
8078 __ESIMD_API std::enable_if_t<
8079  (detail::is_local_accessor_with_v<AccessorT,
8080  detail::accessor_mode_cap::can_read> &&
8081  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
8082  simd<T, N>>
8083 gather(AccessorT acc, simd<uint32_t, N / VS> byte_offsets,
8084  simd_mask<N / VS> mask, simd<T, N> pass_thru, PropertyListT props = {}) {
8085  return slm_gather<T, N, VS>(byte_offsets +
8086  __ESIMD_DNS::localAccessorToOffset(acc),
8087  mask, pass_thru, props);
8088 }
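// Usage sketch (illustrative; hypothetical `lacc`): masked-off lanes come
// back as the pass_thru value.
//
//   simd<uint32_t, 8> offs(0, sizeof(float));   // bytes: 0, 4, 8, ..., 28
//   simd_mask<8> m = 1;
//   m[3] = 0;                                   // skip lane 3
//   simd<float, 8> dflt(-1.f);
//   auto v = gather<float, 8, 1>(lacc, offs, m, dflt);  // v[3] == -1.f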
8089 
8120 template <typename T, int N, int VS, typename AccessorT,
8121  typename PropertyListT =
8122      ext::oneapi::experimental::empty_properties_t>
8123 __ESIMD_API std::enable_if_t<
8124  (detail::is_local_accessor_with_v<AccessorT,
8125  detail::accessor_mode_cap::can_read> &&
8126  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
8127  simd<T, N>>
8128 gather(AccessorT acc, simd<uint32_t, N / VS> byte_offsets,
8129  simd_mask<N / VS> mask, PropertyListT props = {}) {
8130  return slm_gather<T, N, VS>(
8131  byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), mask, props);
8132 }
8133 
8158 template <typename T, int N, int VS, typename AccessorT,
8159  typename PropertyListT =
8160      ext::oneapi::experimental::empty_properties_t>
8161 __ESIMD_API std::enable_if_t<
8162  (detail::is_local_accessor_with_v<AccessorT,
8163  detail::accessor_mode_cap::can_read> &&
8164  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
8165  simd<T, N>>
8166 gather(AccessorT acc, simd<uint32_t, N / VS> byte_offsets,
8167  PropertyListT props = {}) {
8168  return slm_gather<T, N, VS>(
8169  byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), props);
8170 }
8171 
8181 // Dev note: the mask type was made a template parameter `MaskT` to avoid
8182 // conflicts between this prototype and the old gather() that accepts a
8183 // 'global_offset' parameter, which caused 'ambiguous call' errors for
8184 // calls like: gather(acc, byte_offsets_simd, 0, mask);
8185 template <typename T, int N, typename AccessorT, typename MaskT,
8186  typename PropertyListT =
8187      ext::oneapi::experimental::empty_properties_t>
8188 __ESIMD_API std::enable_if_t<
8189  (detail::is_local_accessor_with_v<AccessorT,
8190  detail::accessor_mode_cap::can_read> &&
8191  std::is_same_v<MaskT, simd_mask<N>> &&
8192  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
8193  simd<T, N>>
8194 gather(AccessorT acc, simd<uint32_t, N> byte_offsets, MaskT mask,
8195  simd<T, N> pass_thru, PropertyListT props = {}) {
8196  return slm_gather<T, N>(byte_offsets +
8197  __ESIMD_DNS::localAccessorToOffset(acc),
8198  mask, pass_thru, props);
8199 }
8200 
8208 // Dev note: the mask type was made a template parameter `MaskT` to avoid
8209 // conflicts between this prototype and the old gather() that accepts a
8210 // 'global_offset' parameter, which caused 'ambiguous call' errors for
8211 // calls like: gather(acc, byte_offsets_simd, 0);
8212 template <typename T, int N, typename AccessorT, typename MaskT,
8213  typename PropertyListT =
8214      ext::oneapi::experimental::empty_properties_t>
8215 __ESIMD_API std::enable_if_t<
8216  (detail::is_local_accessor_with_v<AccessorT,
8217  detail::accessor_mode_cap::can_read> &&
8218  std::is_same_v<MaskT, simd_mask<N>> &&
8219  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
8220  simd<T, N>>
8221 gather(AccessorT acc, simd<uint32_t, N> byte_offsets, MaskT mask,
8222  PropertyListT props = {}) {
8223  return slm_gather<T, N>(
8224  byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), mask, props);
8225 }
8226 
8234 template <typename T, int N, typename AccessorT,
8235  typename PropertyListT =
8236      ext::oneapi::experimental::empty_properties_t>
8237 __ESIMD_API std::enable_if_t<
8238  (detail::is_local_accessor_with_v<AccessorT,
8239  detail::accessor_mode_cap::can_read> &&
8240  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
8241  simd<T, N>>
8242 gather(AccessorT acc, simd<uint32_t, N> byte_offsets,
8243  PropertyListT props = {}) {
8244  return slm_gather<T, N>(
8245  byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), props);
8246 }
8247 
8250 // gather() variant taking the byte offsets as a simd_view plus a mask and a pass-thru vector.
8256 template <typename T, int N, int VS = 1, typename AccessorT,
8257  typename OffsetSimdViewT,
8258  typename PropertyListT =
8259      ext::oneapi::experimental::empty_properties_t>
8260 __ESIMD_API std::enable_if_t<
8261  (detail::is_local_accessor_with_v<AccessorT,
8262  detail::accessor_mode_cap::can_read> &&
8263  detail::is_simd_view_type_v<OffsetSimdViewT> &&
8264  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
8265  simd<T, N>>
8266 gather(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
8267  simd<T, N> pass_thru, PropertyListT props = {}) {
8268  return gather<T, N, VS>(acc, byte_offsets.read(), mask, pass_thru, props);
8269 }
8270 
8273 // gather() variant taking the byte offsets as a simd_view plus a mask.
8279 template <typename T, int N, int VS = 1, typename AccessorT,
8280  typename OffsetSimdViewT,
8281  typename PropertyListT =
8282      ext::oneapi::experimental::empty_properties_t>
8283 __ESIMD_API std::enable_if_t<
8284  (detail::is_local_accessor_with_v<AccessorT,
8285  detail::accessor_mode_cap::can_read> &&
8286  detail::is_simd_view_type_v<OffsetSimdViewT> &&
8287  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
8288  simd<T, N>>
8289 gather(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
8290  PropertyListT props = {}) {
8291  return gather<T, N, VS>(acc, byte_offsets.read(), mask, props);
8292 }
8293 
8296 // gather() variant taking only a simd_view of byte offsets.
8301 template <typename T, int N, int VS = 1, typename AccessorT,
8302  typename OffsetSimdViewT,
8303  typename PropertyListT =
8304      ext::oneapi::experimental::empty_properties_t>
8305 __ESIMD_API std::enable_if_t<
8306  (detail::is_local_accessor_with_v<AccessorT,
8307  detail::accessor_mode_cap::can_read> &&
8308  detail::is_simd_view_type_v<OffsetSimdViewT> &&
8309  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
8310  simd<T, N>>
8311 gather(AccessorT acc, OffsetSimdViewT byte_offsets, PropertyListT props = {}) {
8312  return gather<T, N, VS>(acc, byte_offsets.read(), props);
8313 }
8314 
8332 template <typename T, int N, typename AccessorTy>
8333 __ESIMD_API
8334  std::enable_if_t<detail::is_local_accessor_with_v<
8335  AccessorTy, detail::accessor_mode_cap::can_read>,
8336  simd<T, N>>
8337  gather(AccessorTy acc, simd<uint32_t, N> offsets, uint32_t glob_offset,
8338  simd_mask<N> mask = 1) {
8339  return slm_gather<T, N>(
8340  offsets + glob_offset + __ESIMD_DNS::localAccessorToOffset(acc), mask);
8341 }
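// Usage sketch (illustrative; reusing `lacc`, `offs` and `m` from the gather
// sketch above): the 'glob_offset' form predates the property-based overloads
// and is kept for source compatibility; prefer the overloads above in new code.
//
//   auto v = gather<float, 8>(lacc, offs, /*glob_offset=*/0u, m);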
8342 
8351 
8358 
8361 
8370 
8378 
8405 template <typename T, int N, int VS = 1, typename AccessorT,
8406  typename PropertyListT =
8407      ext::oneapi::experimental::empty_properties_t>
8408 __ESIMD_API std::enable_if_t<
8409  detail::is_local_accessor_with_v<AccessorT,
8410  detail::accessor_mode_cap::can_write> &&
8411  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8412 scatter(AccessorT acc, simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
8413  simd_mask<N / VS> mask, PropertyListT props = {}) {
8414  slm_scatter<T, N, VS>(byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc),
8415  vals, mask, props);
8416 }
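// Usage sketch (illustrative; `lacc` is a hypothetical local_accessor):
//
//   simd<uint32_t, 8> offs(0, sizeof(float));   // bytes: 0, 4, 8, ..., 28
//   simd<float, 8> vals(3.14f);
//   simd_mask<8> m = 1;
//   scatter<float, 8>(lacc, offs, vals, m);     // VS defaults to 1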
8417 
8440 template <typename T, int N, int VS = 1, typename AccessorT,
8441  typename PropertyListT =
8442      ext::oneapi::experimental::empty_properties_t>
8443 __ESIMD_API std::enable_if_t<
8444  detail::is_local_accessor_with_v<AccessorT,
8445  detail::accessor_mode_cap::can_write> &&
8446  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8447 scatter(AccessorT acc, simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
8448  PropertyListT props = {}) {
8449  simd_mask<N / VS> Mask = 1;
8450  scatter<T, N, VS>(acc, byte_offsets, vals, Mask, props);
8451 }
8452 
8480 template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
8481  typename AccessorT,
8482  typename PropertyListT =
8483      ext::oneapi::experimental::empty_properties_t>
8484 __ESIMD_API std::enable_if_t<
8485  detail::is_local_accessor_with_v<AccessorT,
8486  detail::accessor_mode_cap::can_write> &&
8487  detail::is_simd_view_type_v<OffsetSimdViewT> &&
8488  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8489 scatter(AccessorT acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
8490  simd_mask<N / VS> mask, PropertyListT props = {}) {
8491  scatter<T, N, VS>(acc, byte_offsets.read(), vals, mask, props);
8492 }
8493 
8518 template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
8519  typename AccessorT,
8520  typename PropertyListT =
8521      ext::oneapi::experimental::empty_properties_t>
8522 __ESIMD_API std::enable_if_t<
8523  detail::is_local_accessor_with_v<AccessorT,
8524  detail::accessor_mode_cap::can_write> &&
8525  detail::is_simd_view_type_v<OffsetSimdViewT> &&
8526  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8527 scatter(AccessorT acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
8528  PropertyListT props = {}) {
8529  simd_mask<N / VS> Mask = 1;
8530  scatter<T, N, VS>(acc, byte_offsets.read(), vals, Mask, props);
8531 }
8532 
8551 template <typename T, int N, typename AccessorTy>
8552 __ESIMD_API std::enable_if_t<detail::is_local_accessor_with_v<
8553  AccessorTy, detail::accessor_mode_cap::can_write>>
8554 scatter(AccessorTy acc, simd<uint32_t, N> offsets, simd<T, N> vals,
8555  uint32_t glob_offset, simd_mask<N> mask = 1) {
8556  slm_scatter<T, N>(offsets + glob_offset +
8557  __ESIMD_DNS::localAccessorToOffset(acc),
8558  vals, mask);
8559 }
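// Usage sketch (illustrative; legacy 'glob_offset' form, kept for source
// compatibility with older ESIMD code):
//
//   scatter<float, 8>(lacc, offs, vals, /*glob_offset=*/0u, m);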
8560 
8602 
8624 template <typename T, int N, int VS, typename OffsetT,
8625  typename PropertyListT =
8626      ext::oneapi::experimental::empty_properties_t>
8627 __ESIMD_API std::enable_if_t<
8628  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8629 prefetch(const T *p, simd<OffsetT, N / VS> byte_offsets, simd_mask<N / VS> mask,
8630  PropertyListT props = {}) {
8631  static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS");
8632 
8633   constexpr auto L1Hint =
8634       detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
8635           cache_hint::none);
8636   constexpr auto L2Hint =
8637       detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
8638           cache_hint::none);
8639   detail::prefetch_impl<T, VS, detail::lsc_data_size::default_size, L1Hint,
8640                         L2Hint>(p, byte_offsets, mask);
8641 }
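// Usage sketch (illustrative; `ptr` is a hypothetical USM pointer; assumes
// using-declarations for sycl::ext::oneapi::experimental::properties and the
// ESIMD cache_hint_L1/cache_hint_L2 property keys):
//
//   properties hints{cache_hint_L1<cache_hint::cached>,
//                    cache_hint_L2<cache_hint::cached>};
//   simd<uint32_t, 16> offs(0, sizeof(float));
//   prefetch<float, 16>(ptr, offs, hints);   // VS = 1, all lanes enabled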
8642 
8659 template <typename T, int N, int VS, typename OffsetT,
8660  typename PropertyListT =
8661      ext::oneapi::experimental::empty_properties_t>
8662 __ESIMD_API std::enable_if_t<
8663  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8664 prefetch(const T *p, simd<OffsetT, N / VS> byte_offsets,
8665  PropertyListT props = {}) {
8666  simd_mask<N / VS> Mask = 1;
8667  prefetch<T, N, VS>(p, byte_offsets, Mask, props);
8668 }
8669 
8689 template <typename T, int N, typename OffsetT,
8690  typename PropertyListT =
8691      ext::oneapi::experimental::empty_properties_t>
8692 __ESIMD_API std::enable_if_t<
8693  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8694 prefetch(const T *p, simd<OffsetT, N> byte_offsets, simd_mask<N> mask,
8695  PropertyListT props = {}) {
8696  constexpr int VS = 1;
8697  prefetch<T, N, VS>(p, byte_offsets, mask, props);
8698 }
8699 
8714 template <typename T, int N, typename OffsetT,
8715  typename PropertyListT =
8716      ext::oneapi::experimental::empty_properties_t>
8717 __ESIMD_API std::enable_if_t<
8718  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8719 prefetch(const T *p, simd<OffsetT, N> byte_offsets, PropertyListT props = {}) {
8720  constexpr int VS = 1;
8721  prefetch<T, N, VS>(p, byte_offsets, props);
8722 }
8723 
8744 template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
8745  typename PropertyListT =
8746      ext::oneapi::experimental::empty_properties_t>
8747 __ESIMD_API std::enable_if_t<
8748  detail::is_simd_view_type_v<OffsetSimdViewT> &&
8749  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8750 prefetch(const T *p, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
8751  PropertyListT props = {}) {
8752  prefetch<T, N, VS>(p, byte_offsets.read(), mask, props);
8753 }
8754 
8772 template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
8773  typename PropertyListT =
8774      ext::oneapi::experimental::empty_properties_t>
8775 __ESIMD_API std::enable_if_t<
8776  detail::is_simd_view_type_v<OffsetSimdViewT> &&
8777  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8778 prefetch(const T *p, OffsetSimdViewT byte_offsets, PropertyListT props = {}) {
8779  prefetch<T, N, VS>(p, byte_offsets.read(), props);
8780 }
8781 
8799 template <typename T, int VS = 1, typename OffsetT,
8800  typename PropertyListT =
8801      ext::oneapi::experimental::empty_properties_t>
8802 __ESIMD_API std::enable_if_t<
8803  std::is_integral_v<OffsetT> &&
8804  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8805 prefetch(const T *p, OffsetT byte_offset, simd_mask<1> mask,
8806  PropertyListT props = {}) {
8807   constexpr auto L1Hint =
8808       detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
8809           cache_hint::none);
8810   constexpr auto L2Hint =
8811       detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
8812           cache_hint::none);
8813   detail::prefetch_impl<T, VS, detail::lsc_data_size::default_size, L1Hint,
8814                         L2Hint>(p, byte_offset, mask);
8815 }
8816 
8832 template <typename T, int VS = 1, typename OffsetT,
8833  typename PropertyListT =
8834      ext::oneapi::experimental::empty_properties_t>
8835 __ESIMD_API std::enable_if_t<
8836  std::is_integral_v<OffsetT> &&
8837  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8838 prefetch(const T *p, OffsetT byte_offset, PropertyListT props = {}) {
8839  simd_mask<1> Mask = 1;
8840  prefetch<T, VS>(p, byte_offset, Mask, props);
8841 }
8842 
8859 template <typename T, int VS = 1,
8860  typename PropertyListT =
8861      ext::oneapi::experimental::empty_properties_t>
8862 __ESIMD_API std::enable_if_t<
8863  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8864 prefetch(const T *p, simd_mask<1> mask, PropertyListT props = {}) {
8865  prefetch<T, VS>(p, 0, mask, props);
8866 }
8867 
8880 template <typename T, int VS = 1,
8881  typename PropertyListT =
8882      ext::oneapi::experimental::empty_properties_t>
8883 __ESIMD_API std::enable_if_t<
8884  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8885 prefetch(const T *p, PropertyListT props = {}) {
8886  simd_mask<1> Mask = 1;
8887  prefetch<T, VS>(p, 0, Mask, props);
8888 }
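// Usage sketch (illustrative): the scalar forms prefetch VS consecutive
// elements starting at the base pointer.
//
//   prefetch(ptr);            // one element at ptr
//   prefetch<float, 8>(ptr);  // 8 consecutive floats starting at ptr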
8889 
8914 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
8915  typename AccessorT, int N,
8916  typename T = typename AccessorT::value_type>
8917 __ESIMD_API
8918  std::enable_if_t<detail::is_local_accessor_with_v<
8919  AccessorT, detail::accessor_mode_cap::can_read>,
8920  simd<T, N * get_num_channels_enabled(RGBAMask)>>
8921  gather_rgba(AccessorT acc, simd<uint32_t, N> offsets,
8922  uint32_t global_offset = 0, simd_mask<N> mask = 1) {
8923  return slm_gather_rgba<T, N, RGBAMask>(
8924  offsets + global_offset + __ESIMD_DNS::localAccessorToOffset(acc), mask);
8925 }
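// Usage sketch (illustrative; `lacc` is a hypothetical local_accessor over
// float RGBA pixel data, 16 bytes per pixel):
//
//   simd<uint32_t, 8> offs(0, 16);   // byte offset of each pixel
//   auto px = gather_rgba<rgba_channel_mask::ABGR>(lacc, offs);
//   // px is simd<float, 32>: the 8 R values, then 8 G, then 8 B, then 8 A.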
8926 
8943 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
8944  typename AccessorT, int N,
8945  typename T = typename AccessorT::value_type>
8946 __ESIMD_API std::enable_if_t<detail::is_local_accessor_with_v<
8947  AccessorT, detail::accessor_mode_cap::can_write>>
8948 scatter_rgba(AccessorT acc, simd<uint32_t, N> offsets,
8949  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
8950  uint32_t global_offset = 0, simd_mask<N> mask = 1) {
8951  detail::validate_rgba_write_channel_mask<RGBAMask>();
8952  slm_scatter_rgba<T, N, RGBAMask>(offsets + global_offset +
8953  __ESIMD_DNS::localAccessorToOffset(acc),
8954  vals, mask);
8955 }
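// Usage sketch (illustrative): write-channel masks are restricted -- see
// validate_rgba_write_channel_mask(); ABGR (all channels) is always valid.
//
//   simd<float, 32> px(0.f);         // 8 pixels x 4 channels
//   scatter_rgba<rgba_channel_mask::ABGR>(lacc, offs, px);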
8956 
8959 
8980 template <uint8_t exec_size, uint8_t sfid, uint8_t num_src0, uint8_t num_src1,
8981  uint8_t num_dst, raw_send_eot eot = raw_send_eot::not_eot,
8982  raw_send_sendc sendc = raw_send_sendc::not_sendc, typename T1, int n1,
8983  typename T2, int n2, typename T3, int n3>
8984 __ESIMD_API __ESIMD_NS::simd<T1, n1>
8985 raw_sends(__ESIMD_NS::simd<T1, n1> msg_dst, __ESIMD_NS::simd<T2, n2> msg_src0,
8986  __ESIMD_NS::simd<T3, n3> msg_src1, uint32_t ex_desc,
8987  uint32_t msg_desc, __ESIMD_NS::simd_mask<exec_size> mask = 1) {
8988  constexpr unsigned _Width1 = n1 * sizeof(T1);
8989  static_assert(_Width1 % 32 == 0, "Invalid size for raw send rspVar");
8990  constexpr unsigned _Width2 = n2 * sizeof(T2);
8991  static_assert(_Width2 % 32 == 0, "Invalid size for raw send msg_src0");
8992  constexpr unsigned _Width3 = n3 * sizeof(T3);
8993  static_assert(_Width3 % 32 == 0, "Invalid size for raw send msg_src1");
8994 
8995  using ElemT1 = __ESIMD_DNS::__raw_t<T1>;
8996  using ElemT2 = __ESIMD_DNS::__raw_t<T2>;
8997  using ElemT3 = __ESIMD_DNS::__raw_t<T3>;
8998 
8999  constexpr uint8_t modifier =
9000  ((eot == raw_send_eot::eot) << 1) | (sendc == raw_send_sendc::sendc);
9001 
9002  return __esimd_raw_sends2<ElemT1, n1, ElemT2, n2, ElemT3, n3, exec_size>(
9003  modifier, exec_size, mask.data(), num_src0, num_src1, num_dst, sfid,
9004  ex_desc, msg_desc, msg_src0.data(), msg_src1.data(), msg_dst.data());
9005 }
9006 
9024 template <uint8_t exec_size, uint8_t sfid, uint8_t num_src0, uint8_t num_dst,
9025           raw_send_eot eot = raw_send_eot::not_eot,
9026  raw_send_sendc sendc = raw_send_sendc::not_sendc, typename T1, int n1,
9027  typename T2, int n2>
9028 __ESIMD_API __ESIMD_NS::simd<T1, n1>
9029 raw_send(__ESIMD_NS::simd<T1, n1> msg_dst, __ESIMD_NS::simd<T2, n2> msg_src0,
9030  uint32_t ex_desc, uint32_t msg_desc,
9031  __ESIMD_NS::simd_mask<exec_size> mask = 1) {
9032  constexpr unsigned _Width1 = n1 * sizeof(T1);
9033  static_assert(_Width1 % 32 == 0, "Invalid size for raw send rspVar");
9034  constexpr unsigned _Width2 = n2 * sizeof(T2);
9035  static_assert(_Width2 % 32 == 0, "Invalid size for raw send msg_src0");
9036 
9037  using ElemT1 = __ESIMD_DNS::__raw_t<T1>;
9038  using ElemT2 = __ESIMD_DNS::__raw_t<T2>;
9039 
9040  constexpr uint8_t modifier =
9041  ((eot == raw_send_eot::eot) << 1) | (sendc == raw_send_sendc::sendc);
9042  return __esimd_raw_send2<ElemT1, n1, ElemT2, n2, exec_size>(
9043  modifier, exec_size, mask.data(), num_src0, num_dst, sfid, ex_desc,
9044  msg_desc, msg_src0.data(), msg_dst.data());
9045 }
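// Usage sketch (illustrative only): raw_send forwards a hand-built message to
// a shared function; the sfid and descriptor values are device-specific, and
// the constants below are hypothetical placeholders, not a real encoding.
//
//   constexpr uint8_t exec_size = 16;
//   constexpr uint8_t sfid = 0xF;           // placeholder
//   simd<uint32_t, 16> dst;                 // n1 * sizeof(T1) % 32 == 0
//   simd<uint32_t, 16> src0(0);             // message payload (placeholder)
//   uint32_t ex_desc = 0, msg_desc = 0;     // placeholders
//   dst = raw_send<exec_size, sfid, /*num_src0=*/2, /*num_dst=*/2>(
//       dst, src0, ex_desc, msg_desc);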
9046 
9064 template <uint8_t exec_size, uint8_t sfid, uint8_t num_src0, uint8_t num_src1,
9065           raw_send_eot eot = raw_send_eot::not_eot,
9066  raw_send_sendc sendc = raw_send_sendc::not_sendc, typename T1, int n1,
9067  typename T2, int n2>
9068 __ESIMD_API void raw_sends(__ESIMD_NS::simd<T1, n1> msg_src0,
9069  __ESIMD_NS::simd<T2, n2> msg_src1, uint32_t ex_desc,
9070  uint32_t msg_desc,
9071  __ESIMD_NS::simd_mask<exec_size> mask = 1) {
9072  constexpr unsigned _Width1 = n1 * sizeof(T1);
9073  static_assert(_Width1 % 32 == 0, "Invalid size for raw send msg_src0");
9074  constexpr unsigned _Width2 = n2 * sizeof(T2);
9075  static_assert(_Width2 % 32 == 0, "Invalid size for raw send msg_src1");
9076 
9077  using ElemT1 = __ESIMD_DNS::__raw_t<T1>;
9078  using ElemT2 = __ESIMD_DNS::__raw_t<T2>;
9079 
9080  constexpr uint8_t modifier =
9081  ((eot == raw_send_eot::eot) << 1) | (sendc == raw_send_sendc::sendc);
9082  __esimd_raw_sends2_noresult<ElemT1, n1, ElemT2, n2, exec_size>(
9083  modifier, exec_size, mask.data(), num_src0, num_src1, sfid, ex_desc,
9084  msg_desc, msg_src0.data(), msg_src1.data());
9085 }
9086 
9102 template <uint8_t exec_size, uint8_t sfid, uint8_t num_src0,
9103           raw_send_eot eot = raw_send_eot::not_eot,
9104  raw_send_sendc sendc = raw_send_sendc::not_sendc, typename T1, int n1>
9105 __ESIMD_API void raw_send(__ESIMD_NS::simd<T1, n1> msg_src0, uint32_t ex_desc,
9106  uint32_t msg_desc,
9107  __ESIMD_NS::simd_mask<exec_size> mask = 1) {
9108  constexpr unsigned _Width1 = n1 * sizeof(T1);
9109  static_assert(_Width1 % 32 == 0, "Invalid size for raw send msg_src0");
9110  using ElemT1 = __ESIMD_DNS::__raw_t<T1>;
9111  constexpr uint8_t modifier =
9112  ((eot == raw_send_eot::eot) << 1) | (sendc == raw_send_sendc::sendc);
9113  __esimd_raw_send2_noresult<ElemT1, n1, exec_size>(
9114  modifier, exec_size, mask.data(), num_src0, sfid, ex_desc, msg_desc,
9115  msg_src0.data());
9116 }
9117 
9119 
9121 
9123 
9124 namespace detail {
9125 // -- Outlined implementations of simd_obj_impl class memory access APIs.
9126 
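// The outlined copy_from/copy_to bodies below choose between three paths:
//  1) If the source/destination is sufficiently aligned and the total size is
//     an OWORD multiple with a power-of-2 OWORD remainder, use wide block
//     loads/stores (up to 8 OWORDs per transaction), unrolled over NumBlocks,
//     plus one smaller block for the remainder.
//  2) For 8-byte element types, bit-cast the vector to int32 and recurse,
//     since the gather/scatter fallback operates on at most 4-byte elements.
//  3) Otherwise, fall back to gather/scatter in ChunkSize pieces, using a
//     predicated access for tails that are not a supported execution size.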
9127 template <typename T, int N, class T1, class SFINAE>
9128 template <typename Flags, int ChunkSize, typename>
9129 ESIMD_INLINE void
9130 simd_obj_impl<T, N, T1, SFINAE>::copy_from(const element_type *Addr,
9131                                            Flags) SYCL_ESIMD_FUNCTION {
9132   using UT = element_type;
9133  constexpr unsigned Size = sizeof(T) * N;
9134  constexpr unsigned Align = Flags::template alignment<T1>;
9135 
9136  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
9137  constexpr unsigned NumBlocks = Size / BlockSize;
9138  constexpr unsigned RemSize = Size % BlockSize;
9139 
9140  if constexpr (Align >= OperandSize::DWORD && Size % OperandSize::OWORD == 0 &&
9141  detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
9142  if constexpr (NumBlocks > 0) {
9143  constexpr unsigned BlockN = BlockSize / sizeof(T);
9144  ForHelper<NumBlocks>::unroll([BlockN, Addr, this](unsigned Block) {
9145  select<BlockN, 1>(Block * BlockN) =
9146  block_load<UT, BlockN, Flags>(Addr + (Block * BlockN), Flags{});
9147  });
9148  }
9149  if constexpr (RemSize > 0) {
9150  constexpr unsigned RemN = RemSize / sizeof(T);
9151  constexpr unsigned BlockN = BlockSize / sizeof(T);
9152  select<RemN, 1>(NumBlocks * BlockN) =
9153  block_load<UT, RemN, Flags>(Addr + (NumBlocks * BlockN), Flags{});
9154  }
9155  } else if constexpr (sizeof(T) == 8) {
9156  simd<int32_t, N * 2> BC(reinterpret_cast<const int32_t *>(Addr), Flags{});
9157  bit_cast_view<int32_t>() = BC;
9158  } else {
9159  constexpr unsigned NumChunks = N / ChunkSize;
9160  if constexpr (NumChunks > 0) {
9161  simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
9162  ForHelper<NumChunks>::unroll([Addr, &Offsets, this](unsigned Block) {
9163  select<ChunkSize, 1>(Block * ChunkSize) =
9164  gather<UT, ChunkSize>(Addr + (Block * ChunkSize), Offsets);
9165  });
9166  }
9167  constexpr unsigned RemN = N % ChunkSize;
9168  if constexpr (RemN > 0) {
9169  if constexpr (RemN == 1) {
9170  select<1, 1>(NumChunks * ChunkSize) = Addr[NumChunks * ChunkSize];
9171  } else if constexpr (RemN == 8 || RemN == 16) {
9172  simd<uint32_t, RemN> Offsets(0u, sizeof(T));
9173  select<RemN, 1>(NumChunks * ChunkSize) =
9174  gather<UT, RemN>(Addr + (NumChunks * ChunkSize), Offsets);
9175  } else {
9176  constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
9177  simd_mask_type<N1> Pred(0);
9178  Pred.template select<RemN, 1>() = 1;
9179  simd<uint32_t, N1> Offsets(0u, sizeof(T));
9180  simd<UT, N1> Vals =
9181  gather<UT, N1>(Addr + (NumChunks * ChunkSize), Offsets, Pred);
9182  select<RemN, 1>(NumChunks * ChunkSize) =
9183  Vals.template select<RemN, 1>();
9184  }
9185  }
9186  }
9187 }
9188 
9189 template <typename T, int N, class T1, class SFINAE>
9190 template <int ChunkSize, typename Flags, typename AccessorT, typename TOffset>
9191 ESIMD_INLINE void simd_obj_impl<T, N, T1, SFINAE>::copy_to_impl(
9192     AccessorT acc, TOffset offset) const SYCL_ESIMD_FUNCTION {
9193   using UT = element_type;
9194  constexpr unsigned Size = sizeof(T) * N;
9195  constexpr unsigned Align = Flags::template alignment<T1>;
9196 
9197  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
9198  constexpr unsigned NumBlocks = Size / BlockSize;
9199  constexpr unsigned RemSize = Size % BlockSize;
9200 
9201  simd<UT, N> Tmp{data()};
9202  if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 &&
9203  detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
9204  if constexpr (NumBlocks > 0) {
9205  constexpr unsigned BlockN = BlockSize / sizeof(T);
9206  ForHelper<NumBlocks>::unroll([BlockN, acc, offset, &Tmp](unsigned Block) {
9207  block_store<UT, BlockN, AccessorT>(
9208  acc, offset + (Block * BlockSize),
9209  Tmp.template select<BlockN, 1>(Block * BlockN));
9210  });
9211  }
9212  if constexpr (RemSize > 0) {
9213  constexpr unsigned RemN = RemSize / sizeof(T);
9214  constexpr unsigned BlockN = BlockSize / sizeof(T);
9215  block_store<UT, RemN, AccessorT>(
9216  acc, offset + (NumBlocks * BlockSize),
9217  Tmp.template select<RemN, 1>(NumBlocks * BlockN));
9218  }
9219  } else if constexpr (sizeof(T) == 8) {
9220  simd<int32_t, N * 2> BC = Tmp.template bit_cast_view<int32_t>();
9221  BC.copy_to(acc, offset, Flags{});
9222  } else {
9223  constexpr unsigned NumChunks = N / ChunkSize;
9224  if constexpr (NumChunks > 0) {
9225  simd<TOffset, ChunkSize> Offsets(0u, sizeof(T));
9226  ForHelper<NumChunks>::unroll([acc, offset, &Offsets,
9227  &Tmp](unsigned Block) {
9228  scatter<UT, ChunkSize, AccessorT>(
9229  acc, Offsets, Tmp.template select<ChunkSize, 1>(Block * ChunkSize),
9230  offset + (Block * ChunkSize * sizeof(T)));
9231  });
9232  }
9233  constexpr unsigned RemN = N % ChunkSize;
9234  if constexpr (RemN > 0) {
9235  if constexpr (RemN == 1 || RemN == 8 || RemN == 16) {
9236  simd<TOffset, RemN> Offsets(0u, sizeof(T));
9237  scatter<UT, RemN, AccessorT>(
9238  acc, Offsets, Tmp.template select<RemN, 1>(NumChunks * ChunkSize),
9239  offset + (NumChunks * ChunkSize * sizeof(T)));
9240  } else {
9241  constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
9242  simd_mask_type<N1> Pred(0);
9243  Pred.template select<RemN, 1>() = 1;
9244  simd<UT, N1> Vals;
9245  Vals.template select<RemN, 1>() =
9246  Tmp.template select<RemN, 1>(NumChunks * ChunkSize);
9247  simd<TOffset, N1> Offsets(0u, sizeof(T));
9248  scatter<UT, N1, AccessorT>(acc, Offsets, Vals,
9249  offset + (NumChunks * ChunkSize * sizeof(T)),
9250  Pred);
9251  }
9252  }
9253  }
9254 }
9255 
9256 template <typename T, int N, class T1, class SFINAE>
9257 template <int ChunkSize, typename Flags, typename AccessorT, typename TOffset>
9258 ESIMD_INLINE void simd_obj_impl<T, N, T1, SFINAE>::copy_from_impl(
9259     AccessorT acc, TOffset offset) SYCL_ESIMD_FUNCTION {
9260   using UT = element_type;
9261  static_assert(sizeof(UT) == sizeof(T));
9262  constexpr unsigned Size = sizeof(T) * N;
9263  constexpr unsigned Align = Flags::template alignment<T1>;
9264 
9265  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
9266  constexpr unsigned NumBlocks = Size / BlockSize;
9267  constexpr unsigned RemSize = Size % BlockSize;
9268 
9269  if constexpr (Align >= OperandSize::DWORD && Size % OperandSize::OWORD == 0 &&
9270  detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
9271  if constexpr (NumBlocks > 0) {
9272  constexpr unsigned BlockN = BlockSize / sizeof(T);
9273  ForHelper<NumBlocks>::unroll([BlockN, acc, offset, this](unsigned Block) {
9274  select<BlockN, 1>(Block * BlockN) =
9275  block_load<UT, BlockN, AccessorT, Flags>(
9276  acc, offset + (Block * BlockSize), Flags{});
9277  });
9278  }
9279  if constexpr (RemSize > 0) {
9280  constexpr unsigned RemN = RemSize / sizeof(T);
9281  constexpr unsigned BlockN = BlockSize / sizeof(T);
9282  select<RemN, 1>(NumBlocks * BlockN) =
9283  block_load<UT, RemN, AccessorT, Flags>(
9284  acc, offset + (NumBlocks * BlockSize), Flags{});
9285  }
9286  } else if constexpr (sizeof(T) == 8) {
9287  simd<int32_t, N * 2> BC(acc, offset, Flags{});
9288  bit_cast_view<int32_t>() = BC;
9289  } else {
9290  constexpr unsigned NumChunks = N / ChunkSize;
9291  if constexpr (NumChunks > 0) {
9292  simd<TOffset, ChunkSize> Offsets(0u, sizeof(T));
9293  ForHelper<NumChunks>::unroll(
9294  [acc, offset, &Offsets, this](unsigned Block) {
9295  select<ChunkSize, 1>(Block * ChunkSize) =
9296  gather<UT, ChunkSize, AccessorT>(
9297  acc, Offsets, offset + (Block * ChunkSize * sizeof(T)));
9298  });
9299  }
9300  constexpr unsigned RemN = N % ChunkSize;
9301  if constexpr (RemN > 0) {
9302  if constexpr (RemN == 1 || RemN == 8 || RemN == 16) {
9303  simd<TOffset, RemN> Offsets(0u, sizeof(T));
9304  select<RemN, 1>(NumChunks * ChunkSize) = gather<UT, RemN, AccessorT>(
9305  acc, Offsets, offset + (NumChunks * ChunkSize * sizeof(T)));
9306  } else {
9307  constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
9308  simd_mask_type<N1> Pred(0);
9309  Pred.template select<RemN, 1>() = 1;
9310  simd<TOffset, N1> Offsets(0u, sizeof(T));
9311  simd<UT, N1> Vals = gather<UT, N1>(
9312  acc, Offsets, offset + (NumChunks * ChunkSize * sizeof(T)), Pred);
9313  select<RemN, 1>(NumChunks * ChunkSize) =
9314  Vals.template select<RemN, 1>();
9315  }
9316  }
9317  }
9318 }
9319 
9320 template <typename T, int N, class T1, class SFINAE>
9321 template <typename AccessorT, typename Flags, int ChunkSize, typename>
9322 ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_read, void>
9323 simd_obj_impl<T, N, T1, SFINAE>::copy_from(AccessorT acc,
9324                                            detail::DeviceAccessorOffsetT offset,
9325                                            Flags) SYCL_ESIMD_FUNCTION {
9326 
9327  copy_from_impl<ChunkSize, Flags>(acc, offset);
9328 }
9329 
9330 template <typename T, int N, class T1, class SFINAE>
9331 template <typename AccessorT, typename Flags, int ChunkSize, typename>
9332 ESIMD_INLINE std::enable_if_t<
9333  detail::is_local_accessor_with_v<AccessorT, accessor_mode_cap::can_read>,
9334  void>
9335 simd_obj_impl<T, N, T1, SFINAE>::copy_from(AccessorT acc, uint32_t offset,
9336  Flags) SYCL_ESIMD_FUNCTION {
9337 
9338  copy_from_impl<ChunkSize, Flags>(acc, offset);
9339 }
9340 
9341 template <typename T, int N, class T1, class SFINAE>
9342 template <typename Flags, int ChunkSize, typename>
9343 ESIMD_INLINE void
9344 simd_obj_impl<T, N, T1, SFINAE>::copy_to(element_type *Addr,
9345                                          Flags) const SYCL_ESIMD_FUNCTION {
9346   using UT = element_type;
9347  constexpr unsigned Size = sizeof(T) * N;
9348  constexpr unsigned Align = Flags::template alignment<T1>;
9349 
9350  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
9351  constexpr unsigned NumBlocks = Size / BlockSize;
9352  constexpr unsigned RemSize = Size % BlockSize;
9353 
9354  simd<UT, N> Tmp{data()};
9355  if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 &&
9356  detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
9357  if constexpr (NumBlocks > 0) {
9358  constexpr unsigned BlockN = BlockSize / sizeof(T);
9359  ForHelper<NumBlocks>::unroll([BlockN, Addr, &Tmp](unsigned Block) {
9360  block_store<UT, BlockN>(Addr + (Block * BlockN),
9361  Tmp.template select<BlockN, 1>(Block * BlockN));
9362  });
9363  }
9364  if constexpr (RemSize > 0) {
9365  constexpr unsigned RemN = RemSize / sizeof(T);
9366  constexpr unsigned BlockN = BlockSize / sizeof(T);
9367  block_store<UT, RemN>(Addr + (NumBlocks * BlockN),
9368  Tmp.template select<RemN, 1>(NumBlocks * BlockN));
9369  }
9370  } else if constexpr (sizeof(T) == 8) {
9371  simd<int32_t, N * 2> BC = Tmp.template bit_cast_view<int32_t>();
9372  BC.copy_to(reinterpret_cast<int32_t *>(Addr), Flags{});
9373  } else {
9374  constexpr unsigned NumChunks = N / ChunkSize;
9375  if constexpr (NumChunks > 0) {
9376  simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
9377  ForHelper<NumChunks>::unroll([Addr, &Offsets, &Tmp](unsigned Block) {
9378  scatter<UT, ChunkSize>(
9379  Addr + (Block * ChunkSize), Offsets,
9380  Tmp.template select<ChunkSize, 1>(Block * ChunkSize));
9381  });
9382  }
9383  constexpr unsigned RemN = N % ChunkSize;
9384  if constexpr (RemN > 0) {
9385  if constexpr (RemN == 1) {
9386  Addr[NumChunks * ChunkSize] = Tmp[NumChunks * ChunkSize];
9387  } else if constexpr (RemN == 8 || RemN == 16) {
9388  // TODO: GPU runtime may handle scatter of 16 byte elements
9389  // incorrectly. The code below is a workaround which must be deleted
9390  // once GPU runtime is fixed.
9391  if constexpr (sizeof(T) == 1 && RemN == 16) {
9392  if constexpr (Align % OperandSize::DWORD > 0) {
9393  ForHelper<RemN>::unroll([Addr, &Tmp](unsigned Index) {
9394  Addr[Index + NumChunks * ChunkSize] =
9395  Tmp[Index + NumChunks * ChunkSize];
9396  });
9397  } else {
9398  simd_mask_type<8> Pred(0);
9399  simd<int32_t, 8> Vals;
9400  Pred.template select<4, 1>() = 1;
9401  Vals.template select<4, 1>() =
9402  Tmp.template bit_cast_view<int32_t>().template select<4, 1>(
9403  NumChunks * ChunkSize);
9404 
9405  simd<uint32_t, 8> Offsets(0u, sizeof(int32_t));
9406  scatter<int32_t, 8>(
9407  reinterpret_cast<int32_t *>(Addr + (NumChunks * ChunkSize)),
9408  Offsets, Vals, Pred);
9409  }
9410  } else {
9411  simd<uint32_t, RemN> Offsets(0u, sizeof(T));
9412  scatter<UT, RemN>(
9413  Addr + (NumChunks * ChunkSize), Offsets,
9414  Tmp.template select<RemN, 1>(NumChunks * ChunkSize));
9415  }
9416  } else {
9417  constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
9418  simd_mask_type<N1> Pred(0);
9419  Pred.template select<RemN, 1>() = 1;
9420  simd<UT, N1> Vals;
9421  Vals.template select<RemN, 1>() =
9422  Tmp.template select<RemN, 1>(NumChunks * ChunkSize);
9423  simd<uint32_t, N1> Offsets(0u, sizeof(T));
9424  scatter<UT, N1>(Addr + (NumChunks * ChunkSize), Offsets, Vals, Pred);
9425  }
9426  }
9427  }
9428 }
9429 
9430 template <typename T, int N, class T1, class SFINAE>
9431 template <typename AccessorT, typename Flags, int ChunkSize, typename>
9432 ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_write, void>
9433 simd_obj_impl<T, N, T1, SFINAE>::copy_to(AccessorT acc,
9434                                          detail::DeviceAccessorOffsetT offset,
9435                                          Flags) const SYCL_ESIMD_FUNCTION {
9436  copy_to_impl<ChunkSize, Flags>(acc, offset);
9437 }
9438 
9439 template <typename T, int N, class T1, class SFINAE>
9440 template <typename AccessorT, typename Flags, int ChunkSize, typename>
9441 ESIMD_INLINE std::enable_if_t<
9442  detail::is_local_accessor_with_v<AccessorT, accessor_mode_cap::can_write>,
9443  void>
9444 simd_obj_impl<T, N, T1, SFINAE>::copy_to(AccessorT acc, uint32_t offset,
9445  Flags) const SYCL_ESIMD_FUNCTION {
9446  copy_to_impl<ChunkSize, Flags>(acc, offset);
9447 }
9448 
9449 } // namespace detail
9451 
9452 } // namespace ext::intel::esimd
9453 } // namespace _V1
9454 } // namespace sycl