DPC++ Runtime
Runtime libraries for oneAPI DPC++
memory.hpp
Go to the documentation of this file.
1 //==-------------- memory.hpp - DPC++ Explicit SIMD API --------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // Implement Explicit SIMD memory-access APIs.
9 //===----------------------------------------------------------------------===//
10 
11 #pragma once
12 
20 #include <sycl/half_type.hpp>
21 
22 #include <algorithm>
23 #include <cstdint>
24 
25 namespace sycl {
26 inline namespace _V1 {
27 namespace ext::intel::esimd {
28 
31 
37 
39 
41 
43 
46 
52 template <typename AccessorTy>
53 __ESIMD_API SurfaceIndex get_surface_index(AccessorTy acc) {
54  if constexpr (std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
55  sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>) {
56  return detail::SLM_BTI;
57  } else {
58 #ifdef __ESIMD_FORCE_STATELESS_MEM
59  static_assert(sycl::detail::acc_properties::is_image_accessor_v<AccessorTy>,
60  "The function get_surface_index() is available only for "
61  "image- and local-accessors in stateless-only memory mode. "
62  "Consider using "
63  "-fno-sycl-esimd-force-stateless-mem compilation switch.");
64 #endif // __ESIMD_FORCE_STATELESS_MEM
65  return __esimd_get_surface_index(
66  detail::AccessorPrivateProxy::getQualifiedPtrOrImageObj(acc));
67  }
68 }
69 
70 namespace detail {
71 
72 // Format u8 and u16 to u8u32 and u16u32 by doing garbage-extension.
73 template <typename RT, typename T, int N>
75  if constexpr (sizeof(T) == 1) {
76  // Extend bytes to RT.
77  return Vals.template bit_cast_view<uint8_t>();
78  } else if constexpr (sizeof(T) == 2) {
79  // Extend words to RT.
80  return Vals.template bit_cast_view<uint16_t>();
81  } else {
82  return Vals.template bit_cast_view<RT>();
83  }
84 }
85 
86 // Format u8u32 and u16u32 back to u8 and u16.
87 template <typename T, typename T1, int N>
89  auto Formatted = Vals.template bit_cast_view<T>();
90  if constexpr (sizeof(T) == sizeof(T1)) {
91  return Formatted;
92  } else {
93  constexpr int Stride = Formatted.length / N;
94  return Formatted.template select<N, Stride>(0);
95  }
96 }
97 
101 template <typename PropertyListT, cache_level Level>
103  static_assert(Level == cache_level::L1 || Level == cache_level::L2,
104  "ESIMD/GENX intrinsics accept only L1/L2 cache hints");
105  if constexpr (Level == cache_level::L1) {
106  return getPropertyValue<PropertyListT, cache_hint_L1_key>(cache_hint::none);
107  } else {
108  return getPropertyValue<PropertyListT, cache_hint_L2_key>(cache_hint::none);
109  }
110 }
111 
131 template <typename T, int NElts, lsc_data_size DS, typename PropertyListT,
132  int N, typename OffsetT>
133 __ESIMD_API simd<T, N * NElts> gather_impl(const T *p, simd<OffsetT, N> offsets,
134  simd_mask<N> pred,
135  simd<T, N * NElts> pass_thru) {
136  static_assert(std::is_integral_v<OffsetT>, "Unsupported offset type");
137  check_lsc_vector_size<NElts>();
138  check_lsc_data_size<T, DS>();
139  check_cache_hints<cache_action::load, PropertyListT>();
140  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
141  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
142  constexpr uint16_t AddressScale = 1;
143  constexpr int ImmOffset = 0;
144  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
145  constexpr lsc_vector_size VS = to_lsc_vector_size<NElts>();
146  constexpr auto Transposed = lsc_data_order::nontranspose;
147  using MsgT = typename lsc_expand_type<T>::type;
148  simd<uintptr_t, N> Addrs = reinterpret_cast<uintptr_t>(p);
149  Addrs += convert<uintptr_t>(offsets);
150  simd<MsgT, N * NElts> PassThruExpanded = lsc_format_input<MsgT>(pass_thru);
151  simd<MsgT, N * NElts> Result =
152  __esimd_lsc_load_merge_stateless<MsgT, L1H, L2H, AddressScale, ImmOffset,
153  EDS, VS, Transposed, N>(
154  pred.data(), Addrs.data(), PassThruExpanded.data());
155  return lsc_format_ret<T>(Result);
156 }
157 
174 template <typename T, int NElts, lsc_data_size DS, typename PropertyListT,
175  int N, typename Toffset>
176 __ESIMD_API void scatter_impl(T *p, simd<Toffset, N> offsets,
177  simd<T, N * NElts> vals, simd_mask<N> pred) {
178  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
179  check_lsc_vector_size<NElts>();
180  check_lsc_data_size<T, DS>();
181  check_cache_hints<cache_action::store, PropertyListT>();
182  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
183  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
184  constexpr uint16_t AddressScale = 1;
185  constexpr int ImmOffset = 0;
186  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
187  constexpr lsc_vector_size VS = to_lsc_vector_size<NElts>();
188  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
189  using MsgT = typename lsc_expand_type<T>::type;
190  simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
191  addrs += convert<uintptr_t>(offsets);
192  simd<MsgT, N * NElts> Tmp = lsc_format_input<MsgT, T>(vals);
193  __esimd_lsc_store_stateless<MsgT, L1H, L2H, AddressScale, ImmOffset, EDS, VS,
194  Transposed, N>(pred.data(), addrs.data(),
195  Tmp.data());
196 }
197 
// Returns true iff it is Ok to use llvm.masked.gather and llvm.masked.scatter.
// By default (without the user specifying __ESIMD_GATHER_SCATTER_LLVM_IR) it
// is not used because of an issue in GPU driver, which does not recognize
// those operations in SPIR-V when they are used in mixed (scalar and vector)
// kernels using invoke_simd() API.
//
// NOTE(review): the declaration line was lost in the extracted listing and
// is reconstructed from the use of this function inside 'if constexpr'
// conditions.
constexpr bool isMaskedGatherScatterLLVMAvailable() {
#ifdef __ESIMD_GATHER_SCATTER_LLVM_IR
  return true;
#else
  return false;
#endif
}
210 
211 } // namespace detail
212 
252 
258 #ifndef __ESIMD_GATHER_SCATTER_LLVM_IR
261 #endif // __ESIMD_GATHER_SCATTER_LLVM_IR
284 template <
285  typename T, int N, int VS, typename OffsetT,
286  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
287 __ESIMD_API std::enable_if_t<
288  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
289 gather(const T *p, simd<OffsetT, N / VS> byte_offsets, simd_mask<N / VS> mask,
290  simd<T, N> pass_thru, PropertyListT props = {}) {
291  static_assert(std::is_integral_v<OffsetT>, "Unsupported offset type");
292  static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS");
293 
294  constexpr size_t Alignment =
295  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
296  static_assert(Alignment >= sizeof(T),
297  "gather() requires at least element-size alignment");
298 
299  // Use LSC lowering if cache-hints are used or VS > 1. Also, if
300  // llvm.masked.gather is not available, then LSC is the only lowering option.
301  if constexpr (detail::has_cache_hints<PropertyListT>() || VS > 1 ||
303  static_assert(VS == 1 || sizeof(T) >= 4,
304  "VS > 1 is supprted only for 4- and 8-byte elements");
306  PropertyListT>(p, byte_offsets, mask, pass_thru);
307  } else {
308  simd<uint64_t, N> Addrs(reinterpret_cast<uint64_t>(p));
309  Addrs = Addrs + convert<uint64_t>(byte_offsets);
310 
311  using MsgT = detail::__raw_t<T>;
312  return __esimd_gather_ld<MsgT, N, Alignment>(
313  Addrs.data(), mask.data(),
314  sycl::bit_cast<__ESIMD_DNS::vector_type_t<MsgT, N>>(pass_thru.data()));
315  }
316 }
317 
343 template <
344  typename T, int N, int VS, typename OffsetT,
345  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
346 __ESIMD_API std::enable_if_t<
347  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
348 gather(const T *p, simd<OffsetT, N / VS> byte_offsets, simd_mask<N / VS> mask,
349  PropertyListT props = {}) {
350  constexpr size_t Alignment =
351  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
352  static_assert(Alignment >= sizeof(T),
353  "gather() requires at least element-size alignment");
354 
355  if constexpr (detail::has_cache_hints<PropertyListT>() || VS > 1 ||
357  !detail::isPowerOf2(N, 32)) {
358  simd<T, N> PassThru; // it is intentionally undefined
359  return gather<T, N, VS>(p, byte_offsets, mask, PassThru, props);
360  } else {
361  simd<uintptr_t, N> Addrs = reinterpret_cast<uintptr_t>(p);
362  Addrs += convert<uintptr_t>(byte_offsets);
363  using MsgT = detail::__raw_t<T>;
364  if constexpr (sizeof(T) == 1) {
365  auto Ret = __esimd_svm_gather<MsgT, N, detail::ElemsPerAddrEncoding<4>(),
366  detail::ElemsPerAddrEncoding<1>()>(
367  Addrs.data(), mask.data());
368  detail::check_rdregion_params<N * 4, N, /*VS*/ 0, N, 4>();
369  return __esimd_rdregion<MsgT, N * 4, N, /*VS*/ 0, N, 4>(Ret, 0);
370  } else if constexpr (sizeof(T) == 2) {
371  auto Ret = __esimd_svm_gather<MsgT, N, detail::ElemsPerAddrEncoding<2>(),
372  detail::ElemsPerAddrEncoding<2>()>(
373  Addrs.data(), mask.data());
374  detail::check_rdregion_params<N * 2, N, /*VS*/ 0, N, 2>();
375  return __esimd_rdregion<MsgT, N * 2, N, /*VS*/ 0, N, 2>(Ret, 0);
376  } else {
377  return __esimd_svm_gather<MsgT, N, detail::ElemsPerAddrEncoding<1>(),
378  detail::ElemsPerAddrEncoding<1>()>(Addrs.data(),
379  mask.data());
380  }
381  }
382 }
383 
402 template <
403  typename T, int N, int VS, typename OffsetT,
404  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
405 __ESIMD_API std::enable_if_t<
406  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
407 gather(const T *p, simd<OffsetT, N / VS> byte_offsets,
408  PropertyListT props = {}) {
409  simd_mask<N / VS> Mask = 1;
410  return gather<T, N, VS>(p, byte_offsets, Mask, props);
411 }
412 
437 template <
438  typename T, int N, typename OffsetT,
439  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
440 __ESIMD_API std::enable_if_t<
441  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
442 gather(const T *p, simd<OffsetT, N> byte_offsets, simd_mask<N> mask,
443  simd<T, N> pass_thru, PropertyListT props = {}) {
444  constexpr int VS = 1;
445  return gather<T, N, VS>(p, byte_offsets, mask, pass_thru, props);
446 }
447 
469 template <
470  typename T, int N, typename OffsetT,
471  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
472 __ESIMD_API std::enable_if_t<
473  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
474 gather(const T *p, simd<OffsetT, N> byte_offsets, simd_mask<N> mask,
475  PropertyListT props = {}) {
476  constexpr int VS = 1;
477  return gather<T, N, VS>(p, byte_offsets, mask, props);
478 }
479 
495 template <
496  typename T, int N, typename OffsetT,
497  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
498 __ESIMD_API std::enable_if_t<
499  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
500 gather(const T *p, simd<OffsetT, N> byte_offsets, PropertyListT props = {}) {
501  constexpr int VS = 1;
502  return gather<T, N, VS>(p, byte_offsets, props);
503 }
504 
533 template <
534  typename T, int N, int VS = 1, typename OffsetSimdViewT,
535  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
536 __ESIMD_API std::enable_if_t<
537  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
538  detail::is_simd_view_type_v<OffsetSimdViewT>,
539  simd<T, N>>
540 gather(const T *p, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
541  simd<T, N> pass_thru, PropertyListT props = {}) {
542  return gather<T, N, VS>(p, byte_offsets.read(), mask, pass_thru, props);
543 }
544 
573 template <
574  int VS = 1, typename OffsetT, typename T, typename PassThruSimdViewT,
575  int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(),
576  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
577 __ESIMD_API std::enable_if_t<
578  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
579  detail::is_simd_view_type_v<PassThruSimdViewT>,
580  simd<T, N>>
581 gather(const T *p, simd<OffsetT, N / VS> byte_offsets, simd_mask<N / VS> mask,
582  PassThruSimdViewT pass_thru, PropertyListT props = {}) {
583  return gather<T, N, VS>(p, byte_offsets, mask, pass_thru.read(), props);
584 }
585 
614 template <
615  int VS = 1, typename OffsetSimdViewT, typename T,
616  typename PassThruSimdViewT,
617  int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(),
618  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
619 __ESIMD_API std::enable_if_t<
620  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
621  detail::is_simd_view_type_v<OffsetSimdViewT> &&
622  detail::is_simd_view_type_v<PassThruSimdViewT>,
623  simd<T, N>>
624 gather(const T *p, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
625  PassThruSimdViewT pass_thru, PropertyListT props = {}) {
626  static_assert(N / VS ==
627  OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
628  "Size of pass_thru parameter must correspond to the size of "
629  "byte_offsets parameter.");
630  return gather<T, N, VS>(p, byte_offsets.read(), mask, pass_thru.read(),
631  props);
632 }
633 
660 template <
661  int VS, typename OffsetSimdViewT, typename T, int N,
662  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
663 __ESIMD_API std::enable_if_t<
664  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
665  detail::is_simd_view_type_v<OffsetSimdViewT>,
666  simd<T, N>>
667 gather(const T *p, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
668  simd<T, N> pass_thru, PropertyListT props = {}) {
669  static_assert(N / VS ==
670  OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
671  "Size of pass_thru parameter must correspond to the size of "
672  "byte_offsets parameter.");
673  return gather<T, N, VS>(p, byte_offsets.read(), mask, pass_thru, props);
674 }
675 
699 template <
700  typename T, int N, int VS = 1, typename OffsetSimdViewT,
701  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
702 __ESIMD_API std::enable_if_t<
703  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
704  detail::is_simd_view_type_v<OffsetSimdViewT>,
705  simd<T, N>>
706 gather(const T *p, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
707  PropertyListT props = {}) {
708  return gather<T, N, VS>(p, byte_offsets.read(), mask, props);
709 }
710 
732 template <
733  int VS = 1, typename OffsetSimdViewT, typename T,
734  int N = OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY() * VS,
735  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
736 __ESIMD_API std::enable_if_t<
737  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
738  detail::is_simd_view_type_v<OffsetSimdViewT>,
739  simd<T, N>>
740 gather(const T *p, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
741  PropertyListT props = {}) {
742  return gather<T, N, VS>(p, byte_offsets.read(), mask, props);
743 }
744 
762 template <
763  typename T, int N, int VS = 1, typename OffsetSimdViewT,
764  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
765 __ESIMD_API std::enable_if_t<
766  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
767  detail::is_simd_view_type_v<OffsetSimdViewT>,
768  simd<T, N>>
769 gather(const T *p, OffsetSimdViewT byte_offsets, PropertyListT props = {}) {
770  return gather<T, N, VS>(p, byte_offsets.read(), props);
771 }
788 template <
789  int VS = 1, typename OffsetSimdViewT, typename T,
790  int N = OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY() * VS,
791  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
792 __ESIMD_API std::enable_if_t<
793  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
794  detail::is_simd_view_type_v<OffsetSimdViewT>,
795  simd<T, N>>
796 gather(const T *p, OffsetSimdViewT byte_offsets, PropertyListT props = {}) {
797  return gather<T, N, VS>(p, byte_offsets.read(), props);
798 }
799 
812 template <typename Tx, int N, typename Toffset>
813 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>, simd<Tx, N>>
814 gather(const Tx *p, Toffset offset, simd_mask<N> mask = 1) {
815  return gather<Tx, N>(p, simd<Toffset, N>(offset), mask);
816 }
817 
822 
827 
830 
835 
840 
864 template <
865  typename T, int N, int VS = 1, typename OffsetT,
866  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
867 __ESIMD_API std::enable_if_t<
868  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
869 scatter(T *p, simd<OffsetT, N / VS> byte_offsets, simd<T, N> vals,
870  simd_mask<N / VS> mask, PropertyListT props = {}) {
871  static_assert(std::is_integral_v<OffsetT>, "Unsupported offset type");
872  static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS");
873 
874  constexpr size_t Alignment =
875  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
876  static_assert(Alignment >= sizeof(T),
877  "scatter() requires at least element-size alignment");
878 
879  // Use LSC lowering if cache-hints are used or VS > 1.
880  if constexpr (detail::has_cache_hints<PropertyListT>() || VS > 1 ||
881  (!__ESIMD_DNS::isPowerOf2(N, 32) &&
883  static_assert(VS == 1 || sizeof(T) >= 4,
884  "VS > 1 is supprted only for 4- and 8-byte elements");
886  PropertyListT>(p, byte_offsets, vals, mask);
887  } else if constexpr (detail::isMaskedGatherScatterLLVMAvailable()) {
888  simd<uint64_t, N> Addrs(reinterpret_cast<uint64_t>(p));
889  Addrs = Addrs + convert<uint64_t>(byte_offsets);
890  using MsgT = detail::__raw_t<T>;
891  __esimd_scatter_st<MsgT, N, Alignment>(
892  sycl::bit_cast<__ESIMD_DNS::vector_type_t<MsgT, N>>(vals.data()),
893  Addrs.data(), mask.data());
894  } else {
895  using Tx = detail::__raw_t<T>;
896  simd<uint64_t, N> byte_offsets_i = convert<uint64_t>(byte_offsets);
897  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
898  addrs = addrs + byte_offsets_i;
899  if constexpr (sizeof(T) == 1) {
900  detail::check_wrregion_params<N * 4, N, /*VS*/ 0, N, 4>();
901  simd<T, N * 4> D; // Intentionally undefined.
902  D = __esimd_wrregion<Tx, N * 4, N, /*VS*/ 0, N, 4>(D.data(), vals.data(),
903  0);
904  __esimd_svm_scatter<Tx, N, detail::ElemsPerAddrEncoding<4>(),
905  detail::ElemsPerAddrEncoding<1>()>(
906  addrs.data(), D.data(), mask.data());
907  } else if constexpr (sizeof(T) == 2) {
908  detail::check_wrregion_params<N * 2, N, /*VS*/ 0, N, 2>();
909  simd<Tx, N * 2> D; // Intentionally undefined.
910  D = __esimd_wrregion<Tx, N * 2, N, /*VS*/ 0, N, 2>(D.data(), vals.data(),
911  0);
912  __esimd_svm_scatter<Tx, N, detail::ElemsPerAddrEncoding<2>(),
913  detail::ElemsPerAddrEncoding<2>()>(
914  addrs.data(), D.data(), mask.data());
915  } else
916  __esimd_svm_scatter<Tx, N, detail::ElemsPerAddrEncoding<1>(),
917  detail::ElemsPerAddrEncoding<1>()>(
918  addrs.data(), vals.data(), mask.data());
919  }
920 }
921 
946 template <
947  int VS = 1, typename OffsetT, typename ValuesSimdViewT, typename T,
948  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
949  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
950 __ESIMD_API std::enable_if_t<
951  detail::is_simd_view_type_v<ValuesSimdViewT> &&
952  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
953 scatter(T *p, simd<OffsetT, N / VS> byte_offsets, ValuesSimdViewT vals,
954  simd_mask<N / VS> mask, PropertyListT props = {}) {
955  scatter<T, N, VS>(p, byte_offsets, vals.read(), mask, props);
956 }
957 
979 template <
980  typename T, int N, int VS = 1, typename OffsetT,
981  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
982 __ESIMD_API std::enable_if_t<
983  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
984 scatter(T *p, simd<OffsetT, N / VS> byte_offsets, simd<T, N> vals,
985  PropertyListT props = {}) {
986  simd_mask<N / VS> Mask = 1;
987  scatter<T, N, VS>(p, byte_offsets, vals, Mask, props);
988 }
989 
1013 template <
1014  int VS = 1, typename OffsetSimdViewT, typename ValuesSimdViewT, typename T,
1015  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
1016  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
1017 __ESIMD_API std::enable_if_t<
1018  detail::is_simd_view_type_v<ValuesSimdViewT> &&
1019  detail::is_simd_view_type_v<OffsetSimdViewT> &&
1020  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
1021 scatter(T *p, OffsetSimdViewT byte_offsets, ValuesSimdViewT vals,
1022  simd_mask<N / VS> mask, PropertyListT props = {}) {
1023  static_assert(N / VS ==
1024  OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
1025  "Size of vals parameter must correspond to the size of "
1026  "byte_offsets parameter.");
1027  scatter<T, N, VS>(p, byte_offsets.read(), vals.read(), mask, props);
1028 }
1029 
1052 template <
1053  int VS = 1, typename OffsetT, typename ValuesSimdViewT, typename T,
1054  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
1055  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
1056 __ESIMD_API std::enable_if_t<
1057  detail::is_simd_view_type_v<ValuesSimdViewT> &&
1058  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
1059 scatter(T *p, simd<OffsetT, N / VS> byte_offsets, ValuesSimdViewT vals,
1060  PropertyListT props = {}) {
1061  scatter<T, N, VS>(p, byte_offsets, vals.read(), props);
1062 }
1063 
1088 template <
1089  typename T, int N, int VS = 1, typename OffsetSimdViewT,
1090  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
1091 __ESIMD_API std::enable_if_t<
1092  detail::is_simd_view_type_v<OffsetSimdViewT> &&
1093  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
1094 scatter(T *p, OffsetSimdViewT byte_offsets, simd<T, N> vals,
1095  simd_mask<N / VS> mask, PropertyListT props = {}) {
1096  scatter<T, N, VS>(p, byte_offsets.read(), vals, mask, props);
1097 }
1098 
1123 template <
1124  int VS, typename OffsetSimdViewT, typename T, int N,
1125  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
1126 __ESIMD_API std::enable_if_t<
1127  detail::is_simd_view_type_v<OffsetSimdViewT> &&
1128  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
1129 scatter(T *p, OffsetSimdViewT byte_offsets, simd<T, N> vals,
1130  simd_mask<N / VS> mask, PropertyListT props = {}) {
1131  static_assert(N / VS ==
1132  OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
1133  "Size of vals parameter must correspond to the size of "
1134  "byte_offsets parameter.");
1135  scatter<T, N, VS>(p, byte_offsets.read(), vals, mask, props);
1136 }
1137 
1161 template <
1162  int VS, typename OffsetSimdViewT, typename T, int N,
1163  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
1164 __ESIMD_API std::enable_if_t<
1165  detail::is_simd_view_type_v<OffsetSimdViewT> &&
1166  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
1167 scatter(T *p, OffsetSimdViewT byte_offsets, simd<T, N> vals,
1168  PropertyListT props = {}) {
1169  static_assert(N / VS ==
1170  OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
1171  "Size of vals parameter must correspond to the size of "
1172  "byte_offsets parameter.");
1173  scatter<T, N, VS>(p, byte_offsets.read(), vals, props);
1174 }
1175 
1198 template <
1199  typename T, int N, int VS = 1, typename OffsetSimdViewT,
1200  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
1201 __ESIMD_API std::enable_if_t<
1202  detail::is_simd_view_type_v<OffsetSimdViewT> &&
1203  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
1204 scatter(T *p, OffsetSimdViewT byte_offsets, simd<T, N> vals,
1205  PropertyListT props = {}) {
1206  simd_mask<N / VS> Mask = 1;
1207  scatter<T, N, VS>(p, byte_offsets.read(), vals, Mask, props);
1208 }
1209 
1234 template <
1235  int VS = 1, typename OffsetSimdViewT, typename ValuesSimdViewT, typename T,
1236  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
1237  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
1238 __ESIMD_API std::enable_if_t<
1239  detail::is_simd_view_type_v<OffsetSimdViewT> &&
1240  detail::is_simd_view_type_v<ValuesSimdViewT> &&
1241  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
1242 scatter(T *p, OffsetSimdViewT byte_offsets, ValuesSimdViewT vals,
1243  PropertyListT props = {}) {
1244  static_assert(N / VS ==
1245  OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
1246  "Size of vals parameter must correspond to the size of "
1247  "byte_offsets parameter.");
1248  scatter<T, N, VS>(p, byte_offsets.read(), vals.read(), props);
1249 }
1250 
1262 template <typename Tx, int N, typename Toffset>
1263 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> && N == 1>
1264 scatter(Tx *p, Toffset offset, simd<Tx, N> vals, simd_mask<N> mask = 1) {
1265  scatter<Tx, N, 1>(p, simd<Toffset, N>(offset), vals, mask);
1266 }
1267 
1268 namespace detail {
// Accessors may get either 32-bit offset or 64-bit depending on
// the -fsycl-esimd-force-stateless-mem mode setting: the offset type is
// uint64_t when __ESIMD_FORCE_STATELESS_MEM is defined and uint32_t
// otherwise.
#ifdef __ESIMD_FORCE_STATELESS_MEM
using DeviceAccessorOffsetT = uint64_t;
#else
using DeviceAccessorOffsetT = uint32_t;
#endif
1276 
1308 template <typename T, int NElts, typename PropertyListT>
1309 __ESIMD_API std::enable_if_t<is_property_list_v<PropertyListT>, simd<T, NElts>>
1310 block_load_impl(const T *p, simd_mask<1> pred, simd<T, NElts> pass_thru) {
1311  // Verify input template arguments.
1312  check_cache_hints<cache_action::load, PropertyListT>();
1313  constexpr size_t Alignment =
1314  PropertyListT::template get_property<alignment_key>().value;
1315  static_assert(
1316  (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
1317  (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
1318  "Incorrect alignment for the data type");
1319 
1320  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
1321  constexpr int SmallIntFactor32Bit =
1322  sizeof(uint32_t) / sizeof(T) > 1 ? sizeof(uint32_t) / sizeof(T) : 1;
1323  static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
1324  "Number of elements is not supported by Transposed load");
1325 
1326  // If alignment >= 8 and (NElts * sizeof(T)) % 8 == 0) we can load QWORDs.
1327  // Don't do it for 4-byte vectors (unless it is greater than 256-bytes),
1328  // because it would require a bit-cast, which is supposed to be NO-OP, but
1329  // might confuse GPU BE sometimes. 1- and 2-byte vectors are casted anyways.
1330  constexpr bool Use64BitData =
1331  Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
1332  (NElts * sizeof(T)) % sizeof(uint64_t) == 0 &&
1333  (sizeof(T) != sizeof(uint32_t) || NElts * sizeof(T) > 256);
1334  constexpr int SmallIntFactor =
1335  Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
1336  constexpr int FactoredNElts = NElts / SmallIntFactor;
1337  check_lsc_vector_size<FactoredNElts>();
1338 
1339  // Prepare template arguments for the call of intrinsic.
1340  using LoadElemT = __ESIMD_DNS::__raw_t<
1341  std::conditional_t<SmallIntFactor == 1, T,
1342  std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
1343  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
1344  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
1345 
1346  constexpr uint16_t AddressScale = 1;
1347  constexpr int ImmOffset = 0;
1348  constexpr lsc_data_size ActualDS =
1349  Use64BitData ? lsc_data_size::u64 : lsc_data_size::u32;
1350  constexpr lsc_vector_size VS = to_lsc_vector_size<FactoredNElts>();
1351  constexpr auto Transposed = lsc_data_order::transpose;
1352  constexpr int N = 1;
1353 
1354  // Prepare non-template arguments and call the intrinsic.
1355  simd<uintptr_t, N> Addrs = reinterpret_cast<uintptr_t>(p);
1357  pass_thru.template bit_cast_view<LoadElemT>();
1359  __esimd_lsc_load_merge_stateless<LoadElemT, L1H, L2H, AddressScale,
1360  ImmOffset, ActualDS, VS, Transposed, N>(
1361  pred.data(), Addrs.data(), PassThru.data());
1362  return Result.template bit_cast_view<T>();
1363 }
1364 
1397 template <typename T, int NElts, typename PropertyListT, typename AccessorT>
1398 __ESIMD_API
1399  std::enable_if_t<detail::is_device_accessor_with_v<
1400  AccessorT, detail::accessor_mode_cap::can_read> &&
1401  is_property_list_v<PropertyListT>,
1404  simd_mask<1> pred) {
1405 #ifdef __ESIMD_FORCE_STATELESS_MEM
1406  simd<T, NElts> PassThru; // Intentionally undefined.
1407  return block_load_impl<T, NElts, PropertyListT>(
1408  accessorToPointer<T>(acc, offset), pred, PassThru);
1409 #else // !__ESIMD_FORCE_STATELESS_MEM
1410  // Verify input template arguments.
1411  check_cache_hints<cache_action::load, PropertyListT>();
1412  constexpr size_t Alignment =
1413  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
1414  static_assert(
1415  (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
1416  (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
1417  "Incorrect alignment for the data type");
1418 
1419  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
1420  constexpr int SmallIntFactor32Bit =
1421  sizeof(uint32_t) / sizeof(T) > 1 ? sizeof(uint32_t) / sizeof(T) : 1;
1422  static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
1423  "Number of elements is not supported by Transposed load");
1424 
1425  // If alignment >= 8 and (NElts * sizeof(T)) % 8 == 0) we can load QWORDs.
1426  // Don't do it for 4-byte vectors (unless it is greater than 256-bytes),
1427  // because it would require a bit-cast, which is supposed to be NO-OP, but
1428  // might confuse GPU BE sometimes. 1- and 2-byte vectors are casted anyways.
1429  constexpr bool Use64BitData =
1430  Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
1431  (NElts * sizeof(T)) % sizeof(uint64_t) == 0 &&
1432  (sizeof(T) != sizeof(uint32_t) || NElts * sizeof(T) > 256);
1433  constexpr int SmallIntFactor =
1434  Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
1435  constexpr int FactoredNElts = NElts / SmallIntFactor;
1436  check_lsc_vector_size<FactoredNElts>();
1437 
1438  // Prepare template arguments for the call of intrinsic.
1439  using LoadElemT = __ESIMD_DNS::__raw_t<
1440  std::conditional_t<SmallIntFactor == 1, T,
1441  std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
1442  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
1443  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
1444  constexpr uint16_t AddressScale = 1;
1445  constexpr int ImmOffset = 0;
1446  constexpr lsc_data_size ActualDS =
1447  Use64BitData ? lsc_data_size::u64 : lsc_data_size::u32;
1448  constexpr auto VS = to_lsc_vector_size<FactoredNElts>();
1449  constexpr auto Transposed = lsc_data_order::transpose;
1450  constexpr int N = 1;
1451 
1452  // Prepare non-template arguments and call the intrinsic.
1453  simd<uint32_t, N> Offsets = offset;
1454  auto SI = get_surface_index(acc);
1456  __esimd_lsc_load_bti<LoadElemT, L1H, L2H, AddressScale, ImmOffset,
1457  ActualDS, VS, Transposed, N>(pred.data(),
1458  Offsets.data(), SI);
1459  return Result.template bit_cast_view<T>();
1460 #endif // !__ESIMD_FORCE_STATELESS_MEM
1461 }
1462 
1496 template <typename T, int NElts, typename PropertyListT, typename AccessorT>
1497 __ESIMD_API
1498  std::enable_if_t<detail::is_device_accessor_with_v<
1499  AccessorT, detail::accessor_mode_cap::can_read> &&
1500  is_property_list_v<PropertyListT>,
1503  simd_mask<1> pred, simd<T, NElts> pass_thru) {
1504 #ifdef __ESIMD_FORCE_STATELESS_MEM
1505  return block_load_impl<T, NElts, PropertyListT>(
1506  accessorToPointer<T>(acc, offset), pred, pass_thru);
1507 #else // !__ESIMD_FORCE_STATELESS_MEM
1508  // Verify input template arguments.
1509  check_cache_hints<cache_action::load, PropertyListT>();
1510  constexpr size_t Alignment =
1511  PropertyListT::template get_property<alignment_key>().value;
1512  static_assert(
1513  (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
1514  (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
1515  "Incorrect alignment for the data type");
1516 
1517  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
1518  constexpr int SmallIntFactor32Bit =
1519  sizeof(uint32_t) / sizeof(T) > 1 ? sizeof(uint32_t) / sizeof(T) : 1;
1520  static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
1521  "Number of elements is not supported by Transposed load");
1522 
1523  // If alignment >= 8 and (NElts * sizeof(T)) % 8 == 0) we can load QWORDs.
1524  // Don't do it for 4-byte vectors (unless it is greater than 256-bytes),
1525  // because it would require a bit-cast, which is supposed to be NO-OP, but
1526  // might confuse GPU BE sometimes. 1- and 2-byte vectors are casted anyways.
1527  constexpr bool Use64BitData =
1528  Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
1529  (NElts * sizeof(T)) % sizeof(uint64_t) == 0 &&
1530  (sizeof(T) != sizeof(uint32_t) || NElts * sizeof(T) > 256);
1531  constexpr int SmallIntFactor =
1532  Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
1533  constexpr int FactoredNElts = NElts / SmallIntFactor;
1534  check_lsc_vector_size<FactoredNElts>();
1535 
1536  // Prepare template arguments for the call of intrinsic.
1537  using LoadElemT = __ESIMD_DNS::__raw_t<
1538  std::conditional_t<SmallIntFactor == 1, T,
1539  std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
1540  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
1541  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
1542  constexpr uint16_t AddressScale = 1;
1543  constexpr int ImmOffset = 0;
1544  constexpr lsc_data_size ActualDS =
1545  Use64BitData ? lsc_data_size::u64 : lsc_data_size::u32;
1546  constexpr auto VS = to_lsc_vector_size<FactoredNElts>();
1547  constexpr auto Transposed = lsc_data_order::transpose;
1548  constexpr int N = 1;
1549 
1550  // Prepare non-template arguments and call the intrinsic.
1551  simd<uint32_t, N> Offsets = offset;
1552  auto SI = get_surface_index(acc);
1554  pass_thru.template bit_cast_view<LoadElemT>();
1556  __esimd_lsc_load_merge_bti<LoadElemT, L1H, L2H, AddressScale, ImmOffset,
1557  ActualDS, VS, Transposed, N>(
1558  pred.data(), Offsets.data(), SI, PassThru.data());
1559  return Result.template bit_cast_view<T>();
1560 #endif // !__ESIMD_FORCE_STATELESS_MEM
1561 }
1562 
1563 template <typename T, int NElts, typename PropertyListT>
1564 __ESIMD_API std::enable_if_t<detail::is_property_list_v<PropertyListT>>
1566  detail::check_cache_hints<cache_action::store, PropertyListT>();
1567  constexpr size_t Alignment =
1568  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
1569  static_assert(
1570  (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
1571  (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
1572  "Incorrect alignment for the data type");
1573 
1574  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
1575  constexpr int SmallIntFactor32Bit =
1576  sizeof(uint32_t) / sizeof(T) > 1 ? sizeof(uint32_t) / sizeof(T) : 1;
1577  static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
1578  "Number of elements is not supported by Transposed store");
1579 
1580  // If alignment >= 8 and (NElts * sizeof(T)) % 8 == 0) we can store QWORDs.
1581  // Don't do it for 4-byte vectors (unless it is greater than 256-bytes),
1582  // because it would require a bit-cast, which is supposed to be NO-OP, but
1583  // might confuse GPU BE sometimes. 1- and 2-byte vectors are casted anyways.
1584  constexpr bool Use64BitData =
1585  Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
1586  (NElts * sizeof(T)) % sizeof(uint64_t) == 0 &&
1587  (sizeof(T) != sizeof(uint32_t) || NElts * sizeof(T) > 256);
1588 
1589  constexpr int SmallIntFactor =
1590  Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
1591  constexpr int FactoredNElts = NElts / SmallIntFactor;
1592 
1593  check_lsc_vector_size<FactoredNElts>();
1594 
1595  using StoreType = __ESIMD_DNS::__raw_t<
1596  std::conditional_t<SmallIntFactor == 1, T,
1597  std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
1598  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
1599  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
1600  constexpr uint16_t AddressScale = 1;
1601  constexpr int ImmOffset = 0;
1602  constexpr lsc_data_size ActualDS =
1603  Use64BitData ? lsc_data_size::u64 : lsc_data_size::u32;
1604  constexpr lsc_vector_size VS = to_lsc_vector_size<FactoredNElts>();
1605  constexpr auto Transposed = lsc_data_order::transpose;
1606  constexpr int N = 1;
1607  simd<uintptr_t, N> Addrs = reinterpret_cast<uintptr_t>(p);
1608 
1609  __esimd_lsc_store_stateless<StoreType, L1H, L2H, AddressScale, ImmOffset,
1610  ActualDS, VS, Transposed, N>(
1611  pred.data(), Addrs.data(),
1612  sycl::bit_cast<__ESIMD_DNS::vector_type_t<StoreType, FactoredNElts>>(
1613  vals.data()));
1614 }
1615 
1616 template <typename T, int NElts, typename PropertyListT, typename AccessorT>
1617 __ESIMD_API
1618  std::enable_if_t<detail::is_device_accessor_with_v<
1619  AccessorT, detail::accessor_mode_cap::can_write> &&
1620  detail::is_property_list_v<PropertyListT>>
1622  simd<T, NElts> vals, simd_mask<1> pred) {
1623 #ifdef __ESIMD_FORCE_STATELESS_MEM
1624  block_store_impl<T, NElts, PropertyListT>(accessorToPointer<T>(acc, offset),
1625  vals, pred);
1626 #else
1627  // Verify input template arguments.
1628  check_cache_hints<cache_action::store, PropertyListT>();
1629  constexpr size_t Alignment =
1630  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
1631  static_assert(
1632  (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
1633  (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
1634  "Incorrect alignment for the data type");
1635 
1636  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
1637  constexpr int SmallIntFactor32Bit =
1638  sizeof(uint32_t) / sizeof(T) > static_cast<size_t>(1)
1639  ? sizeof(uint32_t) / sizeof(T)
1640  : static_cast<size_t>(1);
1641  static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
1642  "Number of elements is not supported by Transposed store");
1643 
1644  // If alignment >= 8 and (NElts * sizeof(T)) % 8 == 0) we can store QWORDs.
1645  // Don't do it for 4-byte vectors (unless it is greater than 256-bytes),
1646  // because it would require a bit-cast, which is supposed to be NO-OP, but
1647  // might confuse GPU BE sometimes. 1- and 2-byte vectors are casted anyways.
1648  constexpr bool Use64BitData =
1649  Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
1650  (NElts * sizeof(T)) % sizeof(uint64_t) == 0 &&
1651  (sizeof(T) != sizeof(uint32_t) || NElts * sizeof(T) > 256);
1652  constexpr int SmallIntFactor =
1653  Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
1654  constexpr int FactoredNElts = NElts / SmallIntFactor;
1655  check_lsc_vector_size<FactoredNElts>();
1656 
1657  // Prepare template arguments for the call of intrinsic.
1658  using StoreElemT = __ESIMD_DNS::__raw_t<
1659  std::conditional_t<SmallIntFactor == 1, T,
1660  std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
1661  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
1662  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
1663  constexpr uint16_t AddressScale = 1;
1664  constexpr int ImmOffset = 0;
1665  constexpr lsc_data_size ActualDS =
1666  Use64BitData ? lsc_data_size::u64 : lsc_data_size::u32;
1667  constexpr auto VS = to_lsc_vector_size<FactoredNElts>();
1668  constexpr auto Transposed = lsc_data_order::transpose;
1669  constexpr int N = 1;
1670 
1671  // Prepare non-template arguments and call the intrinsic.
1672  simd<uint32_t, N> Offsets = offset;
1673  auto SI = get_surface_index(acc);
1674 
1675  __esimd_lsc_store_bti<StoreElemT, L1H, L2H, AddressScale, ImmOffset, ActualDS,
1676  VS, Transposed, N>(
1677  pred.data(), Offsets.data(),
1678  sycl::bit_cast<__ESIMD_DNS::vector_type_t<StoreElemT, FactoredNElts>>(
1679  vals.data()),
1680  SI);
1681 #endif
1682 }
1683 
1684 } // namespace detail
1685 
// Stores the vector `vals` at address `addr` by calling __esimd_svm_block_st.
// The alignment passed to the intrinsic is taken from the `Flags` tag type
// (Flags::alignment<simd<T, N>>), where T is the raw representation of Tx.
// The third (unnamed) parameter only carries the tag type.
// NOTE(review): the template parameter list looks truncated in this listing
// (the line that declares `Flags` is absent) — restore it from the original
// header; its default argument cannot be determined from here.
1700 template <typename Tx, int N,
1702 __ESIMD_API std::enable_if_t<is_simd_flag_type_v<Flags>>
1703 block_store(Tx *addr, simd<Tx, N> vals, Flags) {
1704  using T = typename detail::__raw_t<Tx>;
1705  using VecT = typename simd<T, N>::raw_vector_type;
1706  constexpr size_t Align = Flags::template alignment<simd<T, N>>;
// The destination is reinterpreted as a pointer to the whole raw vector.
1707  __esimd_svm_block_st<T, N, Align>(reinterpret_cast<VecT *>(addr),
1708  vals.data());
1709 }
1710 
1713 
1724 
1728 
1733 
1739 
1773 template <
1774  typename T, int N,
1775  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
1776 __ESIMD_API std::enable_if_t<
1777  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
1778 block_load(const T *ptr, PropertyListT props = {}) {
1779  constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
1780  using NewPropertyListT =
1781  detail::add_alignment_property_t<PropertyListT, DefaultAlignment>;
1782  if constexpr (detail::has_cache_hints<PropertyListT>()) {
1783  simd<T, N> PassThru; // Intentionally undefined.
1784  simd_mask<1> Mask = 1;
1785  return detail::block_load_impl<T, N, NewPropertyListT>(ptr, Mask, PassThru);
1786  } else {
1787  constexpr size_t Alignment =
1788  NewPropertyListT::template get_property<alignment_key>().value;
1789  return block_load<T, N>(ptr, overaligned_tag<Alignment>{});
1790  }
1791 }
1792 
1828 template <
1829  typename T, int N,
1830  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
1831 __ESIMD_API std::enable_if_t<
1832  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
1833 block_load(const T *ptr, size_t byte_offset, PropertyListT props = {}) {
1834  const T *AdjustedPtr = reinterpret_cast<const T *>(
1835  reinterpret_cast<const int8_t *>(ptr) + byte_offset);
1836  return block_load<T, N>(AdjustedPtr, props);
1837 }
1838 
1871 template <
1872  typename T, int N,
1873  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
1874 __ESIMD_API
1875  std::enable_if_t<detail::is_property_list_v<PropertyListT>, simd<T, N>>
1876  block_load(const T *ptr, simd_mask<1> pred, PropertyListT props = {}) {
1877  constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
1878  using NewPropertyListT =
1879  detail::add_alignment_property_t<PropertyListT, DefaultAlignment>;
1880  simd<T, N> PassThru; // Intentionally uninitialized.
1881  return detail::block_load_impl<T, N, NewPropertyListT>(ptr, pred, PassThru);
1882 }
1883 
1917 template <
1918  typename T, int N,
1919  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
1920 __ESIMD_API std::enable_if_t<
1921  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
1922 block_load(const T *ptr, size_t byte_offset, simd_mask<1> pred,
1923  PropertyListT props = {}) {
1924  const T *AdjustedPtr = reinterpret_cast<const T *>(
1925  reinterpret_cast<const int8_t *>(ptr) + byte_offset);
1926  return block_load<T, N>(AdjustedPtr, pred, props);
1927 }
1928 
1961 template <
1962  typename T, int N,
1963  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
1964 __ESIMD_API std::enable_if_t<
1965  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
1966 block_load(const T *ptr, simd_mask<1> pred, simd<T, N> pass_thru,
1967  PropertyListT props = {}) {
1968  constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
1969  using NewPropertyListT =
1970  detail::add_alignment_property_t<PropertyListT, DefaultAlignment>;
1971  return detail::block_load_impl<T, N, NewPropertyListT>(ptr, pred, pass_thru);
1972 }
1973 
2008 template <
2009  typename PassThruSimdViewT, typename T,
2010  int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(),
2011  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
2012 __ESIMD_API std::enable_if_t<
2013  detail::is_simd_view_type_v<PassThruSimdViewT> &&
2014  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
2015  simd<T, N>>
2016 block_load(const T *ptr, simd_mask<1> pred, PassThruSimdViewT pass_thru,
2017  PropertyListT props = {}) {
2018  return block_load<T, N>(ptr, pred, pass_thru.read(), props);
2019 }
2020 
2055 template <
2056  typename T, int N,
2057  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
2058 __ESIMD_API std::enable_if_t<
2059  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
2060 block_load(const T *ptr, size_t byte_offset, simd_mask<1> pred,
2061  simd<T, N> pass_thru, PropertyListT props = {}) {
2062  const T *AdjustedPtr = reinterpret_cast<const T *>(
2063  reinterpret_cast<const int8_t *>(ptr) + byte_offset);
2064  return block_load<T, N>(AdjustedPtr, pred, pass_thru, props);
2065 }
2066 
2103 template <
2104  typename PassThruSimdViewT, typename T,
2105  int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(),
2106  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
2107 __ESIMD_API std::enable_if_t<
2108  detail::is_simd_view_type_v<PassThruSimdViewT> &&
2109  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
2110  simd<T, N>>
2111 block_load(const T *ptr, size_t byte_offset, simd_mask<1> pred,
2112  PassThruSimdViewT pass_thru, PropertyListT props = {}) {
2113  return block_load<T, N>(ptr, byte_offset, pred, pass_thru.read(), props);
2114 }
2115 
// Loads a simd<Tx, N> from address `addr` by calling __esimd_svm_block_ld.
// The alignment passed to the intrinsic is taken from the `Flags` tag type
// (Flags::alignment<simd<T, N>>), where T is the raw representation of Tx.
// The second (unnamed) parameter only carries the tag type.
// NOTE(review): the template parameter list looks truncated in this listing
// (the line that declares `Flags` is absent) — restore it from the original
// header; its default argument cannot be determined from here.
2131 template <typename Tx, int N,
2133 __ESIMD_API std::enable_if_t<is_simd_flag_type_v<Flags>, simd<Tx, N>>
2134 block_load(const Tx *addr, Flags) {
2135  using T = typename detail::__raw_t<Tx>;
2136  using VecT = typename simd<T, N>::raw_vector_type;
2137  constexpr size_t Align = Flags::template alignment<simd<T, N>>;
// The source is reinterpreted as a pointer to the whole raw vector.
2138  return __esimd_svm_block_ld<T, N, Align>(
2139  reinterpret_cast<const VecT *>(addr));
2140 }
2141 
2157 template <typename Tx, int N, typename AccessorTy,
2158  typename Flags = vector_aligned_tag,
2159  typename = std::enable_if_t<
2160  is_simd_flag_type_v<Flags> &&
2161  detail::is_device_accessor_with_v<
2162  AccessorTy, detail::accessor_mode_cap::can_read>>,
2163  class T = detail::__raw_t<Tx>>
2164 __ESIMD_API simd<Tx, N> block_load(AccessorTy acc,
2165  detail::DeviceAccessorOffsetT byte_offset,
2166  Flags flags) {
2167 #ifdef __ESIMD_FORCE_STATELESS_MEM
2168  return block_load<Tx, N>(__ESIMD_DNS::accessorToPointer<Tx>(acc, byte_offset),
2169  flags);
2170 #else
2171  std::ignore = flags;
2172  constexpr unsigned Sz = sizeof(T) * N;
2173  static_assert(Sz >= detail::OperandSize::OWORD,
2174  "block size must be at least 1 oword");
2175  static_assert(Sz % detail::OperandSize::OWORD == 0,
2176  "block size must be whole number of owords");
2177  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
2178  "block must be 1, 2, 4 or 8 owords long");
2179  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
2180  "block size must be at most 8 owords");
2181 
2182  auto surf_ind = __esimd_get_surface_index(
2183  detail::AccessorPrivateProxy::getQualifiedPtrOrImageObj(acc));
2184 
2185  if constexpr (Flags::template alignment<simd<T, N>> >=
2186  detail::OperandSize::OWORD) {
2187  return __esimd_oword_ld<T, N>(surf_ind, byte_offset >> 4);
2188  } else {
2189  return __esimd_oword_ld_unaligned<T, N>(surf_ind, byte_offset);
2190  }
2191 #endif
2192 }
2193 
2203 
2207 
2214 
2220 
2258 template <
2259  typename T, int N, typename AccessorT,
2260  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
2261 __ESIMD_API std::enable_if_t<
2262  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
2263  detail::is_device_accessor_with_v<AccessorT,
2264  detail::accessor_mode_cap::can_read>,
2265  simd<T, N>>
2266 block_load(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset,
2267  PropertyListT props = {}) {
2268 #ifdef __ESIMD_FORCE_STATELESS_MEM
2269  return block_load<T, N>(detail::accessorToPointer<T>(acc, byte_offset),
2270  props);
2271 #else // !__ESIMD_FORCE_STATELESS_MEM
2272  // If the alignment property is not passed, then assume the pointer
2273  // is element-aligned.
2274  constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
2275  constexpr size_t Alignment =
2276  detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
2277 
2278  // Legacy surface index loads must be 1, 2, 4 or 8 owords long.
2279  constexpr size_t Size = sizeof(T) * N;
2280  constexpr size_t OWord = detail::OperandSize::OWORD;
2281  constexpr bool IsLegacySize = Size == OWord || Size == 2 * OWord ||
2282  Size == 4 * OWord || Size == 8 * OWord;
2283 
2284  using NewPropertyListT =
2285  detail::add_alignment_property_t<PropertyListT, DefaultAlignment>;
2286  if constexpr (detail::has_cache_hints<PropertyListT>() || !IsLegacySize) {
2287  return detail::block_load_impl<T, N, NewPropertyListT>(acc, byte_offset,
2288  simd_mask<1>(1));
2289  } else {
2290  constexpr size_t Alignment =
2291  NewPropertyListT::template get_property<alignment_key>().value;
2292  return block_load<T, N>(acc, byte_offset, overaligned_tag<Alignment>{});
2293  }
2294 #endif // !__ESIMD_FORCE_STATELESS_MEM
2295 }
2296 
2326 template <
2327  typename T, int N, typename AccessorT,
2328  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
2329 __ESIMD_API std::enable_if_t<
2330  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
2331  detail::is_device_accessor_with_v<AccessorT,
2332  detail::accessor_mode_cap::can_read>,
2333  simd<T, N>>
2334 block_load(AccessorT acc, PropertyListT /* props */ = {}) {
2335  // Create new properties without the alignment property passed in 'props',
2336  // and add alignment<16> as it is usable and most favourable in this case.
2337  using NewPropertyListT =
2338  detail::add_or_replace_alignment_property_t<PropertyListT, 16>;
2339  return block_load<T, N>(acc, 0, NewPropertyListT{});
2340 }
2341 
2373 template <
2374  typename T, int N, typename AccessorT,
2375  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
2376 __ESIMD_API std::enable_if_t<
2377  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
2378  detail::is_device_accessor_with_v<AccessorT,
2379  detail::accessor_mode_cap::can_read>,
2380  simd<T, N>>
2381 block_load(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset,
2382  simd_mask<1> pred, simd<T, N> pass_thru,
2383  PropertyListT /* props */ = {}) {
2384  // If the alignment property is not passed, then assume the byte_offset
2385  // is element-aligned and is at least 4-bytes.
2386  constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
2387  using NewPropertyListT =
2388  detail::add_alignment_property_t<PropertyListT, DefaultAlignment>;
2389  return detail::block_load_impl<T, N, NewPropertyListT>(acc, byte_offset, pred,
2390  pass_thru);
2391 }
2392 
2426 template <
2427  typename PassThruSimdViewT,
2428  typename T = PassThruSimdViewT::value_type::element_type,
2429  int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(),
2430  typename AccessorT,
2431  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
2432 __ESIMD_API std::enable_if_t<
2433  detail::is_simd_view_type_v<PassThruSimdViewT> &&
2434  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
2435  detail::is_device_accessor_with_v<AccessorT,
2436  detail::accessor_mode_cap::can_read>,
2437  simd<T, N>>
2438 block_load(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset,
2439  simd_mask<1> pred, PassThruSimdViewT pass_thru,
2440  PropertyListT props = {}) {
2441  return block_load<T, N>(acc, byte_offset, pred, pass_thru.read(), props);
2442 }
2443 
2475 template <
2476  typename T, int N, typename AccessorT,
2477  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
2478 __ESIMD_API std::enable_if_t<
2479  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
2480  detail::is_device_accessor_with_v<AccessorT,
2481  detail::accessor_mode_cap::can_read>,
2482  simd<T, N>>
2483 block_load(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset,
2484  simd_mask<1> pred, PropertyListT props = {}) {
2485  simd<T, N> PassThru; // Intentionally uninitialized.
2486  return block_load<T, N>(acc, byte_offset, pred, PassThru, props);
2487 }
2488 
2516 template <
2517  typename T, int N, typename AccessorT,
2518  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
2519 __ESIMD_API std::enable_if_t<
2520  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
2521  detail::is_device_accessor_with_v<AccessorT,
2522  detail::accessor_mode_cap::can_read>,
2523  simd<T, N>>
2524 block_load(AccessorT acc, simd_mask<1> pred, simd<T, N> pass_thru,
2525  PropertyListT /* props */ = {}) {
2526  // Create new properties without the alignment property passed in 'props',
2527  // and add alignment<16> as it is usable and most favourable in this case.
2528  using NewPropertyListT =
2529  detail::add_or_replace_alignment_property_t<PropertyListT, 16>;
2530  return block_load<T, N>(acc, 0, pred, pass_thru, NewPropertyListT{});
2531 }
2532 
2563 template <
2564  typename PassThruSimdViewT,
2565  typename T = PassThruSimdViewT::value_type::element_type,
2566  int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(),
2567  typename AccessorT,
2568  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
2569 __ESIMD_API std::enable_if_t<
2570  detail::is_simd_view_type_v<PassThruSimdViewT> &&
2571  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
2572  detail::is_device_accessor_with_v<AccessorT,
2573  detail::accessor_mode_cap::can_read>,
2574  simd<T, N>>
2575 block_load(AccessorT acc, simd_mask<1> pred, PassThruSimdViewT pass_thru,
2576  PropertyListT props = {}) {
2577  return block_load<T, N>(acc, pred, pass_thru.read(), props);
2578 }
2579 
2606 template <
2607  typename T, int N, typename AccessorT,
2608  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
2609 __ESIMD_API std::enable_if_t<
2610  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
2611  detail::is_device_accessor_with_v<AccessorT,
2612  detail::accessor_mode_cap::can_read>,
2613  simd<T, N>>
2614 block_load(AccessorT acc, simd_mask<1> pred, PropertyListT /* props */ = {}) {
2615  // Create new properties without the alignment property passed in 'props',
2616  // and add alignment<16> as it is usable and most favourable in this case.
2617  using NewPropertyListT =
2618  detail::add_or_replace_alignment_property_t<PropertyListT, 16>;
2619  simd<T, N> PassThru; // Intentionally uninitialized.
2620  return block_load<T, N>(acc, 0, pred, PassThru, NewPropertyListT{});
2621 }
2622 
2636 
2639 
2676 template <
2677  typename T, int N,
2678  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
2679 __ESIMD_API std::enable_if_t<detail::is_property_list_v<PropertyListT>>
2680 block_store(T *ptr, simd<T, N> vals, PropertyListT /* props */ = {}) {
2681  if constexpr (detail::has_cache_hints<PropertyListT>()) {
2682  constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
2683  using NewPropertyListT =
2684  detail::add_alignment_property_t<PropertyListT, DefaultAlignment>;
2685  simd_mask<1> Mask = 1;
2686  detail::block_store_impl<T, N, NewPropertyListT>(ptr, vals, Mask);
2687  } else {
2688  // If the alignment property is not passed, then assume the pointer
2689  // is OWORD-aligned.
2690  constexpr size_t Alignment =
2691  detail::getPropertyValue<PropertyListT, alignment_key>(
2692  detail::OperandSize::OWORD);
2693  block_store<T, N>(ptr, vals, overaligned_tag<Alignment>{});
2694  }
2695 }
2696 
2731 template <
2732  typename T, int N,
2733  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
2734 __ESIMD_API std::enable_if_t<
2735  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
2736 block_store(T *ptr, size_t byte_offset, simd<T, N> vals,
2737  PropertyListT props = {}) {
2738  T *AdjustedPtr =
2739  reinterpret_cast<T *>(reinterpret_cast<int8_t *>(ptr) + byte_offset);
2740  block_store<T, N>(AdjustedPtr, vals, props);
2741 }
2742 
2775 template <
2776  typename T, int N,
2777  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
2778 __ESIMD_API std::enable_if_t<detail::is_property_list_v<PropertyListT>>
2780  PropertyListT /* props */ = {}) {
2781  constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
2782  using NewPropertyListT =
2783  detail::add_alignment_property_t<PropertyListT, DefaultAlignment>;
2784  detail::block_store_impl<T, N, NewPropertyListT>(ptr, vals, pred);
2785 }
2786 
2807 // the minimally required element-size alignment otherwise.
2823 template <
2824  typename T, int N,
2825  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
2826 __ESIMD_API std::enable_if_t<
2827  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
2828 block_store(T *ptr, size_t byte_offset, simd<T, N> vals, simd_mask<1> pred,
2829  PropertyListT props = {}) {
2830  T *AdjustedPtr =
2831  reinterpret_cast<T *>(reinterpret_cast<int8_t *>(ptr) + byte_offset);
2832  block_store<T, N>(AdjustedPtr, vals, pred, props);
2833 }
2834 
2870 template <
2871  typename ValuesSimdViewT, typename T,
2872  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
2873  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
2874 __ESIMD_API std::enable_if_t<detail::is_simd_view_type_v<ValuesSimdViewT> &&
2875  detail::is_property_list_v<PropertyListT>>
2876 block_store(T *ptr, ValuesSimdViewT vals, PropertyListT props = {}) {
2877  block_store<T, N>(ptr, vals.read(), props);
2878 }
2879 
2916 template <
2917  typename ValuesSimdViewT, typename T,
2918  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
2919  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
2920 __ESIMD_API std::enable_if_t<
2921  detail::is_simd_view_type_v<ValuesSimdViewT> &&
2922  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
2923 block_store(T *ptr, size_t byte_offset, ValuesSimdViewT vals,
2924  PropertyListT props = {}) {
2925  block_store<T, N>(ptr, byte_offset, vals.read(), props);
2926 }
2927 
2962 template <
2963  typename ValuesSimdViewT, typename T,
2964  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
2965  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
2966 __ESIMD_API std::enable_if_t<detail::is_simd_view_type_v<ValuesSimdViewT> &&
2967  detail::is_property_list_v<PropertyListT>>
2968 block_store(T *ptr, ValuesSimdViewT vals, simd_mask<1> pred,
2969  PropertyListT props = {}) {
2970  block_store<T, N>(ptr, vals.read(), pred, props);
2971 }
2972 
2995 // the minimally required element-size alignment otherwise.
3011 template <
3012  typename ValuesSimdViewT, typename T,
3013  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
3014  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
3015 __ESIMD_API std::enable_if_t<
3016  detail::is_simd_view_type_v<ValuesSimdViewT> &&
3017  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
3018 block_store(T *ptr, size_t byte_offset, ValuesSimdViewT vals, simd_mask<1> pred,
3019  PropertyListT props = {}) {
3020  block_store<T, N>(ptr, byte_offset, vals.read(), pred, props);
3021 }
3022 
3031 
3034 
3038 
3041 
3085 template <
3086  typename T, int N, typename AccessorT,
3087  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
3088 __ESIMD_API std::enable_if_t<
3089  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
3090  detail::is_device_accessor_with_v<AccessorT,
3091  detail::accessor_mode_cap::can_write>>
3092 block_store(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset,
3093  simd<T, N> vals, PropertyListT props = {}) {
3094 #ifdef __ESIMD_FORCE_STATELESS_MEM
3095  block_store<T, N>(detail::accessorToPointer<T>(acc, byte_offset), vals,
3096  props);
3097 #else
3098  constexpr int DefaultLSCAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
3099  constexpr size_t Alignment =
3100  detail::getPropertyValue<PropertyListT, alignment_key>(
3101  DefaultLSCAlignment);
3102  constexpr bool AlignmentRequiresLSC =
3103  PropertyListT::template has_property<alignment_key>() && Alignment < 16;
3104  using Tx = detail::__raw_t<T>;
3105  constexpr unsigned Sz = sizeof(Tx) * N;
3106  constexpr bool SzRequiresLSC =
3107  Sz < detail::OperandSize::OWORD || Sz % detail::OperandSize::OWORD != 0 ||
3108  !detail::isPowerOf2(Sz / detail::OperandSize::OWORD) ||
3109  Sz > 8 * detail::OperandSize::OWORD;
3110  if constexpr (detail::has_cache_hints<PropertyListT>() ||
3111  AlignmentRequiresLSC || SzRequiresLSC) {
3112  using NewPropertyListT =
3113  detail::add_alignment_property_t<PropertyListT, DefaultLSCAlignment>;
3114  simd_mask<1> Mask = 1;
3115  detail::block_store_impl<T, N, NewPropertyListT>(acc, byte_offset, vals,
3116  Mask);
3117  } else {
3118  auto surf_ind = __esimd_get_surface_index(
3119  detail::AccessorPrivateProxy::getQualifiedPtrOrImageObj(acc));
3120  __esimd_oword_st<Tx, N>(surf_ind, byte_offset >> 4, vals.data());
3121  }
3122 #endif
3123 }
3124 
3154 template <
3155  typename T, int N, typename AccessorT,
3156  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
3157 __ESIMD_API std::enable_if_t<
3158  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
3159  detail::is_device_accessor_with_v<AccessorT,
3160  detail::accessor_mode_cap::can_write>>
3161 block_store(AccessorT acc, simd<T, N> vals, PropertyListT props = {}) {
3162  // Create new properties without the alignment property passed in 'props',
3163  // and add alignment<16> as it is usable and most favourable in this case.
3164  using NewPropertyListT =
3165  detail::add_or_replace_alignment_property_t<PropertyListT, 16>;
3166  block_store<T, N>(acc, 0, vals, NewPropertyListT{});
3167 }
3168 
3200 template <
3201  typename T, int N, typename AccessorT,
3202  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
3203 __ESIMD_API std::enable_if_t<
3204  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
3205  detail::is_device_accessor_with_v<AccessorT,
3206  detail::accessor_mode_cap::can_write>>
3207 block_store(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset,
3208  simd<T, N> vals, simd_mask<1> pred, PropertyListT props = {}) {
3209  constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
3210  using NewPropertyListT =
3211  detail::add_alignment_property_t<PropertyListT, DefaultAlignment>;
3212  detail::block_store_impl<T, N, NewPropertyListT>(acc, byte_offset, vals,
3213  pred);
3214 }
3215 
3240 template <
3241  typename T, int N, typename AccessorT,
3242  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
3243 __ESIMD_API std::enable_if_t<
3244  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
3245  detail::is_device_accessor_with_v<AccessorT,
3246  detail::accessor_mode_cap::can_write>>
3247 block_store(AccessorT acc, simd<T, N> vals, simd_mask<1> pred,
3248  PropertyListT props = {}) {
3249  // Create new properties without the alignment property passed in 'props',
3250  // and add alignment<16> as it is usable and most favourable in this case.
3251  using NewPropertyListT =
3252  detail::add_or_replace_alignment_property_t<PropertyListT, 16>;
3253  block_store<T, N>(acc, 0, vals, pred, NewPropertyListT{});
3254 }
3255 
3301 template <
3302  typename ValuesSimdViewT,
3303  typename T = ValuesSimdViewT::value_type::element_type,
3304  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
3305  typename AccessorT,
3306  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
3307 __ESIMD_API std::enable_if_t<
3308  detail::is_simd_view_type_v<ValuesSimdViewT> &&
3309  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
3310  detail::is_device_accessor_with_v<AccessorT,
3311  detail::accessor_mode_cap::can_write>>
3312 block_store(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset,
3313  ValuesSimdViewT vals, PropertyListT props = {}) {
3314  block_store<T, N>(acc, byte_offset, vals.read(), props);
3315 }
3316 
3348 template <
3349  typename ValuesSimdViewT,
3350  typename T = ValuesSimdViewT::value_type::element_type,
3351  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
3352  typename AccessorT,
3353  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
3354 __ESIMD_API std::enable_if_t<
3355  detail::is_simd_view_type_v<ValuesSimdViewT> &&
3356  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
3357  detail::is_device_accessor_with_v<AccessorT,
3358  detail::accessor_mode_cap::can_write>>
3359 block_store(AccessorT acc, ValuesSimdViewT vals, PropertyListT props = {}) {
3360  block_store<T, N>(acc, vals.read(), props);
3361 }
3362 
3396 template <
3397  typename ValuesSimdViewT,
3398  typename T = ValuesSimdViewT::value_type::element_type,
3399  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
3400  typename AccessorT,
3401  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
3402 __ESIMD_API std::enable_if_t<
3403  detail::is_simd_view_type_v<ValuesSimdViewT> &&
3404  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
3405  detail::is_device_accessor_with_v<AccessorT,
3406  detail::accessor_mode_cap::can_write>>
3407 block_store(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset,
3408  ValuesSimdViewT vals, simd_mask<1> pred, PropertyListT props = {}) {
3409  block_store<T, N>(acc, byte_offset, vals.read(), pred, props);
3410 }
3411 
3438 template <
3439  typename ValuesSimdViewT,
3440  typename T = ValuesSimdViewT::value_type::element_type,
3441  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
3442  typename AccessorT,
3443  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
3444 __ESIMD_API std::enable_if_t<
3445  detail::is_simd_view_type_v<ValuesSimdViewT> &&
3446  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
3447  detail::is_device_accessor_with_v<AccessorT,
3448  detail::accessor_mode_cap::can_write>>
3449 block_store(AccessorT acc, ValuesSimdViewT vals, simd_mask<1> pred,
3450  PropertyListT props = {}) {
3451  block_store<T, N>(acc, vals.read(), pred, props);
3452 }
3453 
3455 
3457 
3459 
3460 // Implementations of accessor-based gather and scatter functions
3461 namespace detail {
3462 template <typename T, int N, typename AccessorTy>
3463 ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t<
3464  std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
3465  is_accessor_with_v<AccessorTy, detail::accessor_mode_cap::can_write>>
3466 scatter_impl(AccessorTy acc, simd<T, N> vals, simd<uint32_t, N> offsets,
3467  uint32_t glob_offset, simd_mask<N> mask) {
3468 
3469  static_assert(detail::isPowerOf2(N, 32), "Unexpected vector length");
3470  if constexpr (sizeof(T) == 8) {
3471  scatter_impl<uint32_t, N>(
3472  acc, vals.template bit_cast_view<uint32_t>().template select<N, 2>(0),
3473  offsets, glob_offset, mask);
3474  scatter_impl<uint32_t, N>(
3475  acc, vals.template bit_cast_view<uint32_t>().template select<N, 2>(1),
3476  offsets, glob_offset + sizeof(uint32_t), mask);
3477  } else {
3478  constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
3479  // TODO (performance) use hardware-supported scale once BE supports it
3480  constexpr int16_t scale = 0;
3481  const auto si = __ESIMD_NS::get_surface_index(acc);
3482 
3483  if constexpr (sizeof(T) < 4) {
3484  using Tint = std::conditional_t<std::is_integral_v<T>, T,
3485  detail::uint_type_t<sizeof(T)>>;
3486  using Treal = __raw_t<T>;
3487  simd<Tint, N> vals_int = bitcast<Tint, Treal, N>(std::move(vals).data());
3488  using PromoT = typename std::conditional_t<std::is_signed<Tint>::value,
3489  int32_t, uint32_t>;
3490  const simd<PromoT, N> promo_vals = convert<PromoT>(std::move(vals_int));
3491  __esimd_scatter_scaled<PromoT, N, decltype(si), TypeSizeLog2, scale>(
3492  mask.data(), si, glob_offset, offsets.data(), promo_vals.data());
3493  } else {
3494  using Treal = __raw_t<T>;
3495  if constexpr (!std::is_same_v<Treal, T>) {
3496  simd<Treal, N> Values = vals.template bit_cast_view<Treal>();
3497  __esimd_scatter_scaled<Treal, N, decltype(si), TypeSizeLog2, scale>(
3498  mask.data(), si, glob_offset, offsets.data(), Values.data());
3499  } else {
3500  __esimd_scatter_scaled<T, N, decltype(si), TypeSizeLog2, scale>(
3501  mask.data(), si, glob_offset, offsets.data(), vals.data());
3502  }
3503  }
3504  }
3505 }
3506 
3507 #ifndef __ESIMD_FORCE_STATELESS_MEM
3525 template <typename T, int NElts, lsc_data_size DS, typename PropertyListT,
3526  int N, typename AccessorTy, typename OffsetT>
3527 __ESIMD_API std::enable_if_t<
3528  is_device_accessor_with_v<AccessorTy, accessor_mode_cap::can_write>>
3529 scatter_impl(AccessorTy acc, simd<OffsetT, N> offsets, simd<T, N * NElts> vals,
3530  simd_mask<N> pred) {
3531  static_assert(std::is_integral_v<OffsetT>,
3532  "Scatter must have integral byte_offset type");
3533  static_assert(sizeof(OffsetT) <= 4,
3534  "Implicit truncation of 64-bit byte_offset to 32-bit is "
3535  "disabled. Use -fsycl-esimd-force-stateless-mem or explicitly "
3536  "convert offsets to a 32-bit vector");
3537  check_lsc_vector_size<NElts>();
3538  check_lsc_data_size<T, DS>();
3539  check_cache_hints<cache_action::store, PropertyListT>();
3540  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
3541  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
3542  constexpr uint16_t AddressScale = 1;
3543  constexpr int ImmOffset = 0;
3544  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
3545  constexpr lsc_vector_size LSCNElts = to_lsc_vector_size<NElts>();
3546  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
3547  using MsgT = typename lsc_expand_type<T>::type;
3548  simd<MsgT, N * NElts> Tmp = lsc_format_input<MsgT, T>(vals);
3549  simd<uint32_t, N> ByteOffsets32 = convert<uint32_t>(offsets);
3550  auto si = get_surface_index(acc);
3551  __esimd_lsc_store_bti<MsgT, L1H, L2H, AddressScale, ImmOffset, EDS, LSCNElts,
3552  Transposed, N>(pred.data(), ByteOffsets32.data(),
3553  Tmp.data(), si);
3554 }
3555 #endif // __ESIMD_FORCE_STATELESS_MEM
3556 
3557 template <typename T, int N, typename AccessorTy>
3558 __ESIMD_API std::enable_if_t<
3559  (std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
3560  is_accessor_with_v<AccessorTy, detail::accessor_mode_cap::can_read>),
3561  simd<T, N>>
3562 gather_impl(AccessorTy acc, simd<uint32_t, N> offsets, uint32_t glob_offset,
3563  simd_mask<N> mask) {
3564  static_assert(detail::isPowerOf2(N, 32), "Unexpected vector length");
3565 
3566  if constexpr (sizeof(T) == 8) {
3567  simd<T, N> Res;
3568  Res.template bit_cast_view<uint32_t>().template select<N, 2>(0) =
3569  gather_impl<uint32_t, N>(acc, offsets, glob_offset, mask);
3570  Res.template bit_cast_view<uint32_t>().template select<N, 2>(1) =
3571  gather_impl<uint32_t, N>(acc, offsets, glob_offset + sizeof(uint32_t),
3572  mask);
3573  return Res;
3574  } else {
3575  using Treal = __raw_t<T>;
3576  constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
3577  // TODO (performance) use hardware-supported scale once BE supports it
3578  constexpr uint32_t scale = 0;
3579  const auto si = get_surface_index(acc);
3580  if constexpr (sizeof(T) < 4) {
3581  using Tint = std::conditional_t<std::is_integral_v<T>, T,
3582  detail::uint_type_t<sizeof(T)>>;
3583 
3584  static_assert(std::is_integral<Tint>::value,
3585  "only integral 1- & 2-byte types are supported");
3586  using PromoT = typename std::conditional_t<std::is_signed<Tint>::value,
3587  int32_t, uint32_t>;
3588  simd<PromoT, N> promo_vals =
3589  __esimd_gather_masked_scaled2<PromoT, N, decltype(si), TypeSizeLog2,
3590  scale>(si, glob_offset, offsets.data(),
3591  mask.data());
3592  auto Res = convert<Tint>(promo_vals);
3593 
3594  if constexpr (!std::is_same_v<Tint, T>) {
3595  return detail::bitcast<Treal, Tint, N>(Res.data());
3596  } else {
3597  return Res;
3598  }
3599  } else {
3600  simd<Treal, N> Res = __esimd_gather_masked_scaled2<Treal, N, decltype(si),
3601  TypeSizeLog2, scale>(
3602  si, glob_offset, offsets.data(), mask.data());
3603  if constexpr (!std::is_same_v<Treal, T>) {
3604  return Res.template bit_cast_view<T>();
3605  } else {
3606  return Res;
3607  }
3608  }
3609  }
3610 }
3611 
3612 #ifndef __ESIMD_FORCE_STATELESS_MEM
3613 template <typename T, int N, int VS, typename PropertyListT, lsc_data_size DS,
3614  typename OffsetT, typename AccessorT>
3615 __ESIMD_API std::enable_if_t<
3616  is_device_accessor_with_v<AccessorT, accessor_mode_cap::can_read>,
3617  simd<T, N>>
3618 gather_impl(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
3619  simd_mask<N / VS> pred, simd<T, N> pass_thru) {
3620  static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS");
3621  static_assert(std::is_integral_v<OffsetT>,
3622  "Gather must have integral byte_offset type");
3623  static_assert(sizeof(OffsetT) <= 4,
3624  "Implicit truncation of 64-bit byte_offset to 32-bit is "
3625  "disabled. Use -fsycl-esimd-force-stateless-mem or explicitly "
3626  "convert offsets to a 32-bit vector");
3627  static_assert(VS == 1 || sizeof(T) >= 4,
3628  "VS > 1 is supprted only for 4- and 8-byte elements");
3629  check_lsc_vector_size<VS>();
3630  check_lsc_data_size<T, DS>();
3631  check_cache_hints<cache_action::load, PropertyListT>();
3632  constexpr uint16_t AddressScale = 1;
3633  constexpr int ImmOffset = 0;
3634  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
3635  constexpr lsc_vector_size LSCVS = to_lsc_vector_size<VS>();
3636  constexpr auto Transposed = lsc_data_order::nontranspose;
3637  using MsgT = typename lsc_expand_type<T>::type;
3638  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
3639  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
3640  auto SI = get_surface_index(acc);
3641  simd<uint32_t, N / VS> ByteOffsets32 = convert<uint32_t>(byte_offsets);
3642  simd<MsgT, N> PassThruExpanded = lsc_format_input<MsgT>(pass_thru);
3643  simd<MsgT, N> Result =
3644  __esimd_lsc_load_merge_bti<MsgT, L1H, L2H, AddressScale, ImmOffset, EDS,
3645  LSCVS, Transposed, N / VS>(
3646  pred.data(), ByteOffsets32.data(), SI, PassThruExpanded.data());
3647  return lsc_format_ret<T>(Result);
3648 }
3649 #endif // __ESIMD_FORCE_STATELESS_MEM
3650 
3668 template <typename T, int NElts, lsc_data_size DS, int N>
3669 __ESIMD_API simd<T, N * NElts> slm_gather_impl(simd<uint32_t, N> offsets,
3670  simd_mask<N> pred,
3671  simd<T, N * NElts> pass_thru) {
3672  check_lsc_vector_size<NElts>();
3673  check_lsc_data_size<T, DS>();
3674  constexpr uint16_t AddressScale = 1;
3675  constexpr int ImmOffset = 0;
3676  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
3677  constexpr lsc_vector_size LSCVS = to_lsc_vector_size<NElts>();
3678  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
3679  using MsgT = typename lsc_expand_type<T>::type;
3680  simd<MsgT, N * NElts> PassThruExpanded = lsc_format_input<MsgT>(pass_thru);
3681  simd<MsgT, N * NElts> Result =
3682  __esimd_lsc_load_merge_slm<MsgT, cache_hint::none, cache_hint::none,
3683  AddressScale, ImmOffset, EDS, LSCVS,
3684  Transposed, N>(pred.data(), offsets.data(),
3685  PassThruExpanded.data());
3686  return lsc_format_ret<T>(Result);
3687 }
3688 
3703 template <typename T, int NElts, lsc_data_size DS, int N>
3704 __ESIMD_API void slm_scatter_impl(simd<uint32_t, N> offsets,
3705  simd<T, N * NElts> vals, simd_mask<N> pred) {
3706  check_lsc_vector_size<NElts>();
3707  check_lsc_data_size<T, DS>();
3708  constexpr uint16_t AddressScale = 1;
3709  constexpr int ImmOffset = 0;
3710  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
3711  constexpr lsc_vector_size LSCVS = to_lsc_vector_size<NElts>();
3712  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
3713  using MsgT = typename lsc_expand_type<T>::type;
3714  simd<MsgT, N * NElts> Tmp = lsc_format_input<MsgT, T>(vals);
3715  __esimd_lsc_store_slm<MsgT, cache_hint::none, cache_hint::none, AddressScale,
3716  ImmOffset, EDS, LSCVS, Transposed, N>(
3717  pred.data(), offsets.data(), Tmp.data());
3718 }
3719 
3735 template <typename T, int NElts, lsc_data_size DS, typename PropertyListT,
3736  int N, typename Toffset>
3737 __ESIMD_API void prefetch_impl(const T *p, simd<Toffset, N> byte_offsets,
3738  simd_mask<N> pred) {
3739  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
3740  check_lsc_vector_size<NElts>();
3741  check_lsc_data_size<T, DS>();
3742  check_cache_hints<cache_action::prefetch, PropertyListT>();
3743  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
3744  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
3745  constexpr uint16_t AddressScale = 1;
3746  constexpr int ImmOffset = 0;
3747  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
3748  constexpr lsc_vector_size LSCVS = to_lsc_vector_size<NElts>();
3749  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
3750  using MsgT = typename lsc_expand_type<T>::type;
3751  simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
3752  addrs += convert<uintptr_t>(byte_offsets);
3753  __esimd_lsc_prefetch_stateless<MsgT, L1H, L2H, AddressScale, ImmOffset, EDS,
3754  LSCVS, Transposed, N>(pred.data(),
3755  addrs.data());
3756 }
3757 
3758 template <typename T, int NElts, lsc_data_size DS, typename PropertyListT,
3759  typename Toffset>
3760 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>>
3761 prefetch_impl(const T *p, Toffset offset, simd_mask<1> pred) {
3762  check_lsc_data_size<T, DS>();
3763  check_cache_hints<cache_action::prefetch, PropertyListT>();
3764 
3765  constexpr size_t Alignment =
3766  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
3767  static_assert(
3768  (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
3769  (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
3770  "Incorrect alignment for the data type");
3771 
3772  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
3773  constexpr int SmallIntFactor32Bit =
3774  sizeof(uint32_t) / sizeof(T) > 1 ? sizeof(uint32_t) / sizeof(T) : 1;
3775  static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
3776  "Number of elements is not supported by Transposed load");
3777 
3778  // If alignment >= 8 and (NElts * sizeof(T)) % 8 == 0) we can prefetch QWORDs.
3779  // Don't do it for 4-byte vectors (unless it is greater than 256-bytes),
3780  // because it would require a bit-cast, which is supposed to be NO-OP, but
3781  // might confuse GPU BE sometimes. 1- and 2-byte vectors are casted anyways.
3782  constexpr bool Use64BitData =
3783  Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
3784  (NElts * sizeof(T)) % sizeof(uint64_t) == 0 &&
3785  (sizeof(T) != sizeof(uint32_t) || NElts * sizeof(T) > 256);
3786  constexpr int SmallIntFactor =
3787  Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
3788  constexpr int FactoredNElts = NElts / SmallIntFactor;
3789  check_lsc_vector_size<FactoredNElts>();
3790 
3791  // Prepare template arguments for the call of intrinsic.
3792  using LoadElemT = __ESIMD_DNS::__raw_t<
3793  std::conditional_t<SmallIntFactor == 1, T,
3794  std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
3795 
3796  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
3797  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
3798  constexpr uint16_t AddressScale = 1;
3799  constexpr int ImmOffset = 0;
3800  constexpr lsc_data_size EDS = finalize_data_size<LoadElemT, DS>();
3801 
3802  static_assert(
3803  EDS == lsc_data_size::u32 || EDS == lsc_data_size::u64,
3804  "Transposed prefetch is supported only for data size u32 or u64");
3805  constexpr lsc_vector_size LSCVS = to_lsc_vector_size<FactoredNElts>();
3806  constexpr lsc_data_order Transposed = lsc_data_order::transpose;
3807  constexpr int N = 1;
3808 
3809  simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p) + offset;
3810  __esimd_lsc_prefetch_stateless<LoadElemT, L1H, L2H, AddressScale, ImmOffset,
3811  EDS, LSCVS, Transposed, N>(pred.data(),
3812  addrs.data());
3813 }
3814 
3815 #ifndef __ESIMD_FORCE_STATELESS_MEM
3833 
3834 template <typename T, int NElts, lsc_data_size DS, typename PropertyListT,
3835  int N, typename AccessorTy, typename OffsetT>
3836 __ESIMD_API std::enable_if_t<
3837  is_device_accessor_with_v<AccessorTy, accessor_mode_cap::can_read>>
3838 prefetch_impl(AccessorTy acc, simd<OffsetT, N> byte_offsets,
3839  simd_mask<N> pred) {
3840  static_assert(std::is_integral_v<OffsetT>,
3841  "Prefetch must have integral byte_offset type");
3842  static_assert(sizeof(OffsetT) <= 4,
3843  "Implicit truncation of 64-bit byte_offset to 32-bit is "
3844  "disabled. Use -fsycl-esimd-force-stateless-mem or explicitly "
3845  "convert offsets to a 32-bit vector");
3846  check_lsc_vector_size<NElts>();
3847  check_lsc_data_size<T, DS>();
3848  check_cache_hints<cache_action::prefetch, PropertyListT>();
3849  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
3850  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
3851  constexpr uint16_t AddressScale = 1;
3852  constexpr int ImmOffset = 0;
3853  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
3854  constexpr lsc_vector_size LSCVS = to_lsc_vector_size<NElts>();
3855  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
3856  using MsgT = typename lsc_expand_type<T>::type;
3857  simd<uint32_t, N> ByteOffsets32 = convert<uint32_t>(byte_offsets);
3858  auto SI = get_surface_index(acc);
3859  __esimd_lsc_prefetch_bti<MsgT, L1H, L2H, AddressScale, ImmOffset, EDS, LSCVS,
3860  Transposed, N>(pred.data(), ByteOffsets32.data(),
3861  SI);
3862 }
3863 
3881 template <typename T, int NElts, lsc_data_size DS, typename PropertyListT,
3882  typename AccessorTy, typename OffsetT>
3883 __ESIMD_API std::enable_if_t<
3884  std::is_integral_v<OffsetT> &&
3885  is_device_accessor_with_v<AccessorTy, accessor_mode_cap::can_read>>
3886 prefetch_impl(AccessorTy acc, OffsetT byte_offset, simd_mask<1> pred) {
3887  static_assert(sizeof(OffsetT) <= 4,
3888  "Implicit truncation of 64-bit byte_offset to 32-bit is "
3889  "disabled. Use -fsycl-esimd-force-stateless-mem or explicitly "
3890  "convert offsets to a 32-bit vector");
3891  check_lsc_data_size<T, DS>();
3892  check_cache_hints<cache_action::prefetch, PropertyListT>();
3893 
3894  constexpr size_t Alignment =
3895  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
3896 
3897  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
3898  constexpr int SmallIntFactor32Bit =
3899  sizeof(uint32_t) / sizeof(T) > 1 ? sizeof(uint32_t) / sizeof(T) : 1;
3900  static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
3901  "Number of elements is not supported by Transposed load");
3902 
3903  // If alignment >= 8 and (NElts * sizeof(T)) % 8 == 0) we can load QWORDs.
3904  // Don't do it for 4-byte vectors (unless it is greater than 256-bytes),
3905  // because it would require a bit-cast, which is supposed to be NO-OP, but
3906  // might confuse GPU BE sometimes. 1- and 2-byte vectors are casted anyways.
3907  constexpr bool Use64BitData =
3908  Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
3909  (NElts * sizeof(T)) % sizeof(uint64_t) == 0 &&
3910  (sizeof(T) != sizeof(uint32_t) || NElts * sizeof(T) > 256);
3911  constexpr int SmallIntFactor =
3912  Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
3913  constexpr int FactoredNElts = NElts / SmallIntFactor;
3914  check_lsc_vector_size<FactoredNElts>();
3915 
3916  // Prepare template arguments for the call of intrinsic.
3917  using LoadElemT = __ESIMD_DNS::__raw_t<
3918  std::conditional_t<SmallIntFactor == 1, T,
3919  std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
3920 
3921  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
3922  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
3923  constexpr uint16_t AddressScale = 1;
3924  constexpr int ImmOffset = 0;
3925  constexpr lsc_data_size EDS = finalize_data_size<LoadElemT, DS>();
3926 
3927  static_assert(
3928  EDS == lsc_data_size::u32 || EDS == lsc_data_size::u64,
3929  "Transposed prefetch is supported only for data size u32 or u64");
3930  constexpr lsc_vector_size LSCVS = to_lsc_vector_size<FactoredNElts>();
3931  constexpr lsc_data_order Transposed = lsc_data_order::transpose;
3932  constexpr int N = 1;
3933 
3934  simd<uint32_t, N> offsets = byte_offset;
3935  auto SI = get_surface_index(acc);
3936  __esimd_lsc_prefetch_bti<LoadElemT, L1H, L2H, AddressScale, ImmOffset, EDS,
3937  LSCVS, Transposed, N>(pred.data(), offsets.data(),
3938  SI);
3939 }
3940 #endif // __ESIMD_FORCE_STATELESS_MEM
3941 
3942 // Compute the data size for 2d block load or store.
3943 template <typename T, int NBlocks, int Height, int Width, bool Transposed,
3944  bool Transformed>
3945 constexpr int get_lsc_block_2d_data_size() {
3946  if constexpr (Transformed)
3947  return roundUpNextMultiple<Height, 4 / sizeof(T)>() *
3948  getNextPowerOf2<Width>() * NBlocks;
3949  return Width * Height * NBlocks;
3950 }
3951 
3952 #ifndef __ESIMD_DWORD_BLOCK_2D_WIDTH_SCALE
3953 #define __ESIMD_DWORD_BLOCK_2D_WIDTH_SCALE (1)
3954 #endif
3955 
3956 #ifndef __ESIMD_BLOCK_2D_WIDTH_CHECK
3957 #define __ESIMD_BLOCK_2D_WIDTH_CHECK(OP, BLOCK_WIDTH, NBLOCKS, SIZE) \
3958  static_assert((BLOCK_WIDTH) * (NBLOCKS) * (SIZE) <= 64, \
3959  "Unsupported block width");
3960 #endif
3961 
3962 enum class block_2d_op { prefetch, load, store };
3963 
3964 // Compile-time checks for lsc_load_2d/prefetch_2d/store_2d restrictions.
3965 template <typename T, int BlockWidth, int BlockHeight, int NBlocks,
3966  bool Transposed, bool Transformed, block_2d_op Op>
3967 constexpr void check_lsc_block_2d_restrictions() {
3968  constexpr int GRFByteSize = BlockWidth * BlockHeight * NBlocks * sizeof(T);
3969  static_assert(BlockWidth > 0, "Block width must be positive");
3970  static_assert(BlockHeight > 0, "Block height must be positive");
3971  // Restrictions based on documentation.
3972  if constexpr (Op == block_2d_op::store)
3973  static_assert(GRFByteSize <= 512, "2D store supports 512 bytes max");
3974  else
3975  static_assert(GRFByteSize <= 2048,
3976  "2D load/prefetch supports 2048 bytes max");
3977  static_assert(!Transposed || !Transformed,
3978  "Transposed and transformed is not supported");
3979  static_assert((sizeof(T) * BlockWidth) % 4 == 0,
3980  "Block width must be aligned by DW");
3981  if constexpr (Transposed) {
3982  static_assert(NBlocks == 1, "Transposed expected to be 1 block only");
3983  static_assert(sizeof(T) == 4 || sizeof(T) == 8,
3984  "Transposed load is supported only for data size u32 or u64");
3985  static_assert(sizeof(T) == 8 ? BlockHeight == 8
3986  : BlockHeight >= 1 && BlockHeight <= 32,
3987  "Unsupported block height");
3988  static_assert(sizeof(T) == 8
3989  ? __ESIMD_DNS::isPowerOf2(BlockWidth, 4)
3990  : BlockWidth >= 1 &&
3991  BlockWidth <=
3992  8 * __ESIMD_DWORD_BLOCK_2D_WIDTH_SCALE,
3993  "Unsupported block width");
3994  } else if constexpr (Transformed) {
3995  static_assert(sizeof(T) == 1 || sizeof(T) == 2,
3996  "VNNI transform is supported only for data size u8 or u16");
3997  static_assert(__ESIMD_DNS::isPowerOf2(NBlocks, 4),
3998  "Unsupported number of blocks");
3999  static_assert(BlockHeight * sizeof(T) >= 4 && BlockHeight <= 32,
4000  "Unsupported block height");
4001  static_assert(BlockWidth * sizeof(T) >= 4 && BlockWidth <= 16 &&
4002  BlockWidth * NBlocks * sizeof(T) <= 64,
4003  "Unsupported block width");
4004  } else {
4005  if constexpr (Op == block_2d_op::store) {
4006  static_assert(NBlocks == 1, "Unsupported number of blocks for 2D store");
4007  static_assert(BlockHeight <= 8, "Unsupported block height for store");
4008  } else {
4009  static_assert(
4010  __ESIMD_DNS::isPowerOf2(NBlocks, sizeof(T) == 1 ? 4 : 8 / sizeof(T)),
4011  "Unsupported number of blocks for 2D load/prefetch");
4012  static_assert(BlockHeight <= 32, "Unsupported block height for load");
4013  }
4014  static_assert(BlockWidth * sizeof(T) >= 4, "Unsupported block width");
4015  __ESIMD_BLOCK_2D_WIDTH_CHECK(Op, BlockWidth, NBlocks, sizeof(T));
4016  }
4017 }
4018 #undef __ESIMD_DWORD_BLOCK_2D_WIDTH_SCALE
4019 #undef __ESIMD_BLOCK_2D_WIDTH_CHECK
4020 
4051 template <
4052  typename T, int BlockWidth, int BlockHeight, int NBlocks, bool Transposed,
4053  bool Transformed, typename PropertyListT,
4054  int N = get_lsc_block_2d_data_size<__raw_t<T>, NBlocks, BlockHeight,
4055  BlockWidth, Transposed, Transformed>()>
4056 __ESIMD_API simd<T, N> load_2d_impl(const T *Ptr, unsigned SurfaceWidth,
4057  unsigned SurfaceHeight,
4058  unsigned SurfacePitch, int X, int Y) {
4059 
4060  check_cache_hints<cache_action::load, PropertyListT>();
4061  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
4062  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
4063  using RawT = __raw_t<T>;
4064  check_lsc_block_2d_restrictions<RawT, BlockWidth, BlockHeight, NBlocks,
4065  Transposed, Transformed, block_2d_op::load>();
4066  // For Load BlockWidth is padded up to the next power-of-two value.
4067  // For Load with Transpose the pre-operation BlockHeight is padded up
4068  // to the next power-of-two value.
4069  // For Load with Transform pre-operation BlockHeight is padded up to
4070  // multiple of K, where K = 4B / sizeof(T).
4071  constexpr int ElemsPerDword = 4 / sizeof(RawT);
4072  constexpr int GRFRowSize = Transposed ? BlockHeight
4073  : Transformed ? BlockWidth * ElemsPerDword
4074  : BlockWidth;
4075  constexpr int GRFRowPitch = getNextPowerOf2<GRFRowSize>();
4076  constexpr int GRFColSize =
4077  Transposed
4078  ? BlockWidth
4079  : (Transformed ? (BlockHeight + ElemsPerDword - 1) / ElemsPerDword
4080  : BlockHeight);
4081  constexpr int GRFBlockSize = GRFRowPitch * GRFColSize;
4082  constexpr int GRFBlockPitch =
4083  roundUpNextMultiple<64 / sizeof(RawT), GRFBlockSize>();
4084  constexpr int ActualN = NBlocks * GRFBlockPitch;
4085 
4086  constexpr int DstBlockElements = GRFColSize * GRFRowSize;
4087  constexpr int DstElements = DstBlockElements * NBlocks;
4088 
4089  static_assert(N == ActualN || N == DstElements, "Incorrect element count");
4090  simd_mask<1> Mask = 1;
4091  constexpr lsc_data_size DS =
4092  finalize_data_size<RawT, lsc_data_size::default_size>();
4093  uintptr_t Addr = reinterpret_cast<uintptr_t>(Ptr);
4094  constexpr lsc_data_order Transpose =
4096  simd<RawT, ActualN> Raw =
4097  __esimd_lsc_load2d_stateless<RawT, L1H, L2H, DS, Transpose, NBlocks,
4098  BlockWidth, BlockHeight, Transformed,
4099  ActualN>(Mask.data(), Addr, SurfaceWidth,
4100  SurfaceHeight, SurfacePitch, X, Y);
4101 
4102  if constexpr (ActualN == N) {
4103  return Raw;
4104  } else {
4105  // HW restrictions force data which is read to contain padding filled with
4106  // zeros for 2d lsc loads. This code eliminates such padding.
4107 
4108  // For example, 2D block load of 5 elements of 1 byte data type will
4109  // take 8 bytes per row for each block.
4110  //
4111  // +----+----+----+----+----+----+-----+-----+
4112  // | 00 | 01 | 02 | 03 | 04 | 05 | 06* | 07* |
4113  // +----+----+----+----+----+----+-----+-----+
4114  // | 10 | 11 | 12 | 13 | 14 | 15 | 16* | 17* |
4115  // +----+----+----+----+----+----+-----+-----+
4116  // | 20 | 21 | 22 | 23 | 24 | 25 | 26* | 27* |
4117  // +----+----+----+----+----+----+-----+-----+
4118  // | 30 | 31 | 32 | 33 | 34 | 35 | 36* | 37* |
4119  // +----+----+----+----+----+----+-----+-----+
4120  // * signifies the padded element.
4121 
4123 
4124  for (auto i = 0; i < NBlocks; i++) {
4125  auto DstBlock =
4126  Dst.template select<DstBlockElements, 1>(i * DstBlockElements);
4127 
4128  auto RawBlock = Raw.template select<GRFBlockSize, 1>(i * GRFBlockPitch);
4129  DstBlock =
4130  RawBlock.template bit_cast_view<RawT, GRFColSize, GRFRowPitch>()
4131  .template select<GRFColSize, 1, GRFRowSize, 1>(0, 0)
4132  .template bit_cast_view<RawT>();
4133  }
4134 
4135  return Dst;
4136  }
4137 }
4138 
4161 template <typename T, int BlockWidth, int BlockHeight, int NBlocks,
4162  typename PropertyListT,
4163  int N = get_lsc_block_2d_data_size<__raw_t<T>, NBlocks, BlockHeight,
4164  BlockWidth, false /*Transposed*/,
4165  false /*Transformed*/>()>
4166 __ESIMD_API void prefetch_2d_impl(const T *Ptr, unsigned SurfaceWidth,
4167  unsigned SurfaceHeight, unsigned SurfacePitch,
4168  int X, int Y) {
4169  using RawT = __raw_t<T>;
4170  check_cache_hints<cache_action::prefetch, PropertyListT>();
4171  check_lsc_block_2d_restrictions<RawT, BlockWidth, BlockHeight, NBlocks, false,
4172  false, block_2d_op::prefetch>();
4173  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
4174  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
4175  constexpr lsc_data_size DS =
4176  finalize_data_size<RawT, lsc_data_size::default_size>();
4177  uintptr_t Addr = reinterpret_cast<uintptr_t>(Ptr);
4178  constexpr lsc_data_order Transpose = lsc_data_order::nontranspose;
4179  simd_mask<1> Mask = 1;
4180  __esimd_lsc_prefetch2d_stateless<RawT, L1H, L2H, DS, Transpose, NBlocks,
4181  BlockWidth, BlockHeight, false, N>(
4182  Mask.data(), Addr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
4183 }
4184 
/// Stores a BlockWidth x BlockHeight block held in \p Vals to a 2D surface
/// through the __esimd_lsc_store2d_stateless intrinsic.
/// \p Ptr is reinterpreted as the integer address handed to the intrinsic;
/// \p SurfaceWidth, \p SurfaceHeight, \p SurfacePitch, \p X and \p Y are
/// forwarded to the intrinsic unchanged.
/// NOTE(review): the template parameter list looks truncated in this listing
/// (the declaration of N used by simd<T, N> is not visible) - confirm against
/// the original header.
4209 template <typename T, int BlockWidth, int BlockHeight, typename PropertyListT,
4211  __raw_t<T>, 1u, BlockHeight, BlockWidth, false /*Transposed*/,
4212  false /*Transformed*/>()>
4213 __ESIMD_API void store_2d_impl(T *Ptr, unsigned SurfaceWidth,
4214  unsigned SurfaceHeight, unsigned SurfacePitch,
4215  int X, int Y, simd<T, N> Vals) {
4216  using RawT = __raw_t<T>;
// Compile-time validation of the cache hints and of the block shape for a
// store operation.
4217  __ESIMD_DNS::check_cache_hints<__ESIMD_DNS::cache_action::store,
4218  PropertyListT>();
4219  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
4220  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
4221  check_lsc_block_2d_restrictions<RawT, BlockWidth, BlockHeight, 1, false,
4222  false, block_2d_op::store>();
4223  constexpr lsc_data_size DS =
4224  finalize_data_size<RawT, lsc_data_size::default_size>();
4225  uintptr_t Addr = reinterpret_cast<uintptr_t>(Ptr);
4226  constexpr lsc_data_order Transpose = lsc_data_order::nontranspose;
4227 
// The register block passed to the intrinsic has BlockHeight rows of
// getNextPowerOf2<BlockWidth>() elements each.
4228  constexpr int Pitch = getNextPowerOf2<BlockWidth>();
4229  constexpr int NElts = BlockHeight * Pitch;
4230  simd<RawT, NElts> Raw;
// Single-lane predicate set to 1.
4231  simd_mask<1> Mask = 1;
4232 
4233  if constexpr (NElts == N) {
// No padding is needed: the input already has the size the intrinsic expects.
4234  Raw = Vals;
4235  } else {
4236  // For store with padding, allocate the block with padding, and place
4237  // original data there.
4238  auto Data2D = Vals.template bit_cast_view<RawT, BlockHeight, BlockWidth>();
4239  auto Raw2D = Raw.template bit_cast_view<RawT, BlockHeight, Pitch>();
4240  Raw2D.template select<BlockHeight, 1, BlockWidth, 1>(0, 0) = Data2D;
4241  }
4242 
4243  __esimd_lsc_store2d_stateless<RawT, L1H, L2H, DS, Transpose, 1u, BlockWidth,
4244  BlockHeight, false, NElts>(
4245  Mask.data(), Addr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y,
4246  Raw.data());
4247 }
4248 
4249 } // namespace detail
4250 
4252 
4255 
4277 // Dev note: the argument \p glob_offset of this function does not have
4278 // a default value to not conflict with more generic variant (acc-ga-3)
4279 // defined below. This restriction though requires adding an additional
4280 // variant: simd<T, N> gather(acc, glob_offset) to support calls that require
4281 // implicit conversion of a scalar offset to a vector of offsets, e.g.
4282 // 'res = gather<T, N>(acc, 0);'
/// Accessor-based gather taking per-lane \p byte_offsets plus a scalar
/// \p glob_offset, enabled only for device accessors with read capability.
/// With __ESIMD_FORCE_STATELESS_MEM the accessor and \p glob_offset are turned
/// into a pointer and the pointer-based gather is called. Otherwise the call
/// is dispatched to one of two detail::gather_impl overloads depending on
/// detail::isPowerOf2(N, 32).
/// NOTE(review): the template-argument list of the first gather_impl call
/// looks truncated in this listing - confirm against the original header.
4283 template <typename T, int N, typename AccessorT>
4284 __ESIMD_API
4285  std::enable_if_t<detail::is_device_accessor_with_v<
4286  AccessorT, detail::accessor_mode_cap::can_read>,
4287  simd<T, N>>
4288  gather(AccessorT acc, simd<detail::DeviceAccessorOffsetT, N> byte_offsets,
4289  detail::DeviceAccessorOffsetT glob_offset, simd_mask<N> mask = 1) {
4290 #ifdef __ESIMD_FORCE_STATELESS_MEM
4291  return gather<T, N>(__ESIMD_DNS::accessorToPointer<T>(acc, glob_offset),
4292  byte_offsets, mask);
4293 #else
4294  if constexpr (!detail::isPowerOf2(N, 32)) {
4295  // Requires DG2 or PVC.
4296  simd<T, N> PassThru; // Intentionally undefined
// This gather_impl overload takes no separate global offset, so it is folded
// into the per-lane offsets first.
4297  byte_offsets += glob_offset;
4298  return detail::gather_impl<T, N, 1,
4301  acc, byte_offsets, mask, PassThru);
4302  } else {
4303  return detail::gather_impl<T, N>(acc, byte_offsets, glob_offset, mask);
4304  }
4305 #endif // __ESIMD_FORCE_STATELESS_MEM
4306 }
4307 
/// Accessor-based gather that takes only a scalar \p glob_offset and forwards
/// to gather<T, N>(acc, ByteOffsets, glob_offset).
/// NOTE(review): the declaration of ByteOffsets is not visible in this listing
/// (a line appears to be missing) - confirm against the original header.
4321 template <typename T, int N, typename AccessorT>
4322 __ESIMD_API
4323  std::enable_if_t<detail::is_device_accessor_with_v<
4324  AccessorT, detail::accessor_mode_cap::can_read>,
4325  simd<T, N>>
4326  gather(AccessorT acc, detail::DeviceAccessorOffsetT glob_offset) {
4328  return gather<T, N>(acc, ByteOffsets, glob_offset);
4329 }
4330 
4331 #ifdef __ESIMD_FORCE_STATELESS_MEM
/// Stateless-mode only: accepts offsets of any integral element type other
/// than uint64_t, converts them to uint64_t and forwards to
/// gather<T, N>(acc, offsets, glob_offset, mask).
4332 template <typename T, int N, typename AccessorTy, typename Toffset>
4333 __ESIMD_API std::enable_if_t<
4334  detail::is_device_accessor_with_v<AccessorTy,
4335  detail::accessor_mode_cap::can_read> &&
4336  std::is_integral_v<Toffset> && !std::is_same_v<Toffset, uint64_t>,
4337  simd<T, N>>
4338 gather(AccessorTy acc, simd<Toffset, N> offsets, uint64_t glob_offset,
4339  simd_mask<N> mask = 1) {
4340  return gather<T, N>(acc, convert<uint64_t>(offsets), glob_offset, mask);
4341 }
4342 #endif
4343 
4373 // typename PropertyListT = empty_properties_t>
4382 
/// Accessor-based gather with N / VS offsets, a mask of N / VS lanes, a
/// \p pass_thru operand and a compile-time property list.
/// With __ESIMD_FORCE_STATELESS_MEM the accessor is converted to a pointer and
/// the pointer-based gather is used; otherwise detail::gather_impl is called.
/// NOTE(review): the template-argument list of the gather_impl call looks
/// truncated in this listing - confirm against the original header.
4416 template <
4417  typename T, int N, int VS, typename AccessorT, typename OffsetT,
4418  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
4419 __ESIMD_API std::enable_if_t<
4420  (detail::is_device_accessor_with_v<AccessorT,
4421  detail::accessor_mode_cap::can_read> &&
4422  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
4423  simd<T, N>>
4424 gather(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
4425  simd_mask<N / VS> mask, simd<T, N> pass_thru, PropertyListT props = {}) {
4426 #ifdef __ESIMD_FORCE_STATELESS_MEM
4427  return gather<T, N, VS>(detail::accessorToPointer<T>(acc), byte_offsets, mask,
4428  pass_thru, props);
4429 #else
4430  return detail::gather_impl<T, N, VS, PropertyListT,
4432  acc, byte_offsets, mask, pass_thru);
4433 #endif // __ESIMD_FORCE_STATELESS_MEM
4434 }
4435 
/// Accessor-based gather with N / VS offsets and a mask but no pass_thru
/// operand.
/// With __ESIMD_FORCE_STATELESS_MEM the accessor is converted to a pointer and
/// the pointer-based gather is used. Otherwise the alignment property is
/// checked at compile time and one of two detail::gather_impl overloads is
/// selected.
/// NOTE(review): the template-argument list of the first gather_impl call
/// looks truncated in this listing - confirm against the original header.
4465 template <
4466  typename T, int N, int VS, typename AccessorT, typename OffsetT,
4467  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
4468 __ESIMD_API std::enable_if_t<
4469  (detail::is_device_accessor_with_v<AccessorT,
4470  detail::accessor_mode_cap::can_read> &&
4471  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
4472  simd<T, N>>
4473 gather(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
4474  simd_mask<N / VS> mask, PropertyListT props = {}) {
4475 #ifdef __ESIMD_FORCE_STATELESS_MEM
4476  return gather<T, N, VS>(detail::accessorToPointer<T>(acc), byte_offsets, mask,
4477  props);
4478 #else
// The alignment property falls back to sizeof(T) when it is not specified.
4479  constexpr size_t Alignment =
4480  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
4481  static_assert(Alignment >= sizeof(T),
4482  "gather() requires at least element-size alignment");
4483 
// Cache hints, VS > 1, or an N rejected by isPowerOf2(N, 32) select the
// gather_impl overload that takes a pass-through operand.
4484  if constexpr (detail::has_cache_hints<PropertyListT>() || VS > 1 ||
4485  !(detail::isPowerOf2(N, 32))) {
4486  simd<T, N> PassThru; // Intentionally undefined
4487  return detail::gather_impl<T, N, VS, PropertyListT,
4489  acc, byte_offsets, mask, PassThru);
4490  } else {
4491  return detail::gather_impl<T, N>(acc, byte_offsets, 0, mask);
4492  }
4493 #endif // __ESIMD_FORCE_STATELESS_MEM
4494 }
4495 
/// Unmasked accessor-based gather: builds a mask of N / VS lanes initialized
/// with 1 and forwards to gather<T, N, VS>(acc, byte_offsets, Mask, props).
4519 template <
4520  typename T, int N, int VS, typename AccessorT, typename OffsetT,
4521  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
4522 __ESIMD_API std::enable_if_t<
4523  (detail::is_device_accessor_with_v<AccessorT,
4524  detail::accessor_mode_cap::can_read> &&
4525  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
4526  simd<T, N>>
4527 gather(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
4528  PropertyListT props = {}) {
4529  simd_mask<N / VS> Mask = 1;
4530  return gather<T, N, VS>(acc, byte_offsets, Mask, props);
4531 }
4532 
4542 // Dev note: the mask type was turned into template parameter `MaskT` to
4543 // avoid the conflicts of this prototype with the old gather() function
4544 // accepting a 'global_offset' parameter and avoid 'ambiguous call' errors
4545 // for calls like this: gather(acc, byte_offsets_simd, 0, mask);
/// Accessor-based gather without a VS template parameter, with mask and
/// pass_thru. MaskT is constrained to be exactly simd_mask<N>. Forwards to
/// gather<T, N, 1>(acc, byte_offsets, mask, pass_thru, props).
4546 template <
4547  typename T, int N, typename AccessorT, typename OffsetT, typename MaskT,
4548  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
4549 __ESIMD_API std::enable_if_t<
4550  (detail::is_device_accessor_with_v<AccessorT,
4551  detail::accessor_mode_cap::can_read> &&
4552  std::is_same_v<MaskT, simd_mask<N>> &&
4553  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
4554  simd<T, N>>
4555 gather(AccessorT acc, simd<OffsetT, N> byte_offsets, MaskT mask,
4556  simd<T, N> pass_thru, PropertyListT props = {}) {
4557  return gather<T, N, 1>(acc, byte_offsets, mask, pass_thru, props);
4558 }
4559 
4567 // Dev note: the mask type was turned into template parameter `MaskT` to
4568 // avoid the conflicts of this prototype with the old gather() function
4569 // accepting a 'global_offset' parameter and avoid 'ambiguous call' errors
4570 // for calls like this: gather(acc, byte_offsets_simd, 0);
/// Accessor-based gather without a VS template parameter, with mask and no
/// pass_thru. MaskT is constrained to be exactly simd_mask<N>. Forwards to
/// gather<T, N, 1>(acc, byte_offsets, mask, props).
4571 template <
4572  typename T, int N, typename AccessorT, typename OffsetT, typename MaskT,
4573  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
4574 __ESIMD_API std::enable_if_t<
4575  (detail::is_device_accessor_with_v<AccessorT,
4576  detail::accessor_mode_cap::can_read> &&
4577  std::is_same_v<MaskT, simd_mask<N>> &&
4578  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
4579  simd<T, N>>
4580 gather(AccessorT acc, simd<OffsetT, N> byte_offsets, MaskT mask,
4581  PropertyListT props = {}) {
4582  return gather<T, N, 1>(acc, byte_offsets, mask, props);
4583 }
4584 
/// Accessor-based gather without a VS template parameter and without a mask.
/// Forwards to gather<T, N, 1>(acc, byte_offsets, props).
4592 template <
4593  typename T, int N, typename AccessorT, typename OffsetT,
4594  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
4595 __ESIMD_API std::enable_if_t<
4596  (detail::is_device_accessor_with_v<AccessorT,
4597  detail::accessor_mode_cap::can_read> &&
4598  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
4599  simd<T, N>>
4600 gather(AccessorT acc, simd<OffsetT, N> byte_offsets, PropertyListT props = {}) {
4601  return gather<T, N, 1>(acc, byte_offsets, props);
4602 }
4603 
4606 // typename PropertyListT = empty_properties_t>
/// Accessor-based gather whose offsets are supplied as a simd_view. The view
/// is materialized with read() and the call is forwarded to
/// gather<T, N, VS>(acc, offsets, mask, pass_thru, props).
4612 template <
4613  typename T, int N, int VS = 1, typename AccessorT, typename OffsetSimdViewT,
4614  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
4615 __ESIMD_API std::enable_if_t<
4616  (detail::is_device_accessor_with_v<AccessorT,
4617  detail::accessor_mode_cap::can_read> &&
4618  detail::is_simd_view_type_v<OffsetSimdViewT> &&
4619  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
4620  simd<T, N>>
4621 gather(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
4622  simd<T, N> pass_thru, PropertyListT props = {}) {
4623  return gather<T, N, VS>(acc, byte_offsets.read(), mask, pass_thru, props);
4624 }
4625 
4627 // typename PropertyListT = empty_properties_t>
/// Same as the simd_view-offset gather with mask and pass_thru, but with VS as
/// the first template parameter (no default) so that T and N follow it in the
/// parameter list. Checks at compile time that the offsets view holds N / VS
/// elements, then forwards to gather<T, N, VS>.
4633 template <
4634  int VS, typename T, int N, typename AccessorT, typename OffsetSimdViewT,
4635  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
4636 __ESIMD_API std::enable_if_t<
4637  (detail::is_device_accessor_with_v<AccessorT,
4638  detail::accessor_mode_cap::can_read> &&
4639  detail::is_simd_view_type_v<OffsetSimdViewT> &&
4640  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
4641  simd<T, N>>
4642 gather(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
4643  simd<T, N> pass_thru, PropertyListT props = {}) {
4644  static_assert(N / VS ==
4645  OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
4646  "Size of pass_thru parameter must correspond to the size of "
4647  "byte_offsets parameter.");
4648  return gather<T, N, VS>(acc, byte_offsets.read(), mask, pass_thru, props);
4649 }
4650 
/// Accessor-based gather where both the offsets and pass_thru are simd_view
/// objects. N and T default to the element count and element type of the
/// pass_thru view. Checks at compile time that the offsets view holds N / VS
/// elements, reads both views and forwards to gather<T, N, VS>.
4661 template <
4662  int VS = 1, typename AccessorT, typename OffsetSimdViewT,
4663  typename PassThruSimdViewT,
4664  int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(),
4665  typename T = PassThruSimdViewT::value_type::element_type,
4666  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
4667 __ESIMD_API std::enable_if_t<
4668  (detail::is_device_accessor_with_v<AccessorT,
4669  detail::accessor_mode_cap::can_read> &&
4670  detail::is_simd_view_type_v<OffsetSimdViewT> &&
4671  detail::is_simd_view_type_v<PassThruSimdViewT> &&
4672  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
4673  simd<T, N>>
4674 gather(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
4675  PassThruSimdViewT pass_thru, PropertyListT props = {}) {
4676  static_assert(N / VS ==
4677  OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
4678  "Size of pass_thru parameter must correspond to the size of "
4679  "byte_offsets parameter.");
4680  return gather<T, N, VS>(acc, byte_offsets.read(), mask, pass_thru.read(),
4681  props);
4682 }
4683 
/// Accessor-based gather where offsets are a simd object and pass_thru is a
/// simd_view. N and T default to the element count and element type of the
/// pass_thru view. Reads the view and forwards to gather<T, N, VS>.
4694 template <
4695  int VS = 1, typename AccessorT, typename OffsetT,
4696  typename PassThruSimdViewT,
4697  int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(),
4698  typename T = PassThruSimdViewT::value_type::element_type,
4699  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
4700 __ESIMD_API std::enable_if_t<
4701  (detail::is_device_accessor_with_v<AccessorT,
4702  detail::accessor_mode_cap::can_read> &&
4703  detail::is_simd_view_type_v<PassThruSimdViewT> &&
4704  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
4705  simd<T, N>>
4706 gather(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
4707  simd_mask<N / VS> mask, PassThruSimdViewT pass_thru,
4708  PropertyListT props = {}) {
4709  return gather<T, N, VS>(acc, byte_offsets, mask, pass_thru.read(), props);
4710 }
4711 
4714 // typename PropertyListT = empty_properties_t>
/// Accessor-based masked gather whose offsets are supplied as a simd_view.
/// Reads the view and forwards to
/// gather<T, N, VS>(acc, offsets, mask, props).
4720 template <
4721  typename T, int N, int VS = 1, typename AccessorT, typename OffsetSimdViewT,
4722  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
4723 __ESIMD_API std::enable_if_t<
4724  (detail::is_device_accessor_with_v<AccessorT,
4725  detail::accessor_mode_cap::can_read> &&
4726  detail::is_simd_view_type_v<OffsetSimdViewT> &&
4727  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
4728  simd<T, N>>
4729 gather(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
4730  PropertyListT props = {}) {
4731  return gather<T, N, VS>(acc, byte_offsets.read(), mask, props);
4732 }
4733 
4736 // typename PropertyListT = empty_properties_t>
/// Accessor-based unmasked gather whose offsets are supplied as a simd_view.
/// Reads the view and forwards to gather<T, N, VS>(acc, offsets, props).
4741 template <
4742  typename T, int N, int VS = 1, typename AccessorT, typename OffsetSimdViewT,
4743  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
4744 __ESIMD_API std::enable_if_t<
4745  (detail::is_device_accessor_with_v<AccessorT,
4746  detail::accessor_mode_cap::can_read> &&
4747  detail::is_simd_view_type_v<OffsetSimdViewT> &&
4748  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
4749  simd<T, N>>
4750 gather(AccessorT acc, OffsetSimdViewT byte_offsets, PropertyListT props = {}) {
4751  return gather<T, N, VS>(acc, byte_offsets.read(), props);
4752 }
4753 
4767 
4770 
/// Accessor-based scatter of \p vals to N / VS per-lane \p byte_offsets under
/// \p mask, enabled only for device accessors with write capability.
/// With __ESIMD_FORCE_STATELESS_MEM the accessor is converted to a pointer and
/// the pointer-based scatter is used. Otherwise the alignment property is
/// checked at compile time and one of two detail::scatter_impl overloads is
/// selected.
/// NOTE(review): the first scatter_impl call looks truncated in this listing
/// (its leading line is not visible) - confirm against the original header.
4805 template <
4806  typename T, int N, int VS = 1, typename AccessorTy, typename OffsetT,
4807  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
4808 __ESIMD_API std::enable_if_t<
4809  detail::is_device_accessor_with_v<AccessorTy,
4810  detail::accessor_mode_cap::can_write> &&
4811  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
4812 scatter(AccessorTy acc, simd<OffsetT, N / VS> byte_offsets, simd<T, N> vals,
4813  simd_mask<N / VS> mask, PropertyListT props = {}) {
4814 #ifdef __ESIMD_FORCE_STATELESS_MEM
4815  scatter<T, N, VS>(__ESIMD_DNS::accessorToPointer<T>(acc), byte_offsets, vals,
4816  mask, props);
4817 #else
// The alignment property falls back to sizeof(T) when it is not specified.
4818  constexpr size_t Alignment =
4819  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
// NOTE(review): the diagnostic below names gather() although this function is
// scatter() - looks like a copy-paste slip in the message; confirm and fix.
4820  static_assert(Alignment >= sizeof(T),
4821  "gather() requires at least element-size alignment");
4822 
4823  if constexpr (detail::has_cache_hints<PropertyListT>() || VS > 1 ||
4824  !detail::isPowerOf2(N, 32)) {
4826  PropertyListT>(acc, byte_offsets, vals, mask);
4827  } else {
4828  detail::scatter_impl<T, N, AccessorTy>(acc, vals, byte_offsets, 0, mask);
4829  }
4830 
4831 #endif // __ESIMD_FORCE_STATELESS_MEM
4832 }
/// Unmasked accessor-based scatter: builds a mask of N / VS lanes initialized
/// with 1 and forwards to
/// scatter<T, N, VS>(acc, byte_offsets, vals, Mask, props).
4852 template <
4853  typename T, int N, int VS = 1, typename AccessorTy, typename OffsetT,
4854  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
4855 __ESIMD_API std::enable_if_t<
4856  detail::is_device_accessor_with_v<AccessorTy,
4857  detail::accessor_mode_cap::can_write> &&
4858  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
4859 scatter(AccessorTy acc, simd<OffsetT, N / VS> byte_offsets, simd<T, N> vals,
4860  PropertyListT props = {}) {
4861  simd_mask<N / VS> Mask = 1;
4862  scatter<T, N, VS>(acc, byte_offsets, vals, Mask, props);
4863 }
4864 
/// Masked accessor-based scatter whose offsets are supplied as a simd_view.
/// Reads the view and forwards to
/// scatter<T, N, VS>(acc, offsets, vals, mask, props).
4890 template <
4891  typename T, int N, int VS = 1, typename AccessorTy,
4892  typename OffsetSimdViewT,
4893  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
4894 __ESIMD_API std::enable_if_t<
4895  detail::is_device_accessor_with_v<AccessorTy,
4896  detail::accessor_mode_cap::can_write> &&
4897  detail::is_simd_view_type_v<OffsetSimdViewT> &&
4898  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
4899 scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
4900  simd_mask<N / VS> mask, PropertyListT props = {}) {
4901  scatter<T, N, VS>(acc, byte_offsets.read(), vals, mask, props);
4902 }
4903 
/// Masked simd_view-offset scatter with VS as the first template parameter
/// (no default), followed by AccessorTy, T and N. Checks at compile time that
/// the offsets view holds N / VS elements, then forwards to scatter<T, N, VS>.
4929 template <
4930  int VS, typename AccessorTy, typename T, int N, typename OffsetSimdViewT,
4931  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
4932 __ESIMD_API std::enable_if_t<
4933  detail::is_device_accessor_with_v<AccessorTy,
4934  detail::accessor_mode_cap::can_write> &&
4935  detail::is_simd_view_type_v<OffsetSimdViewT> &&
4936  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
4937 scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
4938  simd_mask<N / VS> mask, PropertyListT props = {}) {
4939  static_assert(N / VS ==
4940  OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
4941  "Size of vals parameter must correspond to the size of "
4942  "byte_offsets parameter.");
4943  scatter<T, N, VS>(acc, byte_offsets.read(), vals, mask, props);
4944 }
4945 
/// Unmasked simd_view-offset scatter with VS as the first template parameter
/// (no default). Checks at compile time that the offsets view holds N / VS
/// elements, then forwards to scatter<T, N, VS>(acc, offsets, vals, props).
4966 template <
4967  int VS, typename AccessorTy, typename T, int N, typename OffsetSimdViewT,
4968  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
4969 __ESIMD_API std::enable_if_t<
4970  detail::is_device_accessor_with_v<AccessorTy,
4971  detail::accessor_mode_cap::can_write> &&
4972  detail::is_simd_view_type_v<OffsetSimdViewT> &&
4973  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
4974 scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
4975  PropertyListT props = {}) {
4976  static_assert(N / VS ==
4977  OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
4978  "Size of vals parameter must correspond to the size of "
4979  "byte_offsets parameter.");
4980  scatter<T, N, VS>(acc, byte_offsets.read(), vals, props);
4981 }
4982 
/// Masked accessor-based scatter where both the offsets and the values are
/// simd_view objects. N and T default to the element count and element type
/// of the values view. Checks at compile time that the offsets view holds
/// N / VS elements, reads both views and forwards to scatter<T, N, VS>.
5011 template <
5012  int VS = 1, typename AccessorTy, typename ValuesSimdViewT,
5013  typename OffsetSimdViewT,
5014  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
5015  typename T = ValuesSimdViewT::value_type::element_type,
5016  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
5017 __ESIMD_API std::enable_if_t<
5018  detail::is_device_accessor_with_v<AccessorTy,
5019  detail::accessor_mode_cap::can_write> &&
5020  detail::is_simd_view_type_v<OffsetSimdViewT> &&
5021  detail::is_simd_view_type_v<ValuesSimdViewT> &&
5022  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
5023 scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, ValuesSimdViewT vals,
5024  simd_mask<N / VS> mask, PropertyListT props = {}) {
5025  static_assert(N / VS ==
5026  OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
5027  "Size of vals parameter must correspond to the size of "
5028  "byte_offsets parameter.");
5029  scatter<T, N, VS>(acc, byte_offsets.read(), vals.read(), mask, props);
5030 }
5031 
/// Unmasked accessor-based scatter where both the offsets and the values are
/// simd_view objects. N and T default to the element count and element type
/// of the values view. Checks at compile time that the offsets view holds
/// N / VS elements, reads both views and forwards to scatter<T, N, VS>.
5055 template <
5056  int VS = 1, typename AccessorTy, typename ValuesSimdViewT,
5057  typename OffsetSimdViewT,
5058  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
5059  typename T = ValuesSimdViewT::value_type::element_type,
5060  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
5061 __ESIMD_API std::enable_if_t<
5062  detail::is_device_accessor_with_v<AccessorTy,
5063  detail::accessor_mode_cap::can_write> &&
5064  detail::is_simd_view_type_v<OffsetSimdViewT> &&
5065  detail::is_simd_view_type_v<ValuesSimdViewT> &&
5066  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
5067 scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, ValuesSimdViewT vals,
5068  PropertyListT props = {}) {
5069  static_assert(N / VS ==
5070  OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
5071  "Size of vals parameter must correspond to the size of "
5072  "byte_offsets parameter.");
5073  scatter<T, N, VS>(acc, byte_offsets.read(), vals.read(), props);
5074 }
5075 
/// Masked accessor-based scatter where offsets are a simd object and the
/// values are a simd_view. N and T default to the element count and element
/// type of the values view. Reads the view and forwards to scatter<T, N, VS>.
5104 template <
5105  int VS = 1, typename AccessorTy, typename ValuesSimdViewT, typename OffsetT,
5106  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
5107  typename T = ValuesSimdViewT::value_type::element_type,
5108  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
5109 __ESIMD_API std::enable_if_t<
5110  detail::is_device_accessor_with_v<AccessorTy,
5111  detail::accessor_mode_cap::can_write> &&
5112  detail::is_simd_view_type_v<ValuesSimdViewT> &&
5113  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
5114 scatter(AccessorTy acc, simd<OffsetT, N / VS> byte_offsets,
5115  ValuesSimdViewT vals, simd_mask<N / VS> mask,
5116  PropertyListT props = {}) {
5117  scatter<T, N, VS>(acc, byte_offsets, vals.read(), mask, props);
5118 }
5119 
/// Unmasked accessor-based scatter where offsets are a simd object and the
/// values are a simd_view. N and T default to the element count and element
/// type of the values view. Reads the view and forwards to scatter<T, N, VS>.
5143 template <
5144  int VS = 1, typename AccessorTy, typename ValuesSimdViewT, typename OffsetT,
5145  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
5146  typename T = ValuesSimdViewT::value_type::element_type,
5147  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
5148 __ESIMD_API std::enable_if_t<
5149  detail::is_device_accessor_with_v<AccessorTy,
5150  detail::accessor_mode_cap::can_write> &&
5151  detail::is_simd_view_type_v<ValuesSimdViewT> &&
5152  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
5153 scatter(AccessorTy acc, simd<OffsetT, N / VS> byte_offsets,
5154  ValuesSimdViewT vals, PropertyListT props = {}) {
5155  scatter<T, N, VS>(acc, byte_offsets, vals.read(), props);
5156 }
5157 
/// Unmasked accessor-based scatter whose offsets are supplied as a simd_view.
/// Builds a mask of N / VS lanes initialized with 1, reads the view and
/// forwards to scatter<T, N, VS>(acc, offsets, vals, Mask, props).
5178 template <
5179  typename T, int N, int VS = 1, typename AccessorTy,
5180  typename OffsetSimdViewT,
5181  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
5182 __ESIMD_API std::enable_if_t<
5183  detail::is_device_accessor_with_v<AccessorTy,
5184  detail::accessor_mode_cap::can_write> &&
5185  detail::is_simd_view_type_v<OffsetSimdViewT> &&
5186  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
5187 scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
5188  PropertyListT props = {}) {
5189  simd_mask<N / VS> Mask = 1;
5190  scatter<T, N, VS>(acc, byte_offsets.read(), vals, Mask, props);
5191 }
5192 
/// Accessor-based scatter taking an additional scalar \p glob_offset, enabled
/// only when detail::isPowerOf2(N, 32) holds and the accessor is writable.
/// Adds \p glob_offset to every lane of the offsets and forwards to
/// scatter<T, N>(acc, offsets, vals, mask).
/// NOTE(review): the first line of the function signature (name and leading
/// parameters) is not visible in this listing - confirm against the original
/// header.
5209 template <typename T, int N, typename AccessorTy>
5210 __ESIMD_API
5211  std::enable_if_t<(detail::isPowerOf2(N, 32)) &&
5212  detail::is_device_accessor_with_v<
5213  AccessorTy, detail::accessor_mode_cap::can_write>>
5215  simd<T, N> vals, detail::DeviceAccessorOffsetT glob_offset,
5216  simd_mask<N> mask = 1) {
5217  offsets += glob_offset;
5218  scatter<T, N>(acc, offsets, vals, mask);
5219 }
5220 
/// Accessor-based scatter taking only a scalar \p glob_offset, enabled only
/// when detail::isPowerOf2(N, 32) holds and the accessor is writable.
/// Forwards to scatter<T, N>(acc, ByteOffsets, vals, glob_offset, mask).
/// NOTE(review): the declaration of ByteOffsets is not visible in this listing
/// (a line appears to be missing) - confirm against the original header.
5221 template <typename T, int N, typename AccessorTy>
5222 __ESIMD_API
5223  std::enable_if_t<(detail::isPowerOf2(N, 32)) &&
5224  detail::is_device_accessor_with_v<
5225  AccessorTy, detail::accessor_mode_cap::can_write>>
5226  scatter(AccessorTy acc, detail::DeviceAccessorOffsetT glob_offset,
5227  simd<T, N> vals, simd_mask<N> mask = 1) {
5229  scatter<T, N>(acc, ByteOffsets, vals, glob_offset, mask);
5230 }
5231 
5232 #ifdef __ESIMD_FORCE_STATELESS_MEM
/// Stateless-mode only: accepts offsets of any integral element type other
/// than uint64_t, converts them to uint64_t and forwards to
/// scatter<T, N, AccessorTy>(acc, offsets, vals, glob_offset, mask).
5233 template <typename T, int N, typename AccessorTy, typename Toffset>
5234 __ESIMD_API std::enable_if_t<
5235  detail::is_device_accessor_with_v<AccessorTy,
5236  detail::accessor_mode_cap::can_write> &&
5237  std::is_integral_v<Toffset> && !std::is_same_v<Toffset, uint64_t>>
5238 scatter(AccessorTy acc, simd<Toffset, N> offsets, simd<T, N> vals,
5239  uint64_t glob_offset, simd_mask<N> mask = 1) {
5240  scatter<T, N, AccessorTy>(acc, convert<uint64_t>(offsets), vals, glob_offset,
5241  mask);
5242 }
5243 #endif
5244 
/// Loads a single element of type T through \p acc by issuing a one-lane
/// gather at \p offset and returning lane 0 of the result.
/// NOTE(review): the second line of the signature (the offset parameter) is
/// not visible in this listing - confirm against the original header.
5252 template <typename T, typename AccessorTy>
5253 __ESIMD_API T scalar_load(AccessorTy acc,
5255  const simd<T, 1> Res =
5256  gather<T, 1, AccessorTy>(acc, simd<decltype(offset), 1>(offset));
5257  return Res[0];
5258 }
5259 
/// Stores the single value \p val through \p acc by issuing a one-lane
/// scatter: both \p offset and \p val are wrapped into one-element simd
/// objects before the call.
5267 template <typename T, typename AccessorTy>
5268 __ESIMD_API void scalar_store(AccessorTy acc,
5269  detail::DeviceAccessorOffsetT offset, T val) {
5270  scatter<T, 1, AccessorTy>(acc, simd<decltype(offset), 1>(offset),
5271  simd<T, 1>(val));
5272 }
5273 
5307 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
5308  int N, typename Toffset>
5309 __ESIMD_API simd<T, N * get_num_channels_enabled(RGBAMask)>
5310 gather_rgba(const T *p, simd<Toffset, N> offsets, simd_mask<N> mask = 1) {
5311  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
5312  static_assert((N == 8 || N == 16 || N == 32), "Unsupported value of N");
5313  static_assert(sizeof(T) == 4, "Unsupported size of type T");
5314  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
5315  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
5316  addrs = addrs + offsets_i;
5317  return __esimd_svm_gather4_scaled<detail::__raw_t<T>, N, RGBAMask>(
5318  addrs.data(), mask.data());
5319 }
5320 
5336 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
5337  int N, typename OffsetSimdViewT, typename RegionTy>
5338 __ESIMD_API std::enable_if_t<detail::is_simd_view_type_v<OffsetSimdViewT>,
5339  simd<T, N * get_num_channels_enabled(RGBAMask)>>
5340 gather_rgba(const T *p, OffsetSimdViewT offsets, simd_mask<N> mask = 1) {
5341  return gather_rgba<RGBAMask, T, N>(p, offsets.read(), mask);
5342 }
5343 
/// Pointer-based RGBA gather taking a single integral \p offset. The offset is
/// broadcast to an N-element simd object and the call is forwarded to
/// gather_rgba<RGBAMask, T, N>(p, offsets, mask).
5359 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
5360  int N, typename Toffset>
5361 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>,
5362  simd<T, N * get_num_channels_enabled(RGBAMask)>>
5363 gather_rgba(const T *p, Toffset offset, simd_mask<N> mask = 1) {
5364  return gather_rgba<RGBAMask, T, N>(p, simd<Toffset, N>(offset), mask);
5365 }
5366 
5367 namespace detail {
5368 template <rgba_channel_mask M> static void validate_rgba_write_channel_mask() {
5369  using CM = rgba_channel_mask;
5370  static_assert(
5371  (M == CM::ABGR || M == CM::BGR || M == CM::GR || M == CM::R) &&
5372  "Only ABGR, BGR, GR, R channel masks are valid in write operations");
5373 }
5374 } // namespace detail
5375 
/// Pointer-based RGBA scatter.
/// Computes one 64-bit address per lane as \p p plus the lane's entry in the
/// offsets and passes the addresses, \p vals and \p mask to
/// __esimd_svm_scatter4_scaled.
/// Compile-time requirements: Toffset is integral, N is 8, 16 or 32,
/// sizeof(T) is 4 and RGBAMask passes validate_rgba_write_channel_mask.
/// NOTE(review): the line carrying the function name and its leading
/// parameters is not visible in this listing - confirm against the original
/// header.
5397 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
5398  int N, typename Toffset>
5399 __ESIMD_API void
5401  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
5402  simd_mask<N> mask = 1) {
5403  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
5404  static_assert((N == 8 || N == 16 || N == 32), "Unsupported value of N");
5405  static_assert(sizeof(T) == 4, "Unsupported size of type T");
5406  detail::validate_rgba_write_channel_mask<RGBAMask>();
// Widen the offsets to 64 bits and add them to the broadcast base address.
5407  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
5408  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
5409  addrs = addrs + offsets_i;
5410  __esimd_svm_scatter4_scaled<detail::__raw_t<T>, N, RGBAMask>(
5411  addrs.data(), vals.data(), mask.data());
5412 }
5413 
5429 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
5430  int N, typename OffsetSimdViewT, typename RegionTy>
5431 __ESIMD_API std::enable_if_t<detail::is_simd_view_type_v<OffsetSimdViewT>>
5432 scatter_rgba(T *p, OffsetSimdViewT offsets,
5433  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
5434  simd_mask<N> mask = 1) {
5435  scatter_rgba<RGBAMask, T, N>(p, offsets.read(), vals, mask);
5436 }
5437 
5453 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
5454  int N, typename Toffset>
5455 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> && N == 1>
5456 scatter_rgba(T *p, Toffset offset,
5457  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
5458  simd_mask<N> mask = 1) {
5459  scatter_rgba<RGBAMask, T, N>(p, simd<Toffset, N>(offset), vals, mask);
5460 }
5461 
/// Accessor-based RGBA gather, enabled only when N is 8, 16 or 32, sizeof(T)
/// is 4 and the accessor is a readable device accessor.
/// With __ESIMD_FORCE_STATELESS_MEM the accessor and \p global_offset are
/// turned into a pointer and the pointer-based gather_rgba is used. Otherwise
/// the surface index of \p acc is obtained and
/// __esimd_gather4_masked_scaled2 is called with a scale of 0.
/// NOTE(review): the line carrying the function name and its leading
/// parameters is not visible in this listing - confirm against the original
/// header.
5484 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
5485  typename AccessorT, int N,
5486  typename T = typename AccessorT::value_type>
5487 __ESIMD_API
5488  std::enable_if_t<((N == 8 || N == 16 || N == 32) && sizeof(T) == 4 &&
5489  detail::is_device_accessor_with_v<
5490  AccessorT, detail::accessor_mode_cap::can_read>),
5491  simd<T, N * get_num_channels_enabled(RGBAMask)>>
5493  detail::DeviceAccessorOffsetT global_offset = 0,
5494  simd_mask<N> mask = 1) {
5495 #ifdef __ESIMD_FORCE_STATELESS_MEM
5496  return gather_rgba<RGBAMask>(
5497  __ESIMD_DNS::accessorToPointer<T>(acc, global_offset), offsets, mask);
5498 #else
5499  // TODO (performance) use hardware-supported scale once BE supports it
5500  constexpr uint32_t Scale = 0;
5501  const auto SI = get_surface_index(acc);
5502  return __esimd_gather4_masked_scaled2<detail::__raw_t<T>, N, RGBAMask,
5503  decltype(SI), Scale>(
5504  SI, global_offset, offsets.data(), mask.data());
5505 #endif
5506 }
5507 
5508 #ifdef __ESIMD_FORCE_STATELESS_MEM
/// Stateless-mode only: accessor-based RGBA gather accepting offsets of any
/// integral element type other than uint64_t. The offsets are converted to
/// uint64_t and the call is forwarded to
/// gather_rgba<RGBAMask, AccessorT, N, T>(acc, offsets, global_offset, mask).
5509 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
5510  typename AccessorT, int N,
5511  typename T = typename AccessorT::value_type, typename Toffset>
5512 __ESIMD_API std::enable_if_t<
5513  ((N == 8 || N == 16 || N == 32) && sizeof(T) == 4 &&
5514  detail::is_device_accessor_with_v<AccessorT,
5515  detail::accessor_mode_cap::can_read> &&
5516  std::is_integral_v<Toffset> && !std::is_same_v<Toffset, uint64_t>),
5517  simd<T, N * get_num_channels_enabled(RGBAMask)>>
5518 gather_rgba(AccessorT acc, simd<Toffset, N> offsets, uint64_t global_offset = 0,
5519  simd_mask<N> mask = 1) {
5520  return gather_rgba<RGBAMask, AccessorT, N, T>(acc, convert<uint64_t>(offsets),
5521  global_offset, mask);
5522 }
5523 #endif
5524 
/// Accessor-based RGBA scatter, enabled only when N is 8, 16 or 32, sizeof(T)
/// is 4 and the accessor is a writable device accessor. RGBAMask must pass
/// validate_rgba_write_channel_mask.
/// With __ESIMD_FORCE_STATELESS_MEM the accessor and \p global_offset are
/// turned into a pointer and the pointer-based scatter_rgba is used.
/// Otherwise the surface index of \p acc is obtained and
/// __esimd_scatter4_scaled is called with a scale of 0.
/// NOTE(review): the line carrying the function name and its leading
/// parameters is not visible in this listing - confirm against the original
/// header.
5539 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
5540  typename AccessorT, int N,
5541  typename T = typename AccessorT::value_type>
5542 __ESIMD_API
5543  std::enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4 &&
5544  detail::is_device_accessor_with_v<
5545  AccessorT, detail::accessor_mode_cap::can_write>>
5547  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
5548  detail::DeviceAccessorOffsetT global_offset = 0,
5549  simd_mask<N> mask = 1) {
5550  detail::validate_rgba_write_channel_mask<RGBAMask>();
5551 #ifdef __ESIMD_FORCE_STATELESS_MEM
5552  scatter_rgba<RGBAMask>(__ESIMD_DNS::accessorToPointer<T>(acc, global_offset),
5553  offsets, vals, mask);
5554 #else
5555  // TODO (performance) use hardware-supported scale once BE supports it
5556  constexpr uint32_t Scale = 0;
5557  const auto SI = get_surface_index(acc);
5558  __esimd_scatter4_scaled<T, N, decltype(SI), RGBAMask, Scale>(
5559  mask.data(), SI, global_offset, offsets.data(), vals.data());
5560 #endif
5561 }
5562 
5563 #ifdef __ESIMD_FORCE_STATELESS_MEM
/// Stateless-mode only: accessor-based RGBA scatter accepting offsets of any
/// integral element type other than uint64_t. The offsets are converted to
/// uint64_t and the call is forwarded to
/// scatter_rgba<RGBAMask, AccessorT, N, T>(acc, offsets, vals, global_offset,
/// mask).
5564 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
5565  typename AccessorT, int N,
5566  typename T = typename AccessorT::value_type, typename Toffset>
5567 __ESIMD_API std::enable_if_t<
5568  (N == 8 || N == 16 || N == 32) && sizeof(T) == 4 &&
5569  detail::is_device_accessor_with_v<AccessorT,
5570  detail::accessor_mode_cap::can_write> &&
5571  std::is_integral_v<Toffset> && !std::is_same_v<Toffset, uint64_t>>
5572 scatter_rgba(AccessorT acc, simd<Toffset, N> offsets,
5573  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
5574  uint64_t global_offset = 0, simd_mask<N> mask = 1) {
5575  scatter_rgba<RGBAMask, AccessorT, N, T>(acc, convert<uint64_t>(offsets), vals,
5576  global_offset, mask);
5577 }
5578 #endif
5580 
5581 namespace detail {
5582 
// Helper macro (defined only if not already defined) that expands to a
// static_assert requiring T to be float, sycl::half or double.
5583 #ifndef __ESIMD_FP_ATOMIC_OP_TYPE_CHECK
5584 #define __ESIMD_FP_ATOMIC_OP_TYPE_CHECK(T) \
5585  static_assert(is_type<T, float, sycl::half, double>(), \
5586  "float, double or sycl::half type is expected");
5587 #endif // __ESIMD_FP_ATOMIC_OP_TYPE_CHECK
5588 
/// Compile-time validation of the template arguments of an atomic operation.
/// Checks performed by the visible code:
///  - sizeof(T) must be greater than 1;
///  - unless IsLSC is true, N must satisfy detail::isPowerOf2(N, 32);
///  - NumSrc must equal __ESIMD_DNS::get_num_args<Op>();
///  - for the integer operation groups, T must be an integral type of at
///    least 16 bits;
///  - for signed/unsigned min/max, the signedness of T must match the
///    operation.
/// NOTE(review): several lines of the `if constexpr` conditions and of the
/// floating-point branch body are not visible in this listing - confirm the
/// full operation lists against the original header.
5591 template <__ESIMD_NS::atomic_op Op, typename T, int N, unsigned NumSrc,
5592  bool IsLSC = false>
5593 constexpr void check_atomic() {
5594 
5595  static_assert(sizeof(T) > 1, "Unsupported data type");
5596 
5597  // LSC atomic operation is supported for any width.
5598  if constexpr (!IsLSC)
5599  static_assert((detail::isPowerOf2(N, 32)),
5600  "Execution size 1, 2, 4, 8, 16, 32 are supported");
5601 
5602  static_assert(NumSrc == __ESIMD_DNS::get_num_args<Op>(),
5603  "Wrong number of operands");
// True for integral types that are at least as wide as uint16_t.
5604  constexpr bool IsInt2BytePlus =
5605  std::is_integral_v<T> && (sizeof(T) >= sizeof(uint16_t));
5606 
5607  if constexpr (Op == __ESIMD_NS::atomic_op::xchg ||
5608  Op == __ESIMD_NS::atomic_op::cmpxchg ||
5609  Op == __ESIMD_NS::atomic_op::inc ||
5611 
5612  static_assert(IsInt2BytePlus, "Integral 16-bit or wider type is expected");
5613  }
5614  // FP ops (are always delegated to native::lsc::<Op>)
5615  if constexpr (Op == __ESIMD_NS::atomic_op::fmax ||
5617  Op == __ESIMD_NS::atomic_op::fadd ||
5618  Op == __ESIMD_NS::atomic_op::fsub ||
5619  Op == __ESIMD_NS::atomic_op::fcmpxchg) {
5621  }
5622  if constexpr (Op == __ESIMD_NS::atomic_op::add ||
5623  Op == __ESIMD_NS::atomic_op::sub ||
5629  Op == __ESIMD_NS::atomic_op::smin ||
5630  Op == __ESIMD_NS::atomic_op::smax) {
5631  static_assert(IsInt2BytePlus, "Integral 16-bit or wider type is expected");
5632  constexpr bool IsSignedMinmax = (Op == __ESIMD_NS::atomic_op::smin) ||
5633  (Op == __ESIMD_NS::atomic_op::smax);
5634  constexpr bool IsUnsignedMinmax = (Op == __ESIMD_NS::atomic_op::umin) ||
5636 
5637  if constexpr (IsSignedMinmax || IsUnsignedMinmax) {
// Signed min/max requires a signed T; unsigned min/max requires an unsigned T.
5638  constexpr bool SignOK = std::is_signed_v<T> == IsSignedMinmax;
5639  static_assert(SignOK, "Signed/unsigned integer type expected for "
5640  "signed/unsigned min/max operation");
5641  }
5642  }
5643 }
5644 #undef __ESIMD_FP_ATOMIC_OP_TYPE_CHECK
5645 } // namespace detail
5646 
5649 
5667 template <int SLMAmount> class slm_allocator {
5668  int offset;
5669 
5670 public:
5672  slm_allocator() { offset = __esimd_slm_alloc(SLMAmount); }
5673 
5675  ESIMD_INLINE int get_offset() const { return offset; }
5676 
5678  ~slm_allocator() { __esimd_slm_free(offset); }
5679 };
5680 
// Compile-time-size variant: forwards the template constant SLMSize to the
// __esimd_slm_init() intrinsic.
// NOTE(review): presumably this declares the SLM size used by the kernel; the
// unit of SLMSize is not visible in this extract - confirm against the
// upstream documentation.
5693 template <uint32_t SLMSize> __ESIMD_API void slm_init() {
5694  __esimd_slm_init(SLMSize);
5695 }
5696 
// Run-time-size variant: forwards `size` to the __esimd_slm_init() intrinsic.
// NOTE(review): the line below is the only surviving fragment of the original
// comment; it appears to state a restriction on combining this overload with
// esimd::slm_allocator - confirm against the upstream header.
5702 // with esimd::slm_allocator() class.
5705 __ESIMD_API void slm_init(uint32_t size) { __esimd_slm_init(size); }
5706 
5743 
5749 #ifndef __ESIMD_GATHER_SCATTER_LLVM_IR
5752 #endif // __ESIMD_GATHER_SCATTER_LLVM_IR
// Masked SLM gather with explicit pass-through values.
// Template parameters: T - element type, N - number of returned elements,
// VS - elements per offset (N must be divisible by VS), PropertyListT - a
// property list (enforced by enable_if).
// The body uses `byte_offsets`, `mask`, `pass_thru` and `props`.
// NOTE(review): listing line 5778 (function name and the first parameters) is
// absent from this extract; the types of `byte_offsets` and `mask` are
// therefore not visible here.
5773 template <
5774  typename T, int N, int VS,
5775  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
5776 __ESIMD_API std::enable_if_t<
5777  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
5779  simd<T, N> pass_thru, PropertyListT props = {}) {
5780  static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS");
5781 
 // Alignment comes from the alignment_key property; sizeof(T) is passed as the
 // fallback value.
5782  constexpr size_t Alignment =
5783  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
5784  static_assert(Alignment >= sizeof(T),
5785  "slm_gather() requires at least element-size alignment");
5786 
5787  // Use LSC lowering if VS > 1. Also, if masked gather is
5788  // not available, then LSC is the only lowering option.
5789  if constexpr (VS > 1 || !detail::isMaskedGatherScatterLLVMAvailable()) {
5790  return __ESIMD_DNS::slm_gather_impl<T, VS,
 // NOTE(review): listing line 5791 (the remaining template argument(s) and the
 // opening parenthesis of this call) is absent from this extract.
5792  byte_offsets, mask, pass_thru);
5793  } else {
5794  if constexpr (sizeof(T) == 8) {
 // 64-bit elements are gathered as two 32-bit halves: the dwords at
 // byte_offsets go to the even uint32_t lanes of Res, the dwords at
 // byte_offsets + 4 go to the odd lanes. pass_thru is split the same way.
5795  simd<T, N> Res;
5796  Res.template bit_cast_view<uint32_t>().template select<N, 2>(0) =
5797  __esimd_slm_gather_ld<uint32_t, N, Alignment>(
5798  byte_offsets.data(), mask.data(),
5799  (pass_thru.template bit_cast_view<uint32_t>()
5800  .template select<N, 2>(0))
5801  .data());
 // The second half is only guaranteed dword alignment, hence
 // sizeof(uint32_t) as the alignment argument.
5802  simd<uint32_t, N / VS> Offset = byte_offsets + sizeof(uint32_t);
5803  Res.template bit_cast_view<uint32_t>().template select<N, 2>(1) =
5804  __esimd_slm_gather_ld<uint32_t, N, sizeof(uint32_t)>(
5805  Offset.data(), mask.data(),
5806  (pass_thru.template bit_cast_view<uint32_t>()
5807  .template select<N, 2>(1))
5808  .data());
5809  return Res;
5810  } else {
 // Other element sizes are gathered directly using the raw element type.
5811  using MsgT = detail::__raw_t<T>;
5812  return __esimd_slm_gather_ld<MsgT, N, Alignment>(
5813  byte_offsets.data(), mask.data(), pass_thru.data());
5814  }
5815  }
5816 }
5817 
// Masked SLM gather without pass-through values: the pass-through operands
// handed to the lower-level calls are deliberately left uninitialized.
// Template parameters: T - element type, N - number of returned elements,
// VS - elements per offset, PropertyListT - a property list.
// NOTE(review): listing line 5846 (function name and the leading parameters)
// is absent from this extract; the body uses `byte_offsets` and `mask`.
5841 template <
5842  typename T, int N, int VS,
5843  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
5844 __ESIMD_API std::enable_if_t<
5845  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
5847  PropertyListT props = {}) {
 // Alignment comes from the alignment_key property; sizeof(T) is the fallback.
5848  constexpr size_t Alignment =
5849  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
5850  static_assert(Alignment >= sizeof(T),
5851  "slm_gather() requires at least element-size alignment");
5852 
 // First choice: the slm_gather_impl() lowering.
5853  if constexpr (VS > 1 || (!detail::isPowerOf2(N, 32) &&
 // NOTE(review): listing line 5854 (the rest of this condition and the opening
 // brace) is absent from this extract.
5855  simd<T, N> PassThru; // Intentionally undefined
5856  return detail::slm_gather_impl<T, VS, detail::lsc_data_size::default_size>(
5857  byte_offsets, mask, PassThru);
5858  } else if constexpr (detail::isMaskedGatherScatterLLVMAvailable()) {
5859  if constexpr (sizeof(T) == 8) {
 // 64-bit elements: two 32-bit gathers, low halves into the even uint32_t
 // lanes of Res and high halves (offset + 4) into the odd lanes.
5860  simd<T, N> Res;
5861  simd<uint32_t, N> PassThru; // it is intentionally undefined
5862 
5863  Res.template bit_cast_view<uint32_t>().template select<N, 2>(0) =
5864  __esimd_slm_gather_ld<uint32_t, N, Alignment>(
5865  byte_offsets.data(), mask.data(), PassThru.data());
5866  simd<uint32_t, N / VS> Offset = byte_offsets + sizeof(uint32_t);
5867  Res.template bit_cast_view<uint32_t>().template select<N, 2>(1) =
5868  __esimd_slm_gather_ld<uint32_t, N, sizeof(uint32_t)>(
5869  Offset.data(), mask.data(), PassThru.data());
5870  return Res;
5871  } else {
5872  using MsgT = detail::__raw_t<T>;
5873  simd<MsgT, N> PassThru; // it is intentionally undefined
5874  return __esimd_slm_gather_ld<MsgT, N, Alignment>(
5875  byte_offsets.data(), mask.data(), PassThru.data());
5876  }
5877  } else {
 // Fallback: accessor-based gather implementation driven by a
 // LocalAccessorMarker object, with a global offset of 0.
5878  detail::LocalAccessorMarker acc;
5879  return detail::gather_impl<T, N>(acc, byte_offsets, 0, mask);
5880  }
5881 }
5882 
5900 template <
5901  typename T, int N, int VS,
5902  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
5903 __ESIMD_API std::enable_if_t<
5904  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
5905 slm_gather(simd<uint32_t, N / VS> byte_offsets, PropertyListT props = {}) {
5906  simd_mask<N / VS> Mask = 1;
5907  return slm_gather<T, N, VS>(byte_offsets, Mask, props);
5908 }
5909 
// Masked SLM gather with pass-through values for the VS == 1 case: fixes VS
// to 1 and forwards to slm_gather<T, N, VS>(byte_offsets, mask, pass_thru,
// props).
// NOTE(review): listing line 5937 (function name and the leading parameters)
// is absent from this extract; the body uses `byte_offsets` and `mask`.
5932 template <
5933  typename T, int N,
5934  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
5935 __ESIMD_API std::enable_if_t<
5936  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
5938  simd<T, N> pass_thru, PropertyListT props = {}) {
5939  constexpr int VS = 1;
5940  return slm_gather<T, N, VS>(byte_offsets, mask, pass_thru, props);
5941 }
5942 
// Masked SLM gather for the VS == 1 case: fixes VS to 1 and forwards to
// slm_gather<T, N, VS>(byte_offsets, mask, props).
// NOTE(review): listing line 5967 (function name and the leading parameters)
// is absent from this extract; the body uses `byte_offsets` and `mask`.
5962 template <
5963  typename T, int N,
5964  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
5965 __ESIMD_API std::enable_if_t<
5966  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
5968  PropertyListT props = {}) {
5969  constexpr int VS = 1;
5970  return slm_gather<T, N, VS>(byte_offsets, mask, props);
5971 }
5972 
5987 template <
5988  typename T, int N,
5989  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
5990 __ESIMD_API std::enable_if_t<
5991  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
5992 slm_gather(simd<uint32_t, N> byte_offsets, PropertyListT props = {}) {
5993  constexpr int VS = 1;
5994  return slm_gather<T, N, VS>(byte_offsets, props);
5995 }
5996 
6023 template <
6024  typename T, int N, int VS = 1, typename OffsetSimdViewT,
6025  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
6026 __ESIMD_API std::enable_if_t<
6027  detail::is_simd_view_type_v<OffsetSimdViewT> &&
6028  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
6029  simd<T, N>>
6030 slm_gather(OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
6031  simd<T, N> pass_thru, PropertyListT props = {}) {
6032  return slm_gather<T, N, VS>(byte_offsets.read(), mask, pass_thru, props);
6033 }
6034 
6061 template <
6062  int VS, typename T, int N, typename OffsetSimdViewT,
6063  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
6064 __ESIMD_API std::enable_if_t<
6065  (detail::is_simd_view_type_v<OffsetSimdViewT> &&
6066  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
6067  simd<T, N>>
6068 slm_gather(OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
6069  simd<T, N> pass_thru, PropertyListT props = {}) {
6070  static_assert(N / VS ==
6071  OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
6072  "Size of pass_thru parameter must correspond to the size of "
6073  "byte_offsets parameter.");
6074  return slm_gather<T, N, VS>(byte_offsets.read(), mask, pass_thru, props);
6075 }
6076 
6106 template <
6107  int VS = 1, typename OffsetSimdViewT, typename PassThruSimdViewT,
6108  int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(),
6109  typename T = PassThruSimdViewT::value_type::element_type,
6110  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
6111 __ESIMD_API std::enable_if_t<
6112  (detail::is_simd_view_type_v<OffsetSimdViewT> &&
6113  detail::is_simd_view_type_v<PassThruSimdViewT> &&
6114  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
6115  simd<T, N>>
6116 slm_gather(OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
6117  PassThruSimdViewT pass_thru, PropertyListT props = {}) {
6118  static_assert(N / VS ==
6119  OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
6120  "Size of pass_thru parameter must correspond to the size of "
6121  "byte_offsets parameter.");
6122  return slm_gather<T, N, VS>(byte_offsets.read(), mask, pass_thru.read(),
6123  props);
6124 }
6125 
// Masked SLM gather taking the pass-through values as a simd_view: the view is
// materialized with read() and the call is forwarded to
// slm_gather<T, N, VS>(byte_offsets, mask, pass_thru, props).
// N and T default to the size and element type of the pass-through view.
// NOTE(review): listing line 6164 (function name and the leading parameters)
// is absent from this extract; the body uses `byte_offsets` and `mask`.
// NOTE(review): the default argument for T names a dependent type without the
// 'typename' keyword, which is only well-formed from C++20 on.
6155 template <
6156  int VS = 1, typename PassThruSimdViewT,
6157  int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(),
6158  typename T = PassThruSimdViewT::value_type::element_type,
6159  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
6160 __ESIMD_API std::enable_if_t<
6161  (detail::is_simd_view_type_v<PassThruSimdViewT> &&
6162  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
6163  simd<T, N>>
6165  PassThruSimdViewT pass_thru, PropertyListT props = {}) {
6166  return slm_gather<T, N, VS>(byte_offsets, mask, pass_thru.read(), props);
6167 }
6168 
6190 template <
6191  typename T, int N, int VS = 1, typename OffsetSimdViewT,
6192  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
6193 __ESIMD_API std::enable_if_t<
6194  detail::is_simd_view_type_v<OffsetSimdViewT> &&
6195  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
6196  simd<T, N>>
6197 slm_gather(OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
6198  PropertyListT props = {}) {
6199  return slm_gather<T, N, VS>(byte_offsets.read(), mask, props);
6200 }
6201 
6218 template <
6219  typename T, int N, int VS = 1, typename OffsetSimdViewT,
6220  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
6221 __ESIMD_API std::enable_if_t<
6222  detail::is_simd_view_type_v<OffsetSimdViewT> &&
6223  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
6224  simd<T, N>>
6225 slm_gather(OffsetSimdViewT byte_offsets, PropertyListT props = {}) {
6226  return slm_gather<T, N, VS>(byte_offsets.read(), props);
6227 }
6228 
6234 template <typename T> __ESIMD_API T slm_scalar_load(uint32_t offset) {
6235  const simd<T, 1> Res = slm_gather<T, 1>(simd<uint32_t, 1>(offset));
6236  return Res[0];
6237 }
6238 
6256 
// Masked SLM scatter.
// Template parameters: T - element type, N - number of stored elements,
// VS - elements per offset (N must be divisible by VS), PropertyListT - a
// property list. Returns void (enable_if_t without a type argument).
// NOTE(review): listing line 6284 (function name and the leading parameters)
// is absent from this extract; the body uses `byte_offsets`, `vals` and
// `mask`.
6279 template <
6280  typename T, int N, int VS = 1,
6281  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
6282 __ESIMD_API std::enable_if_t<
6283  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
6285  simd_mask<N / VS> mask, PropertyListT props = {}) {
6286  static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS");
6287 
 // Alignment comes from the alignment_key property; sizeof(T) is the fallback.
6288  constexpr size_t Alignment =
6289  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
6290  static_assert(Alignment >= sizeof(T),
6291  "slm_scatter() requires at least element-size alignment");
6292 
6293  // Use LSC lowering if VS > 1.
6294  if constexpr (VS > 1 || (!detail::isPowerOf2(N, 32) &&
 // NOTE(review): listing line 6295 (the rest of this condition and the opening
 // brace) is absent from this extract.
6296  __ESIMD_DNS::slm_scatter_impl<T, VS, detail::lsc_data_size::default_size>(
6297  byte_offsets, vals, mask);
6298  } else if constexpr (detail::isMaskedGatherScatterLLVMAvailable()) {
6299  if constexpr (sizeof(T) == 8) {
 // 64-bit elements are stored as two 32-bit halves: the even uint32_t lanes
 // of vals go to byte_offsets, the odd lanes to byte_offsets + 4.
6300  __esimd_slm_scatter_st<uint32_t, N, Alignment>(
6301  vals.template bit_cast_view<uint32_t>()
6302  .template select<N, 2>(0)
6303  .data(),
6304  byte_offsets.data(), mask.data());
 // The second half is only guaranteed dword alignment.
6305  simd<uint32_t, N / VS> Offset = byte_offsets + sizeof(uint32_t);
6306  __esimd_slm_scatter_st<uint32_t, N, sizeof(uint32_t)>(
6307  vals.template bit_cast_view<uint32_t>()
6308  .template select<N, 2>(1)
6309  .data(),
6310  Offset.data(), mask.data());
6311 
6312  } else {
 // Other element sizes: store directly, bit-casting to the raw element type.
6313  using MsgT = detail::__raw_t<T>;
6314  __esimd_slm_scatter_st<MsgT, N, Alignment>(
6315  sycl::bit_cast<__ESIMD_DNS::vector_type_t<MsgT, N>>(vals.data()),
6316  byte_offsets.data(), mask.data());
6317  }
6318  } else {
 // Fallback: accessor-based scatter implementation driven by a
 // LocalAccessorMarker object, with a global offset of 0.
6319  detail::LocalAccessorMarker acc;
6320  detail::scatter_impl<T, N>(acc, vals, byte_offsets, 0, mask);
6321  }
6322 }
6323 
// Unmasked SLM scatter: builds a mask initialized from 1 and forwards to
// slm_scatter<T, N, VS>(byte_offsets, vals, Mask, props).
// NOTE(review): listing line 6345 (function name and the leading parameters)
// is absent from this extract; the body uses `byte_offsets` and `vals`.
6340 template <
6341  typename T, int N, int VS = 1,
6342  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
6343 __ESIMD_API std::enable_if_t<
6344  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
6346  PropertyListT props = {}) {
6347  simd_mask<N / VS> Mask = 1;
6348  slm_scatter<T, N, VS>(byte_offsets, vals, Mask, props);
6349 }
6350 
6374 template <
6375  typename T, int N, int VS = 1, typename OffsetSimdViewT,
6376  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
6377 __ESIMD_API std::enable_if_t<
6378  detail::is_simd_view_type_v<OffsetSimdViewT> &&
6379  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
6380 slm_scatter(OffsetSimdViewT byte_offsets, simd<T, N> vals,
6381  simd_mask<N / VS> mask, PropertyListT props = {}) {
6382  slm_scatter<T, N, VS>(byte_offsets.read(), vals, mask, props);
6383 }
6384 
6400 template <
6401  typename T, int N, int VS = 1, typename OffsetSimdViewT,
6402  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
6403 __ESIMD_API std::enable_if_t<
6404  detail::is_simd_view_type_v<OffsetSimdViewT> &&
6405  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
6406 slm_scatter(OffsetSimdViewT byte_offsets, simd<T, N> vals,
6407  PropertyListT props = {}) {
6408  return slm_scatter<T, N, VS>(byte_offsets.read(), vals, props);
6409 }
6410 
6428 template <
6429  int VS, typename T, int N, typename OffsetSimdViewT,
6430  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
6431 __ESIMD_API std::enable_if_t<
6432  detail::is_simd_view_type_v<OffsetSimdViewT> &&
6433  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
6434 slm_scatter(OffsetSimdViewT byte_offsets, simd<T, N> vals,
6435  simd_mask<N / VS> mask, PropertyListT props = {}) {
6436  static_assert(N / VS ==
6437  OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
6438  "Size of vals parameter must correspond to the size of "
6439  "byte_offsets parameter.");
6440  slm_scatter<T, N, VS>(byte_offsets.read(), vals, mask, props);
6441 }
6442 
6460 template <
6461  int VS, typename T, int N, typename OffsetSimdViewT,
6462  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
6463 __ESIMD_API std::enable_if_t<
6464  detail::is_simd_view_type_v<OffsetSimdViewT> &&
6465  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
6466 slm_scatter(OffsetSimdViewT byte_offsets, simd<T, N> vals,
6467  PropertyListT props = {}) {
6468  static_assert(N / VS ==
6469  OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
6470  "Size of vals parameter must correspond to the size of "
6471  "byte_offsets parameter.");
6472  slm_scatter<T, N, VS>(byte_offsets.read(), vals, props);
6473 }
6474 
6495 template <
6496  int VS = 1, typename ValuesSimdViewT, typename OffsetSimdViewT,
6497  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
6498  typename T = ValuesSimdViewT::value_type::element_type,
6499  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
6500 __ESIMD_API std::enable_if_t<
6501  detail::is_simd_view_type_v<OffsetSimdViewT> &&
6502  detail::is_simd_view_type_v<ValuesSimdViewT> &&
6503  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
6504 slm_scatter(OffsetSimdViewT byte_offsets, ValuesSimdViewT vals,
6505  simd_mask<N / VS> mask, PropertyListT props = {}) {
6506  static_assert(N / VS ==
6507  OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
6508  "Size of vals parameter must correspond to the size of "
6509  "byte_offsets parameter.");
6510  slm_scatter<T, N, VS>(byte_offsets.read(), vals.read(), mask, props);
6511 }
6512 
6532 template <
6533  int VS = 1, typename ValuesSimdViewT, typename OffsetSimdViewT,
6534  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
6535  typename T = ValuesSimdViewT::value_type::element_type,
6536  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
6537 __ESIMD_API std::enable_if_t<
6538  detail::is_simd_view_type_v<OffsetSimdViewT> &&
6539  detail::is_simd_view_type_v<ValuesSimdViewT> &&
6540  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
6541 slm_scatter(OffsetSimdViewT byte_offsets, ValuesSimdViewT vals,
6542  PropertyListT props = {}) {
6543  static_assert(N / VS ==
6544  OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
6545  "Size of vals parameter must correspond to the size of "
6546  "byte_offsets parameter.");
6547  slm_scatter<T, N, VS>(byte_offsets.read(), vals.read(), props);
6548 }
6549 
6570 template <
6571  int VS = 1, typename ValuesSimdViewT, typename OffsetT,
6572  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
6573  typename T = ValuesSimdViewT::value_type::element_type,
6574  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
6575 __ESIMD_API std::enable_if_t<
6576  detail::is_simd_view_type_v<ValuesSimdViewT> &&
6577  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
6578 slm_scatter(simd<OffsetT, N / VS> byte_offsets, ValuesSimdViewT vals,
6579  simd_mask<N / VS> mask, PropertyListT props = {}) {
6580  slm_scatter<T, N, VS>(byte_offsets, vals.read(), mask, props);
6581 }
6582 
6602 template <
6603  int VS = 1, typename ValuesSimdViewT, typename OffsetT,
6604  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
6605  typename T = ValuesSimdViewT::value_type::element_type,
6606  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
6607 __ESIMD_API std::enable_if_t<
6608  detail::is_simd_view_type_v<ValuesSimdViewT> &&
6609  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
6610 slm_scatter(simd<OffsetT, N / VS> byte_offsets, ValuesSimdViewT vals,
6611  PropertyListT props = {}) {
6612  slm_scatter<T, N, VS>(byte_offsets, vals.read(), props);
6613 }
6614 
6620 template <typename T>
6621 __ESIMD_API void slm_scalar_store(uint32_t offset, T val) {
6622  slm_scatter<T, 1>(simd<uint32_t, 1>(offset), simd<T, 1>(val), 1);
6623 }
6624 
// Masked RGBA-channel gather; enabled only for N of 8, 16 or 32 and 4-byte T.
// The result holds N elements per channel enabled in RGBAMask. The load is
// issued through __esimd_gather4_masked_scaled2() with a global offset of 0.
// NOTE(review): listing lines 6638-6639 (function name, parameters and the
// definition of `SI`) are absent from this extract; the body uses `offsets`
// and `mask`.
6635 template <typename T, int N, rgba_channel_mask RGBAMask>
6636 __ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && (sizeof(T) == 4),
6637  simd<T, N * get_num_channels_enabled(RGBAMask)>>
6640  return __esimd_gather4_masked_scaled2<T, N, RGBAMask>(
6641  SI, 0 /*global_offset*/, offsets.data(), mask.data());
6642 }
6643 
// Masked RGBA-channel scatter; enabled only for N of 8, 16 or 32 and 4-byte T.
// `vals` holds N elements per channel enabled in Mask. The channel mask is
// validated for writes at compile time, then the store is issued through
// __esimd_scatter4_scaled() with scale 0 and global offset 0.
// NOTE(review): listing line 6656 (function name and the first parameter) and
// line 6660 (the definition of `si`) are absent from this extract; the body
// uses `offsets`.
6654 template <typename T, int N, rgba_channel_mask Mask>
6655 __ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && (sizeof(T) == 4)>
6657  simd<T, N * get_num_channels_enabled(Mask)> vals,
6658  simd_mask<N> mask = 1) {
6659  detail::validate_rgba_write_channel_mask<Mask>();
6661  constexpr int16_t Scale = 0;
6662  constexpr int global_offset = 0;
6663  __esimd_scatter4_scaled<T, N, decltype(si), Mask, Scale>(
6664  mask.data(), si, global_offset, offsets.data(), vals.data());
6665 }
6666 
// SLM block load selected by a simd flag type: the alignment is taken from
// Flags::alignment<simd<T, N>> and the load is issued through
// __esimd_slm_block_ld() on the raw element type. The Flags argument is used
// only for its type.
// NOTE(review): listing line 6683 (the remainder of the template parameter
// list, which must declare `Flags`) is absent from this extract.
6682 template <typename T, int N,
6684 __ESIMD_API std::enable_if_t<is_simd_flag_type_v<Flags>, simd<T, N>>
6685 slm_block_load(uint32_t byte_offset, Flags) {
6686  constexpr size_t Align = Flags::template alignment<simd<T, N>>;
6687  return __esimd_slm_block_ld<detail::__raw_t<T>, N, Align>(byte_offset);
6688 }
6689 
6698 
6705 
6723 
6738 template <
6739  typename T, int N,
6740  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
6741 __ESIMD_API std::enable_if_t<
6742  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
6743 slm_block_load(uint32_t byte_offset, PropertyListT props = {}) {
6744  constexpr size_t DefaultAlignment = detail::OperandSize::OWORD;
6745  constexpr size_t Alignment =
6746  detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
6747  return __esimd_slm_block_ld<detail::__raw_t<T>, N, Alignment>(byte_offset);
6748 }
6749 
// Predicated SLM block load, issued as a single-lane transposed LSC load
// (__esimd_lsc_load_slm with NLanes == 1).
// Parameters: byte_offset - offset of the block; pred - one-element predicate
// forwarded to the intrinsic; props - optional property list (alignment_key
// is consulted).
// Returns the loaded data viewed as simd<T, N>.
6776 template <
6777  typename T, int N,
6778  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
6779 __ESIMD_API std::enable_if_t<
6780  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
6781 slm_block_load(uint32_t byte_offset, simd_mask<1> pred,
6782  PropertyListT props = {}) {
6783  // Verify input template arguments.
 // Fallback alignment: 4 bytes for elements up to 4 bytes, otherwise sizeof(T).
6784  constexpr size_t DefaultAlignment = sizeof(T) <= 4 ? 4 : sizeof(T);
6785  constexpr size_t Alignment =
6786  detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
6787  static_assert(
6788  (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
6789  (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
6790  "Incorrect alignment for the data type");
6791 
 // Number of T elements that fit into one 64-bit / 32-bit load element.
6792  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
6793  constexpr int SmallIntFactor32Bit =
6794  sizeof(uint32_t) / sizeof(T) > 1 ? sizeof(uint32_t) / sizeof(T) : 1;
6795  static_assert(N > 0 && N % SmallIntFactor32Bit == 0,
6796  "Number of elements is not supported by Transposed load");
6797 
6798  // If alignment >= 8 and (N * sizeof(T)) % 8 == 0) we can load QWORDs.
6799  // Don't do it for 4-byte vectors (unless it is greater than 256-bytes),
6800  // because it would require a bit-cast, which is supposed to be NO-OP, but
6801  // might confuse GPU BE sometimes. 1- and 2-byte vectors are casted anyways.
6802  constexpr bool Use64BitData =
6803  Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
6804  (N * sizeof(T)) % sizeof(uint64_t) == 0 &&
6805  (sizeof(T) != sizeof(uint32_t) || N * sizeof(T) > 256);
6806  constexpr int SmallIntFactor =
6807  Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
 // Number of load elements actually requested from the intrinsic.
6808  constexpr int FactoredN = N / SmallIntFactor;
6809  detail::check_lsc_vector_size<FactoredN>();
6810 
6811  // Prepare template arguments for the call of intrinsic.
 // T itself when no packing is needed, otherwise uint64_t or uint32_t.
6812  using LoadElemT = __ESIMD_DNS::__raw_t<
6813  std::conditional_t<SmallIntFactor == 1, T,
6814  std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
6815 
6816  constexpr uint16_t AddressScale = 1;
6817  constexpr int ImmOffset = 0;
6818  constexpr detail::lsc_data_size DS =
 // NOTE(review): listing line 6819 (the initializer of DS) is absent from this
 // extract.
6820  constexpr auto VS = detail::to_lsc_vector_size<FactoredN>();
6821  constexpr auto Transposed = detail::lsc_data_order::transpose;
6822  constexpr int NLanes = 1;
6823 
6824  // Prepare non-template arguments and call the intrinsic.
6825  simd<uint32_t, NLanes> Offsets = byte_offset;
 // NOTE(review): listing line 6826 (presumably the declaration of `Result`
 // that the call below initializes) is absent from this extract.
6827  __esimd_lsc_load_slm<LoadElemT, cache_hint::none, cache_hint::none,
6828  AddressScale, ImmOffset, DS, VS, Transposed, NLanes>(
6829  pred.data(), Offsets.data());
6830  return Result.template bit_cast_view<T>();
6831 }
6832 
// Predicated SLM block load with pass-through values, issued as a single-lane
// transposed LSC load (__esimd_lsc_load_merge_slm with NLanes == 1).
// Parameters: offset - offset of the block; pred - one-element predicate
// forwarded to the intrinsic; pass_thru - values forwarded to the intrinsic
// as its merge operand; props - optional property list (alignment_key is
// consulted).
// Returns the loaded data viewed as simd<T, N>.
6862 template <
6863  typename T, int N,
6864  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
6865 __ESIMD_API std::enable_if_t<
6866  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
6867 slm_block_load(uint32_t offset, simd_mask<1> pred, simd<T, N> pass_thru,
6868  PropertyListT props = {}) {
6869  // Verify input template arguments.
 // Fallback alignment: 4 bytes for elements up to 4 bytes, otherwise sizeof(T).
6870  constexpr size_t DefaultAlignment = sizeof(T) <= 4 ? 4 : sizeof(T);
6871  constexpr size_t Alignment =
6872  detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
6873  static_assert(
6874  (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
6875  (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
6876  "Incorrect alignment for the data type");
6877 
 // Number of T elements that fit into one 64-bit / 32-bit load element.
6878  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
6879  constexpr int SmallIntFactor32Bit =
6880  sizeof(uint32_t) / sizeof(T) > 1 ? sizeof(uint32_t) / sizeof(T) : 1;
6881  static_assert(N > 0 && N % SmallIntFactor32Bit == 0,
6882  "Number of elements is not supported by Transposed load");
6883 
6884  // If alignment >= 8 and (N * sizeof(T)) % 8 == 0) we can load QWORDs.
6885  // Don't do it for 4-byte vectors (unless it is greater than 256-bytes),
6886  // because it would require a bit-cast, which is supposed to be NO-OP, but
6887  // might confuse GPU BE sometimes. 1- and 2-byte vectors are casted anyways.
6888  constexpr bool Use64BitData =
6889  Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
6890  (N * sizeof(T)) % sizeof(uint64_t) == 0 &&
6891  (sizeof(T) != sizeof(uint32_t) || N * sizeof(T) > 256);
6892  constexpr int SmallIntFactor =
6893  Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
 // Number of load elements actually requested from the intrinsic.
6894  constexpr int FactoredN = N / SmallIntFactor;
6895  detail::check_lsc_vector_size<FactoredN>();
6896 
6897  // Prepare template arguments for the call of intrinsic.
 // T itself when no packing is needed, otherwise uint64_t or uint32_t.
6898  using LoadElemT = __ESIMD_DNS::__raw_t<
6899  std::conditional_t<SmallIntFactor == 1, T,
6900  std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
6901 
6902  constexpr uint16_t AddressScale = 1;
6903  constexpr int ImmOffset = 0;
6904  constexpr detail::lsc_data_size DS =
 // NOTE(review): listing line 6905 (the initializer of DS) is absent from this
 // extract.
6906  constexpr auto VS = detail::to_lsc_vector_size<FactoredN>();
6907  constexpr auto Transposed = detail::lsc_data_order::transpose;
6908  constexpr int NLanes = 1;
6909 
6910  // Prepare non-template arguments and call the intrinsic.
6911  simd<uint32_t, NLanes> Offsets = offset;
 // Re-view the pass-through values with the element type used for the load.
6912  simd<LoadElemT, FactoredN> PassThru =
6913  pass_thru.template bit_cast_view<LoadElemT>();
 // NOTE(review): listing line 6914 (presumably the declaration of `Result`
 // that the call below initializes) is absent from this extract.
6915  __esimd_lsc_load_merge_slm<LoadElemT, cache_hint::none, cache_hint::none,
6916  AddressScale, ImmOffset, DS, VS, Transposed,
6917  NLanes>(pred.data(), Offsets.data(),
6918  PassThru.data());
6919  return Result.template bit_cast_view<T>();
6920 }
6921 
6953 template <
6954  typename PassThruSimdViewT,
6955  typename T = PassThruSimdViewT::value_type::element_type,
6956  int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(),
6957  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
6958 __ESIMD_API std::enable_if_t<
6959  detail::is_simd_view_type_v<PassThruSimdViewT> &&
6960  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
6961  simd<T, N>>
6962 slm_block_load(uint32_t offset, simd_mask<1> pred, PassThruSimdViewT pass_thru,
6963  PropertyListT props = {}) {
6964  return slm_block_load<T, N>(offset, pred, pass_thru.read(), props);
6965 }
6966 
6990 template <
6991  typename T, int N, typename AccessorT,
6992  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
6993 __ESIMD_API std::enable_if_t<
6994  detail::is_local_accessor_with_v<AccessorT,
6995  detail::accessor_mode_cap::can_read> &&
6996  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
6997  simd<T, N>>
6998 block_load(AccessorT lacc, uint32_t byte_offset, PropertyListT props = {}) {
6999  byte_offset += detail::localAccessorToOffset(lacc);
7000  return slm_block_load<T, N>(byte_offset, props);
7001 }
7002 
7025 template <
7026  typename T, int N, typename AccessorT,
7027  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
7028 __ESIMD_API std::enable_if_t<
7029  detail::is_local_accessor_with_v<AccessorT,
7030  detail::accessor_mode_cap::can_read> &&
7031  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7032  simd<T, N>>
7033 block_load(AccessorT lacc, PropertyListT props = {}) {
7034  return slm_block_load<T, N>(detail::localAccessorToOffset(lacc), props);
7035 }
7036 
7064 template <
7065  typename T, int N, typename AccessorT,
7066  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
7067 __ESIMD_API std::enable_if_t<
7068  detail::is_local_accessor_with_v<AccessorT,
7069  detail::accessor_mode_cap::can_read> &&
7070  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7071  simd<T, N>>
7072 block_load(AccessorT lacc, uint32_t byte_offset, simd_mask<1> pred,
7073  PropertyListT props = {}) {
7074  byte_offset += detail::localAccessorToOffset(lacc);
7075  return slm_block_load<T, N>(byte_offset, pred, props);
7076 }
7077 
7103 template <
7104  typename T, int N, typename AccessorT,
7105  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
7106 __ESIMD_API std::enable_if_t<
7107  detail::is_local_accessor_with_v<AccessorT,
7108  detail::accessor_mode_cap::can_read> &&
7109  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7110  simd<T, N>>
7111 block_load(AccessorT lacc, simd_mask<1> pred, PropertyListT props = {}) {
7112  return slm_block_load<T, N>(detail::localAccessorToOffset(lacc), pred, props);
7113 }
7114 
7142 template <
7143  typename T, int N, typename AccessorT,
7144  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
7145 __ESIMD_API std::enable_if_t<
7146  detail::is_local_accessor_with_v<AccessorT,
7147  detail::accessor_mode_cap::can_read> &&
7148  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7149  simd<T, N>>
7150 block_load(AccessorT lacc, uint32_t byte_offset, simd_mask<1> pred,
7151  simd<T, N> pass_thru, PropertyListT props = {}) {
7152  byte_offset += __ESIMD_DNS::localAccessorToOffset(lacc);
7153  return slm_block_load<T, N>(byte_offset, pred, pass_thru, props);
7154 }
7155 
7185 template <
7186  typename PassThruSimdViewT,
7187  typename T = PassThruSimdViewT::value_type::element_type,
7188  int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(),
7189  typename AccessorT,
7190  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
7191 __ESIMD_API std::enable_if_t<
7192  detail::is_simd_view_type_v<PassThruSimdViewT> &&
7193  detail::is_local_accessor_with_v<AccessorT,
7194  detail::accessor_mode_cap::can_read> &&
7195  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7196  simd<T, N>>
7197 block_load(AccessorT lacc, uint32_t byte_offset, simd_mask<1> pred,
7198  PassThruSimdViewT pass_thru, PropertyListT props = {}) {
7199  return block_load<T, N>(lacc, byte_offset, pred, pass_thru.read(), props);
7200 }
7201 
7228 template <
7229  typename T, int N, typename AccessorT,
7230  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
7231 __ESIMD_API std::enable_if_t<
7232  detail::is_local_accessor_with_v<AccessorT,
7233  detail::accessor_mode_cap::can_read> &&
7234  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7235  simd<T, N>>
7236 block_load(AccessorT lacc, simd_mask<1> pred, simd<T, N> pass_thru,
7237  PropertyListT props = {}) {
7238  return slm_block_load<T, N>(__ESIMD_DNS::localAccessorToOffset(lacc), pred,
7239  pass_thru, props);
7240 }
7241 
7270 template <
7271  typename PassThruSimdViewT,
7272  typename T = PassThruSimdViewT::value_type::element_type,
7273  int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(),
7274  typename AccessorT,
7275  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
7276 __ESIMD_API std::enable_if_t<
7277  detail::is_simd_view_type_v<PassThruSimdViewT> &&
7278  detail::is_local_accessor_with_v<AccessorT,
7279  detail::accessor_mode_cap::can_read> &&
7280  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7281  simd<T, N>>
7282 block_load(AccessorT lacc, simd_mask<1> pred, PassThruSimdViewT pass_thru,
7283  PropertyListT props = {}) {
7284  return block_load<T, N>(lacc, pred, pass_thru.read(), props);
7285 }
7286 
7302 template <typename T, int N, typename Flags>
7303 __ESIMD_API std::enable_if_t<is_simd_flag_type_v<Flags>>
7304 slm_block_store(uint32_t offset, simd<T, N> vals, Flags) {
7305  constexpr size_t Align = Flags::template alignment<simd<T, N>>;
7306  __esimd_slm_block_st<detail::__raw_t<T>, N, Align>(offset, vals.data());
7307 }
7308 
7316 
// Predicated block store of `vals` at `byte_offset` through the
// __esimd_lsc_store_slm intrinsic.
//
// Template parameters:
//   T, N          - element type and element count of `vals`.
//   PropertyListT - compile-time property list; only alignment_key is read
//                   here (via detail::getPropertyValue).
// Parameters:
//   byte_offset - offset placed into the single-lane offsets vector.
//   vals        - values to store; bit-cast to a vector of StoreElemT.
//   pred        - single-element predicate forwarded to the intrinsic.
//   props       - not referenced in the body; carries PropertyListT.
//
// The data is stored either as T, as 32-bit or as 64-bit elements
// (StoreElemT), depending on sizeof(T), N and the requested alignment.
7364 template <
7365  typename T, int N,
7366  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
7367 __ESIMD_API std::enable_if_t<
7368  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
7369 slm_block_store(uint32_t byte_offset, simd<T, N> vals, simd_mask<1> pred,
7370  PropertyListT props = {}) {
7371  // Verify input template arguments.
// Default alignment is 4 bytes for element types up to 4 bytes wide, and
// sizeof(T) otherwise.
7372  constexpr size_t DefaultAlignment = sizeof(T) <= 4 ? 4 : sizeof(T);
7373  constexpr size_t Alignment =
7374  detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
7375  static_assert(
7376  (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
7377  (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
7378  "Incorrect alignment for the data type");
7379 
// Number of T elements that fit into one 64-bit / 32-bit store element
// (the 32-bit factor is clamped to at least 1).
7380  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
7381  constexpr int SmallIntFactor32Bit =
7382  sizeof(uint32_t) / sizeof(T) > 1 ? sizeof(uint32_t) / sizeof(T) : 1;
7383 
7384  static_assert(N > 0 && N % SmallIntFactor32Bit == 0,
7385  "Number of elements is not supported by Transposed store");
7386 
7387  // If alignment >= 8 and (N * sizeof(T)) % 8 == 0) we can store QWORDs.
7388  // Don't do it for 4-byte vectors (unless it is greater than 256-bytes),
7389  // because it would require a bit-cast, which is supposed to be NO-OP, but
7390  // might confuse GPU BE sometimes. 1- and 2-byte vectors are casted anyways.
7391  constexpr bool Use64BitData =
7392  Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
7393  (N * sizeof(T)) % sizeof(uint64_t) == 0 &&
7394  (sizeof(T) != sizeof(uint32_t) || N * sizeof(T) > 256);
7395  constexpr int SmallIntFactor =
7396  Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
// FactoredN is the element count after packing into StoreElemT.
7397  constexpr int FactoredN = N / SmallIntFactor;
7398  detail::check_lsc_vector_size<FactoredN>();
7399 
7400  // Prepare template arguments for the call of intrinsic.
7401  using StoreElemT = __ESIMD_DNS::__raw_t<
7402  std::conditional_t<SmallIntFactor == 1, T,
7403  std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
7404 
7405  constexpr uint16_t AddressScale = 1;
7406  constexpr int ImmOffset = 0;
// NOTE(review): the listing jumps from line 7407 to 7409, so the
// initializer expression of DS is not visible in this listing.
7407  constexpr detail::lsc_data_size DS =
7409  constexpr auto VS = detail::to_lsc_vector_size<FactoredN>();
7410  constexpr auto Transposed = detail::lsc_data_order::transpose;
7411  constexpr int NLanes = 1;
7412 
7413  // Prepare non-template arguments and call the intrinsic.
7414  simd<uint32_t, NLanes> Offsets = byte_offset;
7415  __esimd_lsc_store_slm<StoreElemT, cache_hint::none, cache_hint::none,
7416  AddressScale, ImmOffset, DS, VS, Transposed, NLanes>(
7417  pred.data(), Offsets.data(),
7418  sycl::bit_cast<__ESIMD_DNS::vector_type_t<StoreElemT, FactoredN>>(
7419  vals.data()));
7420 }
7421 
7437 template <
7438  typename T, int N,
7439  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
7440 __ESIMD_API std::enable_if_t<
7441  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
7442 slm_block_store(uint32_t byte_offset, simd<T, N> vals,
7443  PropertyListT props = {}) {
7444  constexpr size_t DefaultAlignment = detail::OperandSize::OWORD;
7445  constexpr size_t Alignment =
7446  detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
7447  using StoreElemT = detail::__raw_t<T>;
7448  __esimd_slm_block_st<StoreElemT, N, Alignment>(
7449  byte_offset,
7450  sycl::bit_cast<__ESIMD_DNS::vector_type_t<StoreElemT, N>>(vals.data()));
7451 }
7452 
7480 template <
7481  typename ValuesSimdViewT,
7482  typename T = ValuesSimdViewT::value_type::element_type,
7483  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
7484  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
7485 __ESIMD_API std::enable_if_t<
7486  detail::is_simd_view_type_v<ValuesSimdViewT> &&
7487  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
7488 slm_block_store(uint32_t byte_offset, ValuesSimdViewT vals, simd_mask<1> pred,
7489  PropertyListT props = {}) {
7490  slm_block_store<T, N>(byte_offset, vals.read(), pred, props);
7491 }
7492 
7510 template <
7511  typename ValuesSimdViewT,
7512  typename T = ValuesSimdViewT::value_type::element_type,
7513  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
7514  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
7515 __ESIMD_API std::enable_if_t<
7516  detail::is_simd_view_type_v<ValuesSimdViewT> &&
7517  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
7518 slm_block_store(uint32_t byte_offset, ValuesSimdViewT vals,
7519  PropertyListT props = {}) {
7520  slm_block_store<T, N>(byte_offset, vals.read(), props);
7521 }
7522 
7539 template <
7540  typename T, int N, typename AccessorT,
7541  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
7542 __ESIMD_API std::enable_if_t<
7543  detail::is_local_accessor_with_v<AccessorT,
7544  detail::accessor_mode_cap::can_write> &&
7545  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
7546 block_store(AccessorT lacc, uint32_t byte_offset, simd<T, N> vals,
7547  PropertyListT props = {}) {
7548  byte_offset += detail::localAccessorToOffset(lacc);
7549  slm_block_store<T, N>(byte_offset, vals, props);
7550 }
7551 
7567 template <
7568  typename T, int N, typename AccessorT,
7569  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
7570 __ESIMD_API std::enable_if_t<
7571  detail::is_local_accessor_with_v<AccessorT,
7572  detail::accessor_mode_cap::can_write> &&
7573  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
7574 block_store(AccessorT lacc, simd<T, N> vals, PropertyListT props = {}) {
7575  slm_block_store<T, N>(detail::localAccessorToOffset(lacc), vals, props);
7576 }
7577 
7605 template <
7606  typename T, int N, typename AccessorT,
7607  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
7608 __ESIMD_API std::enable_if_t<
7609  detail::is_local_accessor_with_v<AccessorT,
7610  detail::accessor_mode_cap::can_write> &&
7611  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
7612 block_store(AccessorT lacc, uint32_t byte_offset, simd<T, N> vals,
7613  simd_mask<1> pred, PropertyListT props = {}) {
7614  byte_offset += detail::localAccessorToOffset(lacc);
7615  slm_block_store<T, N>(byte_offset, vals, pred, props);
7616 }
7617 
7643 template <
7644  typename T, int N, typename AccessorT,
7645  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
7646 __ESIMD_API std::enable_if_t<
7647  detail::is_local_accessor_with_v<AccessorT,
7648  detail::accessor_mode_cap::can_write> &&
7649  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
7650 block_store(AccessorT lacc, simd<T, N> vals, simd_mask<1> pred,
7651  PropertyListT props = {}) {
7652  slm_block_store<T, N>(detail::localAccessorToOffset(lacc), vals, pred, props);
7653 }
7654 
7673 template <
7674  typename ValuesSimdViewT,
7675  typename T = ValuesSimdViewT::value_type::element_type,
7676  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
7677  typename AccessorT,
7678  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
7679 __ESIMD_API std::enable_if_t<
7680  detail::is_simd_view_type_v<ValuesSimdViewT> &&
7681  detail::is_local_accessor_with_v<AccessorT,
7682  detail::accessor_mode_cap::can_write> &&
7683  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
7684 block_store(AccessorT lacc, uint32_t byte_offset, ValuesSimdViewT vals,
7685  PropertyListT props = {}) {
7686  block_store<T, N>(lacc, byte_offset, vals.read(), props);
7687 }
7688 
7706 template <
7707  typename ValuesSimdViewT,
7708  typename T = ValuesSimdViewT::value_type::element_type,
7709  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
7710  typename AccessorT,
7711  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
7712 __ESIMD_API std::enable_if_t<
7713  detail::is_simd_view_type_v<ValuesSimdViewT> &&
7714  detail::is_local_accessor_with_v<AccessorT,
7715  detail::accessor_mode_cap::can_write> &&
7716  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
7717 block_store(AccessorT lacc, ValuesSimdViewT vals, PropertyListT props = {}) {
7718  block_store<T, N>(lacc, vals.read(), props);
7719 }
7720 
7749 template <
7750  typename ValuesSimdViewT,
7751  typename T = ValuesSimdViewT::value_type::element_type,
7752  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
7753  typename AccessorT,
7754  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
7755 __ESIMD_API std::enable_if_t<
7756  detail::is_simd_view_type_v<ValuesSimdViewT> &&
7757  detail::is_local_accessor_with_v<AccessorT,
7758  detail::accessor_mode_cap::can_write> &&
7759  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
7760 block_store(AccessorT lacc, uint32_t byte_offset, ValuesSimdViewT vals,
7761  simd_mask<1> pred, PropertyListT props = {}) {
7762  block_store<T, N>(lacc, byte_offset, vals.read(), pred, props);
7763 }
7764 
7792 template <
7793  typename ValuesSimdViewT,
7794  typename T = ValuesSimdViewT::value_type::element_type,
7795  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
7796  typename AccessorT,
7797  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
7798 __ESIMD_API std::enable_if_t<
7799  detail::is_simd_view_type_v<ValuesSimdViewT> &&
7800  detail::is_local_accessor_with_v<AccessorT,
7801  detail::accessor_mode_cap::can_write> &&
7802  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
7803 block_store(AccessorT lacc, ValuesSimdViewT vals, simd_mask<1> pred,
7804  PropertyListT props = {}) {
7805  block_store<T, N>(lacc, vals.read(), pred, props);
7806 }
7807 namespace detail {
7808 
7809 // lsc_atomic_update() operations may share atomic_op values for data types
7810 // of the same (fp vs integral) class for convenience (e.g. re-use 'fmax' for
7811 // all FP types). In fact those data types may require using different internal
7812 // opcodes. This function returns the corresponding internal opcode for
7813 // the input type 'T' and operation 'Op'.
7814 template <typename T, __ESIMD_NS::atomic_op Op>
7815 constexpr int lsc_to_internal_atomic_op() {
7816  constexpr __ESIMD_NS::native::lsc::atomic_op LSCOp =
7817  __ESIMD_DNS::to_lsc_atomic_op<Op>();
7818  return static_cast<int>(LSCOp);
7819 }
7820 
7834 
// Atomic update with no source operands (get_num_args<Op>() == 0), issued
// through the __esimd_lsc_xatomic_slm_0 intrinsic.
//
// Template parameters:
//   Op - atomic operation.
//   T  - element type of the returned vector.
//   N  - number of elements.
//   DS - LSC data size; validated against T by check_lsc_data_size.
// Returns a simd<T, N> produced by lsc_format_ret<T> from the intrinsic
// result.
7835 template <atomic_op Op, typename T, int N, lsc_data_size DS>
7836 __ESIMD_API std::enable_if_t<get_num_args<Op>() == 0, simd<T, N>>
// NOTE(review): the listing jumps from line 7836 to 7838, so the line with
// the function name and parameter list is not visible here. The body reads
// two parameters named `offsets` and `pred`.
7838  check_lsc_data_size<T, DS>();
7839  check_atomic<Op, T, N, 0, /*IsLSC*/ true>();
7840  constexpr uint16_t AddressScale = 1;
7841  constexpr int ImmOffset = 0;
7842  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
7843  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
7844  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
// The intrinsic operates on MsgT (lsc_expand_type<T>::type), not on T.
7845  using MsgT = typename lsc_expand_type<T>::type;
7846  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
7847  simd<MsgT, N> Tmp =
7848  __esimd_lsc_xatomic_slm_0<MsgT, IOp, cache_hint::none, cache_hint::none,
7849  AddressScale, ImmOffset, EDS, VS, Transposed,
7850  N>(pred.data(), offsets.data());
// Convert the MsgT result back to the requested element type T.
7851  return lsc_format_ret<T>(Tmp);
7852 }
7853 
// Atomic update with one source operand (get_num_args<Op>() == 1), issued
// through the __esimd_lsc_xatomic_slm_1 intrinsic.
//
// Template parameters:
//   Op - atomic operation.
//   T  - element type of the operand and of the returned vector.
//   N  - number of elements.
//   DS - LSC data size; validated against T by check_lsc_data_size.
// Returns a simd<T, N> with the value produced by the intrinsic.
7868 template <atomic_op Op, typename T, int N, lsc_data_size DS>
7869 __ESIMD_API std::enable_if_t<get_num_args<Op>() == 1, simd<T, N>>
// NOTE(review): the listing jumps from line 7869 to 7871, so the line with
// the function name and the leading parameters is not visible here. The body
// reads parameters named `offsets`, `src0` and `pred`.
7871  simd_mask<N> pred) {
7872  check_lsc_data_size<T, DS>();
7873  check_atomic<Op, T, N, 1, /*IsLSC*/ true>();
7874  constexpr uint16_t AddressScale = 1;
7875  constexpr int ImmOffset = 0;
7876  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
7877  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
7878  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
7879  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
// float and double are passed to the intrinsic as-is; every other type goes
// through the MsgT expansion in the else branch.
7880  if constexpr (std::is_same_v<T, double> || std::is_same_v<T, float>) {
7881  return __esimd_lsc_xatomic_slm_1<T, IOp, cache_hint::none, cache_hint::none,
7882  AddressScale, ImmOffset, EDS, VS,
7883  Transposed, N>(pred.data(), offsets.data(),
7884  src0.data());
7885  } else {
7886  using MsgT = typename lsc_expand_type<T>::type;
// Convert the operand to MsgT before the call and the result back to T after.
7887  simd<MsgT, N> Msg_data = lsc_format_input<MsgT>(src0);
7888  simd<MsgT, N> Tmp =
7889  __esimd_lsc_xatomic_slm_1<MsgT, IOp, cache_hint::none, cache_hint::none,
7890  AddressScale, ImmOffset, EDS, VS, Transposed,
7891  N>(pred.data(), offsets.data(),
7892  Msg_data.data());
7893  return lsc_format_ret<T>(Tmp);
7894  }
7895 }
7896 
// Atomic update with two source operands (check_atomic is invoked with an
// operand count of 2), issued through the __esimd_lsc_xatomic_slm_2
// intrinsic.
//
// Template parameters:
//   Op - atomic operation.
//   T  - element type of the operands.
//   N  - number of elements.
//   DS - LSC data size; validated against T by check_lsc_data_size.
// NOTE(review): the listing jumps from line 7912 to 7915, so the return type
// and the line with the function name and leading parameters are not visible
// here. The body reads parameters named `offsets`, `src0`, `src1` and `pred`.
7912 template <atomic_op Op, typename T, int N, lsc_data_size DS>
7915  simd_mask<N> pred) {
7916  check_lsc_data_size<T, DS>();
7917  check_atomic<Op, T, N, 2, /*IsLSC*/ true>();
7918  constexpr uint16_t AddressScale = 1;
7919  constexpr int ImmOffset = 0;
7920  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
7921  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
7922  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
7923  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
// float and double are passed to the intrinsic as-is; every other type goes
// through the MsgT expansion in the else branch.
7924  if constexpr (std::is_same_v<T, double> || std::is_same_v<T, float>) {
7925  return __esimd_lsc_xatomic_slm_2<T, IOp, cache_hint::none, cache_hint::none,
7926  AddressScale, ImmOffset, EDS, VS,
7927  Transposed, N>(pred.data(), offsets.data(),
7928  src0.data(), src1.data());
7929  } else {
7930  using MsgT = typename lsc_expand_type<T>::type;
// Convert both operands to MsgT before the call and the result back to T.
7931  simd<MsgT, N> Msg_data0 = lsc_format_input<MsgT>(src0);
7932  simd<MsgT, N> Msg_data1 = lsc_format_input<MsgT>(src1);
7933  simd<MsgT, N> Tmp =
7934  __esimd_lsc_xatomic_slm_2<MsgT, IOp, cache_hint::none, cache_hint::none,
7935  AddressScale, ImmOffset, EDS, VS, Transposed,
7936  N>(pred.data(), offsets.data(),
7937  Msg_data0.data(), Msg_data1.data());
7938  return lsc_format_ret<T>(Tmp);
7939  }
7940 }
7941 
7942 } // namespace detail
7943 
7947 
7951 
7956 
7961 
7963 
// Atomic update with no source operands (get_num_args<Op>() == 0).
//
// Dispatches at compile time between three implementations:
//   1. slm_atomic_update_impl, when sizeof(T) is 2 or 8 or
//      __ESIMD_DNS::isPowerOf2(N, 32) is false;
//   2. for atomic_op::load, a bit_or atomic update with an all-zero operand
//      (non-integral T is handled through an unsigned integer type of the
//      same size and viewed back as T);
//   3. otherwise the __esimd_dword_atomic0 intrinsic.
// Returns a simd<T, N>.
7981 template <atomic_op Op, typename T, int N>
7982 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 0, simd<T, N>>
// NOTE(review): the listing jumps from line 7982 to 7984, so the line with
// the function name and parameter list is not visible here. The body reads
// parameters named `byte_offset` and `mask`.
7984  // 2 byte, 8 byte types, non-power of two, and operations wider than
7985  // 32 are supported only by LSC.
7986  if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
7987  !__ESIMD_DNS::isPowerOf2(N, 32)) {
// NOTE(review): listing line 7989 (the remaining template arguments of this
// call) is absent from the listing.
7988  return slm_atomic_update_impl<Op, T, N,
7990  byte_offset, mask);
7991  } else if constexpr (Op == atomic_op::load) {
// OR-ing with zero leaves the stored value unchanged, so the bit_or update
// with a zero operand is used here to implement the load.
7992  if constexpr (std::is_integral_v<T>) {
7993  return slm_atomic_update<atomic_op::bit_or, T, N>(byte_offset,
7994  simd<T, N>(0), mask);
7995  } else {
7996  using Tint = detail::uint_type_t<sizeof(T)>;
7997  simd<Tint, N> Res = slm_atomic_update<atomic_op::bit_or, Tint, N>(
7998  byte_offset, simd<Tint, N>(0), mask);
7999  return Res.template bit_cast_view<T>();
8000  }
8001  } else {
8002  detail::check_atomic<Op, T, N, 0>();
// NOTE(review): listing line 8003 is absent; `si` used below is presumably
// defined on that line - confirm against the original file.
8004  return __esimd_dword_atomic0<Op, T, N>(mask.data(), si, byte_offset.data());
8005  }
8006 }
8007 
8016 template <atomic_op Op, typename T, int N, typename AccessorT>
8017 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 0 &&
8018  __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
8019  simd<T, N>>
8020 atomic_update(AccessorT lacc, simd<uint32_t, N> byte_offset,
8021  simd_mask<N> mask = 1) {
8022  byte_offset += detail::localAccessorToOffset(lacc);
8023  return slm_atomic_update<Op, T, N>(byte_offset, mask);
8024 }
8025 
8027 
8033 
8040 
8042 
// Atomic update with one source operand (get_num_args<Op>() == 1).
//
// Dispatches at compile time between three implementations:
//   1. slm_atomic_update_impl, for fmin/fmax/fadd/fsub, for element types
//      that are not 4 bytes wide, or when __ESIMD_DNS::isPowerOf2(N, 32) is
//      false;
//   2. for atomic_op::store, an xchg atomic update (non-integral T is handled
//      through an unsigned integer type of the same size);
//   3. otherwise the __esimd_dword_atomic1 intrinsic.
// Returns a simd<T, N>.
8060 template <atomic_op Op, typename T, int N>
8061 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1, simd<T, N>>
// NOTE(review): the listing jumps from line 8061 to 8063, so the line with
// the function name and the leading parameters is not visible here. The body
// reads parameters named `byte_offset`, `src0` and `mask`.
8063  simd_mask<N> mask = 1) {
8064  // Non-LSC atomic_update supports only 4-byte int vector operations with
8065  // 1,2,4,8,16,32 vector length. Non-LSC supports only 'store' for FP types.
8066  if constexpr (Op == atomic_op::fmin || Op == atomic_op::fmax ||
8067  Op == atomic_op::fadd || Op == atomic_op::fsub ||
8068  sizeof(T) != 4 || !__ESIMD_DNS::isPowerOf2(N, 32)) {
// NOTE(review): listing line 8070 (the remaining template arguments of this
// call) is absent from the listing.
8069  return slm_atomic_update_impl<Op, T, N,
8071  byte_offset, src0, mask);
8072  } else if constexpr (Op == atomic_op::store) {
// The store is expressed as an exchange; the value returned by the xchg
// update is returned to the caller.
8073  if constexpr (std::is_integral_v<T>) {
8074  return slm_atomic_update<atomic_op::xchg, T, N>(byte_offset, src0, mask);
8075  } else {
8076  using Tint = detail::uint_type_t<sizeof(T)>;
8077  simd<Tint, N> Res = slm_atomic_update<atomic_op::xchg, Tint, N>(
8078  byte_offset, src0.template bit_cast_view<Tint>(), mask);
8079  return Res.template bit_cast_view<T>();
8080  }
8081  } else {
8082  detail::check_atomic<Op, T, N, 1>();
// NOTE(review): listing line 8083 is absent; `si` used below is presumably
// defined on that line - confirm against the original file.
8084  return __esimd_dword_atomic1<Op, T, N>(mask.data(), si, byte_offset.data(),
8085  src0.data());
8086  }
8087 }
8088 
8106 template <atomic_op Op, typename SrcSimdViewT,
8107  typename T = SrcSimdViewT::value_type::element_type, int N>
8108 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1 &&
8109  detail::is_simd_view_type_v<SrcSimdViewT>,
8110  simd<T, N>>
8111 slm_atomic_update(simd<uint32_t, N> byte_offset, SrcSimdViewT src0,
8112  simd_mask<N> mask = 1) {
8113  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
8114  "Size of src0 parameter must correspond to the size of "
8115  "byte_offset parameter.");
8116  return slm_atomic_update<Op, T, N>(byte_offset, src0.read(), mask);
8117 }
8118 
8136 template <atomic_op Op, typename OffsetSimdViewT, typename T, int N>
8137 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1 &&
8138  detail::is_simd_view_type_v<OffsetSimdViewT>,
8139  simd<T, N>>
8140 slm_atomic_update(OffsetSimdViewT byte_offset, simd<T, N> src0,
8141  simd_mask<N> mask = 1) {
8142  static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
8143  "Size of src0 parameter must correspond to the size of "
8144  "byte_offset parameter.");
8145  return slm_atomic_update<Op, T, N>(byte_offset.read(), src0, mask);
8146 }
8147 
8165 template <atomic_op Op, typename OffsetSimdViewT, typename SrcSimdViewT,
8166  typename T = SrcSimdViewT::value_type::element_type,
8167  int N = SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY()>
8168 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1 &&
8169  detail::is_simd_view_type_v<OffsetSimdViewT> &&
8170  detail::is_simd_view_type_v<SrcSimdViewT>,
8171  simd<T, N>>
8172 slm_atomic_update(OffsetSimdViewT byte_offset, SrcSimdViewT src0,
8173  simd_mask<N> mask = 1) {
8174  static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
8175  "Size of src0 parameter must correspond to the size of "
8176  "byte_offset parameter.");
8177  return slm_atomic_update<Op, T, N>(byte_offset.read(), src0.read(), mask);
8178 }
8179 
8198 template <atomic_op Op, typename T, int N, typename AccessorT>
8199 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1 &&
8200  __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
8201  simd<T, N>>
8202 atomic_update(AccessorT lacc, simd<uint32_t, N> byte_offset, simd<T, N> src0,
8203  simd_mask<N> mask = 1) {
8204  byte_offset += detail::localAccessorToOffset(lacc);
8205  return slm_atomic_update<Op, T, N>(byte_offset, src0, mask);
8206 }
8207 
8226 template <atomic_op Op, typename OffsetSimdViewT, typename T, int N,
8227  typename AccessorT>
8228 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1 &&
8229  detail::is_simd_view_type_v<OffsetSimdViewT> &&
8230  __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
8231  simd<T, N>>
8232 atomic_update(AccessorT lacc, OffsetSimdViewT byte_offset, simd<T, N> src0,
8233  simd_mask<N> mask = 1) {
8234  static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
8235  "Size of src0 parameter must correspond to the size of "
8236  "byte_offset parameter.");
8237  return atomic_update<Op, T, N>(lacc, byte_offset.read(), src0, mask);
8238 }
8239 
8258 template <atomic_op Op, typename SrcSimdViewT,
8259  typename T = SrcSimdViewT::value_type::element_type, int N,
8260  typename AccessorT>
8261 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1 &&
8262  detail::is_simd_view_type_v<SrcSimdViewT> &&
8263  __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
8264  simd<T, N>>
8265 atomic_update(AccessorT lacc, simd<uint32_t, N> byte_offset, SrcSimdViewT src0,
8266  simd_mask<N> mask = 1) {
8267  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
8268  "Size of src0 parameter must correspond to the size of "
8269  "byte_offset parameter.");
8270  return atomic_update<Op, T, N>(lacc, byte_offset, src0.read(), mask);
8271 }
8272 
8291 template <atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT,
8292  typename T = SrcSimdViewT::value_type::element_type,
8293  int N = SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
8294  typename AccessorT>
8295 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1 &&
8296  detail::is_simd_view_type_v<SrcSimdViewT> &&
8297  detail::is_simd_view_type_v<OffsetSimdViewT> &&
8298  __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
8299  simd<T, N>>
8300 atomic_update(AccessorT lacc, OffsetSimdViewT byte_offset, SrcSimdViewT src0,
8301  simd_mask<N> mask = 1) {
8302  static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
8303  "Size of src0 parameter must correspond to the size of "
8304  "byte_offset parameter.");
8305  return atomic_update<Op, T, N>(lacc, byte_offset.read(), src0.read(), mask);
8306 }
8308 
8313 
8321 
// Atomic update with two source operands (get_num_args<Op>() == 2).
//
// Dispatches at compile time between two implementations:
//   1. detail::slm_atomic_update_impl, when T is not 4 bytes wide, Op is
//      atomic_op::fcmpxchg, or __ESIMD_DNS::isPowerOf2(N, 32) is false; the
//      operands are passed in swapped order (src1 first, then src0);
//   2. otherwise the __esimd_dword_atomic2 intrinsic, with src0 then src1.
// Returns a simd<T, N>.
8339 template <atomic_op Op, typename T, int N>
8340 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2, simd<T, N>>
// NOTE(review): the listing jumps from line 8340 to 8342, so the line with
// the function name and the leading parameters is not visible here. The body
// reads parameters named `byte_offset`, `src0`, `src1` and `mask`.
8342  simd<T, N> src1, simd_mask<N> mask = 1) {
8343  // Non-LSC atomic_update supports only 4-byte int vector operations with
8344  // 1,2,4,8,16,32 vector length.
8345  if constexpr (sizeof(T) != 4 || Op == atomic_op::fcmpxchg ||
8346  !__ESIMD_DNS::isPowerOf2(N, 32)) {
8347  // 2-argument lsc_atomic_update arguments order matches the standard one -
8348  // expected value first, then new value. But atomic_update uses reverse
8349  // order, hence the src1/src0 swap.
// NOTE(review): listing line 8351 (the remaining template arguments of this
// call) is absent from the listing.
8350  return detail::slm_atomic_update_impl<Op, T, N,
8352  byte_offset, src1, src0, mask);
8353  } else {
8354  detail::check_atomic<Op, T, N, 2>();
// NOTE(review): listing line 8355 is absent; `si` used below is presumably
// defined on that line - confirm against the original file.
8356  return __esimd_dword_atomic2<Op, T, N>(mask.data(), si, byte_offset.data(),
8357  src0.data(), src1.data());
8358  }
8359 }
8360 
8378 template <atomic_op Op, typename SrcSimdViewT, typename T, int N>
8379 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
8380  detail::is_simd_view_type_v<SrcSimdViewT>,
8381  simd<T, N>>
8382 slm_atomic_update(simd<uint32_t, N> byte_offset, SrcSimdViewT src0,
8383  simd<T, N> src1, simd_mask<N> mask = 1) {
8384  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
8385  "Size of src0 parameter must correspond to the size of "
8386  "byte_offset and src1 parameters.");
8387  return slm_atomic_update<Op, T, N>(byte_offset, src0.read(), src1, mask);
8388 }
8389 
// Two-operand atomic update whose second operand (src1) is supplied as a
// simd_view.
//
// Checks at compile time that the view holds N elements, reads it and
// forwards to slm_atomic_update<Op, T, N>(byte_offset, src0, values, mask).
// Returns a simd<T, N>.
8407 template <atomic_op Op, typename SrcSimdViewT, typename T, int N>
8408 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
8409  detail::is_simd_view_type_v<SrcSimdViewT>,
8410  simd<T, N>>
// NOTE(review): the listing jumps from line 8410 to 8412, so the line with
// the function name and the `byte_offset` / `src0` parameters is not visible
// here; their types cannot be confirmed from this listing.
8412  SrcSimdViewT src1, simd_mask<N> mask = 1) {
8413  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
8414  "Size of src1 parameter must correspond to the size of "
8415  "byte_offset and src0 parameters.");
8416  return slm_atomic_update<Op, T, N>(byte_offset, src0, src1.read(), mask);
8417 }
8418 
8436 template <atomic_op Op, typename SrcSimdViewT,
8437  typename T = SrcSimdViewT::value_type::element_type, int N>
8438 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
8439  detail::is_simd_view_type_v<SrcSimdViewT>,
8440  simd<T, N>>
8441 slm_atomic_update(simd<uint32_t, N> byte_offset, SrcSimdViewT src0,
8442  SrcSimdViewT src1, simd_mask<N> mask = 1) {
8443  static_assert(
8444  N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
8445  "Size of src1 and src0 parameters must correspond to the size of "
8446  "byte_offset parameter.");
8447  return slm_atomic_update<Op, T, N>(byte_offset, src0.read(), src1.read(),
8448  mask);
8449 }
8450 
8468 template <atomic_op Op, typename OffsetSimdViewT, typename T, int N>
8469 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
8470  detail::is_simd_view_type_v<OffsetSimdViewT>,
8471  simd<T, N>>
8472 slm_atomic_update(OffsetSimdViewT byte_offset, simd<T, N> src0, simd<T, N> src1,
8473  simd_mask<N> mask = 1) {
8474  static_assert(
8475  N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
8476  "Size of src1 and src0 parameters must correspond to the size of "
8477  "byte_offset parameter.");
8478  return slm_atomic_update<Op, T, N>(byte_offset.read(), src0, src1, mask);
8479 }
8480 
8498 template <atomic_op Op, typename OffsetSimdViewT, typename SrcSimdViewT,
8499  typename T, int N>
8500 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
8501  detail::is_simd_view_type_v<SrcSimdViewT> &&
8502  detail::is_simd_view_type_v<OffsetSimdViewT>,
8503  simd<T, N>>
8504 slm_atomic_update(OffsetSimdViewT byte_offset, SrcSimdViewT src0,
8505  simd<T, N> src1, simd_mask<N> mask = 1) {
8506  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() &&
8507  N == OffsetSimdViewT::getSizeX() *
8508  OffsetSimdViewT::getSizeY(),
8509  "Size of src0 parameter must correspond to the size of "
8510  "byte_offset and src1 parameters.");
8511  return slm_atomic_update<Op, T, N>(byte_offset.read(), src0.read(), src1,
8512  mask);
8513 }
8514 
8532 template <atomic_op Op, typename OffsetSimdViewT, typename SrcSimdViewT,
8533  typename T, int N>
8534 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
8535  detail::is_simd_view_type_v<SrcSimdViewT> &&
8536  detail::is_simd_view_type_v<OffsetSimdViewT>,
8537  simd<T, N>>
8538 slm_atomic_update(OffsetSimdViewT byte_offset, simd<T, N> src0,
8539  SrcSimdViewT src1, simd_mask<N> mask = 1) {
8540  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() &&
8541  N == OffsetSimdViewT::getSizeX() *
8542  OffsetSimdViewT::getSizeY(),
8543  "Size of src1 parameter must correspond to the size of "
8544  "byte_offset and src0 parameters.");
8545  return slm_atomic_update<Op, T, N>(byte_offset.read(), src0, src1.read(),
8546  mask);
8547 }
8548 
8566 template <atomic_op Op, typename OffsetSimdViewT, typename SrcSimdViewT,
8567  typename T = SrcSimdViewT::value_type::element_type,
8568  int N = SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY()>
8569 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
8570  detail::is_simd_view_type_v<SrcSimdViewT> &&
8571  detail::is_simd_view_type_v<OffsetSimdViewT>,
8572  simd<T, N>>
8573 slm_atomic_update(OffsetSimdViewT byte_offset, SrcSimdViewT src0,
8574  SrcSimdViewT src1, simd_mask<N> mask = 1) {
8575  static_assert(
8576  N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
8577  "Size of src1 and src0 parameters must correspond to the size of "
8578  "byte_offset parameter.");
8579  return slm_atomic_update<Op, T, N>(byte_offset.read(), src0, src1, mask);
8580 }
8581 
8588 template <atomic_op Op, typename T, int N, typename AccessorT>
8589 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
8590  __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
8591  simd<T, N>>
8592 atomic_update(AccessorT lacc, simd<uint32_t, N> byte_offset, simd<T, N> src0,
8593  simd<T, N> src1, simd_mask<N> mask = 1) {
8594  byte_offset += detail::localAccessorToOffset(lacc);
8595  return slm_atomic_update<Op, T, N>(byte_offset, src0, src1, mask);
8596 }
8597 
8606 template <atomic_op Op, typename SrcSimdViewT, typename T, int N,
8607  typename AccessorT>
8608 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
8609  detail::is_simd_view_type_v<SrcSimdViewT> &&
8610  __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
8611  simd<T, N>>
8612 atomic_update(AccessorT lacc, simd<uint32_t, N> byte_offset, SrcSimdViewT src0,
8613  simd<T, N> src1, simd_mask<N> mask = 1) {
8614  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
8615  "Size of src0 parameter must correspond to the size of "
8616  "byte_offset and src1 parameters.");
8617  return atomic_update<Op, T, N>(lacc, byte_offset, src0.read(), src1, mask);
8618 }
8619 
8628 template <atomic_op Op, typename SrcSimdViewT, typename T, int N,
8629  typename AccessorT>
8630 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
8631  detail::is_simd_view_type_v<SrcSimdViewT> &&
8632  __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
8633  simd<T, N>>
8634 atomic_update(AccessorT lacc, simd<uint32_t, N> byte_offset, simd<T, N> src0,
8635  SrcSimdViewT src1, simd_mask<N> mask = 1) {
8636  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
8637  "Size of src1 parameter must correspond to the size of "
8638  "byte_offset and src0 parameters.");
8639  return atomic_update<Op, T, N>(lacc, byte_offset, src0, src1.read(), mask);
8640 }
8641 
8650 template <atomic_op Op, typename SrcSimdViewT,
8651  typename T = SrcSimdViewT::value_type::element_type, int N,
8652  typename AccessorT>
8653 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
8654  detail::is_simd_view_type_v<SrcSimdViewT> &&
8655  __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
8656  simd<T, N>>
8657 atomic_update(AccessorT lacc, simd<uint32_t, N> byte_offset, SrcSimdViewT src0,
8658  SrcSimdViewT src1, simd_mask<N> mask = 1) {
8659  static_assert(
8660  N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
8661  "Size of src1 and src0 parameters must correspond to the size of "
8662  "byte_offset parameter.");
8663  return atomic_update<Op, T, N>(lacc, byte_offset, src0.read(), src1.read(),
8664  mask);
8665 }
8666 
8675 template <atomic_op Op, typename OffsetSimdViewT, typename T, int N,
8676  typename AccessorT>
8677 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
8678  detail::is_simd_view_type_v<OffsetSimdViewT> &&
8679  __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
8680  simd<T, N>>
8681 atomic_update(AccessorT lacc, OffsetSimdViewT byte_offset, simd<T, N> src0,
8682  simd<T, N> src1, simd_mask<N> mask = 1) {
8683  static_assert(
8684  N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
8685  "Size of src1 and src0 parameters must correspond to the size of "
8686  "byte_offset parameter.");
8687  return atomic_update<Op, T, N>(lacc, byte_offset.read(), src0, src1, mask);
8688 }
8689 
8698 template <atomic_op Op, typename OffsetSimdViewT, typename SrcSimdViewT,
8699  typename T, int N, typename AccessorT>
8700 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
8701  detail::is_simd_view_type_v<SrcSimdViewT> &&
8702  detail::is_simd_view_type_v<OffsetSimdViewT> &&
8703  __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
8704  simd<T, N>>
8705 atomic_update(AccessorT lacc, OffsetSimdViewT byte_offset, SrcSimdViewT src0,
8706  simd<T, N> src1, simd_mask<N> mask = 1) {
8707  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
8708  "Size of src0 parameter must correspond to the size of "
8709  "byte_offset and src1 parameters.");
8710  return atomic_update<Op, T, N>(lacc, byte_offset.read(), src0.read(), src1,
8711  mask);
8712 }
8713 
8722 template <atomic_op Op, typename OffsetSimdViewT, typename SrcSimdViewT,
8723  typename T, int N, typename AccessorT>
8724 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
8725  detail::is_simd_view_type_v<SrcSimdViewT> &&
8726  detail::is_simd_view_type_v<OffsetSimdViewT> &&
8727  __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
8728  simd<T, N>>
8729 atomic_update(AccessorT lacc, OffsetSimdViewT byte_offset, simd<T, N> src0,
8730  SrcSimdViewT src1, simd_mask<N> mask = 1) {
8731  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() &&
8732  N == OffsetSimdViewT::getSizeX() *
8733  OffsetSimdViewT::getSizeY(),
8734  "Size of src1 parameter must correspond to the size of "
8735  "byte_offset and src0 parameters.");
8736  return atomic_update<Op, T, N>(lacc, byte_offset.read(), src0, src1.read(),
8737  mask);
8738 }
8739 
8748 template <atomic_op Op, typename OffsetSimdViewT, typename SrcSimdViewT,
8749  typename T = SrcSimdViewT::value_type::element_type,
8750  int N = SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
8751  typename AccessorT>
8752 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
8753  detail::is_simd_view_type_v<SrcSimdViewT> &&
8754  detail::is_simd_view_type_v<OffsetSimdViewT> &&
8755  __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
8756  simd<T, N>>
8757 atomic_update(AccessorT lacc, OffsetSimdViewT byte_offset, SrcSimdViewT src0,
8758  SrcSimdViewT src1, simd_mask<N> mask = 1) {
8759  static_assert(
8760  N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
8761  "Size of src1 and src0 parameters must correspond to the size of "
8762  "byte_offset parameter.");
8763  return atomic_update<Op, T, N>(lacc, byte_offset.read(), src0.read(),
8764  src1.read(), mask);
8765 }
8766 
8768 
8769 namespace detail {
8770 
8784 template <atomic_op Op, typename T, int N, lsc_data_size DS,
8785  typename PropertyListT, typename Toffset>
8786 __ESIMD_API std::enable_if_t<get_num_args<Op>() == 0, simd<T, N>>
8788  static_assert(sizeof(T) > 1, "Unsupported data type");
8789  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
8790  check_atomic<Op, T, N, 0, /*IsLSC*/ true>();
8791  check_lsc_data_size<T, DS>();
8792  check_cache_hints<cache_action::atomic, PropertyListT>();
8793  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
8794  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
8795  constexpr uint16_t AddressScale = 1;
8796  constexpr int ImmOffset = 0;
8797  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
8798  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
8799  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
8800  using MsgT = typename lsc_expand_type<T>::type;
8801  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
8802  simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
8803  addrs += convert<uintptr_t>(offsets);
8804  simd<MsgT, N> Tmp =
8805  __esimd_lsc_xatomic_stateless_0<MsgT, IOp, L1H, L2H, AddressScale,
8806  ImmOffset, EDS, VS, Transposed, N>(
8807  pred.data(), addrs.data());
8808  return lsc_format_ret<T>(Tmp);
8809 }
8810 
8825 template <atomic_op Op, typename T, int N, lsc_data_size DS,
8826  typename PropertyListT, typename Toffset>
8827 __ESIMD_API std::enable_if_t<get_num_args<Op>() == 1, simd<T, N>>
8829  simd_mask<N> pred) {
8830  static_assert(sizeof(T) > 1, "Unsupported data type");
8831  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
8832  check_lsc_data_size<T, DS>();
8833  check_atomic<Op, T, N, 1, /*IsLSC*/ true>();
8834  check_cache_hints<cache_action::atomic, PropertyListT>();
8835  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
8836  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
8837  constexpr uint16_t AddressScale = 1;
8838  constexpr int ImmOffset = 0;
8839  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
8840  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
8841  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
8842  using MsgT = typename lsc_expand_type<T>::type;
8843  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
8844  simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
8845  addrs += convert<uintptr_t>(offsets);
8846  if constexpr (std::is_same_v<T, double> || std::is_same_v<T, float>) {
8847  return __esimd_lsc_xatomic_stateless_1<T, IOp, L1H, L2H, AddressScale,
8848  ImmOffset, EDS, VS, Transposed, N>(
8849  pred.data(), addrs.data(), src0.data());
8850  } else {
8851  simd<MsgT, N> Msg_data = lsc_format_input<MsgT>(src0);
8852  simd<MsgT, N> Tmp =
8853  __esimd_lsc_xatomic_stateless_1<MsgT, IOp, L1H, L2H, AddressScale,
8854  ImmOffset, EDS, VS, Transposed, N>(
8855  pred.data(), addrs.data(), Msg_data.data());
8856  return lsc_format_ret<T>(Tmp);
8857  }
8858 }
8859 
8875 template <atomic_op Op, typename T, int N, lsc_data_size DS,
8876  typename PropertyListT, typename Toffset>
8877 __ESIMD_API std::enable_if_t<get_num_args<Op>() == 2, simd<T, N>>
8879  simd<T, N> src1, simd_mask<N> pred) {
8880  static_assert(sizeof(T) > 1, "Unsupported data type");
8881  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
8882  check_lsc_data_size<T, DS>();
8883  check_atomic<Op, T, N, 2, /*IsLSC*/ true>();
8884  check_cache_hints<cache_action::atomic, PropertyListT>();
8885  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
8886  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
8887  constexpr uint16_t AddressScale = 1;
8888  constexpr int ImmOffset = 0;
8889  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
8890  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
8891  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
8892  using MsgT = typename lsc_expand_type<T>::type;
8893  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
8894  simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
8895  addrs += convert<uintptr_t>(offsets);
8896  if constexpr (std::is_same_v<T, double> || std::is_same_v<T, float>) {
8897  return __esimd_lsc_xatomic_stateless_2<T, IOp, L1H, L2H, AddressScale,
8898  ImmOffset, EDS, VS, Transposed, N>(
8899  pred.data(), addrs.data(), src0.data(), src1.data());
8900  } else {
8901  simd<MsgT, N> Msg_data0 = lsc_format_input<MsgT>(src0);
8902  simd<MsgT, N> Msg_data1 = lsc_format_input<MsgT>(src1);
8903 
8904  simd<MsgT, N> Tmp =
8905  __esimd_lsc_xatomic_stateless_2<MsgT, IOp, L1H, L2H, AddressScale,
8906  ImmOffset, EDS, VS, Transposed, N>(
8907  pred.data(), addrs.data(), Msg_data0.data(), Msg_data1.data());
8908  return lsc_format_ret<T>(Tmp);
8909  }
8910 }
8911 
8927 template <atomic_op Op, typename T, int N,
8929  typename PropertyListT, typename AccessorTy, typename Toffset>
8930 __ESIMD_API
8931  std::enable_if_t<get_num_args<Op>() == 0 &&
8932  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy>,
8933  simd<T, N>>
8934  atomic_update_impl(AccessorTy acc, simd<Toffset, N> byte_offsets,
8935  simd_mask<N> pred) {
8936 #ifdef __ESIMD_FORCE_STATELESS_MEM
8937  return atomic_update_impl<Op, T, N, DS, PropertyListT>(
8938  accessorToPointer<T>(acc), byte_offsets, pred);
8939 #else
8940  static_assert(sizeof(T) > 1, "Unsupported data type");
8941  static_assert(std::is_integral_v<Toffset> && sizeof(Toffset) == 4,
8942  "Unsupported offset type");
8943  check_lsc_data_size<T, DS>();
8944  check_atomic<Op, T, N, 0, /*IsLSC*/ true>();
8945  check_cache_hints<cache_action::atomic, PropertyListT>();
8946  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
8947  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
8948  constexpr uint16_t AddressScale = 1;
8949  constexpr int ImmOffset = 0;
8950  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
8951  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
8952  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
8953  using MsgT = typename lsc_expand_type<T>::type;
8954  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
8955  auto si = get_surface_index(acc);
8956  simd<MsgT, N> Tmp =
8957  __esimd_lsc_xatomic_bti_0<MsgT, IOp, L1H, L2H, AddressScale, ImmOffset,
8958  EDS, VS, Transposed, N>(
8959  pred.data(), byte_offsets.data(), si);
8960  return lsc_format_ret<T>(Tmp);
8961 #endif
8962 }
8963 
8981 template <atomic_op Op, typename T, int N, lsc_data_size DS,
8982  typename PropertyListT, typename AccessorTy, typename Toffset>
8983 __ESIMD_API
8984  std::enable_if_t<get_num_args<Op>() == 1 &&
8985  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy>,
8986  simd<T, N>>
8987  atomic_update_impl(AccessorTy acc, simd<Toffset, N> byte_offset,
8988  simd<T, N> src0, simd_mask<N> pred) {
8989 #ifdef __ESIMD_FORCE_STATELESS_MEM
8990  return atomic_update_impl<Op, T, N, DS, PropertyListT>(
8991  accessorToPointer<T>(acc), byte_offset, src0, pred);
8992 #else
8993  static_assert(sizeof(T) > 1, "Unsupported data type");
8994  static_assert(std::is_integral_v<Toffset> && sizeof(Toffset) == 4,
8995  "Unsupported offset type");
8996  check_lsc_data_size<T, DS>();
8997  check_atomic<Op, T, N, 1, /*IsLSC*/ true>();
8998  check_cache_hints<cache_action::atomic, PropertyListT>();
8999  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
9000  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
9001  constexpr uint16_t AddressScale = 1;
9002  constexpr int ImmOffset = 0;
9003  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
9004  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
9005  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
9006  using MsgT = typename lsc_expand_type<T>::type;
9007  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
9008  auto si = get_surface_index(acc);
9009  if constexpr (std::is_same_v<T, double> || std::is_same_v<T, float>) {
9010  return __esimd_lsc_xatomic_bti_1<T, IOp, L1H, L2H, AddressScale, ImmOffset,
9011  EDS, VS, Transposed, N>(
9012  pred.data(), byte_offset.data(), src0.data(), si);
9013  } else {
9014  simd<MsgT, N> Src0Msg = lsc_format_input<MsgT>(src0);
9015  simd<MsgT, N> Tmp =
9016  __esimd_lsc_xatomic_bti_1<MsgT, IOp, L1H, L2H, AddressScale, ImmOffset,
9017  EDS, VS, Transposed, N>(
9018  pred.data(), byte_offset.data(), Src0Msg.data(), si);
9019  return lsc_format_ret<T>(Tmp);
9020  }
9021 #endif
9022 }
9023 
9042 template <atomic_op Op, typename T, int N, lsc_data_size DS,
9043  typename PropertyListT, typename AccessorTy, typename Toffset>
9044 __ESIMD_API
9045  std::enable_if_t<get_num_args<Op>() == 2 &&
9046  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy>,
9047  simd<T, N>>
9048  atomic_update_impl(AccessorTy acc, simd<Toffset, N> byte_offset,
9050 #ifdef __ESIMD_FORCE_STATELESS_MEM
9051  return atomic_update_impl<Op, T, N, DS, PropertyListT>(
9052  __ESIMD_DNS::accessorToPointer<T>(acc), byte_offset, src0, src1, pred);
9053 #else
9054  static_assert(std::is_integral_v<Toffset> && sizeof(Toffset) == 4,
9055  "Unsupported offset type");
9056  check_lsc_vector_size<1>();
9057  check_lsc_data_size<T, DS>();
9058  check_atomic<Op, T, N, 2, /*IsLSC*/ true>();
9059  check_cache_hints<cache_action::atomic, PropertyListT>();
9060  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
9061  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
9062  constexpr uint16_t AddressScale = 1;
9063  constexpr int ImmOffset = 0;
9064  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
9065  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
9066  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
9067  using MsgT = typename lsc_expand_type<T>::type;
9068  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
9069  auto si = get_surface_index(acc);
9070  if constexpr (std::is_same_v<T, double> || std::is_same_v<T, float>) {
9071  return __esimd_lsc_xatomic_bti_2<T, IOp, L1H, L2H, AddressScale, ImmOffset,
9072  EDS, VS, Transposed, N>(
9073  pred.data(), byte_offset.data(), src0.data(), src1.data(), si);
9074  } else {
9075  simd<MsgT, N> Msg_data0 = lsc_format_input<MsgT>(src0);
9076  simd<MsgT, N> Msg_data1 = lsc_format_input<MsgT>(src1);
9077  simd<MsgT, N> Tmp =
9078  __esimd_lsc_xatomic_bti_2<MsgT, IOp, L1H, L2H, AddressScale, ImmOffset,
9079  EDS, VS, Transposed, N>(
9080  pred.data(), byte_offset.data(), Msg_data0.data(), Msg_data1.data(),
9081  si);
9082  return lsc_format_ret<T>(Tmp);
9083  }
9084 #endif
9085 }
9086 } // namespace detail
9087 
9090 
9128 // Other properties are ignored.
9132 template <
9133  atomic_op Op, typename T, int N, typename Toffset,
9134  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
9135 __ESIMD_API std::enable_if_t<
9136  __ESIMD_DNS::get_num_args<Op>() == 0 &&
9137  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
9138  simd<T, N>>
9140  PropertyListT props = {}) {
9141  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
9142 
9143  if constexpr (detail::has_cache_hints<PropertyListT>() ||
9144  !__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) {
9146  Op, T, N, detail::lsc_data_size::default_size, PropertyListT, Toffset>(
9147  p, byte_offset, mask);
9148  } else if constexpr (N == 16 || N == 32) {
9149  // TODO: In fact GPU BE supports legalization for any N, even for
9150  // non-power-of-2, but it is implemented with an error now. For example,
9151  // N=17 is emulated as 2 calls (N=16 and N=1), while it must be 3 calls:
9152  // (N=8, N=8, N=1). I.e. Gen12 atomic instruction supports only N up to 8
9153  // and GPU thinks now it is up to 16.
9154  // Thus we emulate N=16 with 2 calls with N=8 each.
9155  // N=32 is emulated with 4 calls with N=8 each.
9156  // Task1: Remove the special-case emulation for N=16 and N=32 below when
9157  // GPU driver fixes the error.
9158  // Task2: remove the condition "!__ESIMD_DNS::isPowerOf2(N, 32)" above
9159  // and let svm.atomic for any N.
9160 
9161  simd<T, N> Res;
9162  for (int I = 0; I < N; I += 8) {
9163  simd_mask<8> Mask8 = mask.template select<8, 1>(I);
9164  simd<Toffset, 8> ByteOffset8 = byte_offset.template select<8, 1>(I);
9165  Res.template select<8, 1>(I) =
9166  atomic_update<Op, T, 8>(p, ByteOffset8, Mask8, props);
9167  }
9168  return Res;
9169  } else if constexpr (Op == atomic_op::load) {
9170  if constexpr (std::is_integral_v<T>) {
9171  return atomic_update<atomic_op::bit_or, T, N>(p, byte_offset,
9172  simd<T, N>(0), mask, props);
9173  } else {
9174  using Tint = detail::uint_type_t<sizeof(T)>;
9175  simd<Tint, N> Res = atomic_update<atomic_op::bit_or, Tint, N>(
9176  reinterpret_cast<Tint *>(p), byte_offset, simd<Tint, N>(0), mask,
9177  props);
9178  return Res.template bit_cast_view<T>();
9179  }
9180  } else {
9181  detail::check_atomic<Op, T, N, 0>();
9182  simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
9183  simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(byte_offset);
9184  vAddr += offset_i1;
9185  using Tx = typename detail::__raw_t<T>;
9186  return __esimd_svm_atomic0<Op, Tx, N>(vAddr.data(), mask.data());
9187  }
9188 }
9189 
9208 template <
9209  atomic_op Op, typename T, int N, typename Toffset,
9210  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
9211 __ESIMD_API std::enable_if_t<
9212  __ESIMD_DNS::get_num_args<Op>() == 0 &&
9213  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
9214  simd<T, N>>
9215 atomic_update(T *p, simd<Toffset, N> byte_offset, PropertyListT props = {}) {
9216  simd_mask<N> mask = 1;
9217  return atomic_update<Op, T, N>(p, byte_offset, mask, props);
9218 }
9219 
9240 template <
9241  atomic_op Op, typename T, int N, typename OffsetSimdViewT,
9242  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
9243 __ESIMD_API std::enable_if_t<
9244  __ESIMD_DNS::get_num_args<Op>() == 0 &&
9245  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
9246  detail::is_simd_view_type_v<OffsetSimdViewT>,
9247  simd<T, N>>
9248 atomic_update(T *p, OffsetSimdViewT offsets, simd_mask<N> mask,
9249  PropertyListT props = {}) {
9250  return atomic_update<Op, T, N>(p, offsets.read(), mask, props);
9251 }
9252 
9271 template <
9272  atomic_op Op, typename T, int N, typename OffsetSimdViewT,
9273  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
9274 __ESIMD_API std::enable_if_t<
9275  __ESIMD_DNS::get_num_args<Op>() == 0 &&
9276  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
9277  detail::is_simd_view_type_v<OffsetSimdViewT>,
9278  simd<T, N>>
9279 atomic_update(T *p, OffsetSimdViewT byte_offset, PropertyListT props = {}) {
9280  return atomic_update<Op, T, N>(p, byte_offset.read(), props);
9281 }
9282 
9300 template <
9301  atomic_op Op, typename OffsetSimdViewT, typename T,
9302  int N = OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
9303  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
9304 __ESIMD_API std::enable_if_t<
9305  __ESIMD_DNS::get_num_args<Op>() == 0 &&
9306  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
9307  detail::is_simd_view_type_v<OffsetSimdViewT>,
9308  simd<T, N>>
9309 atomic_update(T *p, OffsetSimdViewT byte_offset, PropertyListT props = {}) {
9310  return atomic_update<Op, T, N>(p, byte_offset.read(), props);
9311 }
9312 
9327 template <atomic_op Op, typename T, int N, typename Toffset>
9328 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>, simd<T, N>>
9329 atomic_update(T *p, Toffset byte_offset, simd_mask<N> mask = 1) {
9330  return atomic_update<Op, T, N>(p, simd<Toffset, N>(byte_offset), mask);
9331 }
9332 
9352 
9381 template <
9382  atomic_op Op, typename T, int N, typename Toffset,
9383  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
9384 __ESIMD_API std::enable_if_t<
9385  __ESIMD_DNS::get_num_args<Op>() == 1 &&
9386  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
9387  simd<T, N>>
9389  simd_mask<N> mask, PropertyListT props = {}) {
9390  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
9391 
9392  // Auto-convert FP atomics to LSC version.
9393  if constexpr (detail::has_cache_hints<PropertyListT>() ||
9394  (Op == atomic_op::fmin) || (Op == atomic_op::fmax) ||
9395  (Op == atomic_op::fadd) || (Op == atomic_op::fsub) ||
9396  !__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) {
9398  Op, T, N, detail::lsc_data_size::default_size, PropertyListT, Toffset>(
9399  p, byte_offset, src0, mask);
9400  } else if constexpr (N == 16 || N == 32) {
9401  // TODO: In fact GPU BE supports legalization for any N, even for
9402  // non-power-of-2, but it is implemented with an error now. For example,
9403  // N=17 is emulated as 2 calls (N=16 and N=1), while it must be 3 calls:
9404  // (N=8, N=8, N=1). I.e. Gen12 atomic instruction supports only N up to 8
9405  // and GPU thinks now it is up to 16.
9406  // Thus we emulate N=16 with 2 calls with N=8 each.
9407  // N=32 is emulated with 4 calls with N=8 each.
9408  // Task1: Remove the special-case emulation for N=16 and N=32 below when
9409  // GPU driver fixes the error.
9410  // Task2: remove the condition "!__ESIMD_DNS::isPowerOf2(N, 32)" above
9411  // and let svm.atomic for any N.
9412  simd<T, N> Res;
9413  for (int I = 0; I < N; I += 8) {
9414  simd_mask<8> Mask8 = mask.template select<8, 1>(I);
9415  simd<Toffset, 8> ByteOffset8 = byte_offset.template select<8, 1>(I);
9416  simd<T, 8> Src08 = src0.template select<8, 1>(I);
9417  Res.template select<8, 1>(I) =
9418  atomic_update<Op, T, 8>(p, ByteOffset8, Src08, Mask8, props);
9419  }
9420  return Res;
9421  } else if constexpr (Op == atomic_op::store) {
9422  if constexpr (std::is_integral_v<T>) {
9423  return atomic_update<atomic_op::xchg, T, N>(p, byte_offset, src0, mask,
9424  props);
9425  } else {
9426  using Tint = detail::uint_type_t<sizeof(T)>;
9427  simd<Tint, N> Res = atomic_update<atomic_op::xchg, Tint, N>(
9428  reinterpret_cast<Tint *>(p), byte_offset,
9429  src0.template bit_cast_view<Tint>(), mask, props);
9430  return Res.template bit_cast_view<T>();
9431  }
9432  } else {
9433  detail::check_atomic<Op, T, N, 1>();
9434  simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
9435  simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(byte_offset);
9436  vAddr += offset_i1;
9437 
9438  using Tx = typename detail::__raw_t<T>;
9439  return __esimd_svm_atomic1<Op, Tx, N>(vAddr.data(), src0.data(),
9440  mask.data());
9441  }
9442 }
9443 
9473 template <
9474  atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset,
9475  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
9476 __ESIMD_API std::enable_if_t<
9477  __ESIMD_DNS::get_num_args<Op>() == 1 &&
9478  detail::is_simd_view_type_v<SrcSimdViewT> &&
9479  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
9480  simd<T, N>>
9481 atomic_update(T *p, simd<Toffset, N> byte_offset, SrcSimdViewT src0,
9482  simd_mask<N> mask, PropertyListT props = {}) {
9483  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
9484  "Size of src0 parameter must correspond to the size of "
9485  "byte_offset parameter.");
9486  return atomic_update<Op, T, N>(p, byte_offset, src0.read(), mask, props);
9487 }
9488 
9492 
9494 
9512 template <
9513  atomic_op Op, typename T, int N, typename Toffset,
9514  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
9515 __ESIMD_API std::enable_if_t<
9516  __ESIMD_DNS::get_num_args<Op>() == 1 &&
9517  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
9518  simd<T, N>>
9520  PropertyListT props = {}) {
9521  simd_mask<N> mask = 1;
9522  return atomic_update<Op, T, N>(p, byte_offset, src0, mask, props);
9523 }
9524 
9528 
9532 
9550 template <
9551  atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset,
9552  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
9553 __ESIMD_API std::enable_if_t<
9554  __ESIMD_DNS::get_num_args<Op>() == 1 &&
9555  detail::is_simd_view_type_v<SrcSimdViewT> &&
9556  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
9557  simd<T, N>>
9558 atomic_update(T *p, simd<Toffset, N> byte_offset, SrcSimdViewT src0,
9559  PropertyListT props = {}) {
9560  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
9561  "Size of src0 parameter must correspond to the size of "
9562  "byte_offset parameter.");
9563  return atomic_update<Op, T, N>(p, byte_offset, src0.read(), props);
9564 }
9565 
9593 template <
9594  atomic_op Op, typename T, int N, typename OffsetSimdViewT,
9595  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
9596 __ESIMD_API std::enable_if_t<
9597  __ESIMD_DNS::get_num_args<Op>() == 1 &&
9598  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
9599  detail::is_simd_view_type_v<OffsetSimdViewT>,
9600  simd<T, N>>
9601 atomic_update(T *p, OffsetSimdViewT offsets, simd<T, N> src0, simd_mask<N> mask,
9602  PropertyListT props = {}) {
9603  return atomic_update<Op, T, N>(p, offsets.read(), src0, mask, props);
9604 }
9605 
9634 template <
9635  atomic_op Op, typename OffsetSimdViewT, typename SrcSimdViewT, typename T,
9636  int N,
9637  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
9638 __ESIMD_API std::enable_if_t<
9639  __ESIMD_DNS::get_num_args<Op>() == 1 &&
9640  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
9641  detail::is_simd_view_type_v<OffsetSimdViewT> &&
9642  detail::is_simd_view_type_v<SrcSimdViewT>,
9643  simd<T, N>>
9644 atomic_update(T *p, OffsetSimdViewT offsets, SrcSimdViewT src0,
9645  simd_mask<N> mask, PropertyListT props = {}) {
9646  static_assert(
9647  N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY() &&
9648  N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
9649  "Size of src0 and offsets parameters must correspond to the size of "
9650  "mask parameter.");
9651  return atomic_update<Op, T, N>(p, offsets.read(), src0.read(), mask, props);
9652 }
9653 
9679 template <
9680  atomic_op Op, typename T, int N, typename OffsetSimdViewT,
9681  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
9682 __ESIMD_API std::enable_if_t<
9683  __ESIMD_DNS::get_num_args<Op>() == 1 &&
9684  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
9685  detail::is_simd_view_type_v<OffsetSimdViewT>,
9686  simd<T, N>>
9687 atomic_update(T *p, OffsetSimdViewT offsets, simd<T, N> src0,
9688  PropertyListT props = {}) {
9689  simd_mask<N> mask = 1;
9690  return atomic_update<Op, T, N>(p, offsets.read(), src0, mask, props);
9691 }
9692 
9717 template <
9718  atomic_op Op, typename OffsetSimdViewT, typename SrcSimdViewT, typename T,
9719  int N = SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
9720  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
9721 __ESIMD_API std::enable_if_t<
9722  __ESIMD_DNS::get_num_args<Op>() == 1 &&
9723  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
9724  detail::is_simd_view_type_v<OffsetSimdViewT> &&
9725  detail::is_simd_view_type_v<SrcSimdViewT>,
9726  simd<T, N>>
9727 atomic_update(T *p, OffsetSimdViewT offsets, SrcSimdViewT src0,
9728  PropertyListT props = {}) {
9729  static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
9730  "Size of src0 parameter must correspond to the size of "
9731  "offsets parameter.");
9732  return atomic_update<Op, T, N>(p, offsets.read(), src0.read(), props);
9733 }
9734 
9753 template <atomic_op Op, typename Tx, int N, typename Toffset>
9754 __ESIMD_API std::enable_if_t<
9755  std::is_integral_v<Toffset> &&
9756  ((Op != atomic_op::store && Op != atomic_op::xchg) || N == 1),
9757  simd<Tx, N>>
9758 atomic_update(Tx *p, Toffset byte_offset, simd<Tx, N> src0, simd_mask<N> mask) {
9759  return atomic_update<Op, Tx, N>(p, simd<Toffset, N>(byte_offset), src0, mask);
9760 }
9761 
9786 
9804 // Other properties are ignored.
9808 template <
9809  atomic_op Op, typename T, int N, typename Toffset,
9810  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
9811 __ESIMD_API std::enable_if_t<
9812  __ESIMD_DNS::get_num_args<Op>() == 2 &&
9813  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
9814  simd<T, N>>
9816  simd<T, N> src1, simd_mask<N> mask, PropertyListT props = {}) {
9817  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
9818 
9819  // Use LSC atomic when cache hints are present, FP atomics is used,
9820  // non-power of two length is used, or operation width greater than 32, or the
9821  // data size is less than 4 bytes.
9822  if constexpr (detail::has_cache_hints<PropertyListT>() ||
9823  Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32) ||
9824  sizeof(T) < 4) {
9825  // 2-argument lsc_atomic_update arguments order matches the standard one -
9826  // expected value first, then new value. But atomic_update uses reverse
9827  // order, hence the src1/src0 swap.
9829  Op, T, N, detail::lsc_data_size::default_size, PropertyListT, Toffset>(
9830  p, byte_offset, src1, src0, mask);
9831  } else if constexpr (N == 16 || N == 32) {
9832  // TODO: In fact GPU BE supports legalization for any N, even for
9833  // non-power-of-2, but it is implemented with an error now. For example,
9834  // N=17 is emulated as 2 calls (N=16 and N=1), while it must be 3 calls:
9835  // (N=8, N=8, N=1). I.e. Gen12 atomic instruction supports only N up to 8
9836  // and GPU thinks now it is up to 16.
9837  // Thus we emulate N=16 with 2 calls with N=8 each.
9838  // N=32 is emulated with 4 calls with N=8 each.
9839  // Task1: Remove the special-case emulation for N=16 and N=32 below when
9840  // GPU driver fixes the error.
9841  // Task2: remove the condition "!__ESIMD_DNS::isPowerOf2(N, 32)" above
9842  // and let svm.atomic for any N.
9843  simd<T, N> Res;
9844  for (int I = 0; I < N; I += 8) {
9845  simd_mask<8> Mask8 = mask.template select<8, 1>(I);
9846  simd<Toffset, 8> ByteOffset8 = byte_offset.template select<8, 1>(I);
9847  simd<T, 8> Src08 = src0.template select<8, 1>(I);
9848  simd<T, 8> Src18 = src1.template select<8, 1>(I);
9849  Res.template select<8, 1>(I) =
9850  atomic_update<Op, T, 8>(p, ByteOffset8, Src08, Src18, Mask8, props);
9851  }
9852  return Res;
9853  } else {
9854  detail::check_atomic<Op, T, N, 2>();
9855  simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
9856  simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(byte_offset);
9857  vAddr += offset_i1;
9858  using Tx = typename detail::__raw_t<T>;
9859  return __esimd_svm_atomic2<Op, Tx, N>(vAddr.data(), src0.data(),
9860  src1.data(), mask.data());
9861  }
9862 }
9863 
9872 
9883 // Other properties are ignored.
9887 template <
9888  atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset,
9889  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
9890 __ESIMD_API std::enable_if_t<
9891  __ESIMD_DNS::get_num_args<Op>() == 2 &&
9892  detail::is_simd_view_type_v<SrcSimdViewT> &&
9893  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
9894  simd<T, N>>
9895 atomic_update(T *p, simd<Toffset, N> byte_offset, SrcSimdViewT src0,
9896  simd<T, N> src1, simd_mask<N> mask, PropertyListT props = {}) {
9897  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
9898  "Size of src0 parameter must correspond to the size of "
9899  "byte_offset parameter.");
9900  return atomic_update<Op, T, N>(p, byte_offset, src0.read(), src1, mask,
9901  props);
9902 }
9903 
9912 
9923 // Other properties are ignored.
9927 template <
9928  atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset,
9929  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
9930 __ESIMD_API std::enable_if_t<
9931  __ESIMD_DNS::get_num_args<Op>() == 2 &&
9932  detail::is_simd_view_type_v<SrcSimdViewT> &&
9933  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
9934  simd<T, N>>
9936  SrcSimdViewT src1, simd_mask<N> mask, PropertyListT props = {}) {
9937  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
9938  "Size of src1 parameter must correspond to the size of "
9939  "byte_offset parameter.");
9940  return atomic_update<Op, T, N>(p, byte_offset, src0, src1.read(), mask,
9941  props);
9942 }
9943 
9952 
9963 // Other properties are ignored.
9967 template <
9968  atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset,
9969  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
9970 __ESIMD_API std::enable_if_t<
9971  __ESIMD_DNS::get_num_args<Op>() == 2 &&
9972  detail::is_simd_view_type_v<SrcSimdViewT> &&
9973  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
9974  simd<T, N>>
9975 atomic_update(T *p, simd<Toffset, N> byte_offset, SrcSimdViewT src0,
9976  SrcSimdViewT src1, simd_mask<N> mask, PropertyListT props = {}) {
9977  static_assert(
9978  N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
9979  "Size of src1 and src0 parameters must correspond to the size of "
9980  "byte_offset parameter.");
9981  return atomic_update<Op, T, N>(p, byte_offset, src0.read(), src1.read(), mask,
9982  props);
9983 }
9984 
9989 //
10000 // Other properties are ignored.
10004 template <
10005  atomic_op Op, typename T, int N, typename Toffset,
10006  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
10007 __ESIMD_API std::enable_if_t<
10008  __ESIMD_DNS::get_num_args<Op>() == 2 &&
10009  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
10010  simd<T, N>>
10012  simd<T, N> src1, PropertyListT props = {}) {
10013  simd_mask<N> mask = 1;
10014  return atomic_update<Op, T, N>(p, byte_offset, src0, src1, mask, props);
10015 }
10016 
10025 
10034 // Other properties are ignored.
10038 template <
10039  atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset,
10040  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
10041 __ESIMD_API std::enable_if_t<
10042  __ESIMD_DNS::get_num_args<Op>() == 2 &&
10043  detail::is_simd_view_type_v<SrcSimdViewT> &&
10044  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
10045  simd<T, N>>
10046 atomic_update(T *p, simd<Toffset, N> byte_offset, SrcSimdViewT src0,
10047  simd<T, N> src1, PropertyListT props = {}) {
10048  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
10049  "Size of src0 parameter must correspond to the size of "
10050  "byte_offset parameter.");
10051  return atomic_update<Op, T, N>(p, byte_offset, src0.read(), src1, props);
10052 }
10053 
10062 
10071 // Other properties are ignored.
10075 template <
10076  atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset,
10077  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
10078 __ESIMD_API std::enable_if_t<
10079  __ESIMD_DNS::get_num_args<Op>() == 2 &&
10080  detail::is_simd_view_type_v<SrcSimdViewT> &&
10081  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
10082  simd<T, N>>
10084  SrcSimdViewT src1, PropertyListT props = {}) {
10085  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
10086  "Size of src1 parameter must correspond to the size of "
10087  "byte_offset parameter.");
10088  return atomic_update<Op, T, N>(p, byte_offset, src0, src1.read(), props);
10089 }
10090 
10099 
10108 // Other properties are ignored.
10112 template <
10113  atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset,
10114  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
10115 __ESIMD_API std::enable_if_t<
10116  __ESIMD_DNS::get_num_args<Op>() == 2 &&
10117  detail::is_simd_view_type_v<SrcSimdViewT> &&
10118  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
10119  simd<T, N>>
10120 atomic_update(T *p, simd<Toffset, N> byte_offset, SrcSimdViewT src0,
10121  SrcSimdViewT src1, PropertyListT props = {}) {
10122  static_assert(
10123  N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
10124  "Size of src1 and src0 parameters must correspond to the size of "
10125  "byte_offset parameter.");
10126  return atomic_update<Op, T, N>(p, byte_offset, src0.read(), src1.read(),
10127  props);
10128 }
10129 
10147 // Other properties are ignored.
10150 template <
10151  atomic_op Op, typename T, int N, typename OffsetSimdViewT,
10152  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
10153 __ESIMD_API std::enable_if_t<
10154  __ESIMD_DNS::get_num_args<Op>() == 2 &&
10155  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
10156  detail::is_simd_view_type_v<OffsetSimdViewT>,
10157  simd<T, N>>
10158 atomic_update(T *p, OffsetSimdViewT byte_offset, simd<T, N> src0,
10159  simd<T, N> src1, simd_mask<N> mask, PropertyListT props = {}) {
10160  return atomic_update<Op, T, N>(p, byte_offset.read(), src0, src1, mask,
10161  props);
10162 }
10163 
10179 // Other properties are ignored.
10182 template <
10183  atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T,
10184  int N,
10185  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
10186 __ESIMD_API std::enable_if_t<
10187  __ESIMD_DNS::get_num_args<Op>() == 2 &&
10188  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
10189  detail::is_simd_view_type_v<OffsetSimdViewT> &&
10190  detail::is_simd_view_type_v<SrcSimdViewT>,
10191  simd<T, N>>
10192 atomic_update(T *p, OffsetSimdViewT byte_offset, SrcSimdViewT src0,
10193  simd<T, N> src1, simd_mask<N> mask, PropertyListT props = {}) {
10194  static_assert(
10195  N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() &&
10196  N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
10197  "Size of src0 and byte_offset parameters must correspond to the size of "
10198  "mask parameter.");
10199  return atomic_update<Op, T, N>(p, byte_offset.read(), src0.read(), src1, mask,
10200  props);
10201 }
10202 
10218 // Other properties are ignored.
10221 template <
10222  atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T,
10223  int N,
10224  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
10225 __ESIMD_API std::enable_if_t<
10226  __ESIMD_DNS::get_num_args<Op>() == 2 &&
10227  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
10228  detail::is_simd_view_type_v<OffsetSimdViewT> &&
10229  detail::is_simd_view_type_v<SrcSimdViewT>,
10230  simd<T, N>>
10231 atomic_update(T *p, OffsetSimdViewT byte_offset, simd<T, N> src0,
10232  SrcSimdViewT src1, simd_mask<N> mask, PropertyListT props = {}) {
10233  static_assert(
10234  N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() &&
10235  N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
10236  "Size of src1 and byte_offset parameters must correspond to the size of "
10237  "mask parameter.");
10238  return atomic_update<Op, T, N>(p, byte_offset.read(), src0, src1.read(), mask,
10239  props);
10240 }
10241 
10257 // Other properties are ignored.
10260 template <
10261  atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T,
10262  int N,
10263  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
10264 __ESIMD_API std::enable_if_t<
10265  __ESIMD_DNS::get_num_args<Op>() == 2 &&
10266  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
10267  detail::is_simd_view_type_v<OffsetSimdViewT> &&
10268  detail::is_simd_view_type_v<SrcSimdViewT>,
10269  simd<T, N>>
10270 atomic_update(T *p, OffsetSimdViewT byte_offset, SrcSimdViewT src0,
10271  SrcSimdViewT src1, simd_mask<N> mask, PropertyListT props = {}) {
10272  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() &&
10273  N == OffsetSimdViewT::getSizeX() *
10274  OffsetSimdViewT::getSizeY(),
10275  "Size of src0, src1 and byte_offset parameters must correspond "
10276  "to the size of "
10277  "mask parameter.");
10278  return atomic_update<Op, T, N>(p, byte_offset.read(), src0.read(),
10279  src1.read(), mask, props);
10280 }
10281 
10297 // Other properties are ignored.
10300 template <
10301  atomic_op Op, typename T, int N, typename OffsetSimdViewT,
10302  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
10303 __ESIMD_API std::enable_if_t<
10304  __ESIMD_DNS::get_num_args<Op>() == 2 &&
10305  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
10306  detail::is_simd_view_type_v<OffsetSimdViewT>,
10307  simd<T, N>>
10308 atomic_update(T *p, OffsetSimdViewT byte_offset, simd<T, N> src0,
10309  simd<T, N> src1, PropertyListT props = {}) {
10310  simd_mask<N> mask = 1;
10311  return atomic_update<Op, T, N>(p, byte_offset.read(), src0, src1, mask,
10312  props);
10313 }
10314 
10328 // Other properties are ignored.
10331 template <
10332  atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T,
10333  int N,
10334  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
10335 __ESIMD_API std::enable_if_t<
10336  __ESIMD_DNS::get_num_args<Op>() == 2 &&
10337  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
10338  detail::is_simd_view_type_v<OffsetSimdViewT> &&
10339  detail::is_simd_view_type_v<SrcSimdViewT>,
10340  simd<T, N>>
10341 atomic_update(T *p, OffsetSimdViewT byte_offset, SrcSimdViewT src0,
10342  simd<T, N> src1, PropertyListT props = {}) {
10343  static_assert(
10344  N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() &&
10345  N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
10346  "Size of src0 and byte_offset parameters must correspond to the size of "
10347  "src1 parameter.");
10348  return atomic_update<Op, T, N>(p, byte_offset.read(), src0.read(), src1,
10349  props);
10350 }
10351 
10365 // Other properties are ignored.
10368 template <
10369  atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T,
10370  int N,
10371  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
10372 __ESIMD_API std::enable_if_t<
10373  __ESIMD_DNS::get_num_args<Op>() == 2 &&
10374  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
10375  detail::is_simd_view_type_v<OffsetSimdViewT> &&
10376  detail::is_simd_view_type_v<SrcSimdViewT>,
10377  simd<T, N>>
10378 atomic_update(T *p, OffsetSimdViewT byte_offset, simd<T, N> src0,
10379  SrcSimdViewT src1, PropertyListT props = {}) {
10380  static_assert(
10381  N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() &&
10382  N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
10383  "Size of src1 and byte_offset parameters must correspond to the size of "
10384  "src0 parameter.");
10385  return atomic_update<Op, T, N>(p, byte_offset.read(), src0, src1.read(),
10386  props);
10387 }
10388 
10402 // Other properties are ignored.
10405 template <
10406  atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T,
10407  int N = SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
10408  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
10409 __ESIMD_API std::enable_if_t<
10410  __ESIMD_DNS::get_num_args<Op>() == 2 &&
10411  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
10412  detail::is_simd_view_type_v<OffsetSimdViewT> &&
10413  detail::is_simd_view_type_v<SrcSimdViewT>,
10414  simd<T, N>>
10415 atomic_update(T *p, OffsetSimdViewT byte_offset, SrcSimdViewT src0,
10416  SrcSimdViewT src1, PropertyListT props = {}) {
10417  static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
10418  "Size of src0, src1 and byte_offset parameters must be equal.");
10419  return atomic_update<Op, T, N>(p, byte_offset.read(), src0.read(),
10420  src1.read(), props);
10421 }
10422 
10439 template <atomic_op Op, typename Tx, int N, typename Toffset>
10440 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>, simd<Tx, N>>
10441 atomic_update(Tx *p, Toffset byte_offset, simd<Tx, N> src0, simd<Tx, N> src1,
10442  simd_mask<N> mask) {
10443  return atomic_update<Op, Tx, N>(p, simd<Toffset, N>(byte_offset), src0, src1,
10444  mask);
10445 }
10446 
10463 
10488 // Other properties are ignored.
10492 template <
10493  atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy,
10494  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
10495 __ESIMD_API std::enable_if_t<
10496  __ESIMD_DNS::get_num_args<Op>() == 0 &&
10497  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
10498  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
10499  simd<T, N>>
10500 atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd_mask<N> mask,
10501  PropertyListT props = {}) {
10502 #ifdef __ESIMD_FORCE_STATELESS_MEM
10503  return atomic_update<Op, T, N>(__ESIMD_DNS::accessorToPointer<T>(acc),
10504  byte_offset, mask, props);
10505 #else
10506  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
10507 
10508  if constexpr (detail::has_cache_hints<PropertyListT>() ||
10509  !detail::isPowerOf2(N, 32) || sizeof(T) < 4) {
10511  Op, T, N, detail::lsc_data_size::default_size, PropertyListT>(
10512  acc, byte_offset, mask);
10513  } else {
10514  if constexpr (Op == atomic_op::load) {
10515  if constexpr (std::is_integral_v<T>) {
10516  return atomic_update<atomic_op::bit_or, T, N>(
10517  acc, byte_offset, simd<T, N>(0), mask, props);
10518  } else {
10519  using Tint = detail::uint_type_t<sizeof(T)>;
10520  simd<Tint, N> Res = atomic_update<atomic_op::bit_or, Tint, N>(
10521  acc, byte_offset, simd<Tint, N>(0), mask, props);
10522  return Res.template bit_cast_view<T>();
10523  }
10524  } else {
10525  detail::check_atomic<Op, T, N, 0>();
10526  static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported");
10527 
10528  static_assert(sizeof(T) == 4, "Only 32 bit data is supported");
10529  const auto si = get_surface_index(acc);
10530  using Tx = typename detail::__raw_t<T>;
10531  return __esimd_dword_atomic0<Op, Tx, N>(mask.data(), si,
10532  byte_offset.data());
10533  }
10534  }
10535 #endif
10536 }
10537 
10554 // Other properties are ignored.
10558 template <
10559  atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy,
10560  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
10561 __ESIMD_API std::enable_if_t<
10562  __ESIMD_DNS::get_num_args<Op>() == 0 &&
10563  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
10564  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
10565  simd<T, N>>
10566 atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset,
10567  PropertyListT props = {}) {
10568  simd_mask<N> mask = 1;
10569  return atomic_update<Op, T, N>(acc, byte_offset, mask, props);
10570 }
10571 
10592 // Other properties are ignored.
10596 template <
10597  atomic_op Op, typename T, int N, typename OffsetSimdViewT,
10598  typename AccessorTy,
10599  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
10600 __ESIMD_API std::enable_if_t<
10601  __ESIMD_DNS::get_num_args<Op>() == 0 &&
10602  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
10603  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
10604  detail::is_simd_view_type_v<OffsetSimdViewT>,
10605  simd<T, N>>
10606 atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd_mask<N> mask,
10607  PropertyListT props = {}) {
10608  return atomic_update<Op, T, N>(acc, byte_offset.read(), mask, props);
10609 }
10610 
10630 template <
10631  atomic_op Op, typename T, int N, typename OffsetSimdViewT,
10632  typename AccessorTy,
10633  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
10634 __ESIMD_API std::enable_if_t<
10635  __ESIMD_DNS::get_num_args<Op>() == 0 &&
10636  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
10637  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
10638  detail::is_simd_view_type_v<OffsetSimdViewT>,
10639  simd<T, N>>
10640 atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset,
10641  PropertyListT props = {}) {
10642  simd_mask<N> mask = 1;
10643  return atomic_update<Op, T, N>(acc, byte_offset.read(), mask, props);
10644 }
10645 
10664 template <atomic_op Op, typename T, int N, typename Toffset,
10665  typename AccessorTy>
10666 __ESIMD_API
10667  std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 0 &&
10668  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy>,
10669  simd<T, N>>
10670  atomic_update(AccessorTy acc, Toffset byte_offset, simd_mask<N> mask) {
10671  return atomic_update<Op, T, N>(acc, simd<Toffset, N>(byte_offset), mask);
10672 }
10673 
10692 template <atomic_op Op, typename T, int N, typename AccessorTy>
10693 __ESIMD_API
10694  std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 0 &&
10695  __ESIMD_DNS::is_rw_local_accessor_v<AccessorTy>,
10696  simd<T, N>>
10697  atomic_update(AccessorTy acc, uint32_t byte_offset, simd_mask<N> mask) {
10698  return atomic_update<Op, T, N>(acc, simd<uint32_t, N>(byte_offset), mask);
10699 }
10700 
10722 
10754 
10755 template <
10756  atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy,
10757  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
10758 __ESIMD_API std::enable_if_t<
10759  __ESIMD_DNS::get_num_args<Op>() == 1 &&
10760  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
10761  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
10762  simd<T, N>>
10763 atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
10764  simd_mask<N> mask, PropertyListT props = {}) {
10765 #ifdef __ESIMD_FORCE_STATELESS_MEM
10766  return atomic_update<Op, T, N>(__ESIMD_DNS::accessorToPointer<T>(acc),
10767  byte_offset, src0, mask, props);
10768 #else
10769  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
10770  static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported");
10771  // Auto-convert FP atomics to LSC version.
10772  if constexpr (detail::has_cache_hints<PropertyListT>() ||
10773  Op == atomic_op::fmin || Op == atomic_op::fmax ||
10774  Op == atomic_op::fadd || Op == atomic_op::fsub ||
10775  !__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) {
10777  Op, T, N, detail::lsc_data_size::default_size, PropertyListT>(
10778  acc, byte_offset, src0, mask);
10779  } else if constexpr (Op == atomic_op::store) {
10780  if constexpr (std::is_integral_v<T>) {
10781  return atomic_update<atomic_op::xchg, T, N>(acc, byte_offset, src0, mask,
10782  props);
10783  } else {
10784  using Tint = detail::uint_type_t<sizeof(T)>;
10785  simd<Tint, N> Res = atomic_update<atomic_op::xchg, Tint, N>(
10786  acc, byte_offset, src0.template bit_cast_view<Tint>(), mask, props);
10787  return Res.template bit_cast_view<T>();
10788  }
10789  } else {
10790  detail::check_atomic<Op, T, N, 1>();
10791  static_assert(sizeof(T) == 4, "Only 32 bit data is supported");
10792  const auto si = __ESIMD_NS::get_surface_index(acc);
10793  using Tx = typename detail::__raw_t<T>;
10794  return __esimd_dword_atomic1<Op, Tx, N>(
10795  mask.data(), si, byte_offset.data(),
10796  sycl::bit_cast<__ESIMD_DNS::vector_type_t<Tx, N>>(src0.data()));
10797  }
10798 #endif
10799 }
10800 
10833 
10834 template <
10835  atomic_op Op, typename SrcSimdViewT, typename Toffset,
10836  typename T = SrcSimdViewT::value_type::element_type, int N,
10837  typename AccessorTy,
10838  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
10839 __ESIMD_API std::enable_if_t<
10840  __ESIMD_DNS::get_num_args<Op>() == 1 &&
10841  detail::is_simd_view_type_v<SrcSimdViewT> &&
10842  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
10843  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
10844  simd<T, N>>
10845 atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, SrcSimdViewT src0,
10846  simd_mask<N> mask, PropertyListT props = {}) {
10847  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
10848  "Size of src0 parameter must correspond to the size of "
10849  "byte_offset parameter.");
10850  return atomic_update<Op, T, N>(acc, byte_offset, src0.read(), mask, props);
10851 }
10852 
10885 template <
10886  atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy,
10887  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
10888 __ESIMD_API std::enable_if_t<
10889  __ESIMD_DNS::get_num_args<Op>() == 1 &&
10890  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
10891  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
10892  simd<T, N>>
10893 atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
10894  PropertyListT props = {}) {
10895  simd_mask<N> mask = 1;
10896  return atomic_update<Op, T, N>(acc, byte_offset, src0, mask, props);
10897 }
10898 
10931 template <
10932  atomic_op Op, typename SrcSimdViewT, typename Toffset,
10933  typename T = SrcSimdViewT::value_type::element_type, int N,
10934  typename AccessorTy,
10935  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
10936 __ESIMD_API std::enable_if_t<
10937  __ESIMD_DNS::get_num_args<Op>() == 1 &&
10938  detail::is_simd_view_type_v<SrcSimdViewT> &&
10939  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
10940  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
10941  simd<T, N>>
10942 atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, SrcSimdViewT src0,
10943  PropertyListT props = {}) {
10944  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
10945  "Size of src0 parameter must correspond to the size of "
10946  "byte_offset parameter.");
10947  return atomic_update<Op, T, N>(acc, byte_offset, src0.read(), props);
10948 }
10949 
10982 template <
10983  atomic_op Op, typename T, int N, typename OffsetSimdViewT,
10984  typename AccessorTy,
10985  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
10986 __ESIMD_API std::enable_if_t<
10987  __ESIMD_DNS::get_num_args<Op>() == 1 &&
10988  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
10989  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
10990  detail::is_simd_view_type_v<OffsetSimdViewT>,
10991  simd<T, N>>
10992 atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd<T, N> src0,
10993  simd_mask<N> mask, PropertyListT props = {}) {
10994  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0, mask, props);
10995 }
10996 
11028 template <
11029  atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT,
11030  typename T = SrcSimdViewT::value_type::element_type,
11031  int N = SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
11032  typename AccessorTy,
11033  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
11034 __ESIMD_API std::enable_if_t<
11035  __ESIMD_DNS::get_num_args<Op>() == 1 &&
11036  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
11037  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
11038  detail::is_simd_view_type_v<OffsetSimdViewT> &&
11039  detail::is_simd_view_type_v<SrcSimdViewT>,
11040  simd<T, N>>
11041 atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, SrcSimdViewT src0,
11042  simd_mask<N> mask, PropertyListT props = {}) {
11043  static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
11044  "Size of src0 parameter must correspond to the size of "
11045  "byte_offset parameter.");
11046  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0.read(), mask,
11047  props);
11048 }
11049 
11080 template <
11081  atomic_op Op, typename T, int N, typename OffsetSimdViewT,
11082  typename AccessorTy,
11083  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
11084 __ESIMD_API std::enable_if_t<
11085  __ESIMD_DNS::get_num_args<Op>() == 1 &&
11086  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
11087  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
11088  detail::is_simd_view_type_v<OffsetSimdViewT>,
11089  simd<T, N>>
11090 atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd<T, N> src0,
11091  PropertyListT props = {}) {
11092  simd_mask<N> mask = 1;
11093  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0, mask, props);
11094 }
11095 
11125 template <
11126  atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT,
11127  typename T = SrcSimdViewT::value_type::element_type,
11128  int N = SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
11129  typename AccessorTy,
11130  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
11131 __ESIMD_API std::enable_if_t<
11132  __ESIMD_DNS::get_num_args<Op>() == 1 &&
11133  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
11134  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
11135  detail::is_simd_view_type_v<OffsetSimdViewT> &&
11136  detail::is_simd_view_type_v<SrcSimdViewT>,
11137  simd<T, N>>
11138 atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, SrcSimdViewT src0,
11139  PropertyListT props = {}) {
11140  static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
11141  "Size of src0 parameter must correspond to the size of "
11142  "byte_offset parameter.");
11143  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0.read(), props);
11144 }
11145 
11167 template <atomic_op Op, typename T, int N, typename Toffset,
11168  typename AccessorTy>
11169 __ESIMD_API std::enable_if_t<
11170  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
11171  ((Op != atomic_op::store && Op != atomic_op::xchg) || N == 1),
11172  simd<T, N>>
11173 atomic_update(AccessorTy acc, Toffset offset, simd<T, N> src0,
11174  simd_mask<N> mask) {
11175  return atomic_update<Op, T, N>(acc, simd<Toffset, N>(offset), src0, mask);
11176 }
11177 
11197 template <atomic_op Op, typename Tx, int N, typename AccessorTy>
11198 __ESIMD_API std::enable_if_t<
11199  __ESIMD_DNS::is_rw_local_accessor_v<AccessorTy> &&
11200  ((Op != atomic_op::store && Op != atomic_op::xchg) || N == 1),
11201  simd<Tx, N>>
11202 atomic_update(AccessorTy acc, uint32_t offset, simd<Tx, N> src0,
11203  simd_mask<N> mask) {
11204  return atomic_update<Op, Tx, N>(acc, simd<uint32_t, N>(offset), src0, mask);
11205 }
11206 
11213 // simd_mask<N> mask,props = {}); // (acc-au2-1)
11229 
11233 // simd_mask<N> mask,props = {}); // (acc-au2-1)
11256 // Other properties are ignored.
11260 template <
11261  atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy,
11262  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
11263 __ESIMD_API std::enable_if_t<
11264  __ESIMD_DNS::get_num_args<Op>() == 2 && std::is_integral_v<Toffset> &&
11265  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
11266  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
11267  simd<T, N>>
11268 atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
11269  simd<T, N> src1, simd_mask<N> mask, PropertyListT props = {}) {
11270 #ifdef __ESIMD_FORCE_STATELESS_MEM
11271  return atomic_update<Op, T, N>(__ESIMD_DNS::accessorToPointer<T>(acc),
11272  byte_offset, src0, src1, mask, props);
11273 #else
11274  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
11275  static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported");
11276  // Use LSC atomic when cache hints are present, FP atomics is used,
11277  // non-power of two length is used, operation width greater than 32, or the
11278  // data size is less than 4 bytes,
11279  if constexpr (detail::has_cache_hints<PropertyListT>() ||
11280  Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32) ||
11281  sizeof(T) < 4) {
11282  // 2-argument lsc_atomic_update arguments order matches the standard one -
11283  // expected value first, then new value. But atomic_update uses reverse
11284  // order, hence the src1/src0 swap.
11286  Op, T, N, detail::lsc_data_size::default_size, PropertyListT>(
11287  acc, byte_offset, src1, src0, mask);
11288  } else {
11289  detail::check_atomic<Op, T, N, 2>();
11290  static_assert(sizeof(T) == 4, "Only 32 bit data is supported");
11291  const auto si = __ESIMD_NS::get_surface_index(acc);
11292  using Tx = typename detail::__raw_t<T>;
11293  return __esimd_dword_atomic2<Op, Tx, N>(
11294  mask.data(), si, byte_offset.data(),
11295  sycl::bit_cast<__ESIMD_DNS::vector_type_t<Tx, N>>(src0.data()),
11296  sycl::bit_cast<__ESIMD_DNS::vector_type_t<Tx, N>>(src1.data()));
11297  }
11298 #endif
11299 }
11300 
11304 // simd_mask<N> mask,props = {});
11329 // Other properties are ignored.
11333 template <
11334  atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset,
11335  typename AccessorTy,
11336  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
11337 __ESIMD_API std::enable_if_t<
11338  __ESIMD_DNS::get_num_args<Op>() == 2 && std::is_integral_v<Toffset> &&
11339  detail::is_simd_view_type_v<SrcSimdViewT> &&
11340  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
11341  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
11342  simd<T, N>>
11343 atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, SrcSimdViewT src0,
11344  simd<T, N> src1, simd_mask<N> mask, PropertyListT props = {}) {
11345  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
11346  "Size of src0 parameter must correspond to the size of "
11347  "byte_offset parameter.");
11348  return atomic_update<Op, T, N>(acc, byte_offset, src0.read(), src1, mask,
11349  props);
11350 }
11351 
11355 // simd_mask<N> mask,props = {});
11380 // Other properties are ignored.
11384 template <
11385  atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset,
11386  typename AccessorTy,
11387  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
11388 __ESIMD_API std::enable_if_t<
11389  __ESIMD_DNS::get_num_args<Op>() == 2 && std::is_integral_v<Toffset> &&
11390  detail::is_simd_view_type_v<SrcSimdViewT> &&
11391  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
11392  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
11393  simd<T, N>>
11394 atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
11395  SrcSimdViewT src1, simd_mask<N> mask, PropertyListT props = {}) {
11396  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
11397  "Size of src1 parameter must correspond to the size of "
11398  "byte_offset parameter.");
11399  return atomic_update<Op, T, N>(acc, byte_offset, src0, src1.read(), mask,
11400  props);
11401 }
11402 
11406 // simd_mask<N> mask,props = {});
11431 // Other properties are ignored.
11435 template <
11436  atomic_op Op, typename SrcSimdViewT,
11437  typename T = SrcSimdViewT::value_type::element_type, int N,
11438  typename Toffset, typename AccessorTy,
11439  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
11440 __ESIMD_API std::enable_if_t<
11441  __ESIMD_DNS::get_num_args<Op>() == 2 && std::is_integral_v<Toffset> &&
11442  detail::is_simd_view_type_v<SrcSimdViewT> &&
11443  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
11444  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
11445  simd<T, N>>
11446 atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, SrcSimdViewT src0,
11447  SrcSimdViewT src1, simd_mask<N> mask, PropertyListT props = {}) {
11448  static_assert(
11449  N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
11450  "Size of src0 and src1 parameters must correspond to the size of "
11451  "byte_offset parameter.");
11452  return atomic_update<Op, T, N>(acc, byte_offset, src0.read(), src1.read(),
11453  mask, props);
11454 }
11455 
11473 // Other properties are ignored.
11477 template <
11478  atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy,
11479  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
11480 __ESIMD_API std::enable_if_t<
11481  __ESIMD_DNS::get_num_args<Op>() == 2 &&
11482  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
11483  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
11484  simd<T, N>>
11485 atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
11486  simd<T, N> src1, PropertyListT props = {}) {
11487  simd_mask<N> mask = 1;
11488  return atomic_update<Op, T, N>(acc, byte_offset, src0, src1, mask, props);
11489 }
11490 
11494 // props = {});
11519 // Other properties are ignored.
11523 template <
11524  atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset,
11525  typename AccessorTy,
11526  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
11527 __ESIMD_API std::enable_if_t<
11528  __ESIMD_DNS::get_num_args<Op>() == 2 && std::is_integral_v<Toffset> &&
11529  detail::is_simd_view_type_v<SrcSimdViewT> &&
11530  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
11531  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
11532  simd<T, N>>
11533 atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, SrcSimdViewT src0,
11534  simd<T, N> src1, PropertyListT props = {}) {
11535  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
11536  "Size of src0 parameter must correspond to the size of "
11537  "byte_offset parameter.");
11538  return atomic_update<Op, T, N>(acc, byte_offset, src0.read(), src1, props);
11539 }
11540 
11544 // props = {});
11569 // Other properties are ignored.
11573 template <
11574  atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset,
11575  typename AccessorTy,
11576  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
11577 __ESIMD_API std::enable_if_t<
11578  __ESIMD_DNS::get_num_args<Op>() == 2 && std::is_integral_v<Toffset> &&
11579  detail::is_simd_view_type_v<SrcSimdViewT> &&
11580  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
11581  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
11582  simd<T, N>>
11583 atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
11584  SrcSimdViewT src1, PropertyListT props = {}) {
11585  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
11586  "Size of src1 parameter must correspond to the size of "
11587  "byte_offset parameter.");
11588  return atomic_update<Op, T, N>(acc, byte_offset, src0, src1.read(), props);
11589 }
11590 
11594 // props = {});
11619 // Other properties are ignored.
/// Two-operand atomic_update overload taking both \p src0 and \p src1 as
/// simd_view objects of the same view type.
///
/// Reads both views into vectors and forwards to
/// atomic_update<Op, T, N>(acc, byte_offset, src0.read(), src1.read(), props).
/// \p T defaults to the element type of the view's underlying vector type.
/// Enabled only when get_num_args<Op>() == 2, \p Toffset is integral,
/// \p SrcSimdViewT satisfies is_simd_view_type_v, \p AccessorTy satisfies
/// is_rw_device_accessor_v and \p PropertyListT is a property list.
///
/// @param acc accessor, passed through unchanged.
/// @param byte_offset vector of N offsets, passed through unchanged.
/// @param src0 view supplying the first operand; must cover N elements.
/// @param src1 view supplying the second operand (same type as \p src0).
/// @param props property list, passed through unchanged.
/// @return the simd<T, N> produced by the forwarded call.
11623 template <
11624  atomic_op Op, typename SrcSimdViewT,
// 'typename' is required on the dependent type name before C++20.
11625  typename T = typename SrcSimdViewT::value_type::element_type, int N,
11626  typename Toffset, typename AccessorTy,
11627  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
11628 __ESIMD_API std::enable_if_t<
11629  __ESIMD_DNS::get_num_args<Op>() == 2 && std::is_integral_v<Toffset> &&
11630  detail::is_simd_view_type_v<SrcSimdViewT> &&
11631  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
11632  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
11633  simd<T, N>>
11634 atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, SrcSimdViewT src0,
11635  SrcSimdViewT src1, PropertyListT props = {}) {
11636  static_assert(
11637  N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
11638  "Size of src0 and src1 parameters must correspond to the size of "
11639  "byte_offset parameter.");
11640  return atomic_update<Op, T, N>(acc, byte_offset, src0.read(), src1.read(),
11641  props);
11642 }
11643 
11664 // Other properties are ignored.
/// Masked two-operand atomic_update overload taking the offsets as a
/// simd_view.
///
/// Reads \p byte_offset into a vector and forwards to
/// atomic_update<Op, T, N>(acc, byte_offset.read(), src0, src1, mask, props).
/// Enabled only when get_num_args<Op>() == 2, \p AccessorTy satisfies
/// is_rw_device_accessor_v, \p PropertyListT is a property list and
/// \p OffsetSimdViewT satisfies is_simd_view_type_v.
///
/// @param acc accessor, passed through unchanged.
/// @param byte_offset view supplying the offsets.
/// @param src0 first operand vector.
/// @param src1 second operand vector.
/// @param mask N-element mask, passed through unchanged.
/// @param props property list, passed through unchanged.
/// @return the simd<T, N> produced by the forwarded call.
11667 template <
11668  atomic_op Op, typename T, int N, typename OffsetSimdViewT,
11669  typename AccessorTy,
11670  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
11671 __ESIMD_API std::enable_if_t<
11672  __ESIMD_DNS::get_num_args<Op>() == 2 &&
11673  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
11674  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
11675  detail::is_simd_view_type_v<OffsetSimdViewT>,
11676  simd<T, N>>
11677 atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd<T, N> src0,
11678  simd<T, N> src1, simd_mask<N> mask, PropertyListT props = {}) {
11679  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0, src1, mask,
11680  props);
11681 }
11682 
11704 // Other properties are ignored.
/// Masked two-operand atomic_update overload taking the offsets and \p src0
/// as simd_view objects.
///
/// Reads both views and forwards to
/// atomic_update<Op, T, N>(acc, byte_offset.read(), src0.read(), src1, mask,
/// props). Enabled only when get_num_args<Op>() == 2, \p AccessorTy satisfies
/// is_rw_device_accessor_v, \p PropertyListT is a property list and both view
/// types satisfy is_simd_view_type_v.
///
/// @param acc accessor, passed through unchanged.
/// @param byte_offset view supplying the offsets; must cover N elements.
/// @param src0 view supplying the first operand; must cover N elements.
/// @param src1 second operand vector.
/// @param mask N-element mask, passed through unchanged.
/// @param props property list, passed through unchanged.
/// @return the simd<T, N> produced by the forwarded call.
11707 template <
11708  atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T,
11709  int N, typename AccessorTy,
11710  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
11711 __ESIMD_API std::enable_if_t<
11712  __ESIMD_DNS::get_num_args<Op>() == 2 &&
11713  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
11714  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
11715  detail::is_simd_view_type_v<OffsetSimdViewT> &&
11716  detail::is_simd_view_type_v<SrcSimdViewT>,
11717  simd<T, N>>
11718 atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, SrcSimdViewT src0,
11719  simd<T, N> src1, simd_mask<N> mask, PropertyListT props = {}) {
// Both views must cover exactly N elements (SizeX * SizeY).
11720  static_assert(
11721  N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() &&
11722  N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
11723  "Size of src0 and byte_offset parameters must correspond to the size of "
11724  "src1 parameter.");
11725  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0.read(), src1,
11726  mask, props);
11727 }
11728 
11750 // Other properties are ignored.
/// Masked two-operand atomic_update overload taking the offsets and \p src1
/// as simd_view objects.
///
/// Reads both views and forwards to
/// atomic_update<Op, T, N>(acc, byte_offset.read(), src0, src1.read(), mask,
/// props). Enabled only when get_num_args<Op>() == 2, \p AccessorTy satisfies
/// is_rw_device_accessor_v, \p PropertyListT is a property list and both view
/// types satisfy is_simd_view_type_v.
///
/// @param acc accessor, passed through unchanged.
/// @param byte_offset view supplying the offsets; must cover N elements.
/// @param src0 first operand vector.
/// @param src1 view supplying the second operand; must cover N elements.
/// @param mask N-element mask, passed through unchanged.
/// @param props property list, passed through unchanged.
/// @return the simd<T, N> produced by the forwarded call.
11753 template <
11754  atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T,
11755  int N, typename AccessorTy,
11756  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
11757 __ESIMD_API std::enable_if_t<
11758  __ESIMD_DNS::get_num_args<Op>() == 2 &&
11759  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
11760  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
11761  detail::is_simd_view_type_v<OffsetSimdViewT> &&
11762  detail::is_simd_view_type_v<SrcSimdViewT>,
11763  simd<T, N>>
11764 atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd<T, N> src0,
11765  SrcSimdViewT src1, simd_mask<N> mask, PropertyListT props = {}) {
// Both views must cover exactly N elements (SizeX * SizeY).
11766  static_assert(
11767  N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() &&
11768  N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
11769  "Size of src1 and byte_offset parameters must correspond to the size of "
11770  "src0 parameter.");
11771  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0, src1.read(),
11772  mask, props);
11773 }
11774 
11796 // Other properties are ignored.
/// Masked two-operand atomic_update overload taking the offsets, \p src0 and
/// \p src1 as simd_view objects (\p src0 and \p src1 share one view type).
///
/// Reads all three views and forwards to
/// atomic_update<Op, T, N>(acc, byte_offset.read(), src0.read(), src1.read(),
/// mask, props). \p T defaults to the element type of the source view's
/// underlying vector type; \p N is taken from \p mask.
/// Enabled only when get_num_args<Op>() == 2, \p AccessorTy satisfies
/// is_rw_device_accessor_v, \p PropertyListT is a property list and both view
/// types satisfy is_simd_view_type_v.
///
/// @param acc accessor, passed through unchanged.
/// @param byte_offset view supplying the offsets; must cover N elements.
/// @param src0 view supplying the first operand; must cover N elements.
/// @param src1 view supplying the second operand (same type as \p src0).
/// @param mask N-element mask, passed through unchanged.
/// @param props property list, passed through unchanged.
/// @return the simd<T, N> produced by the forwarded call.
11799 template <
11800  atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT,
// 'typename' is required on the dependent type name before C++20.
11801  typename T = typename SrcSimdViewT::value_type::element_type, int N,
11802  typename AccessorTy,
11803  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
11804 __ESIMD_API std::enable_if_t<
11805  __ESIMD_DNS::get_num_args<Op>() == 2 &&
11806  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
11807  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
11808  detail::is_simd_view_type_v<OffsetSimdViewT> &&
11809  detail::is_simd_view_type_v<SrcSimdViewT>,
11810  simd<T, N>>
11811 atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, SrcSimdViewT src0,
11812  SrcSimdViewT src1, simd_mask<N> mask, PropertyListT props = {}) {
11813  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() &&
11814  N == OffsetSimdViewT::getSizeX() *
11815  OffsetSimdViewT::getSizeY(),
11816  "Size of src0, src1 and byte_offset parameters must correspond "
11817  "to the size of "
11818  "mask parameter.");
11819  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0.read(),
11820  src1.read(), mask, props);
11821 }
11822 
11841 // Other properties are ignored.
/// Unmasked two-operand atomic_update overload taking the offsets as a
/// simd_view.
///
/// Builds a simd_mask<N> initialized from 1, reads \p byte_offset into a
/// vector and forwards to
/// atomic_update<Op, T, N>(acc, byte_offset.read(), src0, src1, mask, props).
/// Enabled only when get_num_args<Op>() == 2, \p AccessorTy satisfies
/// is_rw_device_accessor_v, \p PropertyListT is a property list and
/// \p OffsetSimdViewT satisfies is_simd_view_type_v.
///
/// @param acc accessor, passed through unchanged.
/// @param byte_offset view supplying the offsets.
/// @param src0 first operand vector.
/// @param src1 second operand vector.
/// @param props property list, passed through unchanged.
/// @return the simd<T, N> produced by the forwarded call.
11844 template <
11845  atomic_op Op, typename T, int N, typename OffsetSimdViewT,
11846  typename AccessorTy,
11847  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
11848 __ESIMD_API std::enable_if_t<
11849  __ESIMD_DNS::get_num_args<Op>() == 2 &&
11850  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
11851  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
11852  detail::is_simd_view_type_v<OffsetSimdViewT>,
11853  simd<T, N>>
11854 atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd<T, N> src0,
11855  simd<T, N> src1, PropertyListT props = {}) {
11856  simd_mask<N> mask = 1;
11857  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0, src1, mask,
11858  props);
11859 }
11860 
11880 // Other properties are ignored.
/// Unmasked two-operand atomic_update overload taking the offsets and
/// \p src0 as simd_view objects.
///
/// Reads both views and forwards to
/// atomic_update<Op, T, N>(acc, byte_offset.read(), src0.read(), src1, props).
/// Enabled only when get_num_args<Op>() == 2, \p AccessorTy satisfies
/// is_rw_device_accessor_v, \p PropertyListT is a property list and both view
/// types satisfy is_simd_view_type_v.
///
/// @param acc accessor, passed through unchanged.
/// @param byte_offset view supplying the offsets; must cover N elements.
/// @param src0 view supplying the first operand; must cover N elements.
/// @param src1 second operand vector.
/// @param props property list, passed through unchanged.
/// @return the simd<T, N> produced by the forwarded call.
11883 template <
11884  atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T,
11885  int N, typename AccessorTy,
11886  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
11887 __ESIMD_API std::enable_if_t<
11888  __ESIMD_DNS::get_num_args<Op>() == 2 &&
11889  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
11890  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
11891  detail::is_simd_view_type_v<OffsetSimdViewT> &&
11892  detail::is_simd_view_type_v<SrcSimdViewT>,
11893  simd<T, N>>
11894 atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, SrcSimdViewT src0,
11895  simd<T, N> src1, PropertyListT props = {}) {
// Both views must cover exactly N elements (SizeX * SizeY).
11896  static_assert(
11897  N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() &&
11898  N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
11899  "Size of src0 and byte_offset parameters must correspond to the size of "
11900  "src1 parameter.");
11901  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0.read(), src1,
11902  props);
11903 }
11904 
11924 // Other properties are ignored.
/// Unmasked two-operand atomic_update overload taking the offsets and
/// \p src1 as simd_view objects.
///
/// Reads both views and forwards to
/// atomic_update<Op, T, N>(acc, byte_offset.read(), src0, src1.read(), props).
/// Enabled only when get_num_args<Op>() == 2, \p AccessorTy satisfies
/// is_rw_device_accessor_v, \p PropertyListT is a property list and both view
/// types satisfy is_simd_view_type_v.
///
/// @param acc accessor, passed through unchanged.
/// @param byte_offset view supplying the offsets; must cover N elements.
/// @param src0 first operand vector.
/// @param src1 view supplying the second operand; must cover N elements.
/// @param props property list, passed through unchanged.
/// @return the simd<T, N> produced by the forwarded call.
11927 template <
11928  atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T,
11929  int N, typename AccessorTy,
11930  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
11931 __ESIMD_API std::enable_if_t<
11932  __ESIMD_DNS::get_num_args<Op>() == 2 &&
11933  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
11934  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
11935  detail::is_simd_view_type_v<OffsetSimdViewT> &&
11936  detail::is_simd_view_type_v<SrcSimdViewT>,
11937  simd<T, N>>
11938 atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd<T, N> src0,
11939  SrcSimdViewT src1, PropertyListT props = {}) {
// Both views must cover exactly N elements (SizeX * SizeY).
11940  static_assert(
11941  N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() &&
11942  N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
11943  "Size of src1 and byte_offset parameters must correspond to the size of "
11944  "src0 parameter.");
11945  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0, src1.read(),
11946  props);
11947 }
11948 
11968 // Other properties are ignored.
/// Unmasked two-operand atomic_update overload taking the offsets, \p src0
/// and \p src1 as simd_view objects (\p src0 and \p src1 share one view type).
///
/// Reads all three views and forwards to
/// atomic_update<Op, T, N>(acc, byte_offset.read(), src0.read(), src1.read(),
/// props). \p T defaults to the element type of the source view's underlying
/// vector type and \p N defaults to the source view's element count.
/// Enabled only when get_num_args<Op>() == 2, \p AccessorTy satisfies
/// is_rw_device_accessor_v, \p PropertyListT is a property list and both view
/// types satisfy is_simd_view_type_v.
///
/// @param acc accessor, passed through unchanged.
/// @param byte_offset view supplying the offsets; must cover N elements.
/// @param src0 view supplying the first operand.
/// @param src1 view supplying the second operand (same type as \p src0).
/// @param props property list, passed through unchanged.
/// @return the simd<T, N> produced by the forwarded call.
11971 template <
11972  atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT,
// 'typename' is required on the dependent type name before C++20.
11973  typename T = typename SrcSimdViewT::value_type::element_type,
11974  int N = SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
11975  typename AccessorTy,
11976  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
11977 __ESIMD_API std::enable_if_t<
11978  __ESIMD_DNS::get_num_args<Op>() == 2 &&
11979  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
11980  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
11981  detail::is_simd_view_type_v<OffsetSimdViewT> &&
11982  detail::is_simd_view_type_v<SrcSimdViewT>,
11983  simd<T, N>>
11984 atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, SrcSimdViewT src0,
11985  SrcSimdViewT src1, PropertyListT props = {}) {
11986  static_assert(
11987  N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
11988  "Size of src0, src1 and byte_offset parameters must correspond.");
11989  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0.read(),
11990  src1.read(), props);
11991 }
11992 
/// Masked two-operand atomic_update overload taking a single scalar offset.
///
/// Constructs a simd<Toffset, N> from \p offset and forwards to
/// atomic_update<Op, Tx, N>(acc, simd<Toffset, N>(offset), src0, src1, mask).
/// Enabled only when \p AccessorTy satisfies is_rw_device_accessor_v.
///
/// @param acc accessor, passed through unchanged.
/// @param offset scalar offset used to construct the offset vector.
/// @param src0 first operand vector.
/// @param src1 second operand vector.
/// @param mask N-element mask, passed through unchanged.
/// @return the simd<Tx, N> produced by the forwarded call.
12013 template <atomic_op Op, typename Tx, int N, typename Toffset,
12014  typename AccessorTy>
12015 __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_rw_device_accessor_v<AccessorTy>,
12016  simd<Tx, N>>
12017 atomic_update(AccessorTy acc, Toffset offset, simd<Tx, N> src0,
12018  simd<Tx, N> src1, simd_mask<N> mask) {
12019  return atomic_update<Op, Tx, N>(acc, simd<Toffset, N>(offset), src0, src1,
12020  mask);
12021 }
12022 
/// Masked two-operand atomic_update overload for local accessors taking a
/// single scalar uint32_t offset.
///
/// Constructs a simd<uint32_t, N> from \p offset and forwards to
/// atomic_update<Op, Tx, N>(acc, simd<uint32_t, N>(offset), src0, src1, mask).
/// Enabled only when \p AccessorTy satisfies is_rw_local_accessor_v.
///
/// @param acc local accessor, passed through unchanged.
/// @param offset scalar offset used to construct the offset vector.
/// @param src0 first operand vector.
/// @param src1 second operand vector.
/// @param mask N-element mask, passed through unchanged.
/// @return the simd<Tx, N> produced by the forwarded call.
12040 template <atomic_op Op, typename Tx, int N, typename AccessorTy>
12041 __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_rw_local_accessor_v<AccessorTy>,
12042  simd<Tx, N>>
12043 atomic_update(AccessorTy acc, uint32_t offset, simd<Tx, N> src0,
12044  simd<Tx, N> src1, simd_mask<N> mask) {
12045  return atomic_update<Op, Tx, N>(acc, simd<uint32_t, N>(offset), src0, src1,
12046  mask);
12047 }
12048 
12050 
12053 
/// Enumeration with uint8_t underlying type.
// NOTE(review): the enumerator list is not visible in this listing (the lines
// between the opening and closing brace are elided) - consult the original
// header for the individual values.
12056 enum fence_mask : uint8_t {
12071 };
12072 
/// Passes the compile-time control byte \p cntl to __esimd_fence.
///
/// @tparam cntl uint8_t value forwarded as the single argument of
/// __esimd_fence (presumably a combination of fence_mask bits - confirm).
12076 template <uint8_t cntl> __ESIMD_API void fence() { __esimd_fence(cntl); }
12077 
/// Issues __esimd_lsc_fence for the memory kind \p Kind with the given flush
/// operation and scope, using a 16-element mask initialized from 1.
///
/// For Kind == memory_kind::local the only accepted combination is
/// FenceOp == fence_flush_op::none with Scope == fence_scope::group; any
/// other combination fails the static_assert below.
// NOTE(review): the template parameter list appears truncated in this listing;
// FenceOp and Scope are used below but their declarations are not visible.
12084 template <memory_kind Kind = memory_kind::global,
12087 __ESIMD_API void fence() {
12088  static_assert(
12089  Kind != memory_kind::local ||
12090  (FenceOp == fence_flush_op::none && Scope == fence_scope::group),
12091  "SLM fence must have 'none' lsc_fence_op and 'group' scope");
12092  constexpr int N = 16;
12093  simd_mask<N> Mask = 1;
// Kind, FenceOp and Scope are passed to the intrinsic as uint8_t values.
12094  __esimd_lsc_fence<static_cast<uint8_t>(Kind), static_cast<uint8_t>(FenceOp),
12095  static_cast<uint8_t>(Scope), N>(Mask.data());
12096 }
12097 
/// Calls __esimd_barrier().
// NOTE(review): one statement between the opening brace and the
// __esimd_barrier() call is elided in this listing; check the original header
// before relying on this description being complete.
12106 __ESIMD_API void barrier() {
12108  __esimd_barrier();
12109 }
12111 
12114 
/// Loads an m-row by N-column block of T through the surface obtained from
/// \p acc and returns it as a simd<T, m * N>.
///
/// Compile-time requirements enforced below: N * sizeof(T) * m <= 256,
/// N * sizeof(T) <= 64, m <= 64, plane <= 3 and N is a power of two.
///
/// @tparam T element type.
/// @tparam m block height (rows).
/// @tparam N block width (elements per row).
/// @tparam AccessorTy accessor type accepted by get_surface_index.
/// @tparam plane plane index passed to the intrinsic.
/// @param acc accessor identifying the surface.
/// @param x first coordinate, passed to __esimd_media_ld unchanged.
/// @param y second coordinate, passed to __esimd_media_ld unchanged.
/// @return the m * N loaded elements.
12127 template <typename T, int m, int N, typename AccessorTy, unsigned plane = 0>
12128 __ESIMD_API simd<T, m * N> media_block_load(AccessorTy acc, unsigned x,
12129  unsigned y) {
// Row width in bytes.
12130  constexpr unsigned Width = N * sizeof(T);
12131  static_assert(Width * m <= 256u,
12132  "data does not fit into a single dataport transaction");
12133  static_assert(Width <= 64u, "valid block width is in range [1, 64]");
12134  static_assert(m <= 64u, "valid block height is in range [1, 64]");
12135  static_assert(plane <= 3u, "valid plane index is in range [0, 3]");
// NOTE(review): media_block_store has no matching power-of-two check, and with
// this check in place the padded path below is presumably reachable only for
// rows narrower than 4 bytes - confirm the restriction is intended.
12136  static_assert(detail::isPowerOf2(N), "N must be a power of 2");
12137 
12138  const auto si = __ESIMD_NS::get_surface_index(acc);
12139  using SurfIndTy = decltype(si);
// Row width used for the transaction: at least 4 bytes, otherwise the value
// returned by getNextPowerOf2<Width>().
12140  constexpr unsigned int RoundedWidth =
12141  Width < 4 ? 4 : detail::getNextPowerOf2<Width>();
12142  constexpr int BlockWidth = sizeof(T) * N;
12143  constexpr int Mod = 0;
12144 
12145  if constexpr (Width < RoundedWidth) {
// Load rows of n1 elements, then keep the first N columns of each of the m
// rows.
12146  constexpr unsigned int n1 = RoundedWidth / sizeof(T);
12147  simd<T, m * n1> temp =
12148  __esimd_media_ld<T, m, n1, Mod, SurfIndTy, (int)plane, BlockWidth>(
12149  si, x, y);
12150  return temp.template select<m, 1, N, 1>(0, 0);
12151  } else {
12152  return __esimd_media_ld<T, m, N, Mod, SurfIndTy, (int)plane, BlockWidth>(
12153  si, x, y);
12154  }
12155 }
12156 
/// Stores an m-row by N-column block of T, given as simd<T, m * N>, through
/// the surface obtained from \p acc.
///
/// Compile-time requirements enforced below: N * sizeof(T) * m <= 256,
/// N * sizeof(T) <= 64, m <= 64 and plane <= 3.
///
/// @tparam T element type.
/// @tparam m block height (rows).
/// @tparam N block width (elements per row).
/// @tparam AccessorTy accessor type accepted by get_surface_index.
/// @tparam plane plane index passed to the intrinsic.
/// @param acc accessor identifying the surface.
/// @param x first coordinate, passed to __esimd_media_st unchanged.
/// @param y second coordinate, passed to __esimd_media_st unchanged.
/// @param vals the m * N elements to store.
12169 template <typename T, int m, int N, typename AccessorTy, unsigned plane = 0>
12170 __ESIMD_API void media_block_store(AccessorTy acc, unsigned x, unsigned y,
12171  simd<T, m * N> vals) {
// Row width in bytes.
12172  constexpr unsigned Width = N * sizeof(T);
12173  static_assert(Width * m <= 256u,
12174  "data does not fit into a single dataport transaction");
12175  static_assert(Width <= 64u, "valid block width is in range [1, 64]");
12176  static_assert(m <= 64u, "valid block height is in range [1, 64]");
12177  static_assert(plane <= 3u, "valid plane index is in range [0, 3]");
12178  const auto si = __ESIMD_NS::get_surface_index(acc);
12179  using SurfIndTy = decltype(si);
// Row width used for the transaction: at least 4 bytes, otherwise the value
// returned by getNextPowerOf2<Width>().
12180  constexpr unsigned int RoundedWidth =
12181  Width < 4 ? 4 : detail::getNextPowerOf2<Width>();
12182  constexpr unsigned int n1 = RoundedWidth / sizeof(T);
12183  constexpr int BlockWidth = sizeof(T) * N;
12184  constexpr int Mod = 0;
12185 
12186  if constexpr (Width < RoundedWidth) {
// Copy vals into the first N columns of an m x n1 buffer; columns N..n1-1 of
// temp are not assigned in this function.
12187  simd<T, m * n1> temp;
12188  auto temp_ref = temp.template bit_cast_view<T, m, n1>();
12189  auto vals_ref = vals.template bit_cast_view<T, m, N>();
12190  temp_ref.template select<m, 1, N, 1>() = vals_ref;
12191  __esimd_media_st<T, m, n1, Mod, SurfIndTy, plane, BlockWidth>(si, x, y,
12192  temp.data());
12193  } else {
12194  __esimd_media_st<T, m, N, Mod, SurfIndTy, plane, BlockWidth>(si, x, y,
12195  vals.data());
12196  }
12197 }
12198 
/// block_load overload for local accessors with read capability.
///
/// Adds the value returned by detail::localAccessorToOffset(acc) to
/// \p byte_offset and forwards to slm_block_load<T, N>(offset, flags).
/// Enabled only when \p AccessorTy is a local accessor with can_read and
/// \p Flags satisfies is_simd_flag_type_v.
///
/// @param acc local accessor.
/// @param byte_offset offset added to the accessor's own offset.
/// @param flags flag object, passed through unchanged.
/// @return the simd<T, N> produced by slm_block_load.
// NOTE(review): the template parameter list appears truncated in this listing;
// the declaration of Flags is not visible.
12217 template <typename T, int N, typename AccessorTy,
12219 __ESIMD_API
12220  std::enable_if_t<detail::is_local_accessor_with_v<
12221  AccessorTy, detail::accessor_mode_cap::can_read> &&
12222  is_simd_flag_type_v<Flags>,
12223  simd<T, N>>
12224  block_load(AccessorTy acc, uint32_t byte_offset, Flags flags) {
12225  return slm_block_load<T, N>(byte_offset + detail::localAccessorToOffset(acc),
12226  flags);
12227 }
12228 
/// block_store overload for local accessors with write capability.
///
/// Adds the value returned by __ESIMD_DNS::localAccessorToOffset(acc) to
/// \p offset and forwards to slm_block_store<T, N>(offset, vals, flags).
/// Enabled only when \p AccessorT is a local accessor with can_write and
/// \p Flags satisfies is_simd_flag_type_v.
///
/// @param acc local accessor.
/// @param offset offset added to the accessor's own offset.
/// @param vals the N values to store.
/// @param flags flag object, passed through unchanged.
12246 template <typename T, int N, typename AccessorT, typename Flags>
12247 __ESIMD_API
12248  std::enable_if_t<detail::is_local_accessor_with_v<
12249  AccessorT, detail::accessor_mode_cap::can_write> &&
12250  is_simd_flag_type_v<Flags>>
12251  block_store(AccessorT acc, uint32_t offset, simd<T, N> vals, Flags flags) {
12252  slm_block_store<T, N>(offset + __ESIMD_DNS::localAccessorToOffset(acc), vals,
12253  flags);
12254 }
12255 
12286 // typename PropertyListT = empty_properties_t>
12295 
/// Local-accessor gather with explicit VS, mask and pass-through vector.
///
/// Adds the value returned by localAccessorToOffset(acc) to every element of
/// \p byte_offsets and forwards to
/// slm_gather<T, N, VS>(offsets, mask, pass_thru, props).
/// Enabled only when \p AccessorT is a local accessor with can_read and
/// \p PropertyListT is a property list.
///
/// @param acc local accessor.
/// @param byte_offsets N / VS offsets.
/// @param mask N / VS mask elements, passed through unchanged.
/// @param pass_thru N-element vector, passed through unchanged.
/// @param props property list, passed through unchanged.
/// @return the simd<T, N> produced by slm_gather.
12327 template <
12328  typename T, int N, int VS, typename AccessorT,
12329  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
12330 __ESIMD_API std::enable_if_t<
12331  (detail::is_local_accessor_with_v<AccessorT,
12332  detail::accessor_mode_cap::can_read> &&
12333  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
12334  simd<T, N>>
12335 gather(AccessorT acc, simd<uint32_t, N / VS> byte_offsets,
12336  simd_mask<N / VS> mask, simd<T, N> pass_thru, PropertyListT props = {}) {
12337  return slm_gather<T, N, VS>(byte_offsets +
12338  __ESIMD_DNS::localAccessorToOffset(acc),
12339  mask, pass_thru, props);
12340 }
12341 
/// Local-accessor gather with explicit VS and mask, without pass-through.
///
/// Adds the value returned by localAccessorToOffset(acc) to every element of
/// \p byte_offsets and forwards to slm_gather<T, N, VS>(offsets, mask, props).
/// Enabled only when \p AccessorT is a local accessor with can_read and
/// \p PropertyListT is a property list.
///
/// @param acc local accessor.
/// @param byte_offsets N / VS offsets.
/// @param mask N / VS mask elements, passed through unchanged.
/// @param props property list, passed through unchanged.
/// @return the simd<T, N> produced by slm_gather.
12372 template <
12373  typename T, int N, int VS, typename AccessorT,
12374  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
12375 __ESIMD_API std::enable_if_t<
12376  (detail::is_local_accessor_with_v<AccessorT,
12377  detail::accessor_mode_cap::can_read> &&
12378  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
12379  simd<T, N>>
12380 gather(AccessorT acc, simd<uint32_t, N / VS> byte_offsets,
12381  simd_mask<N / VS> mask, PropertyListT props = {}) {
12382  return slm_gather<T, N, VS>(
12383  byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), mask, props);
12384 }
12385 
/// Local-accessor gather with explicit VS, without mask or pass-through.
///
/// Adds the value returned by localAccessorToOffset(acc) to every element of
/// \p byte_offsets and forwards to slm_gather<T, N, VS>(offsets, props).
/// Enabled only when \p AccessorT is a local accessor with can_read and
/// \p PropertyListT is a property list.
///
/// @param acc local accessor.
/// @param byte_offsets N / VS offsets.
/// @param props property list, passed through unchanged.
/// @return the simd<T, N> produced by slm_gather.
12410 template <
12411  typename T, int N, int VS, typename AccessorT,
12412  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
12413 __ESIMD_API std::enable_if_t<
12414  (detail::is_local_accessor_with_v<AccessorT,
12415  detail::accessor_mode_cap::can_read> &&
12416  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
12417  simd<T, N>>
12418 gather(AccessorT acc, simd<uint32_t, N / VS> byte_offsets,
12419  PropertyListT props = {}) {
12420  return slm_gather<T, N, VS>(
12421  byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), props);
12422 }
12423 
12433 // Dev note: the mask type was turned into template parameter `MaskT` to
12434 // avoid the conflicts of this prototype with the old gather() function
12435 // accepting a 'global_offset' parameter and avoid 'ambiguous call' errors
12436 // for calls like this: gather(acc, byte_offsets_simd, 0, mask);
/// Local-accessor gather (no VS parameter) with mask and pass-through vector.
///
/// Adds the value returned by localAccessorToOffset(acc) to every element of
/// \p byte_offsets and forwards to
/// slm_gather<T, N>(offsets, mask, pass_thru, props).
/// \p MaskT is a template parameter that is required to be exactly
/// simd_mask<N>. Also requires a local accessor with can_read and a property
/// list.
///
/// @param acc local accessor.
/// @param byte_offsets N offsets.
/// @param mask N-element mask, passed through unchanged.
/// @param pass_thru N-element vector, passed through unchanged.
/// @param props property list, passed through unchanged.
/// @return the simd<T, N> produced by slm_gather.
12437 template <
12438  typename T, int N, typename AccessorT, typename MaskT,
12439  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
12440 __ESIMD_API std::enable_if_t<
12441  (detail::is_local_accessor_with_v<AccessorT,
12442  detail::accessor_mode_cap::can_read> &&
12443  std::is_same_v<MaskT, simd_mask<N>> &&
12444  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
12445  simd<T, N>>
12446 gather(AccessorT acc, simd<uint32_t, N> byte_offsets, MaskT mask,
12447  simd<T, N> pass_thru, PropertyListT props = {}) {
12448  return slm_gather<T, N>(byte_offsets +
12449  __ESIMD_DNS::localAccessorToOffset(acc),
12450  mask, pass_thru, props);
12451 }
12452 
12460 // Dev note: the mask type was turned into template parameter `MaskT` to
12461 // avoid the conflicts of this prototype with the old gather() function
12462 // accepting a 'global_offset' parameter and avoid 'ambiguous call' errors
12463 // for calls like this: gather(acc, byte_offsets_simd, 0);
/// Local-accessor gather (no VS parameter) with mask, without pass-through.
///
/// Adds the value returned by localAccessorToOffset(acc) to every element of
/// \p byte_offsets and forwards to slm_gather<T, N>(offsets, mask, props).
/// \p MaskT is a template parameter that is required to be exactly
/// simd_mask<N>. Also requires a local accessor with can_read and a property
/// list.
///
/// @param acc local accessor.
/// @param byte_offsets N offsets.
/// @param mask N-element mask, passed through unchanged.
/// @param props property list, passed through unchanged.
/// @return the simd<T, N> produced by slm_gather.
12464 template <
12465  typename T, int N, typename AccessorT, typename MaskT,
12466  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
12467 __ESIMD_API std::enable_if_t<
12468  (detail::is_local_accessor_with_v<AccessorT,
12469  detail::accessor_mode_cap::can_read> &&
12470  std::is_same_v<MaskT, simd_mask<N>> &&
12471  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
12472  simd<T, N>>
12473 gather(AccessorT acc, simd<uint32_t, N> byte_offsets, MaskT mask,
12474  PropertyListT props = {}) {
12475  return slm_gather<T, N>(
12476  byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), mask, props);
12477 }
12478 
/// Local-accessor gather (no VS parameter), without mask or pass-through.
///
/// Adds the value returned by localAccessorToOffset(acc) to every element of
/// \p byte_offsets and forwards to slm_gather<T, N>(offsets, props).
/// Enabled only when \p AccessorT is a local accessor with can_read and
/// \p PropertyListT is a property list.
///
/// @param acc local accessor.
/// @param byte_offsets N offsets.
/// @param props property list, passed through unchanged.
/// @return the simd<T, N> produced by slm_gather.
12486 template <
12487  typename T, int N, typename AccessorT,
12488  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
12489 __ESIMD_API std::enable_if_t<
12490  (detail::is_local_accessor_with_v<AccessorT,
12491  detail::accessor_mode_cap::can_read> &&
12492  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
12493  simd<T, N>>
12494 gather(AccessorT acc, simd<uint32_t, N> byte_offsets,
12495  PropertyListT props = {}) {
12496  return slm_gather<T, N>(
12497  byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), props);
12498 }
12499 
12502 // typename PropertyListT = empty_properties_t>
/// Local-accessor gather taking the offsets as a simd_view, with mask and
/// pass-through vector. T and N come first in the template parameter list and
/// VS defaults to 1.
///
/// Reads \p byte_offsets into a vector and forwards to
/// gather<T, N, VS>(acc, byte_offsets.read(), mask, pass_thru, props).
/// Enabled only when \p AccessorT is a local accessor with can_read,
/// \p OffsetSimdViewT satisfies is_simd_view_type_v and \p PropertyListT is a
/// property list.
///
/// @param acc local accessor, passed through unchanged.
/// @param byte_offsets view supplying the offsets.
/// @param mask N / VS mask elements, passed through unchanged.
/// @param pass_thru N-element vector, passed through unchanged.
/// @param props property list, passed through unchanged.
/// @return the simd<T, N> produced by the forwarded call.
12508 template <
12509  typename T, int N, int VS = 1, typename AccessorT, typename OffsetSimdViewT,
12510  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
12511 __ESIMD_API std::enable_if_t<
12512  (detail::is_local_accessor_with_v<AccessorT,
12513  detail::accessor_mode_cap::can_read> &&
12514  detail::is_simd_view_type_v<OffsetSimdViewT> &&
12515  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
12516  simd<T, N>>
12517 gather(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
12518  simd<T, N> pass_thru, PropertyListT props = {}) {
12519  return gather<T, N, VS>(acc, byte_offsets.read(), mask, pass_thru, props);
12520 }
12521 
12523 // typename PropertyListT = empty_properties_t>
/// Local-accessor gather taking the offsets as a simd_view, with mask and
/// pass-through vector. VS is the first template parameter and has no
/// default; T and N follow it.
///
/// Reads \p byte_offsets into a vector and forwards to
/// gather<T, N, VS>(acc, byte_offsets.read(), mask, pass_thru, props).
/// Enabled only when \p AccessorT is a local accessor with can_read,
/// \p OffsetSimdViewT satisfies is_simd_view_type_v and \p PropertyListT is a
/// property list.
///
/// @param acc local accessor, passed through unchanged.
/// @param byte_offsets view supplying the offsets; must cover N / VS elements.
/// @param mask N / VS mask elements, passed through unchanged.
/// @param pass_thru N-element vector, passed through unchanged.
/// @param props property list, passed through unchanged.
/// @return the simd<T, N> produced by the forwarded call.
12529 template <
12530  int VS, typename T, int N, typename AccessorT, typename OffsetSimdViewT,
12531  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
12532 __ESIMD_API std::enable_if_t<
12533  (detail::is_local_accessor_with_v<AccessorT,
12534  detail::accessor_mode_cap::can_read> &&
12535  detail::is_simd_view_type_v<OffsetSimdViewT> &&
12536  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
12537  simd<T, N>>
12538 gather(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
12539  simd<T, N> pass_thru, PropertyListT props = {}) {
// The offsets view must cover exactly N / VS elements (SizeX * SizeY).
12540  static_assert(N / VS ==
12541  OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
12542  "Size of pass_thru parameter must correspond to the size of "
12543  "byte_offsets parameter.");
12544  return gather<T, N, VS>(acc, byte_offsets.read(), mask, pass_thru, props);
12545 }
12546 
/// Local-accessor gather taking both the offsets and the pass-through values
/// as simd_view objects.
///
/// Reads both views and forwards to
/// gather<T, N, VS>(acc, byte_offsets.read(), mask, pass_thru.read(), props).
/// \p N defaults to the pass-through view's element count and \p T to the
/// element type of its underlying vector type.
/// Enabled only when \p AccessorT is a local accessor with can_read, both
/// view types satisfy is_simd_view_type_v and \p PropertyListT is a property
/// list.
///
/// @param acc local accessor, passed through unchanged.
/// @param byte_offsets view supplying the offsets; must cover N / VS elements.
/// @param mask N / VS mask elements, passed through unchanged.
/// @param pass_thru view supplying the pass-through values.
/// @param props property list, passed through unchanged.
/// @return the simd<T, N> produced by the forwarded call.
12557 template <
12558  int VS = 1, typename AccessorT, typename OffsetSimdViewT,
12559  typename PassThruSimdViewT,
12560  int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(),
// 'typename' is required on the dependent type name before C++20.
12561  typename T = typename PassThruSimdViewT::value_type::element_type,
12562  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
12563 __ESIMD_API std::enable_if_t<
12564  (detail::is_local_accessor_with_v<AccessorT,
12565  detail::accessor_mode_cap::can_read> &&
12566  detail::is_simd_view_type_v<OffsetSimdViewT> &&
12567  detail::is_simd_view_type_v<PassThruSimdViewT> &&
12568  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
12569  simd<T, N>>
12570 gather(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
12571  PassThruSimdViewT pass_thru, PropertyListT props = {}) {
12572  static_assert(N / VS ==
12573  OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
12574  "Size of pass_thru parameter must correspond to the size of "
12575  "byte_offsets parameter.");
12576  return gather<T, N, VS>(acc, byte_offsets.read(), mask, pass_thru.read(),
12577  props);
12578 }
12579 
/// Local-accessor gather taking the pass-through values as a simd_view and
/// the offsets as a vector.
///
/// Reads \p pass_thru and forwards to
/// gather<T, N, VS>(acc, byte_offsets, mask, pass_thru.read(), props).
/// \p N defaults to the pass-through view's element count and \p T to the
/// element type of its underlying vector type.
/// Enabled only when \p AccessorT is a local accessor with can_read,
/// \p PassThruSimdViewT satisfies is_simd_view_type_v and \p PropertyListT is
/// a property list.
///
/// @param acc local accessor, passed through unchanged.
/// @param byte_offsets N / VS offsets, passed through unchanged.
/// @param mask N / VS mask elements, passed through unchanged.
/// @param pass_thru view supplying the pass-through values.
/// @param props property list, passed through unchanged.
/// @return the simd<T, N> produced by the forwarded call.
12590 template <
12591  int VS = 1, typename AccessorT, typename PassThruSimdViewT,
12592  int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(),
// 'typename' is required on the dependent type name before C++20.
12593  typename T = typename PassThruSimdViewT::value_type::element_type,
12594  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
12595 __ESIMD_API std::enable_if_t<
12596  (detail::is_local_accessor_with_v<AccessorT,
12597  detail::accessor_mode_cap::can_read> &&
12598  detail::is_simd_view_type_v<PassThruSimdViewT> &&
12599  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
12600  simd<T, N>>
12601 gather(AccessorT acc, simd<uint32_t, N / VS> byte_offsets,
12602  simd_mask<N / VS> mask, PassThruSimdViewT pass_thru,
12603  PropertyListT props = {}) {
12604  return gather<T, N, VS>(acc, byte_offsets, mask, pass_thru.read(), props);
12605 }
12606 
12609 // typename PropertyListT = empty_properties_t>
/// Local-accessor gather taking the offsets as a simd_view, with mask and no
/// pass-through.
///
/// Reads \p byte_offsets into a vector and forwards to
/// gather<T, N, VS>(acc, byte_offsets.read(), mask, props).
/// Enabled only when \p AccessorT is a local accessor with can_read,
/// \p OffsetSimdViewT satisfies is_simd_view_type_v and \p PropertyListT is a
/// property list.
///
/// @param acc local accessor, passed through unchanged.
/// @param byte_offsets view supplying the offsets.
/// @param mask N / VS mask elements, passed through unchanged.
/// @param props property list, passed through unchanged.
/// @return the simd<T, N> produced by the forwarded call.
12615 template <
12616  typename T, int N, int VS = 1, typename AccessorT, typename OffsetSimdViewT,
12617  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
12618 __ESIMD_API std::enable_if_t<
12619  (detail::is_local_accessor_with_v<AccessorT,
12620  detail::accessor_mode_cap::can_read> &&
12621  detail::is_simd_view_type_v<OffsetSimdViewT> &&
12622  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
12623  simd<T, N>>
12624 gather(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
12625  PropertyListT props = {}) {
12626  return gather<T, N, VS>(acc, byte_offsets.read(), mask, props);
12627 }
12628 
12631 // typename PropertyListT = empty_properties_t>
/// Local-accessor gather taking the offsets as a simd_view, without mask or
/// pass-through.
///
/// Reads \p byte_offsets into a vector and forwards to
/// gather<T, N, VS>(acc, byte_offsets.read(), props).
/// Enabled only when \p AccessorT is a local accessor with can_read,
/// \p OffsetSimdViewT satisfies is_simd_view_type_v and \p PropertyListT is a
/// property list.
///
/// @param acc local accessor, passed through unchanged.
/// @param byte_offsets view supplying the offsets.
/// @param props property list, passed through unchanged.
/// @return the simd<T, N> produced by the forwarded call.
12636 template <
12637  typename T, int N, int VS = 1, typename AccessorT, typename OffsetSimdViewT,
12638  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
12639 __ESIMD_API std::enable_if_t<
12640  (detail::is_local_accessor_with_v<AccessorT,
12641  detail::accessor_mode_cap::can_read> &&
12642  detail::is_simd_view_type_v<OffsetSimdViewT> &&
12643  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
12644  simd<T, N>>
12645 gather(AccessorT acc, OffsetSimdViewT byte_offsets, PropertyListT props = {}) {
12646  return gather<T, N, VS>(acc, byte_offsets.read(), props);
12647 }
12648 
/// Local-accessor gather taking per-element offsets plus a scalar
/// \p glob_offset.
///
/// Computes offsets + glob_offset + localAccessorToOffset(acc) and forwards
/// to slm_gather<T, N>(sum, mask).
/// Enabled only when \p AccessorTy is a local accessor with can_read.
///
/// @param acc local accessor.
/// @param offsets N offsets.
/// @param glob_offset scalar added to every element of \p offsets.
/// @param mask N-element mask; defaults to a mask initialized from 1.
/// @return the simd<T, N> produced by slm_gather.
12666 template <typename T, int N, typename AccessorTy>
12667 __ESIMD_API
12668  std::enable_if_t<detail::is_local_accessor_with_v<
12669  AccessorTy, detail::accessor_mode_cap::can_read>,
12670  simd<T, N>>
12671  gather(AccessorTy acc, simd<uint32_t, N> offsets, uint32_t glob_offset,
12672  simd_mask<N> mask = 1) {
12673  return slm_gather<T, N>(
12674  offsets + glob_offset + __ESIMD_DNS::localAccessorToOffset(acc), mask);
12675 }
12676 
12685 
12692 
12695 
12704 
12712 
/// Local-accessor scatter with mask.
///
/// Adds the value returned by localAccessorToOffset(acc) to every element of
/// \p byte_offsets and forwards to
/// slm_scatter<T, N, VS>(offsets, vals, mask, props).
/// Enabled only when \p AccessorT is a local accessor with can_write and
/// \p PropertyListT is a property list.
///
/// @param acc local accessor.
/// @param byte_offsets N / VS offsets.
/// @param vals N values, passed through unchanged.
/// @param mask N / VS mask elements, passed through unchanged.
/// @param props property list, passed through unchanged.
12739 template <
12740  typename T, int N, int VS = 1, typename AccessorT,
12741  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
12742 __ESIMD_API std::enable_if_t<
12743  detail::is_local_accessor_with_v<AccessorT,
12744  detail::accessor_mode_cap::can_write> &&
12745  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
12746 scatter(AccessorT acc, simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
12747  simd_mask<N / VS> mask, PropertyListT props = {}) {
12748  slm_scatter<T, N, VS>(byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc),
12749  vals, mask, props);
12750 }
12751 
/// Local-accessor scatter without mask.
///
/// Builds a simd_mask<N / VS> initialized from 1 and forwards to
/// scatter<T, N, VS>(acc, byte_offsets, vals, Mask, props).
/// Enabled only when \p AccessorT is a local accessor with can_write and
/// \p PropertyListT is a property list.
///
/// @param acc local accessor, passed through unchanged.
/// @param byte_offsets N / VS offsets, passed through unchanged.
/// @param vals N values, passed through unchanged.
/// @param props property list, passed through unchanged.
12774 template <
12775  typename T, int N, int VS = 1, typename AccessorT,
12776  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
12777 __ESIMD_API std::enable_if_t<
12778  detail::is_local_accessor_with_v<AccessorT,
12779  detail::accessor_mode_cap::can_write> &&
12780  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
12781 scatter(AccessorT acc, simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
12782  PropertyListT props = {}) {
12783  simd_mask<N / VS> Mask = 1;
12784  scatter<T, N, VS>(acc, byte_offsets, vals, Mask, props);
12785 }
12786 
/// Local-accessor scatter taking the offsets as a simd_view, with mask.
/// T and N come first in the template parameter list and VS defaults to 1.
///
/// Reads \p byte_offsets into a vector and forwards to
/// scatter<T, N, VS>(acc, byte_offsets.read(), vals, mask, props).
/// Enabled only when \p AccessorT is a local accessor with can_write,
/// \p OffsetSimdViewT satisfies is_simd_view_type_v and \p PropertyListT is a
/// property list.
///
/// @param acc local accessor, passed through unchanged.
/// @param byte_offsets view supplying the offsets.
/// @param vals N values, passed through unchanged.
/// @param mask N / VS mask elements, passed through unchanged.
/// @param props property list, passed through unchanged.
12814 template <
12815  typename T, int N, int VS = 1, typename OffsetSimdViewT, typename AccessorT,
12816  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
12817 __ESIMD_API std::enable_if_t<
12818  detail::is_local_accessor_with_v<AccessorT,
12819  detail::accessor_mode_cap::can_write> &&
12820  detail::is_simd_view_type_v<OffsetSimdViewT> &&
12821  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
12822 scatter(AccessorT acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
12823  simd_mask<N / VS> mask, PropertyListT props = {}) {
12824  scatter<T, N, VS>(acc, byte_offsets.read(), vals, mask, props);
12825 }
12826 
/// Local-accessor scatter taking the offsets as a simd_view, with mask.
/// VS is the first template parameter and has no default.
///
/// Reads \p byte_offsets into a vector and forwards to
/// scatter<T, N, VS>(acc, byte_offsets.read(), vals, mask, props).
/// Enabled only when \p AccessorTy is a local accessor with can_write,
/// \p OffsetSimdViewT satisfies is_simd_view_type_v and \p PropertyListT is a
/// property list.
///
/// @param acc local accessor, passed through unchanged.
/// @param byte_offsets view supplying the offsets; must cover N / VS elements.
/// @param vals N values, passed through unchanged.
/// @param mask N / VS mask elements, passed through unchanged.
/// @param props property list, passed through unchanged.
12852 template <
12853  int VS, typename AccessorTy, typename T, int N, typename OffsetSimdViewT,
12854  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
12855 __ESIMD_API std::enable_if_t<
12856  detail::is_local_accessor_with_v<AccessorTy,
12857  detail::accessor_mode_cap::can_write> &&
12858  detail::is_simd_view_type_v<OffsetSimdViewT> &&
12859  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
12860 scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
12861  simd_mask<N / VS> mask, PropertyListT props = {}) {
// The offsets view must cover exactly N / VS elements (SizeX * SizeY).
12862  static_assert(N / VS ==
12863  OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
12864  "Size of vals parameter must correspond to the size of "
12865  "byte_offsets parameter.");
12866  scatter<T, N, VS>(acc, byte_offsets.read(), vals, mask, props);
12867 }
12868 
/// Local-accessor scatter taking the offsets as a simd_view, without mask.
/// VS is the first template parameter and has no default.
///
/// Reads \p byte_offsets into a vector and forwards to
/// scatter<T, N, VS>(acc, byte_offsets.read(), vals, props).
/// Enabled only when \p AccessorTy is a local accessor with can_write,
/// \p OffsetSimdViewT satisfies is_simd_view_type_v and \p PropertyListT is a
/// property list.
///
/// @param acc local accessor, passed through unchanged.
/// @param byte_offsets view supplying the offsets; must cover N / VS elements.
/// @param vals N values, passed through unchanged.
/// @param props property list, passed through unchanged.
12889 template <
12890  int VS, typename AccessorTy, typename T, int N, typename OffsetSimdViewT,
12891  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
12892 __ESIMD_API std::enable_if_t<
12893  detail::is_local_accessor_with_v<AccessorTy,
12894  detail::accessor_mode_cap::can_write> &&
12895  detail::is_simd_view_type_v<OffsetSimdViewT> &&
12896  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
12897 scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
12898  PropertyListT props = {}) {
// The offsets view must cover exactly N / VS elements (SizeX * SizeY).
12899  static_assert(N / VS ==
12900  OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
12901  "Size of vals parameter must correspond to the size of "
12902  "byte_offsets parameter.");
12903  scatter<T, N, VS>(acc, byte_offsets.read(), vals, props);
12904 }
12905 
/// Local-accessor scatter taking both the offsets and the values as simd_view
/// objects, with mask.
///
/// Reads both views and forwards to
/// scatter<T, N, VS>(acc, byte_offsets.read(), vals.read(), mask, props).
/// \p N defaults to the values view's element count and \p T to the element
/// type of its underlying vector type.
/// Enabled only when \p AccessorTy is a local accessor with can_write, both
/// view types satisfy is_simd_view_type_v and \p PropertyListT is a property
/// list.
///
/// @param acc local accessor, passed through unchanged.
/// @param byte_offsets view supplying the offsets; must cover N / VS elements.
/// @param vals view supplying the values to store.
/// @param mask N / VS mask elements, passed through unchanged.
/// @param props property list, passed through unchanged.
12934 template <
12935  int VS = 1, typename AccessorTy, typename ValuesSimdViewT,
12936  typename OffsetSimdViewT,
12937  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
// 'typename' is required on the dependent type name before C++20.
12938  typename T = typename ValuesSimdViewT::value_type::element_type,
12939  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
12940 __ESIMD_API std::enable_if_t<
12941  detail::is_local_accessor_with_v<AccessorTy,
12942  detail::accessor_mode_cap::can_write> &&
12943  detail::is_simd_view_type_v<OffsetSimdViewT> &&
12944  detail::is_simd_view_type_v<ValuesSimdViewT> &&
12945  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
12946 scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, ValuesSimdViewT vals,
12947  simd_mask<N / VS> mask, PropertyListT props = {}) {
12948  static_assert(N / VS ==
12949  OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
12950  "Size of vals parameter must correspond to the size of "
12951  "byte_offsets parameter.");
12952  scatter<T, N, VS>(acc, byte_offsets.read(), vals.read(), mask, props);
12953 }
12954 
/// Local-accessor scatter taking both the offsets and the values as simd_view
/// objects, without mask.
///
/// Reads both views and forwards to
/// scatter<T, N, VS>(acc, byte_offsets.read(), vals.read(), props).
/// \p N defaults to the values view's element count and \p T to the element
/// type of its underlying vector type.
/// Enabled only when \p AccessorTy is a local accessor with can_write, both
/// view types satisfy is_simd_view_type_v and \p PropertyListT is a property
/// list.
///
/// @param acc local accessor, passed through unchanged.
/// @param byte_offsets view supplying the offsets; must cover N / VS elements.
/// @param vals view supplying the values to store.
/// @param props property list, passed through unchanged.
12978 template <
12979  int VS = 1, typename AccessorTy, typename ValuesSimdViewT,
12980  typename OffsetSimdViewT,
12981  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
// 'typename' is required on the dependent type name before C++20.
12982  typename T = typename ValuesSimdViewT::value_type::element_type,
12983  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
12984 __ESIMD_API std::enable_if_t<
12985  detail::is_local_accessor_with_v<AccessorTy,
12986  detail::accessor_mode_cap::can_write> &&
12987  detail::is_simd_view_type_v<OffsetSimdViewT> &&
12988  detail::is_simd_view_type_v<ValuesSimdViewT> &&
12989  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
12990 scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, ValuesSimdViewT vals,
12991  PropertyListT props = {}) {
12992  static_assert(N / VS ==
12993  OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
12994  "Size of vals parameter must correspond to the size of "
12995  "byte_offsets parameter.");
12996  scatter<T, N, VS>(acc, byte_offsets.read(), vals.read(), props);
12997 }
12998 
/// Local-accessor scatter taking the values as a simd_view and the offsets as
/// a vector, with mask.
///
/// Reads \p vals and forwards to
/// scatter<T, N, VS>(acc, byte_offsets, vals.read(), mask, props).
/// \p N defaults to the values view's element count and \p T to the element
/// type of its underlying vector type.
/// Enabled only when \p AccessorTy is a local accessor with can_write,
/// \p ValuesSimdViewT satisfies is_simd_view_type_v and \p PropertyListT is a
/// property list.
///
/// @param acc local accessor, passed through unchanged.
/// @param byte_offsets N / VS offsets of element type \p OffsetT.
/// @param vals view supplying the values to store.
/// @param mask N / VS mask elements, passed through unchanged.
/// @param props property list, passed through unchanged.
13027 template <
13028  int VS = 1, typename AccessorTy, typename ValuesSimdViewT, typename OffsetT,
13029  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
// 'typename' is required on the dependent type name before C++20.
13030  typename T = typename ValuesSimdViewT::value_type::element_type,
13031  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
13032 __ESIMD_API std::enable_if_t<
13033  detail::is_local_accessor_with_v<AccessorTy,
13034  detail::accessor_mode_cap::can_write> &&
13035  detail::is_simd_view_type_v<ValuesSimdViewT> &&
13036  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
13037 scatter(AccessorTy acc, simd<OffsetT, N / VS> byte_offsets,
13038  ValuesSimdViewT vals, simd_mask<N / VS> mask,
13039  PropertyListT props = {}) {
13040  scatter<T, N, VS>(acc, byte_offsets, vals.read(), mask, props);
13041 }
13042 
/// Local-accessor scatter taking the values as a simd_view and the offsets as
/// a vector, without mask.
///
/// Reads \p vals and forwards to
/// scatter<T, N, VS>(acc, byte_offsets, vals.read(), props).
/// \p N defaults to the values view's element count and \p T to the element
/// type of its underlying vector type.
/// Enabled only when \p AccessorTy is a local accessor with can_write,
/// \p ValuesSimdViewT satisfies is_simd_view_type_v and \p PropertyListT is a
/// property list.
///
/// @param acc local accessor, passed through unchanged.
/// @param byte_offsets N / VS offsets of element type \p OffsetT.
/// @param vals view supplying the values to store.
/// @param props property list, passed through unchanged.
13066 template <
13067  int VS = 1, typename AccessorTy, typename ValuesSimdViewT, typename OffsetT,
13068  int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
// 'typename' is required on the dependent type name before C++20.
13069  typename T = typename ValuesSimdViewT::value_type::element_type,
13070  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
13071 __ESIMD_API std::enable_if_t<
13072  detail::is_local_accessor_with_v<AccessorTy,
13073  detail::accessor_mode_cap::can_write> &&
13074  detail::is_simd_view_type_v<ValuesSimdViewT> &&
13075  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
13076 scatter(AccessorTy acc, simd<OffsetT, N / VS> byte_offsets,
13077  ValuesSimdViewT vals, PropertyListT props = {}) {
13078  scatter<T, N, VS>(acc, byte_offsets, vals.read(), props);
13079 }
13080 
/// Local-accessor scatter taking the offsets as a simd_view, without mask.
/// T and N come first in the template parameter list and VS defaults to 1.
///
/// Builds a simd_mask<N / VS> initialized from 1, reads \p byte_offsets and
/// forwards to scatter<T, N, VS>(acc, byte_offsets.read(), vals, Mask, props).
/// Enabled only when \p AccessorT is a local accessor with can_write,
/// \p OffsetSimdViewT satisfies is_simd_view_type_v and \p PropertyListT is a
/// property list.
///
/// @param acc local accessor, passed through unchanged.
/// @param byte_offsets view supplying the offsets.
/// @param vals N values, passed through unchanged.
/// @param props property list, passed through unchanged.
13105 template <
13106  typename T, int N, int VS = 1, typename OffsetSimdViewT, typename AccessorT,
13107  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
13108 __ESIMD_API std::enable_if_t<
13109  detail::is_local_accessor_with_v<AccessorT,
13110  detail::accessor_mode_cap::can_write> &&
13111  detail::is_simd_view_type_v<OffsetSimdViewT> &&
13112  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
13113 scatter(AccessorT acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
13114  PropertyListT props = {}) {
13115  simd_mask<N / VS> Mask = 1;
13116  scatter<T, N, VS>(acc, byte_offsets.read(), vals, Mask, props);
13117 }
13118 
13137 template <typename T, int N, typename AccessorTy>
13138 __ESIMD_API std::enable_if_t<detail::is_local_accessor_with_v<
13139  AccessorTy, detail::accessor_mode_cap::can_write>>
13140 scatter(AccessorTy acc, simd<uint32_t, N> offsets, simd<T, N> vals,
13141  uint32_t glob_offset, simd_mask<N> mask = 1) {
13142  slm_scatter<T, N>(offsets + glob_offset +
13143  __ESIMD_DNS::localAccessorToOffset(acc),
13144  vals, mask);
13145 }
13146 
13188 
/// prefetch overload taking a pointer, a simd of N / VS byte offsets and a
/// mask of N / VS lanes.
/// Enabled only when \c PropertyListT is a property list.
/// \p props is not read in the body; only its type \c PropertyListT is passed
/// on, as a template argument of the call below.
13210 template <
13211  typename T, int N, int VS, typename OffsetT,
13212  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
13213 __ESIMD_API std::enable_if_t<
13214  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
13215 prefetch(const T *p, simd<OffsetT, N / VS> byte_offsets, simd_mask<N / VS> mask,
13216  PropertyListT props = {}) {
// Compile-time check: N must be a positive multiple of VS.
13217  static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS");
// NOTE(review): listing line 13218 (the callee name and its leading template
// arguments) is absent from this listing; only the tail of the call is visible.
13219  PropertyListT>(p, byte_offsets, mask);
13220 }
13221 
13238 template <
13239  typename T, int N, int VS, typename OffsetT,
13240  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
13241 __ESIMD_API std::enable_if_t<
13242  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
13243 prefetch(const T *p, simd<OffsetT, N / VS> byte_offsets,
13244  PropertyListT props = {}) {
13245  simd_mask<N / VS> Mask = 1;
13246  prefetch<T, N, VS>(p, byte_offsets, Mask, props);
13247 }
13248 
13268 template <
13269  typename T, int N, typename OffsetT,
13270  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
13271 __ESIMD_API std::enable_if_t<
13272  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
13273 prefetch(const T *p, simd<OffsetT, N> byte_offsets, simd_mask<N> mask,
13274  PropertyListT props = {}) {
13275  constexpr int VS = 1;
13276  prefetch<T, N, VS>(p, byte_offsets, mask, props);
13277 }
13278 
13293 template <
13294  typename T, int N, typename OffsetT,
13295  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
13296 __ESIMD_API std::enable_if_t<
13297  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
13298 prefetch(const T *p, simd<OffsetT, N> byte_offsets, PropertyListT props = {}) {
13299  constexpr int VS = 1;
13300  prefetch<T, N, VS>(p, byte_offsets, props);
13301 }
13302 
13323 template <
13324  typename T, int N, int VS = 1, typename OffsetSimdViewT,
13325  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
13326 __ESIMD_API std::enable_if_t<
13327  detail::is_simd_view_type_v<OffsetSimdViewT> &&
13328  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
13329 prefetch(const T *p, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
13330  PropertyListT props = {}) {
13331  prefetch<T, N, VS>(p, byte_offsets.read(), mask, props);
13332 }
13333 
13351 template <
13352  typename T, int N, int VS = 1, typename OffsetSimdViewT,
13353  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
13354 __ESIMD_API std::enable_if_t<
13355  detail::is_simd_view_type_v<OffsetSimdViewT> &&
13356  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
13357 prefetch(const T *p, OffsetSimdViewT byte_offsets, PropertyListT props = {}) {
13358  prefetch<T, N, VS>(p, byte_offsets.read(), props);
13359 }
13360 
13381 template <
13382  int VS = 1, typename OffsetSimdViewT, typename T,
13383  int N = OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY() * VS,
13384  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
13385 __ESIMD_API std::enable_if_t<
13386  detail::is_simd_view_type_v<OffsetSimdViewT> &&
13387  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
13388 prefetch(const T *p, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
13389  PropertyListT props = {}) {
13390  prefetch<T, N, VS>(p, byte_offsets.read(), mask, props);
13391 }
13392 
13410 template <
13411  int VS = 1, typename OffsetSimdViewT, typename T,
13412  int N = OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY() * VS,
13413  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
13414 __ESIMD_API std::enable_if_t<
13415  detail::is_simd_view_type_v<OffsetSimdViewT> &&
13416  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
13417 prefetch(const T *p, OffsetSimdViewT byte_offsets, PropertyListT props = {}) {
13418  prefetch<T, N, VS>(p, byte_offsets.read(), props);
13419 }
13420 
13438 
/// prefetch overload taking a pointer, a single integral byte offset and a
/// one-lane mask.
/// Enabled only when \c OffsetT is an integral type and \c PropertyListT is a
/// property list.
/// \p props is not read in the body; only its type \c PropertyListT is passed
/// on, as a template argument of the call below.
13447 template <
13448  typename T, int VS = 1, typename OffsetT,
13449  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
13450 __ESIMD_API std::enable_if_t<
13451  std::is_integral_v<OffsetT> &&
13452  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
13453 prefetch(const T *p, OffsetT byte_offset, simd_mask<1> mask,
13454  PropertyListT props = {}) {
// NOTE(review): listing line 13455 (the callee name and its leading template
// arguments) is absent from this listing; only the tail of the call is visible.
13456  PropertyListT>(p, byte_offset, mask);
13457 }
13458 
13474 template <
13475  typename T, int VS = 1, typename OffsetT,
13476  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
13477 __ESIMD_API std::enable_if_t<
13478  std::is_integral_v<OffsetT> &&
13479  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
13480 prefetch(const T *p, OffsetT byte_offset, PropertyListT props = {}) {
13481  simd_mask<1> Mask = 1;
13482  prefetch<T, VS>(p, byte_offset, Mask, props);
13483 }
13484 
13501 template <
13502  typename T, int VS = 1,
13503  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
13504 __ESIMD_API std::enable_if_t<
13505  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
13506 prefetch(const T *p, simd_mask<1> mask, PropertyListT props = {}) {
13507  prefetch<T, VS>(p, 0, mask, props);
13508 }
13509 
13522 template <
13523  typename T, int VS = 1,
13524  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
13525 __ESIMD_API std::enable_if_t<
13526  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
13527 prefetch(const T *p, PropertyListT props = {}) {
13528  simd_mask<1> Mask = 1;
13529  prefetch<T, VS>(p, 0, Mask, props);
13530 }
13531 
13573 
/// prefetch overload taking an accessor, a simd of N / VS byte offsets and a
/// mask of N / VS lanes.
/// Enabled only when \c AccessorT is a device accessor with read capability
/// and \c PropertyListT is a property list.
13596 template <
13597  typename T, int N, int VS, typename AccessorT, typename OffsetT,
13598  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
13599 __ESIMD_API std::enable_if_t<
13600  detail::is_device_accessor_with_v<AccessorT,
13601  detail::accessor_mode_cap::can_read> &&
13602  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
13603 prefetch(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
13604  simd_mask<N / VS> mask, PropertyListT props = {}) {
// With __ESIMD_FORCE_STATELESS_MEM defined, the accessor is converted with
// accessorToPointer<T>() and the pointer-based prefetch<T, N, VS> is called.
13605 #ifdef __ESIMD_FORCE_STATELESS_MEM
13606  prefetch<T, N, VS>(detail::accessorToPointer<T>(acc), byte_offsets, mask,
13607  props);
13608 #else
// Compile-time check: N must be a positive multiple of VS.
13609  static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS");
// NOTE(review): listing line 13610 (the callee name and its leading template
// arguments) is absent from this listing; only the tail of the call is visible.
// In this branch `props` is not read; only PropertyListT is passed on.
13611  PropertyListT>(acc, byte_offsets, mask);
13612 #endif // __ESIMD_FORCE_STATELESS_MEM
13613 }
13614 
13632 template <
13633  typename T, int N, int VS, typename AccessorT, typename OffsetT,
13634  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
13635 __ESIMD_API std::enable_if_t<
13636  detail::is_device_accessor_with_v<AccessorT,
13637  detail::accessor_mode_cap::can_read> &&
13638  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
13639 prefetch(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
13640  PropertyListT props = {}) {
13641  simd_mask<N / VS> Mask = 1;
13642  prefetch<T, N, VS>(acc, byte_offsets, Mask, props);
13643 }
13644 
13665 template <
13666  typename T, int N, typename AccessorT, typename OffsetT,
13667  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
13668 __ESIMD_API std::enable_if_t<
13669  detail::is_device_accessor_with_v<AccessorT,
13670  detail::accessor_mode_cap::can_read> &&
13671  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
13672 prefetch(AccessorT acc, simd<OffsetT, N> byte_offsets, simd_mask<N> mask,
13673  PropertyListT props = {}) {
13674  constexpr int VS = 1;
13675  prefetch<T, N, VS>(acc, byte_offsets, mask, props);
13676 }
13677 
13693 template <
13694  typename T, int N, typename AccessorT, typename OffsetT,
13695  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
13696 __ESIMD_API std::enable_if_t<
13697  detail::is_device_accessor_with_v<AccessorT,
13698  detail::accessor_mode_cap::can_read> &&
13699  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
13700 prefetch(AccessorT acc, simd<OffsetT, N> byte_offsets,
13701  PropertyListT props = {}) {
13702  constexpr int VS = 1;
13703  prefetch<T, N, VS>(acc, byte_offsets, props);
13704 }
13705 
13727 template <
13728  typename T, int N, int VS = 1, typename OffsetSimdViewT, typename AccessorT,
13729  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
13730 __ESIMD_API std::enable_if_t<
13731  detail::is_device_accessor_with_v<AccessorT,
13732  detail::accessor_mode_cap::can_read> &&
13733  detail::is_simd_view_type_v<OffsetSimdViewT> &&
13734  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
13735 prefetch(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
13736  PropertyListT props = {}) {
13737  prefetch<T, N, VS>(acc, byte_offsets.read(), mask, props);
13738 }
13739 
13758 template <
13759  typename T, int N, int VS = 1, typename OffsetSimdViewT, typename AccessorT,
13760  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
13761 __ESIMD_API std::enable_if_t<
13762  detail::is_device_accessor_with_v<AccessorT,
13763  detail::accessor_mode_cap::can_read> &&
13764  detail::is_simd_view_type_v<OffsetSimdViewT> &&
13765  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
13766 prefetch(AccessorT acc, OffsetSimdViewT byte_offsets,
13767  PropertyListT props = {}) {
13768  prefetch<T, N, VS>(acc, byte_offsets.read(), props);
13769 }
13770 
/// prefetch overload taking an accessor, a single integral byte offset and a
/// one-lane mask.
/// Enabled only when \c OffsetT is an integral type, \c AccessorT is a device
/// accessor with read capability and \c PropertyListT is a property list.
13796 template <
13797  typename T, int VS = 1, typename AccessorT, typename OffsetT,
13798  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
13799 __ESIMD_API std::enable_if_t<
13800  std::is_integral_v<OffsetT> &&
13801  detail::is_device_accessor_with_v<AccessorT,
13802  detail::accessor_mode_cap::can_read> &&
13803  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
13804 prefetch(AccessorT acc, OffsetT byte_offset, simd_mask<1> mask,
13805  PropertyListT props = {}) {
// With __ESIMD_FORCE_STATELESS_MEM defined, the accessor is converted with
// accessorToPointer<T>() and the pointer-based prefetch<T, VS> is called.
13806 #ifdef __ESIMD_FORCE_STATELESS_MEM
13807  prefetch<T, VS>(detail::accessorToPointer<T>(acc), byte_offset, mask, props);
13808 #else
// NOTE(review): listing line 13809 (the callee name and its leading template
// arguments) is absent from this listing; only the tail of the call is visible.
// In this branch `props` is not read; only PropertyListT is passed on.
13810  PropertyListT>(acc, byte_offset, mask);
13811 #endif // __ESIMD_FORCE_STATELESS_MEM
13812 }
13813 
13829 template <
13830  typename T, int VS = 1, typename AccessorT, typename OffsetT,
13831  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
13832 __ESIMD_API std::enable_if_t<
13833  std::is_integral_v<OffsetT> &&
13834  detail::is_device_accessor_with_v<AccessorT,
13835  detail::accessor_mode_cap::can_read> &&
13836  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
13837 prefetch(AccessorT acc, OffsetT byte_offset, PropertyListT props = {}) {
13838  simd_mask<1> Mask = 1;
13839  prefetch<T, VS>(acc, byte_offset, Mask, props);
13840 }
13841 
13858 template <
13859  typename T, int VS = 1, typename AccessorT,
13860  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
13861 __ESIMD_API std::enable_if_t<
13862  detail::is_device_accessor_with_v<AccessorT,
13863  detail::accessor_mode_cap::can_read> &&
13864  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
13865 prefetch(AccessorT acc, simd_mask<1> mask, PropertyListT props = {}) {
13866  prefetch<T, VS>(acc, 0, mask, props);
13867 }
13868 
13881 template <
13882  typename T, int VS = 1, typename AccessorT,
13883  typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
13884 __ESIMD_API std::enable_if_t<
13885  detail::is_device_accessor_with_v<AccessorT,
13886  detail::accessor_mode_cap::can_read> &&
13887  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
13888 prefetch(AccessorT acc, PropertyListT props = {}) {
13889  simd_mask<1> Mask = 1;
13890  prefetch<T, VS>(acc, 0, Mask, props);
13891 }
13892 
/// Thin public wrapper around detail::load_2d_impl.
/// Enabled only when \c PropertyListT is a property list; returns simd<T, N>.
/// All six runtime arguments are forwarded unchanged; \p props is not read,
/// only its type \c PropertyListT is passed on as a template argument.
13933 template <typename T, int BlockWidth, int BlockHeight = 1, int NBlocks = 1,
13934  bool Transposed = false, bool Transformed = false,
// NOTE(review): listing line 13935 is absent from this listing. It presumably
// starts the declaration of the template parameter N used in the return type;
// only the trailing template arguments of its default are visible below.
13936  T, NBlocks, BlockHeight, BlockWidth, Transposed, Transformed>(),
13937  typename PropertyListT = oneapi::experimental::empty_properties_t>
13938 __ESIMD_API std::enable_if_t<
13939  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
13940 load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight,
13941  unsigned SurfacePitch, int X, int Y, PropertyListT props = {}) {
13942  return detail::load_2d_impl<T, BlockWidth, BlockHeight, NBlocks, Transposed,
13943  Transformed, PropertyListT>(
13944  Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
13945 }
13946 
/// Thin public wrapper around detail::prefetch_2d_impl.
/// Enabled only when \c PropertyListT is a property list; returns nothing.
/// All six runtime arguments are forwarded unchanged; \p props is not read,
/// only its type \c PropertyListT is passed on as a template argument.
13976 template <typename T, int BlockWidth, int BlockHeight = 1, int NBlocks = 1,
// NOTE(review): listing line 13977 is absent from this listing; only the
// trailing template arguments of what it started are visible below (with
// Transposed and Transformed both fixed to false).
13978  T, NBlocks, BlockHeight, BlockWidth, false /*Transposed*/,
13979  false /*Transformed*/>(),
13980  typename PropertyListT = oneapi::experimental::empty_properties_t>
13981 __ESIMD_API std::enable_if_t<
13982  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
13983 prefetch_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight,
13984  unsigned SurfacePitch, int X, int Y, PropertyListT props = {}) {
13985  detail::prefetch_2d_impl<T, BlockWidth, BlockHeight, NBlocks, PropertyListT>(
13986  Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
13987 }
13988 
/// Thin public wrapper around detail::store_2d_impl.
/// Enabled only when \c PropertyListT is a property list; returns nothing.
/// The pointer, surface dimensions, coordinates and \p Vals are forwarded
/// unchanged; \p props is not read, only its type \c PropertyListT is passed
/// on as a template argument.
14012 template <typename T, int BlockWidth, int BlockHeight = 1,
// NOTE(review): listing line 14013 is absent from this listing. It presumably
// starts the declaration of the template parameter N used by `Vals`; only the
// trailing template arguments of its default are visible below (block count
// 1u, Transposed and Transformed both false).
14014  T, 1u, BlockHeight, BlockWidth, false /*Transposed*/,
14015  false /*Transformed*/>(),
14016  typename PropertyListT = oneapi::experimental::empty_properties_t>
14017 __ESIMD_API std::enable_if_t<
14018  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
14019 store_2d(T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight,
14020  unsigned SurfacePitch, int X, int Y, simd<T, N> Vals,
14021  PropertyListT props = {}) {
14022  detail::store_2d_impl<T, BlockWidth, BlockHeight, PropertyListT>(
14023  Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y, Vals);
14024 }
14025 
14050 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
14051  typename AccessorT, int N,
14052  typename T = typename AccessorT::value_type>
14053 __ESIMD_API
14054  std::enable_if_t<detail::is_local_accessor_with_v<
14055  AccessorT, detail::accessor_mode_cap::can_read>,
14056  simd<T, N * get_num_channels_enabled(RGBAMask)>>
14057  gather_rgba(AccessorT acc, simd<uint32_t, N> offsets,
14058  uint32_t global_offset = 0, simd_mask<N> mask = 1) {
14059  return slm_gather_rgba<T, N, RGBAMask>(
14060  offsets + global_offset + __ESIMD_DNS::localAccessorToOffset(acc), mask);
14061 }
14062 
14079 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
14080  typename AccessorT, int N,
14081  typename T = typename AccessorT::value_type>
14082 __ESIMD_API std::enable_if_t<detail::is_local_accessor_with_v<
14083  AccessorT, detail::accessor_mode_cap::can_write>>
14084 scatter_rgba(AccessorT acc, simd<uint32_t, N> offsets,
14085  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
14086  uint32_t global_offset = 0, simd_mask<N> mask = 1) {
14087  detail::validate_rgba_write_channel_mask<RGBAMask>();
14088  slm_scatter_rgba<T, N, RGBAMask>(offsets + global_offset +
14089  __ESIMD_DNS::localAccessorToOffset(acc),
14090  vals, mask);
14091 }
14092 
14095 
14116 template <uint8_t exec_size, uint8_t sfid, uint8_t num_src0, uint8_t num_src1,
14117  uint8_t num_dst, raw_send_eot eot = raw_send_eot::not_eot,
14118  raw_send_sendc sendc = raw_send_sendc::not_sendc, typename T1, int n1,
14119  typename T2, int n2, typename T3, int n3>
14120 __ESIMD_API __ESIMD_NS::simd<T1, n1>
14121 raw_sends(__ESIMD_NS::simd<T1, n1> msg_dst, __ESIMD_NS::simd<T2, n2> msg_src0,
14122  __ESIMD_NS::simd<T3, n3> msg_src1, uint32_t ex_desc,
14123  uint32_t msg_desc, __ESIMD_NS::simd_mask<exec_size> mask = 1) {
14124  constexpr unsigned _Width1 = n1 * sizeof(T1);
14125  static_assert(_Width1 % 32 == 0, "Invalid size for raw send rspVar");
14126  constexpr unsigned _Width2 = n2 * sizeof(T2);
14127  static_assert(_Width2 % 32 == 0, "Invalid size for raw send msg_src0");
14128  constexpr unsigned _Width3 = n3 * sizeof(T3);
14129  static_assert(_Width3 % 32 == 0, "Invalid size for raw send msg_src1");
14130 
14131  using ElemT1 = __ESIMD_DNS::__raw_t<T1>;
14132  using ElemT2 = __ESIMD_DNS::__raw_t<T2>;
14133  using ElemT3 = __ESIMD_DNS::__raw_t<T3>;
14134 
14135  constexpr uint8_t modifier =
14136  ((eot == raw_send_eot::eot) << 1) | (sendc == raw_send_sendc::sendc);
14137 
14138  return __esimd_raw_sends2<ElemT1, n1, ElemT2, n2, ElemT3, n3, exec_size>(
14139  modifier, exec_size, mask.data(), num_src0, num_src1, num_dst, sfid,
14140  ex_desc, msg_desc, msg_src0.data(), msg_src1.data(), msg_dst.data());
14141 }
14142 
/// raw_send form with one source operand and a destination operand.
/// Returns the simd<T1, n1> produced by the __esimd_raw_send2 intrinsic.
/// The byte size of both simd operands must be a multiple of 32; this is
/// enforced at compile time.
14160 template <uint8_t exec_size, uint8_t sfid, uint8_t num_src0, uint8_t num_dst,
// NOTE(review): listing line 14161 is absent from this listing; it presumably
// declares the `eot` template parameter that the body reads below.
14162  raw_send_sendc sendc = raw_send_sendc::not_sendc, typename T1, int n1,
14163  typename T2, int n2>
14164 __ESIMD_API __ESIMD_NS::simd<T1, n1>
14165 raw_send(__ESIMD_NS::simd<T1, n1> msg_dst, __ESIMD_NS::simd<T2, n2> msg_src0,
14166  uint32_t ex_desc, uint32_t msg_desc,
14167  __ESIMD_NS::simd_mask<exec_size> mask = 1) {
// Operand sizes in bytes, each required to be a multiple of 32.
14168  constexpr unsigned _Width1 = n1 * sizeof(T1);
14169  static_assert(_Width1 % 32 == 0, "Invalid size for raw send rspVar");
14170  constexpr unsigned _Width2 = n2 * sizeof(T2);
14171  static_assert(_Width2 % 32 == 0, "Invalid size for raw send msg_src0");
14172
// Element types as seen by the intrinsic.
14173  using ElemT1 = __ESIMD_DNS::__raw_t<T1>;
14174  using ElemT2 = __ESIMD_DNS::__raw_t<T2>;
14175
// Bit 1 carries the EOT flag, bit 0 the sendc flag.
14176  constexpr uint8_t modifier =
14177  ((eot == raw_send_eot::eot) << 1) | (sendc == raw_send_sendc::sendc);
14178  return __esimd_raw_send2<ElemT1, n1, ElemT2, n2, exec_size>(
14179  modifier, exec_size, mask.data(), num_src0, num_dst, sfid, ex_desc,
14180  msg_desc, msg_src0.data(), msg_dst.data());
14181 }
14182 
/// raw_sends form with two source operands and no destination operand.
/// Calls the __esimd_raw_sends2_noresult intrinsic and returns nothing.
/// The byte size of both simd operands must be a multiple of 32; this is
/// enforced at compile time.
14200 template <uint8_t exec_size, uint8_t sfid, uint8_t num_src0, uint8_t num_src1,
// NOTE(review): listing line 14201 is absent from this listing; it presumably
// declares the `eot` template parameter that the body reads below.
14202  raw_send_sendc sendc = raw_send_sendc::not_sendc, typename T1, int n1,
14203  typename T2, int n2>
14204 __ESIMD_API void raw_sends(__ESIMD_NS::simd<T1, n1> msg_src0,
14205  __ESIMD_NS::simd<T2, n2> msg_src1, uint32_t ex_desc,
14206  uint32_t msg_desc,
14207  __ESIMD_NS::simd_mask<exec_size> mask = 1) {
// Operand sizes in bytes, each required to be a multiple of 32.
14208  constexpr unsigned _Width1 = n1 * sizeof(T1);
14209  static_assert(_Width1 % 32 == 0, "Invalid size for raw send msg_src0");
14210  constexpr unsigned _Width2 = n2 * sizeof(T2);
14211  static_assert(_Width2 % 32 == 0, "Invalid size for raw send msg_src1");
14212
// Element types as seen by the intrinsic.
14213  using ElemT1 = __ESIMD_DNS::__raw_t<T1>;
14214  using ElemT2 = __ESIMD_DNS::__raw_t<T2>;
14215
// Bit 1 carries the EOT flag, bit 0 the sendc flag.
14216  constexpr uint8_t modifier =
14217  ((eot == raw_send_eot::eot) << 1) | (sendc == raw_send_sendc::sendc);
14218  __esimd_raw_sends2_noresult<ElemT1, n1, ElemT2, n2, exec_size>(
14219  modifier, exec_size, mask.data(), num_src0, num_src1, sfid, ex_desc,
14220  msg_desc, msg_src0.data(), msg_src1.data());
14221 }
14222 
/// raw_send form with one source operand and no destination operand.
/// Calls the __esimd_raw_send2_noresult intrinsic and returns nothing.
/// The byte size of the simd operand must be a multiple of 32; this is
/// enforced at compile time.
14238 template <uint8_t exec_size, uint8_t sfid, uint8_t num_src0,
// NOTE(review): listing line 14239 is absent from this listing; it presumably
// declares the `eot` template parameter that the body reads below.
14240  raw_send_sendc sendc = raw_send_sendc::not_sendc, typename T1, int n1>
14241 __ESIMD_API void raw_send(__ESIMD_NS::simd<T1, n1> msg_src0, uint32_t ex_desc,
14242  uint32_t msg_desc,
14243  __ESIMD_NS::simd_mask<exec_size> mask = 1) {
// Operand size in bytes, required to be a multiple of 32.
14244  constexpr unsigned _Width1 = n1 * sizeof(T1);
14245  static_assert(_Width1 % 32 == 0, "Invalid size for raw send msg_src0");
// Element type as seen by the intrinsic.
14246  using ElemT1 = __ESIMD_DNS::__raw_t<T1>;
// Bit 1 carries the EOT flag, bit 0 the sendc flag.
14247  constexpr uint8_t modifier =
14248  ((eot == raw_send_eot::eot) << 1) | (sendc == raw_send_sendc::sendc);
14249  __esimd_raw_send2_noresult<ElemT1, n1, exec_size>(
14250  modifier, exec_size, mask.data(), num_src0, sfid, ex_desc, msg_desc,
14251  msg_src0.data());
14252 }
14253 
14255 
14258 
14261 
14266 __ESIMD_API void named_barrier_wait(uint8_t id) {
14267  __esimd_nbarrier(0 /*wait*/, id, 0 /*thread count*/);
14268 }
14269 
14274 template <uint8_t NbarCount> __ESIMD_API void named_barrier_init() {
14275  __esimd_nbarrier_init(NbarCount);
14276 }
14277 
/// Signals arrival at a named barrier.
/// When \c Fence is true (the default), a fence is issued first; the barrier
/// arrival is then signalled through __esimd_nbarrier_arrive with all four
/// arguments forwarded unchanged.
14292 template <bool Fence = true>
14293 __ESIMD_API void
14294 named_barrier_signal(uint8_t barrier_id, uint8_t producer_consumer_mode,
14295  uint32_t num_producers, uint32_t num_consumers) {
14296  if constexpr (Fence)
14297  __esimd_fence(fence_mask::global_coherent_fence |
// NOTE(review): listing line 14298 (the remaining fence_mask operand and the
// end of the __esimd_fence call) is absent from this listing.
14299  __esimd_nbarrier_arrive(barrier_id, producer_consumer_mode, num_producers,
14300  num_consumers);
14301 }
14302 
14304 
14306 
14308 
14309 namespace detail {
14310 // -- Outlined implementations of simd_obj_impl class memory access APIs.
14311 
/// Out-of-line definition of a simd_obj_impl member template that fills this
/// object from memory at the pointer \c Addr.
/// Enabled only when \c PropertyListT is a property list.
/// One of three load strategies is selected at compile time: block loads,
/// a 32-bit reinterpretation for 8-byte elements, or chunked gathers.
// NOTE(review): listing lines 14315-14316 (the qualified function name and the
// declaration of `Addr`) and 14318 (presumably the `UT` alias used below) are
// absent from this listing.
14312 template <typename T, int N, class T1, class SFINAE>
14313 template <int ChunkSize, typename PropertyListT>
14314 std::enable_if_t<ext::oneapi::experimental::is_property_list_v<PropertyListT>>
14317  PropertyListT) SYCL_ESIMD_FUNCTION {
// Total payload size in bytes.
14319  constexpr unsigned Size = sizeof(T) * N;
// Alignment taken from the alignment_key property; sizeof(UT) is passed as the
// second argument (presumably the value used when the property is absent —
// confirm against getPropertyValue).
14320  constexpr size_t Align =
14321  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(UT));
14322
// Split the payload into full blocks of 8 OWORDs plus a remainder.
14323  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
14324  constexpr unsigned NumBlocks = Size / BlockSize;
14325  constexpr unsigned RemSize = Size % BlockSize;
14326
// Strategy 1: block loads. Requires at least DWORD alignment, a total size
// that is a whole number of OWORDs, and a remainder whose OWORD count passes
// detail::isPowerOf2.
14327  if constexpr (Align >= OperandSize::DWORD && Size % OperandSize::OWORD == 0 &&
14328  detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
14329  if constexpr (NumBlocks > 0) {
14330  constexpr unsigned BlockN = BlockSize / sizeof(T);
// Addr is an element pointer, so blocks are addressed in elements here.
14331  ForHelper<NumBlocks>::unroll([BlockN, Addr, this](unsigned Block) {
14332  select<BlockN, 1>(Block * BlockN) =
14333  block_load<UT, BlockN>(Addr + (Block * BlockN), PropertyListT{});
14334  });
14335  }
// Load the tail that does not fill a whole block.
14336  if constexpr (RemSize > 0) {
14337  constexpr unsigned RemN = RemSize / sizeof(T);
14338  constexpr unsigned BlockN = BlockSize / sizeof(T);
14339  select<RemN, 1>(NumBlocks * BlockN) =
14340  block_load<UT, RemN>(Addr + (NumBlocks * BlockN), PropertyListT{});
14341  }
// Strategy 2: 8-byte elements are loaded as twice as many int32_t values and
// written into this object through a 32-bit view.
14342  } else if constexpr (sizeof(T) == 8) {
14343  simd<int32_t, N * 2> BC(reinterpret_cast<const int32_t *>(Addr),
14344  PropertyListT{});
14345  bit_cast_view<int32_t>() = BC;
// Strategy 3: gathers in chunks of ChunkSize elements.
14346  } else {
14347  constexpr unsigned NumChunks = N / ChunkSize;
14348  if constexpr (NumChunks > 0) {
// Presumably lane i gets offset i * sizeof(T) (base/step constructor) —
// confirm.
14349  simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
14350  ForHelper<NumChunks>::unroll([Addr, &Offsets, this](unsigned Block) {
14351  select<ChunkSize, 1>(Block * ChunkSize) = gather<UT, ChunkSize>(
14352  Addr + (Block * ChunkSize), Offsets, PropertyListT{});
14353  });
14354  }
// Elements left over after the full chunks.
14355  constexpr unsigned RemN = N % ChunkSize;
14356  if constexpr (RemN > 0) {
// A single leftover element is read directly through the pointer.
14357  if constexpr (RemN == 1) {
14358  select<1, 1>(NumChunks * ChunkSize) = Addr[NumChunks * ChunkSize];
// Exactly 8 or 16 leftover elements: one unmasked gather of that width.
14359  } else if constexpr (RemN == 8 || RemN == 16) {
14360  simd<uint32_t, RemN> Offsets(0u, sizeof(T));
14361  select<RemN, 1>(NumChunks * ChunkSize) = gather<UT, RemN>(
14362  Addr + (NumChunks * ChunkSize), Offsets, PropertyListT{});
14363  } else {
// Otherwise round the width up to 8, 16 or 32 lanes and enable only the first
// RemN lanes through the predicate.
14364  constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
14365  simd_mask_type<N1> Pred(0);
14366  Pred.template select<RemN, 1>() = 1;
14367  simd<uint32_t, N1> Offsets(0u, sizeof(T));
14368  simd<UT, N1> Vals = gather<UT, N1>(Addr + (NumChunks * ChunkSize),
14369  Offsets, Pred, PropertyListT{});
// Keep only the RemN lanes that were actually enabled.
14370  select<RemN, 1>(NumChunks * ChunkSize) =
14371  Vals.template select<RemN, 1>();
14372  }
14373  }
14374  }
14375 }
14376 
/// Flag-based pointer overload: translates the alignment described by
/// \c Flags into an alignment property and defers to copy_from<ChunkSize>.
/// Enabled only when \c Flags is a simd flag type.
// NOTE(review): listing lines 14380-14381 (the qualified function name and the
// declaration of `Addr`) are absent from this listing.
14377 template <typename T, int N, class T1, class SFINAE>
14378 template <typename Flags, int ChunkSize>
14379 std::enable_if_t<is_simd_flag_type_v<Flags>>
14382  Flags) SYCL_ESIMD_FUNCTION {
// Alignment that Flags reports for T1.
14383  constexpr unsigned Align = Flags::template alignment<T1>;
14384  copy_from<ChunkSize>(Addr, properties{alignment<Align>});
14385 }
14386 
/// Writes the contents of this object through accessor \p acc starting at
/// \p offset (a byte offset, judging by the sizeof(T) scaling below).
/// Enabled only when \c PropertyListT is a property list.
/// One of three store strategies is selected at compile time: block stores,
/// a 32-bit reinterpretation for 8-byte elements, or chunked scatters.
14387 template <typename T, int N, class T1, class SFINAE>
14388 template <int ChunkSize, typename PropertyListT, typename AccessorT,
14389  typename TOffset>
14390 ESIMD_INLINE std::enable_if_t<
14391  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
14392 simd_obj_impl<T, N, T1, SFINAE>::copy_to_impl(
14393  AccessorT acc, TOffset offset, PropertyListT) const SYCL_ESIMD_FUNCTION {
// NOTE(review): listing line 14394 (presumably the `UT` alias used below) is
// absent from this listing.
// Total payload size in bytes.
14395  constexpr unsigned Size = sizeof(T) * N;
// Alignment taken from the alignment_key property; sizeof(UT) is passed as the
// second argument (presumably the value used when the property is absent —
// confirm against getPropertyValue).
14396  constexpr size_t Align =
14397  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(UT));
14398
// Split the payload into full blocks of 8 OWORDs plus a remainder.
14399  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
14400  constexpr unsigned NumBlocks = Size / BlockSize;
14401  constexpr unsigned RemSize = Size % BlockSize;
14402
// Copy of this object's data as simd<UT, N>; all stores below read from it.
14403  simd<UT, N> Tmp{data()};
// Strategy 1: block stores. Requires at least OWORD alignment, a total size
// that is a whole number of OWORDs, and a remainder whose OWORD count passes
// detail::isPowerOf2.
14404  if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 &&
14405  detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
14406  if constexpr (NumBlocks > 0) {
14407  constexpr unsigned BlockN = BlockSize / sizeof(T);
// The accessor offset advances in bytes (BlockSize); the source selection
// advances in elements (BlockN).
14408  ForHelper<NumBlocks>::unroll([BlockN, acc, offset, &Tmp](unsigned Block) {
14409  block_store<UT, BlockN, AccessorT>(
14410  acc, offset + (Block * BlockSize),
14411  Tmp.template select<BlockN, 1>(Block * BlockN), PropertyListT{});
14412  });
14413  }
// Store the tail that does not fill a whole block.
14414  if constexpr (RemSize > 0) {
14415  constexpr unsigned RemN = RemSize / sizeof(T);
14416  constexpr unsigned BlockN = BlockSize / sizeof(T);
14417  block_store<UT, RemN, AccessorT>(
14418  acc, offset + (NumBlocks * BlockSize),
14419  Tmp.template select<RemN, 1>(NumBlocks * BlockN), PropertyListT{});
14420  }
// Strategy 2: 8-byte elements are viewed as twice as many int32_t values and
// stored through copy_to on that vector.
14421  } else if constexpr (sizeof(T) == 8) {
14422  simd<int32_t, N * 2> BC = Tmp.template bit_cast_view<int32_t>();
14423  BC.copy_to(acc, offset, PropertyListT{});
// Strategy 3: scatters in chunks of ChunkSize elements.
14424  } else {
14425  constexpr unsigned NumChunks = N / ChunkSize;
14426  if constexpr (NumChunks > 0) {
// Presumably lane i gets offset i * sizeof(T) (base/step constructor) —
// confirm.
14427  simd<TOffset, ChunkSize> Offsets(0u, sizeof(T));
14428  ForHelper<NumChunks>::unroll(
14429  [acc, offset, &Offsets, &Tmp](unsigned Block) {
14430  scatter<UT, ChunkSize>(
14431  acc, Offsets + (offset + (Block * ChunkSize * sizeof(T))),
14432  Tmp.template select<ChunkSize, 1>(Block * ChunkSize),
14433  PropertyListT{});
14434  });
14435  }
// Elements left over after the full chunks.
14436  constexpr unsigned RemN = N % ChunkSize;
14437  if constexpr (RemN > 0) {
// Exactly 1, 8 or 16 leftover elements: one unmasked scatter of that width.
14438  if constexpr (RemN == 1 || RemN == 8 || RemN == 16) {
14439  simd<TOffset, RemN> Offsets(0u, sizeof(T));
14440  scatter<UT, RemN>(
14441  acc, Offsets + (offset + (NumChunks * ChunkSize * sizeof(T))),
14442  Tmp.template select<RemN, 1>(NumChunks * ChunkSize),
14443  PropertyListT{});
14444  } else {
// Otherwise round the width up to 8, 16 or 32 lanes and enable only the first
// RemN lanes through the predicate.
14445  constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
14446  simd_mask_type<N1> Pred(0);
14447  Pred.template select<RemN, 1>() = 1;
// Only the first RemN lanes of Vals are assigned; the rest are never given a
// value here and are the lanes Pred leaves at 0.
14448  simd<UT, N1> Vals;
14449  Vals.template select<RemN, 1>() =
14450  Tmp.template select<RemN, 1>(NumChunks * ChunkSize);
14451  simd<TOffset, N1> Offsets(0u, sizeof(T));
14452  scatter<UT, N1>(
14453  acc, Offsets + (offset + (NumChunks * ChunkSize * sizeof(T))), Vals,
14454  Pred, PropertyListT{});
14455  }
14456  }
14457  }
14458 }
14459 
14460 template <typename T, int N, class T1, class SFINAE>
14461 template <int ChunkSize, typename Flags, typename AccessorT, typename TOffset>
14462 ESIMD_INLINE std::enable_if_t<is_simd_flag_type_v<Flags>>
14463 simd_obj_impl<T, N, T1, SFINAE>::copy_to_impl(
14464  AccessorT acc, TOffset offset) const SYCL_ESIMD_FUNCTION {
14465  constexpr unsigned Align = Flags::template alignment<T1>;
14466  copy_to_impl<ChunkSize>(acc, offset, properties{alignment<Align>});
14467 }
14468 
/// Fills this object by reading through accessor \p acc starting at \p offset
/// (a byte offset, judging by the sizeof(T) scaling below).
/// Enabled only when \c PropertyListT is a property list.
/// One of three load strategies is selected at compile time: block loads,
/// a 32-bit reinterpretation for 8-byte elements, or chunked gathers.
14469 template <typename T, int N, class T1, class SFINAE>
14470 template <int ChunkSize, typename PropertyListT, typename AccessorT,
14471  typename TOffset>
14472 ESIMD_INLINE std::enable_if_t<
14473  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
14474 simd_obj_impl<T, N, T1, SFINAE>::copy_from_impl(
14475  AccessorT acc, TOffset offset, PropertyListT) SYCL_ESIMD_FUNCTION {
// NOTE(review): listing line 14476 (presumably the `UT` alias used below) is
// absent from this listing.
// The element-count arithmetic below relies on UT and T having the same size.
14477  static_assert(sizeof(UT) == sizeof(T));
// Total payload size in bytes.
14478  constexpr unsigned Size = sizeof(T) * N;
// Alignment taken from the alignment_key property; sizeof(UT) is passed as the
// second argument (presumably the value used when the property is absent —
// confirm against getPropertyValue).
14479  constexpr size_t Align =
14480  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(UT));
14481
// Split the payload into full blocks of 8 OWORDs plus a remainder.
14482  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
14483  constexpr unsigned NumBlocks = Size / BlockSize;
14484  constexpr unsigned RemSize = Size % BlockSize;
14485
// Strategy 1: block loads. Requires at least DWORD alignment, a total size
// that is a whole number of OWORDs, and a remainder whose OWORD count passes
// detail::isPowerOf2.
14486  if constexpr (Align >= OperandSize::DWORD && Size % OperandSize::OWORD == 0 &&
14487  detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
14488  if constexpr (NumBlocks > 0) {
14489  constexpr unsigned BlockN = BlockSize / sizeof(T);
// The accessor offset advances in bytes (BlockSize); the destination selection
// advances in elements (BlockN).
14490  ForHelper<NumBlocks>::unroll([BlockN, acc, offset, this](unsigned Block) {
14491  select<BlockN, 1>(Block * BlockN) = block_load<UT, BlockN, AccessorT>(
14492  acc, offset + (Block * BlockSize), PropertyListT{});
14493  });
14494  }
// Load the tail that does not fill a whole block.
14495  if constexpr (RemSize > 0) {
14496  constexpr unsigned RemN = RemSize / sizeof(T);
14497  constexpr unsigned BlockN = BlockSize / sizeof(T);
14498  select<RemN, 1>(NumBlocks * BlockN) = block_load<UT, RemN, AccessorT>(
14499  acc, offset + (NumBlocks * BlockSize), PropertyListT{});
14500  }
// Strategy 2: 8-byte elements are loaded as twice as many int32_t values and
// written into this object through a 32-bit view.
14501  } else if constexpr (sizeof(T) == 8) {
14502  simd<int32_t, N * 2> BC(acc, offset, PropertyListT{});
14503  bit_cast_view<int32_t>() = BC;
// Strategy 3: gathers in chunks of ChunkSize elements.
14504  } else {
14505  constexpr unsigned NumChunks = N / ChunkSize;
14506  if constexpr (NumChunks > 0) {
// Presumably lane i gets offset i * sizeof(T) (base/step constructor) —
// confirm.
14507  simd<TOffset, ChunkSize> Offsets(0u, sizeof(T));
14508  ForHelper<NumChunks>::unroll(
14509  [acc, offset, &Offsets, this](unsigned Block) {
14510  select<ChunkSize, 1>(Block * ChunkSize) =
14511  gather<UT, ChunkSize, AccessorT>(
14512  acc, Offsets + (offset + (Block * ChunkSize * sizeof(T))),
14513  PropertyListT{});
14514  });
14515  }
// Elements left over after the full chunks.
14516  constexpr unsigned RemN = N % ChunkSize;
14517  if constexpr (RemN > 0) {
// Exactly 1, 8 or 16 leftover elements: one unmasked gather of that width.
14518  if constexpr (RemN == 1 || RemN == 8 || RemN == 16) {
14519  simd<TOffset, RemN> Offsets(0u, sizeof(T));
// NOTE(review): unlike the other gather calls in this function, this one
// passes the base offset as a separate third argument and does not pass
// PropertyListT{} — confirm that dropping the properties here is intended.
14520  select<RemN, 1>(NumChunks * ChunkSize) = gather<UT, RemN, AccessorT>(
14521  acc, Offsets, offset + (NumChunks * ChunkSize * sizeof(T)));
14522  } else {
// Otherwise round the width up to 8, 16 or 32 lanes and enable only the first
// RemN lanes through the predicate.
14523  constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
14524  simd_mask_type<N1> Pred(0);
14525  Pred.template select<RemN, 1>() = 1;
14526  simd<TOffset, N1> Offsets(0u, sizeof(T));
14527  simd<UT, N1> Vals = gather<UT, N1>(
14528  acc, Offsets + (offset + (NumChunks * ChunkSize * sizeof(T))), Pred,
14529  PropertyListT{});
// Keep only the RemN lanes that were actually enabled.
14530  select<RemN, 1>(NumChunks * ChunkSize) =
14531  Vals.template select<RemN, 1>();
14532  }
14533  }
14534  }
14535 }
14536 
14537 template <typename T, int N, class T1, class SFINAE>
14538 template <int ChunkSize, typename Flags, typename AccessorT, typename TOffset>
14539 ESIMD_INLINE std::enable_if_t<is_simd_flag_type_v<Flags>>
14540 simd_obj_impl<T, N, T1, SFINAE>::copy_from_impl(AccessorT acc, TOffset offset)
14541  SYCL_ESIMD_FUNCTION {
14542  constexpr unsigned Align = Flags::template alignment<T1>;
14543  copy_from_impl<ChunkSize>(acc, offset, properties{alignment<Align>});
14544 }
14545 
/// Device-accessor, flag-based overload: defers to
/// copy_from_impl<ChunkSize, Flags> with the accessor and offset.
/// Enabled only when \c AccessorT is a device accessor with read capability
/// and \c Flags is a simd flag type.
// NOTE(review): listing lines 14551-14552 (the qualified function name and the
// declarations of `acc` and `offset`) are absent from this listing.
14546 template <typename T, int N, class T1, class SFINAE>
14547 template <typename AccessorT, typename Flags, int ChunkSize>
14548 ESIMD_INLINE std::enable_if_t<
14549  detail::is_device_accessor_with_v<AccessorT, accessor_mode_cap::can_read> &&
14550  is_simd_flag_type_v<Flags>>
14553  Flags) SYCL_ESIMD_FUNCTION {
14554
14555  copy_from_impl<ChunkSize, Flags>(acc, offset);
14556 }
14557 
/// Device-accessor, property-list overload: defers to
/// copy_from_impl<ChunkSize, PropertyListT> with the accessor and offset.
/// Enabled only when \c AccessorT is a device accessor with read capability
/// and \c PropertyListT is a property list.
// NOTE(review): listing lines 14563-14564 (the qualified function name and the
// declarations of `acc` and `offset`) are absent from this listing.
14558 template <typename T, int N, class T1, class SFINAE>
14559 template <typename AccessorT, int ChunkSize, typename PropertyListT>
14560 ESIMD_INLINE std::enable_if_t<
14561  detail::is_device_accessor_with_v<AccessorT, accessor_mode_cap::can_read> &&
14562  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
14565  PropertyListT) SYCL_ESIMD_FUNCTION {
14566
// The property-list object itself is not forwarded; only its type is, as an
// explicit template argument.
14567  copy_from_impl<ChunkSize, PropertyListT>(acc, offset);
14568 }
14569 
// copy_from overload for local accessors using the legacy simd-flags
// interface.
// Enabled only when AccessorT is a local accessor with read capability and
// Flags is a simd flag type; returns void.
// acc    - the local accessor to read from.
// offset - 32-bit offset forwarded as-is to copy_from_impl (units are not
//          established in this block — presumably bytes, confirm).
// The Flags parameter is unnamed; only its type is forwarded, as a template
// argument of copy_from_impl.
14570 template <typename T, int N, class T1, class SFINAE>
14571 template <typename AccessorT, typename Flags, int ChunkSize>
14572 ESIMD_INLINE std::enable_if_t<
14573  detail::is_local_accessor_with_v<AccessorT, accessor_mode_cap::can_read> &&
14574  is_simd_flag_type_v<Flags>,
14575  void>
14576 simd_obj_impl<T, N, T1, SFINAE>::copy_from(AccessorT acc, uint32_t offset,
14577  Flags) SYCL_ESIMD_FUNCTION {
14578 
14579  copy_from_impl<ChunkSize, Flags>(acc, offset);
14580 }
14581 
// copy_from overload for local accessors using the property-list interface.
// Enabled only when AccessorT is a local accessor with read capability and
// PropertyListT is a property list; returns void.
// acc    - the local accessor to read from.
// offset - 32-bit offset forwarded as-is to copy_from_impl (units are not
//          established in this block — presumably bytes, confirm).
// The PropertyListT parameter is unnamed; only its type is forwarded, as a
// template argument of copy_from_impl.
14582 template <typename T, int N, class T1, class SFINAE>
14583 template <typename AccessorT, int ChunkSize, typename PropertyListT>
14584 ESIMD_INLINE std::enable_if_t<
14585  detail::is_local_accessor_with_v<AccessorT, accessor_mode_cap::can_read> &&
14586  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
14587  void>
14588 simd_obj_impl<T, N, T1, SFINAE>::copy_from(AccessorT acc, uint32_t offset,
14589  PropertyListT) SYCL_ESIMD_FUNCTION {
14590 
14591  copy_from_impl<ChunkSize, PropertyListT>(acc, offset);
14592 }
14593 
// Stores the N elements of this object to memory starting at pointer Addr.
// The store strategy is selected entirely at compile time from the alignment
// found in PropertyListT, the total byte size (sizeof(T) * N) and sizeof(T):
//   1. block stores, when alignment and size allow it;
//   2. for 8-byte elements, a re-dispatch on an int32_t view of the data;
//   3. otherwise scatters in chunks of ChunkSize plus a remainder step.
// Enabled only when PropertyListT is a property list.
// NOTE(review): the declarator lines (function name, Addr parameter) and the
// line that presumably defines UT are not visible here — confirm against the
// header.
14594 template <typename T, int N, class T1, class SFINAE>
14595 template <int ChunkSize, typename PropertyListT>
14596 std::enable_if_t<ext::oneapi::experimental::is_property_list_v<PropertyListT>>
14599  PropertyListT) const SYCL_ESIMD_FUNCTION {
// Total number of bytes to store.
14601  constexpr unsigned Size = sizeof(T) * N;
// Alignment taken from the property list; sizeof(UT) is passed as the argument
// to getPropertyValue (presumably the fallback when no alignment property is
// present — confirm).
14602  constexpr size_t Align =
14603  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(UT));
14604 
// A block is eight OWORDs; split the payload into whole blocks and a remainder
// (both measured in bytes).
14605  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
14606  constexpr unsigned NumBlocks = Size / BlockSize;
14607  constexpr unsigned RemSize = Size % BlockSize;
14608 
// Local copy of the data viewed as simd<UT, N>; all stores below read from it.
14609  simd<UT, N> Tmp{data()};
// Path 1: block stores. Requires OWORD-or-better alignment, a size that is a
// whole number of OWORDs, and a remainder OWORD count accepted by isPowerOf2.
14610  if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 &&
14611  detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
14612  if constexpr (NumBlocks > 0) {
// BlockN is the number of T elements per full block; one block_store is
// issued per full block.
14613  constexpr unsigned BlockN = BlockSize / sizeof(T);
14614  ForHelper<NumBlocks>::unroll([BlockN, Addr, &Tmp](unsigned Block) {
14615  block_store<UT, BlockN>(Addr + (Block * BlockN),
14616  Tmp.template select<BlockN, 1>(Block * BlockN),
14617  PropertyListT{});
14618  });
14619  }
14620  if constexpr (RemSize > 0) {
// Trailing partial block: a single block_store of RemN elements placed right
// after the full blocks.
14621  constexpr unsigned RemN = RemSize / sizeof(T);
14622  constexpr unsigned BlockN = BlockSize / sizeof(T);
14623  block_store<UT, RemN>(Addr + (NumBlocks * BlockN),
14624  Tmp.template select<RemN, 1>(NumBlocks * BlockN),
14625  PropertyListT{});
14626  }
14627  } else if constexpr (sizeof(T) == 8) {
// Path 2: 8-byte elements are reinterpreted as 2 * N int32_t values and stored
// by calling copy_to on that vector with the same property list.
14628  simd<int32_t, N * 2> BC = Tmp.template bit_cast_view<int32_t>();
14629  BC.copy_to(reinterpret_cast<int32_t *>(Addr), PropertyListT{});
14630  } else {
// Path 3: scatter ChunkSize elements at a time, then handle the remainder.
14631  constexpr unsigned NumChunks = N / ChunkSize;
14632  if constexpr (NumChunks > 0) {
// Offsets is built from (0u, sizeof(T)) — presumably base 0 with a step of one
// element in bytes; confirm against the simd constructor.
14633  simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
14634  ForHelper<NumChunks>::unroll([Addr, &Offsets, &Tmp](unsigned Block) {
14635  scatter<UT, ChunkSize>(
14636  Addr + (Block * ChunkSize), Offsets,
14637  Tmp.template select<ChunkSize, 1>(Block * ChunkSize),
14638  PropertyListT{});
14639  });
14640  }
// Elements left over after the full chunks.
14641  constexpr unsigned RemN = N % ChunkSize;
14642  if constexpr (RemN > 0) {
14643  if constexpr (RemN == 1) {
// Single leftover element: plain scalar store.
14644  Addr[NumChunks * ChunkSize] = Tmp[NumChunks * ChunkSize];
14645  } else if constexpr (RemN == 8 || RemN == 16) {
14646  // TODO: GPU runtime may handle scatter of 16 byte elements
14647  // incorrectly. The code below is a workaround which must be deleted
14648  // once GPU runtime is fixed.
14649  if constexpr (sizeof(T) == 1 && RemN == 16) {
14650  if constexpr (Align % OperandSize::DWORD > 0) {
// Alignment is not a multiple of a DWORD: store the 16 bytes one by one.
14651  ForHelper<RemN>::unroll([Addr, &Tmp](unsigned Index) {
14652  Addr[Index + NumChunks * ChunkSize] =
14653  Tmp[Index + NumChunks * ChunkSize];
14654  });
14655  } else {
// DWORD-aligned case: store the 16 bytes as 4 int32_t values through an
// 8-lane predicated scatter in which only the lanes picked by select<4, 1>()
// are enabled.
14656  simd_mask_type<8> Pred(0);
14657  simd<int32_t, 8> Vals;
14658  Pred.template select<4, 1>() = 1;
// NOTE(review): the select offset below is NumChunks * ChunkSize, which is
// used as an index in T-element units everywhere else in this function, but
// here it indexes the int32_t view. For NumChunks > 0 this looks unscaled —
// confirm whether it should be divided by sizeof(int32_t) / sizeof(T).
14659  Vals.template select<4, 1>() =
14660  Tmp.template bit_cast_view<int32_t>().template select<4, 1>(
14661  NumChunks * ChunkSize);
14662 
14663  simd<uint32_t, 8> Offsets(0u, sizeof(int32_t));
14664  scatter<int32_t, 8>(
14665  reinterpret_cast<int32_t *>(Addr + (NumChunks * ChunkSize)),
14666  Offsets, Vals, Pred, PropertyListT{});
14667  }
14668  } else {
// Remainder of exactly 8 or 16 elements (other than the 16 one-byte case
// above): one unpredicated scatter of RemN lanes.
14669  simd<uint32_t, RemN> Offsets(0u, sizeof(T));
14670  scatter<UT, RemN>(Addr + (NumChunks * ChunkSize), Offsets,
14671  Tmp.template select<RemN, 1>(NumChunks * ChunkSize),
14672  PropertyListT{});
14673  }
14674  } else {
// Any other remainder: round the lane count up to N1 (8, 16 or 32) and use a
// predicate so that only the RemN lanes picked by select<RemN, 1>() are
// written.
14675  constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
14676  simd_mask_type<N1> Pred(0);
14677  Pred.template select<RemN, 1>() = 1;
14678  simd<UT, N1> Vals;
14679  Vals.template select<RemN, 1>() =
14680  Tmp.template select<RemN, 1>(NumChunks * ChunkSize);
14681  simd<uint32_t, N1> Offsets(0u, sizeof(T));
14682  scatter<UT, N1>(Addr + (NumChunks * ChunkSize), Offsets, Vals, Pred,
14683  PropertyListT{});
14684  }
14685  }
14686  }
14687 }
14688 
// Pointer store for the legacy simd-flags interface.
// Reads the alignment value that Flags reports for T1 and forwards Addr to
// copy_to<ChunkSize> together with properties{alignment<Align>}.
// Enabled only when Flags satisfies is_simd_flag_type_v; the Flags parameter
// is unnamed, so only its type is used.
// NOTE(review): the declarator lines (function name and the Addr parameter)
// are not visible here; presumably this is copy_to(Addr, Flags) — confirm
// against the header.
14689 template <typename T, int N, class T1, class SFINAE>
14690 template <typename Flags, int ChunkSize>
14691 std::enable_if_t<is_simd_flag_type_v<Flags>>
14694  Flags) const SYCL_ESIMD_FUNCTION {
14695  constexpr unsigned Align = Flags::template alignment<T1>;
14696  copy_to<ChunkSize>(Addr, properties{alignment<Align>});
14697 }
14698 
// Write overload for device accessors using the legacy simd-flags interface.
// Participates in overload resolution only when AccessorT is a device accessor
// with write capability and Flags is a simd flag type.
// The trailing Flags parameter is unnamed: only its type is used, as a template
// argument of copy_to_impl, which receives acc and offset unchanged.
// NOTE(review): the declarator lines (function name and leading parameters)
// are not visible here; presumably this is copy_to(acc, offset, Flags) —
// confirm against the header.
14699 template <typename T, int N, class T1, class SFINAE>
14700 template <typename AccessorT, typename Flags, int ChunkSize>
14701 ESIMD_INLINE std::enable_if_t<detail::is_device_accessor_with_v<
14702  AccessorT, accessor_mode_cap::can_write> &&
14703  is_simd_flag_type_v<Flags>>
14706  Flags) const SYCL_ESIMD_FUNCTION {
14707  copy_to_impl<ChunkSize, Flags>(acc, offset);
14708 }
14709 
// Write overload for device accessors using the property-list interface.
// Participates in overload resolution only when AccessorT is a device accessor
// with write capability and PropertyListT is a property list.
// acc    - the device accessor to write to.
// offset - a detail::DeviceAccessorOffsetT value forwarded as-is to
//          copy_to_impl.
// The PropertyListT parameter is unnamed; only its type is forwarded, as a
// template argument of copy_to_impl.
// NOTE(review): the line carrying the function name is not visible here;
// presumably this is copy_to — confirm against the header.
14710 template <typename T, int N, class T1, class SFINAE>
14711 template <typename AccessorT, int ChunkSize, typename PropertyListT>
14712 ESIMD_INLINE std::enable_if_t<
14713  detail::is_device_accessor_with_v<AccessorT,
14714  accessor_mode_cap::can_write> &&
14715  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
14717  AccessorT acc, detail::DeviceAccessorOffsetT offset,
14718  PropertyListT) const SYCL_ESIMD_FUNCTION {
14719  copy_to_impl<ChunkSize, PropertyListT>(acc, offset);
14720 }
14721 
// copy_to overload for local accessors using the legacy simd-flags interface.
// Enabled only when AccessorT is a local accessor with write capability and
// Flags is a simd flag type; returns void.
// acc    - the local accessor to write to.
// offset - 32-bit offset forwarded as-is to copy_to_impl (units are not
//          established in this block — presumably bytes, confirm).
// The Flags parameter is unnamed; only its type is forwarded, as a template
// argument of copy_to_impl.
14722 template <typename T, int N, class T1, class SFINAE>
14723 template <typename AccessorT, typename Flags, int ChunkSize>
14724 ESIMD_INLINE std::enable_if_t<
14725  detail::is_local_accessor_with_v<AccessorT, accessor_mode_cap::can_write> &&
14726  is_simd_flag_type_v<Flags>,
14727  void>
14728 simd_obj_impl<T, N, T1, SFINAE>::copy_to(AccessorT acc, uint32_t offset,
14729  Flags) const SYCL_ESIMD_FUNCTION {
14730  copy_to_impl<ChunkSize, Flags>(acc, offset);
14731 }
14732 
// Write overload for local accessors using the property-list interface.
// Enabled only when AccessorT is a local accessor with write capability and
// PropertyListT is a property list; returns void.
// acc    - the local accessor to write to.
// offset - 32-bit offset forwarded as-is to copy_to_impl (units are not
//          established in this block — presumably bytes, confirm).
// The PropertyListT parameter is unnamed; only its type is forwarded, as a
// template argument of copy_to_impl.
// NOTE(review): the line carrying the function name is not visible here;
// presumably this is copy_to — confirm against the header.
14733 template <typename T, int N, class T1, class SFINAE>
14734 template <typename AccessorT, int ChunkSize, typename PropertyListT>
14735 ESIMD_INLINE std::enable_if_t<
14736  detail::is_local_accessor_with_v<AccessorT, accessor_mode_cap::can_write> &&
14737  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
14738  void>
14740  AccessorT acc, uint32_t offset, PropertyListT) const SYCL_ESIMD_FUNCTION {
14741  copy_to_impl<ChunkSize, PropertyListT>(acc, offset);
14742 }
14743 
14744 } // namespace detail
14746 
14747 } // namespace ext::intel::esimd
14748 } // namespace _V1
14749 } // namespace sycl
const auto & data() const noexcept
Definition: simd.hpp:1673
Definition: simd.hpp:1387
std::enable_if_t< __vectorizable< _Up >) &&is_simd_flag_type< _Flags >::value > copy_to(_Up *__buffer, _Flags) const
Definition: simd.hpp:1526
get_vector_element_type< Derived > element_type
Element type of the derived (user) class.
ESIMD_INLINE std::enable_if_t< is_simd_flag_type_v< Flags > > copy_to(Ty *addr, Flags) const SYCL_ESIMD_FUNCTION
Copy all vector elements of this object into a contiguous block in memory.
ESIMD_INLINE std::enable_if_t< is_simd_flag_type_v< Flags > > copy_from(const Ty *addr, Flags) SYCL_ESIMD_FUNCTION
Copy a contiguous block of data from memory into this simd_obj_impl object.
The main simd vector class.
Definition: simd.hpp:53
typename base_type::raw_vector_type raw_vector_type
Definition: simd.hpp:60
RAII-style class used to implement "semi-dynamic" SLM allocation.
Definition: memory.hpp:5667
~slm_allocator()
Releases the SLM chunk allocated in the constructor.
Definition: memory.hpp:5678
slm_allocator()
Allocates the amount of SLM which is class' template parameter.
Definition: memory.hpp:5672
ESIMD_INLINE int get_offset() const
Definition: memory.hpp:5675
#define __ESIMD_FP_ATOMIC_OP_TYPE_CHECK(T)
Definition: memory.hpp:5584
raw_send_eot
Specify if end of thread should be set.
Definition: common.hpp:67
rgba_channel_mask
Represents a pixel's channel mask - all possible combinations of enabled channels.
Definition: common.hpp:122
raw_send_sendc
Specify if sendc should be used.
Definition: common.hpp:73
unsigned int SurfaceIndex
Surface index type.
Definition: common.hpp:64
constexpr int get_num_channels_enabled(rgba_channel_mask M)
Definition: common.hpp:145
atomic_op
Represents an atomic operation.
Definition: common.hpp:160
@ fsub
ACM/PVC: Subtraction (floating point): *addr = *addr - src0.
@ fmax
ACM/PVC: Maximum (floating point): *addr = max(*addr, src0).
@ fadd
ACM/PVC: Addition (floating point): *addr = *addr + src0.
@ xchg
Exchange. *addr = src0;
@ fmin
ACM/PVC: Minimum (floating point): *addr = min(*addr, src0).
@ fcmpxchg
ACM/PVC: Compare and exchange (floating point).
__ESIMD_API SZ simd< T, SZ > src1
Definition: math.hpp:184
__ESIMD_API SZ src0
Definition: math.hpp:184
__ESIMD_API std::enable_if_t< ext::oneapi::experimental::is_property_list_v< PropertyListT >, simd< T, N > > block_load(const T *ptr, PropertyListT props={})
Each of the following block load functions loads a contiguous memory block from the address reference...
Definition: memory.hpp:1778
__ESIMD_API void named_barrier_wait(uint8_t id)
Wait on a named barrier Available only on PVC.
Definition: memory.hpp:14266
__ESIMD_API void named_barrier_init()
Initialize number of named barriers for a kernel Available only on PVC.
Definition: memory.hpp:14274
__ESIMD_API void named_barrier_signal(uint8_t barrier_id, uint8_t producer_consumer_mode, uint32_t num_producers, uint32_t num_consumers)
Perform signal operation for the given named barrier Available only on PVC.
Definition: memory.hpp:14294
__ESIMD_API std::enable_if_t< __ESIMD_DNS::get_num_args< Op >)==0 &&__ESIMD_DNS::is_rw_local_accessor_v< AccessorT >, simd< T, N > > atomic_update(AccessorT lacc, simd< uint32_t, N > byte_offset, simd_mask< N > mask=1)
simd<T, N> atomic_update(local_accessor lacc, simd<uint32_t, N> byte_offset, simd_mask<N> pred = 1); ...
Definition: memory.hpp:8020
__ESIMD_API std::enable_if_t<(N==8||N==16||N==32) &&(sizeof(T)==4)> slm_scatter_rgba(simd< uint32_t, N > offsets, simd< T, N *get_num_channels_enabled(Mask)> vals, simd_mask< N > mask=1)
Scatter the given values to the Shared Local Memory at specified offsets.
Definition: memory.hpp:6656
__ESIMD_API T slm_scalar_load(uint32_t offset)
Load a scalar value from the Shared Local Memory.
Definition: memory.hpp:6234
__ESIMD_API std::enable_if_t< is_simd_flag_type_v< Flags >, simd< T, N > > slm_block_load(uint32_t byte_offset, Flags)
Loads a contiguous block of SLM memory referenced by the given byte-offset offset,...
Definition: memory.hpp:6685
__ESIMD_API std::enable_if_t< ext::oneapi::experimental::is_property_list_v< PropertyListT > > slm_scatter(simd< uint32_t, N/VS > byte_offsets, simd< T, N > vals, simd_mask< N/VS > mask, PropertyListT props={})
template <typename T, int N, int VS = 1, typename PropertyListT = empty_properties_t> void slm_scatte...
Definition: memory.hpp:6284
__ESIMD_API std::enable_if_t< is_simd_flag_type_v< Flags > > slm_block_store(uint32_t offset, simd< T, N > vals, Flags)
Stores elements of the vector vals to a contiguous block of SLM memory at the given byte-offset offse...
Definition: memory.hpp:7304
__ESIMD_API std::enable_if_t< __ESIMD_DNS::get_num_args< Op >)==0, simd< T, N > > slm_atomic_update(simd< uint32_t, N > byte_offset, simd_mask< N > mask=1)
Definition: memory.hpp:7983
__ESIMD_API void slm_init()
Declare per-work-group slm size.
Definition: memory.hpp:5693
__ESIMD_API std::enable_if_t<(N==8||N==16||N==32) &&(sizeof(T)==4), simd< T, N *get_num_channels_enabled(RGBAMask)> > slm_gather_rgba(simd< uint32_t, N > offsets, simd_mask< N > mask=1)
Gather data from the Shared Local Memory at specified offsets and return it as simd vector.
Definition: memory.hpp:6638
__ESIMD_API std::enable_if_t< ext::oneapi::experimental::is_property_list_v< PropertyListT >, simd< T, N > > slm_gather(simd< uint32_t, N/VS > byte_offsets, simd_mask< N/VS > mask, simd< T, N > pass_thru, PropertyListT props={})
template <typename T, int N, int VS, typename PropertyListT = empty_properties_t> simd<T,...
Definition: memory.hpp:5778
__ESIMD_API void slm_scalar_store(uint32_t offset, T val)
Store a scalar value into the Shared Local Memory.
Definition: memory.hpp:6621
__ESIMD_API simd< T, N *get_num_channels_enabled(RGBAMask)> gather_rgba(const T *p, simd< Toffset, N > offsets, simd_mask< N > mask=1)
Gather and transpose pixels from given memory locations defined by the base pointer p and offsets.
Definition: memory.hpp:5310
__ESIMD_API T scalar_load(AccessorTy acc, detail::DeviceAccessorOffsetT offset)
Load a scalar value from an accessor.
Definition: memory.hpp:5253
__ESIMD_API std::enable_if_t< is_simd_flag_type_v< Flags > > block_store(Tx *addr, simd< Tx, N > vals, Flags)
Stores elements of the vector vals to a contiguous block of memory at the given address addr.
Definition: memory.hpp:1703
__ESIMD_API std::enable_if_t< ext::oneapi::experimental::is_property_list_v< PropertyListT > > store_2d(T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, int X, int Y, simd< T, N > Vals, PropertyListT props={})
2D USM pointer block store.
Definition: memory.hpp:14019
__ESIMD_API void scatter_rgba(T *p, simd< Toffset, N > offsets, simd< T, N *get_num_channels_enabled(RGBAMask)> vals, simd_mask< N > mask=1)
Transpose and scatter pixels to given memory locations defined by the base pointer p and offsets.
Definition: memory.hpp:5400
__ESIMD_API std::enable_if_t< ext::oneapi::experimental::is_property_list_v< PropertyListT > > prefetch_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, int X, int Y, PropertyListT props={})
template <typename T, int BlockWidth, int BlockHeight = 1, int NBlocks = 1, int N = detail::get_lsc_b...
Definition: memory.hpp:13983
__ESIMD_API std::enable_if_t< ext::oneapi::experimental::is_property_list_v< PropertyListT > > scatter(T *p, simd< OffsetT, N/VS > byte_offsets, simd< T, N > vals, simd_mask< N/VS > mask, PropertyListT props={})
template <typename T, int N, int VS = 1, typename OffsetT, typename PropertyListT = empty_properties_...
Definition: memory.hpp:869
__ESIMD_API void fence()
esimd::fence sets the memory read/write order.
Definition: memory.hpp:12076
__ESIMD_API void scalar_store(AccessorTy acc, detail::DeviceAccessorOffsetT offset, T val)
Store a scalar value into an accessor.
Definition: memory.hpp:5268
__ESIMD_API void media_block_store(AccessorTy acc, unsigned x, unsigned y, simd< T, m *N > vals)
Media block store.
Definition: memory.hpp:12170
__ESIMD_API std::enable_if_t< detail::is_device_accessor_with_v< AccessorT, detail::accessor_mode_cap::can_read > &&ext::oneapi::experimental::is_property_list_v< PropertyListT > > prefetch(AccessorT acc, PropertyListT props={})
template <typename T, int VS = 1, typename AccessorT, typename PropertyListT = empty_properties_t> vo...
Definition: memory.hpp:13888
fence_mask
Represents a bit mask to control behavior of esimd::fence.
Definition: memory.hpp:12056
__ESIMD_API std::enable_if_t< ext::oneapi::experimental::is_property_list_v< PropertyListT >, simd< T, N > > load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, int X, int Y, PropertyListT props={})
template <typename T, int BlockWidth, int BlockHeight = 1, int NBlocks = 1, bool Transposed = false,...
Definition: memory.hpp:13940
__ESIMD_API SurfaceIndex get_surface_index(AccessorTy acc)
Get surface index corresponding to a SYCL accessor.
Definition: memory.hpp:53
__ESIMD_API std::enable_if_t< ext::oneapi::experimental::is_property_list_v< PropertyListT > > prefetch(const T *p, simd< OffsetT, N/VS > byte_offsets, simd_mask< N/VS > mask, PropertyListT props={})
template <typename T, int N, int VS, typename OffsetT, typename PropertyListT = empty_properties_t> v...
Definition: memory.hpp:13215
__ESIMD_API simd< T, m *N > media_block_load(AccessorTy acc, unsigned x, unsigned y)
Media block load.
Definition: memory.hpp:12128
__ESIMD_API std::enable_if_t< ext::oneapi::experimental::is_property_list_v< PropertyListT >, simd< T, N > > gather(const T *p, simd< OffsetT, N/VS > byte_offsets, simd_mask< N/VS > mask, simd< T, N > pass_thru, PropertyListT props={})
template <typename T, int N, int VS, typename OffsetT, typename PropertyListT = empty_properties_t> s...
Definition: memory.hpp:289
__ESIMD_API void barrier()
Generic work-group barrier.
Definition: memory.hpp:12106
@ global_coherent_fence
“Commit enable” - wait for fence to complete before continuing.
Definition: memory.hpp:12058
@ l2_flush_constant_data
Flush constant cache.
Definition: memory.hpp:12064
@ local_barrier
Issue SLM memory barrier only. If not set, the memory barrier is global.
Definition: memory.hpp:12068
@ l1_flush_ro_data
Flush L1 read-only data cache.
Definition: memory.hpp:12070
@ l2_flush_rw_data
Flush read-write data cache.
Definition: memory.hpp:12066
@ l2_flush_texture_data
Flush sampler (texture) cache.
Definition: memory.hpp:12062
@ l2_flush_instructions
Flush the instruction cache.
Definition: memory.hpp:12060
__ESIMD_API sycl::ext::intel::esimd::simd< T1, n1 > raw_send(sycl::ext::intel::esimd::simd< T1, n1 > msg_dst, sycl::ext::intel::esimd::simd< T2, n2 > msg_src0, uint32_t ex_desc, uint32_t msg_desc, sycl::ext::intel::esimd::simd_mask< exec_size > mask=1)
Raw send.
Definition: memory.hpp:14165
__ESIMD_API sycl::ext::intel::esimd::simd< T1, n1 > raw_sends(sycl::ext::intel::esimd::simd< T1, n1 > msg_dst, sycl::ext::intel::esimd::simd< T2, n2 > msg_src0, sycl::ext::intel::esimd::simd< T3, n3 > msg_src1, uint32_t ex_desc, uint32_t msg_desc, sycl::ext::intel::esimd::simd_mask< exec_size > mask=1)
Raw sends.
Definition: memory.hpp:14121
void add(const void *DeviceGlobalPtr, const char *UniqueId)
ESIMD_INLINE simd< T, N > lsc_format_ret(simd< T1, N > Vals)
Definition: memory.hpp:88
__ESIMD_API std::enable_if_t< get_num_args< Op >)==0, simd< T, N > > slm_atomic_update_impl(simd< uint32_t, N > offsets, simd_mask< N > pred)
SLM atomic.
Definition: memory.hpp:7837
constexpr bool isMaskedGatherScatterLLVMAvailable()
Definition: memory.hpp:203
static constexpr SurfaceIndex SLM_BTI
Definition: common.hpp:115
constexpr cache_hint getCacheHintForIntrin()
Extracts a cache hint with the given 'Level' to pass it to ESIMD/GENX intrinsics.
Definition: memory.hpp:102
constexpr void check_atomic()
Check the legality of an atomic call in terms of size and type.
Definition: memory.hpp:5593
static void validate_rgba_write_channel_mask()
Definition: memory.hpp:5368
__ESIMD_API simd< T, N > slm_atomic_update_impl(simd< uint32_t, N > offsets, simd< T, N > src0, simd< T, N > src1, simd_mask< N > pred)
SLM atomic.
Definition: memory.hpp:7913
__ESIMD_API std::enable_if_t< detail::is_property_list_v< PropertyListT > > block_store_impl(T *p, simd< T, NElts > vals, simd_mask< 1 > pred)
Definition: memory.hpp:1565
__ESIMD_API simd< T, N *NElts > gather_impl(const T *p, simd< OffsetT, N > offsets, simd_mask< N > pred, simd< T, N *NElts > pass_thru)
USM pointer gather.
Definition: memory.hpp:133
lsc_data_size
Data size or format to read or store.
Definition: common.hpp:402
__ESIMD_API void scatter_impl(T *p, simd< Toffset, N > offsets, simd< T, N *NElts > vals, simd_mask< N > pred)
USM pointer scatter.
Definition: memory.hpp:176
ESIMD_INLINE simd< RT, N > lsc_format_input(simd< T, N > Vals)
Definition: memory.hpp:74
constexpr int lsc_to_internal_atomic_op()
Definition: memory.hpp:7815
__ESIMD_API std::enable_if_t< get_num_args< Op >)==0, simd< T, N > > atomic_update_impl(T *p, simd< Toffset, N > offsets, simd_mask< N > pred)
USM pointer atomic.
Definition: memory.hpp:8787
constexpr ESIMD_INLINE bool isPowerOf2(unsigned int n)
Check if a given 32 bit positive integer is a power of 2 at compile time.
Definition: common.hpp:96
__ESIMD_API std::enable_if_t< is_property_list_v< PropertyListT >, simd< T, NElts > > block_load_impl(const T *p, simd_mask< 1 > pred, simd< T, NElts > pass_thru)
USM pointer transposed gather with 1 channel.
Definition: memory.hpp:1310
constexpr lsc_data_size expand_data_size(lsc_data_size DS)
Definition: common.hpp:603
constexpr alignment_key::value_t< K > alignment
cache_hint
L1, L2 or L3 cache hints.
fence_scope
The scope that fence() operation should apply to.
Definition: common.hpp:345
@ group
Wait until all previous memory transactions from this thread are observed within the local thread-gro...
fence_flush_op
The cache flush operation to apply to caches after fence() is complete.
Definition: common.hpp:379
memory_kind
The target memory kind for fence() operation.
Definition: common.hpp:392
@ local
image (also known as typed global memory)
prefetch_impl< _B > prefetch
Definition: fpga_lsu.hpp:45
void prefetch_impl(T *ptr, size_t bytes, Properties properties)
Definition: prefetch.hpp:72
properties< std::tuple<> > empty_properties_t
Definition: properties.hpp:234
std::bit_or< T > bit_or
Definition: functional.hpp:22
std::enable_if_t< detail::is_bf16_storage_type< T >::value, T > fmax(T x, T y)
std::enable_if_t< detail::is_bf16_storage_type< T >::value, T > fmin(T x, T y)
std::bit_xor< T > bit_xor
Definition: functional.hpp:23
std::bit_and< T > bit_and
Definition: functional.hpp:24
std::enable_if_t< sizeof(To)==sizeof(From) &&std::is_trivially_copyable< From >::value &&std::is_trivially_copyable< To >::value, To > bit_cast(const From &from) noexcept
Definition: bit_cast.hpp:52
constexpr stream_manipulator dec
Definition: stream.hpp:785
autodecltype(x) x
const void value_type
Definition: multi_ptr.hpp:457
Definition: access.hpp:18
std::conditional_t< sizeof(T)<=4, std::conditional_t< std::is_signed_v< T >, int32_t, uint32_t >, std::conditional_t< std::is_signed_v< T >, int64_t, uint64_t > > type
Definition: common.hpp:615