DPC++ Runtime
Runtime libraries for oneAPI DPC++
memory.hpp
Go to the documentation of this file.
1 //==-------------- memory.hpp - DPC++ Explicit SIMD API --------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // Implement Explicit SIMD memory-access APIs.
9 //===----------------------------------------------------------------------===//
10 
11 #pragma once
12 
20 #include <sycl/half_type.hpp>
21 
22 #include <algorithm>
23 #include <cstdint>
24 
25 namespace sycl {
26 inline namespace _V1 {
27 namespace ext::intel::esimd {
28 
31 
37 
39 
41 
43 
46 
52 template <typename AccessorTy>
53 __ESIMD_API SurfaceIndex get_surface_index(AccessorTy acc) {
54  if constexpr (std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
55  sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>) {
56  return detail::SLM_BTI;
57  } else {
58 #ifdef __ESIMD_FORCE_STATELESS_MEM
59  static_assert(sycl::detail::acc_properties::is_image_accessor_v<AccessorTy>,
60  "The function get_surface_index() is available only for "
61  "image- and local-accessors in stateless-only memory mode. "
62  "Consider using "
63  "-fno-sycl-esimd-force-stateless-mem compilation switch.");
64 #endif // __ESIMD_FORCE_STATELESS_MEM
65  return __esimd_get_surface_index(
66  detail::AccessorPrivateProxy::getQualifiedPtrOrImageObj(acc));
67  }
68 }
69 
70 namespace detail {
71 
// Format u8 and u16 to u8u32 and u16u32 by doing garbage-extension: widened
// elements carry unspecified upper bits, which is fine for merge/store paths.
template <typename RT, typename T, int N>
// NOTE(review): the function signature line is missing here (lost in
// extraction); per the body it takes a simd<T, N> named Vals -- confirm
// against upstream memory.hpp.
  if constexpr (sizeof(T) == 1) {
    // Extend bytes to RT.
    return Vals.template bit_cast_view<uint8_t>();
  } else if constexpr (sizeof(T) == 2) {
    // Extend words to RT.
    return Vals.template bit_cast_view<uint16_t>();
  } else {
    // Elements already match the message granularity; reinterpret only.
    return Vals.template bit_cast_view<RT>();
  }
}
85 
// Format u8u32 and u16u32 back to u8 and u16: inverse of lsc_format_input.
template <typename T, typename T1, int N>
// NOTE(review): the function signature line is missing here (lost in
// extraction); per the body it takes a simd<T1, N> named Vals -- confirm
// against upstream memory.hpp.
  auto Formatted = Vals.template bit_cast_view<T>();
  if constexpr (sizeof(T) == sizeof(T1)) {
    // No widening happened -- the bit-cast view already has the right shape.
    return Formatted;
  } else {
    // Keep every Stride-th element to drop the garbage-extension filler.
    constexpr int Stride = Formatted.length / N;
    return Formatted.template select<N, Stride>(0);
  }
}
97 
// Returns the cache hint recorded in PropertyListT for the requested cache
// level, defaulting to cache_hint::none when the property is absent.
template <typename PropertyListT, cache_level Level>
// NOTE(review): the function signature line is missing here (lost in
// extraction); presumably a constexpr function returning cache_hint.
  static_assert(Level == cache_level::L1 || Level == cache_level::L2,
                "ESIMD/GENX intrinsics accept only L1/L2 cache hints");
  if constexpr (Level == cache_level::L1) {
    return getPropertyValue<PropertyListT, cache_hint_L1_key>(cache_hint::none);
  } else {
    return getPropertyValue<PropertyListT, cache_hint_L2_key>(cache_hint::none);
  }
}
111 
131 template <typename T, int NElts, lsc_data_size DS, typename PropertyListT,
132  int N, typename OffsetT>
133 __ESIMD_API simd<T, N * NElts> gather_impl(const T *p, simd<OffsetT, N> offsets,
134  simd_mask<N> pred,
135  simd<T, N * NElts> pass_thru) {
136  static_assert(std::is_integral_v<OffsetT>, "Unsupported offset type");
137  check_lsc_vector_size<NElts>();
138  check_lsc_data_size<T, DS>();
139  check_cache_hints<cache_action::load, PropertyListT>();
140  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
141  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
142  constexpr uint16_t AddressScale = 1;
143  constexpr int ImmOffset = 0;
144  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
145  constexpr lsc_vector_size VS = to_lsc_vector_size<NElts>();
146  constexpr auto Transposed = lsc_data_order::nontranspose;
147  using MsgT = typename lsc_expand_type<T>::type;
148  simd<uintptr_t, N> Addrs = reinterpret_cast<uintptr_t>(p);
149  Addrs += convert<uintptr_t>(offsets);
150  simd<MsgT, N * NElts> PassThruExpanded = lsc_format_input<MsgT>(pass_thru);
151  simd<MsgT, N * NElts> Result =
152  __esimd_lsc_load_merge_stateless<MsgT, L1H, L2H, AddressScale, ImmOffset,
153  EDS, VS, Transposed, N>(
154  pred.data(), Addrs.data(), PassThruExpanded.data());
155  return lsc_format_ret<T>(Result);
156 }
157 
174 template <typename T, int NElts, lsc_data_size DS, typename PropertyListT,
175  int N, typename Toffset>
176 __ESIMD_API void scatter_impl(T *p, simd<Toffset, N> offsets,
177  simd<T, N * NElts> vals, simd_mask<N> pred) {
178  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
179  check_lsc_vector_size<NElts>();
180  check_lsc_data_size<T, DS>();
181  check_cache_hints<cache_action::store, PropertyListT>();
182  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
183  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
184  constexpr uint16_t AddressScale = 1;
185  constexpr int ImmOffset = 0;
186  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
187  constexpr lsc_vector_size VS = to_lsc_vector_size<NElts>();
188  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
189  using MsgT = typename lsc_expand_type<T>::type;
190  simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
191  addrs += convert<uintptr_t>(offsets);
192  simd<MsgT, N * NElts> Tmp = lsc_format_input<MsgT, T>(vals);
193  __esimd_lsc_store_stateless<MsgT, L1H, L2H, AddressScale, ImmOffset, EDS, VS,
194  Transposed, N>(pred.data(), addrs.data(),
195  Tmp.data());
196 }
197 
// Returns true iff it is Ok to use llvm.masked.gather and llvm.masked.scatter.
// By default (without the user specifying __ESIMD_GATHER_SCATTER_LLVM_IR) it
// is not used because of an issue in GPU driver, which does not recognize
// those operations in SPIR-V when they are used in mixed (scalar and vector)
// kernels using invoke_simd() API.
// NOTE(review): the function signature line is missing here (lost in
// extraction); the body below is the #ifdef-selected constant return.
#ifdef __ESIMD_GATHER_SCATTER_LLVM_IR
  return true;
#else
  return false;
#endif
}
210 
211 } // namespace detail
212 
252 
#ifndef __ESIMD_GATHER_SCATTER_LLVM_IR
// NOTE(review): the declaration(s) guarded by this #ifndef were lost in
// extraction -- confirm against upstream memory.hpp.
#endif // __ESIMD_GATHER_SCATTER_LLVM_IR
/// USM gather (usm-ga-1): loads N elements of type T from addresses
/// p + byte_offsets[i] (VS elements per each of the N/VS offsets); lanes
/// disabled in \p mask take their values from \p pass_thru.
template <typename T, int N, int VS, typename OffsetT,
          typename PropertyListT =
// NOTE(review): default template-argument line missing (extraction artifact).
__ESIMD_API std::enable_if_t<
    ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
gather(const T *p, simd<OffsetT, N / VS> byte_offsets, simd_mask<N / VS> mask,
       simd<T, N> pass_thru, PropertyListT props = {}) {
  static_assert(std::is_integral_v<OffsetT>, "Unsupported offset type");
  static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS");

  // Alignment defaults to the element size when not given in props.
  constexpr size_t Alignment =
      detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
  static_assert(Alignment >= sizeof(T),
                "gather() requires at least element-size alignment");

  // Use LSC lowering if cache-hints are used or VS > 1. Also, if
  // llvm.masked.gather is not available, then LSC is the only lowering option.
  if constexpr (detail::has_cache_hints<PropertyListT>() || VS > 1 ||
// NOTE(review): condition continuation line missing (extraction artifact).
    // NOTE(review): typo "supprted" in the message below -- fix upstream.
    static_assert(VS == 1 || sizeof(T) >= 4,
                  "VS > 1 is supprted only for 4- and 8-byte elements");
// NOTE(review): the line invoking detail::gather_impl is truncated here
// (extraction artifact).
                  PropertyListT>(p, byte_offsets, mask, pass_thru);
  } else {
    // llvm.masked.gather path: compute absolute per-lane addresses.
    simd<uint64_t, N> Addrs(reinterpret_cast<uint64_t>(p));
    Addrs = Addrs + convert<uint64_t>(byte_offsets);

    using MsgT = detail::__raw_t<T>;
    return __esimd_gather_ld<MsgT, N, Alignment>(
        Addrs.data(), mask.data(),
        sycl::bit_cast<__ESIMD_DNS::vector_type_t<MsgT, N>>(pass_thru.data()));
  }
}
317 
/// USM gather (usm-ga-2): same as (usm-ga-1) but without a pass_thru value --
/// masked-off lanes produce undefined values.
template <typename T, int N, int VS, typename OffsetT,
          typename PropertyListT =
// NOTE(review): default template-argument line missing (extraction artifact).
__ESIMD_API std::enable_if_t<
    ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
gather(const T *p, simd<OffsetT, N / VS> byte_offsets, simd_mask<N / VS> mask,
       PropertyListT props = {}) {
  // Alignment defaults to the element size when not given in props.
  constexpr size_t Alignment =
      detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
  static_assert(Alignment >= sizeof(T),
                "gather() requires at least element-size alignment");

  // LSC path for cache hints, VS > 1, or unsupported N.
  if constexpr (detail::has_cache_hints<PropertyListT>() || VS > 1 ||
// NOTE(review): condition continuation line missing (extraction artifact).
                !detail::isPowerOf2(N, 32)) {
    simd<T, N> PassThru; // it is intentionally undefined
    return gather<T, N, VS>(p, byte_offsets, mask, PassThru, props);
  } else {
    simd<uintptr_t, N> Addrs = reinterpret_cast<uintptr_t>(p);
    Addrs += convert<uintptr_t>(byte_offsets);
    using MsgT = detail::__raw_t<T>;
    if constexpr (sizeof(T) == 1) {
      // Bytes are gathered with dword granularity; extract every 4th byte.
      auto Ret = __esimd_svm_gather<MsgT, N, detail::ElemsPerAddrEncoding<4>(),
                                    detail::ElemsPerAddrEncoding<1>()>(
          Addrs.data(), mask.data());
      detail::check_rdregion_params<N * 4, N, /*VS*/ 0, N, 4>();
      return __esimd_rdregion<MsgT, N * 4, N, /*VS*/ 0, N, 4>(Ret, 0);
    } else if constexpr (sizeof(T) == 2) {
      // Words are gathered with dword granularity; extract every 2nd word.
      auto Ret = __esimd_svm_gather<MsgT, N, detail::ElemsPerAddrEncoding<2>(),
                                    detail::ElemsPerAddrEncoding<2>()>(
          Addrs.data(), mask.data());
      detail::check_rdregion_params<N * 2, N, /*VS*/ 0, N, 2>();
      return __esimd_rdregion<MsgT, N * 2, N, /*VS*/ 0, N, 2>(Ret, 0);
    } else {
      // 4-/8-byte elements are gathered natively.
      return __esimd_svm_gather<MsgT, N, detail::ElemsPerAddrEncoding<1>(),
                                detail::ElemsPerAddrEncoding<1>()>(Addrs.data(),
                                                                   mask.data());
    }
  }
}
383 
/// USM gather (usm-ga-3): all lanes enabled (all-ones mask).
template <typename T, int N, int VS, typename OffsetT,
          typename PropertyListT =
// NOTE(review): default template-argument line missing (extraction artifact).
__ESIMD_API std::enable_if_t<
    ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
gather(const T *p, simd<OffsetT, N / VS> byte_offsets,
       PropertyListT props = {}) {
  simd_mask<N / VS> Mask = 1; // all lanes enabled
  return gather<T, N, VS>(p, byte_offsets, Mask, props);
}
412 
/// USM gather (usm-ga-4): convenience overload with VS fixed to 1
/// (one element per offset), with mask and pass_thru.
template <typename T, int N, typename OffsetT,
          typename PropertyListT =
// NOTE(review): default template-argument line missing (extraction artifact).
__ESIMD_API std::enable_if_t<
    ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
gather(const T *p, simd<OffsetT, N> byte_offsets, simd_mask<N> mask,
       simd<T, N> pass_thru, PropertyListT props = {}) {
  constexpr int VS = 1;
  return gather<T, N, VS>(p, byte_offsets, mask, pass_thru, props);
}
447 
/// USM gather (usm-ga-5): VS fixed to 1, with mask, no pass_thru.
template <typename T, int N, typename OffsetT,
          typename PropertyListT =
// NOTE(review): default template-argument line missing (extraction artifact).
__ESIMD_API std::enable_if_t<
    ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
gather(const T *p, simd<OffsetT, N> byte_offsets, simd_mask<N> mask,
       PropertyListT props = {}) {
  constexpr int VS = 1;
  return gather<T, N, VS>(p, byte_offsets, mask, props);
}
479 
/// USM gather (usm-ga-6): VS fixed to 1, all lanes enabled.
template <typename T, int N, typename OffsetT,
          typename PropertyListT =
// NOTE(review): default template-argument line missing (extraction artifact).
__ESIMD_API std::enable_if_t<
    ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
gather(const T *p, simd<OffsetT, N> byte_offsets, PropertyListT props = {}) {
  constexpr int VS = 1;
  return gather<T, N, VS>(p, byte_offsets, props);
}
504 
/// USM gather, simd_view offsets: reads the offsets from the view and
/// forwards to the simd-based overload with mask and pass_thru.
template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
          typename PropertyListT =
// NOTE(review): default template-argument line missing (extraction artifact).
__ESIMD_API std::enable_if_t<
    ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
        detail::is_simd_view_type_v<OffsetSimdViewT>,
    simd<T, N>>
gather(const T *p, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
       simd<T, N> pass_thru, PropertyListT props = {}) {
  return gather<T, N, VS>(p, byte_offsets.read(), mask, pass_thru, props);
}
544 
/// USM gather, simd_view offsets: with mask, no pass_thru.
template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
          typename PropertyListT =
// NOTE(review): default template-argument line missing (extraction artifact).
__ESIMD_API std::enable_if_t<
    ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
        detail::is_simd_view_type_v<OffsetSimdViewT>,
    simd<T, N>>
gather(const T *p, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
       PropertyListT props = {}) {
  return gather<T, N, VS>(p, byte_offsets.read(), mask, props);
}
579 
/// USM gather, simd_view offsets: all lanes enabled.
template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
          typename PropertyListT =
// NOTE(review): default template-argument line missing (extraction artifact).
__ESIMD_API std::enable_if_t<
    ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
        detail::is_simd_view_type_v<OffsetSimdViewT>,
    simd<T, N>>
gather(const T *p, OffsetSimdViewT byte_offsets, PropertyListT props = {}) {
  return gather<T, N, VS>(p, byte_offsets.read(), props);
}
607 
620 template <typename Tx, int N, typename Toffset>
621 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>, simd<Tx, N>>
622 gather(const Tx *p, Toffset offset, simd_mask<N> mask = 1) {
623  return gather<Tx, N>(p, simd<Toffset, N>(offset), mask);
624 }
625 
630 
635 
638 
643 
648 
/// USM scatter (usm-sc-1): stores N elements of \p vals to addresses
/// p + byte_offsets[i] (VS elements per each of the N/VS offsets) on lanes
/// enabled in \p mask.
template <typename T, int N, int VS = 1, typename OffsetT,
          typename PropertyListT =
// NOTE(review): default template-argument line missing (extraction artifact).
__ESIMD_API std::enable_if_t<
    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
scatter(T *p, simd<OffsetT, N / VS> byte_offsets, simd<T, N> vals,
        simd_mask<N / VS> mask, PropertyListT props = {}) {
  static_assert(std::is_integral_v<OffsetT>, "Unsupported offset type");
  static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS");

  // Alignment defaults to the element size when not given in props.
  constexpr size_t Alignment =
      detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
  static_assert(Alignment >= sizeof(T),
                "scatter() requires at least element-size alignment");

  // Use LSC lowering if cache-hints are used or VS > 1.
  if constexpr (detail::has_cache_hints<PropertyListT>() || VS > 1 ||
                (!__ESIMD_DNS::isPowerOf2(N, 32) &&
// NOTE(review): condition continuation line missing (extraction artifact).
    // NOTE(review): typo "supprted" in the message below -- fix upstream.
    static_assert(VS == 1 || sizeof(T) >= 4,
                  "VS > 1 is supprted only for 4- and 8-byte elements");
// NOTE(review): the line invoking detail::scatter_impl is truncated here
// (extraction artifact).
                  PropertyListT>(p, byte_offsets, vals, mask);
  } else if constexpr (detail::isMaskedGatherScatterLLVMAvailable()) {
    // llvm.masked.scatter path: compute absolute per-lane addresses.
    simd<uint64_t, N> Addrs(reinterpret_cast<uint64_t>(p));
    Addrs = Addrs + convert<uint64_t>(byte_offsets);
    using MsgT = detail::__raw_t<T>;
    __esimd_scatter_st<MsgT, N, Alignment>(
        sycl::bit_cast<__ESIMD_DNS::vector_type_t<MsgT, N>>(vals.data()),
        Addrs.data(), mask.data());
  } else {
    // Legacy SVM scatter path.
    using Tx = detail::__raw_t<T>;
    simd<uint64_t, N> byte_offsets_i = convert<uint64_t>(byte_offsets);
    simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
    addrs = addrs + byte_offsets_i;
    if constexpr (sizeof(T) == 1) {
      detail::check_wrregion_params<N * 4, N, /*VS*/ 0, N, 4>();
      // NOTE(review): D appears in its own initializer (the wrregion merges
      // vals into an indeterminate base), and this branch uses simd<T, ...>
      // where the 2-byte branch uses simd<Tx, ...> -- confirm intent against
      // upstream.
      simd<T, N * 4> D = __esimd_wrregion<Tx, N * 4, N, /*VS*/ 0, N, 4>(
          D.data(), vals.data(), 0);
      __esimd_svm_scatter<Tx, N, detail::ElemsPerAddrEncoding<4>(),
                          detail::ElemsPerAddrEncoding<1>()>(
          addrs.data(), D.data(), mask.data());
    } else if constexpr (sizeof(T) == 2) {
      detail::check_wrregion_params<N * 2, N, /*VS*/ 0, N, 2>();
      simd<Tx, N * 2> D = __esimd_wrregion<Tx, N * 2, N, /*VS*/ 0, N, 2>(
          D.data(), vals.data(), 0);
      __esimd_svm_scatter<Tx, N, detail::ElemsPerAddrEncoding<2>(),
                          detail::ElemsPerAddrEncoding<2>()>(
          addrs.data(), D.data(), mask.data());
    } else
      // 4-/8-byte elements are scattered natively.
      __esimd_svm_scatter<Tx, N, detail::ElemsPerAddrEncoding<1>(),
                          detail::ElemsPerAddrEncoding<1>()>(
          addrs.data(), vals.data(), mask.data());
  }
}
727 
// template <typename T, int N, int VS = 1, typename OffsetT,
//           typename PropertyListT = empty_properties_t>
// void scatter(T *p, simd<OffsetT, N / VS> byte_offsets, simd<T, N> vals,
//              PropertyListT props = {}); // (usm-sc-2)
/// USM scatter (usm-sc-2): forwards to (usm-sc-1) with an all-ones mask.
template <typename T, int N, int VS = 1, typename OffsetT,
          typename PropertyListT =
// NOTE(review): default template-argument line missing (extraction artifact).
__ESIMD_API std::enable_if_t<
    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
scatter(T *p, simd<OffsetT, N / VS> byte_offsets, simd<T, N> vals,
        PropertyListT props = {}) {
  simd_mask<N / VS> Mask = 1; // all lanes enabled
  scatter<T, N, VS>(p, byte_offsets, vals, Mask, props);
}
759 
// template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
//           typename PropertyListT = empty_properties_t>
// void scatter(T *p, OffsetSimdViewT byte_offsets, simd<T, N> vals,
//              simd_mask<N / VS> mask, PropertyListT props = {}); // (usm-sc-3)
/// USM scatter (usm-sc-3): simd_view offsets, forwards to (usm-sc-1).
template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
          typename PropertyListT =
// NOTE(review): default template-argument line missing (extraction artifact).
__ESIMD_API std::enable_if_t<
    detail::is_simd_view_type_v<OffsetSimdViewT> &&
    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
scatter(T *p, OffsetSimdViewT byte_offsets, simd<T, N> vals,
        simd_mask<N / VS> mask, PropertyListT props = {}) {
  scatter<T, N, VS>(p, byte_offsets.read(), vals, mask, props);
}
794 
/// USM scatter: simd_view offsets, all lanes enabled.
template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
          typename PropertyListT =
// NOTE(review): default template-argument line missing (extraction artifact).
__ESIMD_API std::enable_if_t<
    detail::is_simd_view_type_v<OffsetSimdViewT> &&
    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
scatter(T *p, OffsetSimdViewT byte_offsets, simd<T, N> vals,
        PropertyListT props = {}) {
  simd_mask<N / VS> Mask = 1; // all lanes enabled
  scatter<T, N, VS>(p, byte_offsets.read(), vals, Mask, props);
}
828 
840 template <typename Tx, int N, typename Toffset>
841 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> && N == 1>
842 scatter(Tx *p, Toffset offset, simd<Tx, N> vals, simd_mask<N> mask = 1) {
843  scatter<Tx, N, 1>(p, simd<Toffset, N>(offset), vals, mask);
844 }
845 
846 namespace detail {
// Accessors may get either 32-bit offset or 64-bit depending on
// the -fsycl-esimd-force-stateless-mem mode setting.
#ifdef __ESIMD_FORCE_STATELESS_MEM
// Stateless mode: offsets are byte offsets into a flat address space.
using DeviceAccessorOffsetT = uint64_t;
#else
// Stateful mode: 32-bit surface-relative offsets.
using DeviceAccessorOffsetT = uint32_t;
#endif
854 
/// Transposed USM block load with predicate and merge (pass_thru) semantics,
/// lowered to the LSC stateless "load merge" intrinsic.
template <typename T, int NElts, typename PropertyListT>
__ESIMD_API std::enable_if_t<is_property_list_v<PropertyListT>, simd<T, NElts>>
block_load_impl(const T *p, simd_mask<1> pred, simd<T, NElts> pass_thru) {
  // Verify input template arguments.
  check_cache_hints<cache_action::load, PropertyListT>();
  constexpr size_t Alignment =
      PropertyListT::template get_property<alignment_key>().value;
  static_assert(
      (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
          (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
      "Incorrect alignment for the data type");

  // How many small (1-/2-byte) elements pack into one 64-/32-bit load unit.
  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
  constexpr int SmallIntFactor32Bit =
      sizeof(uint32_t) / sizeof(T) > 1 ? sizeof(uint32_t) / sizeof(T) : 1;
  static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
                "Number of elements is not supported by Transposed load");

  // If alignment >= 8 and (NElts * sizeof(T)) % 8 == 0) we can load QWORDs.
  // Don't do it for 4-byte vectors (unless it is greater than 256-bytes),
  // because it would require a bit-cast, which is supposed to be NO-OP, but
  // might confuse GPU BE sometimes. 1- and 2-byte vectors are casted anyways.
  constexpr bool Use64BitData =
      Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
      (NElts * sizeof(T)) % sizeof(uint64_t) == 0 &&
      (sizeof(T) != sizeof(uint32_t) || NElts * sizeof(T) > 256);
  constexpr int SmallIntFactor =
      Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
  constexpr int FactoredNElts = NElts / SmallIntFactor;
  check_lsc_vector_size<FactoredNElts>();

  // Prepare template arguments for the call of intrinsic.
  using LoadElemT = __ESIMD_DNS::__raw_t<
      std::conditional_t<SmallIntFactor == 1, T,
                         std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();

  constexpr uint16_t AddressScale = 1;
  constexpr int ImmOffset = 0;
  constexpr lsc_data_size ActualDS =
      Use64BitData ? lsc_data_size::u64 : lsc_data_size::u32;
  constexpr lsc_vector_size VS = to_lsc_vector_size<FactoredNElts>();
  constexpr auto Transposed = lsc_data_order::transpose;
  constexpr int N = 1;

  // Prepare non-template arguments and call the intrinsic.
  simd<uintptr_t, N> Addrs = reinterpret_cast<uintptr_t>(p);
// NOTE(review): a line declaring the widened PassThru vector
// (simd<LoadElemT, FactoredNElts> PassThru =) is missing here (extraction
// artifact).
      pass_thru.template bit_cast_view<LoadElemT>();
// NOTE(review): a line declaring Result (simd<LoadElemT, FactoredNElts>
// Result =) is missing here (extraction artifact).
      __esimd_lsc_load_merge_stateless<LoadElemT, L1H, L2H, AddressScale,
                                       ImmOffset, ActualDS, VS, Transposed, N>(
          pred.data(), Addrs.data(), PassThru.data());
  return Result.template bit_cast_view<T>();
}
942 
/// Accessor-based transposed block load without a merge value: masked-off
/// results are undefined. Forwards to the pointer-based overload in
/// stateless mode, otherwise uses the BTI (surface) LSC intrinsic.
template <typename T, int NElts, typename PropertyListT, typename AccessorT>
__ESIMD_API
    std::enable_if_t<detail::is_device_accessor_with_v<
                         AccessorT, detail::accessor_mode_cap::can_read> &&
                         is_property_list_v<PropertyListT>,
// NOTE(review): the return-type close line (likely simd<T, NElts>>) is
// missing here (extraction artifact).
    block_load_impl(AccessorT acc, DeviceAccessorOffsetT offset,
                    simd_mask<1> pred) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  // Stateless mode: convert the accessor to a USM pointer and delegate.
  simd<T, NElts> PassThru; // Intentionally undefined.
  return block_load_impl<T, NElts, PropertyListT>(
      accessorToPointer<T>(acc, offset), pred, PassThru);
#else  // !__ESIMD_FORCE_STATELESS_MEM
  // Verify input template arguments.
  check_cache_hints<cache_action::load, PropertyListT>();
  constexpr size_t Alignment =
      detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
  static_assert(
      (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
          (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
      "Incorrect alignment for the data type");

  // How many small (1-/2-byte) elements pack into one 64-/32-bit load unit.
  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
  constexpr int SmallIntFactor32Bit =
      sizeof(uint32_t) / sizeof(T) > 1 ? sizeof(uint32_t) / sizeof(T) : 1;
  static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
                "Number of elements is not supported by Transposed load");

  // If alignment >= 8 and (NElts * sizeof(T)) % 8 == 0) we can load QWORDs.
  // Don't do it for 4-byte vectors (unless it is greater than 256-bytes),
  // because it would require a bit-cast, which is supposed to be NO-OP, but
  // might confuse GPU BE sometimes. 1- and 2-byte vectors are casted anyways.
  constexpr bool Use64BitData =
      Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
      (NElts * sizeof(T)) % sizeof(uint64_t) == 0 &&
      (sizeof(T) != sizeof(uint32_t) || NElts * sizeof(T) > 256);
  constexpr int SmallIntFactor =
      Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
  constexpr int FactoredNElts = NElts / SmallIntFactor;
  check_lsc_vector_size<FactoredNElts>();

  // Prepare template arguments for the call of intrinsic.
  using LoadElemT = __ESIMD_DNS::__raw_t<
      std::conditional_t<SmallIntFactor == 1, T,
                         std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
  constexpr uint16_t AddressScale = 1;
  constexpr int ImmOffset = 0;
  constexpr lsc_data_size ActualDS =
      Use64BitData ? lsc_data_size::u64 : lsc_data_size::u32;
  constexpr auto VS = to_lsc_vector_size<FactoredNElts>();
  constexpr auto Transposed = lsc_data_order::transpose;
  constexpr int N = 1;

  // Prepare non-template arguments and call the intrinsic.
  simd<uint32_t, N> Offsets = offset;
  auto SI = get_surface_index(acc);
// NOTE(review): a line declaring Result (simd<LoadElemT, FactoredNElts>
// Result =) is missing here (extraction artifact).
      __esimd_lsc_load_bti<LoadElemT, L1H, L2H, AddressScale, ImmOffset,
                           ActualDS, VS, Transposed, N>(pred.data(),
                                                        Offsets.data(), SI);
  return Result.template bit_cast_view<T>();
#endif // !__ESIMD_FORCE_STATELESS_MEM
}
1040 
/// Accessor-based transposed block load with merge (pass_thru) semantics:
/// masked-off results come from \p pass_thru.
template <typename T, int NElts, typename PropertyListT, typename AccessorT>
__ESIMD_API
    std::enable_if_t<detail::is_device_accessor_with_v<
                         AccessorT, detail::accessor_mode_cap::can_read> &&
                         is_property_list_v<PropertyListT>,
// NOTE(review): the return-type close and the signature line
// (block_load_impl(AccessorT acc, DeviceAccessorOffsetT offset,) are missing
// here (extraction artifact).
                    simd_mask<1> pred, simd<T, NElts> pass_thru) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  // Stateless mode: convert the accessor to a USM pointer and delegate.
  return block_load_impl<T, NElts, PropertyListT>(
      accessorToPointer<T>(acc, offset), pred, pass_thru);
#else  // !__ESIMD_FORCE_STATELESS_MEM
  // Verify input template arguments.
  check_cache_hints<cache_action::load, PropertyListT>();
  constexpr size_t Alignment =
      PropertyListT::template get_property<alignment_key>().value;
  static_assert(
      (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
          (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
      "Incorrect alignment for the data type");

  // How many small (1-/2-byte) elements pack into one 64-/32-bit load unit.
  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
  constexpr int SmallIntFactor32Bit =
      sizeof(uint32_t) / sizeof(T) > 1 ? sizeof(uint32_t) / sizeof(T) : 1;
  static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
                "Number of elements is not supported by Transposed load");

  // If alignment >= 8 and (NElts * sizeof(T)) % 8 == 0) we can load QWORDs.
  // Don't do it for 4-byte vectors (unless it is greater than 256-bytes),
  // because it would require a bit-cast, which is supposed to be NO-OP, but
  // might confuse GPU BE sometimes. 1- and 2-byte vectors are casted anyways.
  constexpr bool Use64BitData =
      Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
      (NElts * sizeof(T)) % sizeof(uint64_t) == 0 &&
      (sizeof(T) != sizeof(uint32_t) || NElts * sizeof(T) > 256);
  constexpr int SmallIntFactor =
      Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
  constexpr int FactoredNElts = NElts / SmallIntFactor;
  check_lsc_vector_size<FactoredNElts>();

  // Prepare template arguments for the call of intrinsic.
  using LoadElemT = __ESIMD_DNS::__raw_t<
      std::conditional_t<SmallIntFactor == 1, T,
                         std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
  constexpr uint16_t AddressScale = 1;
  constexpr int ImmOffset = 0;
  constexpr lsc_data_size ActualDS =
      Use64BitData ? lsc_data_size::u64 : lsc_data_size::u32;
  constexpr auto VS = to_lsc_vector_size<FactoredNElts>();
  constexpr auto Transposed = lsc_data_order::transpose;
  constexpr int N = 1;

  // Prepare non-template arguments and call the intrinsic.
  simd<uint32_t, N> Offsets = offset;
  auto SI = get_surface_index(acc);
// NOTE(review): a line declaring the widened PassThru vector is missing here
// (extraction artifact).
      pass_thru.template bit_cast_view<LoadElemT>();
// NOTE(review): a line declaring Result is missing here (extraction
// artifact).
      __esimd_lsc_load_merge_bti<LoadElemT, L1H, L2H, AddressScale, ImmOffset,
                                 ActualDS, VS, Transposed, N>(
          pred.data(), Offsets.data(), SI, PassThru.data());
  return Result.template bit_cast_view<T>();
#endif // !__ESIMD_FORCE_STATELESS_MEM
}
1140 
/// Transposed USM block store under a scalar predicate, lowered to the LSC
/// stateless store intrinsic.
template <typename T, int NElts, typename PropertyListT>
__ESIMD_API std::enable_if_t<detail::is_property_list_v<PropertyListT>>
// NOTE(review): the signature line (likely block_store_impl(T *p,
// simd<T, NElts> vals, simd_mask<1> pred) {) is missing here (extraction
// artifact) -- confirm against upstream.
  detail::check_cache_hints<cache_action::store, PropertyListT>();
  constexpr size_t Alignment =
      detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
  static_assert(
      (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
          (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
      "Incorrect alignment for the data type");

  // How many small (1-/2-byte) elements pack into one 64-/32-bit store unit.
  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
  constexpr int SmallIntFactor32Bit =
      sizeof(uint32_t) / sizeof(T) > 1 ? sizeof(uint32_t) / sizeof(T) : 1;
  static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
                "Number of elements is not supported by Transposed store");

  // If alignment >= 8 and (NElts * sizeof(T)) % 8 == 0) we can store QWORDs.
  // Don't do it for 4-byte vectors (unless it is greater than 256-bytes),
  // because it would require a bit-cast, which is supposed to be NO-OP, but
  // might confuse GPU BE sometimes. 1- and 2-byte vectors are casted anyways.
  constexpr bool Use64BitData =
      Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
      (NElts * sizeof(T)) % sizeof(uint64_t) == 0 &&
      (sizeof(T) != sizeof(uint32_t) || NElts * sizeof(T) > 256);

  constexpr int SmallIntFactor =
      Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
  constexpr int FactoredNElts = NElts / SmallIntFactor;

  check_lsc_vector_size<FactoredNElts>();

  // Prepare template arguments for the call of intrinsic.
  using StoreType = __ESIMD_DNS::__raw_t<
      std::conditional_t<SmallIntFactor == 1, T,
                         std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
  constexpr uint16_t AddressScale = 1;
  constexpr int ImmOffset = 0;
  constexpr lsc_data_size ActualDS =
      Use64BitData ? lsc_data_size::u64 : lsc_data_size::u32;
  constexpr lsc_vector_size VS = to_lsc_vector_size<FactoredNElts>();
  constexpr auto Transposed = lsc_data_order::transpose;
  constexpr int N = 1;
  simd<uintptr_t, N> Addrs = reinterpret_cast<uintptr_t>(p);

  __esimd_lsc_store_stateless<StoreType, L1H, L2H, AddressScale, ImmOffset,
                              ActualDS, VS, Transposed, N>(
      pred.data(), Addrs.data(),
      sycl::bit_cast<__ESIMD_DNS::vector_type_t<StoreType, FactoredNElts>>(
          vals.data()));
}
1193 
/// Accessor-based transposed block store under a scalar predicate. Forwards
/// to the pointer-based overload in stateless mode, otherwise uses the BTI
/// (surface) LSC store intrinsic.
template <typename T, int NElts, typename PropertyListT, typename AccessorT>
__ESIMD_API
    std::enable_if_t<detail::is_device_accessor_with_v<
                         AccessorT, detail::accessor_mode_cap::can_write> &&
                     detail::is_property_list_v<PropertyListT>>
// NOTE(review): the signature line (likely block_store_impl(AccessorT acc,
// DeviceAccessorOffsetT offset,) is missing here (extraction artifact).
                    simd<T, NElts> vals, simd_mask<1> pred) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  // Stateless mode: convert the accessor to a USM pointer and delegate.
  block_store_impl<T, NElts, PropertyListT>(accessorToPointer<T>(acc, offset),
                                            vals, pred);
#else
  // Verify input template arguments.
  check_cache_hints<cache_action::store, PropertyListT>();
  constexpr size_t Alignment =
      detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
  static_assert(
      (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
          (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
      "Incorrect alignment for the data type");

  // How many small (1-/2-byte) elements pack into one 64-/32-bit store unit.
  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
  constexpr int SmallIntFactor32Bit =
      sizeof(uint32_t) / sizeof(T) > static_cast<size_t>(1)
          ? sizeof(uint32_t) / sizeof(T)
          : static_cast<size_t>(1);
  static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
                "Number of elements is not supported by Transposed store");

  // If alignment >= 8 and (NElts * sizeof(T)) % 8 == 0) we can store QWORDs.
  // Don't do it for 4-byte vectors (unless it is greater than 256-bytes),
  // because it would require a bit-cast, which is supposed to be NO-OP, but
  // might confuse GPU BE sometimes. 1- and 2-byte vectors are casted anyways.
  constexpr bool Use64BitData =
      Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
      (NElts * sizeof(T)) % sizeof(uint64_t) == 0 &&
      (sizeof(T) != sizeof(uint32_t) || NElts * sizeof(T) > 256);
  constexpr int SmallIntFactor =
      Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
  constexpr int FactoredNElts = NElts / SmallIntFactor;
  check_lsc_vector_size<FactoredNElts>();

  // Prepare template arguments for the call of intrinsic.
  using StoreElemT = __ESIMD_DNS::__raw_t<
      std::conditional_t<SmallIntFactor == 1, T,
                         std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
  constexpr uint16_t AddressScale = 1;
  constexpr int ImmOffset = 0;
  constexpr lsc_data_size ActualDS =
      Use64BitData ? lsc_data_size::u64 : lsc_data_size::u32;
  constexpr auto VS = to_lsc_vector_size<FactoredNElts>();
  constexpr auto Transposed = lsc_data_order::transpose;
  constexpr int N = 1;

  // Prepare non-template arguments and call the intrinsic.
  simd<uint32_t, N> Offsets = offset;
  auto SI = get_surface_index(acc);

  __esimd_lsc_store_bti<StoreElemT, L1H, L2H, AddressScale, ImmOffset, ActualDS,
                        VS, Transposed, N>(
      pred.data(), Offsets.data(),
      sycl::bit_cast<__ESIMD_DNS::vector_type_t<StoreElemT, FactoredNElts>>(
          vals.data()),
      SI);
#endif
}
1261 
1262 } // namespace detail
1263 
1278 template <typename Tx, int N,
1280 __ESIMD_API std::enable_if_t<is_simd_flag_type_v<Flags>>
1281 block_store(Tx *addr, simd<Tx, N> vals, Flags) {
1282  using T = typename detail::__raw_t<Tx>;
1283  using VecT = typename simd<T, N>::raw_vector_type;
1284  constexpr size_t Align = Flags::template alignment<simd<T, N>>;
1285  __esimd_svm_block_st<T, N, Align>(reinterpret_cast<VecT *>(addr),
1286  vals.data());
1287 }
1288 
1291 
1302 
1306 
1311 
1317 
1351 template <typename T, int N,
1352  typename PropertyListT =
1354 __ESIMD_API std::enable_if_t<
1355  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
1356 block_load(const T *ptr, PropertyListT props = {}) {
1357  constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
1358  using NewPropertyListT =
1359  detail::add_alignment_property_t<PropertyListT, DefaultAlignment>;
1360  if constexpr (detail::has_cache_hints<PropertyListT>()) {
1361  simd<T, N> PassThru; // Intentionally undefined.
1362  simd_mask<1> Mask = 1;
1363  return detail::block_load_impl<T, N, NewPropertyListT>(ptr, Mask, PassThru);
1364  } else {
1365  constexpr size_t Alignment =
1366  NewPropertyListT::template get_property<alignment_key>().value;
1367  return block_load<T, N>(ptr, overaligned_tag<Alignment>{});
1368  }
1369 }
1370 
1406 template <typename T, int N,
1407  typename PropertyListT =
1409 __ESIMD_API std::enable_if_t<
1410  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
1411 block_load(const T *ptr, size_t byte_offset, PropertyListT props = {}) {
1412  const T *AdjustedPtr = reinterpret_cast<const T *>(
1413  reinterpret_cast<const int8_t *>(ptr) + byte_offset);
1414  return block_load<T, N>(AdjustedPtr, props);
1415 }
1416 
1449 template <typename T, int N,
1450  typename PropertyListT =
1452 __ESIMD_API
1453  std::enable_if_t<detail::is_property_list_v<PropertyListT>, simd<T, N>>
1454  block_load(const T *ptr, simd_mask<1> pred, PropertyListT props = {}) {
1455  constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
1456  using NewPropertyListT =
1457  detail::add_alignment_property_t<PropertyListT, DefaultAlignment>;
1458  simd<T, N> PassThru; // Intentionally uninitialized.
1459  return detail::block_load_impl<T, N, NewPropertyListT>(ptr, pred, PassThru);
1460 }
1461 
1495 template <typename T, int N,
1496  typename PropertyListT =
1498 __ESIMD_API std::enable_if_t<
1499  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
1500 block_load(const T *ptr, size_t byte_offset, simd_mask<1> pred,
1501  PropertyListT props = {}) {
1502  const T *AdjustedPtr = reinterpret_cast<const T *>(
1503  reinterpret_cast<const int8_t *>(ptr) + byte_offset);
1504  return block_load<T, N>(AdjustedPtr, pred, props);
1505 }
1506 
1539 template <typename T, int N,
1540  typename PropertyListT =
1542 __ESIMD_API std::enable_if_t<
1543  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
1544 block_load(const T *ptr, simd_mask<1> pred, simd<T, N> pass_thru,
1545  PropertyListT props = {}) {
1546  constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
1547  using NewPropertyListT =
1548  detail::add_alignment_property_t<PropertyListT, DefaultAlignment>;
1549  return detail::block_load_impl<T, N, NewPropertyListT>(ptr, pred, pass_thru);
1550 }
1551 
1586 template <typename T, int N,
1587  typename PropertyListT =
1589 __ESIMD_API std::enable_if_t<
1590  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
1591 block_load(const T *ptr, size_t byte_offset, simd_mask<1> pred,
1592  simd<T, N> pass_thru, PropertyListT props = {}) {
1593  const T *AdjustedPtr = reinterpret_cast<const T *>(
1594  reinterpret_cast<const int8_t *>(ptr) + byte_offset);
1595  return block_load<T, N>(AdjustedPtr, pred, pass_thru, props);
1596 }
1597 
1613 template <typename Tx, int N,
1615 __ESIMD_API std::enable_if_t<is_simd_flag_type_v<Flags>, simd<Tx, N>>
1616 block_load(const Tx *addr, Flags) {
1617  using T = typename detail::__raw_t<Tx>;
1618  using VecT = typename simd<T, N>::raw_vector_type;
1619  constexpr size_t Align = Flags::template alignment<simd<T, N>>;
1620  return __esimd_svm_block_ld<T, N, Align>(
1621  reinterpret_cast<const VecT *>(addr));
1622 }
1623 
/// Loads a contiguous block of memory from the device accessor \p acc at
/// \p byte_offset and returns it as a simd object. The access alignment is
/// described by the \p Flags tag (defaults to vector_aligned_tag).
template <typename Tx, int N, typename AccessorTy,
          typename Flags = vector_aligned_tag,
          typename = std::enable_if_t<
              is_simd_flag_type_v<Flags> &&
              detail::is_device_accessor_with_v<
                  AccessorTy, detail::accessor_mode_cap::can_read>>,
          class T = detail::__raw_t<Tx>>
__ESIMD_API simd<Tx, N> block_load(AccessorTy acc,
                                   detail::DeviceAccessorOffsetT byte_offset,
                                   Flags flags) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  // Stateless mode: translate the accessor to a raw pointer and delegate
  // to the pointer-based overload.
  return block_load<Tx, N>(__ESIMD_DNS::accessorToPointer<Tx>(acc, byte_offset),
                           flags);
#else
  std::ignore = flags;
  // The legacy oword-block messages used below support only 1, 2, 4 or 8
  // owords per access.
  constexpr unsigned Sz = sizeof(T) * N;
  static_assert(Sz >= detail::OperandSize::OWORD,
                "block size must be at least 1 oword");
  static_assert(Sz % detail::OperandSize::OWORD == 0,
                "block size must be whole number of owords");
  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
                "block must be 1, 2, 4 or 8 owords long");
  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
                "block size must be at most 8 owords");

  auto surf_ind = __esimd_get_surface_index(
      detail::AccessorPrivateProxy::getQualifiedPtrOrImageObj(acc));

  if constexpr (Flags::template alignment<simd<T, N>> >=
                detail::OperandSize::OWORD) {
    // The aligned oword-load intrinsic takes the offset in owords
    // (hence the >> 4).
    return __esimd_oword_ld<T, N>(surf_ind, byte_offset >> 4);
  } else {
    return __esimd_oword_ld_unaligned<T, N>(surf_ind, byte_offset);
  }
#endif
}
1675 
1685 
1689 
1696 
1702 
1740 template <typename T, int N, typename AccessorT,
1741  typename PropertyListT =
1743 __ESIMD_API std::enable_if_t<
1744  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
1745  detail::is_device_accessor_with_v<AccessorT,
1746  detail::accessor_mode_cap::can_read>,
1747  simd<T, N>>
1748 block_load(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset,
1749  PropertyListT props = {}) {
1750 #ifdef __ESIMD_FORCE_STATELESS_MEM
1751  return block_load<T, N>(detail::accessorToPointer<T>(acc, byte_offset),
1752  props);
1753 #else // !__ESIMD_FORCE_STATELESS_MEM
1754  // If the alignment property is not passed, then assume the pointer
1755  // is element-aligned.
1756  constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
1757  constexpr size_t Alignment =
1758  detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
1759 
1760  // Legacy surface index loads must be 1, 2, 4 or 8 owords long.
1761  constexpr size_t Size = sizeof(T) * N;
1762  constexpr size_t OWord = detail::OperandSize::OWORD;
1763  constexpr bool IsLegacySize = Size == OWord || Size == 2 * OWord ||
1764  Size == 4 * OWord || Size == 8 * OWord;
1765 
1766  using NewPropertyListT =
1767  detail::add_alignment_property_t<PropertyListT, DefaultAlignment>;
1768  if constexpr (detail::has_cache_hints<PropertyListT>() || !IsLegacySize) {
1769  return detail::block_load_impl<T, N, NewPropertyListT>(acc, byte_offset,
1770  simd_mask<1>(1));
1771  } else {
1772  constexpr size_t Alignment =
1773  NewPropertyListT::template get_property<alignment_key>().value;
1774  return block_load<T, N>(acc, byte_offset, overaligned_tag<Alignment>{});
1775  }
1776 #endif // !__ESIMD_FORCE_STATELESS_MEM
1777 }
1778 
1808 template <typename T, int N, typename AccessorT,
1809  typename PropertyListT =
1811 __ESIMD_API std::enable_if_t<
1812  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
1813  detail::is_device_accessor_with_v<AccessorT,
1814  detail::accessor_mode_cap::can_read>,
1815  simd<T, N>>
1816 block_load(AccessorT acc, PropertyListT /* props */ = {}) {
1817  // Create new properties without the alignment property passed in 'props',
1818  // and add alignment<16> as it is usable and most favourable in this case.
1819  using NewPropertyListT =
1820  detail::add_or_replace_alignment_property_t<PropertyListT, 16>;
1821  return block_load<T, N>(acc, 0, NewPropertyListT{});
1822 }
1823 
1855 template <typename T, int N, typename AccessorT,
1856  typename PropertyListT =
1858 __ESIMD_API std::enable_if_t<
1859  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
1860  detail::is_device_accessor_with_v<AccessorT,
1861  detail::accessor_mode_cap::can_read>,
1862  simd<T, N>>
1863 block_load(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset,
1864  simd_mask<1> pred, simd<T, N> pass_thru,
1865  PropertyListT /* props */ = {}) {
1866  // If the alignment property is not passed, then assume the byte_offset
1867  // is element-aligned and is at least 4-bytes.
1868  constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
1869  using NewPropertyListT =
1870  detail::add_alignment_property_t<PropertyListT, DefaultAlignment>;
1871  return detail::block_load_impl<T, N, NewPropertyListT>(acc, byte_offset, pred,
1872  pass_thru);
1873 }
1874 
1906 template <typename T, int N, typename AccessorT,
1907  typename PropertyListT =
1909 __ESIMD_API std::enable_if_t<
1910  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
1911  detail::is_device_accessor_with_v<AccessorT,
1912  detail::accessor_mode_cap::can_read>,
1913  simd<T, N>>
1914 block_load(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset,
1915  simd_mask<1> pred, PropertyListT props = {}) {
1916  simd<T, N> PassThru; // Intentionally uninitialized.
1917  return block_load<T, N>(acc, byte_offset, pred, PassThru, props);
1918 }
1919 
1947 template <typename T, int N, typename AccessorT,
1948  typename PropertyListT =
1950 __ESIMD_API std::enable_if_t<
1951  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
1952  detail::is_device_accessor_with_v<AccessorT,
1953  detail::accessor_mode_cap::can_read>,
1954  simd<T, N>>
1955 block_load(AccessorT acc, simd_mask<1> pred, simd<T, N> pass_thru,
1956  PropertyListT /* props */ = {}) {
1957  // Create new properties without the alignment property passed in 'props',
1958  // and add alignment<16> as it is usable and most favourable in this case.
1959  using NewPropertyListT =
1960  detail::add_or_replace_alignment_property_t<PropertyListT, 16>;
1961  return block_load<T, N>(acc, 0, pred, pass_thru, NewPropertyListT{});
1962 }
1963 
1990 template <typename T, int N, typename AccessorT,
1991  typename PropertyListT =
1993 __ESIMD_API std::enable_if_t<
1994  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
1995  detail::is_device_accessor_with_v<AccessorT,
1996  detail::accessor_mode_cap::can_read>,
1997  simd<T, N>>
1998 block_load(AccessorT acc, simd_mask<1> pred, PropertyListT /* props */ = {}) {
1999  // Create new properties without the alignment property passed in 'props',
2000  // and add alignment<16> as it is usable and most favourable in this case.
2001  using NewPropertyListT =
2002  detail::add_or_replace_alignment_property_t<PropertyListT, 16>;
2003  simd<T, N> PassThru; // Intentionally uninitialized.
2004  return block_load<T, N>(acc, 0, pred, PassThru, NewPropertyListT{});
2005 }
2006 
2020 
2023 
2060 template <typename T, int N,
2061  typename PropertyListT =
2063 __ESIMD_API std::enable_if_t<detail::is_property_list_v<PropertyListT>>
2064 block_store(T *ptr, simd<T, N> vals, PropertyListT /* props */ = {}) {
2065  if constexpr (detail::has_cache_hints<PropertyListT>()) {
2066  constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
2067  using NewPropertyListT =
2068  detail::add_alignment_property_t<PropertyListT, DefaultAlignment>;
2069  simd_mask<1> Mask = 1;
2070  detail::block_store_impl<T, N, NewPropertyListT>(ptr, vals, Mask);
2071  } else {
2072  // If the alignment property is not passed, then assume the pointer
2073  // is OWORD-aligned.
2074  constexpr size_t Alignment =
2075  detail::getPropertyValue<PropertyListT, alignment_key>(
2076  detail::OperandSize::OWORD);
2077  block_store<T, N>(ptr, vals, overaligned_tag<Alignment>{});
2078  }
2079 }
2080 
2115 template <typename T, int N,
2116  typename PropertyListT =
2118 __ESIMD_API std::enable_if_t<
2119  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
2120 block_store(T *ptr, size_t byte_offset, simd<T, N> vals,
2121  PropertyListT props = {}) {
2122  T *AdjustedPtr =
2123  reinterpret_cast<T *>(reinterpret_cast<int8_t *>(ptr) + byte_offset);
2124  block_store<T, N>(AdjustedPtr, vals, props);
2125 }
2126 
2159 template <typename T, int N,
2160  typename PropertyListT =
2162 __ESIMD_API std::enable_if_t<detail::is_property_list_v<PropertyListT>>
2164  PropertyListT /* props */ = {}) {
2165  constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
2166  using NewPropertyListT =
2167  detail::add_alignment_property_t<PropertyListT, DefaultAlignment>;
2168  detail::block_store_impl<T, N, NewPropertyListT>(ptr, vals, pred);
2169 }
2170 
2191 // the minimally required element-size alignment otherwise.
2207 template <typename T, int N,
2208  typename PropertyListT =
2210 __ESIMD_API std::enable_if_t<
2211  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
2212 block_store(T *ptr, size_t byte_offset, simd<T, N> vals, simd_mask<1> pred,
2213  PropertyListT props = {}) {
2214  T *AdjustedPtr =
2215  reinterpret_cast<T *>(reinterpret_cast<int8_t *>(ptr) + byte_offset);
2216  block_store<T, N>(AdjustedPtr, vals, pred, props);
2217 }
2218 
2227 
2230 
2234 
2237 
2281 template <typename T, int N, typename AccessorT,
2282  typename PropertyListT =
2284 __ESIMD_API std::enable_if_t<
2285  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
2286  detail::is_device_accessor_with_v<AccessorT,
2287  detail::accessor_mode_cap::can_write>>
2288 block_store(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset,
2289  simd<T, N> vals, PropertyListT props = {}) {
2290 #ifdef __ESIMD_FORCE_STATELESS_MEM
2291  block_store<T, N>(detail::accessorToPointer<T>(acc, byte_offset), vals,
2292  props);
2293 #else
2294  constexpr int DefaultLSCAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
2295  constexpr size_t Alignment =
2296  detail::getPropertyValue<PropertyListT, alignment_key>(
2297  DefaultLSCAlignment);
2298  constexpr bool AlignmentRequiresLSC =
2299  PropertyListT::template has_property<alignment_key>() && Alignment < 16;
2300  using Tx = detail::__raw_t<T>;
2301  constexpr unsigned Sz = sizeof(Tx) * N;
2302  constexpr bool SzRequiresLSC =
2303  Sz < detail::OperandSize::OWORD || Sz % detail::OperandSize::OWORD != 0 ||
2304  !detail::isPowerOf2(Sz / detail::OperandSize::OWORD) ||
2305  Sz > 8 * detail::OperandSize::OWORD;
2306  if constexpr (detail::has_cache_hints<PropertyListT>() ||
2307  AlignmentRequiresLSC || SzRequiresLSC) {
2308  using NewPropertyListT =
2309  detail::add_alignment_property_t<PropertyListT, DefaultLSCAlignment>;
2310  simd_mask<1> Mask = 1;
2311  detail::block_store_impl<T, N, NewPropertyListT>(acc, byte_offset, vals,
2312  Mask);
2313  } else {
2314  auto surf_ind = __esimd_get_surface_index(
2315  detail::AccessorPrivateProxy::getQualifiedPtrOrImageObj(acc));
2316  __esimd_oword_st<Tx, N>(surf_ind, byte_offset >> 4, vals.data());
2317  }
2318 #endif
2319 }
2320 
2350 template <typename T, int N, typename AccessorT,
2351  typename PropertyListT =
2353 __ESIMD_API std::enable_if_t<
2354  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
2355  detail::is_device_accessor_with_v<AccessorT,
2356  detail::accessor_mode_cap::can_write>>
2357 block_store(AccessorT acc, simd<T, N> vals, PropertyListT props = {}) {
2358  // Create new properties without the alignment property passed in 'props',
2359  // and add alignment<16> as it is usable and most favourable in this case.
2360  using NewPropertyListT =
2361  detail::add_or_replace_alignment_property_t<PropertyListT, 16>;
2362  block_store<T, N>(acc, 0, vals, NewPropertyListT{});
2363 }
2364 
2396 template <typename T, int N, typename AccessorT,
2397  typename PropertyListT =
2399 __ESIMD_API std::enable_if_t<
2400  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
2401  detail::is_device_accessor_with_v<AccessorT,
2402  detail::accessor_mode_cap::can_write>>
2403 block_store(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset,
2404  simd<T, N> vals, simd_mask<1> pred, PropertyListT props = {}) {
2405  constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
2406  using NewPropertyListT =
2407  detail::add_alignment_property_t<PropertyListT, DefaultAlignment>;
2408  detail::block_store_impl<T, N, NewPropertyListT>(acc, byte_offset, vals,
2409  pred);
2410 }
2411 
2436 template <typename T, int N, typename AccessorT,
2437  typename PropertyListT =
2439 __ESIMD_API std::enable_if_t<
2440  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
2441  detail::is_device_accessor_with_v<AccessorT,
2442  detail::accessor_mode_cap::can_write>>
2443 block_store(AccessorT acc, simd<T, N> vals, simd_mask<1> pred,
2444  PropertyListT props = {}) {
2445  // Create new properties without the alignment property passed in 'props',
2446  // and add alignment<16> as it is usable and most favourable in this case.
2447  using NewPropertyListT =
2448  detail::add_or_replace_alignment_property_t<PropertyListT, 16>;
2449  block_store<T, N>(acc, 0, vals, pred, NewPropertyListT{});
2450 }
2451 
2453 
2455 
2457 
2458 // Implementations of accessor-based gather and scatter functions
2459 namespace detail {
// Scatters the elements of \p vals to the surface backed by \p acc (or to
// SLM when AccessorTy is LocalAccessorMarker) at byte addresses
// (glob_offset + offsets[i]), under the per-lane predicate \p mask.
template <typename T, int N, typename AccessorTy>
ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t<
    std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
    is_accessor_with_v<AccessorTy, detail::accessor_mode_cap::can_write>>
scatter_impl(AccessorTy acc, simd<T, N> vals, simd<uint32_t, N> offsets,
             uint32_t glob_offset, simd_mask<N> mask) {

  static_assert(detail::isPowerOf2(N, 32), "Unexpected vector length");
  if constexpr (sizeof(T) == 8) {
    // 8-byte elements: split each element into its low and high 32-bit
    // halves and issue two 32-bit scatters (the second at +4 bytes).
    scatter_impl<uint32_t, N>(
        acc, vals.template bit_cast_view<uint32_t>().template select<N, 2>(0),
        offsets, glob_offset, mask);
    scatter_impl<uint32_t, N>(
        acc, vals.template bit_cast_view<uint32_t>().template select<N, 2>(1),
        offsets, glob_offset + sizeof(uint32_t), mask);
  } else {
    constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
    // TODO (performance) use hardware-supported scale once BE supports it
    constexpr int16_t scale = 0;
    const auto si = __ESIMD_NS::get_surface_index(acc);

    if constexpr (sizeof(T) < 4) {
      // 1- and 2-byte elements: reinterpret as integers of the same size,
      // then widen to 32 bits as required by the scaled-scatter message.
      using Tint = std::conditional_t<std::is_integral_v<T>, T,
                                      detail::uint_type_t<sizeof(T)>>;
      using Treal = __raw_t<T>;
      simd<Tint, N> vals_int = bitcast<Tint, Treal, N>(std::move(vals).data());
      using PromoT = typename std::conditional_t<std::is_signed<Tint>::value,
                                                 int32_t, uint32_t>;
      const simd<PromoT, N> promo_vals = convert<PromoT>(std::move(vals_int));
      __esimd_scatter_scaled<PromoT, N, decltype(si), TypeSizeLog2, scale>(
          mask.data(), si, glob_offset, offsets.data(), promo_vals.data());
    } else {
      using Treal = __raw_t<T>;
      if constexpr (!std::is_same_v<Treal, T>) {
        // Wrapper types (raw type differs from T): store the raw
        // representation.
        simd<Treal, N> Values = vals.template bit_cast_view<Treal>();
        __esimd_scatter_scaled<Treal, N, decltype(si), TypeSizeLog2, scale>(
            mask.data(), si, glob_offset, offsets.data(), Values.data());
      } else {
        __esimd_scatter_scaled<T, N, decltype(si), TypeSizeLog2, scale>(
            mask.data(), si, glob_offset, offsets.data(), vals.data());
      }
    }
  }
}
2504 
2505 #ifndef __ESIMD_FORCE_STATELESS_MEM
2523 template <typename T, int NElts, lsc_data_size DS, typename PropertyListT,
2524  int N, typename AccessorTy, typename OffsetT>
2525 __ESIMD_API std::enable_if_t<
2526  is_device_accessor_with_v<AccessorTy, accessor_mode_cap::can_write>>
2527 scatter_impl(AccessorTy acc, simd<OffsetT, N> offsets, simd<T, N * NElts> vals,
2528  simd_mask<N> pred) {
2529  static_assert(std::is_integral_v<OffsetT>,
2530  "Scatter must have integral byte_offset type");
2531  static_assert(sizeof(OffsetT) <= 4,
2532  "Implicit truncation of 64-bit byte_offset to 32-bit is "
2533  "disabled. Use -fsycl-esimd-force-stateless-mem or explicitly "
2534  "convert offsets to a 32-bit vector");
2535  check_lsc_vector_size<NElts>();
2536  check_lsc_data_size<T, DS>();
2537  check_cache_hints<cache_action::store, PropertyListT>();
2538  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
2539  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
2540  constexpr uint16_t AddressScale = 1;
2541  constexpr int ImmOffset = 0;
2542  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
2543  constexpr lsc_vector_size LSCNElts = to_lsc_vector_size<NElts>();
2544  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
2545  using MsgT = typename lsc_expand_type<T>::type;
2546  simd<MsgT, N * NElts> Tmp = lsc_format_input<MsgT, T>(vals);
2547  simd<uint32_t, N> ByteOffsets32 = convert<uint32_t>(offsets);
2548  auto si = get_surface_index(acc);
2549  __esimd_lsc_store_bti<MsgT, L1H, L2H, AddressScale, ImmOffset, EDS, LSCNElts,
2550  Transposed, N>(pred.data(), ByteOffsets32.data(),
2551  Tmp.data(), si);
2552 }
2553 #endif // __ESIMD_FORCE_STATELESS_MEM
2554 
// Gathers N elements of type T from the surface backed by \p acc (or from
// SLM when AccessorTy is LocalAccessorMarker) at byte addresses
// (glob_offset + offsets[i]), under the per-lane predicate \p mask.
template <typename T, int N, typename AccessorTy>
__ESIMD_API std::enable_if_t<
    (std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
     is_accessor_with_v<AccessorTy, detail::accessor_mode_cap::can_read>),
    simd<T, N>>
gather_impl(AccessorTy acc, simd<uint32_t, N> offsets, uint32_t glob_offset,
            simd_mask<N> mask) {
  static_assert(detail::isPowerOf2(N, 32), "Unexpected vector length");

  if constexpr (sizeof(T) == 8) {
    // 8-byte elements: gather the low and high 32-bit halves with two
    // 32-bit gathers (the second at +4 bytes) and recombine bitwise.
    simd<T, N> Res;
    Res.template bit_cast_view<uint32_t>().template select<N, 2>(0) =
        gather_impl<uint32_t, N>(acc, offsets, glob_offset, mask);
    Res.template bit_cast_view<uint32_t>().template select<N, 2>(1) =
        gather_impl<uint32_t, N>(acc, offsets, glob_offset + sizeof(uint32_t),
                                 mask);
    return Res;
  } else {
    using Treal = __raw_t<T>;
    constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
    // TODO (performance) use hardware-supported scale once BE supports it
    constexpr uint32_t scale = 0;
    const auto si = get_surface_index(acc);
    if constexpr (sizeof(T) < 4) {
      // 1- and 2-byte elements are gathered through a 32-bit message type
      // matching T's signedness, then converted back down.
      using Tint = std::conditional_t<std::is_integral_v<T>, T,
                                      detail::uint_type_t<sizeof(T)>>;

      static_assert(std::is_integral<Tint>::value,
                    "only integral 1- & 2-byte types are supported");
      using PromoT = typename std::conditional_t<std::is_signed<Tint>::value,
                                                 int32_t, uint32_t>;
      simd<PromoT, N> promo_vals =
          __esimd_gather_masked_scaled2<PromoT, N, decltype(si), TypeSizeLog2,
                                        scale>(si, glob_offset, offsets.data(),
                                               mask.data());
      auto Res = convert<Tint>(promo_vals);

      if constexpr (!std::is_same_v<Tint, T>) {
        // Wrapper types: reinterpret the integer bits as the raw type.
        return detail::bitcast<Treal, Tint, N>(Res.data());
      } else {
        return Res;
      }
    } else {
      simd<Treal, N> Res = __esimd_gather_masked_scaled2<Treal, N, decltype(si),
                                                         TypeSizeLog2, scale>(
          si, glob_offset, offsets.data(), mask.data());
      if constexpr (!std::is_same_v<Treal, T>) {
        return Res.template bit_cast_view<T>();
      } else {
        return Res;
      }
    }
  }
}
2609 
2610 #ifndef __ESIMD_FORCE_STATELESS_MEM
2611 template <typename T, int N, int VS, typename PropertyListT, lsc_data_size DS,
2612  typename OffsetT, typename AccessorT>
2613 __ESIMD_API std::enable_if_t<
2614  is_device_accessor_with_v<AccessorT, accessor_mode_cap::can_read>,
2615  simd<T, N>>
2616 gather_impl(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
2617  simd_mask<N / VS> pred, simd<T, N> pass_thru) {
2618  static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS");
2619  static_assert(std::is_integral_v<OffsetT>,
2620  "Gather must have integral byte_offset type");
2621  static_assert(sizeof(OffsetT) <= 4,
2622  "Implicit truncation of 64-bit byte_offset to 32-bit is "
2623  "disabled. Use -fsycl-esimd-force-stateless-mem or explicitly "
2624  "convert offsets to a 32-bit vector");
2625  static_assert(VS == 1 || sizeof(T) >= 4,
2626  "VS > 1 is supprted only for 4- and 8-byte elements");
2627  check_lsc_vector_size<VS>();
2628  check_lsc_data_size<T, DS>();
2629  check_cache_hints<cache_action::load, PropertyListT>();
2630  constexpr uint16_t AddressScale = 1;
2631  constexpr int ImmOffset = 0;
2632  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
2633  constexpr lsc_vector_size LSCVS = to_lsc_vector_size<VS>();
2634  constexpr auto Transposed = lsc_data_order::nontranspose;
2635  using MsgT = typename lsc_expand_type<T>::type;
2636  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
2637  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
2638  auto SI = get_surface_index(acc);
2639  simd<uint32_t, N / VS> ByteOffsets32 = convert<uint32_t>(byte_offsets);
2640  simd<MsgT, N> PassThruExpanded = lsc_format_input<MsgT>(pass_thru);
2641  simd<MsgT, N> Result =
2642  __esimd_lsc_load_merge_bti<MsgT, L1H, L2H, AddressScale, ImmOffset, EDS,
2643  LSCVS, Transposed, N / VS>(
2644  pred.data(), ByteOffsets32.data(), SI, PassThruExpanded.data());
2645  return lsc_format_ret<T>(Result);
2646 }
2647 #endif // __ESIMD_FORCE_STATELESS_MEM
2648 
2666 template <typename T, int NElts, lsc_data_size DS, int N>
2667 __ESIMD_API simd<T, N * NElts> slm_gather_impl(simd<uint32_t, N> offsets,
2668  simd_mask<N> pred,
2669  simd<T, N * NElts> pass_thru) {
2670  check_lsc_vector_size<NElts>();
2671  check_lsc_data_size<T, DS>();
2672  constexpr uint16_t AddressScale = 1;
2673  constexpr int ImmOffset = 0;
2674  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
2675  constexpr lsc_vector_size LSCVS = to_lsc_vector_size<NElts>();
2676  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
2677  using MsgT = typename lsc_expand_type<T>::type;
2678  simd<MsgT, N * NElts> PassThruExpanded = lsc_format_input<MsgT>(pass_thru);
2679  simd<MsgT, N * NElts> Result =
2680  __esimd_lsc_load_merge_slm<MsgT, cache_hint::none, cache_hint::none,
2681  AddressScale, ImmOffset, EDS, LSCVS,
2682  Transposed, N>(pred.data(), offsets.data(),
2683  PassThruExpanded.data());
2684  return lsc_format_ret<T>(Result);
2685 }
2686 
2701 template <typename T, int NElts, lsc_data_size DS, int N>
2702 __ESIMD_API void slm_scatter_impl(simd<uint32_t, N> offsets,
2703  simd<T, N * NElts> vals, simd_mask<N> pred) {
2704  check_lsc_vector_size<NElts>();
2705  check_lsc_data_size<T, DS>();
2706  constexpr uint16_t AddressScale = 1;
2707  constexpr int ImmOffset = 0;
2708  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
2709  constexpr lsc_vector_size LSCVS = to_lsc_vector_size<NElts>();
2710  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
2711  using MsgT = typename lsc_expand_type<T>::type;
2712  simd<MsgT, N * NElts> Tmp = lsc_format_input<MsgT, T>(vals);
2713  __esimd_lsc_store_slm<MsgT, cache_hint::none, cache_hint::none, AddressScale,
2714  ImmOffset, EDS, LSCVS, Transposed, N>(
2715  pred.data(), offsets.data(), Tmp.data());
2716 }
2717 
2733 template <typename T, int NElts, lsc_data_size DS, typename PropertyListT,
2734  int N, typename Toffset>
2735 __ESIMD_API void prefetch_impl(const T *p, simd<Toffset, N> byte_offsets,
2736  simd_mask<N> pred) {
2737  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
2738  check_lsc_vector_size<NElts>();
2739  check_lsc_data_size<T, DS>();
2740  check_cache_hints<cache_action::prefetch, PropertyListT>();
2741  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
2742  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
2743  constexpr uint16_t AddressScale = 1;
2744  constexpr int ImmOffset = 0;
2745  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
2746  constexpr lsc_vector_size LSCVS = to_lsc_vector_size<NElts>();
2747  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
2748  using MsgT = typename lsc_expand_type<T>::type;
2749  simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
2750  addrs += convert<uintptr_t>(byte_offsets);
2751  __esimd_lsc_prefetch_stateless<MsgT, L1H, L2H, AddressScale, ImmOffset, EDS,
2752  LSCVS, Transposed, N>(pred.data(),
2753  addrs.data());
2754 }
2755 
// Transposed (block) prefetch of NElts consecutive elements starting at
// (p + offset). If \p pred is 0 the prefetch is not performed. Elements
// smaller than a DWORD are grouped into whole DWORDs/QWORDs because the
// transposed LSC message supports only u32/u64 data.
template <typename T, int NElts, lsc_data_size DS, typename PropertyListT,
          typename Toffset>
__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>>
prefetch_impl(const T *p, Toffset offset, simd_mask<1> pred) {
  check_lsc_data_size<T, DS>();
  check_cache_hints<cache_action::prefetch, PropertyListT>();

  // The effective alignment: explicit property value, or sizeof(T).
  constexpr size_t Alignment =
      detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
  static_assert(
      (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
          (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
      "Incorrect alignment for the data type");

  // Number of T elements packing into one QWORD / one DWORD respectively.
  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
  constexpr int SmallIntFactor32Bit =
      sizeof(uint32_t) / sizeof(T) > 1 ? sizeof(uint32_t) / sizeof(T) : 1;
  static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
                "Number of elements is not supported by Transposed load");

  // If alignment >= 8 and (NElts * sizeof(T)) % 8 == 0) we can prefetch QWORDs.
  // Don't do it for 4-byte vectors (unless it is greater than 256-bytes),
  // because it would require a bit-cast, which is supposed to be NO-OP, but
  // might confuse GPU BE sometimes. 1- and 2-byte vectors are casted anyways.
  constexpr bool Use64BitData =
      Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
      (NElts * sizeof(T)) % sizeof(uint64_t) == 0 &&
      (sizeof(T) != sizeof(uint32_t) || NElts * sizeof(T) > 256);
  constexpr int SmallIntFactor =
      Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
  constexpr int FactoredNElts = NElts / SmallIntFactor;
  check_lsc_vector_size<FactoredNElts>();

  // Prepare template arguments for the call of intrinsic.
  using LoadElemT = __ESIMD_DNS::__raw_t<
      std::conditional_t<SmallIntFactor == 1, T,
                         std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;

  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
  constexpr uint16_t AddressScale = 1;
  constexpr int ImmOffset = 0;
  constexpr lsc_data_size EDS = finalize_data_size<LoadElemT, DS>();

  static_assert(
      EDS == lsc_data_size::u32 || EDS == lsc_data_size::u64,
      "Transposed prefetch is supported only for data size u32 or u64");
  constexpr lsc_vector_size LSCVS = to_lsc_vector_size<FactoredNElts>();
  constexpr lsc_data_order Transposed = lsc_data_order::transpose;
  constexpr int N = 1;

  // Single scalar address: base pointer plus the byte offset.
  simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p) + offset;
  __esimd_lsc_prefetch_stateless<LoadElemT, L1H, L2H, AddressScale, ImmOffset,
                                 EDS, LSCVS, Transposed, N>(pred.data(),
                                                            addrs.data());
}
2812 
2813 #ifndef __ESIMD_FORCE_STATELESS_MEM
2831 
2832 template <typename T, int NElts, lsc_data_size DS, typename PropertyListT,
2833  int N, typename AccessorTy, typename OffsetT>
2834 __ESIMD_API std::enable_if_t<
2835  is_device_accessor_with_v<AccessorTy, accessor_mode_cap::can_read>>
2836 prefetch_impl(AccessorTy acc, simd<OffsetT, N> byte_offsets,
2837  simd_mask<N> pred) {
2838  static_assert(std::is_integral_v<OffsetT>,
2839  "Prefetch must have integral byte_offset type");
2840  static_assert(sizeof(OffsetT) <= 4,
2841  "Implicit truncation of 64-bit byte_offset to 32-bit is "
2842  "disabled. Use -fsycl-esimd-force-stateless-mem or explicitly "
2843  "convert offsets to a 32-bit vector");
2844  check_lsc_vector_size<NElts>();
2845  check_lsc_data_size<T, DS>();
2846  check_cache_hints<cache_action::prefetch, PropertyListT>();
2847  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
2848  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
2849  constexpr uint16_t AddressScale = 1;
2850  constexpr int ImmOffset = 0;
2851  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
2852  constexpr lsc_vector_size LSCVS = to_lsc_vector_size<NElts>();
2853  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
2854  using MsgT = typename lsc_expand_type<T>::type;
2855  simd<uint32_t, N> ByteOffsets32 = convert<uint32_t>(byte_offsets);
2856  auto SI = get_surface_index(acc);
2857  __esimd_lsc_prefetch_bti<MsgT, L1H, L2H, AddressScale, ImmOffset, EDS, LSCVS,
2858  Transposed, N>(pred.data(), ByteOffsets32.data(),
2859  SI);
2860 }
2861 
2879 template <typename T, int NElts, lsc_data_size DS, typename PropertyListT,
2880  typename AccessorTy, typename OffsetT>
2881 __ESIMD_API std::enable_if_t<
2882  std::is_integral_v<OffsetT> &&
2883  is_device_accessor_with_v<AccessorTy, accessor_mode_cap::can_read>>
2884 prefetch_impl(AccessorTy acc, OffsetT byte_offset, simd_mask<1> pred) {
2885  static_assert(sizeof(OffsetT) <= 4,
2886  "Implicit truncation of 64-bit byte_offset to 32-bit is "
2887  "disabled. Use -fsycl-esimd-force-stateless-mem or explicitly "
2888  "convert offsets to a 32-bit vector");
2889  check_lsc_data_size<T, DS>();
2890  check_cache_hints<cache_action::prefetch, PropertyListT>();
2891 
2892  constexpr size_t Alignment =
2893  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
2894 
2895  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
2896  constexpr int SmallIntFactor32Bit =
2897  sizeof(uint32_t) / sizeof(T) > 1 ? sizeof(uint32_t) / sizeof(T) : 1;
2898  static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
2899  "Number of elements is not supported by Transposed load");
2900 
2901  // If alignment >= 8 and (NElts * sizeof(T)) % 8 == 0) we can load QWORDs.
2902  // Don't do it for 4-byte vectors (unless it is greater than 256-bytes),
2903  // because it would require a bit-cast, which is supposed to be NO-OP, but
2904  // might confuse GPU BE sometimes. 1- and 2-byte vectors are casted anyways.
2905  constexpr bool Use64BitData =
2906  Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
2907  (NElts * sizeof(T)) % sizeof(uint64_t) == 0 &&
2908  (sizeof(T) != sizeof(uint32_t) || NElts * sizeof(T) > 256);
2909  constexpr int SmallIntFactor =
2910  Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
2911  constexpr int FactoredNElts = NElts / SmallIntFactor;
2912  check_lsc_vector_size<FactoredNElts>();
2913 
2914  // Prepare template arguments for the call of intrinsic.
2915  using LoadElemT = __ESIMD_DNS::__raw_t<
2916  std::conditional_t<SmallIntFactor == 1, T,
2917  std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
2918 
2919  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
2920  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
2921  constexpr uint16_t AddressScale = 1;
2922  constexpr int ImmOffset = 0;
2923  constexpr lsc_data_size EDS = finalize_data_size<LoadElemT, DS>();
2924 
2925  static_assert(
2926  EDS == lsc_data_size::u32 || EDS == lsc_data_size::u64,
2927  "Transposed prefetch is supported only for data size u32 or u64");
2928  constexpr lsc_vector_size LSCVS = to_lsc_vector_size<FactoredNElts>();
2929  constexpr lsc_data_order Transposed = lsc_data_order::transpose;
2930  constexpr int N = 1;
2931 
2932  simd<uint32_t, N> offsets = byte_offset;
2933  auto SI = get_surface_index(acc);
2934  __esimd_lsc_prefetch_bti<LoadElemT, L1H, L2H, AddressScale, ImmOffset, EDS,
2935  LSCVS, Transposed, N>(pred.data(), offsets.data(),
2936  SI);
2937 }
2938 #endif // __ESIMD_FORCE_STATELESS_MEM
2939 
2940 // Compute the data size for 2d block load or store.
2941 template <typename T, int NBlocks, int Height, int Width, bool Transposed,
2942  bool Transformed>
2943 constexpr int get_lsc_block_2d_data_size() {
2944  if constexpr (Transformed)
2945  return roundUpNextMultiple<Height, 4 / sizeof(T)>() *
2946  getNextPowerOf2<Width>() * NBlocks;
2947  return Width * Height * NBlocks;
2948 }
2949 
// Overridable knob: scales the maximum DWORD-element block width accepted
// for transposed 2D operations (used in check_lsc_block_2d_restrictions).
 2950 #ifndef __ESIMD_DWORD_BLOCK_2D_WIDTH_SCALE
 2951 #define __ESIMD_DWORD_BLOCK_2D_WIDTH_SCALE (1)
 2952 #endif
 2953 
// Overridable compile-time check of the (width * nblocks * elem-size)
// byte budget for non-transposed/non-transformed 2D operations.
 2954 #ifndef __ESIMD_BLOCK_2D_WIDTH_CHECK
 2955 #define __ESIMD_BLOCK_2D_WIDTH_CHECK(OP, BLOCK_WIDTH, NBLOCKS, SIZE)          \
 2956   static_assert((BLOCK_WIDTH) * (NBLOCKS) * (SIZE) <= 64,                      \
 2957                 "Unsupported block width");
 2958 #endif
 2959 
// Discriminates which 2D block operation the restriction checks apply to.
 2960 enum class block_2d_op { prefetch, load, store };
2961 
2962 // Compile-time checks for lsc_load_2d/prefetch_2d/store_2d restrictions.
// Validates at compile time that the block shape requested for a 2D LSC
// load/prefetch/store is supported; each static_assert encodes a documented
// hardware restriction for the corresponding operation/layout combination.
 2963 template <typename T, int BlockWidth, int BlockHeight, int NBlocks,
 2964           bool Transposed, bool Transformed, block_2d_op Op>
 2965 constexpr void check_lsc_block_2d_restrictions() {
  // Total payload size in bytes for the whole (possibly multi-block) access.
 2966   constexpr int GRFByteSize = BlockWidth * BlockHeight * NBlocks * sizeof(T);
 2967   static_assert(BlockWidth > 0, "Block width must be positive");
 2968   static_assert(BlockHeight > 0, "Block height must be positive");
 2969   // Restrictions based on documentation.
 2970   if constexpr (Op == block_2d_op::store)
 2971     static_assert(GRFByteSize <= 512, "2D store supports 512 bytes max");
 2972   else
 2973     static_assert(GRFByteSize <= 2048,
 2974                   "2D load/prefetch supports 2048 bytes max");
 2975   static_assert(!Transposed || !Transformed,
 2976                 "Transposed and transformed is not supported");
 2977   static_assert((sizeof(T) * BlockWidth) % 4 == 0,
 2978                 "Block width must be aligned by DW");
  // Transposed layout: single block, u32/u64 elements only.
 2979   if constexpr (Transposed) {
 2980     static_assert(NBlocks == 1, "Transposed expected to be 1 block only");
 2981     static_assert(sizeof(T) == 4 || sizeof(T) == 8,
 2982                   "Transposed load is supported only for data size u32 or u64");
 2983     static_assert(sizeof(T) == 8 ? BlockHeight == 8
 2984                                  : BlockHeight >= 1 && BlockHeight <= 32,
 2985                   "Unsupported block height");
 2986     static_assert(sizeof(T) == 8
 2987                       ? __ESIMD_DNS::isPowerOf2(BlockWidth, 4)
 2988                       : BlockWidth >= 1 &&
 2989                             BlockWidth <=
 2990                                 8 * __ESIMD_DWORD_BLOCK_2D_WIDTH_SCALE,
 2991                   "Unsupported block width");
  // Transformed (VNNI) layout: u8/u16 elements only.
 2992   } else if constexpr (Transformed) {
 2993     static_assert(sizeof(T) == 1 || sizeof(T) == 2,
 2994                   "VNNI transform is supported only for data size u8 or u16");
 2995     static_assert(__ESIMD_DNS::isPowerOf2(NBlocks, 4),
 2996                   "Unsupported number of blocks");
 2997     static_assert(BlockHeight * sizeof(T) >= 4 && BlockHeight <= 32,
 2998                   "Unsupported block height");
 2999     static_assert(BlockWidth * sizeof(T) >= 4 && BlockWidth <= 16 &&
 3000                       BlockWidth * NBlocks * sizeof(T) <= 64,
 3001                   "Unsupported block width");
  // Plain (non-transposed, non-transformed) layout.
 3002   } else {
 3003     if constexpr (Op == block_2d_op::store) {
 3004       static_assert(NBlocks == 1, "Unsupported number of blocks for 2D store");
 3005       static_assert(BlockHeight <= 8, "Unsupported block height for store");
 3006     } else {
 3007       static_assert(
 3008           __ESIMD_DNS::isPowerOf2(NBlocks, sizeof(T) == 1 ? 4 : 8 / sizeof(T)),
 3009           "Unsupported number of blocks for 2D load/prefetch");
 3010       static_assert(BlockHeight <= 32, "Unsupported block height for load");
 3011     }
 3012     static_assert(BlockWidth * sizeof(T) >= 4, "Unsupported block width");
 3013     __ESIMD_BLOCK_2D_WIDTH_CHECK(Op, BlockWidth, NBlocks, sizeof(T));
 3014   }
 3015 }
3016 #undef __ESIMD_DWORD_BLOCK_2D_WIDTH_SCALE
3017 #undef __ESIMD_BLOCK_2D_WIDTH_CHECK
3018 
// Loads a 2D block (or NBlocks of them) of the matrix described by
// SurfaceWidth/SurfaceHeight/SurfacePitch starting at element (X, Y),
// optionally transposing or VNNI-transforming it. When the hardware pads
// rows/blocks, the padding is stripped before returning the result.
 3049 template <
 3050     typename T, int BlockWidth, int BlockHeight, int NBlocks, bool Transposed,
 3051     bool Transformed, typename PropertyListT,
 3052     int N = get_lsc_block_2d_data_size<__raw_t<T>, NBlocks, BlockHeight,
 3053                                        BlockWidth, Transposed, Transformed>()>
 3054 __ESIMD_API simd<T, N> load_2d_impl(const T *Ptr, unsigned SurfaceWidth,
 3055                                     unsigned SurfaceHeight,
 3056                                     unsigned SurfacePitch, int X, int Y) {
 3057 
 3058   check_cache_hints<cache_action::load, PropertyListT>();
 3059   constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
 3060   constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
 3061   using RawT = __raw_t<T>;
 3062   check_lsc_block_2d_restrictions<RawT, BlockWidth, BlockHeight, NBlocks,
 3063                                   Transposed, Transformed, block_2d_op::load>();
 3064   // For Load BlockWidth is padded up to the next power-of-two value.
 3065   // For Load with Transpose the pre-operation BlockHeight is padded up
 3066   // to the next power-of-two value.
 3067   // For Load with Transform pre-operation BlockHeight is padded up to
 3068   // multiple of K, where K = 4B / sizeof(T).
 3069   constexpr int ElemsPerDword = 4 / sizeof(RawT);
 3070   constexpr int GRFRowSize = Transposed    ? BlockHeight
 3071                              : Transformed ? BlockWidth * ElemsPerDword
 3072                                            : BlockWidth;
 3073   constexpr int GRFRowPitch = getNextPowerOf2<GRFRowSize>();
 3074   constexpr int GRFColSize =
 3075       Transposed
 3076           ? BlockWidth
 3077           : (Transformed ? (BlockHeight + ElemsPerDword - 1) / ElemsPerDword
 3078                          : BlockHeight);
 3079   constexpr int GRFBlockSize = GRFRowPitch * GRFColSize;
  // Each block starts on a GRF (64-byte) boundary.
 3080   constexpr int GRFBlockPitch =
 3081       roundUpNextMultiple<64 / sizeof(RawT), GRFBlockSize>();
 3082   constexpr int ActualN = NBlocks * GRFBlockPitch;
 3083 
  // DstElements is the element count of the padding-free result.
 3084   constexpr int DstBlockElements = GRFColSize * GRFRowSize;
 3085   constexpr int DstElements = DstBlockElements * NBlocks;
 3086 
 3087   static_assert(N == ActualN || N == DstElements, "Incorrect element count");
 3088   simd_mask<ActualN> Mask = 1;
 3089   constexpr lsc_data_size DS =
 3090       finalize_data_size<RawT, lsc_data_size::default_size>();
 3091   uintptr_t Addr = reinterpret_cast<uintptr_t>(Ptr);
  // NOTE(review): the initializer of Transpose is on a line elided from this
  // rendering -- confirm against the full source.
 3092   constexpr lsc_data_order Transpose =
 3094   simd<RawT, ActualN> Raw =
 3095       __esimd_lsc_load2d_stateless<RawT, L1H, L2H, DS, Transpose, NBlocks,
 3096                                    BlockWidth, BlockHeight, Transformed,
 3097                                    ActualN>(Mask.data(), Addr, SurfaceWidth,
 3098                                             SurfaceHeight, SurfacePitch, X, Y);
 3099 
 3100   if constexpr (ActualN == N) {
 3101     return Raw;
 3102   } else {
 3103     // HW restrictions force data which is read to contain padding filled with
 3104     // zeros for 2d lsc loads. This code eliminates such padding.
 3105 
 3106     // For example, 2D block load of 5 elements of 1 byte data type will
 3107     // take 8 bytes per row for each block.
 3108     //
 3109     // +----+----+----+----+----+----+-----+-----+
 3110     // | 00 | 01 | 02 | 03 | 04 | 05 | 06* | 07* |
 3111     // +----+----+----+----+----+----+-----+-----+
 3112     // | 10 | 11 | 12 | 13 | 14 | 15 | 16* | 17* |
 3113     // +----+----+----+----+----+----+-----+-----+
 3114     // | 20 | 21 | 22 | 23 | 24 | 25 | 26* | 27* |
 3115     // +----+----+----+----+----+----+-----+-----+
 3116     // | 30 | 31 | 32 | 33 | 34 | 35 | 36* | 37* |
 3117     // +----+----+----+----+----+----+-----+-----+
 3118     // * signifies the padded element.
 3119 
  // NOTE(review): the declaration of Dst is on a line elided from this
  // rendering -- confirm against the full source.
 3121 
    // Copy each block's payload, dropping the per-row and per-block padding.
 3122     for (auto i = 0; i < NBlocks; i++) {
 3123       auto DstBlock =
 3124           Dst.template select<DstBlockElements, 1>(i * DstBlockElements);
 3125 
 3126       auto RawBlock = Raw.template select<GRFBlockSize, 1>(i * GRFBlockPitch);
 3127       DstBlock =
 3128           RawBlock.template bit_cast_view<RawT, GRFColSize, GRFRowPitch>()
 3129               .template select<GRFColSize, 1, GRFRowSize, 1>(0, 0)
 3130               .template bit_cast_view<RawT>();
 3131     }
 3132 
 3133     return Dst;
 3134   }
 3135 }
3136 
3159 template <typename T, int BlockWidth, int BlockHeight, int NBlocks,
3160  typename PropertyListT,
3161  int N = get_lsc_block_2d_data_size<__raw_t<T>, NBlocks, BlockHeight,
3162  BlockWidth, false /*Transposed*/,
3163  false /*Transformed*/>()>
3164 __ESIMD_API void prefetch_2d_impl(const T *Ptr, unsigned SurfaceWidth,
3165  unsigned SurfaceHeight, unsigned SurfacePitch,
3166  int X, int Y) {
3167  using RawT = __raw_t<T>;
3168  check_cache_hints<cache_action::prefetch, PropertyListT>();
3169  check_lsc_block_2d_restrictions<RawT, BlockWidth, BlockHeight, NBlocks, false,
3170  false, block_2d_op::prefetch>();
3171  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
3172  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
3173  constexpr lsc_data_size DS =
3174  finalize_data_size<RawT, lsc_data_size::default_size>();
3175  uintptr_t Addr = reinterpret_cast<uintptr_t>(Ptr);
3176  constexpr lsc_data_order Transpose = lsc_data_order::nontranspose;
3177  simd_mask<N> Mask = 1;
3178  __esimd_lsc_prefetch2d_stateless<RawT, L1H, L2H, DS, Transpose, NBlocks,
3179  BlockWidth, BlockHeight, false, N>(
3180  Mask.data(), Addr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
3181 }
3182 
// Stores the 2D block Vals to the matrix described by SurfaceWidth /
// SurfaceHeight / SurfacePitch at element position (X, Y). If the hardware
// requires a padded (power-of-two pitch) payload, Vals is copied into a
// padded buffer first.
 3207 template <typename T, int BlockWidth, int BlockHeight, typename PropertyListT,
 3209         __raw_t<T>, 1u, BlockHeight, BlockWidth, false /*Transposed*/,
 3210         false /*Transformed*/>()>
 3211 __ESIMD_API void store_2d_impl(T *Ptr, unsigned SurfaceWidth,
 3212                                unsigned SurfaceHeight, unsigned SurfacePitch,
 3213                                int X, int Y, simd<T, N> Vals) {
 3214   using RawT = __raw_t<T>;
 3215   __ESIMD_DNS::check_cache_hints<__ESIMD_DNS::cache_action::store,
 3216                                  PropertyListT>();
 3217   constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
 3218   constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
 3219   check_lsc_block_2d_restrictions<RawT, BlockWidth, BlockHeight, 1, false,
 3220                                   false, block_2d_op::store>();
 3221   constexpr lsc_data_size DS =
 3222       finalize_data_size<RawT, lsc_data_size::default_size>();
 3223   uintptr_t Addr = reinterpret_cast<uintptr_t>(Ptr);
 3224   constexpr lsc_data_order Transpose = lsc_data_order::nontranspose;
 3225 
  // Row pitch of the message payload is the next power of two of the width.
 3226   constexpr int Pitch = getNextPowerOf2<BlockWidth>();
 3227   constexpr int NElts = BlockHeight * Pitch;
 3228   simd<RawT, NElts> Raw;
 3229   simd_mask<NElts> Mask = 1;
 3230 
 3231   if constexpr (NElts == N) {
 3232     Raw = Vals;
 3233   } else {
 3234     // For store with padding, allocate the block with padding, and place
 3235     // original data there.
 3236     auto Data2D = Vals.template bit_cast_view<RawT, BlockHeight, BlockWidth>();
 3237     auto Raw2D = Raw.template bit_cast_view<RawT, BlockHeight, Pitch>();
 3238     Raw2D.template select<BlockHeight, 1, BlockWidth, 1>(0, 0) = Data2D;
 3239   }
 3240 
 3241   __esimd_lsc_store2d_stateless<RawT, L1H, L2H, DS, Transpose, 1u, BlockWidth,
 3242                                 BlockHeight, false, NElts>(
 3243       Mask.data(), Addr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y,
 3244       Raw.data());
 3245 }
3246 
3247 } // namespace detail
3248 
3250 
3253 
3275 // Dev note: the argument \p glob_offset of this function does not have
3276 // a default value to not conflict with more generic variant (acc-ga-3)
3277 // defined below. This restriction though requires adding an additional
3278 // variant: simd<T, N> gather(acc, glob_offset) to support calls that require
3279 // implicit conversion of a scalar offset to a vector of offsets, e.g.
3280 // 'res = gather<T, N>(acc, 0);'
// Gathers N elements of type T from the accessor's buffer at byte offsets
// byte_offsets[i] + glob_offset; lanes with mask == 0 are not accessed.
 3281 template <typename T, int N, typename AccessorT>
 3282 __ESIMD_API
 3283     std::enable_if_t<detail::is_device_accessor_with_v<
 3284                          AccessorT, detail::accessor_mode_cap::can_read>,
 3285                      simd<T, N>>
 3286     gather(AccessorT acc, simd<detail::DeviceAccessorOffsetT, N> byte_offsets,
 3287            detail::DeviceAccessorOffsetT glob_offset, simd_mask<N> mask = 1) {
 3288 #ifdef __ESIMD_FORCE_STATELESS_MEM
  // Stateless mode: translate the accessor to a pointer and reuse the
  // USM-pointer gather overload.
 3289   return gather<T, N>(__ESIMD_DNS::accessorToPointer<T>(acc, glob_offset),
 3290                       byte_offsets, mask);
 3291 #else
  // Non-power-of-2 N needs the LSC-based implementation.
 3292   if constexpr (!detail::isPowerOf2(N, 32)) {
 3293     // Requires DG2 or PVC.
 3294     simd<T, N> PassThru; // Intentionally undefined
 3295     byte_offsets += glob_offset;
  // NOTE(review): the remaining gather_impl template arguments are on lines
  // elided from this rendering -- confirm against the full source.
 3296     return detail::gather_impl<T, N, 1,
 3299         acc, byte_offsets, mask, PassThru);
 3300   } else {
 3301     return detail::gather_impl<T, N>(acc, byte_offsets, glob_offset, mask);
 3302   }
 3303 #endif // __ESIMD_FORCE_STATELESS_MEM
 3304 }
3305 
// Convenience overload: gathers N contiguous-offset elements starting at
// glob_offset. Exists so that a scalar offset argument converts cleanly
// (see the dev note above the main overload).
 3319 template <typename T, int N, typename AccessorT>
 3320 __ESIMD_API
 3321     std::enable_if_t<detail::is_device_accessor_with_v<
 3322                          AccessorT, detail::accessor_mode_cap::can_read>,
 3323                      simd<T, N>>
 3324     gather(AccessorT acc, detail::DeviceAccessorOffsetT glob_offset) {
  // NOTE(review): the declaration of ByteOffsets (presumably a zero offset
  // vector) is on a line elided from this rendering -- confirm.
 3326   return gather<T, N>(acc, ByteOffsets, glob_offset);
 3327 }
3328 
3329 #ifdef __ESIMD_FORCE_STATELESS_MEM
3330 template <typename T, int N, typename AccessorTy, typename Toffset>
3331 __ESIMD_API std::enable_if_t<
3332  detail::is_device_accessor_with_v<AccessorTy,
3333  detail::accessor_mode_cap::can_read> &&
3334  std::is_integral_v<Toffset> && !std::is_same_v<Toffset, uint64_t>,
3335  simd<T, N>>
3336 gather(AccessorTy acc, simd<Toffset, N> offsets, uint64_t glob_offset,
3337  simd_mask<N> mask = 1) {
3338  return gather<T, N>(acc, convert<uint64_t>(offsets), glob_offset, mask);
3339 }
3340 #endif
3341 
3371 // typename PropertyListT = empty_properties_t>
3380 
// Gathers N elements in N/VS chunks of VS elements each; disabled lanes
// take their values from pass_thru. Properties may carry cache hints and
// alignment.
 3414 template <typename T, int N, int VS, typename AccessorT, typename OffsetT,
 3415           typename PropertyListT =
 3417 __ESIMD_API std::enable_if_t<
 3418     (detail::is_device_accessor_with_v<AccessorT,
 3419                                        detail::accessor_mode_cap::can_read> &&
 3420      ext::oneapi::experimental::is_property_list_v<PropertyListT>),
 3421     simd<T, N>>
 3422 gather(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
 3423        simd_mask<N / VS> mask, simd<T, N> pass_thru, PropertyListT props = {}) {
 3424 #ifdef __ESIMD_FORCE_STATELESS_MEM
  // Stateless mode: forward to the USM-pointer overload.
 3425   return gather<T, N, VS>(detail::accessorToPointer<T>(acc), byte_offsets, mask,
 3426                           pass_thru, props);
 3427 #else
 3428   return detail::gather_impl<T, N, VS, PropertyListT,
 3430       acc, byte_offsets, mask, pass_thru);
 3431 #endif // __ESIMD_FORCE_STATELESS_MEM
 3432 }
3433 
// Gathers N elements in N/VS chunks; disabled lanes produce undefined
// values. Falls back to the legacy gather path when no cache hints, VS == 1
// and N is a supported power of two.
 3463 template <typename T, int N, int VS, typename AccessorT, typename OffsetT,
 3464           typename PropertyListT =
 3466 __ESIMD_API std::enable_if_t<
 3467     (detail::is_device_accessor_with_v<AccessorT,
 3468                                        detail::accessor_mode_cap::can_read> &&
 3469      ext::oneapi::experimental::is_property_list_v<PropertyListT>),
 3470     simd<T, N>>
 3471 gather(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
 3472        simd_mask<N / VS> mask, PropertyListT props = {}) {
 3473 #ifdef __ESIMD_FORCE_STATELESS_MEM
 3474   return gather<T, N, VS>(detail::accessorToPointer<T>(acc), byte_offsets, mask,
 3475                           props);
 3476 #else
 3477   constexpr size_t Alignment =
 3478       detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
 3479   static_assert(Alignment >= sizeof(T),
 3480                 "gather() requires at least element-size alignment");
 3481 
 3482   if constexpr (detail::has_cache_hints<PropertyListT>() || VS > 1 ||
 3483                 !(detail::isPowerOf2(N, 32))) {
 3484     simd<T, N> PassThru; // Intentionally undefined
 3485     return detail::gather_impl<T, N, VS, PropertyListT,
 3487         acc, byte_offsets, mask, PassThru);
 3488   } else {
 3489     return detail::gather_impl<T, N>(acc, byte_offsets, 0, mask);
 3490   }
 3491 #endif // __ESIMD_FORCE_STATELESS_MEM
 3492 }
3493 
// Unmasked variant: gathers all N/VS chunks (mask of all ones).
 3517 template <typename T, int N, int VS, typename AccessorT, typename OffsetT,
 3518           typename PropertyListT =
 3520 __ESIMD_API std::enable_if_t<
 3521     (detail::is_device_accessor_with_v<AccessorT,
 3522                                        detail::accessor_mode_cap::can_read> &&
 3523      ext::oneapi::experimental::is_property_list_v<PropertyListT>),
 3524     simd<T, N>>
 3525 gather(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
 3526        PropertyListT props = {}) {
 3527   simd_mask<N / VS> Mask = 1;
 3528   return gather<T, N, VS>(acc, byte_offsets, Mask, props);
 3529 }
3530 
3540 // Dev note: the mask type was turned into template parameter `MaskT` to
3541 // avoid the conflicts of this prototype with the old gather() function
3542 // accepting a 'global_offset' parameter and avoid 'ambiguous call' errors
3543 // for calls like this: gather(acc, byte_offsets_simd, 0, mask);
// VS == 1 convenience overload with pass_thru; the MaskT template parameter
// avoids ambiguity with the legacy glob_offset overload (see dev note above).
 3544 template <typename T, int N, typename AccessorT, typename OffsetT,
 3545           typename MaskT,
 3546           typename PropertyListT =
 3548 __ESIMD_API std::enable_if_t<
 3549     (detail::is_device_accessor_with_v<AccessorT,
 3550                                        detail::accessor_mode_cap::can_read> &&
 3551      std::is_same_v<MaskT, simd_mask<N>> &&
 3552      ext::oneapi::experimental::is_property_list_v<PropertyListT>),
 3553     simd<T, N>>
 3554 gather(AccessorT acc, simd<OffsetT, N> byte_offsets, MaskT mask,
 3555        simd<T, N> pass_thru, PropertyListT props = {}) {
 3556   return gather<T, N, 1>(acc, byte_offsets, mask, pass_thru, props);
 3557 }
3558 
3566 // Dev note: the mask type was turned into template parameter `MaskT` to
3567 // avoid the conflicts of this prototype with the old gather() function
3568 // accepting a 'global_offset' parameter and avoid 'ambiguous call' errors
3569 // for calls like this: gather(acc, byte_offsets_simd, 0);
// VS == 1 convenience overload without pass_thru; MaskT template parameter
// avoids ambiguity with the legacy glob_offset overload (see dev note above).
 3570 template <typename T, int N, typename AccessorT, typename OffsetT,
 3571           typename MaskT,
 3572           typename PropertyListT =
 3574 __ESIMD_API std::enable_if_t<
 3575     (detail::is_device_accessor_with_v<AccessorT,
 3576                                        detail::accessor_mode_cap::can_read> &&
 3577      std::is_same_v<MaskT, simd_mask<N>> &&
 3578      ext::oneapi::experimental::is_property_list_v<PropertyListT>),
 3579     simd<T, N>>
 3580 gather(AccessorT acc, simd<OffsetT, N> byte_offsets, MaskT mask,
 3581        PropertyListT props = {}) {
 3582   return gather<T, N, 1>(acc, byte_offsets, mask, props);
 3583 }
3584 
// VS == 1, unmasked convenience overload.
 3592 template <typename T, int N, typename AccessorT, typename OffsetT,
 3593           typename PropertyListT =
 3595 __ESIMD_API std::enable_if_t<
 3596     (detail::is_device_accessor_with_v<AccessorT,
 3597                                        detail::accessor_mode_cap::can_read> &&
 3598      ext::oneapi::experimental::is_property_list_v<PropertyListT>),
 3599     simd<T, N>>
 3600 gather(AccessorT acc, simd<OffsetT, N> byte_offsets, PropertyListT props = {}) {
 3601   return gather<T, N, 1>(acc, byte_offsets, props);
 3602 }
3603 
3606 // typename PropertyListT = empty_properties_t>
// simd_view offsets + pass_thru: materializes the view and forwards to the
// simd-offset overload.
 3612 template <typename T, int N, int VS = 1, typename AccessorT,
 3613           typename OffsetSimdViewT,
 3614           typename PropertyListT =
 3616 __ESIMD_API std::enable_if_t<
 3617     (detail::is_device_accessor_with_v<AccessorT,
 3618                                        detail::accessor_mode_cap::can_read> &&
 3619      detail::is_simd_view_type_v<OffsetSimdViewT> &&
 3620      ext::oneapi::experimental::is_property_list_v<PropertyListT>),
 3621     simd<T, N>>
 3622 gather(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
 3623        simd<T, N> pass_thru, PropertyListT props = {}) {
 3624   return gather<T, N, VS>(acc, byte_offsets.read(), mask, pass_thru, props);
 3625 }
3626 
3629 // typename PropertyListT = empty_properties_t>
// simd_view offsets + mask: materializes the view and forwards to the
// simd-offset overload.
 3635 template <typename T, int N, int VS = 1, typename AccessorT,
 3636           typename OffsetSimdViewT,
 3637           typename PropertyListT =
 3639 __ESIMD_API std::enable_if_t<
 3640     (detail::is_device_accessor_with_v<AccessorT,
 3641                                        detail::accessor_mode_cap::can_read> &&
 3642      detail::is_simd_view_type_v<OffsetSimdViewT> &&
 3643      ext::oneapi::experimental::is_property_list_v<PropertyListT>),
 3644     simd<T, N>>
 3645 gather(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
 3646        PropertyListT props = {}) {
 3647   return gather<T, N, VS>(acc, byte_offsets.read(), mask, props);
 3648 }
3649 
3652 // typename PropertyListT = empty_properties_t>
// simd_view offsets, unmasked: materializes the view and forwards to the
// simd-offset overload.
 3657 template <typename T, int N, int VS = 1, typename AccessorT,
 3658           typename OffsetSimdViewT,
 3659           typename PropertyListT =
 3661 __ESIMD_API std::enable_if_t<
 3662     (detail::is_device_accessor_with_v<AccessorT,
 3663                                        detail::accessor_mode_cap::can_read> &&
 3664      detail::is_simd_view_type_v<OffsetSimdViewT> &&
 3665      ext::oneapi::experimental::is_property_list_v<PropertyListT>),
 3666     simd<T, N>>
 3667 gather(AccessorT acc, OffsetSimdViewT byte_offsets, PropertyListT props = {}) {
 3668   return gather<T, N, VS>(acc, byte_offsets.read(), props);
 3669 }
3670 
3684 
3687 
// Scatters N elements of vals to the accessor's buffer in N/VS chunks of VS
// elements at the given byte offsets; lanes with mask == 0 are not written.
 3722 template <typename T, int N, int VS = 1, typename AccessorTy, typename OffsetT,
 3723           typename PropertyListT =
 3725 __ESIMD_API std::enable_if_t<
 3726     detail::is_device_accessor_with_v<AccessorTy,
 3727                                       detail::accessor_mode_cap::can_write> &&
 3728     ext::oneapi::experimental::is_property_list_v<PropertyListT>>
 3729 scatter(AccessorTy acc, simd<OffsetT, N / VS> byte_offsets, simd<T, N> vals,
 3730         simd_mask<N / VS> mask, PropertyListT props = {}) {
 3731 #ifdef __ESIMD_FORCE_STATELESS_MEM
  // Stateless mode: forward to the USM-pointer overload.
 3732   scatter<T, N, VS>(__ESIMD_DNS::accessorToPointer<T>(acc), byte_offsets, vals,
 3733                     mask, props);
 3734 #else
 3735   constexpr size_t Alignment =
 3736       detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
  // NOTE(review): the diagnostic says "gather()" but this is scatter() --
  // copy/paste; the message text should be fixed.
 3737   static_assert(Alignment >= sizeof(T),
 3738                 "gather() requires at least element-size alignment");
 3739 
 3740   if constexpr (detail::has_cache_hints<PropertyListT>() || VS > 1 ||
 3741                 !detail::isPowerOf2(N, 32)) {
  // NOTE(review): the start of this scatter_impl call is on a line elided
  // from this rendering -- confirm against the full source.
 3743                 PropertyListT>(acc, byte_offsets, vals, mask);
 3744   } else {
 3745     detail::scatter_impl<T, N, AccessorTy>(acc, vals, byte_offsets, 0, mask);
 3746   }
 3747 
 3748 #endif // __ESIMD_FORCE_STATELESS_MEM
 3749 }
// Unmasked variant: scatters all N/VS chunks (mask of all ones).
 3769 template <typename T, int N, int VS = 1, typename AccessorTy, typename OffsetT,
 3770           typename PropertyListT =
 3772 __ESIMD_API std::enable_if_t<
 3773     detail::is_device_accessor_with_v<AccessorTy,
 3774                                       detail::accessor_mode_cap::can_write> &&
 3775     ext::oneapi::experimental::is_property_list_v<PropertyListT>>
 3776 scatter(AccessorTy acc, simd<OffsetT, N / VS> byte_offsets, simd<T, N> vals,
 3777         PropertyListT props = {}) {
 3778   simd_mask<N / VS> Mask = 1;
 3779   scatter<T, N, VS>(acc, byte_offsets, vals, Mask, props);
 3780 }
3781 
// simd_view offsets + mask: materializes the view and forwards to the
// simd-offset scatter overload.
 3807 template <typename T, int N, int VS = 1, typename AccessorTy,
 3808           typename OffsetSimdViewT,
 3809           typename PropertyListT =
 3811 __ESIMD_API std::enable_if_t<
 3812     detail::is_device_accessor_with_v<AccessorTy,
 3813                                       detail::accessor_mode_cap::can_write> &&
 3814     detail::is_simd_view_type_v<OffsetSimdViewT> &&
 3815     ext::oneapi::experimental::is_property_list_v<PropertyListT>>
 3816 scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
 3817         simd_mask<N / VS> mask, PropertyListT props = {}) {
 3818   scatter<T, N, VS>(acc, byte_offsets.read(), vals, mask, props);
 3819 }
3820 
// simd_view offsets, unmasked: materializes the view and forwards with a
// mask of all ones.
 3841 template <typename T, int N, int VS = 1, typename AccessorTy,
 3842           typename OffsetSimdViewT,
 3843           typename PropertyListT =
 3845 __ESIMD_API std::enable_if_t<
 3846     detail::is_device_accessor_with_v<AccessorTy,
 3847                                       detail::accessor_mode_cap::can_write> &&
 3848     detail::is_simd_view_type_v<OffsetSimdViewT> &&
 3849     ext::oneapi::experimental::is_property_list_v<PropertyListT>>
 3850 scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
 3851         PropertyListT props = {}) {
 3852   simd_mask<N / VS> Mask = 1;
 3853   scatter<T, N, VS>(acc, byte_offsets.read(), vals, Mask, props);
 3854 }
3855 
// Legacy overload: scatters vals at byte offsets offsets[i] + glob_offset.
 3872 template <typename T, int N, typename AccessorTy>
 3873 __ESIMD_API
 3874     std::enable_if_t<(detail::isPowerOf2(N, 32)) &&
 3875                      detail::is_device_accessor_with_v<
 3876                          AccessorTy, detail::accessor_mode_cap::can_write>>
  // NOTE(review): the first line of this scatter(...) signature is elided
  // from this rendering -- confirm against the full source.
 3878         simd<T, N> vals, detail::DeviceAccessorOffsetT glob_offset,
 3879         simd_mask<N> mask = 1) {
  // Fold the global offset into the per-lane offsets, then delegate.
 3880   offsets += glob_offset;
 3881   scatter<T, N>(acc, offsets, vals, mask);
 3882 }
3883 
// Legacy overload with a scalar global offset only.
 3884 template <typename T, int N, typename AccessorTy>
 3885 __ESIMD_API
 3886     std::enable_if_t<(detail::isPowerOf2(N, 32)) &&
 3887                      detail::is_device_accessor_with_v<
 3888                          AccessorTy, detail::accessor_mode_cap::can_write>>
 3889     scatter(AccessorTy acc, detail::DeviceAccessorOffsetT glob_offset,
 3890             simd<T, N> vals, simd_mask<N> mask = 1) {
  // NOTE(review): the declaration of ByteOffsets (presumably a zero offset
  // vector) is on a line elided from this rendering -- confirm.
 3892   scatter<T, N>(acc, ByteOffsets, vals, glob_offset, mask);
 3893 }
3894 
3895 #ifdef __ESIMD_FORCE_STATELESS_MEM
3896 template <typename T, int N, typename AccessorTy, typename Toffset>
3897 __ESIMD_API std::enable_if_t<
3898  detail::is_device_accessor_with_v<AccessorTy,
3899  detail::accessor_mode_cap::can_write> &&
3900  std::is_integral_v<Toffset> && !std::is_same_v<Toffset, uint64_t>>
3901 scatter(AccessorTy acc, simd<Toffset, N> offsets, simd<T, N> vals,
3902  uint64_t glob_offset, simd_mask<N> mask = 1) {
3903  scatter<T, N, AccessorTy>(acc, convert<uint64_t>(offsets), vals, glob_offset,
3904  mask);
3905 }
3906 #endif
3907 
// Loads a single element of type T from the accessor's buffer, implemented
// as a one-lane gather.
 3915 template <typename T, typename AccessorTy>
 3916 __ESIMD_API T scalar_load(AccessorTy acc,
  // NOTE(review): the rest of this signature (the offset parameter) is on a
  // line elided from this rendering -- confirm against the full source.
 3918   const simd<T, 1> Res =
 3919       gather<T, 1, AccessorTy>(acc, simd<decltype(offset), 1>(offset));
 3920   return Res[0];
 3921 }
3922 
3930 template <typename T, typename AccessorTy>
3931 __ESIMD_API void scalar_store(AccessorTy acc,
3932  detail::DeviceAccessorOffsetT offset, T val) {
3933  scatter<T, 1, AccessorTy>(acc, simd<decltype(offset), 1>(offset),
3934  simd<T, 1>(val));
3935 }
3936 
3970 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
3971  int N, typename Toffset>
3972 __ESIMD_API simd<T, N * get_num_channels_enabled(RGBAMask)>
3973 gather_rgba(const T *p, simd<Toffset, N> offsets, simd_mask<N> mask = 1) {
3974  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
3975  static_assert((N == 8 || N == 16 || N == 32), "Unsupported value of N");
3976  static_assert(sizeof(T) == 4, "Unsupported size of type T");
3977  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
3978  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
3979  addrs = addrs + offsets_i;
3980  return __esimd_svm_gather4_scaled<detail::__raw_t<T>, N, RGBAMask>(
3981  addrs.data(), mask.data());
3982 }
3983 
3999 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
4000  int N, typename OffsetSimdViewT, typename RegionTy>
4001 __ESIMD_API std::enable_if_t<detail::is_simd_view_type_v<OffsetSimdViewT>,
4002  simd<T, N * get_num_channels_enabled(RGBAMask)>>
4003 gather_rgba(const T *p, OffsetSimdViewT offsets, simd_mask<N> mask = 1) {
4004  return gather_rgba<RGBAMask, T, N>(p, offsets.read(), mask);
4005 }
4006 
4022 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
4023  int N, typename Toffset>
4024 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>,
4025  simd<T, N * get_num_channels_enabled(RGBAMask)>>
4026 gather_rgba(const T *p, Toffset offset, simd_mask<N> mask = 1) {
4027  return gather_rgba<RGBAMask, T, N>(p, simd<Toffset, N>(offset), mask);
4028 }
4029 
4030 namespace detail {
4031 template <rgba_channel_mask M> static void validate_rgba_write_channel_mask() {
4032  using CM = rgba_channel_mask;
4033  static_assert(
4034  (M == CM::ABGR || M == CM::BGR || M == CM::GR || M == CM::R) &&
4035  "Only ABGR, BGR, GR, R channel masks are valid in write operations");
4036 }
4037 } // namespace detail
4038 
// Scatters the channels enabled in RGBAMask for each of the N pixels located
// at p + offsets[i]; lanes with mask == 0 are not written. Only write-legal
// channel masks (ABGR, BGR, GR, R) are accepted.
 4060 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
 4061           int N, typename Toffset>
 4062 __ESIMD_API void
  // NOTE(review): the first line of this scatter_rgba(...) signature is
  // elided from this rendering -- confirm against the full source.
 4064              simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
 4065              simd_mask<N> mask = 1) {
 4066   static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
 4067   static_assert((N == 8 || N == 16 || N == 32), "Unsupported value of N");
 4068   static_assert(sizeof(T) == 4, "Unsupported size of type T");
 4069   detail::validate_rgba_write_channel_mask<RGBAMask>();
  // Per-lane absolute addresses = base pointer + byte offset.
 4070   simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
 4071   simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
 4072   addrs = addrs + offsets_i;
 4073   __esimd_svm_scatter4_scaled<detail::__raw_t<T>, N, RGBAMask>(
 4074       addrs.data(), vals.data(), mask.data());
 4075 }
4076 
4092 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
4093  int N, typename OffsetSimdViewT, typename RegionTy>
4094 __ESIMD_API std::enable_if_t<detail::is_simd_view_type_v<OffsetSimdViewT>>
4095 scatter_rgba(T *p, OffsetSimdViewT offsets,
4096  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
4097  simd_mask<N> mask = 1) {
4098  scatter_rgba<RGBAMask, T, N>(p, offsets.read(), vals, mask);
4099 }
4100 
4116 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
4117  int N, typename Toffset>
4118 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> && N == 1>
4119 scatter_rgba(T *p, Toffset offset,
4120  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
4121  simd_mask<N> mask = 1) {
4122  scatter_rgba<RGBAMask, T, N>(p, simd<Toffset, N>(offset), vals, mask);
4123 }
4124 
/// Deprecated overload with the channel mask as a trailing template
/// parameter; forwards to scatter_rgba<RGBAMask>(...). Kept only for source
/// compatibility — new code should pass the mask as the first argument.
template <typename T, int N, rgba_channel_mask RGBAMask>
__SYCL_DEPRECATED("use scatter_rgba<rgba_channel_mask>()")
__ESIMD_API std::
    enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4> scatter_rgba(
        T *p, simd<uint32_t, N> offsets,
        simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
        simd_mask<N> mask = 1) {
  scatter_rgba<RGBAMask>(p, offsets, vals, mask);
}
4134 
4157 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
4158  typename AccessorT, int N,
4159  typename T = typename AccessorT::value_type>
4160 __ESIMD_API
4161  std::enable_if_t<((N == 8 || N == 16 || N == 32) && sizeof(T) == 4 &&
4162  detail::is_device_accessor_with_v<
4163  AccessorT, detail::accessor_mode_cap::can_read>),
4164  simd<T, N * get_num_channels_enabled(RGBAMask)>>
4166  detail::DeviceAccessorOffsetT global_offset = 0,
4167  simd_mask<N> mask = 1) {
4168 #ifdef __ESIMD_FORCE_STATELESS_MEM
4169  return gather_rgba<RGBAMask>(
4170  __ESIMD_DNS::accessorToPointer<T>(acc, global_offset), offsets, mask);
4171 #else
4172  // TODO (performance) use hardware-supported scale once BE supports it
4173  constexpr uint32_t Scale = 0;
4174  const auto SI = get_surface_index(acc);
4175  return __esimd_gather4_masked_scaled2<detail::__raw_t<T>, N, RGBAMask,
4176  decltype(SI), Scale>(
4177  SI, global_offset, offsets.data(), mask.data());
4178 #endif
4179 }
4180 
4181 #ifdef __ESIMD_FORCE_STATELESS_MEM
4182 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
4183  typename AccessorT, int N,
4184  typename T = typename AccessorT::value_type, typename Toffset>
4185 __ESIMD_API std::enable_if_t<
4186  ((N == 8 || N == 16 || N == 32) && sizeof(T) == 4 &&
4187  detail::is_device_accessor_with_v<AccessorT,
4188  detail::accessor_mode_cap::can_read> &&
4189  std::is_integral_v<Toffset> && !std::is_same_v<Toffset, uint64_t>),
4190  simd<T, N * get_num_channels_enabled(RGBAMask)>>
4191 gather_rgba(AccessorT acc, simd<Toffset, N> offsets, uint64_t global_offset = 0,
4192  simd_mask<N> mask = 1) {
4193  return gather_rgba<RGBAMask, AccessorT, N, T>(acc, convert<uint64_t>(offsets),
4194  global_offset, mask);
4195 }
4196 #endif
4197 
4212 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
4213  typename AccessorT, int N,
4214  typename T = typename AccessorT::value_type>
4215 __ESIMD_API
4216  std::enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4 &&
4217  detail::is_device_accessor_with_v<
4218  AccessorT, detail::accessor_mode_cap::can_write>>
4220  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
4221  detail::DeviceAccessorOffsetT global_offset = 0,
4222  simd_mask<N> mask = 1) {
4223  detail::validate_rgba_write_channel_mask<RGBAMask>();
4224 #ifdef __ESIMD_FORCE_STATELESS_MEM
4225  scatter_rgba<RGBAMask>(__ESIMD_DNS::accessorToPointer<T>(acc, global_offset),
4226  offsets, vals, mask);
4227 #else
4228  // TODO (performance) use hardware-supported scale once BE supports it
4229  constexpr uint32_t Scale = 0;
4230  const auto SI = get_surface_index(acc);
4231  __esimd_scatter4_scaled<T, N, decltype(SI), RGBAMask, Scale>(
4232  mask.data(), SI, global_offset, offsets.data(), vals.data());
4233 #endif
4234 }
4235 
4236 #ifdef __ESIMD_FORCE_STATELESS_MEM
4237 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
4238  typename AccessorT, int N,
4239  typename T = typename AccessorT::value_type, typename Toffset>
4240 __ESIMD_API std::enable_if_t<
4241  (N == 8 || N == 16 || N == 32) && sizeof(T) == 4 &&
4242  detail::is_device_accessor_with_v<AccessorT,
4243  detail::accessor_mode_cap::can_write> &&
4244  std::is_integral_v<Toffset> && !std::is_same_v<Toffset, uint64_t>>
4245 scatter_rgba(AccessorT acc, simd<Toffset, N> offsets,
4246  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
4247  uint64_t global_offset = 0, simd_mask<N> mask = 1) {
4248  scatter_rgba<RGBAMask, AccessorT, N, T>(acc, convert<uint64_t>(offsets), vals,
4249  global_offset, mask);
4250 }
4251 #endif
4253 
4254 namespace detail {
4255 
4256 #ifndef __ESIMD_FP_ATOMIC_OP_TYPE_CHECK
4257 #define __ESIMD_FP_ATOMIC_OP_TYPE_CHECK(T) \
4258  static_assert(is_type<T, float, sycl::half, double>(), \
4259  "float, double or sycl::half type is expected");
4260 #endif // __ESIMD_FP_ATOMIC_OP_TYPE_CHECK
4261 
4264 template <__ESIMD_NS::atomic_op Op, typename T, int N, unsigned NumSrc,
4265  bool IsLSC = false>
4266 constexpr void check_atomic() {
4267 
4268  static_assert(sizeof(T) > 1, "Unsupported data type");
4269 
4270  // LSC atomic operation is supported for any width.
4271  if constexpr (!IsLSC)
4272  static_assert((detail::isPowerOf2(N, 32)),
4273  "Execution size 1, 2, 4, 8, 16, 32 are supported");
4274 
4275  static_assert(NumSrc == __ESIMD_DNS::get_num_args<Op>(),
4276  "Wrong number of operands");
4277  constexpr bool IsInt2BytePlus =
4278  std::is_integral_v<T> && (sizeof(T) >= sizeof(uint16_t));
4279 
4280  if constexpr (Op == __ESIMD_NS::atomic_op::xchg ||
4281  Op == __ESIMD_NS::atomic_op::cmpxchg ||
4282  Op == __ESIMD_NS::atomic_op::predec ||
4283  Op == __ESIMD_NS::atomic_op::inc ||
4285 
4286  static_assert(IsInt2BytePlus, "Integral 16-bit or wider type is expected");
4287  }
4288  // FP ops (are always delegated to native::lsc::<Op>)
4289  if constexpr (Op == __ESIMD_NS::atomic_op::fmax ||
4291  Op == __ESIMD_NS::atomic_op::fadd ||
4292  Op == __ESIMD_NS::atomic_op::fsub ||
4293  Op == __ESIMD_NS::atomic_op::fcmpxchg) {
4295  }
4296  if constexpr (Op == __ESIMD_NS::atomic_op::add ||
4297  Op == __ESIMD_NS::atomic_op::sub ||
4298  Op == __ESIMD_NS::atomic_op::umin ||
4299  Op == __ESIMD_NS::atomic_op::umax ||
4303  Op == __ESIMD_NS::atomic_op::smin ||
4304  Op == __ESIMD_NS::atomic_op::smax) {
4305  static_assert(IsInt2BytePlus, "Integral 16-bit or wider type is expected");
4306  constexpr bool IsSignedMinmax = (Op == __ESIMD_NS::atomic_op::smin) ||
4307  (Op == __ESIMD_NS::atomic_op::smax);
4308  constexpr bool IsUnsignedMinmax = (Op == __ESIMD_NS::atomic_op::umin) ||
4309  (Op == __ESIMD_NS::atomic_op::umax);
4310 
4311  if constexpr (IsSignedMinmax || IsUnsignedMinmax) {
4312  constexpr bool SignOK = std::is_signed_v<T> == IsSignedMinmax;
4313  static_assert(SignOK, "Signed/unsigned integer type expected for "
4314  "signed/unsigned min/max operation");
4315  }
4316  }
4317 }
4318 #undef __ESIMD_FP_ATOMIC_OP_TYPE_CHECK
4319 } // namespace detail
4320 
4323 
/// Declares the compile-time-known amount of shared local memory (in bytes)
/// for the calling kernel by forwarding the constant to the SLM intrinsic.
template <uint32_t SLMSize> __ESIMD_API void slm_init() {
  __esimd_slm_init(SLMSize);
}
4339 
4345 // with esimd::slm_allocator() class.
/// Run-time-size variant: forwards the requested SLM byte count to the intrinsic.
__ESIMD_API void slm_init(uint32_t size) { __esimd_slm_init(size); }
4349 
4386 
4392 #ifndef __ESIMD_GATHER_SCATTER_LLVM_IR
4395 #endif // __ESIMD_GATHER_SCATTER_LLVM_IR
4416 template <typename T, int N, int VS,
4417  typename PropertyListT =
4419 __ESIMD_API std::enable_if_t<
4420  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
4422  simd<T, N> pass_thru, PropertyListT props = {}) {
4423  static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS");
4424 
4425  constexpr size_t Alignment =
4426  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
4427  static_assert(Alignment >= sizeof(T),
4428  "slm_gather() requires at least element-size alignment");
4429 
4430  // Use LSC lowering if VS > 1. Also, if masked gather is
4431  // not available, then LSC is the only lowering option.
4432  if constexpr (VS > 1 || !detail::isMaskedGatherScatterLLVMAvailable()) {
4433  return __ESIMD_DNS::slm_gather_impl<T, VS,
4435  byte_offsets, mask, pass_thru);
4436  } else {
4437  if constexpr (sizeof(T) == 8) {
4438  simd<T, N> Res;
4439  Res.template bit_cast_view<uint32_t>().template select<N, 2>(0) =
4440  __esimd_slm_gather_ld<uint32_t, N, Alignment>(
4441  byte_offsets.data(), mask.data(),
4442  (pass_thru.template bit_cast_view<uint32_t>()
4443  .template select<N, 2>(0))
4444  .data());
4445  simd<uint32_t, N / VS> Offset = byte_offsets + sizeof(uint32_t);
4446  Res.template bit_cast_view<uint32_t>().template select<N, 2>(1) =
4447  __esimd_slm_gather_ld<uint32_t, N, sizeof(uint32_t)>(
4448  Offset.data(), mask.data(),
4449  (pass_thru.template bit_cast_view<uint32_t>()
4450  .template select<N, 2>(1))
4451  .data());
4452  return Res;
4453  } else {
4454  using MsgT = detail::__raw_t<T>;
4455  return __esimd_slm_gather_ld<MsgT, N, Alignment>(
4456  byte_offsets.data(), mask.data(), pass_thru.data());
4457  }
4458  }
4459 }
4460 
4484 template <typename T, int N, int VS,
4485  typename PropertyListT =
4487 __ESIMD_API std::enable_if_t<
4488  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
4490  PropertyListT props = {}) {
4491  constexpr size_t Alignment =
4492  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
4493  static_assert(Alignment >= sizeof(T),
4494  "slm_gather() requires at least element-size alignment");
4495 
4496  if constexpr (VS > 1 || (!detail::isPowerOf2(N, 32) &&
4498  simd<T, N> PassThru; // Intentionally undefined
4499  return detail::slm_gather_impl<T, VS, detail::lsc_data_size::default_size>(
4500  byte_offsets, mask, PassThru);
4501  } else if constexpr (detail::isMaskedGatherScatterLLVMAvailable()) {
4502  if constexpr (sizeof(T) == 8) {
4503  simd<T, N> Res;
4504  simd<uint32_t, N> PassThru; // it is intentionally undefined
4505 
4506  Res.template bit_cast_view<uint32_t>().template select<N, 2>(0) =
4507  __esimd_slm_gather_ld<uint32_t, N, Alignment>(
4508  byte_offsets.data(), mask.data(), PassThru.data());
4509  simd<uint32_t, N / VS> Offset = byte_offsets + sizeof(uint32_t);
4510  Res.template bit_cast_view<uint32_t>().template select<N, 2>(1) =
4511  __esimd_slm_gather_ld<uint32_t, N, sizeof(uint32_t)>(
4512  Offset.data(), mask.data(), PassThru.data());
4513  return Res;
4514  } else {
4515  using MsgT = detail::__raw_t<T>;
4516  simd<MsgT, N> PassThru; // it is intentionally undefined
4517  return __esimd_slm_gather_ld<MsgT, N, Alignment>(
4518  byte_offsets.data(), mask.data(), PassThru.data());
4519  }
4520  } else {
4521  detail::LocalAccessorMarker acc;
4522  return detail::gather_impl<T, N>(acc, byte_offsets, 0, mask);
4523  }
4524 }
4525 
4543 template <typename T, int N, int VS,
4544  typename PropertyListT =
4546 __ESIMD_API std::enable_if_t<
4547  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
4548 slm_gather(simd<uint32_t, N / VS> byte_offsets, PropertyListT props = {}) {
4549  simd_mask<N / VS> Mask = 1;
4550  return slm_gather<T, N, VS>(byte_offsets, Mask, props);
4551 }
4552 
4575 template <typename T, int N,
4576  typename PropertyListT =
4578 __ESIMD_API std::enable_if_t<
4579  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
4581  simd<T, N> pass_thru, PropertyListT props = {}) {
4582  constexpr int VS = 1;
4583  return slm_gather<T, N, VS>(byte_offsets, mask, pass_thru, props);
4584 }
4585 
4605 template <typename T, int N,
4606  typename PropertyListT =
4608 __ESIMD_API std::enable_if_t<
4609  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
4611  PropertyListT props = {}) {
4612  constexpr int VS = 1;
4613  return slm_gather<T, N, VS>(byte_offsets, mask, props);
4614 }
4615 
4630 template <typename T, int N,
4631  typename PropertyListT =
4633 __ESIMD_API std::enable_if_t<
4634  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
4635 slm_gather(simd<uint32_t, N> byte_offsets, PropertyListT props = {}) {
4636  constexpr int VS = 1;
4637  return slm_gather<T, N, VS>(byte_offsets, props);
4638 }
4639 
4666 template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
4667  typename PropertyListT =
4669 __ESIMD_API std::enable_if_t<
4670  detail::is_simd_view_type_v<OffsetSimdViewT> &&
4671  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
4672  simd<T, N>>
4673 slm_gather(OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
4674  simd<T, N> pass_thru, PropertyListT props = {}) {
4675  return slm_gather<T, N, VS>(byte_offsets.read(), mask, pass_thru, props);
4676 }
4677 
4699 template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
4700  typename PropertyListT =
4702 __ESIMD_API std::enable_if_t<
4703  detail::is_simd_view_type_v<OffsetSimdViewT> &&
4704  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
4705  simd<T, N>>
4706 slm_gather(OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
4707  PropertyListT props = {}) {
4708  return slm_gather<T, N, VS>(byte_offsets.read(), mask, props);
4709 }
4710 
4727 template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
4728  typename PropertyListT =
4730 __ESIMD_API std::enable_if_t<
4731  detail::is_simd_view_type_v<OffsetSimdViewT> &&
4732  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
4733  simd<T, N>>
4734 slm_gather(OffsetSimdViewT byte_offsets, PropertyListT props = {}) {
4735  return slm_gather<T, N, VS>(byte_offsets.read(), props);
4736 }
4737 
4743 template <typename T> __ESIMD_API T slm_scalar_load(uint32_t offset) {
4744  const simd<T, 1> Res = slm_gather<T, 1>(simd<uint32_t, 1>(offset));
4745  return Res[0];
4746 }
4747 
4765 
4788 template <typename T, int N, int VS = 1,
4789  typename PropertyListT =
4791 __ESIMD_API std::enable_if_t<
4792  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
4794  simd_mask<N / VS> mask, PropertyListT props = {}) {
4795  static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS");
4796 
4797  constexpr size_t Alignment =
4798  detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
4799  static_assert(Alignment >= sizeof(T),
4800  "slm_scatter() requires at least element-size alignment");
4801 
4802  // Use LSC lowering if VS > 1.
4803  if constexpr (VS > 1 || (!detail::isPowerOf2(N, 32) &&
4805  __ESIMD_DNS::slm_scatter_impl<T, VS, detail::lsc_data_size::default_size>(
4806  byte_offsets, vals, mask);
4807  } else if constexpr (detail::isMaskedGatherScatterLLVMAvailable()) {
4808  if constexpr (sizeof(T) == 8) {
4809  __esimd_slm_scatter_st<uint32_t, N, Alignment>(
4810  vals.template bit_cast_view<uint32_t>()
4811  .template select<N, 2>(0)
4812  .data(),
4813  byte_offsets.data(), mask.data());
4814  simd<uint32_t, N / VS> Offset = byte_offsets + sizeof(uint32_t);
4815  __esimd_slm_scatter_st<uint32_t, N, sizeof(uint32_t)>(
4816  vals.template bit_cast_view<uint32_t>()
4817  .template select<N, 2>(1)
4818  .data(),
4819  Offset.data(), mask.data());
4820 
4821  } else {
4822  using MsgT = detail::__raw_t<T>;
4823  __esimd_slm_scatter_st<MsgT, N, Alignment>(
4824  sycl::bit_cast<__ESIMD_DNS::vector_type_t<MsgT, N>>(vals.data()),
4825  byte_offsets.data(), mask.data());
4826  }
4827  } else {
4828  detail::LocalAccessorMarker acc;
4829  detail::scatter_impl<T, N>(acc, vals, byte_offsets, 0, mask);
4830  }
4831 }
4832 
4849 template <typename T, int N, int VS = 1,
4850  typename PropertyListT =
4852 __ESIMD_API std::enable_if_t<
4853  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
4855  PropertyListT props = {}) {
4856  simd_mask<N / VS> Mask = 1;
4857  slm_scatter<T, N, VS>(byte_offsets, vals, Mask, props);
4858 }
4859 
4883 template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
4884  typename PropertyListT =
4886 __ESIMD_API std::enable_if_t<
4887  detail::is_simd_view_type_v<OffsetSimdViewT> &&
4888  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
4889 slm_scatter(OffsetSimdViewT byte_offsets, simd<T, N> vals,
4890  simd_mask<N / VS> mask, PropertyListT props = {}) {
4891  slm_scatter<T, N, VS>(byte_offsets.read(), vals, mask, props);
4892 }
4893 
4909 template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
4910  typename PropertyListT =
4912 __ESIMD_API std::enable_if_t<
4913  detail::is_simd_view_type_v<OffsetSimdViewT> &&
4914  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
4915 slm_scatter(OffsetSimdViewT byte_offsets, simd<T, N> vals,
4916  PropertyListT props = {}) {
4917  return slm_scatter<T, N, VS>(byte_offsets.read(), vals, props);
4918 }
4919 
4925 template <typename T>
4926 __ESIMD_API void slm_scalar_store(uint32_t offset, T val) {
4927  slm_scatter<T, 1>(simd<uint32_t, 1>(offset), simd<T, 1>(val), 1);
4928 }
4929 
4940 template <typename T, int N, rgba_channel_mask RGBAMask>
4941 __ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && (sizeof(T) == 4),
4942  simd<T, N * get_num_channels_enabled(RGBAMask)>>
4945  return __esimd_gather4_masked_scaled2<T, N, RGBAMask>(
4946  SI, 0 /*global_offset*/, offsets.data(), mask.data());
4947 }
4948 
4959 template <typename T, int N, rgba_channel_mask Mask>
4960 __ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && (sizeof(T) == 4)>
4962  simd<T, N * get_num_channels_enabled(Mask)> vals,
4963  simd_mask<N> mask = 1) {
4964  detail::validate_rgba_write_channel_mask<Mask>();
4966  constexpr int16_t Scale = 0;
4967  constexpr int global_offset = 0;
4968  __esimd_scatter4_scaled<T, N, decltype(si), Mask, Scale>(
4969  mask.data(), si, global_offset, offsets.data(), vals.data());
4970 }
4971 
4987 template <typename T, int N,
4989 __ESIMD_API std::enable_if_t<is_simd_flag_type_v<Flags>, simd<T, N>>
4990 slm_block_load(uint32_t byte_offset, Flags) {
4991  constexpr size_t Align = Flags::template alignment<simd<T, N>>;
4992  return __esimd_slm_block_ld<detail::__raw_t<T>, N, Align>(byte_offset);
4993 }
4994 
5003 
5010 
5028 
5043 template <typename T, int N,
5044  typename PropertyListT =
5046 __ESIMD_API std::enable_if_t<
5047  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
5048 slm_block_load(uint32_t byte_offset, PropertyListT props = {}) {
5049  constexpr size_t DefaultAlignment = detail::OperandSize::OWORD;
5050  constexpr size_t Alignment =
5051  detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
5052  return __esimd_slm_block_ld<detail::__raw_t<T>, N, Alignment>(byte_offset);
5053 }
5054 
5081 template <typename T, int N,
5082  typename PropertyListT =
5084 __ESIMD_API std::enable_if_t<
5085  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
5086 slm_block_load(uint32_t byte_offset, simd_mask<1> pred,
5087  PropertyListT props = {}) {
5088  // Verify input template arguments.
5089  constexpr size_t DefaultAlignment = sizeof(T) <= 4 ? 4 : sizeof(T);
5090  constexpr size_t Alignment =
5091  detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
5092  static_assert(
5093  (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
5094  (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
5095  "Incorrect alignment for the data type");
5096 
5097  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
5098  constexpr int SmallIntFactor32Bit =
5099  sizeof(uint32_t) / sizeof(T) > 1 ? sizeof(uint32_t) / sizeof(T) : 1;
5100  static_assert(N > 0 && N % SmallIntFactor32Bit == 0,
5101  "Number of elements is not supported by Transposed load");
5102 
5103  // If alignment >= 8 and (N * sizeof(T)) % 8 == 0) we can load QWORDs.
5104  // Don't do it for 4-byte vectors (unless it is greater than 256-bytes),
5105  // because it would require a bit-cast, which is supposed to be NO-OP, but
5106  // might confuse GPU BE sometimes. 1- and 2-byte vectors are casted anyways.
5107  constexpr bool Use64BitData =
5108  Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
5109  (N * sizeof(T)) % sizeof(uint64_t) == 0 &&
5110  (sizeof(T) != sizeof(uint32_t) || N * sizeof(T) > 256);
5111  constexpr int SmallIntFactor =
5112  Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
5113  constexpr int FactoredN = N / SmallIntFactor;
5114  detail::check_lsc_vector_size<FactoredN>();
5115 
5116  // Prepare template arguments for the call of intrinsic.
5117  using LoadElemT = __ESIMD_DNS::__raw_t<
5118  std::conditional_t<SmallIntFactor == 1, T,
5119  std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
5120 
5121  constexpr uint16_t AddressScale = 1;
5122  constexpr int ImmOffset = 0;
5123  constexpr detail::lsc_data_size DS =
5125  constexpr auto VS = detail::to_lsc_vector_size<FactoredN>();
5126  constexpr auto Transposed = detail::lsc_data_order::transpose;
5127  constexpr int NLanes = 1;
5128 
5129  // Prepare non-template arguments and call the intrinsic.
5130  simd<uint32_t, NLanes> Offsets = byte_offset;
5132  __esimd_lsc_load_slm<LoadElemT, cache_hint::none, cache_hint::none,
5133  AddressScale, ImmOffset, DS, VS, Transposed, NLanes>(
5134  pred.data(), Offsets.data());
5135  return Result.template bit_cast_view<T>();
5136 }
5137 
5167 template <typename T, int N,
5168  typename PropertyListT =
5170 __ESIMD_API std::enable_if_t<
5171  ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
5172 slm_block_load(uint32_t offset, simd_mask<1> pred, simd<T, N> pass_thru,
5173  PropertyListT props = {}) {
5174  // Verify input template arguments.
5175  constexpr size_t DefaultAlignment = sizeof(T) <= 4 ? 4 : sizeof(T);
5176  constexpr size_t Alignment =
5177  detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
5178  static_assert(
5179  (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
5180  (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
5181  "Incorrect alignment for the data type");
5182 
5183  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
5184  constexpr int SmallIntFactor32Bit =
5185  sizeof(uint32_t) / sizeof(T) > 1 ? sizeof(uint32_t) / sizeof(T) : 1;
5186  static_assert(N > 0 && N % SmallIntFactor32Bit == 0,
5187  "Number of elements is not supported by Transposed load");
5188 
5189  // If alignment >= 8 and (N * sizeof(T)) % 8 == 0) we can load QWORDs.
5190  // Don't do it for 4-byte vectors (unless it is greater than 256-bytes),
5191  // because it would require a bit-cast, which is supposed to be NO-OP, but
5192  // might confuse GPU BE sometimes. 1- and 2-byte vectors are casted anyways.
5193  constexpr bool Use64BitData =
5194  Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
5195  (N * sizeof(T)) % sizeof(uint64_t) == 0 &&
5196  (sizeof(T) != sizeof(uint32_t) || N * sizeof(T) > 256);
5197  constexpr int SmallIntFactor =
5198  Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
5199  constexpr int FactoredN = N / SmallIntFactor;
5200  detail::check_lsc_vector_size<FactoredN>();
5201 
5202  // Prepare template arguments for the call of intrinsic.
5203  using LoadElemT = __ESIMD_DNS::__raw_t<
5204  std::conditional_t<SmallIntFactor == 1, T,
5205  std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
5206 
5207  constexpr uint16_t AddressScale = 1;
5208  constexpr int ImmOffset = 0;
5209  constexpr detail::lsc_data_size DS =
5211  constexpr auto VS = detail::to_lsc_vector_size<FactoredN>();
5212  constexpr auto Transposed = detail::lsc_data_order::transpose;
5213  constexpr int NLanes = 1;
5214 
5215  // Prepare non-template arguments and call the intrinsic.
5216  simd<uint32_t, NLanes> Offsets = offset;
5217  simd<LoadElemT, FactoredN> PassThru =
5218  pass_thru.template bit_cast_view<LoadElemT>();
5220  __esimd_lsc_load_merge_slm<LoadElemT, cache_hint::none, cache_hint::none,
5221  AddressScale, ImmOffset, DS, VS, Transposed,
5222  NLanes>(pred.data(), Offsets.data(),
5223  PassThru.data());
5224  return Result.template bit_cast_view<T>();
5225 }
5226 
5250 template <typename T, int N, typename AccessorT,
5251  typename PropertyListT =
5253 __ESIMD_API std::enable_if_t<
5254  detail::is_local_accessor_with_v<AccessorT,
5255  detail::accessor_mode_cap::can_read> &&
5256  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
5257  simd<T, N>>
5258 block_load(AccessorT lacc, uint32_t byte_offset, PropertyListT props = {}) {
5259  byte_offset += detail::localAccessorToOffset(lacc);
5260  return slm_block_load<T, N>(byte_offset, props);
5261 }
5262 
5285 template <typename T, int N, typename AccessorT,
5286  typename PropertyListT =
5288 __ESIMD_API std::enable_if_t<
5289  detail::is_local_accessor_with_v<AccessorT,
5290  detail::accessor_mode_cap::can_read> &&
5291  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
5292  simd<T, N>>
5293 block_load(AccessorT lacc, PropertyListT props = {}) {
5294  return slm_block_load<T, N>(detail::localAccessorToOffset(lacc), props);
5295 }
5296 
5324 template <typename T, int N, typename AccessorT,
5325  typename PropertyListT =
5327 __ESIMD_API std::enable_if_t<
5328  detail::is_local_accessor_with_v<AccessorT,
5329  detail::accessor_mode_cap::can_read> &&
5330  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
5331  simd<T, N>>
5332 block_load(AccessorT lacc, uint32_t byte_offset, simd_mask<1> pred,
5333  PropertyListT props = {}) {
5334  byte_offset += detail::localAccessorToOffset(lacc);
5335  return slm_block_load<T, N>(byte_offset, pred, props);
5336 }
5337 
5363 template <typename T, int N, typename AccessorT,
5364  typename PropertyListT =
5366 __ESIMD_API std::enable_if_t<
5367  detail::is_local_accessor_with_v<AccessorT,
5368  detail::accessor_mode_cap::can_read> &&
5369  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
5370  simd<T, N>>
5371 block_load(AccessorT lacc, simd_mask<1> pred, PropertyListT props = {}) {
5372  return slm_block_load<T, N>(detail::localAccessorToOffset(lacc), pred, props);
5373 }
5374 
5402 template <typename T, int N, typename AccessorT,
5403  typename PropertyListT =
5405 __ESIMD_API std::enable_if_t<
5406  detail::is_local_accessor_with_v<AccessorT,
5407  detail::accessor_mode_cap::can_read> &&
5408  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
5409  simd<T, N>>
5410 block_load(AccessorT lacc, uint32_t byte_offset, simd_mask<1> pred,
5411  simd<T, N> pass_thru, PropertyListT props = {}) {
5412  byte_offset += __ESIMD_DNS::localAccessorToOffset(lacc);
5413  return slm_block_load<T, N>(byte_offset, pred, pass_thru, props);
5414 }
5415 
5442 template <typename T, int N, typename AccessorT,
5443  typename PropertyListT =
5445 __ESIMD_API std::enable_if_t<
5446  detail::is_local_accessor_with_v<AccessorT,
5447  detail::accessor_mode_cap::can_read> &&
5448  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
5449  simd<T, N>>
5450 block_load(AccessorT lacc, simd_mask<1> pred, simd<T, N> pass_thru,
5451  PropertyListT props = {}) {
5452  return slm_block_load<T, N>(__ESIMD_DNS::localAccessorToOffset(lacc), pred,
5453  pass_thru, props);
5454 }
5455 
5471 template <typename T, int N, typename Flags>
5472 __ESIMD_API std::enable_if_t<is_simd_flag_type_v<Flags>>
5473 slm_block_store(uint32_t offset, simd<T, N> vals, Flags) {
5474  constexpr size_t Align = Flags::template alignment<simd<T, N>>;
5475  __esimd_slm_block_st<detail::__raw_t<T>, N, Align>(offset, vals.data());
5476 }
5477 
5485 
5533 template <typename T, int N,
5534  typename PropertyListT =
5536 __ESIMD_API std::enable_if_t<
5537  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
5538 slm_block_store(uint32_t byte_offset, simd<T, N> vals, simd_mask<1> pred,
5539  PropertyListT props = {}) {
5540  // Verify input template arguments.
5541  constexpr size_t DefaultAlignment = sizeof(T) <= 4 ? 4 : sizeof(T);
5542  constexpr size_t Alignment =
5543  detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
5544  static_assert(
5545  (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
5546  (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
5547  "Incorrect alignment for the data type");
5548 
5549  constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T);
5550  constexpr int SmallIntFactor32Bit =
5551  sizeof(uint32_t) / sizeof(T) > 1 ? sizeof(uint32_t) / sizeof(T) : 1;
5552 
5553  static_assert(N > 0 && N % SmallIntFactor32Bit == 0,
5554  "Number of elements is not supported by Transposed store");
5555 
5556  // If alignment >= 8 and (N * sizeof(T)) % 8 == 0) we can store QWORDs.
5557  // Don't do it for 4-byte vectors (unless it is greater than 256-bytes),
5558  // because it would require a bit-cast, which is supposed to be NO-OP, but
5559  // might confuse GPU BE sometimes. 1- and 2-byte vectors are casted anyways.
5560  constexpr bool Use64BitData =
5561  Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
5562  (N * sizeof(T)) % sizeof(uint64_t) == 0 &&
5563  (sizeof(T) != sizeof(uint32_t) || N * sizeof(T) > 256);
5564  constexpr int SmallIntFactor =
5565  Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
5566  constexpr int FactoredN = N / SmallIntFactor;
5567  detail::check_lsc_vector_size<FactoredN>();
5568 
5569  // Prepare template arguments for the call of intrinsic.
5570  using StoreElemT = __ESIMD_DNS::__raw_t<
5571  std::conditional_t<SmallIntFactor == 1, T,
5572  std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
5573 
5574  constexpr uint16_t AddressScale = 1;
5575  constexpr int ImmOffset = 0;
5576  constexpr detail::lsc_data_size DS =
5578  constexpr auto VS = detail::to_lsc_vector_size<FactoredN>();
5579  constexpr auto Transposed = detail::lsc_data_order::transpose;
5580  constexpr int NLanes = 1;
5581 
5582  // Prepare non-template arguments and call the intrinsic.
5583  simd<uint32_t, NLanes> Offsets = byte_offset;
5584  __esimd_lsc_store_slm<StoreElemT, cache_hint::none, cache_hint::none,
5585  AddressScale, ImmOffset, DS, VS, Transposed, NLanes>(
5586  pred.data(), Offsets.data(),
5587  sycl::bit_cast<__ESIMD_DNS::vector_type_t<StoreElemT, FactoredN>>(
5588  vals.data()));
5589 }
5590 
5606 template <typename T, int N,
5607  typename PropertyListT =
5609 __ESIMD_API std::enable_if_t<
5610  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
5611 slm_block_store(uint32_t byte_offset, simd<T, N> vals,
5612  PropertyListT props = {}) {
5613  constexpr size_t DefaultAlignment = detail::OperandSize::OWORD;
5614  constexpr size_t Alignment =
5615  detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
5616  using StoreElemT = detail::__raw_t<T>;
5617  __esimd_slm_block_st<StoreElemT, N, Alignment>(
5618  byte_offset,
5619  sycl::bit_cast<__ESIMD_DNS::vector_type_t<StoreElemT, N>>(vals.data()));
5620 }
5621 
5638 template <typename T, int N, typename AccessorT,
5639  typename PropertyListT =
5641 __ESIMD_API std::enable_if_t<
5642  detail::is_local_accessor_with_v<AccessorT,
5643  detail::accessor_mode_cap::can_write> &&
5644  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
5645 block_store(AccessorT lacc, uint32_t byte_offset, simd<T, N> vals,
5646  PropertyListT props = {}) {
5647  byte_offset += detail::localAccessorToOffset(lacc);
5648  slm_block_store<T, N>(byte_offset, vals, props);
5649 }
5650 
/// Local-accessor overload of block_store with no explicit offset:
/// stores \p vals at the very beginning of the accessor's SLM allocation.
template <typename T, int N, typename AccessorT,
          typename PropertyListT =
__ESIMD_API std::enable_if_t<
    detail::is_local_accessor_with_v<AccessorT,
                                     detail::accessor_mode_cap::can_write> &&
    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
block_store(AccessorT lacc, simd<T, N> vals, PropertyListT props = {}) {
  // Offset 0 relative to the accessor == the accessor's SLM base offset.
  slm_block_store<T, N>(detail::localAccessorToOffset(lacc), vals, props);
}
5676 
/// Predicated local-accessor overload of block_store. The single-element
/// \p pred enables/disables the whole store; offset handling matches the
/// unpredicated overload above.
template <typename T, int N, typename AccessorT,
          typename PropertyListT =
__ESIMD_API std::enable_if_t<
    detail::is_local_accessor_with_v<AccessorT,
                                     detail::accessor_mode_cap::can_write> &&
    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
block_store(AccessorT lacc, uint32_t byte_offset, simd<T, N> vals,
            simd_mask<1> pred, PropertyListT props = {}) {
  // Translate accessor-relative offset to an absolute SLM byte offset.
  byte_offset += detail::localAccessorToOffset(lacc);
  slm_block_store<T, N>(byte_offset, vals, pred, props);
}
5716 
/// Predicated local-accessor overload of block_store with no explicit
/// offset: stores \p vals at the accessor's SLM base when \p pred is set.
template <typename T, int N, typename AccessorT,
          typename PropertyListT =
__ESIMD_API std::enable_if_t<
    detail::is_local_accessor_with_v<AccessorT,
                                     detail::accessor_mode_cap::can_write> &&
    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
block_store(AccessorT lacc, simd<T, N> vals, simd_mask<1> pred,
            PropertyListT props = {}) {
  slm_block_store<T, N>(detail::localAccessorToOffset(lacc), vals, pred, props);
}
5753 namespace detail {
5754 
5755 // lsc_atomic_update() operations may share atomic_op values for data types
5756 // of the same (fp vs integral) class for convenience (e.g. re-use 'fmax' for
5757 // all FP types). In fact those data types may require using different internal
5758 // opcodes. This function returns the corresponding internal opcode for
5759 // the input type 'T' and operation 'Op'.
5760 template <typename T, __ESIMD_NS::atomic_op Op>
5761 constexpr int lsc_to_internal_atomic_op() {
5762  constexpr __ESIMD_NS::native::lsc::atomic_op LSCOp =
5763  __ESIMD_DNS::to_lsc_atomic_op<Op>();
5764  return static_cast<int>(LSCOp);
5765 }
5766 
5780 
/// Zero-operand (e.g. increment/decrement style) SLM atomic, implemented on
/// top of the LSC intrinsic. Returns the per-lane old values.
template <atomic_op Op, typename T, int N, lsc_data_size DS>
__ESIMD_API std::enable_if_t<get_num_args<Op>() == 0, simd<T, N>>
  check_lsc_data_size<T, DS>();
  check_atomic<Op, T, N, 0, /*IsLSC*/ true>();
  // Fixed intrinsic parameters: no address scaling, no immediate offset,
  // one element per lane, non-transposed layout.
  constexpr uint16_t AddressScale = 1;
  constexpr int ImmOffset = 0;
  // Sub-dword element types are widened to a 32-bit message data size.
  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
  using MsgT = typename lsc_expand_type<T>::type;
  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
  simd<MsgT, N> Tmp =
      __esimd_lsc_xatomic_slm_0<MsgT, IOp, cache_hint::none, cache_hint::none,
                                AddressScale, ImmOffset, EDS, VS, Transposed,
                                N>(pred.data(), offsets.data());
  // Narrow the widened message result back to T.
  return lsc_format_ret<T>(Tmp);
}
5799 
/// One-operand SLM atomic (e.g. add/min/max/store) via the LSC intrinsic.
/// Returns the per-lane old values.
template <atomic_op Op, typename T, int N, lsc_data_size DS>
__ESIMD_API std::enable_if_t<get_num_args<Op>() == 1, simd<T, N>>
                      simd_mask<N> pred) {
  check_lsc_data_size<T, DS>();
  check_atomic<Op, T, N, 1, /*IsLSC*/ true>();
  constexpr uint16_t AddressScale = 1;
  constexpr int ImmOffset = 0;
  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
  if constexpr (std::is_same_v<T, double> || std::is_same_v<T, float>) {
    // float/double go to the intrinsic natively, without widening.
    return __esimd_lsc_xatomic_slm_1<T, IOp, cache_hint::none, cache_hint::none,
                                     AddressScale, ImmOffset, EDS, VS,
                                     Transposed, N>(pred.data(), offsets.data(),
                                                    src0.data());
  } else {
    // Other types are widened to the 32-bit message type for the operand,
    // and the result is narrowed back to T afterwards.
    using MsgT = typename lsc_expand_type<T>::type;
    simd<MsgT, N> Msg_data = lsc_format_input<MsgT>(src0);
    simd<MsgT, N> Tmp =
        __esimd_lsc_xatomic_slm_1<MsgT, IOp, cache_hint::none, cache_hint::none,
                                  AddressScale, ImmOffset, EDS, VS, Transposed,
                                  N>(pred.data(), offsets.data(),
                                     Msg_data.data());
    return lsc_format_ret<T>(Tmp);
  }
}
5842 
/// Two-operand SLM atomic (e.g. cmpxchg/fcmpxchg) via the LSC intrinsic.
/// Returns the per-lane old values.
template <atomic_op Op, typename T, int N, lsc_data_size DS>
                      simd_mask<N> pred) {
  check_lsc_data_size<T, DS>();
  check_atomic<Op, T, N, 2, /*IsLSC*/ true>();
  constexpr uint16_t AddressScale = 1;
  constexpr int ImmOffset = 0;
  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
  if constexpr (std::is_same_v<T, double> || std::is_same_v<T, float>) {
    // float/double are passed through natively.
    return __esimd_lsc_xatomic_slm_2<T, IOp, cache_hint::none, cache_hint::none,
                                     AddressScale, ImmOffset, EDS, VS,
                                     Transposed, N>(pred.data(), offsets.data(),
                                                    src0.data(), src1.data());
  } else {
    // Both operands are widened to the 32-bit message type; the result is
    // narrowed back to T.
    using MsgT = typename lsc_expand_type<T>::type;
    simd<MsgT, N> Msg_data0 = lsc_format_input<MsgT>(src0);
    simd<MsgT, N> Msg_data1 = lsc_format_input<MsgT>(src1);
    simd<MsgT, N> Tmp =
        __esimd_lsc_xatomic_slm_2<MsgT, IOp, cache_hint::none, cache_hint::none,
                                  AddressScale, ImmOffset, EDS, VS, Transposed,
                                  N>(pred.data(), offsets.data(),
                                     Msg_data0.data(), Msg_data1.data());
    return lsc_format_ret<T>(Tmp);
  }
}
5887 
5888 } // namespace detail
5889 
5893 
5897 
5902 
5907 
5909 
/// Zero-operand SLM atomic update. Dispatches between the LSC path and the
/// legacy dword-atomic path depending on type size and vector length.
template <atomic_op Op, typename T, int N>
__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 0, simd<T, N>>
  // 2 byte, 8 byte types, non-power of two, and operations wider than
  // 32 are supported only by LSC.
  if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
                !__ESIMD_DNS::isPowerOf2(N, 32)) {
    return slm_atomic_update_impl<Op, T, N,
        byte_offset, mask);
  } else if constexpr (Op == atomic_op::load) {
    // Legacy dword atomics have no 'load'; emulate it as (x |= 0), which
    // returns the old value without changing memory.
    if constexpr (std::is_integral_v<T>) {
      return slm_atomic_update<atomic_op::bit_or, T, N>(byte_offset,
                                                        simd<T, N>(0), mask);
    } else {
      // For FP types do the bit_or on the same-size unsigned integer type
      // and bit-cast the result back.
      using Tint = detail::uint_type_t<sizeof(T)>;
      simd<Tint, N> Res = slm_atomic_update<atomic_op::bit_or, Tint, N>(
          byte_offset, simd<Tint, N>(0), mask);
      return Res.template bit_cast_view<T>();
    }
  } else {
    // Legacy dword-atomic path on the SLM surface.
    detail::check_atomic<Op, T, N, 0>();
    return __esimd_dword_atomic0<Op, T, N>(mask.data(), si, byte_offset.data());
  }
}
5953 
5962 template <atomic_op Op, typename T, int N, typename AccessorT>
5963 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 0 &&
5964  __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
5965  simd<T, N>>
5966 atomic_update(AccessorT lacc, simd<uint32_t, N> byte_offset,
5967  simd_mask<N> mask = 1) {
5968  byte_offset += detail::localAccessorToOffset(lacc);
5969  return slm_atomic_update<Op, T, N>(byte_offset, mask);
5970 }
5971 
5973 
5979 
5986 
5988 
/// One-operand SLM atomic update. FP operations, non-dword types and
/// unsupported vector lengths are routed to LSC; 'store' is emulated via
/// 'xchg'; everything else uses the legacy dword-atomic path.
template <atomic_op Op, typename T, int N>
__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1, simd<T, N>>
                  simd_mask<N> mask = 1) {
  // Non-LSC atomic_update supports only 4-byte int vector operations with
  // 1,2,4,8,16,32 vector length. Non-LSC supports only 'store' for FP types.
  if constexpr (Op == atomic_op::fmin || Op == atomic_op::fmax ||
                Op == atomic_op::fadd || Op == atomic_op::fsub ||
                sizeof(T) != 4 || !__ESIMD_DNS::isPowerOf2(N, 32)) {
    return slm_atomic_update_impl<Op, T, N,
        byte_offset, src0, mask);
  } else if constexpr (Op == atomic_op::store) {
    // 'store' is emulated with 'xchg'; the old value is discarded by the
    // caller if unwanted.
    if constexpr (std::is_integral_v<T>) {
      return slm_atomic_update<atomic_op::xchg, T, N>(byte_offset, src0, mask);
    } else {
      // FP store: perform the xchg on the same-size unsigned integer type.
      using Tint = detail::uint_type_t<sizeof(T)>;
      simd<Tint, N> Res = slm_atomic_update<atomic_op::xchg, Tint, N>(
          byte_offset, src0.template bit_cast_view<Tint>(), mask);
      return Res.template bit_cast_view<T>();
    }
  } else {
    // Legacy dword-atomic path on the SLM surface.
    detail::check_atomic<Op, T, N, 1>();
    return __esimd_dword_atomic1<Op, T, N>(mask.data(), si, byte_offset.data(),
                                           src0.data());
  }
}
6034 
6053 template <atomic_op Op, typename T, int N, typename AccessorT>
6054 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1 &&
6055  __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
6056  simd<T, N>>
6057 atomic_update(AccessorT lacc, simd<uint32_t, N> byte_offset, simd<T, N> src0,
6058  simd_mask<N> mask = 1) {
6059  byte_offset += detail::localAccessorToOffset(lacc);
6060  return slm_atomic_update<Op, T, N>(byte_offset, src0, mask);
6061 }
6062 
6064 
6069 
6077 
/// Two-operand SLM atomic update (cmpxchg family). Non-dword types,
/// fcmpxchg and unsupported lengths go to LSC; otherwise the legacy
/// dword-atomic path is used.
template <atomic_op Op, typename T, int N>
__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2, simd<T, N>>
                  simd<T, N> src1, simd_mask<N> mask = 1) {
  // Non-LSC atomic_update supports only 4-byte int vector operations with
  // 1,2,4,8,16,32 vector length.
  if constexpr (sizeof(T) != 4 || Op == atomic_op::fcmpxchg ||
                !__ESIMD_DNS::isPowerOf2(N, 32)) {
    // 2-argument lsc_atomic_update arguments order matches the standard one -
    // expected value first, then new value. But atomic_update uses reverse
    // order, hence the src1/src0 swap.
    return detail::slm_atomic_update_impl<Op, T, N,
        byte_offset, src1, src0, mask);
  } else {
    // Legacy dword-atomic path on the SLM surface.
    detail::check_atomic<Op, T, N, 2>();
    return __esimd_dword_atomic2<Op, T, N>(mask.data(), si, byte_offset.data(),
                                           src0.data(), src1.data());
  }
}
6116 
6123 template <atomic_op Op, typename T, int N, typename AccessorT>
6124 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
6125  __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
6126  simd<T, N>>
6127 atomic_update(AccessorT lacc, simd<uint32_t, N> byte_offset, simd<T, N> src0,
6128  simd<T, N> src1, simd_mask<N> mask = 1) {
6129  byte_offset += detail::localAccessorToOffset(lacc);
6130  return slm_atomic_update<Op, T, N>(byte_offset, src0, src1, mask);
6131 }
6132 
6134 
6135 namespace detail {
6136 
/// Zero-operand stateless (USM-pointer) atomic via the LSC intrinsic.
/// Per-lane address = p + offsets[lane]; returns the per-lane old values.
template <atomic_op Op, typename T, int N, lsc_data_size DS,
          typename PropertyListT, typename Toffset>
__ESIMD_API std::enable_if_t<get_num_args<Op>() == 0, simd<T, N>>
  static_assert(sizeof(T) > 1, "Unsupported data type");
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
  check_atomic<Op, T, N, 0, /*IsLSC*/ true>();
  check_lsc_data_size<T, DS>();
  // Only atomic-compatible cache-hint combinations are accepted.
  check_cache_hints<cache_action::atomic, PropertyListT>();
  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
  constexpr uint16_t AddressScale = 1;
  constexpr int ImmOffset = 0;
  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
  using MsgT = typename lsc_expand_type<T>::type;
  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
  // Build the full 64-bit per-lane addresses: base pointer + byte offsets.
  simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
  addrs += convert<uintptr_t>(offsets);
  simd<MsgT, N> Tmp =
      __esimd_lsc_xatomic_stateless_0<MsgT, IOp, L1H, L2H, AddressScale,
                                      ImmOffset, EDS, VS, Transposed, N>(
          pred.data(), addrs.data());
  // Narrow the widened message result back to T.
  return lsc_format_ret<T>(Tmp);
}
6176 
/// One-operand stateless (USM-pointer) atomic via the LSC intrinsic.
/// Per-lane address = p + offsets[lane]; returns the per-lane old values.
template <atomic_op Op, typename T, int N, lsc_data_size DS,
          typename PropertyListT, typename Toffset>
__ESIMD_API std::enable_if_t<get_num_args<Op>() == 1, simd<T, N>>
                   simd_mask<N> pred) {
  static_assert(sizeof(T) > 1, "Unsupported data type");
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
  check_lsc_data_size<T, DS>();
  check_atomic<Op, T, N, 1, /*IsLSC*/ true>();
  // Only atomic-compatible cache-hint combinations are accepted.
  check_cache_hints<cache_action::atomic, PropertyListT>();
  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
  constexpr uint16_t AddressScale = 1;
  constexpr int ImmOffset = 0;
  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
  using MsgT = typename lsc_expand_type<T>::type;
  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
  // Widen the operand to the 32-bit message type expected by LSC.
  simd<MsgT, N> Msg_data = lsc_format_input<MsgT>(src0);
  // Build the full 64-bit per-lane addresses: base pointer + byte offsets.
  simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
  addrs += convert<uintptr_t>(offsets);
  simd<MsgT, N> Tmp =
      __esimd_lsc_xatomic_stateless_1<MsgT, IOp, L1H, L2H, AddressScale,
                                      ImmOffset, EDS, VS, Transposed, N>(
          pred.data(), addrs.data(), Msg_data.data());
  // Narrow the widened message result back to T.
  return lsc_format_ret<T>(Tmp);
}
6219 
/// Two-operand stateless (USM-pointer) atomic via the LSC intrinsic.
/// Per-lane address = p + offsets[lane]; returns the per-lane old values.
template <atomic_op Op, typename T, int N, lsc_data_size DS,
          typename PropertyListT, typename Toffset>
__ESIMD_API std::enable_if_t<get_num_args<Op>() == 2, simd<T, N>>
                   simd<T, N> src1, simd_mask<N> pred) {
  static_assert(sizeof(T) > 1, "Unsupported data type");
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
  check_lsc_data_size<T, DS>();
  check_atomic<Op, T, N, 2, /*IsLSC*/ true>();
  // Only atomic-compatible cache-hint combinations are accepted.
  check_cache_hints<cache_action::atomic, PropertyListT>();
  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
  constexpr uint16_t AddressScale = 1;
  constexpr int ImmOffset = 0;
  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
  using MsgT = typename lsc_expand_type<T>::type;
  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
  // Widen both operands to the 32-bit message type expected by LSC.
  simd<MsgT, N> Msg_data0 = lsc_format_input<MsgT>(src0);
  simd<MsgT, N> Msg_data1 = lsc_format_input<MsgT>(src1);
  // Build the full 64-bit per-lane addresses: base pointer + byte offsets.
  simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
  addrs += convert<uintptr_t>(offsets);
  simd<MsgT, N> Tmp =
      __esimd_lsc_xatomic_stateless_2<MsgT, IOp, L1H, L2H, AddressScale,
                                      ImmOffset, EDS, VS, Transposed, N>(
          pred.data(), addrs.data(), Msg_data0.data(), Msg_data1.data());
  // Narrow the widened message result back to T.
  return lsc_format_ret<T>(Tmp);
}
6264 
/// Zero-operand accessor-based atomic. In forced-stateless mode the
/// accessor is converted to a USM pointer and the stateless overload is
/// used; otherwise the BTI (surface-index) LSC intrinsic is called.
template <atomic_op Op, typename T, int N,
          typename PropertyListT, typename AccessorTy, typename Toffset>
__ESIMD_API
    std::enable_if_t<get_num_args<Op>() == 0 &&
                         __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy>,
                     simd<T, N>>
    atomic_update_impl(AccessorTy acc, simd<Toffset, N> byte_offsets,
                       simd_mask<N> pred) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  // Stateless build: lower the accessor to a raw pointer.
  return atomic_update_impl<Op, T, N, DS, PropertyListT>(
      accessorToPointer<T>(acc), byte_offsets, pred);
#else
  static_assert(sizeof(T) > 1, "Unsupported data type");
  // Stateful (BTI) addressing uses 32-bit surface offsets only.
  static_assert(std::is_integral_v<Toffset> && sizeof(Toffset) == 4,
                "Unsupported offset type");
  check_lsc_data_size<T, DS>();
  check_atomic<Op, T, N, 0, /*IsLSC*/ true>();
  check_cache_hints<cache_action::atomic, PropertyListT>();
  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
  constexpr uint16_t AddressScale = 1;
  constexpr int ImmOffset = 0;
  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
  using MsgT = typename lsc_expand_type<T>::type;
  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
  auto si = get_surface_index(acc);
  simd<MsgT, N> Tmp =
      __esimd_lsc_xatomic_bti_0<MsgT, IOp, L1H, L2H, AddressScale, ImmOffset,
                                EDS, VS, Transposed, N>(
          pred.data(), byte_offsets.data(), si);
  // Narrow the widened message result back to T.
  return lsc_format_ret<T>(Tmp);
#endif
}
6316 
/// One-operand accessor-based atomic. In forced-stateless mode the
/// accessor is converted to a USM pointer and the stateless overload is
/// used; otherwise the BTI (surface-index) LSC intrinsic is called.
template <atomic_op Op, typename T, int N, lsc_data_size DS,
          typename PropertyListT, typename AccessorTy, typename Toffset>
__ESIMD_API
    std::enable_if_t<get_num_args<Op>() == 1 &&
                         __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy>,
                     simd<T, N>>
    atomic_update_impl(AccessorTy acc, simd<Toffset, N> byte_offset,
                       simd<T, N> src0, simd_mask<N> pred) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  // Stateless build: lower the accessor to a raw pointer.
  return atomic_update_impl<Op, T, N, DS, PropertyListT>(
      accessorToPointer<T>(acc), byte_offset, src0, pred);
#else
  static_assert(sizeof(T) > 1, "Unsupported data type");
  // Stateful (BTI) addressing uses 32-bit surface offsets only.
  static_assert(std::is_integral_v<Toffset> && sizeof(Toffset) == 4,
                "Unsupported offset type");
  check_lsc_data_size<T, DS>();
  check_atomic<Op, T, N, 1, /*IsLSC*/ true>();
  check_cache_hints<cache_action::atomic, PropertyListT>();
  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
  constexpr uint16_t AddressScale = 1;
  constexpr int ImmOffset = 0;
  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
  using MsgT = typename lsc_expand_type<T>::type;
  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
  // Widen the operand to the 32-bit message type expected by LSC.
  simd<MsgT, N> Src0Msg = lsc_format_input<MsgT>(src0);
  auto si = get_surface_index(acc);
  simd<MsgT, N> Tmp =
      __esimd_lsc_xatomic_bti_1<MsgT, IOp, L1H, L2H, AddressScale, ImmOffset,
                                EDS, VS, Transposed, N>(
          pred.data(), byte_offset.data(), Src0Msg.data(), si);
  // Narrow the widened message result back to T.
  return lsc_format_ret<T>(Tmp);
#endif
}
6370 
/// Two-operand accessor-based atomic. In forced-stateless mode the
/// accessor is converted to a USM pointer and the stateless overload is
/// used; otherwise the BTI (surface-index) LSC intrinsic is called.
template <atomic_op Op, typename T, int N, lsc_data_size DS,
          typename PropertyListT, typename AccessorTy, typename Toffset>
__ESIMD_API
    std::enable_if_t<get_num_args<Op>() == 2 &&
                         __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy>,
                     simd<T, N>>
    atomic_update_impl(AccessorTy acc, simd<Toffset, N> byte_offset,
#ifdef __ESIMD_FORCE_STATELESS_MEM
  // Stateless build: lower the accessor to a raw pointer.
  return atomic_update_impl<Op, T, N, DS, PropertyListT>(
      __ESIMD_DNS::accessorToPointer<T>(acc), byte_offset, src0, src1, pred);
#else
  // Stateful (BTI) addressing uses 32-bit surface offsets only.
  static_assert(std::is_integral_v<Toffset> && sizeof(Toffset) == 4,
                "Unsupported offset type");
  check_lsc_vector_size<1>();
  check_lsc_data_size<T, DS>();
  check_atomic<Op, T, N, 2, /*IsLSC*/ true>();
  check_cache_hints<cache_action::atomic, PropertyListT>();
  constexpr auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
  constexpr auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
  constexpr uint16_t AddressScale = 1;
  constexpr int ImmOffset = 0;
  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
  using MsgT = typename lsc_expand_type<T>::type;
  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
  // Widen both operands to the 32-bit message type expected by LSC.
  simd<MsgT, N> Msg_data0 = lsc_format_input<MsgT>(src0);
  simd<MsgT, N> Msg_data1 = lsc_format_input<MsgT>(src1);
  auto si = get_surface_index(acc);
  simd<MsgT, N> Tmp =
      __esimd_lsc_xatomic_bti_2<MsgT, IOp, L1H, L2H, AddressScale, ImmOffset,
                                EDS, VS, Transposed, N>(
          pred.data(), byte_offset.data(), Msg_data0.data(), Msg_data1.data(),
          si);
  // Narrow the widened message result back to T.
  return lsc_format_ret<T>(Tmp);
#endif
}
6427 } // namespace detail
6428 
6431 
6469 // Other properties are ignored.
/// Zero-operand USM atomic update with mask. Dispatch tiers:
/// 1) LSC path when cache hints are given, N is unsupported by legacy
///    atomics, or the element is sub-dword;
/// 2) N==16/32 is split into N==8 legacy calls (driver workaround);
/// 3) 'load' is emulated via bit_or with 0;
/// 4) otherwise the legacy SVM dword atomic is used.
template <atomic_op Op, typename T, int N, typename Toffset,
          typename PropertyListT =
__ESIMD_API std::enable_if_t<
    __ESIMD_DNS::get_num_args<Op>() == 0 &&
        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
    simd<T, N>>
                  PropertyListT props = {}) {
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");

  if constexpr (detail::has_cache_hints<PropertyListT>() ||
                !__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) {
        Op, T, N, detail::lsc_data_size::default_size, PropertyListT, Toffset>(
        p, byte_offset, mask);
  } else if constexpr (N == 16 || N == 32) {
    // TODO: In fact GPU BE supports legalization for any N, even for
    // non-power-of-2, but it is implemented with an error now. For example,
    // N=17 is emulated as 2 calls (N=16 and N=1), while it must be 3 calls:
    // (N=8, N=8, N=1). I.e. Gen12 atomic instruction supports only N up to 8
    // and GPU thinks now it is up to 16.
    // Thus we emulate N=16 with 2 calls with N=8 each.
    // N=32 is emulated with 4 calls with N=8 each.
    // Task1: Remove the special-case emulation for N=16 and N=32 below when
    // GPU driver fixes the error.
    // Task2: remove the condition "!__ESIMD_DNS::isPowerOf2(N, 32)" above
    // and let svm.atomic for any N.

    simd<T, N> Res;
    for (int I = 0; I < N; I += 8) {
      simd_mask<8> Mask8 = mask.template select<8, 1>(I);
      simd<Toffset, 8> ByteOffset8 = byte_offset.template select<8, 1>(I);
      Res.template select<8, 1>(I) =
          atomic_update<Op, T, 8>(p, ByteOffset8, Mask8, props);
    }
    return Res;
  } else if constexpr (Op == atomic_op::load) {
    // Legacy atomics have no 'load'; emulate it as (x |= 0), which returns
    // the old value without modifying memory.
    if constexpr (std::is_integral_v<T>) {
      return atomic_update<atomic_op::bit_or, T, N>(p, byte_offset,
                                                    simd<T, N>(0), mask, props);
    } else {
      // FP load: do the bit_or on the same-size unsigned integer type.
      using Tint = detail::uint_type_t<sizeof(T)>;
      simd<Tint, N> Res = atomic_update<atomic_op::bit_or, Tint, N>(
          reinterpret_cast<Tint *>(p), byte_offset, simd<Tint, N>(0), mask,
          props);
      return Res.template bit_cast_view<T>();
    }
  } else {
    // Legacy SVM dword atomic on fully-formed 64-bit addresses.
    detail::check_atomic<Op, T, N, 0>();
    simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
    simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(byte_offset);
    vAddr += offset_i1;
    using Tx = typename detail::__raw_t<T>;
    return __esimd_svm_atomic0<Op, Tx, N>(vAddr.data(), mask.data());
  }
}
6530 
/// Convenience overload: zero-operand USM atomic update with all lanes
/// enabled (mask defaults to all-ones).
template <atomic_op Op, typename T, int N, typename Toffset,
          typename PropertyListT =
__ESIMD_API std::enable_if_t<
    __ESIMD_DNS::get_num_args<Op>() == 0 &&
        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
    simd<T, N>>
atomic_update(T *p, simd<Toffset, N> byte_offset, PropertyListT props = {}) {
  simd_mask<N> mask = 1;
  return atomic_update<Op, T, N>(p, byte_offset, mask, props);
}
6560 
/// simd_view-offset overload: materializes the offsets via read() and
/// forwards to the simd-offset overload.
template <atomic_op Op, typename T, int N, typename OffsetSimdViewT,
          typename PropertyListT =
__ESIMD_API std::enable_if_t<
    __ESIMD_DNS::get_num_args<Op>() == 0 &&
        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
        detail::is_simd_view_type_v<OffsetSimdViewT>,
    simd<T, N>>
atomic_update(T *p, OffsetSimdViewT offsets, simd_mask<N> mask,
              PropertyListT props = {}) {
  return atomic_update<Op, T, N>(p, offsets.read(), mask, props);
}
6593 
/// simd_view-offset overload without a mask: all lanes enabled.
template <atomic_op Op, typename T, int N, typename OffsetSimdViewT,
          typename PropertyListT =
__ESIMD_API std::enable_if_t<
    __ESIMD_DNS::get_num_args<Op>() == 0 &&
        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
        detail::is_simd_view_type_v<OffsetSimdViewT>,
    simd<T, N>>
atomic_update(T *p, OffsetSimdViewT byte_offset, PropertyListT props = {}) {
  simd_mask<N> mask = 1;
  return atomic_update<Op, T, N>(p, byte_offset.read(), mask, props);
}
6624 
6639 template <atomic_op Op, typename T, int N, typename Toffset>
6640 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>, simd<T, N>>
6641 atomic_update(T *p, Toffset byte_offset, simd_mask<N> mask = 1) {
6642  return atomic_update<Op, T, N>(p, simd<Toffset, N>(byte_offset), mask);
6643 }
6644 
6664 
/// One-operand USM atomic update with mask. Dispatch tiers:
/// 1) LSC path for cache hints, FP ops (fmin/fmax/fadd/fsub), sub-dword
///    element types, or N unsupported by legacy atomics;
/// 2) N==16/32 is split into N==8 legacy calls (driver workaround);
/// 3) 'store' is emulated via 'xchg';
/// 4) otherwise the legacy SVM dword atomic is used.
template <atomic_op Op, typename T, int N, typename Toffset,
          typename PropertyListT =
__ESIMD_API std::enable_if_t<
    __ESIMD_DNS::get_num_args<Op>() == 1 &&
        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
    simd<T, N>>
                  simd_mask<N> mask, PropertyListT props = {}) {
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");

  // Auto-convert FP atomics to LSC version.
  if constexpr (detail::has_cache_hints<PropertyListT>() ||
                (Op == atomic_op::fmin) || (Op == atomic_op::fmax) ||
                (Op == atomic_op::fadd) || (Op == atomic_op::fsub) ||
                !__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) {
        Op, T, N, detail::lsc_data_size::default_size, PropertyListT, Toffset>(
        p, byte_offset, src0, mask);
  } else if constexpr (N == 16 || N == 32) {
    // TODO: In fact GPU BE supports legalization for any N, even for
    // non-power-of-2, but it is implemented with an error now. For example,
    // N=17 is emulated as 2 calls (N=16 and N=1), while it must be 3 calls:
    // (N=8, N=8, N=1). I.e. Gen12 atomic instruction supports only N up to 8
    // and GPU thinks now it is up to 16.
    // Thus we emulate N=16 with 2 calls with N=8 each.
    // N=32 is emulated with 4 calls with N=8 each.
    // Task1: Remove the special-case emulation for N=16 and N=32 below when
    // GPU driver fixes the error.
    // Task2: remove the condition "!__ESIMD_DNS::isPowerOf2(N, 32)" above
    // and let svm.atomic for any N.
    simd<T, N> Res;
    for (int I = 0; I < N; I += 8) {
      simd_mask<8> Mask8 = mask.template select<8, 1>(I);
      simd<Toffset, 8> ByteOffset8 = byte_offset.template select<8, 1>(I);
      simd<T, 8> Src08 = src0.template select<8, 1>(I);
      Res.template select<8, 1>(I) =
          atomic_update<Op, T, 8>(p, ByteOffset8, Src08, Mask8, props);
    }
    return Res;
  } else if constexpr (Op == atomic_op::store) {
    // 'store' is emulated with 'xchg' (old value returned).
    if constexpr (std::is_integral_v<T>) {
      return atomic_update<atomic_op::xchg, T, N>(p, byte_offset, src0, mask,
                                                  props);
    } else {
      // FP store: perform the xchg on the same-size unsigned integer type.
      using Tint = detail::uint_type_t<sizeof(T)>;
      simd<Tint, N> Res = atomic_update<atomic_op::xchg, Tint, N>(
          reinterpret_cast<Tint *>(p), byte_offset,
          src0.template bit_cast_view<Tint>(), mask, props);
      return Res.template bit_cast_view<T>();
    }
  } else {
    // Legacy SVM dword atomic on fully-formed 64-bit addresses.
    detail::check_atomic<Op, T, N, 1>();
    simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
    simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(byte_offset);
    vAddr += offset_i1;

    using Tx = typename detail::__raw_t<T>;
    return __esimd_svm_atomic1<Op, Tx, N>(vAddr.data(), src0.data(),
                                          mask.data());
  }
}
6755 
6759 
6761 
/// Convenience overload: one-operand USM atomic update with all lanes
/// enabled (mask defaults to all-ones).
template <atomic_op Op, typename T, int N, typename Toffset,
          typename PropertyListT =
__ESIMD_API std::enable_if_t<
    __ESIMD_DNS::get_num_args<Op>() == 1 &&
        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
    simd<T, N>>
                  PropertyListT props = {}) {
  simd_mask<N> mask = 1;
  return atomic_update<Op, T, N>(p, byte_offset, src0, mask, props);
}
6791 
/// simd_view-offset overload: materializes the offsets via read() and
/// forwards to the simd-offset overload.
template <atomic_op Op, typename T, int N, typename OffsetSimdViewT,
          typename PropertyListT =
__ESIMD_API std::enable_if_t<
    __ESIMD_DNS::get_num_args<Op>() == 1 &&
        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
        detail::is_simd_view_type_v<OffsetSimdViewT>,
    simd<T, N>>
atomic_update(T *p, OffsetSimdViewT offsets, simd<T, N> src0, simd_mask<N> mask,
              PropertyListT props = {}) {
  return atomic_update<Op, T, N>(p, offsets.read(), src0, mask, props);
}
6831 
/// simd_view-offset overload without a mask: all lanes enabled.
template <atomic_op Op, typename T, int N, typename OffsetSimdViewT,
          typename PropertyListT =
__ESIMD_API std::enable_if_t<
    __ESIMD_DNS::get_num_args<Op>() == 1 &&
        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
        detail::is_simd_view_type_v<OffsetSimdViewT>,
    simd<T, N>>
atomic_update(T *p, OffsetSimdViewT offsets, simd<T, N> src0,
              PropertyListT props = {}) {
  simd_mask<N> mask = 1;
  return atomic_update<Op, T, N>(p, offsets.read(), src0, mask, props);
}
6870 
6889 template <atomic_op Op, typename Tx, int N, typename Toffset>
6890 __ESIMD_API std::enable_if_t<
6891  std::is_integral_v<Toffset> &&
6892  ((Op != atomic_op::store && Op != atomic_op::xchg) || N == 1),
6893  simd<Tx, N>>
6894 atomic_update(Tx *p, Toffset byte_offset, simd<Tx, N> src0, simd_mask<N> mask) {
6895  return atomic_update<Op, Tx, N>(p, simd<Toffset, N>(byte_offset), src0, mask);
6896 }
6897 
6922 
6940 // Other properties are ignored.
/// Two-operand USM atomic update (cmpxchg family) with mask. Dispatch:
/// 1) LSC path for cache hints, fcmpxchg, sub-dword types, or N
///    unsupported by legacy atomics (note the src1/src0 swap);
/// 2) N==16/32 is split into N==8 legacy calls (driver workaround);
/// 3) otherwise the legacy SVM dword atomic is used.
template <atomic_op Op, typename T, int N, typename Toffset,
          typename PropertyListT =
__ESIMD_API std::enable_if_t<
    __ESIMD_DNS::get_num_args<Op>() == 2 &&
        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
    simd<T, N>>
                  simd<T, N> src1, simd_mask<N> mask, PropertyListT props = {}) {
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");

  // Use LSC atomic when cache hints are present, FP atomics is used,
  // non-power of two length is used, or operation width greater than 32, or the
  // data size is less than 4 bytes.
  if constexpr (detail::has_cache_hints<PropertyListT>() ||
                Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32) ||
                sizeof(T) < 4) {
    // 2-argument lsc_atomic_update arguments order matches the standard one -
    // expected value first, then new value. But atomic_update uses reverse
    // order, hence the src1/src0 swap.
        Op, T, N, detail::lsc_data_size::default_size, PropertyListT, Toffset>(
        p, byte_offset, src1, src0, mask);
  } else if constexpr (N == 16 || N == 32) {
    // TODO: In fact GPU BE supports legalization for any N, even for
    // non-power-of-2, but it is implemented with an error now. For example,
    // N=17 is emulated as 2 calls (N=16 and N=1), while it must be 3 calls:
    // (N=8, N=8, N=1). I.e. Gen12 atomic instruction supports only N up to 8
    // and GPU thinks now it is up to 16.
    // Thus we emulate N=16 with 2 calls with N=8 each.
    // N=32 is emulated with 4 calls with N=8 each.
    // Task1: Remove the special-case emulation for N=16 and N=32 below when
    // GPU driver fixes the error.
    // Task2: remove the condition "!__ESIMD_DNS::isPowerOf2(N, 32)" above
    // and let svm.atomic for any N.
    simd<T, N> Res;
    for (int I = 0; I < N; I += 8) {
      simd_mask<8> Mask8 = mask.template select<8, 1>(I);
      simd<Toffset, 8> ByteOffset8 = byte_offset.template select<8, 1>(I);
      simd<T, 8> Src08 = src0.template select<8, 1>(I);
      simd<T, 8> Src18 = src1.template select<8, 1>(I);
      Res.template select<8, 1>(I) =
          atomic_update<Op, T, 8>(p, ByteOffset8, Src08, Src18, Mask8, props);
    }
    return Res;
  } else {
    // Legacy SVM dword atomic on fully-formed 64-bit addresses.
    detail::check_atomic<Op, T, N, 2>();
    simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
    simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(byte_offset);
    vAddr += offset_i1;
    using Tx = typename detail::__raw_t<T>;
    return __esimd_svm_atomic2<Op, Tx, N>(vAddr.data(), src0.data(),
                                          src1.data(), mask.data());
  }
}
6999 
7004 //
7015 // Other properties are ignored.
/// Convenience overload: two-operand USM atomic update with all lanes
/// enabled (mask defaults to all-ones).
template <atomic_op Op, typename T, int N, typename Toffset,
          typename PropertyListT =
__ESIMD_API std::enable_if_t<
    __ESIMD_DNS::get_num_args<Op>() == 2 &&
        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
    simd<T, N>>
                  simd<T, N> src1, PropertyListT props = {}) {
  simd_mask<N> mask = 1;
  return atomic_update<Op, T, N>(p, byte_offset, src0, src1, mask, props);
}
7031 
7049 // Other properties are ignored.
// Two-operand USM atomic_update taking byte offsets as a simd_view:
// materializes the view with .read() and forwards to the simd overload.
// NOTE(review): extraction gap — the PropertyListT default line is missing;
// code kept byte-identical.
7052 template <atomic_op Op, typename T, int N, typename OffsetSimdViewT,
7053  typename PropertyListT =
7055 __ESIMD_API std::enable_if_t<
7056  __ESIMD_DNS::get_num_args<Op>() == 2 &&
7057  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
7058  detail::is_simd_view_type_v<OffsetSimdViewT>,
7059  simd<T, N>>
7060 atomic_update(T *p, OffsetSimdViewT byte_offset, simd<T, N> src0,
7061  simd<T, N> src1, simd_mask<N> mask, PropertyListT props = {}) {
7062  return atomic_update<Op, T, N>(p, byte_offset.read(), src0, src1, mask,
7063  props);
7064 }
7065 
7081 // Other properties are ignored.
// Two-operand USM atomic_update with simd_view offsets and no mask:
// uses an all-enabled mask and forwards to the masked simd overload.
// NOTE(review): extraction gap — the PropertyListT default line is missing;
// code kept byte-identical.
7084 template <atomic_op Op, typename T, int N, typename OffsetSimdViewT,
7085  typename PropertyListT =
7087 __ESIMD_API std::enable_if_t<
7088  __ESIMD_DNS::get_num_args<Op>() == 2 &&
7089  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
7090  detail::is_simd_view_type_v<OffsetSimdViewT>,
7091  simd<T, N>>
7092 atomic_update(T *p, OffsetSimdViewT byte_offset, simd<T, N> src0,
7093  simd<T, N> src1, PropertyListT props = {}) {
7094  simd_mask<N> mask = 1;
7095  return atomic_update<Op, T, N>(p, byte_offset.read(), src0, src1, mask,
7096  props);
7097 }
7098 
7115 template <atomic_op Op, typename Tx, int N, typename Toffset>
7116 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>, simd<Tx, N>>
7117 atomic_update(Tx *p, Toffset byte_offset, simd<Tx, N> src0, simd<Tx, N> src1,
7118  simd_mask<N> mask) {
7119  return atomic_update<Op, Tx, N>(p, simd<Toffset, N>(byte_offset), src0, src1,
7120  mask);
7121 }
7122 
7139 
7164 // Other properties are ignored.
// Zero-operand atomic update (e.g. inc/dec/load) through a read-write device
// accessor. Stateless mode forwards to the USM-pointer overload; otherwise
// LSC is used when cache hints, non-power-of-2 N, or sub-dword data require
// it, atomic load is emulated via bit_or with 0, and the remaining cases go
// through the legacy dword-atomic dataport intrinsic.
// NOTE(review): extraction gaps — the PropertyListT default line and the
// 'return detail::atomic_update_impl<' line (before 7188) are missing; code
// kept byte-identical.
7168 template <atomic_op Op, typename T, int N, typename Toffset,
7169  typename AccessorTy,
7170  typename PropertyListT =
7172 __ESIMD_API std::enable_if_t<
7173  __ESIMD_DNS::get_num_args<Op>() == 0 &&
7174  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7175  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7176  simd<T, N>>
7177 atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd_mask<N> mask,
7178  PropertyListT props = {}) {
7179 #ifdef __ESIMD_FORCE_STATELESS_MEM
7180  return atomic_update<Op, T, N>(__ESIMD_DNS::accessorToPointer<T>(acc),
7181  byte_offset, mask, props);
7182 #else
7183  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
7184 
7185  if constexpr (detail::has_cache_hints<PropertyListT>() ||
7186  !detail::isPowerOf2(N, 32) || sizeof(T) < 4) {
7188  Op, T, N, detail::lsc_data_size::default_size, PropertyListT>(
7189  acc, byte_offset, mask);
7190  } else {
7191  if constexpr (Op == atomic_op::load) {
7192  if constexpr (std::is_integral_v<T>) {
7193  return atomic_update<atomic_op::bit_or, T, N>(
7194  acc, byte_offset, simd<T, N>(0), mask, props);
7195  } else {
7196  using Tint = detail::uint_type_t<sizeof(T)>;
7197  simd<Tint, N> Res = atomic_update<atomic_op::bit_or, Tint, N>(
7198  acc, byte_offset, simd<Tint, N>(0), mask, props);
7199  return Res.template bit_cast_view<T>();
7200  }
7201  } else {
7202  detail::check_atomic<Op, T, N, 0>();
7203  static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported");
7204 
7205  static_assert(sizeof(T) == 4, "Only 32 bit data is supported");
7206  const auto si = get_surface_index(acc);
7207  using Tx = typename detail::__raw_t<T>;
7208  return __esimd_dword_atomic0<Op, Tx, N>(mask.data(), si,
7209  byte_offset.data());
7210  }
7211  }
7212 #endif
7213 }
7214 
7231 // Other properties are ignored.
// Zero-operand accessor atomic_update without a mask: all lanes enabled.
// NOTE(review): extraction gap — the PropertyListT default line is missing;
// code kept byte-identical.
7235 template <atomic_op Op, typename T, int N, typename Toffset,
7236  typename AccessorTy,
7237  typename PropertyListT =
7239 __ESIMD_API std::enable_if_t<
7240  __ESIMD_DNS::get_num_args<Op>() == 0 &&
7241  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7242  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7243  simd<T, N>>
7244 atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset,
7245  PropertyListT props = {}) {
7246  simd_mask<N> mask = 1;
7247  return atomic_update<Op, T, N>(acc, byte_offset, mask, props);
7248 }
7249 
7270 // Other properties are ignored.
// Zero-operand accessor atomic_update with simd_view offsets and a mask:
// forwards byte_offset.read() to the simd overload.
// NOTE(review): extraction gap — the PropertyListT default line is missing;
// code kept byte-identical.
7274 template <atomic_op Op, typename T, int N, typename OffsetSimdViewT,
7275  typename AccessorTy,
7276  typename PropertyListT =
7278 __ESIMD_API std::enable_if_t<
7279  __ESIMD_DNS::get_num_args<Op>() == 0 &&
7280  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7281  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
7282  detail::is_simd_view_type_v<OffsetSimdViewT>,
7283  simd<T, N>>
7284 atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd_mask<N> mask,
7285  PropertyListT props = {}) {
7286  return atomic_update<Op, T, N>(acc, byte_offset.read(), mask, props);
7287 }
7288 
// Zero-operand accessor atomic_update with simd_view offsets, no mask:
// all lanes enabled.
// NOTE(review): extraction gap — the PropertyListT default line is missing;
// code kept byte-identical.
7308 template <atomic_op Op, typename T, int N, typename OffsetSimdViewT,
7309  typename AccessorTy,
7310  typename PropertyListT =
7312 __ESIMD_API std::enable_if_t<
7313  __ESIMD_DNS::get_num_args<Op>() == 0 &&
7314  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7315  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
7316  detail::is_simd_view_type_v<OffsetSimdViewT>,
7317  simd<T, N>>
7318 atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset,
7319  PropertyListT props = {}) {
7320  simd_mask<N> mask = 1;
7321  return atomic_update<Op, T, N>(acc, byte_offset.read(), mask, props);
7322 }
7323 
7342 template <atomic_op Op, typename T, int N, typename Toffset,
7343  typename AccessorTy>
7344 __ESIMD_API
7345  std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 0 &&
7346  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy>,
7347  simd<T, N>>
7348  atomic_update(AccessorTy acc, Toffset byte_offset, simd_mask<N> mask) {
7349  return atomic_update<Op, T, N>(acc, simd<Toffset, N>(byte_offset), mask);
7350 }
7351 
7370 template <atomic_op Op, typename T, int N, typename AccessorTy>
7371 __ESIMD_API
7372  std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 0 &&
7373  __ESIMD_DNS::is_rw_local_accessor_v<AccessorTy>,
7374  simd<T, N>>
7375  atomic_update(AccessorTy acc, uint32_t byte_offset, simd_mask<N> mask) {
7376  return atomic_update<Op, T, N>(acc, simd<uint32_t, N>(byte_offset), mask);
7377 }
7378 
7400 
7432 
// One-operand atomic update (add/sub/min/max/store/...) through a read-write
// device accessor. Stateless mode forwards to the USM overload; FP ops,
// cache hints, non-power-of-2 N or sub-dword data route to LSC; store is
// emulated via xchg (with an integer bit-cast round-trip for FP types); the
// remaining cases use the legacy dword-atomic dataport intrinsic.
// NOTE(review): extraction gaps — the PropertyListT default line and the
// 'return detail::atomic_update_impl<' line (before 7456) are missing; code
// kept byte-identical.
7433 template <atomic_op Op, typename T, int N, typename Toffset,
7434  typename AccessorTy,
7435  typename PropertyListT =
7437 __ESIMD_API std::enable_if_t<
7438  __ESIMD_DNS::get_num_args<Op>() == 1 &&
7439  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7440  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7441  simd<T, N>>
7442 atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
7443  simd_mask<N> mask, PropertyListT props = {}) {
7444 #ifdef __ESIMD_FORCE_STATELESS_MEM
7445  return atomic_update<Op, T, N>(__ESIMD_DNS::accessorToPointer<T>(acc),
7446  byte_offset, src0, mask, props);
7447 #else
7448  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
7449  static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported");
7450  // Auto-convert FP atomics to LSC version.
7451  if constexpr (detail::has_cache_hints<PropertyListT>() ||
7452  Op == atomic_op::fmin || Op == atomic_op::fmax ||
7453  Op == atomic_op::fadd || Op == atomic_op::fsub ||
7454  !__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) {
7456  Op, T, N, detail::lsc_data_size::default_size, PropertyListT>(
7457  acc, byte_offset, src0, mask);
7458  } else if constexpr (Op == atomic_op::store) {
7459  if constexpr (std::is_integral_v<T>) {
7460  return atomic_update<atomic_op::xchg, T, N>(acc, byte_offset, src0, mask,
7461  props);
7462  } else {
7463  using Tint = detail::uint_type_t<sizeof(T)>;
7464  simd<Tint, N> Res = atomic_update<atomic_op::xchg, Tint, N>(
7465  acc, byte_offset, src0.template bit_cast_view<Tint>(), mask, props);
7466  return Res.template bit_cast_view<T>();
7467  }
7468  } else {
7469  detail::check_atomic<Op, T, N, 1>();
7470  static_assert(sizeof(T) == 4, "Only 32 bit data is supported");
7471  const auto si = __ESIMD_NS::get_surface_index(acc);
7472  using Tx = typename detail::__raw_t<T>;
7473  return __esimd_dword_atomic1<Op, Tx, N>(
7474  mask.data(), si, byte_offset.data(),
7475  sycl::bit_cast<__ESIMD_DNS::vector_type_t<Tx, N>>(src0.data()));
7476  }
7477 #endif
7478 }
7479 
// One-operand accessor atomic_update without a mask: all lanes enabled.
// NOTE(review): extraction gap — the PropertyListT default line is missing;
// code kept byte-identical.
7511 template <atomic_op Op, typename T, int N, typename Toffset,
7512  typename AccessorTy,
7513  typename PropertyListT =
7515 __ESIMD_API std::enable_if_t<
7516  __ESIMD_DNS::get_num_args<Op>() == 1 &&
7517  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7518  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7519  simd<T, N>>
7520 atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
7521  PropertyListT props = {}) {
7522  simd_mask<N> mask = 1;
7523  return atomic_update<Op, T, N>(acc, byte_offset, src0, mask, props);
7524 }
7525 
// One-operand accessor atomic_update with simd_view offsets and a mask.
// NOTE(review): extraction gap — the PropertyListT default line is missing;
// code kept byte-identical.
7558 template <atomic_op Op, typename T, int N, typename OffsetSimdViewT,
7559  typename AccessorTy,
7560  typename PropertyListT =
7562 __ESIMD_API std::enable_if_t<
7563  __ESIMD_DNS::get_num_args<Op>() == 1 &&
7564  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7565  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
7566  detail::is_simd_view_type_v<OffsetSimdViewT>,
7567  simd<T, N>>
7568 atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd<T, N> src0,
7569  simd_mask<N> mask, PropertyListT props = {}) {
7570  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0, mask, props);
7571 }
7572 
// One-operand accessor atomic_update with simd_view offsets, no mask:
// all lanes enabled.
// NOTE(review): extraction gap — the PropertyListT default line is missing;
// code kept byte-identical.
7603 template <atomic_op Op, typename T, int N, typename OffsetSimdViewT,
7604  typename AccessorTy,
7605  typename PropertyListT =
7607 __ESIMD_API std::enable_if_t<
7608  __ESIMD_DNS::get_num_args<Op>() == 1 &&
7609  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7610  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
7611  detail::is_simd_view_type_v<OffsetSimdViewT>,
7612  simd<T, N>>
7613 atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd<T, N> src0,
7614  PropertyListT props = {}) {
7615  simd_mask<N> mask = 1;
7616  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0, mask, props);
7617 }
7618 
7640 template <atomic_op Op, typename T, int N, typename Toffset,
7641  typename AccessorTy>
7642 __ESIMD_API std::enable_if_t<
7643  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7644  ((Op != atomic_op::store && Op != atomic_op::xchg) || N == 1),
7645  simd<T, N>>
7646 atomic_update(AccessorTy acc, Toffset offset, simd<T, N> src0,
7647  simd_mask<N> mask) {
7648  return atomic_update<Op, T, N>(acc, simd<Toffset, N>(offset), src0, mask);
7649 }
7650 
7670 template <atomic_op Op, typename Tx, int N, typename AccessorTy>
7671 __ESIMD_API std::enable_if_t<
7672  __ESIMD_DNS::is_rw_local_accessor_v<AccessorTy> &&
7673  ((Op != atomic_op::store && Op != atomic_op::xchg) || N == 1),
7674  simd<Tx, N>>
7675 atomic_update(AccessorTy acc, uint32_t offset, simd<Tx, N> src0,
7676  simd_mask<N> mask) {
7677  return atomic_update<Op, Tx, N>(acc, simd<uint32_t, N>(offset), src0, mask);
7678 }
7679 
7686 // simd_mask<N> mask,props = {}); // (acc-au2-1)
7702 
7706 // simd_mask<N> mask,props = {}); // (acc-au2-1)
7729 // Other properties are ignored.
// Two-operand atomic update (cmpxchg/fcmpxchg) through a read-write device
// accessor. Stateless mode forwards to the USM overload; cache hints, FP
// compare-exchange, non-power-of-2 N or sub-dword data go through LSC (note
// the src1/src0 swap documented below); the rest uses the legacy
// dword-atomic dataport intrinsic.
// NOTE(review): extraction gaps — the PropertyListT default line and the
// 'return detail::atomic_update_impl<' line (before 7760) are missing; code
// kept byte-identical.
7733 template <atomic_op Op, typename T, int N, typename Toffset,
7734  typename AccessorTy,
7735  typename PropertyListT =
7737 __ESIMD_API std::enable_if_t<
7738  __ESIMD_DNS::get_num_args<Op>() == 2 && std::is_integral_v<Toffset> &&
7739  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7740  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7741  simd<T, N>>
7742 atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
7743  simd<T, N> src1, simd_mask<N> mask, PropertyListT props = {}) {
7744 #ifdef __ESIMD_FORCE_STATELESS_MEM
7745  return atomic_update<Op, T, N>(__ESIMD_DNS::accessorToPointer<T>(acc),
7746  byte_offset, src0, src1, mask, props);
7747 #else
7748  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
7749  static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported");
7750  // Use LSC atomic when cache hints are present, FP atomics is used,
7751  // non-power of two length is used, operation width greater than 32, or the
7752  // data size is less than 4 bytes,
7753  if constexpr (detail::has_cache_hints<PropertyListT>() ||
7754  Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32) ||
7755  sizeof(T) < 4) {
7756  // 2-argument lsc_atomic_update arguments order matches the standard one -
7757  // expected value first, then new value. But atomic_update uses reverse
7758  // order, hence the src1/src0 swap.
7760  Op, T, N, detail::lsc_data_size::default_size, PropertyListT>(
7761  acc, byte_offset, src1, src0, mask);
7762  } else {
7763  detail::check_atomic<Op, T, N, 2>();
7764  static_assert(sizeof(T) == 4, "Only 32 bit data is supported");
7765  const auto si = __ESIMD_NS::get_surface_index(acc);
7766  using Tx = typename detail::__raw_t<T>;
7767  return __esimd_dword_atomic2<Op, Tx, N>(
7768  mask.data(), si, byte_offset.data(),
7769  sycl::bit_cast<__ESIMD_DNS::vector_type_t<Tx, N>>(src0.data()),
7770  sycl::bit_cast<__ESIMD_DNS::vector_type_t<Tx, N>>(src1.data()));
7771  }
7772 #endif
7773 }
7774 
7792 // Other properties are ignored.
// Two-operand accessor atomic_update without a mask: all lanes enabled.
// NOTE(review): extraction gap — the PropertyListT default line is missing;
// code kept byte-identical.
7796 template <atomic_op Op, typename T, int N, typename Toffset,
7797  typename AccessorTy,
7798  typename PropertyListT =
7800 __ESIMD_API std::enable_if_t<
7801  __ESIMD_DNS::get_num_args<Op>() == 2 &&
7802  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7803  ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7804  simd<T, N>>
7805 atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
7806  simd<T, N> src1, PropertyListT props = {}) {
7807  simd_mask<N> mask = 1;
7808  return atomic_update<Op, T, N>(acc, byte_offset, src0, src1, mask, props);
7809 }
7810 
7831 // Other properties are ignored.
// Two-operand accessor atomic_update with simd_view offsets and a mask.
// NOTE(review): extraction gap — the PropertyListT default line is missing;
// code kept byte-identical.
7834 template <atomic_op Op, typename T, int N, typename OffsetSimdViewT,
7835  typename AccessorTy,
7836  typename PropertyListT =
7838 __ESIMD_API std::enable_if_t<
7839  __ESIMD_DNS::get_num_args<Op>() == 2 &&
7840  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7841  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
7842  detail::is_simd_view_type_v<OffsetSimdViewT>,
7843  simd<T, N>>
7844 atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd<T, N> src0,
7845  simd<T, N> src1, simd_mask<N> mask, PropertyListT props = {}) {
7846  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0, src1, mask,
7847  props);
7848 }
7849 
7868 // Other properties are ignored.
// Two-operand accessor atomic_update with simd_view offsets, no mask:
// all lanes enabled.
// NOTE(review): extraction gap — the PropertyListT default line is missing;
// code kept byte-identical.
7871 template <atomic_op Op, typename T, int N, typename OffsetSimdViewT,
7872  typename AccessorTy,
7873  typename PropertyListT =
7875 __ESIMD_API std::enable_if_t<
7876  __ESIMD_DNS::get_num_args<Op>() == 2 &&
7877  __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7878  ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
7879  detail::is_simd_view_type_v<OffsetSimdViewT>,
7880  simd<T, N>>
7881 atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd<T, N> src0,
7882  simd<T, N> src1, PropertyListT props = {}) {
7883  simd_mask<N> mask = 1;
7884  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0, src1, mask,
7885  props);
7886 }
7887 
7908 template <atomic_op Op, typename Tx, int N, typename Toffset,
7909  typename AccessorTy>
7910 __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_rw_device_accessor_v<AccessorTy>,
7911  simd<Tx, N>>
7912 atomic_update(AccessorTy acc, Toffset offset, simd<Tx, N> src0,
7913  simd<Tx, N> src1, simd_mask<N> mask) {
7914  return atomic_update<Op, Tx, N>(acc, simd<Toffset, N>(offset), src0, src1,
7915  mask);
7916 }
7917 
7935 template <atomic_op Op, typename Tx, int N, typename AccessorTy>
7936 __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_rw_local_accessor_v<AccessorTy>,
7937  simd<Tx, N>>
7938 atomic_update(AccessorTy acc, uint32_t offset, simd<Tx, N> src0,
7939  simd<Tx, N> src1, simd_mask<N> mask) {
7940  return atomic_update<Op, Tx, N>(acc, simd<uint32_t, N>(offset), src0, src1,
7941  mask);
7942 }
7943 
7945 
7948 
// Bit-mask of flush/ordering controls accepted by fence<cntl>(). The l3_*
// names are deprecated aliases of the corresponding l2_* enumerators (the
// hardware cache they affect is L2); sw_barrier is reserved and ignored.
// NOTE(review): extraction gap — the non-deprecated enumerators (the l2_*
// values referenced here) are missing from this copy; code kept
// byte-identical.
7951 enum fence_mask : uint8_t {
7956  l3_flush_instructions __SYCL_DEPRECATED(
7957  "it means L2 here, use l2_flush_instructions") = l2_flush_instructions,
7960  l3_flush_texture_data __SYCL_DEPRECATED(
7961  "it means L2 here, use l2_flush_texture_data") = l2_flush_texture_data,
7964  l3_flush_constant_data __SYCL_DEPRECATED(
7965  "it means L2 here, use l2_flush_constant_data") = l2_flush_constant_data,
7968  l3_flush_rw_data __SYCL_DEPRECATED("it means L2 here, use l2_flush_rw_data") =
7977  sw_barrier __SYCL_DEPRECATED("reserved - this enum is ignored") = 0x80
7978 };
7979 
7983 template <uint8_t cntl> __ESIMD_API void fence() { __esimd_fence(cntl); }
7984 
7985 __SYCL_DEPRECATED("use fence<fence_mask>()")
7986 __ESIMD_API void fence(fence_mask cntl) { __esimd_fence(cntl); }
7987 
// LSC-based fence parameterized by memory kind, flush op and scope; lowers
// to __esimd_lsc_fence with an all-enabled 16-lane mask. SLM fences only
// accept the 'none' flush op and 'group' scope (enforced by static_assert).
// NOTE(review): extraction gap — the defaulted FenceOp/Scope template
// parameters (lines between 7994 and 7997) are missing; code kept
// byte-identical.
7994 template <memory_kind Kind = memory_kind::global,
7997 __ESIMD_API void fence() {
7998  static_assert(
7999  Kind != memory_kind::local ||
8000  (FenceOp == fence_flush_op::none && Scope == fence_scope::group),
8001  "SLM fence must have 'none' lsc_fence_op and 'group' scope");
8002  constexpr int N = 16;
8003  simd_mask<N> Mask = 1;
8004  __esimd_lsc_fence<static_cast<uint8_t>(Kind), static_cast<uint8_t>(FenceOp),
8005  static_cast<uint8_t>(Scope), N>(Mask.data());
8006 }
8007 
// Work-group barrier for ESIMD kernels.
// NOTE(review): extraction gap — line 8017 (between the brace and
// __esimd_barrier(), presumably a preceding memory fence — confirm against
// the canonical source) is missing; code kept byte-identical.
8016 __ESIMD_API void barrier() {
8018  __esimd_barrier();
8019 }
8021 
8024 
8037 template <typename T, int m, int N, typename AccessorTy, unsigned plane = 0>
8038 __ESIMD_API simd<T, m * N> media_block_load(AccessorTy acc, unsigned x,
8039  unsigned y) {
8040  constexpr unsigned Width = N * sizeof(T);
8041  static_assert(Width * m <= 256u,
8042  "data does not fit into a single dataport transaction");
8043  static_assert(Width <= 64u, "valid block width is in range [1, 64]");
8044  static_assert(m <= 64u, "valid block height is in range [1, 64]");
8045  static_assert(plane <= 3u, "valid plane index is in range [0, 3]");
8046 
8047  const auto si = __ESIMD_NS::get_surface_index(acc);
8048  using SurfIndTy = decltype(si);
8049  constexpr unsigned int RoundedWidth =
8050  Width < 4 ? 4 : detail::getNextPowerOf2<Width>();
8051  constexpr int BlockWidth = sizeof(T) * N;
8052  constexpr int Mod = 0;
8053 
8054  if constexpr (Width < RoundedWidth) {
8055  constexpr unsigned int n1 = RoundedWidth / sizeof(T);
8056  simd<T, m * n1> temp =
8057  __esimd_media_ld<T, m, n1, Mod, SurfIndTy, (int)plane, BlockWidth>(
8058  si, x, y);
8059  return temp.template select<m, 1, N, 1>(0, 0);
8060  } else {
8061  return __esimd_media_ld<T, m, N, Mod, SurfIndTy, (int)plane, BlockWidth>(
8062  si, x, y);
8063  }
8064 }
8065 
8078 template <typename T, int m, int N, typename AccessorTy, unsigned plane = 0>
8079 __ESIMD_API void media_block_store(AccessorTy acc, unsigned x, unsigned y,
8080  simd<T, m * N> vals) {
8081  constexpr unsigned Width = N * sizeof(T);
8082  static_assert(Width * m <= 256u,
8083  "data does not fit into a single dataport transaction");
8084  static_assert(Width <= 64u, "valid block width is in range [1, 64]");
8085  static_assert(m <= 64u, "valid block height is in range [1, 64]");
8086  static_assert(plane <= 3u, "valid plane index is in range [0, 3]");
8087  const auto si = __ESIMD_NS::get_surface_index(acc);
8088  using SurfIndTy = decltype(si);
8089  constexpr unsigned int RoundedWidth =
8090  Width < 4 ? 4 : detail::getNextPowerOf2<Width>();
8091  constexpr unsigned int n1 = RoundedWidth / sizeof(T);
8092  constexpr int BlockWidth = sizeof(T) * N;
8093  constexpr int Mod = 0;
8094 
8095  if constexpr (Width < RoundedWidth) {
8096  simd<T, m * n1> temp;
8097  auto temp_ref = temp.template bit_cast_view<T, m, n1>();
8098  auto vals_ref = vals.template bit_cast_view<T, m, N>();
8099  temp_ref.template select<m, 1, N, 1>() = vals_ref;
8100  __esimd_media_st<T, m, n1, Mod, SurfIndTy, plane, BlockWidth>(si, x, y,
8101  temp.data());
8102  } else {
8103  __esimd_media_st<T, m, N, Mod, SurfIndTy, plane, BlockWidth>(si, x, y,
8104  vals.data());
8105  }
8106 }
8107 
// Loads a contiguous block of N elements from SLM backed by a readable
// local accessor: translates the accessor to an SLM byte offset and
// delegates to slm_block_load.
// NOTE(review): extraction gap — line 8127 (the Flags template parameter
// declaration / its default) is missing; code kept byte-identical.
8126 template <typename T, int N, typename AccessorTy,
8128 __ESIMD_API
8129  std::enable_if_t<detail::is_local_accessor_with_v<
8130  AccessorTy, detail::accessor_mode_cap::can_read> &&
8131  is_simd_flag_type_v<Flags>,
8132  simd<T, N>>
8133  block_load(AccessorTy acc, uint32_t byte_offset, Flags flags) {
8134  return slm_block_load<T, N>(byte_offset + detail::localAccessorToOffset(acc),
8135  flags);
8136 }
8137 
8155 template <typename T, int N, typename AccessorT, typename Flags>
8156 __ESIMD_API
8157  std::enable_if_t<detail::is_local_accessor_with_v<
8158  AccessorT, detail::accessor_mode_cap::can_write> &&
8159  is_simd_flag_type_v<Flags>>
8160  block_store(AccessorT acc, uint32_t offset, simd<T, N> vals, Flags flags) {
8161  slm_block_store<T, N>(offset + __ESIMD_DNS::localAccessorToOffset(acc), vals,
8162  flags);
8163 }
8164 
8195 // typename PropertyListT = empty_properties_t>
8204 
// Masked SLM gather (local accessor) with VS elements per offset and a
// pass_thru vector supplying results for disabled lanes.
// NOTE(review): extraction gap — the PropertyListT default line is missing;
// code kept byte-identical.
8236 template <typename T, int N, int VS, typename AccessorT,
8237  typename PropertyListT =
8239 __ESIMD_API std::enable_if_t<
8240  (detail::is_local_accessor_with_v<AccessorT,
8241  detail::accessor_mode_cap::can_read> &&
8242  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
8243  simd<T, N>>
8244 gather(AccessorT acc, simd<uint32_t, N / VS> byte_offsets,
8245  simd_mask<N / VS> mask, simd<T, N> pass_thru, PropertyListT props = {}) {
8246  return slm_gather<T, N, VS>(byte_offsets +
8247  __ESIMD_DNS::localAccessorToOffset(acc),
8248  mask, pass_thru, props);
8249 }
8250 
// Masked SLM gather (local accessor), VS elements per offset, no pass_thru:
// disabled lanes' results are unspecified by this overload.
// NOTE(review): extraction gap — the PropertyListT default line is missing;
// code kept byte-identical.
8281 template <typename T, int N, int VS, typename AccessorT,
8282  typename PropertyListT =
8284 __ESIMD_API std::enable_if_t<
8285  (detail::is_local_accessor_with_v<AccessorT,
8286  detail::accessor_mode_cap::can_read> &&
8287  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
8288  simd<T, N>>
8289 gather(AccessorT acc, simd<uint32_t, N / VS> byte_offsets,
8290  simd_mask<N / VS> mask, PropertyListT props = {}) {
8291  return slm_gather<T, N, VS>(
8292  byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), mask, props);
8293 }
8294 
// Unmasked SLM gather (local accessor) with VS elements per offset.
// NOTE(review): extraction gap — the PropertyListT default line is missing;
// code kept byte-identical.
8319 template <typename T, int N, int VS, typename AccessorT,
8320  typename PropertyListT =
8322 __ESIMD_API std::enable_if_t<
8323  (detail::is_local_accessor_with_v<AccessorT,
8324  detail::accessor_mode_cap::can_read> &&
8325  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
8326  simd<T, N>>
8327 gather(AccessorT acc, simd<uint32_t, N / VS> byte_offsets,
8328  PropertyListT props = {}) {
8329  return slm_gather<T, N, VS>(
8330  byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), props);
8331 }
8332 
8342 // Dev note: the mask type was turned into template parameter `MaskT` to
8343 // avoid the conflicts of this prototype with the old gather() function
8344 // accepting a 'global_offset' parameter and avoid 'ambiguous call' errors
8345 // for calls like this: gather(acc, byte_offsets_simd, 0, mask);
// VS == 1 convenience overload with pass_thru; MaskT is a template parameter
// only to avoid ambiguity with the legacy global_offset gather (see the dev
// note above this function in the original source).
// NOTE(review): extraction gap — the PropertyListT default line is missing;
// code kept byte-identical.
8346 template <typename T, int N, typename AccessorT, typename MaskT,
8347  typename PropertyListT =
8349 __ESIMD_API std::enable_if_t<
8350  (detail::is_local_accessor_with_v<AccessorT,
8351  detail::accessor_mode_cap::can_read> &&
8352  std::is_same_v<MaskT, simd_mask<N>> &&
8353  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
8354  simd<T, N>>
8355 gather(AccessorT acc, simd<uint32_t, N> byte_offsets, MaskT mask,
8356  simd<T, N> pass_thru, PropertyListT props = {}) {
8357  return slm_gather<T, N>(byte_offsets +
8358  __ESIMD_DNS::localAccessorToOffset(acc),
8359  mask, pass_thru, props);
8360 }
8361 
8369 // Dev note: the mask type was turned into template parameter `MaskT` to
8370 // avoid the conflicts of this prototype with the old gather() function
8371 // accepting a 'global_offset' parameter and avoid 'ambiguous call' errors
8372 // for calls like this: gather(acc, byte_offsets_simd, 0);
// VS == 1 convenience overload without pass_thru; MaskT templated to dodge
// ambiguity with the legacy global_offset gather overload.
// NOTE(review): extraction gap — the PropertyListT default line is missing;
// code kept byte-identical.
8373 template <typename T, int N, typename AccessorT, typename MaskT,
8374  typename PropertyListT =
8376 __ESIMD_API std::enable_if_t<
8377  (detail::is_local_accessor_with_v<AccessorT,
8378  detail::accessor_mode_cap::can_read> &&
8379  std::is_same_v<MaskT, simd_mask<N>> &&
8380  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
8381  simd<T, N>>
8382 gather(AccessorT acc, simd<uint32_t, N> byte_offsets, MaskT mask,
8383  PropertyListT props = {}) {
8384  return slm_gather<T, N>(
8385  byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), mask, props);
8386 }
8387 
// VS == 1 unmasked SLM gather (local accessor).
// NOTE(review): extraction gap — the PropertyListT default line is missing;
// code kept byte-identical.
8395 template <typename T, int N, typename AccessorT,
8396  typename PropertyListT =
8398 __ESIMD_API std::enable_if_t<
8399  (detail::is_local_accessor_with_v<AccessorT,
8400  detail::accessor_mode_cap::can_read> &&
8401  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
8402  simd<T, N>>
8403 gather(AccessorT acc, simd<uint32_t, N> byte_offsets,
8404  PropertyListT props = {}) {
8405  return slm_gather<T, N>(
8406  byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), props);
8407 }
8408 
8411 // typename PropertyListT = empty_properties_t>
// simd_view-offset gather with mask and pass_thru: reads the view and
// forwards to the simd-offset overload.
// NOTE(review): extraction gap — the PropertyListT default line is missing;
// code kept byte-identical.
8417 template <typename T, int N, int VS = 1, typename AccessorT,
8418  typename OffsetSimdViewT,
8419  typename PropertyListT =
8421 __ESIMD_API std::enable_if_t<
8422  (detail::is_local_accessor_with_v<AccessorT,
8423  detail::accessor_mode_cap::can_read> &&
8424  detail::is_simd_view_type_v<OffsetSimdViewT> &&
8425  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
8426  simd<T, N>>
8427 gather(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
8428  simd<T, N> pass_thru, PropertyListT props = {}) {
8429  return gather<T, N, VS>(acc, byte_offsets.read(), mask, pass_thru, props);
8430 }
8431 
8434 // typename PropertyListT = empty_properties_t>
// simd_view-offset gather with mask, no pass_thru.
// NOTE(review): extraction gap — the PropertyListT default line is missing;
// code kept byte-identical.
8440 template <typename T, int N, int VS = 1, typename AccessorT,
8441  typename OffsetSimdViewT,
8442  typename PropertyListT =
8444 __ESIMD_API std::enable_if_t<
8445  (detail::is_local_accessor_with_v<AccessorT,
8446  detail::accessor_mode_cap::can_read> &&
8447  detail::is_simd_view_type_v<OffsetSimdViewT> &&
8448  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
8449  simd<T, N>>
8450 gather(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
8451  PropertyListT props = {}) {
8452  return gather<T, N, VS>(acc, byte_offsets.read(), mask, props);
8453 }
8454 
8457 // typename PropertyListT = empty_properties_t>
// simd_view-offset gather, unmasked.
// NOTE(review): extraction gap — the PropertyListT default line is missing;
// code kept byte-identical.
8462 template <typename T, int N, int VS = 1, typename AccessorT,
8463  typename OffsetSimdViewT,
8464  typename PropertyListT =
8466 __ESIMD_API std::enable_if_t<
8467  (detail::is_local_accessor_with_v<AccessorT,
8468  detail::accessor_mode_cap::can_read> &&
8469  detail::is_simd_view_type_v<OffsetSimdViewT> &&
8470  ext::oneapi::experimental::is_property_list_v<PropertyListT>),
8471  simd<T, N>>
8472 gather(AccessorT acc, OffsetSimdViewT byte_offsets, PropertyListT props = {}) {
8473  return gather<T, N, VS>(acc, byte_offsets.read(), props);
8474 }
8475 
8493 template <typename T, int N, typename AccessorTy>
8494 __ESIMD_API
8495  std::enable_if_t<detail::is_local_accessor_with_v<
8496  AccessorTy, detail::accessor_mode_cap::can_read>,
8497  simd<T, N>>
8498  gather(AccessorTy acc, simd<uint32_t, N> offsets, uint32_t glob_offset,
8499  simd_mask<N> mask = 1) {
8500  return slm_gather<T, N>(
8501  offsets + glob_offset + __ESIMD_DNS::localAccessorToOffset(acc), mask);
8502 }
8503 
8512 
8519 
8522 
8531 
8539 
// Masked SLM scatter (local accessor) with VS elements per offset.
// NOTE(review): extraction gap — the PropertyListT default line is missing;
// code kept byte-identical.
8566 template <typename T, int N, int VS = 1, typename AccessorT,
8567  typename PropertyListT =
8569 __ESIMD_API std::enable_if_t<
8570  detail::is_local_accessor_with_v<AccessorT,
8571  detail::accessor_mode_cap::can_write> &&
8572  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8573 scatter(AccessorT acc, simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
8574  simd_mask<N / VS> mask, PropertyListT props = {}) {
8575  slm_scatter<T, N, VS>(byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc),
8576  vals, mask, props);
8577 }
8578 
// Unmasked SLM scatter (local accessor): forwards with an all-enabled mask.
// NOTE(review): extraction gap — the PropertyListT default line is missing;
// code kept byte-identical.
8601 template <typename T, int N, int VS = 1, typename AccessorT,
8602  typename PropertyListT =
8604 __ESIMD_API std::enable_if_t<
8605  detail::is_local_accessor_with_v<AccessorT,
8606  detail::accessor_mode_cap::can_write> &&
8607  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8608 scatter(AccessorT acc, simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
8609  PropertyListT props = {}) {
8610  simd_mask<N / VS> Mask = 1;
8611  scatter<T, N, VS>(acc, byte_offsets, vals, Mask, props);
8612 }
8613 
// simd_view-offset SLM scatter with a mask: reads the view and forwards.
// NOTE(review): extraction gap — the PropertyListT default line is missing;
// code kept byte-identical.
8641 template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
8642  typename AccessorT,
8643  typename PropertyListT =
8645 __ESIMD_API std::enable_if_t<
8646  detail::is_local_accessor_with_v<AccessorT,
8647  detail::accessor_mode_cap::can_write> &&
8648  detail::is_simd_view_type_v<OffsetSimdViewT> &&
8649  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8650 scatter(AccessorT acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
8651  simd_mask<N / VS> mask, PropertyListT props = {}) {
8652  scatter<T, N, VS>(acc, byte_offsets.read(), vals, mask, props);
8653 }
8654 
// simd_view-offset SLM scatter, unmasked: all lanes enabled.
// NOTE(review): extraction gap — the PropertyListT default line is missing;
// code kept byte-identical.
8679 template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
8680  typename AccessorT,
8681  typename PropertyListT =
8683 __ESIMD_API std::enable_if_t<
8684  detail::is_local_accessor_with_v<AccessorT,
8685  detail::accessor_mode_cap::can_write> &&
8686  detail::is_simd_view_type_v<OffsetSimdViewT> &&
8687  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8688 scatter(AccessorT acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
8689  PropertyListT props = {}) {
8690  simd_mask<N / VS> Mask = 1;
8691  scatter<T, N, VS>(acc, byte_offsets.read(), vals, Mask, props);
8692 }
8693 
8712 template <typename T, int N, typename AccessorTy>
8713 __ESIMD_API std::enable_if_t<detail::is_local_accessor_with_v<
8714  AccessorTy, detail::accessor_mode_cap::can_write>>
8715 scatter(AccessorTy acc, simd<uint32_t, N> offsets, simd<T, N> vals,
8716  uint32_t glob_offset, simd_mask<N> mask = 1) {
8717  slm_scatter<T, N>(offsets + glob_offset +
8718  __ESIMD_DNS::localAccessorToOffset(acc),
8719  vals, mask);
8720 }
8721 
8763 
// Masked USM prefetch with VS elements per byte offset.
// NOTE(review): extraction gaps — the PropertyListT default line and the
// 'detail::prefetch_impl<...' call line (before 8794) are missing; code
// kept byte-identical.
8785 template <typename T, int N, int VS, typename OffsetT,
8786  typename PropertyListT =
8788 __ESIMD_API std::enable_if_t<
8789  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8790 prefetch(const T *p, simd<OffsetT, N / VS> byte_offsets, simd_mask<N / VS> mask,
8791  PropertyListT props = {}) {
8792  static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS");
8794  PropertyListT>(p, byte_offsets, mask);
8795 }
8796 
// Unmasked USM prefetch with VS elements per byte offset: all lanes enabled.
// NOTE(review): extraction gap — the PropertyListT default line is missing;
// code kept byte-identical.
8813 template <typename T, int N, int VS, typename OffsetT,
8814  typename PropertyListT =
8816 __ESIMD_API std::enable_if_t<
8817  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8818 prefetch(const T *p, simd<OffsetT, N / VS> byte_offsets,
8819  PropertyListT props = {}) {
8820  simd_mask<N / VS> Mask = 1;
8821  prefetch<T, N, VS>(p, byte_offsets, Mask, props);
8822 }
8823 
// Masked USM prefetch, fixed VS == 1 (one element per offset).
// NOTE(review): extraction gap — the PropertyListT default line is missing;
// code kept byte-identical.
8843 template <typename T, int N, typename OffsetT,
8844  typename PropertyListT =
8846 __ESIMD_API std::enable_if_t<
8847  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8848 prefetch(const T *p, simd<OffsetT, N> byte_offsets, simd_mask<N> mask,
8849  PropertyListT props = {}) {
8850  constexpr int VS = 1;
8851  prefetch<T, N, VS>(p, byte_offsets, mask, props);
8852 }
8853 
// Unmasked USM prefetch, fixed VS == 1.
// NOTE(review): extraction gap — the PropertyListT default line is missing;
// code kept byte-identical.
8868 template <typename T, int N, typename OffsetT,
8869  typename PropertyListT =
8871 __ESIMD_API std::enable_if_t<
8872  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8873 prefetch(const T *p, simd<OffsetT, N> byte_offsets, PropertyListT props = {}) {
8874  constexpr int VS = 1;
8875  prefetch<T, N, VS>(p, byte_offsets, props);
8876 }
8877 
// simd_view-offset USM prefetch with a mask: reads the view and forwards.
// NOTE(review): extraction gap — the PropertyListT default line is missing;
// code kept byte-identical.
8898 template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
8899  typename PropertyListT =
8901 __ESIMD_API std::enable_if_t<
8902  detail::is_simd_view_type_v<OffsetSimdViewT> &&
8903  ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8904 prefetch(const T *p, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
8905  PropertyListT props = {}) {
8906  prefetch<T, N, VS>(p, byte_offsets.read(), mask, props);
8907 }
8908 
/// Overload taking the byte offsets as a simd_view, all lanes enabled.
template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
          typename PropertyListT =
__ESIMD_API std::enable_if_t<
    detail::is_simd_view_type_v<OffsetSimdViewT> &&
    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
prefetch(const T *p, OffsetSimdViewT byte_offsets, PropertyListT props = {}) {
  prefetch<T, N, VS>(p, byte_offsets.read(), props);
}
8935 
8953 
/// Scalar-offset overload: prefetches VS elements located at
/// p + byte_offset when the single-lane \p mask is set.
template <typename T, int VS = 1, typename OffsetT,
          typename PropertyListT =
__ESIMD_API std::enable_if_t<
    std::is_integral_v<OffsetT> &&
    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
prefetch(const T *p, OffsetT byte_offset, simd_mask<1> mask,
         PropertyListT props = {}) {
  // NOTE(review): the callee name on the next line was elided by extraction
  // (orig. line 8970); presumably detail::prefetch_impl — confirm against the
  // full header.
      PropertyListT>(p, byte_offset, mask);
}
8973 
/// Scalar-offset overload with the single lane unconditionally enabled.
template <typename T, int VS = 1, typename OffsetT,
          typename PropertyListT =
__ESIMD_API std::enable_if_t<
    std::is_integral_v<OffsetT> &&
    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
prefetch(const T *p, OffsetT byte_offset, PropertyListT props = {}) {
  simd_mask<1> Mask = 1; // enable the single lane
  prefetch<T, VS>(p, byte_offset, Mask, props);
}
8999 
/// Prefetches VS elements starting at \p p itself (byte offset 0) when the
/// single-lane \p mask is set.
template <typename T, int VS = 1,
          typename PropertyListT =
__ESIMD_API std::enable_if_t<
    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
prefetch(const T *p, simd_mask<1> mask, PropertyListT props = {}) {
  prefetch<T, VS>(p, 0, mask, props);
}
9024 
/// Simplest USM overload: prefetches VS elements starting at \p p,
/// unconditionally.
template <typename T, int VS = 1,
          typename PropertyListT =
__ESIMD_API std::enable_if_t<
    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
prefetch(const T *p, PropertyListT props = {}) {
  simd_mask<1> Mask = 1; // enable the single lane
  prefetch<T, VS>(p, 0, Mask, props);
}
9046 
9088 
/// Device-accessor overload: prefetches VS elements per each of the N/VS
/// \p byte_offsets relative to accessor \p acc, under \p mask.
template <typename T, int N, int VS, typename AccessorT, typename OffsetT,
          typename PropertyListT =
__ESIMD_API std::enable_if_t<
    detail::is_device_accessor_with_v<AccessorT,
                                      detail::accessor_mode_cap::can_read> &&
    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
prefetch(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
         simd_mask<N / VS> mask, PropertyListT props = {}) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  // Stateless mode: convert the accessor to a USM pointer and reuse the
  // pointer-based overload.
  prefetch<T, N, VS>(detail::accessorToPointer<T>(acc), byte_offsets, mask,
                     props);
#else
  static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS");
  // NOTE(review): the callee name on the next line was elided by extraction
  // (orig. line 9125); presumably detail::prefetch_impl — confirm against the
  // full header.
      PropertyListT>(acc, byte_offsets, mask);
#endif // __ESIMD_FORCE_STATELESS_MEM
}
9129 
/// Device-accessor overload with all lanes enabled; delegates to the masked
/// accessor overload.
template <typename T, int N, int VS, typename AccessorT, typename OffsetT,
          typename PropertyListT =
__ESIMD_API std::enable_if_t<
    detail::is_device_accessor_with_v<AccessorT,
                                      detail::accessor_mode_cap::can_read> &&
    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
prefetch(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
         PropertyListT props = {}) {
  simd_mask<N / VS> Mask = 1; // all-ones mask: prefetch every lane
  prefetch<T, N, VS>(acc, byte_offsets, Mask, props);
}
9159 
/// Device-accessor overload with one element per offset (VS = 1), masked.
template <typename T, int N, typename AccessorT, typename OffsetT,
          typename PropertyListT =
__ESIMD_API std::enable_if_t<
    detail::is_device_accessor_with_v<AccessorT,
                                      detail::accessor_mode_cap::can_read> &&
    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
prefetch(AccessorT acc, simd<OffsetT, N> byte_offsets, simd_mask<N> mask,
         PropertyListT props = {}) {
  constexpr int VS = 1; // scalar (non-vectorized) access per offset
  prefetch<T, N, VS>(acc, byte_offsets, mask, props);
}
9192 
/// Device-accessor overload with one element per offset (VS = 1), all lanes
/// enabled.
template <typename T, int N, typename AccessorT, typename OffsetT,
          typename PropertyListT =
__ESIMD_API std::enable_if_t<
    detail::is_device_accessor_with_v<AccessorT,
                                      detail::accessor_mode_cap::can_read> &&
    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
prefetch(AccessorT acc, simd<OffsetT, N> byte_offsets,
         PropertyListT props = {}) {
  constexpr int VS = 1; // scalar (non-vectorized) access per offset
  prefetch<T, N, VS>(acc, byte_offsets, props);
}
9220 
/// Device-accessor overload taking byte offsets as a simd_view, masked.
template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
          typename AccessorT,
          typename PropertyListT =
__ESIMD_API std::enable_if_t<
    detail::is_device_accessor_with_v<AccessorT,
                                      detail::accessor_mode_cap::can_read> &&
    detail::is_simd_view_type_v<OffsetSimdViewT> &&
    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
prefetch(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
         PropertyListT props = {}) {
  prefetch<T, N, VS>(acc, byte_offsets.read(), mask, props);
}
9255 
/// Device-accessor overload taking byte offsets as a simd_view, all lanes
/// enabled.
template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
          typename AccessorT,
          typename PropertyListT =
__ESIMD_API std::enable_if_t<
    detail::is_device_accessor_with_v<AccessorT,
                                      detail::accessor_mode_cap::can_read> &&
    detail::is_simd_view_type_v<OffsetSimdViewT> &&
    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
prefetch(AccessorT acc, OffsetSimdViewT byte_offsets,
         PropertyListT props = {}) {
  prefetch<T, N, VS>(acc, byte_offsets.read(), props);
}
9287 
/// Device-accessor scalar-offset overload: prefetches VS elements at the
/// given byte offset when the single-lane \p mask is set.
template <typename T, int VS = 1, typename AccessorT, typename OffsetT,
          typename PropertyListT =
__ESIMD_API std::enable_if_t<
    std::is_integral_v<OffsetT> &&
    detail::is_device_accessor_with_v<AccessorT,
                                      detail::accessor_mode_cap::can_read> &&
    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
prefetch(AccessorT acc, OffsetT byte_offset, simd_mask<1> mask,
         PropertyListT props = {}) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  // Stateless mode: convert the accessor to a USM pointer and reuse the
  // pointer-based overload.
  prefetch<T, VS>(detail::accessorToPointer<T>(acc), byte_offset, mask, props);
#else
  // NOTE(review): the callee name on the next line was elided by extraction
  // (orig. line 9326); presumably detail::prefetch_impl — confirm against the
  // full header.
      PropertyListT>(acc, byte_offset, mask);
#endif // __ESIMD_FORCE_STATELESS_MEM
}
9330 
/// Device-accessor scalar-offset overload with the single lane enabled.
template <typename T, int VS = 1, typename AccessorT, typename OffsetT,
          typename PropertyListT =
__ESIMD_API std::enable_if_t<
    std::is_integral_v<OffsetT> &&
    detail::is_device_accessor_with_v<AccessorT,
                                      detail::accessor_mode_cap::can_read> &&
    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
prefetch(AccessorT acc, OffsetT byte_offset, PropertyListT props = {}) {
  simd_mask<1> Mask = 1; // enable the single lane
  prefetch<T, VS>(acc, byte_offset, Mask, props);
}
9358 
/// Device-accessor overload prefetching VS elements at byte offset 0 when
/// the single-lane \p mask is set.
template <typename T, int VS = 1, typename AccessorT,
          typename PropertyListT =
__ESIMD_API std::enable_if_t<
    detail::is_device_accessor_with_v<AccessorT,
                                      detail::accessor_mode_cap::can_read> &&
    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
prefetch(AccessorT acc, simd_mask<1> mask, PropertyListT props = {}) {
  prefetch<T, VS>(acc, 0, mask, props);
}
9385 
/// Simplest device-accessor overload: prefetches VS elements at byte
/// offset 0, unconditionally.
template <typename T, int VS = 1, typename AccessorT,
          typename PropertyListT =
__ESIMD_API std::enable_if_t<
    detail::is_device_accessor_with_v<AccessorT,
                                      detail::accessor_mode_cap::can_read> &&
    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
prefetch(AccessorT acc, PropertyListT props = {}) {
  simd_mask<1> Mask = 1; // enable the single lane
  prefetch<T, VS>(acc, 0, Mask, props);
}
9409 
/// Loads a 2D block (BlockWidth x BlockHeight elements, NBlocks blocks) from
/// the 2D surface described by Ptr/SurfaceWidth/SurfaceHeight/SurfacePitch at
/// position (X, Y), optionally transposed and/or transformed; delegates to
/// detail::load_2d_impl.
template <typename T, int BlockWidth, int BlockHeight = 1, int NBlocks = 1,
          bool Transposed = false, bool Transformed = false,
// NOTE(review): the template-parameter line computing `N` (presumably
// `int N = detail::get_lsc_block_2d_data_size<...>`) was elided by extraction
// (orig. line 9452) — confirm against the full header.
              T, NBlocks, BlockHeight, BlockWidth, Transposed, Transformed>(),
          typename PropertyListT = oneapi::experimental::empty_properties_t>
__ESIMD_API std::enable_if_t<
    ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight,
        unsigned SurfacePitch, int X, int Y, PropertyListT props = {}) {
  return detail::load_2d_impl<T, BlockWidth, BlockHeight, NBlocks, Transposed,
                              Transformed, PropertyListT>(
      Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
}
9463 
/// Prefetches a 2D block (BlockWidth x BlockHeight elements, NBlocks blocks)
/// of the 2D surface at position (X, Y); delegates to
/// detail::prefetch_2d_impl. Transpose/transform are not supported here.
template <typename T, int BlockWidth, int BlockHeight = 1, int NBlocks = 1,
// NOTE(review): the template-parameter line computing the block size constant
// was elided by extraction (orig. line 9494) — confirm against the full
// header.
              T, NBlocks, BlockHeight, BlockWidth, false /*Transposed*/,
              false /*Transformed*/>(),
          typename PropertyListT = oneapi::experimental::empty_properties_t>
__ESIMD_API std::enable_if_t<
    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
prefetch_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight,
            unsigned SurfacePitch, int X, int Y, PropertyListT props = {}) {
  detail::prefetch_2d_impl<T, BlockWidth, BlockHeight, NBlocks, PropertyListT>(
      Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
}
9505 
/// Stores vector \p Vals as a 2D block (BlockWidth x BlockHeight elements,
/// single block) to the 2D surface at position (X, Y); delegates to
/// detail::store_2d_impl.
template <typename T, int BlockWidth, int BlockHeight = 1,
// NOTE(review): the template-parameter line computing `N` was elided by
// extraction (orig. line 9530) — confirm against the full header.
              T, 1u, BlockHeight, BlockWidth, false /*Transposed*/,
              false /*Transformed*/>(),
          typename PropertyListT = oneapi::experimental::empty_properties_t>
__ESIMD_API std::enable_if_t<
    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
store_2d(T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight,
         unsigned SurfacePitch, int X, int Y, simd<T, N> Vals,
         PropertyListT props = {}) {
  detail::store_2d_impl<T, BlockWidth, BlockHeight, PropertyListT>(
      Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y, Vals);
}
9542 
9567 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
9568  typename AccessorT, int N,
9569  typename T = typename AccessorT::value_type>
9570 __ESIMD_API
9571  std::enable_if_t<detail::is_local_accessor_with_v<
9572  AccessorT, detail::accessor_mode_cap::can_read>,
9573  simd<T, N * get_num_channels_enabled(RGBAMask)>>
9574  gather_rgba(AccessorT acc, simd<uint32_t, N> offsets,
9575  uint32_t global_offset = 0, simd_mask<N> mask = 1) {
9576  return slm_gather_rgba<T, N, RGBAMask>(
9577  offsets + global_offset + __ESIMD_DNS::localAccessorToOffset(acc), mask);
9578 }
9579 
9596 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
9597  typename AccessorT, int N,
9598  typename T = typename AccessorT::value_type>
9599 __ESIMD_API std::enable_if_t<detail::is_local_accessor_with_v<
9600  AccessorT, detail::accessor_mode_cap::can_write>>
9601 scatter_rgba(AccessorT acc, simd<uint32_t, N> offsets,
9602  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
9603  uint32_t global_offset = 0, simd_mask<N> mask = 1) {
9604  detail::validate_rgba_write_channel_mask<RGBAMask>();
9605  slm_scatter_rgba<T, N, RGBAMask>(offsets + global_offset +
9606  __ESIMD_DNS::localAccessorToOffset(acc),
9607  vals, mask);
9608 }
9609 
9612 
9633 template <uint8_t exec_size, uint8_t sfid, uint8_t num_src0, uint8_t num_src1,
9634  uint8_t num_dst, raw_send_eot eot = raw_send_eot::not_eot,
9635  raw_send_sendc sendc = raw_send_sendc::not_sendc, typename T1, int n1,
9636  typename T2, int n2, typename T3, int n3>
9637 __ESIMD_API __ESIMD_NS::simd<T1, n1>
9638 raw_sends(__ESIMD_NS::simd<T1, n1> msg_dst, __ESIMD_NS::simd<T2, n2> msg_src0,
9639  __ESIMD_NS::simd<T3, n3> msg_src1, uint32_t ex_desc,
9640  uint32_t msg_desc, __ESIMD_NS::simd_mask<exec_size> mask = 1) {
9641  constexpr unsigned _Width1 = n1 * sizeof(T1);
9642  static_assert(_Width1 % 32 == 0, "Invalid size for raw send rspVar");
9643  constexpr unsigned _Width2 = n2 * sizeof(T2);
9644  static_assert(_Width2 % 32 == 0, "Invalid size for raw send msg_src0");
9645  constexpr unsigned _Width3 = n3 * sizeof(T3);
9646  static_assert(_Width3 % 32 == 0, "Invalid size for raw send msg_src1");
9647 
9648  using ElemT1 = __ESIMD_DNS::__raw_t<T1>;
9649  using ElemT2 = __ESIMD_DNS::__raw_t<T2>;
9650  using ElemT3 = __ESIMD_DNS::__raw_t<T3>;
9651 
9652  constexpr uint8_t modifier =
9653  ((eot == raw_send_eot::eot) << 1) | (sendc == raw_send_sendc::sendc);
9654 
9655  return __esimd_raw_sends2<ElemT1, n1, ElemT2, n2, ElemT3, n3, exec_size>(
9656  modifier, exec_size, mask.data(), num_src0, num_src1, num_dst, sfid,
9657  ex_desc, msg_desc, msg_src0.data(), msg_src1.data(), msg_dst.data());
9658 }
9659 
/// Issues a raw "send" message (destination + one source) to the
/// fixed-function unit selected by \p sfid and returns the destination
/// payload. \p ex_desc / \p msg_desc are the raw message descriptors.
template <uint8_t exec_size, uint8_t sfid, uint8_t num_src0, uint8_t num_dst,
// NOTE(review): the `raw_send_eot eot = ...` template-parameter line was
// elided by extraction (orig. line 9678) — confirm against the full header.
          raw_send_sendc sendc = raw_send_sendc::not_sendc, typename T1, int n1,
          typename T2, int n2>
__ESIMD_API __ESIMD_NS::simd<T1, n1>
raw_send(__ESIMD_NS::simd<T1, n1> msg_dst, __ESIMD_NS::simd<T2, n2> msg_src0,
         uint32_t ex_desc, uint32_t msg_desc,
         __ESIMD_NS::simd_mask<exec_size> mask = 1) {
  // Operands must occupy a whole number of 32-byte chunks.
  constexpr unsigned _Width1 = n1 * sizeof(T1);
  static_assert(_Width1 % 32 == 0, "Invalid size for raw send rspVar");
  constexpr unsigned _Width2 = n2 * sizeof(T2);
  static_assert(_Width2 % 32 == 0, "Invalid size for raw send msg_src0");

  // The intrinsic operates on the raw (storage) element types.
  using ElemT1 = __ESIMD_DNS::__raw_t<T1>;
  using ElemT2 = __ESIMD_DNS::__raw_t<T2>;

  // Pack the flags: bit 1 = end-of-thread, bit 0 = sendc.
  constexpr uint8_t modifier =
      ((eot == raw_send_eot::eot) << 1) | (sendc == raw_send_sendc::sendc);
  return __esimd_raw_send2<ElemT1, n1, ElemT2, n2, exec_size>(
      modifier, exec_size, mask.data(), num_src0, num_dst, sfid, ex_desc,
      msg_desc, msg_src0.data(), msg_dst.data());
}
9699 
/// Issues a raw "sends" message with two sources and no destination
/// (no result is returned).
template <uint8_t exec_size, uint8_t sfid, uint8_t num_src0, uint8_t num_src1,
// NOTE(review): the `raw_send_eot eot = ...` template-parameter line was
// elided by extraction (orig. line 9718) — confirm against the full header.
          raw_send_sendc sendc = raw_send_sendc::not_sendc, typename T1, int n1,
          typename T2, int n2>
__ESIMD_API void raw_sends(__ESIMD_NS::simd<T1, n1> msg_src0,
                           __ESIMD_NS::simd<T2, n2> msg_src1, uint32_t ex_desc,
                           uint32_t msg_desc,
                           __ESIMD_NS::simd_mask<exec_size> mask = 1) {
  // Operands must occupy a whole number of 32-byte chunks.
  constexpr unsigned _Width1 = n1 * sizeof(T1);
  static_assert(_Width1 % 32 == 0, "Invalid size for raw send msg_src0");
  constexpr unsigned _Width2 = n2 * sizeof(T2);
  static_assert(_Width2 % 32 == 0, "Invalid size for raw send msg_src1");

  // The intrinsic operates on the raw (storage) element types.
  using ElemT1 = __ESIMD_DNS::__raw_t<T1>;
  using ElemT2 = __ESIMD_DNS::__raw_t<T2>;

  // Pack the flags: bit 1 = end-of-thread, bit 0 = sendc.
  constexpr uint8_t modifier =
      ((eot == raw_send_eot::eot) << 1) | (sendc == raw_send_sendc::sendc);
  __esimd_raw_sends2_noresult<ElemT1, n1, ElemT2, n2, exec_size>(
      modifier, exec_size, mask.data(), num_src0, num_src1, sfid, ex_desc,
      msg_desc, msg_src0.data(), msg_src1.data());
}
9739 
/// Issues a raw "send" message with one source and no destination
/// (no result is returned).
template <uint8_t exec_size, uint8_t sfid, uint8_t num_src0,
// NOTE(review): the `raw_send_eot eot = ...` template-parameter line was
// elided by extraction (orig. line 9756) — confirm against the full header.
          raw_send_sendc sendc = raw_send_sendc::not_sendc, typename T1, int n1>
__ESIMD_API void raw_send(__ESIMD_NS::simd<T1, n1> msg_src0, uint32_t ex_desc,
                          uint32_t msg_desc,
                          __ESIMD_NS::simd_mask<exec_size> mask = 1) {
  // The operand must occupy a whole number of 32-byte chunks.
  constexpr unsigned _Width1 = n1 * sizeof(T1);
  static_assert(_Width1 % 32 == 0, "Invalid size for raw send msg_src0");
  using ElemT1 = __ESIMD_DNS::__raw_t<T1>;
  // Pack the flags: bit 1 = end-of-thread, bit 0 = sendc.
  constexpr uint8_t modifier =
      ((eot == raw_send_eot::eot) << 1) | (sendc == raw_send_sendc::sendc);
  __esimd_raw_send2_noresult<ElemT1, n1, exec_size>(
      modifier, exec_size, mask.data(), num_src0, sfid, ex_desc, msg_desc,
      msg_src0.data());
}
9770 
9772 
9774 
9776 
9777 namespace detail {
9778 // -- Outlined implementations of simd_obj_impl class memory access APIs.
9779 
// Copies a contiguous block of memory at `Addr` into this simd_obj_impl
// object, choosing the widest legal access form: OWORD block loads, a
// 32-bit bit-cast copy for 64-bit elements, or chunked gathers otherwise.
template <typename T, int N, class T1, class SFINAE>
template <typename Flags, int ChunkSize, typename>
// NOTE(review): the function-signature lines and a `using UT = ...` alias
// were elided by extraction (orig. lines 9782-9785); the body below reads
// from parameter `Addr` — confirm against the full header.
    Flags) SYCL_ESIMD_FUNCTION {
  constexpr unsigned Size = sizeof(T) * N;
  constexpr unsigned Align = Flags::template alignment<T1>;

  // The copy is tiled into 8-OWORD blocks plus an OWORD-sized remainder.
  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
  constexpr unsigned NumBlocks = Size / BlockSize;
  constexpr unsigned RemSize = Size % BlockSize;

  // Fast path: alignment/size permit block loads and the remainder is a
  // power-of-2 number of OWORDs.
  if constexpr (Align >= OperandSize::DWORD && Size % OperandSize::OWORD == 0 &&
                detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
    if constexpr (NumBlocks > 0) {
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      ForHelper<NumBlocks>::unroll([BlockN, Addr, this](unsigned Block) {
        select<BlockN, 1>(Block * BlockN) =
            block_load<UT, BlockN, Flags>(Addr + (Block * BlockN), Flags{});
      });
    }
    if constexpr (RemSize > 0) {
      constexpr unsigned RemN = RemSize / sizeof(T);
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      select<RemN, 1>(NumBlocks * BlockN) =
          block_load<UT, RemN, Flags>(Addr + (NumBlocks * BlockN), Flags{});
    }
  } else if constexpr (sizeof(T) == 8) {
    // 64-bit elements: copy as twice as many 32-bit elements instead.
    simd<int32_t, N * 2> BC(reinterpret_cast<const int32_t *>(Addr), Flags{});
    bit_cast_view<int32_t>() = BC;
  } else {
    // Fallback: gather ChunkSize elements at a time.
    constexpr unsigned NumChunks = N / ChunkSize;
    if constexpr (NumChunks > 0) {
      simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
      ForHelper<NumChunks>::unroll([Addr, &Offsets, this](unsigned Block) {
        select<ChunkSize, 1>(Block * ChunkSize) =
            gather<UT, ChunkSize>(Addr + (Block * ChunkSize), Offsets);
      });
    }
    constexpr unsigned RemN = N % ChunkSize;
    if constexpr (RemN > 0) {
      if constexpr (RemN == 1) {
        // Single trailing element: plain scalar read.
        select<1, 1>(NumChunks * ChunkSize) = Addr[NumChunks * ChunkSize];
      } else if constexpr (RemN == 8 || RemN == 16) {
        simd<uint32_t, RemN> Offsets(0u, sizeof(T));
        select<RemN, 1>(NumChunks * ChunkSize) =
            gather<UT, RemN>(Addr + (NumChunks * ChunkSize), Offsets);
      } else {
        // Irregular remainder: round the lane count up to a supported gather
        // width and mask off the extra lanes.
        constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
        simd_mask_type<N1> Pred(0);
        Pred.template select<RemN, 1>() = 1;
        simd<uint32_t, N1> Offsets(0u, sizeof(T));
        simd<UT, N1> Vals =
            gather<UT, N1>(Addr + (NumChunks * ChunkSize), Offsets, Pred);
        select<RemN, 1>(NumChunks * ChunkSize) =
            Vals.template select<RemN, 1>();
      }
    }
  }
}
9841 
// Stores the contents of this simd_obj_impl object to memory referenced by
// accessor `acc` at byte offset `offset`: OWORD block stores when alignment
// permits, a 32-bit bit-cast copy for 64-bit elements, or chunked scatters.
template <typename T, int N, class T1, class SFINAE>
template <int ChunkSize, typename Flags, typename AccessorT, typename TOffset>
ESIMD_INLINE void simd_obj_impl<T, N, T1, SFINAE>::copy_to_impl(
    AccessorT acc, TOffset offset) const SYCL_ESIMD_FUNCTION {
  // NOTE(review): a `using UT = ...` alias line was elided by extraction
  // (orig. line 9846) — confirm against the full header.
  constexpr unsigned Size = sizeof(T) * N;
  constexpr unsigned Align = Flags::template alignment<T1>;

  // The copy is tiled into 8-OWORD blocks plus an OWORD-sized remainder.
  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
  constexpr unsigned NumBlocks = Size / BlockSize;
  constexpr unsigned RemSize = Size % BlockSize;

  // Snapshot of the vector data; the lambdas below capture it by reference.
  simd<UT, N> Tmp{data()};
  if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 &&
                detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
    if constexpr (NumBlocks > 0) {
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      ForHelper<NumBlocks>::unroll([BlockN, acc, offset, &Tmp](unsigned Block) {
        block_store<UT, BlockN, AccessorT>(
            acc, offset + (Block * BlockSize),
            Tmp.template select<BlockN, 1>(Block * BlockN));
      });
    }
    if constexpr (RemSize > 0) {
      constexpr unsigned RemN = RemSize / sizeof(T);
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      block_store<UT, RemN, AccessorT>(
          acc, offset + (NumBlocks * BlockSize),
          Tmp.template select<RemN, 1>(NumBlocks * BlockN));
    }
  } else if constexpr (sizeof(T) == 8) {
    // 64-bit elements: store as twice as many 32-bit elements instead.
    simd<int32_t, N * 2> BC = Tmp.template bit_cast_view<int32_t>();
    BC.copy_to(acc, offset, Flags{});
  } else {
    // Fallback: scatter ChunkSize elements at a time.
    constexpr unsigned NumChunks = N / ChunkSize;
    if constexpr (NumChunks > 0) {
      simd<TOffset, ChunkSize> Offsets(0u, sizeof(T));
      ForHelper<NumChunks>::unroll([acc, offset, &Offsets,
                                    &Tmp](unsigned Block) {
        scatter<UT, ChunkSize, AccessorT>(
            acc, Offsets, Tmp.template select<ChunkSize, 1>(Block * ChunkSize),
            offset + (Block * ChunkSize * sizeof(T)));
      });
    }
    constexpr unsigned RemN = N % ChunkSize;
    if constexpr (RemN > 0) {
      if constexpr (RemN == 1 || RemN == 8 || RemN == 16) {
        simd<TOffset, RemN> Offsets(0u, sizeof(T));
        scatter<UT, RemN, AccessorT>(
            acc, Offsets, Tmp.template select<RemN, 1>(NumChunks * ChunkSize),
            offset + (NumChunks * ChunkSize * sizeof(T)));
      } else {
        // Irregular remainder: round the lane count up to a supported scatter
        // width and mask off the extra lanes.
        constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
        simd_mask_type<N1> Pred(0);
        Pred.template select<RemN, 1>() = 1;
        simd<UT, N1> Vals;
        Vals.template select<RemN, 1>() =
            Tmp.template select<RemN, 1>(NumChunks * ChunkSize);
        simd<TOffset, N1> Offsets(0u, sizeof(T));
        scatter<UT, N1, AccessorT>(acc, Offsets, Vals,
                                   offset + (NumChunks * ChunkSize * sizeof(T)),
                                   Pred);
      }
    }
  }
}
9908 
// Fills this simd_obj_impl object from memory referenced by accessor `acc`
// at byte offset `offset`: OWORD block loads when alignment permits, a
// 32-bit bit-cast copy for 64-bit elements, or chunked gathers otherwise.
template <typename T, int N, class T1, class SFINAE>
template <int ChunkSize, typename Flags, typename AccessorT, typename TOffset>
ESIMD_INLINE void simd_obj_impl<T, N, T1, SFINAE>::copy_from_impl(
    AccessorT acc, TOffset offset) SYCL_ESIMD_FUNCTION {
  // NOTE(review): a `using UT = ...` alias line was elided by extraction
  // (orig. line 9913) — confirm against the full header.
  static_assert(sizeof(UT) == sizeof(T));
  constexpr unsigned Size = sizeof(T) * N;
  constexpr unsigned Align = Flags::template alignment<T1>;

  // The copy is tiled into 8-OWORD blocks plus an OWORD-sized remainder.
  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
  constexpr unsigned NumBlocks = Size / BlockSize;
  constexpr unsigned RemSize = Size % BlockSize;

  // Fast path: alignment/size permit block loads and the remainder is a
  // power-of-2 number of OWORDs.
  if constexpr (Align >= OperandSize::DWORD && Size % OperandSize::OWORD == 0 &&
                detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
    if constexpr (NumBlocks > 0) {
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      ForHelper<NumBlocks>::unroll([BlockN, acc, offset, this](unsigned Block) {
        select<BlockN, 1>(Block * BlockN) =
            block_load<UT, BlockN, AccessorT, Flags>(
                acc, offset + (Block * BlockSize), Flags{});
      });
    }
    if constexpr (RemSize > 0) {
      constexpr unsigned RemN = RemSize / sizeof(T);
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      select<RemN, 1>(NumBlocks * BlockN) =
          block_load<UT, RemN, AccessorT, Flags>(
              acc, offset + (NumBlocks * BlockSize), Flags{});
    }
  } else if constexpr (sizeof(T) == 8) {
    // 64-bit elements: load as twice as many 32-bit elements instead.
    simd<int32_t, N * 2> BC(acc, offset, Flags{});
    bit_cast_view<int32_t>() = BC;
  } else {
    // Fallback: gather ChunkSize elements at a time.
    constexpr unsigned NumChunks = N / ChunkSize;
    if constexpr (NumChunks > 0) {
      simd<TOffset, ChunkSize> Offsets(0u, sizeof(T));
      ForHelper<NumChunks>::unroll(
          [acc, offset, &Offsets, this](unsigned Block) {
            select<ChunkSize, 1>(Block * ChunkSize) =
                gather<UT, ChunkSize, AccessorT>(
                    acc, Offsets, offset + (Block * ChunkSize * sizeof(T)));
          });
    }
    constexpr unsigned RemN = N % ChunkSize;
    if constexpr (RemN > 0) {
      if constexpr (RemN == 1 || RemN == 8 || RemN == 16) {
        simd<TOffset, RemN> Offsets(0u, sizeof(T));
        select<RemN, 1>(NumChunks * ChunkSize) = gather<UT, RemN, AccessorT>(
            acc, Offsets, offset + (NumChunks * ChunkSize * sizeof(T)));
      } else {
        // Irregular remainder: round the lane count up to a supported gather
        // width and mask off the extra lanes.
        constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
        simd_mask_type<N1> Pred(0);
        Pred.template select<RemN, 1>() = 1;
        simd<TOffset, N1> Offsets(0u, sizeof(T));
        simd<UT, N1> Vals = gather<UT, N1>(
            acc, Offsets, offset + (NumChunks * ChunkSize * sizeof(T)), Pred);
        select<RemN, 1>(NumChunks * ChunkSize) =
            Vals.template select<RemN, 1>();
      }
    }
  }
}
9972 
// copy_from overload for read-capable device accessors: delegates to the
// shared copy_from_impl with the deduced ChunkSize and Flags.
template <typename T, int N, class T1, class SFINAE>
template <typename AccessorT, typename Flags, int ChunkSize, typename>
ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_read, void>
// NOTE(review): the qualified function name / parameter lines were elided by
// extraction (orig. lines 9976-9977) — confirm against the full header.
    Flags) SYCL_ESIMD_FUNCTION {

  copy_from_impl<ChunkSize, Flags>(acc, offset);
}
9982 
// copy_from overload for read-capable local accessors: delegates to the
// shared copy_from_impl with the deduced ChunkSize and Flags.
template <typename T, int N, class T1, class SFINAE>
template <typename AccessorT, typename Flags, int ChunkSize, typename>
ESIMD_INLINE std::enable_if_t<
    detail::is_local_accessor_with_v<AccessorT, accessor_mode_cap::can_read>,
    void>
simd_obj_impl<T, N, T1, SFINAE>::copy_from(AccessorT acc, uint32_t offset,
                                           Flags) SYCL_ESIMD_FUNCTION {

  copy_from_impl<ChunkSize, Flags>(acc, offset);
}
9993 
// Stores the contents of this simd_obj_impl object to memory at `Addr`:
// OWORD block stores when alignment permits, a 32-bit bit-cast copy for
// 64-bit elements, or chunked scatters otherwise.
template <typename T, int N, class T1, class SFINAE>
template <typename Flags, int ChunkSize, typename>
// NOTE(review): the function-signature lines and a `using UT = ...` alias
// were elided by extraction (orig. lines 9996-9999); the body below writes
// through parameter `Addr` — confirm against the full header.
    Flags) const SYCL_ESIMD_FUNCTION {
  constexpr unsigned Size = sizeof(T) * N;
  constexpr unsigned Align = Flags::template alignment<T1>;

  // The copy is tiled into 8-OWORD blocks plus an OWORD-sized remainder.
  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
  constexpr unsigned NumBlocks = Size / BlockSize;
  constexpr unsigned RemSize = Size % BlockSize;

  // Snapshot of the vector data; the lambdas below capture it by reference.
  simd<UT, N> Tmp{data()};
  if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 &&
                detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
    if constexpr (NumBlocks > 0) {
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      ForHelper<NumBlocks>::unroll([BlockN, Addr, &Tmp](unsigned Block) {
        block_store<UT, BlockN>(Addr + (Block * BlockN),
                                Tmp.template select<BlockN, 1>(Block * BlockN));
      });
    }
    if constexpr (RemSize > 0) {
      constexpr unsigned RemN = RemSize / sizeof(T);
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      block_store<UT, RemN>(Addr + (NumBlocks * BlockN),
                            Tmp.template select<RemN, 1>(NumBlocks * BlockN));
    }
  } else if constexpr (sizeof(T) == 8) {
    // 64-bit elements: store as twice as many 32-bit elements instead.
    simd<int32_t, N * 2> BC = Tmp.template bit_cast_view<int32_t>();
    BC.copy_to(reinterpret_cast<int32_t *>(Addr), Flags{});
  } else {
    // Fallback: scatter ChunkSize elements at a time.
    constexpr unsigned NumChunks = N / ChunkSize;
    if constexpr (NumChunks > 0) {
      simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
      ForHelper<NumChunks>::unroll([Addr, &Offsets, &Tmp](unsigned Block) {
        scatter<UT, ChunkSize>(
            Addr + (Block * ChunkSize), Offsets,
            Tmp.template select<ChunkSize, 1>(Block * ChunkSize));
      });
    }
    constexpr unsigned RemN = N % ChunkSize;
    if constexpr (RemN > 0) {
      if constexpr (RemN == 1) {
        // Single trailing element: plain scalar write.
        Addr[NumChunks * ChunkSize] = Tmp[NumChunks * ChunkSize];
      } else if constexpr (RemN == 8 || RemN == 16) {
        // TODO: GPU runtime may handle scatter of 16 byte elements
        // incorrectly. The code below is a workaround which must be deleted
        // once GPU runtime is fixed.
        if constexpr (sizeof(T) == 1 && RemN == 16) {
          if constexpr (Align % OperandSize::DWORD > 0) {
            // Not DWORD-aligned: fall back to element-wise scalar writes.
            ForHelper<RemN>::unroll([Addr, &Tmp](unsigned Index) {
              Addr[Index + NumChunks * ChunkSize] =
                  Tmp[Index + NumChunks * ChunkSize];
            });
          } else {
            // DWORD-aligned: reinterpret the 16 bytes as four 32-bit values
            // and scatter those under a 4-lane predicate.
            simd_mask_type<8> Pred(0);
            simd<int32_t, 8> Vals;
            Pred.template select<4, 1>() = 1;
            Vals.template select<4, 1>() =
                Tmp.template bit_cast_view<int32_t>().template select<4, 1>(
                    NumChunks * ChunkSize);

            simd<uint32_t, 8> Offsets(0u, sizeof(int32_t));
            scatter<int32_t, 8>(
                reinterpret_cast<int32_t *>(Addr + (NumChunks * ChunkSize)),
                Offsets, Vals, Pred);
          }
        } else {
          simd<uint32_t, RemN> Offsets(0u, sizeof(T));
          scatter<UT, RemN>(
              Addr + (NumChunks * ChunkSize), Offsets,
              Tmp.template select<RemN, 1>(NumChunks * ChunkSize));
        }
      } else {
        // Irregular remainder: round the lane count up to a supported scatter
        // width and mask off the extra lanes.
        constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
        simd_mask_type<N1> Pred(0);
        Pred.template select<RemN, 1>() = 1;
        simd<UT, N1> Vals;
        Vals.template select<RemN, 1>() =
            Tmp.template select<RemN, 1>(NumChunks * ChunkSize);
        simd<uint32_t, N1> Offsets(0u, sizeof(T));
        scatter<UT, N1>(Addr + (NumChunks * ChunkSize), Offsets, Vals, Pred);
      }
    }
  }
}
10082 
// copy_to overload for write-capable device accessors: delegates to the
// shared copy_to_impl with the deduced ChunkSize and Flags.
template <typename T, int N, class T1, class SFINAE>
template <typename AccessorT, typename Flags, int ChunkSize, typename>
ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_write, void>
// NOTE(review): the qualified function name / parameter lines were elided by
// extraction (orig. lines 10086-10087) — confirm against the full header.
    Flags) const SYCL_ESIMD_FUNCTION {
  copy_to_impl<ChunkSize, Flags>(acc, offset);
}
10091 
// copy_to overload for write-capable local accessors: delegates to the
// shared copy_to_impl with the deduced ChunkSize and Flags.
template <typename T, int N, class T1, class SFINAE>
template <typename AccessorT, typename Flags, int ChunkSize, typename>
ESIMD_INLINE std::enable_if_t<
    detail::is_local_accessor_with_v<AccessorT, accessor_mode_cap::can_write>,
    void>
simd_obj_impl<T, N, T1, SFINAE>::copy_to(AccessorT acc, uint32_t offset,
                                         Flags) const SYCL_ESIMD_FUNCTION {
  copy_to_impl<ChunkSize, Flags>(acc, offset);
}
10101 
10102 } // namespace detail
10104 
10105 } // namespace ext::intel::esimd
10106 } // namespace _V1
10107 } // namespace sycl
const auto & data() const noexcept
Definition: simd.hpp:1673
Definition: simd.hpp:1387
std::enable_if_t< __vectorizable< _Up >) &&is_simd_flag_type< _Flags >::value > copy_to(_Up *__buffer, _Flags) const
Definition: simd.hpp:1526
get_vector_element_type< Derived > element_type
Element type of the derived (user) class.
ESIMD_INLINE void copy_from(const Ty *addr, Flags={}) SYCL_ESIMD_FUNCTION
Copy a contiguous block of data from memory into this simd_obj_impl object.
ESIMD_INLINE void copy_to(Ty *addr, Flags={}) const SYCL_ESIMD_FUNCTION
Copy all vector elements of this object into a contiguous block in memory.
The main simd vector class.
Definition: simd.hpp:53
typename base_type::raw_vector_type raw_vector_type
Definition: simd.hpp:60
#define __ESIMD_FP_ATOMIC_OP_TYPE_CHECK(T)
Definition: memory.hpp:4257
raw_send_eot
Specify if end of thread should be set.
Definition: common.hpp:67
rgba_channel_mask
Represents a pixel's channel mask - all possible combinations of enabled channels.
Definition: common.hpp:122
raw_send_sendc
Specify if sendc should be used.
Definition: common.hpp:73
unsigned int SurfaceIndex
Surface index type.
Definition: common.hpp:64
constexpr int get_num_channels_enabled(rgba_channel_mask M)
Definition: common.hpp:145
atomic_op
Represents an atomic operation.
Definition: common.hpp:160
@ fsub
ACM/PVC: Subtraction (floating point): *addr = *addr - src0.
@ fmax
ACM/PVC: Minimum (floating point): *addr = min(*addr, src0).
@ fadd
ACM/PVC: Addition (floating point): *addr = *addr + src0.
@ xchg
Exchange. *addr == src0;
@ fmin
ACM/PVC: Maximum (floating point): *addr = max(*addr, src0).
@ fcmpxchg
ACM/PVC: Compare and exchange (floating point).
__ESIMD_API SZ simd< T, SZ > src1
Definition: math.hpp:180
__ESIMD_API SZ src0
Definition: math.hpp:180
__ESIMD_API std::enable_if_t< ext::oneapi::experimental::is_property_list_v< PropertyListT >, simd< T, N > > block_load(const T *ptr, PropertyListT props={})
Each of the following block load functions loads a contiguous memory block from the address reference...
Definition: memory.hpp:1356
__ESIMD_API std::enable_if_t< __ESIMD_DNS::get_num_args< Op >)==0 &&__ESIMD_DNS::is_rw_local_accessor_v< AccessorT >, simd< T, N > > atomic_update(AccessorT lacc, simd< uint32_t, N > byte_offset, simd_mask< N > mask=1)
simd<T, N> atomic_update(local_accessor lacc, simd<uint32_t, N> byte_offset, simd_mask<N> pred = 1); ...
Definition: memory.hpp:5966
__ESIMD_API std::enable_if_t<(N==8||N==16||N==32) &&(sizeof(T)==4)> slm_scatter_rgba(simd< uint32_t, N > offsets, simd< T, N *get_num_channels_enabled(Mask)> vals, simd_mask< N > mask=1)
Gather data from the Shared Local Memory at specified offsets and return it as simd vector.
Definition: memory.hpp:4961
__ESIMD_API T slm_scalar_load(uint32_t offset)
Load a scalar value from the Shared Local Memory.
Definition: memory.hpp:4743
__ESIMD_API std::enable_if_t< is_simd_flag_type_v< Flags >, simd< T, N > > slm_block_load(uint32_t byte_offset, Flags)
Loads a contiguous block of SLM memory referenced by the given byte-offset offset,...
Definition: memory.hpp:4990
__ESIMD_API std::enable_if_t< ext::oneapi::experimental::is_property_list_v< PropertyListT > > slm_scatter(simd< uint32_t, N/VS > byte_offsets, simd< T, N > vals, simd_mask< N/VS > mask, PropertyListT props={})
template <typename T, int N, int VS = 1, typename PropertyListT = empty_properties_t> void slm_scatte...
Definition: memory.hpp:4793
__ESIMD_API std::enable_if_t< is_simd_flag_type_v< Flags > > slm_block_store(uint32_t offset, simd< T, N > vals, Flags)
Stores elements of the vector vals to a contiguous block of SLM memory at the given byte-offset offse...
Definition: memory.hpp:5473
__ESIMD_API std::enable_if_t< __ESIMD_DNS::get_num_args< Op >)==0, simd< T, N > > slm_atomic_update(simd< uint32_t, N > byte_offset, simd_mask< N > mask=1)
Definition: memory.hpp:5929
__ESIMD_API void slm_init()
Declare per-work-group slm size.
Definition: memory.hpp:4336
__ESIMD_API std::enable_if_t<(N==8||N==16||N==32) &&(sizeof(T)==4), simd< T, N *get_num_channels_enabled(RGBAMask)> > slm_gather_rgba(simd< uint32_t, N > offsets, simd_mask< N > mask=1)
Gather data from the Shared Local Memory at specified offsets and return it as simd vector.
Definition: memory.hpp:4943
__ESIMD_API std::enable_if_t< ext::oneapi::experimental::is_property_list_v< PropertyListT >, simd< T, N > > slm_gather(simd< uint32_t, N/VS > byte_offsets, simd_mask< N/VS > mask, simd< T, N > pass_thru, PropertyListT props={})
template <typename T, int N, int VS, typename PropertyListT = empty_properties_t> simd<T,...
Definition: memory.hpp:4421
__ESIMD_API void slm_scalar_store(uint32_t offset, T val)
Store a scalar value into the Shared Local Memory.
Definition: memory.hpp:4926
__ESIMD_API simd< T, N *get_num_channels_enabled(RGBAMask)> gather_rgba(const T *p, simd< Toffset, N > offsets, simd_mask< N > mask=1)
Gather and transpose pixels from given memory locations defined by the base pointer p and offsets.
Definition: memory.hpp:3973
__ESIMD_API T scalar_load(AccessorTy acc, detail::DeviceAccessorOffsetT offset)
Load a scalar value from an accessor.
Definition: memory.hpp:3916
__ESIMD_API std::enable_if_t< is_simd_flag_type_v< Flags > > block_store(Tx *addr, simd< Tx, N > vals, Flags)
Stores elements of the vector vals to a contiguous block of memory at the given address addr.
Definition: memory.hpp:1281
__ESIMD_API std::enable_if_t< ext::oneapi::experimental::is_property_list_v< PropertyListT > > store_2d(T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, int X, int Y, simd< T, N > Vals, PropertyListT props={})
2D USM pointer block store.
Definition: memory.hpp:9536
__ESIMD_API void scatter_rgba(T *p, simd< Toffset, N > offsets, simd< T, N *get_num_channels_enabled(RGBAMask)> vals, simd_mask< N > mask=1)
Transpose and scatter pixels to given memory locations defined by the base pointer p and offsets.
Definition: memory.hpp:4063
__ESIMD_API std::enable_if_t< ext::oneapi::experimental::is_property_list_v< PropertyListT > > prefetch_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, int X, int Y, PropertyListT props={})
template <typename T, int BlockWidth, int BlockHeight = 1, int NBlocks = 1, int N = detail::get_lsc_b...
Definition: memory.hpp:9500
__ESIMD_API std::enable_if_t< ext::oneapi::experimental::is_property_list_v< PropertyListT > > scatter(T *p, simd< OffsetT, N/VS > byte_offsets, simd< T, N > vals, simd_mask< N/VS > mask, PropertyListT props={})
template <typename T, int N, int VS = 1, typename OffsetT, typename PropertyListT = empty_properties_...
Definition: memory.hpp:677
__ESIMD_API void fence()
esimd::fence sets the memory read/write order.
Definition: memory.hpp:7983
__ESIMD_API void scalar_store(AccessorTy acc, detail::DeviceAccessorOffsetT offset, T val)
Store a scalar value into an accessor.
Definition: memory.hpp:3931
__ESIMD_API void media_block_store(AccessorTy acc, unsigned x, unsigned y, simd< T, m *N > vals)
Media block store.
Definition: memory.hpp:8079
__ESIMD_API std::enable_if_t< detail::is_device_accessor_with_v< AccessorT, detail::accessor_mode_cap::can_read > &&ext::oneapi::experimental::is_property_list_v< PropertyListT > > prefetch(AccessorT acc, PropertyListT props={})
template <typename T, int VS = 1, typename AccessorT, typename PropertyListT = empty_properties_t> vo...
Definition: memory.hpp:9405
fence_mask
Represetns a bit mask to control behavior of esimd::fence.
Definition: memory.hpp:7951
__ESIMD_API std::enable_if_t< ext::oneapi::experimental::is_property_list_v< PropertyListT >, simd< T, N > > load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, int X, int Y, PropertyListT props={})
template <typename T, int BlockWidth, int BlockHeight = 1, int NBlocks = 1, bool Transposed = false,...
Definition: memory.hpp:9457
__ESIMD_API SurfaceIndex get_surface_index(AccessorTy acc)
Get surface index corresponding to a SYCL accessor.
Definition: memory.hpp:53
__ESIMD_API std::enable_if_t< ext::oneapi::experimental::is_property_list_v< PropertyListT > > prefetch(const T *p, simd< OffsetT, N/VS > byte_offsets, simd_mask< N/VS > mask, PropertyListT props={})
template <typename T, int N, int VS, typename OffsetT, typename PropertyListT = empty_properties_t> v...
Definition: memory.hpp:8790
__ESIMD_API simd< T, m *N > media_block_load(AccessorTy acc, unsigned x, unsigned y)
Media block load.
Definition: memory.hpp:8038
__ESIMD_API std::enable_if_t< ext::oneapi::experimental::is_property_list_v< PropertyListT >, simd< T, N > > gather(const T *p, simd< OffsetT, N/VS > byte_offsets, simd_mask< N/VS > mask, simd< T, N > pass_thru, PropertyListT props={})
template <typename T, int N, int VS, typename OffsetT, typename PropertyListT = empty_properties_t> s...
Definition: memory.hpp:289
__ESIMD_API void barrier()
Generic work-group barrier.
Definition: memory.hpp:8016
@ global_coherent_fence
“Commit enable” - wait for fence to complete before continuing.
Definition: memory.hpp:7953
@ l2_flush_constant_data
Flush constant cache.
Definition: memory.hpp:7963
@ local_barrier
Issue SLM memory barrier only. If not set, the memory barrier is global.
Definition: memory.hpp:7971
@ l1_flush_ro_data
Flush L1 read-only data cache.
Definition: memory.hpp:7973
@ l2_flush_rw_data
Flush the read-write data cache.
Definition: memory.hpp:7967
@ __SYCL_DEPRECATED
Creates a software (compiler) barrier, which does not generate any instruction and only prevents inst...
Definition: memory.hpp:7956
@ l2_flush_texture_data
Flush sampler (texture) cache.
Definition: memory.hpp:7959
@ l2_flush_instructions
Flush the instruction cache.
Definition: memory.hpp:7955
__ESIMD_API sycl::ext::intel::esimd::simd< T1, n1 > raw_send(sycl::ext::intel::esimd::simd< T1, n1 > msg_dst, sycl::ext::intel::esimd::simd< T2, n2 > msg_src0, uint32_t ex_desc, uint32_t msg_desc, sycl::ext::intel::esimd::simd_mask< exec_size > mask=1)
Raw send.
Definition: memory.hpp:9682
__ESIMD_API sycl::ext::intel::esimd::simd< T1, n1 > raw_sends(sycl::ext::intel::esimd::simd< T1, n1 > msg_dst, sycl::ext::intel::esimd::simd< T2, n2 > msg_src0, sycl::ext::intel::esimd::simd< T3, n3 > msg_src1, uint32_t ex_desc, uint32_t msg_desc, sycl::ext::intel::esimd::simd_mask< exec_size > mask=1)
Raw sends.
Definition: memory.hpp:9638
void add(const void *DeviceGlobalPtr, const char *UniqueId)
ESIMD_INLINE simd< T, N > lsc_format_ret(simd< T1, N > Vals)
Definition: memory.hpp:88
__ESIMD_API std::enable_if_t< get_num_args< Op >)==0, simd< T, N > > slm_atomic_update_impl(simd< uint32_t, N > offsets, simd_mask< N > pred)
SLM atomic.
Definition: memory.hpp:5783
constexpr bool isMaskedGatherScatterLLVMAvailable()
Definition: memory.hpp:203
static constexpr SurfaceIndex SLM_BTI
Definition: common.hpp:115
constexpr cache_hint getCacheHintForIntrin()
Extracts a cache hint with the given 'Level' to pass it to ESIMD/GENX intrinsics.
Definition: memory.hpp:102
constexpr void check_atomic()
Check the legality of an atomic call in terms of size and type.
Definition: memory.hpp:4266
static void validate_rgba_write_channel_mask()
Definition: memory.hpp:4031
__ESIMD_API simd< T, N > slm_atomic_update_impl(simd< uint32_t, N > offsets, simd< T, N > src0, simd< T, N > src1, simd_mask< N > pred)
SLM atomic.
Definition: memory.hpp:5859
__ESIMD_API std::enable_if_t< detail::is_property_list_v< PropertyListT > > block_store_impl(T *p, simd< T, NElts > vals, simd_mask< 1 > pred)
Definition: memory.hpp:1143
__ESIMD_API simd< T, N *NElts > gather_impl(const T *p, simd< OffsetT, N > offsets, simd_mask< N > pred, simd< T, N *NElts > pass_thru)
USM pointer gather.
Definition: memory.hpp:133
lsc_data_size
Data size or format to read or store.
Definition: common.hpp:407
__ESIMD_API void scatter_impl(T *p, simd< Toffset, N > offsets, simd< T, N *NElts > vals, simd_mask< N > pred)
USM pointer scatter.
Definition: memory.hpp:176
ESIMD_INLINE simd< RT, N > lsc_format_input(simd< T, N > Vals)
Definition: memory.hpp:74
constexpr int lsc_to_internal_atomic_op()
Definition: memory.hpp:5761
__ESIMD_API std::enable_if_t< get_num_args< Op >)==0, simd< T, N > > atomic_update_impl(T *p, simd< Toffset, N > offsets, simd_mask< N > pred)
USM pointer atomic.
Definition: memory.hpp:6153
constexpr ESIMD_INLINE bool isPowerOf2(unsigned int n)
Check if a given 32 bit positive integer is a power of 2 at compile time.
Definition: common.hpp:96
__ESIMD_API std::enable_if_t< is_property_list_v< PropertyListT >, simd< T, NElts > > block_load_impl(const T *p, simd_mask< 1 > pred, simd< T, NElts > pass_thru)
USM pointer transposed gather with 1 channel.
Definition: memory.hpp:888
constexpr lsc_data_size expand_data_size(lsc_data_size DS)
Definition: common.hpp:608
constexpr alignment_key::value_t< K > alignment
cache_hint
L1, L2 or L3 cache hints.
fence_scope
The scope that fence() operation should apply to.
Definition: common.hpp:350
@ group
Wait until all previous memory transactions from this thread are observed within the local thread-gro...
fence_flush_op
The cache flush operation to apply to caches after fence() is complete.
Definition: common.hpp:384
memory_kind
The target memory kind for fence() operation.
Definition: common.hpp:397
@ local
Shared local memory (SLM).
prefetch_impl< _B > prefetch
Definition: fpga_lsu.hpp:45
void prefetch_impl(T *ptr, size_t bytes, Properties properties)
Definition: prefetch.hpp:71
properties< std::tuple<> > empty_properties_t
Definition: properties.hpp:207
decltype(properties{}) empty_properties_t
Definition: properties.hpp:190
std::bit_or< T > bit_or
Definition: functional.hpp:22
std::enable_if_t< detail::is_bf16_storage_type< T >::value, T > fmax(T x, T y)
std::enable_if_t< detail::is_bf16_storage_type< T >::value, T > fmin(T x, T y)
std::bit_xor< T > bit_xor
Definition: functional.hpp:23
std::bit_and< T > bit_and
Definition: functional.hpp:24
std::enable_if_t< sizeof(To)==sizeof(From) &&std::is_trivially_copyable< From >::value &&std::is_trivially_copyable< To >::value, To > bit_cast(const From &from) noexcept
Definition: bit_cast.hpp:52
constexpr stream_manipulator dec
Definition: stream.hpp:784
autodecltype(x) x
const void value_type
Definition: multi_ptr.hpp:457
Definition: access.hpp:18
std::conditional_t< sizeof(T)<=4, std::conditional_t< std::is_signed_v< T >, int32_t, uint32_t >, std::conditional_t< std::is_signed_v< T >, int64_t, uint64_t > > type
Definition: common.hpp:620