26 inline namespace _V1 {
27 namespace ext::intel::esimd {
52 template <
typename AccessorTy>
54 if constexpr (std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
55 sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>) {
58 #ifdef __ESIMD_FORCE_STATELESS_MEM
59 static_assert(sycl::detail::acc_properties::is_image_accessor_v<AccessorTy>,
60 "The function get_surface_index() is available only for "
61 "image- and local-accessors in stateless-only memory mode. "
63 "-fno-sycl-esimd-force-stateless-mem compilation switch.");
65 return __esimd_get_surface_index(
66 detail::AccessorPrivateProxy::getQualifiedPtrOrImageObj(acc));
73 template <
typename RT,
typename T,
int N>
75 if constexpr (
sizeof(T) == 1) {
77 return Vals.template bit_cast_view<uint8_t>();
78 }
else if constexpr (
sizeof(T) == 2) {
80 return Vals.template bit_cast_view<uint16_t>();
82 return Vals.template bit_cast_view<RT>();
87 template <
typename T,
typename T1,
int N>
89 auto Formatted = Vals.template bit_cast_view<T>();
90 if constexpr (
sizeof(T) ==
sizeof(T1)) {
93 constexpr
int Stride = Formatted.length / N;
94 return Formatted.template select<N, Stride>(0);
101 template <
typename PropertyListT, cache_level Level>
104 "ESIMD/GENX intrinsics accept only L1/L2 cache hints");
106 return getPropertyValue<PropertyListT, cache_hint_L1_key>(
cache_hint::none);
108 return getPropertyValue<PropertyListT, cache_hint_L2_key>(
cache_hint::none);
131 template <
typename T,
int NElts,
lsc_data_size DS,
typename PropertyListT,
132 int N,
typename OffsetT>
136 static_assert(std::is_integral_v<OffsetT>,
"Unsupported offset type");
137 check_lsc_vector_size<NElts>();
138 check_lsc_data_size<T, DS>();
139 check_cache_hints<cache_action::load, PropertyListT>();
140 constexpr
auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
141 constexpr
auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
142 constexpr uint16_t AddressScale = 1;
143 constexpr
int ImmOffset = 0;
149 Addrs += convert<uintptr_t>(offsets);
152 __esimd_lsc_load_merge_stateless<MsgT, L1H, L2H, AddressScale, ImmOffset,
153 EDS, VS, Transposed, N>(
155 return lsc_format_ret<T>(Result);
174 template <
typename T,
int NElts,
lsc_data_size DS,
typename PropertyListT,
175 int N,
typename Toffset>
178 static_assert(std::is_integral_v<Toffset>,
"Unsupported offset type");
179 check_lsc_vector_size<NElts>();
180 check_lsc_data_size<T, DS>();
181 check_cache_hints<cache_action::store, PropertyListT>();
182 constexpr
auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
183 constexpr
auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
184 constexpr uint16_t AddressScale = 1;
185 constexpr
int ImmOffset = 0;
191 addrs += convert<uintptr_t>(offsets);
193 __esimd_lsc_store_stateless<MsgT, L1H, L2H, AddressScale, ImmOffset, EDS, VS,
194 Transposed, N>(pred.
data(), addrs.
data(),
204 #ifdef __ESIMD_GATHER_SCATTER_LLVM_IR
258 #ifndef __ESIMD_GATHER_SCATTER_LLVM_IR
284 template <typename T, int N, int VS, typename OffsetT,
285 typename PropertyListT =
287 __ESIMD_API std::enable_if_t<
288 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
simd<T, N>>
290 simd<T, N> pass_thru, PropertyListT props = {}) {
291 static_assert(std::is_integral_v<OffsetT>,
"Unsupported offset type");
292 static_assert(N / VS >= 1 && N % VS == 0,
"N must be divisible by VS");
295 detail::getPropertyValue<PropertyListT, alignment_key>(
sizeof(T));
297 "gather() requires at least element-size alignment");
301 if constexpr (detail::has_cache_hints<PropertyListT>() || VS > 1 ||
303 static_assert(VS == 1 ||
sizeof(T) >= 4,
304 "VS > 1 is supprted only for 4- and 8-byte elements");
306 PropertyListT>(p, byte_offsets, mask, pass_thru);
309 Addrs = Addrs + convert<uint64_t>(byte_offsets);
311 using MsgT = detail::__raw_t<T>;
312 return __esimd_gather_ld<MsgT, N, Alignment>(
313 Addrs.data(), mask.
data(),
343 template <
typename T,
int N,
int VS,
typename OffsetT,
344 typename PropertyListT =
346 __ESIMD_API std::enable_if_t<
347 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
simd<T, N>>
349 PropertyListT props = {}) {
351 detail::getPropertyValue<PropertyListT, alignment_key>(
sizeof(T));
353 "gather() requires at least element-size alignment");
355 if constexpr (detail::has_cache_hints<PropertyListT>() || VS > 1 ||
359 return gather<T, N, VS>(p, byte_offsets, mask, PassThru, props);
362 Addrs += convert<uintptr_t>(byte_offsets);
363 using MsgT = detail::__raw_t<T>;
364 if constexpr (
sizeof(T) == 1) {
365 auto Ret = __esimd_svm_gather<MsgT, N, detail::ElemsPerAddrEncoding<4>(),
366 detail::ElemsPerAddrEncoding<1>()>(
367 Addrs.data(), mask.
data());
368 detail::check_rdregion_params<N * 4, N, 0, N, 4>();
369 return __esimd_rdregion<MsgT, N * 4, N, 0, N, 4>(Ret, 0);
370 }
else if constexpr (
sizeof(T) == 2) {
371 auto Ret = __esimd_svm_gather<MsgT, N, detail::ElemsPerAddrEncoding<2>(),
372 detail::ElemsPerAddrEncoding<2>()>(
373 Addrs.data(), mask.
data());
374 detail::check_rdregion_params<N * 2, N, 0, N, 2>();
375 return __esimd_rdregion<MsgT, N * 2, N, 0, N, 2>(Ret, 0);
377 return __esimd_svm_gather<MsgT, N, detail::ElemsPerAddrEncoding<1>(),
378 detail::ElemsPerAddrEncoding<1>()>(Addrs.data(),
402 template <
typename T,
int N,
int VS,
typename OffsetT,
403 typename PropertyListT =
405 __ESIMD_API std::enable_if_t<
406 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
simd<T, N>>
408 PropertyListT props = {}) {
410 return gather<T, N, VS>(p, byte_offsets, Mask, props);
437 template <
typename T,
int N,
typename OffsetT,
438 typename PropertyListT =
440 __ESIMD_API std::enable_if_t<
441 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
simd<T, N>>
443 simd<T, N> pass_thru, PropertyListT props = {}) {
444 constexpr
int VS = 1;
445 return gather<T, N, VS>(p, byte_offsets, mask, pass_thru, props);
469 template <
typename T,
int N,
typename OffsetT,
470 typename PropertyListT =
472 __ESIMD_API std::enable_if_t<
473 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
simd<T, N>>
475 PropertyListT props = {}) {
476 constexpr
int VS = 1;
477 return gather<T, N, VS>(p, byte_offsets, mask, props);
495 template <
typename T,
int N,
typename OffsetT,
496 typename PropertyListT =
498 __ESIMD_API std::enable_if_t<
499 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
simd<T, N>>
501 constexpr
int VS = 1;
502 return gather<T, N, VS>(p, byte_offsets, props);
533 template <
typename T,
int N,
int VS = 1,
typename OffsetSimdViewT,
534 typename PropertyListT =
536 __ESIMD_API std::enable_if_t<
537 ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
538 detail::is_simd_view_type_v<OffsetSimdViewT>,
541 simd<T, N> pass_thru, PropertyListT props = {}) {
542 return gather<T, N, VS>(p, byte_offsets.read(), mask, pass_thru, props);
568 template <
typename T,
int N,
int VS = 1,
typename OffsetSimdViewT,
569 typename PropertyListT =
571 __ESIMD_API std::enable_if_t<
572 ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
573 detail::is_simd_view_type_v<OffsetSimdViewT>,
576 PropertyListT props = {}) {
577 return gather<T, N, VS>(p, byte_offsets.read(), mask, props);
597 template <
typename T,
int N,
int VS = 1,
typename OffsetSimdViewT,
598 typename PropertyListT =
600 __ESIMD_API std::enable_if_t<
601 ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
602 detail::is_simd_view_type_v<OffsetSimdViewT>,
604 gather(
const T *p, OffsetSimdViewT byte_offsets, PropertyListT props = {}) {
605 return gather<T, N, VS>(p, byte_offsets.read(), props);
620 template <
typename Tx,
int N,
typename Toffset>
621 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>,
simd<Tx, N>>
672 template <
typename T,
int N,
int VS = 1,
typename OffsetT,
673 typename PropertyListT =
675 __ESIMD_API std::enable_if_t<
676 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
679 static_assert(std::is_integral_v<OffsetT>,
"Unsupported offset type");
680 static_assert(N / VS >= 1 && N % VS == 0,
"N must be divisible by VS");
683 detail::getPropertyValue<PropertyListT, alignment_key>(
sizeof(T));
685 "scatter() requires at least element-size alignment");
688 if constexpr (detail::has_cache_hints<PropertyListT>() || VS > 1 ||
691 static_assert(VS == 1 ||
sizeof(T) >= 4,
692 "VS > 1 is supprted only for 4- and 8-byte elements");
694 PropertyListT>(p, byte_offsets, vals, mask);
697 Addrs = Addrs + convert<uint64_t>(byte_offsets);
698 using MsgT = detail::__raw_t<T>;
699 __esimd_scatter_st<MsgT, N, Alignment>(
701 Addrs.data(), mask.
data());
703 using Tx = detail::__raw_t<T>;
706 addrs = addrs + byte_offsets_i;
707 if constexpr (
sizeof(T) == 1) {
708 detail::check_wrregion_params<N * 4, N, 0, N, 4>();
710 D.data(), vals.
data(), 0);
711 __esimd_svm_scatter<Tx, N, detail::ElemsPerAddrEncoding<4>(),
712 detail::ElemsPerAddrEncoding<1>()>(
713 addrs.data(), D.data(), mask.
data());
714 }
else if constexpr (
sizeof(T) == 2) {
715 detail::check_wrregion_params<N * 2, N, 0, N, 2>();
717 D.data(), vals.
data(), 0);
718 __esimd_svm_scatter<Tx, N, detail::ElemsPerAddrEncoding<2>(),
719 detail::ElemsPerAddrEncoding<2>()>(
720 addrs.data(), D.data(), mask.
data());
722 __esimd_svm_scatter<Tx, N, detail::ElemsPerAddrEncoding<1>(),
723 detail::ElemsPerAddrEncoding<1>()>(
724 addrs.data(), vals.
data(), mask.
data());
749 template <
typename T,
int N,
int VS = 1,
typename OffsetT,
750 typename PropertyListT =
752 __ESIMD_API std::enable_if_t<
753 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
755 PropertyListT props = {}) {
757 scatter<T, N, VS>(p, byte_offsets, vals, Mask, props);
784 template <
typename T,
int N,
int VS = 1,
typename OffsetSimdViewT,
785 typename PropertyListT =
787 __ESIMD_API std::enable_if_t<
788 detail::is_simd_view_type_v<OffsetSimdViewT> &&
789 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
792 scatter<T, N, VS>(p, byte_offsets.read(), vals, mask, props);
817 template <
typename T,
int N,
int VS = 1,
typename OffsetSimdViewT,
818 typename PropertyListT =
820 __ESIMD_API std::enable_if_t<
821 detail::is_simd_view_type_v<OffsetSimdViewT> &&
822 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
824 PropertyListT props = {}) {
826 scatter<T, N, VS>(p, byte_offsets.read(), vals, Mask, props);
840 template <
typename Tx,
int N,
typename Toffset>
841 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> && N == 1>
849 #ifdef __ESIMD_FORCE_STATELESS_MEM
886 template <
typename T,
int NElts,
typename PropertyListT>
887 __ESIMD_API std::enable_if_t<is_property_list_v<PropertyListT>,
simd<T, NElts>>
890 check_cache_hints<cache_action::load, PropertyListT>();
892 PropertyListT::template get_property<alignment_key>().value;
894 (
Alignment >= __ESIMD_DNS::OperandSize::DWORD &&
sizeof(T) <= 4) ||
895 (
Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
sizeof(T) > 4),
896 "Incorrect alignment for the data type");
898 constexpr
int SmallIntFactor64Bit =
sizeof(uint64_t) /
sizeof(T);
899 constexpr
int SmallIntFactor32Bit =
900 sizeof(uint32_t) /
sizeof(T) > 1 ?
sizeof(uint32_t) /
sizeof(T) : 1;
901 static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
902 "Number of elements is not supported by Transposed load");
908 constexpr
bool Use64BitData =
909 Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
910 (NElts *
sizeof(T)) %
sizeof(uint64_t) == 0 &&
911 (
sizeof(T) !=
sizeof(uint32_t) || NElts *
sizeof(T) > 256);
912 constexpr
int SmallIntFactor =
913 Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
914 constexpr
int FactoredNElts = NElts / SmallIntFactor;
915 check_lsc_vector_size<FactoredNElts>();
918 using LoadElemT = __ESIMD_DNS::__raw_t<
919 std::conditional_t<SmallIntFactor == 1, T,
920 std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
921 constexpr
auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
922 constexpr
auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
924 constexpr uint16_t AddressScale = 1;
925 constexpr
int ImmOffset = 0;
935 pass_thru.template bit_cast_view<LoadElemT>();
937 __esimd_lsc_load_merge_stateless<LoadElemT, L1H, L2H, AddressScale,
938 ImmOffset, ActualDS, VS, Transposed, N>(
940 return Result.template bit_cast_view<T>();
975 template <
typename T,
int NElts,
typename PropertyListT,
typename AccessorT>
977 std::enable_if_t<detail::is_device_accessor_with_v<
978 AccessorT, detail::accessor_mode_cap::can_read> &&
979 is_property_list_v<PropertyListT>,
983 #ifdef __ESIMD_FORCE_STATELESS_MEM
985 return block_load_impl<T, NElts, PropertyListT>(
986 accessorToPointer<T>(acc, offset), pred, PassThru);
989 check_cache_hints<cache_action::load, PropertyListT>();
991 detail::getPropertyValue<PropertyListT, alignment_key>(
sizeof(T));
993 (
Alignment >= __ESIMD_DNS::OperandSize::DWORD &&
sizeof(T) <= 4) ||
994 (
Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
sizeof(T) > 4),
995 "Incorrect alignment for the data type");
997 constexpr
int SmallIntFactor64Bit =
sizeof(uint64_t) /
sizeof(T);
998 constexpr
int SmallIntFactor32Bit =
999 sizeof(uint32_t) /
sizeof(T) > 1 ?
sizeof(uint32_t) /
sizeof(T) : 1;
1000 static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
1001 "Number of elements is not supported by Transposed load");
1007 constexpr
bool Use64BitData =
1008 Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
1009 (NElts *
sizeof(T)) %
sizeof(uint64_t) == 0 &&
1010 (
sizeof(T) !=
sizeof(uint32_t) || NElts *
sizeof(T) > 256);
1011 constexpr
int SmallIntFactor =
1012 Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
1013 constexpr
int FactoredNElts = NElts / SmallIntFactor;
1014 check_lsc_vector_size<FactoredNElts>();
1017 using LoadElemT = __ESIMD_DNS::__raw_t<
1018 std::conditional_t<SmallIntFactor == 1, T,
1019 std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
1020 constexpr
auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
1021 constexpr
auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
1022 constexpr uint16_t AddressScale = 1;
1023 constexpr
int ImmOffset = 0;
1026 constexpr
auto VS = to_lsc_vector_size<FactoredNElts>();
1028 constexpr
int N = 1;
1034 __esimd_lsc_load_bti<LoadElemT, L1H, L2H, AddressScale, ImmOffset,
1035 ActualDS, VS, Transposed, N>(pred.
data(),
1036 Offsets.
data(), SI);
1037 return Result.template bit_cast_view<T>();
1074 template <
typename T,
int NElts,
typename PropertyListT,
typename AccessorT>
1076 std::enable_if_t<detail::is_device_accessor_with_v<
1077 AccessorT, detail::accessor_mode_cap::can_read> &&
1078 is_property_list_v<PropertyListT>,
1082 #ifdef __ESIMD_FORCE_STATELESS_MEM
1083 return block_load_impl<T, NElts, PropertyListT>(
1084 accessorToPointer<T>(acc, offset), pred, pass_thru);
1087 check_cache_hints<cache_action::load, PropertyListT>();
1089 PropertyListT::template get_property<alignment_key>().value;
1091 (
Alignment >= __ESIMD_DNS::OperandSize::DWORD &&
sizeof(T) <= 4) ||
1092 (
Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
sizeof(T) > 4),
1093 "Incorrect alignment for the data type");
1095 constexpr
int SmallIntFactor64Bit =
sizeof(uint64_t) /
sizeof(T);
1096 constexpr
int SmallIntFactor32Bit =
1097 sizeof(uint32_t) /
sizeof(T) > 1 ?
sizeof(uint32_t) /
sizeof(T) : 1;
1098 static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
1099 "Number of elements is not supported by Transposed load");
1105 constexpr
bool Use64BitData =
1106 Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
1107 (NElts *
sizeof(T)) %
sizeof(uint64_t) == 0 &&
1108 (
sizeof(T) !=
sizeof(uint32_t) || NElts *
sizeof(T) > 256);
1109 constexpr
int SmallIntFactor =
1110 Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
1111 constexpr
int FactoredNElts = NElts / SmallIntFactor;
1112 check_lsc_vector_size<FactoredNElts>();
1115 using LoadElemT = __ESIMD_DNS::__raw_t<
1116 std::conditional_t<SmallIntFactor == 1, T,
1117 std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
1118 constexpr
auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
1119 constexpr
auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
1120 constexpr uint16_t AddressScale = 1;
1121 constexpr
int ImmOffset = 0;
1124 constexpr
auto VS = to_lsc_vector_size<FactoredNElts>();
1126 constexpr
int N = 1;
1132 pass_thru.template bit_cast_view<LoadElemT>();
1134 __esimd_lsc_load_merge_bti<LoadElemT, L1H, L2H, AddressScale, ImmOffset,
1135 ActualDS, VS, Transposed, N>(
1137 return Result.template bit_cast_view<T>();
1141 template <
typename T,
int NElts,
typename PropertyListT>
1142 __ESIMD_API std::enable_if_t<detail::is_property_list_v<PropertyListT>>
1144 detail::check_cache_hints<cache_action::store, PropertyListT>();
1146 detail::getPropertyValue<PropertyListT, alignment_key>(
sizeof(T));
1148 (
Alignment >= __ESIMD_DNS::OperandSize::DWORD &&
sizeof(T) <= 4) ||
1149 (
Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
sizeof(T) > 4),
1150 "Incorrect alignment for the data type");
1152 constexpr
int SmallIntFactor64Bit =
sizeof(uint64_t) /
sizeof(T);
1153 constexpr
int SmallIntFactor32Bit =
1154 sizeof(uint32_t) /
sizeof(T) > 1 ?
sizeof(uint32_t) /
sizeof(T) : 1;
1155 static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
1156 "Number of elements is not supported by Transposed store");
1162 constexpr
bool Use64BitData =
1163 Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
1164 (NElts *
sizeof(T)) %
sizeof(uint64_t) == 0 &&
1165 (
sizeof(T) !=
sizeof(uint32_t) || NElts *
sizeof(T) > 256);
1167 constexpr
int SmallIntFactor =
1168 Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
1169 constexpr
int FactoredNElts = NElts / SmallIntFactor;
1171 check_lsc_vector_size<FactoredNElts>();
1173 using StoreType = __ESIMD_DNS::__raw_t<
1174 std::conditional_t<SmallIntFactor == 1, T,
1175 std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
1176 constexpr
auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
1177 constexpr
auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
1178 constexpr uint16_t AddressScale = 1;
1179 constexpr
int ImmOffset = 0;
1184 constexpr
int N = 1;
1187 __esimd_lsc_store_stateless<StoreType, L1H, L2H, AddressScale, ImmOffset,
1188 ActualDS, VS, Transposed, N>(
1190 sycl::bit_cast<__ESIMD_DNS::vector_type_t<StoreType, FactoredNElts>>(
1194 template <
typename T,
int NElts,
typename PropertyListT,
typename AccessorT>
1196 std::enable_if_t<detail::is_device_accessor_with_v<
1197 AccessorT, detail::accessor_mode_cap::can_write> &&
1198 detail::is_property_list_v<PropertyListT>>
1201 #ifdef __ESIMD_FORCE_STATELESS_MEM
1202 block_store_impl<T, NElts, PropertyListT>(accessorToPointer<T>(acc, offset),
1206 check_cache_hints<cache_action::store, PropertyListT>();
1208 detail::getPropertyValue<PropertyListT, alignment_key>(
sizeof(T));
1210 (
Alignment >= __ESIMD_DNS::OperandSize::DWORD &&
sizeof(T) <= 4) ||
1211 (
Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
sizeof(T) > 4),
1212 "Incorrect alignment for the data type");
1214 constexpr
int SmallIntFactor64Bit =
sizeof(uint64_t) /
sizeof(T);
1215 constexpr
int SmallIntFactor32Bit =
1216 sizeof(uint32_t) /
sizeof(T) >
static_cast<size_t>(1)
1217 ?
sizeof(uint32_t) /
sizeof(T)
1218 :
static_cast<size_t>(1);
1219 static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
1220 "Number of elements is not supported by Transposed store");
1226 constexpr
bool Use64BitData =
1227 Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
1228 (NElts *
sizeof(T)) %
sizeof(uint64_t) == 0 &&
1229 (
sizeof(T) !=
sizeof(uint32_t) || NElts *
sizeof(T) > 256);
1230 constexpr
int SmallIntFactor =
1231 Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
1232 constexpr
int FactoredNElts = NElts / SmallIntFactor;
1233 check_lsc_vector_size<FactoredNElts>();
1236 using StoreElemT = __ESIMD_DNS::__raw_t<
1237 std::conditional_t<SmallIntFactor == 1, T,
1238 std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
1239 constexpr
auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
1240 constexpr
auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
1241 constexpr uint16_t AddressScale = 1;
1242 constexpr
int ImmOffset = 0;
1245 constexpr
auto VS = to_lsc_vector_size<FactoredNElts>();
1247 constexpr
int N = 1;
1253 __esimd_lsc_store_bti<StoreElemT, L1H, L2H, AddressScale, ImmOffset, ActualDS,
1256 sycl::bit_cast<__ESIMD_DNS::vector_type_t<StoreElemT, FactoredNElts>>(
1278 template <
typename Tx,
int N,
1280 __ESIMD_API std::enable_if_t<is_simd_flag_type_v<Flags>>
1282 using T =
typename detail::__raw_t<Tx>;
1284 constexpr
size_t Align = Flags::template alignment<simd<T, N>>;
1285 __esimd_svm_block_st<T, N, Align>(
reinterpret_cast<VecT *
>(addr),
1351 template <
typename T,
int N,
1352 typename PropertyListT =
1354 __ESIMD_API std::enable_if_t<
1355 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
simd<T, N>>
1357 constexpr
size_t DefaultAlignment = (
sizeof(T) <= 4) ? 4 :
sizeof(T);
1358 using NewPropertyListT =
1359 detail::add_alignment_property_t<PropertyListT, DefaultAlignment>;
1360 if constexpr (detail::has_cache_hints<PropertyListT>()) {
1363 return detail::block_load_impl<T, N, NewPropertyListT>(ptr, Mask, PassThru);
1366 NewPropertyListT::template get_property<alignment_key>().value;
1406 template <
typename T,
int N,
1407 typename PropertyListT =
1409 __ESIMD_API std::enable_if_t<
1410 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
simd<T, N>>
1411 block_load(
const T *ptr,
size_t byte_offset, PropertyListT props = {}) {
1412 const T *AdjustedPtr =
reinterpret_cast<const T *
>(
1413 reinterpret_cast<const int8_t *
>(ptr) + byte_offset);
1414 return block_load<T, N>(AdjustedPtr, props);
1449 template <
typename T,
int N,
1450 typename PropertyListT =
1453 std::enable_if_t<detail::is_property_list_v<PropertyListT>,
simd<T, N>>
1455 constexpr
size_t DefaultAlignment = (
sizeof(T) <= 4) ? 4 :
sizeof(T);
1456 using NewPropertyListT =
1457 detail::add_alignment_property_t<PropertyListT, DefaultAlignment>;
1459 return detail::block_load_impl<T, N, NewPropertyListT>(ptr, pred, PassThru);
1495 template <
typename T,
int N,
1496 typename PropertyListT =
1498 __ESIMD_API std::enable_if_t<
1499 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
simd<T, N>>
1501 PropertyListT props = {}) {
1502 const T *AdjustedPtr =
reinterpret_cast<const T *
>(
1503 reinterpret_cast<const int8_t *
>(ptr) + byte_offset);
1504 return block_load<T, N>(AdjustedPtr, pred, props);
1539 template <
typename T,
int N,
1540 typename PropertyListT =
1542 __ESIMD_API std::enable_if_t<
1543 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
simd<T, N>>
1545 PropertyListT props = {}) {
1546 constexpr
size_t DefaultAlignment = (
sizeof(T) <= 4) ? 4 :
sizeof(T);
1547 using NewPropertyListT =
1548 detail::add_alignment_property_t<PropertyListT, DefaultAlignment>;
1549 return detail::block_load_impl<T, N, NewPropertyListT>(ptr, pred, pass_thru);
1586 template <
typename T,
int N,
1587 typename PropertyListT =
1589 __ESIMD_API std::enable_if_t<
1590 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
simd<T, N>>
1592 simd<T, N> pass_thru, PropertyListT props = {}) {
1593 const T *AdjustedPtr =
reinterpret_cast<const T *
>(
1594 reinterpret_cast<const int8_t *
>(ptr) + byte_offset);
1595 return block_load<T, N>(AdjustedPtr, pred, pass_thru, props);
1613 template <
typename Tx,
int N,
1615 __ESIMD_API std::enable_if_t<is_simd_flag_type_v<Flags>,
simd<Tx, N>>
1617 using T =
typename detail::__raw_t<Tx>;
1619 constexpr
size_t Align = Flags::template alignment<simd<T, N>>;
1620 return __esimd_svm_block_ld<T, N, Align>(
1621 reinterpret_cast<const VecT *
>(addr));
1639 template <
typename Tx,
int N,
typename AccessorTy,
1641 typename = std::enable_if_t<
1642 is_simd_flag_type_v<Flags> &&
1643 detail::is_device_accessor_with_v<
1644 AccessorTy, detail::accessor_mode_cap::can_read>>,
1645 class T = detail::__raw_t<Tx>>
1649 #ifdef __ESIMD_FORCE_STATELESS_MEM
1650 return block_load<Tx, N>(__ESIMD_DNS::accessorToPointer<Tx>(acc, byte_offset),
1653 std::ignore = flags;
1654 constexpr
unsigned Sz =
sizeof(T) * N;
1655 static_assert(Sz >= detail::OperandSize::OWORD,
1656 "block size must be at least 1 oword");
1657 static_assert(Sz % detail::OperandSize::OWORD == 0,
1658 "block size must be whole number of owords");
1660 "block must be 1, 2, 4 or 8 owords long");
1661 static_assert(Sz <= 8 * detail::OperandSize::OWORD,
1662 "block size must be at most 8 owords");
1664 auto surf_ind = __esimd_get_surface_index(
1665 detail::AccessorPrivateProxy::getQualifiedPtrOrImageObj(acc));
1668 detail::OperandSize::OWORD) {
1669 return __esimd_oword_ld<T, N>(surf_ind, byte_offset >> 4);
1671 return __esimd_oword_ld_unaligned<T, N>(surf_ind, byte_offset);
1740 template <
typename T,
int N,
typename AccessorT,
1741 typename PropertyListT =
1743 __ESIMD_API std::enable_if_t<
1744 ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
1745 detail::is_device_accessor_with_v<AccessorT,
1746 detail::accessor_mode_cap::can_read>,
1749 PropertyListT props = {}) {
1750 #ifdef __ESIMD_FORCE_STATELESS_MEM
1751 return block_load<T, N>(detail::accessorToPointer<T>(acc, byte_offset),
1756 constexpr
size_t DefaultAlignment = (
sizeof(T) <= 4) ? 4 :
sizeof(T);
1758 detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
1761 constexpr
size_t Size =
sizeof(T) * N;
1762 constexpr
size_t OWord = detail::OperandSize::OWORD;
1763 constexpr
bool IsLegacySize = Size == OWord || Size == 2 * OWord ||
1764 Size == 4 * OWord || Size == 8 * OWord;
1766 using NewPropertyListT =
1767 detail::add_alignment_property_t<PropertyListT, DefaultAlignment>;
1768 if constexpr (detail::has_cache_hints<PropertyListT>() || !IsLegacySize) {
1769 return detail::block_load_impl<T, N, NewPropertyListT>(acc, byte_offset,
1773 NewPropertyListT::template get_property<alignment_key>().value;
1808 template <
typename T,
int N,
typename AccessorT,
1809 typename PropertyListT =
1811 __ESIMD_API std::enable_if_t<
1812 ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
1813 detail::is_device_accessor_with_v<AccessorT,
1814 detail::accessor_mode_cap::can_read>,
1819 using NewPropertyListT =
1820 detail::add_or_replace_alignment_property_t<PropertyListT, 16>;
1821 return block_load<T, N>(acc, 0, NewPropertyListT{});
1855 template <
typename T,
int N,
typename AccessorT,
1856 typename PropertyListT =
1858 __ESIMD_API std::enable_if_t<
1859 ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
1860 detail::is_device_accessor_with_v<AccessorT,
1861 detail::accessor_mode_cap::can_read>,
1865 PropertyListT = {}) {
1868 constexpr
size_t DefaultAlignment = (
sizeof(T) <= 4) ? 4 :
sizeof(T);
1869 using NewPropertyListT =
1870 detail::add_alignment_property_t<PropertyListT, DefaultAlignment>;
1871 return detail::block_load_impl<T, N, NewPropertyListT>(acc, byte_offset, pred,
1906 template <
typename T,
int N,
typename AccessorT,
1907 typename PropertyListT =
1909 __ESIMD_API std::enable_if_t<
1910 ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
1911 detail::is_device_accessor_with_v<AccessorT,
1912 detail::accessor_mode_cap::can_read>,
1917 return block_load<T, N>(acc, byte_offset, pred, PassThru, props);
1947 template <
typename T,
int N,
typename AccessorT,
1948 typename PropertyListT =
1950 __ESIMD_API std::enable_if_t<
1951 ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
1952 detail::is_device_accessor_with_v<AccessorT,
1953 detail::accessor_mode_cap::can_read>,
1956 PropertyListT = {}) {
1959 using NewPropertyListT =
1960 detail::add_or_replace_alignment_property_t<PropertyListT, 16>;
1961 return block_load<T, N>(acc, 0, pred, pass_thru, NewPropertyListT{});
1990 template <
typename T,
int N,
typename AccessorT,
1991 typename PropertyListT =
1993 __ESIMD_API std::enable_if_t<
1994 ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
1995 detail::is_device_accessor_with_v<AccessorT,
1996 detail::accessor_mode_cap::can_read>,
2001 using NewPropertyListT =
2002 detail::add_or_replace_alignment_property_t<PropertyListT, 16>;
2004 return block_load<T, N>(acc, 0, pred, PassThru, NewPropertyListT{});
2060 template <
typename T,
int N,
2061 typename PropertyListT =
2063 __ESIMD_API std::enable_if_t<detail::is_property_list_v<PropertyListT>>
2065 if constexpr (detail::has_cache_hints<PropertyListT>()) {
2066 constexpr
size_t DefaultAlignment = (
sizeof(T) <= 4) ? 4 :
sizeof(T);
2067 using NewPropertyListT =
2068 detail::add_alignment_property_t<PropertyListT, DefaultAlignment>;
2070 detail::block_store_impl<T, N, NewPropertyListT>(ptr, vals, Mask);
2075 detail::getPropertyValue<PropertyListT, alignment_key>(
2076 detail::OperandSize::OWORD);
2115 template <
typename T,
int N,
2116 typename PropertyListT =
2118 __ESIMD_API std::enable_if_t<
2119 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
2121 PropertyListT props = {}) {
2123 reinterpret_cast<T *
>(
reinterpret_cast<int8_t *
>(ptr) + byte_offset);
2124 block_store<T, N>(AdjustedPtr, vals, props);
2159 template <
typename T,
int N,
2160 typename PropertyListT =
2162 __ESIMD_API std::enable_if_t<detail::is_property_list_v<PropertyListT>>
2164 PropertyListT = {}) {
2165 constexpr
size_t DefaultAlignment = (
sizeof(T) <= 4) ? 4 :
sizeof(T);
2166 using NewPropertyListT =
2167 detail::add_alignment_property_t<PropertyListT, DefaultAlignment>;
2168 detail::block_store_impl<T, N, NewPropertyListT>(ptr, vals, pred);
2207 template <
typename T,
int N,
2208 typename PropertyListT =
2210 __ESIMD_API std::enable_if_t<
2211 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
2213 PropertyListT props = {}) {
2215 reinterpret_cast<T *
>(
reinterpret_cast<int8_t *
>(ptr) + byte_offset);
2216 block_store<T, N>(AdjustedPtr, vals, pred, props);
2281 template <
typename T,
int N,
typename AccessorT,
2282 typename PropertyListT =
2284 __ESIMD_API std::enable_if_t<
2285 ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
2286 detail::is_device_accessor_with_v<AccessorT,
2287 detail::accessor_mode_cap::can_write>>
2290 #ifdef __ESIMD_FORCE_STATELESS_MEM
2291 block_store<T, N>(detail::accessorToPointer<T>(acc, byte_offset), vals,
2294 constexpr
int DefaultLSCAlignment = (
sizeof(T) <= 4) ? 4 :
sizeof(T);
2296 detail::getPropertyValue<PropertyListT, alignment_key>(
2297 DefaultLSCAlignment);
2298 constexpr
bool AlignmentRequiresLSC =
2299 PropertyListT::template has_property<alignment_key>() &&
Alignment < 16;
2300 using Tx = detail::__raw_t<T>;
2301 constexpr
unsigned Sz =
sizeof(Tx) * N;
2302 constexpr
bool SzRequiresLSC =
2303 Sz < detail::OperandSize::OWORD || Sz % detail::OperandSize::OWORD != 0 ||
2305 Sz > 8 * detail::OperandSize::OWORD;
2306 if constexpr (detail::has_cache_hints<PropertyListT>() ||
2307 AlignmentRequiresLSC || SzRequiresLSC) {
2308 using NewPropertyListT =
2309 detail::add_alignment_property_t<PropertyListT, DefaultLSCAlignment>;
2311 detail::block_store_impl<T, N, NewPropertyListT>(acc, byte_offset, vals,
2314 auto surf_ind = __esimd_get_surface_index(
2315 detail::AccessorPrivateProxy::getQualifiedPtrOrImageObj(acc));
2316 __esimd_oword_st<Tx, N>(surf_ind, byte_offset >> 4, vals.
data());
2350 template <
typename T,
int N,
typename AccessorT,
2351 typename PropertyListT =
2353 __ESIMD_API std::enable_if_t<
2354 ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
2355 detail::is_device_accessor_with_v<AccessorT,
2356 detail::accessor_mode_cap::can_write>>
2360 using NewPropertyListT =
2361 detail::add_or_replace_alignment_property_t<PropertyListT, 16>;
2362 block_store<T, N>(acc, 0, vals, NewPropertyListT{});
2396 template <
typename T,
int N,
typename AccessorT,
2397 typename PropertyListT =
2399 __ESIMD_API std::enable_if_t<
2400 ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
2401 detail::is_device_accessor_with_v<AccessorT,
2402 detail::accessor_mode_cap::can_write>>
2405 constexpr
size_t DefaultAlignment = (
sizeof(T) <= 4) ? 4 :
sizeof(T);
2406 using NewPropertyListT =
2407 detail::add_alignment_property_t<PropertyListT, DefaultAlignment>;
2408 detail::block_store_impl<T, N, NewPropertyListT>(acc, byte_offset, vals,
2436 template <
typename T,
int N,
typename AccessorT,
2437 typename PropertyListT =
2439 __ESIMD_API std::enable_if_t<
2440 ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
2441 detail::is_device_accessor_with_v<AccessorT,
2442 detail::accessor_mode_cap::can_write>>
2444 PropertyListT props = {}) {
2447 using NewPropertyListT =
2448 detail::add_or_replace_alignment_property_t<PropertyListT, 16>;
2449 block_store<T, N>(acc, 0, vals, pred, NewPropertyListT{});
2460 template <
typename T,
int N,
typename AccessorTy>
2461 ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t<
2462 std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
2463 is_accessor_with_v<AccessorTy, detail::accessor_mode_cap::can_write>>
2468 if constexpr (
sizeof(T) == 8) {
2469 scatter_impl<uint32_t, N>(
2470 acc, vals.template bit_cast_view<uint32_t>().template select<N, 2>(0),
2471 offsets, glob_offset, mask);
2472 scatter_impl<uint32_t, N>(
2473 acc, vals.template bit_cast_view<uint32_t>().template select<N, 2>(1),
2474 offsets, glob_offset +
sizeof(uint32_t), mask);
2476 constexpr
int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
2478 constexpr int16_t scale = 0;
2481 if constexpr (
sizeof(T) < 4) {
2482 using Tint = std::conditional_t<std::is_integral_v<T>, T,
2483 detail::uint_type_t<
sizeof(T)>>;
2484 using Treal = __raw_t<T>;
2485 simd<Tint, N> vals_int = bitcast<Tint, Treal, N>(std::move(vals).data());
2486 using PromoT =
typename std::conditional_t<std::is_signed<Tint>::value,
2488 const simd<PromoT, N> promo_vals = convert<PromoT>(std::move(vals_int));
2489 __esimd_scatter_scaled<PromoT, N, decltype(si), TypeSizeLog2, scale>(
2490 mask.
data(), si, glob_offset, offsets.data(), promo_vals.data());
2492 using Treal = __raw_t<T>;
2493 if constexpr (!std::is_same_v<Treal, T>) {
2495 __esimd_scatter_scaled<Treal, N, decltype(si), TypeSizeLog2, scale>(
2496 mask.
data(), si, glob_offset, offsets.data(), Values.data());
2498 __esimd_scatter_scaled<T, N, decltype(si), TypeSizeLog2, scale>(
2499 mask.
data(), si, glob_offset, offsets.data(), vals.data());
2505 #ifndef __ESIMD_FORCE_STATELESS_MEM
2523 template <
typename T,
int NElts,
lsc_data_size DS,
typename PropertyListT,
2524 int N,
typename AccessorTy,
typename OffsetT>
2525 __ESIMD_API std::enable_if_t<
2526 is_device_accessor_with_v<AccessorTy, accessor_mode_cap::can_write>>
2529 static_assert(std::is_integral_v<OffsetT>,
2530 "Scatter must have integral byte_offset type");
2531 static_assert(
sizeof(OffsetT) <= 4,
2532 "Implicit truncation of 64-bit byte_offset to 32-bit is "
2533 "disabled. Use -fsycl-esimd-force-stateless-mem or explicitly "
2534 "convert offsets to a 32-bit vector");
2535 check_lsc_vector_size<NElts>();
2536 check_lsc_data_size<T, DS>();
2537 check_cache_hints<cache_action::store, PropertyListT>();
2538 constexpr
auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
2539 constexpr
auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
2540 constexpr uint16_t AddressScale = 1;
2541 constexpr
int ImmOffset = 0;
2549 __esimd_lsc_store_bti<MsgT, L1H, L2H, AddressScale, ImmOffset, EDS, LSCNElts,
2550 Transposed, N>(pred.
data(), ByteOffsets32.data(),
2555 template <
typename T,
int N,
typename AccessorTy>
2556 __ESIMD_API std::enable_if_t<
2557 (std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
2558 is_accessor_with_v<AccessorTy, detail::accessor_mode_cap::can_read>),
2564 if constexpr (
sizeof(T) == 8) {
2566 Res.template bit_cast_view<uint32_t>().template select<N, 2>(0) =
2567 gather_impl<uint32_t, N>(acc, offsets, glob_offset, mask);
2568 Res.template bit_cast_view<uint32_t>().template select<N, 2>(1) =
2569 gather_impl<uint32_t, N>(acc, offsets, glob_offset +
sizeof(uint32_t),
2573 using Treal = __raw_t<T>;
2574 constexpr
int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
2576 constexpr uint32_t scale = 0;
2578 if constexpr (
sizeof(T) < 4) {
2579 using Tint = std::conditional_t<std::is_integral_v<T>, T,
2580 detail::uint_type_t<
sizeof(T)>>;
2582 static_assert(std::is_integral<Tint>::value,
2583 "only integral 1- & 2-byte types are supported");
2584 using PromoT =
typename std::conditional_t<std::is_signed<Tint>::value,
2587 __esimd_gather_masked_scaled2<PromoT, N, decltype(si), TypeSizeLog2,
2588 scale>(si, glob_offset, offsets.data(),
2590 auto Res = convert<Tint>(promo_vals);
2592 if constexpr (!std::is_same_v<Tint, T>) {
2593 return detail::bitcast<Treal, Tint, N>(Res.data());
2598 simd<Treal, N> Res = __esimd_gather_masked_scaled2<Treal, N, decltype(si),
2599 TypeSizeLog2, scale>(
2600 si, glob_offset, offsets.data(), mask.
data());
2601 if constexpr (!std::is_same_v<Treal, T>) {
2602 return Res.template bit_cast_view<T>();
2610 #ifndef __ESIMD_FORCE_STATELESS_MEM
2611 template <
typename T,
int N,
int VS,
typename PropertyListT,
lsc_data_size DS,
2612 typename OffsetT,
typename AccessorT>
2613 __ESIMD_API std::enable_if_t<
2614 is_device_accessor_with_v<AccessorT, accessor_mode_cap::can_read>,
2618 static_assert(N / VS >= 1 && N % VS == 0,
"N must be divisible by VS");
2619 static_assert(std::is_integral_v<OffsetT>,
2620 "Gather must have integral byte_offset type");
2621 static_assert(
sizeof(OffsetT) <= 4,
2622 "Implicit truncation of 64-bit byte_offset to 32-bit is "
2623 "disabled. Use -fsycl-esimd-force-stateless-mem or explicitly "
2624 "convert offsets to a 32-bit vector");
2625 static_assert(VS == 1 ||
sizeof(T) >= 4,
2626 "VS > 1 is supprted only for 4- and 8-byte elements");
2627 check_lsc_vector_size<VS>();
2628 check_lsc_data_size<T, DS>();
2629 check_cache_hints<cache_action::load, PropertyListT>();
2630 constexpr uint16_t AddressScale = 1;
2631 constexpr
int ImmOffset = 0;
2636 constexpr
auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
2637 constexpr
auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
2639 simd<uint32_t, N / VS> ByteOffsets32 = convert<uint32_t>(byte_offsets);
2640 simd<MsgT, N> PassThruExpanded = lsc_format_input<MsgT>(pass_thru);
2642 __esimd_lsc_load_merge_bti<MsgT, L1H, L2H, AddressScale, ImmOffset, EDS,
2643 LSCVS, Transposed, N / VS>(
2644 pred.
data(), ByteOffsets32.data(), SI, PassThruExpanded.data());
2645 return lsc_format_ret<T>(Result);
2666 template <
typename T,
int NElts, lsc_data_size DS,
int N>
2670 check_lsc_vector_size<NElts>();
2671 check_lsc_data_size<T, DS>();
2672 constexpr uint16_t AddressScale = 1;
2673 constexpr
int ImmOffset = 0;
2681 AddressScale, ImmOffset, EDS, LSCVS,
2682 Transposed, N>(pred.
data(), offsets.data(),
2683 PassThruExpanded.data());
2684 return lsc_format_ret<T>(Result);
2701 template <
typename T,
int NElts, lsc_data_size DS,
int N>
2704 check_lsc_vector_size<NElts>();
2705 check_lsc_data_size<T, DS>();
2706 constexpr uint16_t AddressScale = 1;
2707 constexpr
int ImmOffset = 0;
2714 ImmOffset, EDS, LSCVS, Transposed, N>(
2715 pred.
data(), offsets.data(), Tmp.data());
2733 template <
typename T,
int NElts,
lsc_data_size DS,
typename PropertyListT,
2734 int N,
typename Toffset>
2737 static_assert(std::is_integral_v<Toffset>,
"Unsupported offset type");
2738 check_lsc_vector_size<NElts>();
2739 check_lsc_data_size<T, DS>();
2740 check_cache_hints<cache_action::prefetch, PropertyListT>();
2741 constexpr
auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
2742 constexpr
auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
2743 constexpr uint16_t AddressScale = 1;
2744 constexpr
int ImmOffset = 0;
2750 addrs += convert<uintptr_t>(byte_offsets);
2751 __esimd_lsc_prefetch_stateless<MsgT, L1H, L2H, AddressScale, ImmOffset, EDS,
2752 LSCVS, Transposed, N>(pred.
data(),
2756 template <
typename T,
int NElts,
lsc_data_size DS,
typename PropertyListT,
2758 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>>
2760 check_lsc_data_size<T, DS>();
2761 check_cache_hints<cache_action::prefetch, PropertyListT>();
2764 detail::getPropertyValue<PropertyListT, alignment_key>(
sizeof(T));
2766 (
Alignment >= __ESIMD_DNS::OperandSize::DWORD &&
sizeof(T) <= 4) ||
2767 (
Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
sizeof(T) > 4),
2768 "Incorrect alignment for the data type");
2770 constexpr
int SmallIntFactor64Bit =
sizeof(uint64_t) /
sizeof(T);
2771 constexpr
int SmallIntFactor32Bit =
2772 sizeof(uint32_t) /
sizeof(T) > 1 ?
sizeof(uint32_t) /
sizeof(T) : 1;
2773 static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
2774 "Number of elements is not supported by Transposed load");
2780 constexpr
bool Use64BitData =
2781 Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
2782 (NElts *
sizeof(T)) %
sizeof(uint64_t) == 0 &&
2783 (
sizeof(T) !=
sizeof(uint32_t) || NElts *
sizeof(T) > 256);
2784 constexpr
int SmallIntFactor =
2785 Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
2786 constexpr
int FactoredNElts = NElts / SmallIntFactor;
2787 check_lsc_vector_size<FactoredNElts>();
2790 using LoadElemT = __ESIMD_DNS::__raw_t<
2791 std::conditional_t<SmallIntFactor == 1, T,
2792 std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
2794 constexpr
auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
2795 constexpr
auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
2796 constexpr uint16_t AddressScale = 1;
2797 constexpr
int ImmOffset = 0;
2798 constexpr
lsc_data_size EDS = finalize_data_size<LoadElemT, DS>();
2802 "Transposed prefetch is supported only for data size u32 or u64");
2803 constexpr
lsc_vector_size LSCVS = to_lsc_vector_size<FactoredNElts>();
2805 constexpr
int N = 1;
2808 __esimd_lsc_prefetch_stateless<LoadElemT, L1H, L2H, AddressScale, ImmOffset,
2809 EDS, LSCVS, Transposed, N>(pred.
data(),
2813 #ifndef __ESIMD_FORCE_STATELESS_MEM
2832 template <
typename T,
int NElts,
lsc_data_size DS,
typename PropertyListT,
2833 int N,
typename AccessorTy,
typename OffsetT>
2834 __ESIMD_API std::enable_if_t<
2835 is_device_accessor_with_v<AccessorTy, accessor_mode_cap::can_read>>
2838 static_assert(std::is_integral_v<OffsetT>,
2839 "Prefetch must have integral byte_offset type");
2840 static_assert(
sizeof(OffsetT) <= 4,
2841 "Implicit truncation of 64-bit byte_offset to 32-bit is "
2842 "disabled. Use -fsycl-esimd-force-stateless-mem or explicitly "
2843 "convert offsets to a 32-bit vector");
2844 check_lsc_vector_size<NElts>();
2845 check_lsc_data_size<T, DS>();
2846 check_cache_hints<cache_action::prefetch, PropertyListT>();
2847 constexpr
auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
2848 constexpr
auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
2849 constexpr uint16_t AddressScale = 1;
2850 constexpr
int ImmOffset = 0;
2857 __esimd_lsc_prefetch_bti<MsgT, L1H, L2H, AddressScale, ImmOffset, EDS, LSCVS,
2858 Transposed, N>(pred.
data(), ByteOffsets32.data(),
2879 template <
typename T,
int NElts,
lsc_data_size DS,
typename PropertyListT,
2880 typename AccessorTy,
typename OffsetT>
2881 __ESIMD_API std::enable_if_t<
2882 std::is_integral_v<OffsetT> &&
2883 is_device_accessor_with_v<AccessorTy, accessor_mode_cap::can_read>>
2885 static_assert(
sizeof(OffsetT) <= 4,
2886 "Implicit truncation of 64-bit byte_offset to 32-bit is "
2887 "disabled. Use -fsycl-esimd-force-stateless-mem or explicitly "
2888 "convert offsets to a 32-bit vector");
2889 check_lsc_data_size<T, DS>();
2890 check_cache_hints<cache_action::prefetch, PropertyListT>();
2893 detail::getPropertyValue<PropertyListT, alignment_key>(
sizeof(T));
2895 constexpr
int SmallIntFactor64Bit =
sizeof(uint64_t) /
sizeof(T);
2896 constexpr
int SmallIntFactor32Bit =
2897 sizeof(uint32_t) /
sizeof(T) > 1 ?
sizeof(uint32_t) /
sizeof(T) : 1;
2898 static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
2899 "Number of elements is not supported by Transposed load");
2905 constexpr
bool Use64BitData =
2906 Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
2907 (NElts *
sizeof(T)) %
sizeof(uint64_t) == 0 &&
2908 (
sizeof(T) !=
sizeof(uint32_t) || NElts *
sizeof(T) > 256);
2909 constexpr
int SmallIntFactor =
2910 Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
2911 constexpr
int FactoredNElts = NElts / SmallIntFactor;
2912 check_lsc_vector_size<FactoredNElts>();
2915 using LoadElemT = __ESIMD_DNS::__raw_t<
2916 std::conditional_t<SmallIntFactor == 1, T,
2917 std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
2919 constexpr
auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
2920 constexpr
auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
2921 constexpr uint16_t AddressScale = 1;
2922 constexpr
int ImmOffset = 0;
2923 constexpr
lsc_data_size EDS = finalize_data_size<LoadElemT, DS>();
2927 "Transposed prefetch is supported only for data size u32 or u64");
2928 constexpr
lsc_vector_size LSCVS = to_lsc_vector_size<FactoredNElts>();
2930 constexpr
int N = 1;
2934 __esimd_lsc_prefetch_bti<LoadElemT, L1H, L2H, AddressScale, ImmOffset, EDS,
2935 LSCVS, Transposed, N>(pred.
data(), offsets.data(),
2941 template <
typename T,
int NBlocks,
int Height,
int Width,
bool Transposed,
2944 if constexpr (Transformed)
2945 return roundUpNextMultiple<Height, 4 /
sizeof(T)>() *
2946 getNextPowerOf2<Width>() * NBlocks;
2947 return Width * Height * NBlocks;
2950 #ifndef __ESIMD_DWORD_BLOCK_2D_WIDTH_SCALE
2951 #define __ESIMD_DWORD_BLOCK_2D_WIDTH_SCALE (1)
2954 #ifndef __ESIMD_BLOCK_2D_WIDTH_CHECK
2955 #define __ESIMD_BLOCK_2D_WIDTH_CHECK(OP, BLOCK_WIDTH, NBLOCKS, SIZE) \
2956 static_assert((BLOCK_WIDTH) * (NBLOCKS) * (SIZE) <= 64, \
2957 "Unsupported block width");
2963 template <
typename T,
int BlockWidth,
int BlockHeight,
int NBlocks,
2964 bool Transposed,
bool Transformed, block_2d_op Op>
2965 constexpr
void check_lsc_block_2d_restrictions() {
2966 constexpr
int GRFByteSize = BlockWidth * BlockHeight * NBlocks *
sizeof(T);
2967 static_assert(BlockWidth > 0,
"Block width must be positive");
2968 static_assert(BlockHeight > 0,
"Block height must be positive");
2970 if constexpr (Op == block_2d_op::store)
2971 static_assert(GRFByteSize <= 512,
"2D store supports 512 bytes max");
2973 static_assert(GRFByteSize <= 2048,
2974 "2D load/prefetch supports 2048 bytes max");
2975 static_assert(!Transposed || !Transformed,
2976 "Transposed and transformed is not supported");
2977 static_assert((
sizeof(T) * BlockWidth) % 4 == 0,
2978 "Block width must be aligned by DW");
2979 if constexpr (Transposed) {
2980 static_assert(NBlocks == 1,
"Transposed expected to be 1 block only");
2981 static_assert(
sizeof(T) == 4 ||
sizeof(T) == 8,
2982 "Transposed load is supported only for data size u32 or u64");
2983 static_assert(
sizeof(T) == 8 ? BlockHeight == 8
2984 : BlockHeight >= 1 && BlockHeight <= 32,
2985 "Unsupported block height");
2986 static_assert(
sizeof(T) == 8
2988 : BlockWidth >= 1 &&
2990 8 * __ESIMD_DWORD_BLOCK_2D_WIDTH_SCALE,
2991 "Unsupported block width");
2992 }
else if constexpr (Transformed) {
2993 static_assert(
sizeof(T) == 1 ||
sizeof(T) == 2,
2994 "VNNI transform is supported only for data size u8 or u16");
2996 "Unsupported number of blocks");
2997 static_assert(BlockHeight *
sizeof(T) >= 4 && BlockHeight <= 32,
2998 "Unsupported block height");
2999 static_assert(BlockWidth *
sizeof(T) >= 4 && BlockWidth <= 16 &&
3000 BlockWidth * NBlocks *
sizeof(T) <= 64,
3001 "Unsupported block width");
3003 if constexpr (Op == block_2d_op::store) {
3004 static_assert(NBlocks == 1,
"Unsupported number of blocks for 2D store");
3005 static_assert(BlockHeight <= 8,
"Unsupported block height for store");
3009 "Unsupported number of blocks for 2D load/prefetch");
3010 static_assert(BlockHeight <= 32,
"Unsupported block height for load");
3012 static_assert(BlockWidth *
sizeof(T) >= 4,
"Unsupported block width");
3013 __ESIMD_BLOCK_2D_WIDTH_CHECK(Op, BlockWidth, NBlocks,
sizeof(T));
3016 #undef __ESIMD_DWORD_BLOCK_2D_WIDTH_SCALE
3017 #undef __ESIMD_BLOCK_2D_WIDTH_CHECK
3050 typename T,
int BlockWidth,
int BlockHeight,
int NBlocks,
bool Transposed,
3051 bool Transformed,
typename PropertyListT,
3052 int N = get_lsc_block_2d_data_size<__raw_t<T>, NBlocks, BlockHeight,
3053 BlockWidth, Transposed, Transformed>()>
3054 __ESIMD_API
simd<T, N> load_2d_impl(
const T *Ptr,
unsigned SurfaceWidth,
3055 unsigned SurfaceHeight,
3056 unsigned SurfacePitch,
int X,
int Y) {
3058 check_cache_hints<cache_action::load, PropertyListT>();
3059 constexpr
auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
3060 constexpr
auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
3061 using RawT = __raw_t<T>;
3062 check_lsc_block_2d_restrictions<RawT, BlockWidth, BlockHeight, NBlocks,
3063 Transposed, Transformed, block_2d_op::load>();
3069 constexpr
int ElemsPerDword = 4 /
sizeof(RawT);
3070 constexpr
int GRFRowSize = Transposed ? BlockHeight
3071 : Transformed ? BlockWidth * ElemsPerDword
3073 constexpr
int GRFRowPitch = getNextPowerOf2<GRFRowSize>();
3074 constexpr
int GRFColSize =
3077 : (Transformed ? (BlockHeight + ElemsPerDword - 1) / ElemsPerDword
3079 constexpr
int GRFBlockSize = GRFRowPitch * GRFColSize;
3080 constexpr
int GRFBlockPitch =
3081 roundUpNextMultiple<64 /
sizeof(RawT), GRFBlockSize>();
3082 constexpr
int ActualN = NBlocks * GRFBlockPitch;
3084 constexpr
int DstBlockElements = GRFColSize * GRFRowSize;
3085 constexpr
int DstElements = DstBlockElements * NBlocks;
3087 static_assert(N == ActualN || N == DstElements,
"Incorrect element count");
3090 finalize_data_size<RawT, lsc_data_size::default_size>();
3091 uintptr_t Addr =
reinterpret_cast<uintptr_t
>(Ptr);
3095 __esimd_lsc_load2d_stateless<RawT, L1H, L2H, DS, Transpose, NBlocks,
3096 BlockWidth, BlockHeight, Transformed,
3097 ActualN>(Mask.
data(), Addr, SurfaceWidth,
3098 SurfaceHeight, SurfacePitch, X, Y);
3100 if constexpr (ActualN == N) {
3122 for (
auto i = 0; i < NBlocks; i++) {
3124 Dst.template select<DstBlockElements, 1>(i * DstBlockElements);
3126 auto RawBlock = Raw.template select<GRFBlockSize, 1>(i * GRFBlockPitch);
3128 RawBlock.template bit_cast_view<RawT, GRFColSize, GRFRowPitch>()
3129 .template select<GRFColSize, 1, GRFRowSize, 1>(0, 0)
3130 .template bit_cast_view<RawT>();
3159 template <
typename T,
int BlockWidth,
int BlockHeight,
int NBlocks,
3160 typename PropertyListT,
3161 int N = get_lsc_block_2d_data_size<__raw_t<T>, NBlocks, BlockHeight,
3164 __ESIMD_API
void prefetch_2d_impl(
const T *Ptr,
unsigned SurfaceWidth,
3165 unsigned SurfaceHeight,
unsigned SurfacePitch,
3167 using RawT = __raw_t<T>;
3168 check_cache_hints<cache_action::prefetch, PropertyListT>();
3169 check_lsc_block_2d_restrictions<RawT, BlockWidth, BlockHeight, NBlocks,
false,
3171 constexpr
auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
3172 constexpr
auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
3174 finalize_data_size<RawT, lsc_data_size::default_size>();
3175 uintptr_t Addr =
reinterpret_cast<uintptr_t
>(Ptr);
3178 __esimd_lsc_prefetch2d_stateless<RawT, L1H, L2H, DS, Transpose, NBlocks,
3179 BlockWidth, BlockHeight,
false, N>(
3180 Mask.
data(), Addr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
3207 template <
typename T,
int BlockWidth,
int BlockHeight,
typename PropertyListT,
3209 __raw_t<T>, 1u, BlockHeight, BlockWidth,
false ,
3211 __ESIMD_API
void store_2d_impl(T *Ptr,
unsigned SurfaceWidth,
3212 unsigned SurfaceHeight,
unsigned SurfacePitch,
3214 using RawT = __raw_t<T>;
3217 constexpr
auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
3218 constexpr
auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
3219 check_lsc_block_2d_restrictions<RawT, BlockWidth, BlockHeight, 1,
false,
3220 false, block_2d_op::store>();
3222 finalize_data_size<RawT, lsc_data_size::default_size>();
3223 uintptr_t Addr =
reinterpret_cast<uintptr_t
>(Ptr);
3226 constexpr
int Pitch = getNextPowerOf2<BlockWidth>();
3227 constexpr
int NElts = BlockHeight * Pitch;
3231 if constexpr (NElts == N) {
3236 auto Data2D = Vals.template bit_cast_view<RawT, BlockHeight, BlockWidth>();
3237 auto Raw2D = Raw.template bit_cast_view<RawT, BlockHeight, Pitch>();
3238 Raw2D.template select<BlockHeight, 1, BlockWidth, 1>(0, 0) = Data2D;
3241 __esimd_lsc_store2d_stateless<RawT, L1H, L2H, DS, Transpose, 1u, BlockWidth,
3242 BlockHeight,
false, NElts>(
3243 Mask.
data(), Addr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y,
3281 template <
typename T,
int N,
typename AccessorT>
3283 std::enable_if_t<detail::is_device_accessor_with_v<
3284 AccessorT, detail::accessor_mode_cap::can_read>,
3288 #ifdef __ESIMD_FORCE_STATELESS_MEM
3289 return gather<T, N>(__ESIMD_DNS::accessorToPointer<T>(acc, glob_offset),
3290 byte_offsets, mask);
3295 byte_offsets += glob_offset;
3299 acc, byte_offsets, mask, PassThru);
3301 return detail::gather_impl<T, N>(acc, byte_offsets, glob_offset, mask);
3319 template <
typename T,
int N,
typename AccessorT>
3321 std::enable_if_t<detail::is_device_accessor_with_v<
3322 AccessorT, detail::accessor_mode_cap::can_read>,
3326 return gather<T, N>(acc, ByteOffsets, glob_offset);
3329 #ifdef __ESIMD_FORCE_STATELESS_MEM
3330 template <
typename T,
int N,
typename AccessorTy,
typename Toffset>
3331 __ESIMD_API std::enable_if_t<
3332 detail::is_device_accessor_with_v<AccessorTy,
3333 detail::accessor_mode_cap::can_read> &&
3334 std::is_integral_v<Toffset> && !std::is_same_v<Toffset, uint64_t>,
3338 return gather<T, N>(acc, convert<uint64_t>(offsets), glob_offset, mask);
3414 template <
typename T,
int N,
int VS,
typename AccessorT,
typename OffsetT,
3415 typename PropertyListT =
3417 __ESIMD_API std::enable_if_t<
3418 (detail::is_device_accessor_with_v<AccessorT,
3419 detail::accessor_mode_cap::can_read> &&
3420 ext::oneapi::experimental::is_property_list_v<PropertyListT>),
3424 #ifdef __ESIMD_FORCE_STATELESS_MEM
3425 return gather<T, N, VS>(detail::accessorToPointer<T>(acc), byte_offsets, mask,
3430 acc, byte_offsets, mask, pass_thru);
3463 template <
typename T,
int N,
int VS,
typename AccessorT,
typename OffsetT,
3464 typename PropertyListT =
3466 __ESIMD_API std::enable_if_t<
3467 (detail::is_device_accessor_with_v<AccessorT,
3468 detail::accessor_mode_cap::can_read> &&
3469 ext::oneapi::experimental::is_property_list_v<PropertyListT>),
3473 #ifdef __ESIMD_FORCE_STATELESS_MEM
3474 return gather<T, N, VS>(detail::accessorToPointer<T>(acc), byte_offsets, mask,
3478 detail::getPropertyValue<PropertyListT, alignment_key>(
sizeof(T));
3480 "gather() requires at least element-size alignment");
3482 if constexpr (detail::has_cache_hints<PropertyListT>() || VS > 1 ||
3487 acc, byte_offsets, mask, PassThru);
3489 return detail::gather_impl<T, N>(acc, byte_offsets, 0, mask);
3517 template <
typename T,
int N,
int VS,
typename AccessorT,
typename OffsetT,
3518 typename PropertyListT =
3520 __ESIMD_API std::enable_if_t<
3521 (detail::is_device_accessor_with_v<AccessorT,
3522 detail::accessor_mode_cap::can_read> &&
3523 ext::oneapi::experimental::is_property_list_v<PropertyListT>),
3526 PropertyListT props = {}) {
3528 return gather<T, N, VS>(acc, byte_offsets, Mask, props);
3544 template <
typename T,
int N,
typename AccessorT,
typename OffsetT,
3546 typename PropertyListT =
3548 __ESIMD_API std::enable_if_t<
3549 (detail::is_device_accessor_with_v<AccessorT,
3550 detail::accessor_mode_cap::can_read> &&
3551 std::is_same_v<MaskT, simd_mask<N>> &&
3552 ext::oneapi::experimental::is_property_list_v<PropertyListT>),
3555 simd<T, N> pass_thru, PropertyListT props = {}) {
3556 return gather<T, N, 1>(acc, byte_offsets, mask, pass_thru, props);
3570 template <
typename T,
int N,
typename AccessorT,
typename OffsetT,
3572 typename PropertyListT =
3574 __ESIMD_API std::enable_if_t<
3575 (detail::is_device_accessor_with_v<AccessorT,
3576 detail::accessor_mode_cap::can_read> &&
3577 std::is_same_v<MaskT, simd_mask<N>> &&
3578 ext::oneapi::experimental::is_property_list_v<PropertyListT>),
3581 PropertyListT props = {}) {
3582 return gather<T, N, 1>(acc, byte_offsets, mask, props);
3592 template <
typename T,
int N,
typename AccessorT,
typename OffsetT,
3593 typename PropertyListT =
3595 __ESIMD_API std::enable_if_t<
3596 (detail::is_device_accessor_with_v<AccessorT,
3597 detail::accessor_mode_cap::can_read> &&
3598 ext::oneapi::experimental::is_property_list_v<PropertyListT>),
3601 return gather<T, N, 1>(acc, byte_offsets, props);
3612 template <
typename T,
int N,
int VS = 1,
typename AccessorT,
3613 typename OffsetSimdViewT,
3614 typename PropertyListT =
3616 __ESIMD_API std::enable_if_t<
3617 (detail::is_device_accessor_with_v<AccessorT,
3618 detail::accessor_mode_cap::can_read> &&
3619 detail::is_simd_view_type_v<OffsetSimdViewT> &&
3620 ext::oneapi::experimental::is_property_list_v<PropertyListT>),
3623 simd<T, N> pass_thru, PropertyListT props = {}) {
3624 return gather<T, N, VS>(acc, byte_offsets.read(), mask, pass_thru, props);
3635 template <
typename T,
int N,
int VS = 1,
typename AccessorT,
3636 typename OffsetSimdViewT,
3637 typename PropertyListT =
3639 __ESIMD_API std::enable_if_t<
3640 (detail::is_device_accessor_with_v<AccessorT,
3641 detail::accessor_mode_cap::can_read> &&
3642 detail::is_simd_view_type_v<OffsetSimdViewT> &&
3643 ext::oneapi::experimental::is_property_list_v<PropertyListT>),
3646 PropertyListT props = {}) {
3647 return gather<T, N, VS>(acc, byte_offsets.read(), mask, props);
3657 template <
typename T,
int N,
int VS = 1,
typename AccessorT,
3658 typename OffsetSimdViewT,
3659 typename PropertyListT =
3661 __ESIMD_API std::enable_if_t<
3662 (detail::is_device_accessor_with_v<AccessorT,
3663 detail::accessor_mode_cap::can_read> &&
3664 detail::is_simd_view_type_v<OffsetSimdViewT> &&
3665 ext::oneapi::experimental::is_property_list_v<PropertyListT>),
3667 gather(AccessorT acc, OffsetSimdViewT byte_offsets, PropertyListT props = {}) {
3668 return gather<T, N, VS>(acc, byte_offsets.read(), props);
3722 template <
typename T,
int N,
int VS = 1,
typename AccessorTy,
typename OffsetT,
3723 typename PropertyListT =
3725 __ESIMD_API std::enable_if_t<
3726 detail::is_device_accessor_with_v<AccessorTy,
3727 detail::accessor_mode_cap::can_write> &&
3728 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
3731 #ifdef __ESIMD_FORCE_STATELESS_MEM
3732 scatter<T, N, VS>(__ESIMD_DNS::accessorToPointer<T>(acc), byte_offsets, vals,
3736 detail::getPropertyValue<PropertyListT, alignment_key>(
sizeof(T));
3738 "gather() requires at least element-size alignment");
3740 if constexpr (detail::has_cache_hints<PropertyListT>() || VS > 1 ||
3743 PropertyListT>(acc, byte_offsets, vals, mask);
3745 detail::scatter_impl<T, N, AccessorTy>(acc, vals, byte_offsets, 0, mask);
3769 template <
typename T,
int N,
int VS = 1,
typename AccessorTy,
typename OffsetT,
3770 typename PropertyListT =
3772 __ESIMD_API std::enable_if_t<
3773 detail::is_device_accessor_with_v<AccessorTy,
3774 detail::accessor_mode_cap::can_write> &&
3775 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
3777 PropertyListT props = {}) {
3779 scatter<T, N, VS>(acc, byte_offsets, vals, Mask, props);
3807 template <
typename T,
int N,
int VS = 1,
typename AccessorTy,
3808 typename OffsetSimdViewT,
3809 typename PropertyListT =
3811 __ESIMD_API std::enable_if_t<
3812 detail::is_device_accessor_with_v<AccessorTy,
3813 detail::accessor_mode_cap::can_write> &&
3814 detail::is_simd_view_type_v<OffsetSimdViewT> &&
3815 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
3818 scatter<T, N, VS>(acc, byte_offsets.read(), vals, mask, props);
3841 template <
typename T,
int N,
int VS = 1,
typename AccessorTy,
3842 typename OffsetSimdViewT,
3843 typename PropertyListT =
3845 __ESIMD_API std::enable_if_t<
3846 detail::is_device_accessor_with_v<AccessorTy,
3847 detail::accessor_mode_cap::can_write> &&
3848 detail::is_simd_view_type_v<OffsetSimdViewT> &&
3849 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
3851 PropertyListT props = {}) {
3853 scatter<T, N, VS>(acc, byte_offsets.read(), vals, Mask, props);
3872 template <
typename T,
int N,
typename AccessorTy>
3875 detail::is_device_accessor_with_v<
3876 AccessorTy, detail::accessor_mode_cap::can_write>>
3880 offsets += glob_offset;
3881 scatter<T, N>(acc, offsets, vals, mask);
3884 template <
typename T,
int N,
typename AccessorTy>
3887 detail::is_device_accessor_with_v<
3888 AccessorTy, detail::accessor_mode_cap::can_write>>
3892 scatter<T, N>(acc, ByteOffsets, vals, glob_offset, mask);
3895 #ifdef __ESIMD_FORCE_STATELESS_MEM
3896 template <
typename T,
int N,
typename AccessorTy,
typename Toffset>
3897 __ESIMD_API std::enable_if_t<
3898 detail::is_device_accessor_with_v<AccessorTy,
3899 detail::accessor_mode_cap::can_write> &&
3900 std::is_integral_v<Toffset> && !std::is_same_v<Toffset, uint64_t>>
3903 scatter<T, N, AccessorTy>(acc, convert<uint64_t>(offsets), vals, glob_offset,
3915 template <
typename T,
typename AccessorTy>
3919 gather<T, 1, AccessorTy>(acc,
simd<decltype(offset), 1>(offset));
3930 template <
typename T,
typename AccessorTy>
3933 scatter<T, 1, AccessorTy>(acc,
simd<decltype(offset), 1>(offset),
3971 int N,
typename Toffset>
3974 static_assert(std::is_integral_v<Toffset>,
"Unsupported offset type");
3975 static_assert((N == 8 || N == 16 || N == 32),
"Unsupported value of N");
3976 static_assert(
sizeof(T) == 4,
"Unsupported size of type T");
3979 addrs = addrs + offsets_i;
3980 return __esimd_svm_gather4_scaled<detail::__raw_t<T>, N, RGBAMask>(
4000 int N,
typename OffsetSimdViewT,
typename RegionTy>
4001 __ESIMD_API std::enable_if_t<detail::is_simd_view_type_v<OffsetSimdViewT>,
4004 return gather_rgba<RGBAMask, T, N>(p, offsets.read(), mask);
4023 int N,
typename Toffset>
4024 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>,
4034 (M == CM::ABGR || M == CM::BGR || M == CM::GR || M == CM::R) &&
4035 "Only ABGR, BGR, GR, R channel masks are valid in write operations");
4061 int N,
typename Toffset>
4066 static_assert(std::is_integral_v<Toffset>,
"Unsupported offset type");
4067 static_assert((N == 8 || N == 16 || N == 32),
"Unsupported value of N");
4068 static_assert(
sizeof(T) == 4,
"Unsupported size of type T");
4069 detail::validate_rgba_write_channel_mask<RGBAMask>();
4072 addrs = addrs + offsets_i;
4073 __esimd_svm_scatter4_scaled<detail::__raw_t<T>, N, RGBAMask>(
4074 addrs.
data(), vals.data(), mask.
data());
4093 int N,
typename OffsetSimdViewT,
typename RegionTy>
4094 __ESIMD_API std::enable_if_t<detail::is_simd_view_type_v<OffsetSimdViewT>>
4098 scatter_rgba<RGBAMask, T, N>(p, offsets.read(), vals, mask);
4117 int N,
typename Toffset>
4118 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> && N == 1>
4125 template <
typename T,
int N, rgba_channel_mask RGBAMask>
4128 enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4>
scatter_rgba(
4129 T *p,
simd<uint32_t, N> offsets,
4132 scatter_rgba<RGBAMask>(p, offsets, vals, mask);
4158 typename AccessorT,
int N,
4161 std::enable_if_t<((N == 8 || N == 16 || N == 32) &&
sizeof(T) == 4 &&
4162 detail::is_device_accessor_with_v<
4163 AccessorT, detail::accessor_mode_cap::can_read>),
4168 #ifdef __ESIMD_FORCE_STATELESS_MEM
4169 return gather_rgba<RGBAMask>(
4170 __ESIMD_DNS::accessorToPointer<T>(acc, global_offset), offsets, mask);
4173 constexpr uint32_t Scale = 0;
4175 return __esimd_gather4_masked_scaled2<detail::__raw_t<T>, N, RGBAMask,
4176 decltype(SI), Scale>(
4177 SI, global_offset, offsets.
data(), mask.
data());
4181 #ifdef __ESIMD_FORCE_STATELESS_MEM
4183 typename AccessorT,
int N,
4185 __ESIMD_API std::enable_if_t<
4186 ((N == 8 || N == 16 || N == 32) &&
sizeof(T) == 4 &&
4187 detail::is_device_accessor_with_v<AccessorT,
4188 detail::accessor_mode_cap::can_read> &&
4189 std::is_integral_v<Toffset> && !std::is_same_v<Toffset, uint64_t>),
4193 return gather_rgba<RGBAMask, AccessorT, N, T>(acc, convert<uint64_t>(offsets),
4194 global_offset, mask);
4213 typename AccessorT,
int N,
4216 std::enable_if_t<(N == 8 || N == 16 || N == 32) &&
sizeof(T) == 4 &&
4217 detail::is_device_accessor_with_v<
4218 AccessorT, detail::accessor_mode_cap::can_write>>
4223 detail::validate_rgba_write_channel_mask<RGBAMask>();
4224 #ifdef __ESIMD_FORCE_STATELESS_MEM
4225 scatter_rgba<RGBAMask>(__ESIMD_DNS::accessorToPointer<T>(acc, global_offset),
4226 offsets, vals, mask);
4229 constexpr uint32_t Scale = 0;
4231 __esimd_scatter4_scaled<T, N, decltype(SI), RGBAMask, Scale>(
4232 mask.
data(), SI, global_offset, offsets.
data(), vals.data());
4236 #ifdef __ESIMD_FORCE_STATELESS_MEM
4238 typename AccessorT,
int N,
4240 __ESIMD_API std::enable_if_t<
4241 (N == 8 || N == 16 || N == 32) &&
sizeof(T) == 4 &&
4242 detail::is_device_accessor_with_v<AccessorT,
4243 detail::accessor_mode_cap::can_write> &&
4244 std::is_integral_v<Toffset> && !std::is_same_v<Toffset, uint64_t>>
4248 scatter_rgba<RGBAMask, AccessorT, N, T>(acc, convert<uint64_t>(offsets), vals,
4249 global_offset, mask);
4256 #ifndef __ESIMD_FP_ATOMIC_OP_TYPE_CHECK
4257 #define __ESIMD_FP_ATOMIC_OP_TYPE_CHECK(T) \
4258 static_assert(is_type<T, float, sycl::half, double>(), \
4259 "float, double or sycl::half type is expected");
4268 static_assert(
sizeof(T) > 1,
"Unsupported data type");
4271 if constexpr (!IsLSC)
4273 "Execution size 1, 2, 4, 8, 16, 32 are supported");
4275 static_assert(NumSrc == __ESIMD_DNS::get_num_args<Op>(),
4276 "Wrong number of operands");
4277 constexpr
bool IsInt2BytePlus =
4278 std::is_integral_v<T> && (
sizeof(T) >=
sizeof(uint16_t));
4280 if constexpr (Op == __ESIMD_NS::atomic_op::xchg ||
4281 Op == __ESIMD_NS::atomic_op::cmpxchg ||
4282 Op == __ESIMD_NS::atomic_op::predec ||
4283 Op == __ESIMD_NS::atomic_op::inc ||
4286 static_assert(IsInt2BytePlus,
"Integral 16-bit or wider type is expected");
4291 Op == __ESIMD_NS::atomic_op::fadd ||
4292 Op == __ESIMD_NS::atomic_op::fsub ||
4293 Op == __ESIMD_NS::atomic_op::fcmpxchg) {
4297 Op == __ESIMD_NS::atomic_op::sub ||
4298 Op == __ESIMD_NS::atomic_op::umin ||
4299 Op == __ESIMD_NS::atomic_op::umax ||
4303 Op == __ESIMD_NS::atomic_op::smin ||
4304 Op == __ESIMD_NS::atomic_op::smax) {
4305 static_assert(IsInt2BytePlus,
"Integral 16-bit or wider type is expected");
4306 constexpr
bool IsSignedMinmax = (Op == __ESIMD_NS::atomic_op::smin) ||
4307 (Op == __ESIMD_NS::atomic_op::smax);
4308 constexpr
bool IsUnsignedMinmax = (Op == __ESIMD_NS::atomic_op::umin) ||
4309 (Op == __ESIMD_NS::atomic_op::umax);
4311 if constexpr (IsSignedMinmax || IsUnsignedMinmax) {
4312 constexpr
bool SignOK = std::is_signed_v<T> == IsSignedMinmax;
4313 static_assert(SignOK,
"Signed/unsigned integer type expected for "
4314 "signed/unsigned min/max operation");
4318 #undef __ESIMD_FP_ATOMIC_OP_TYPE_CHECK
4336 template <u
int32_t SLMSize> __ESIMD_API
void slm_init() {
4337 __esimd_slm_init(SLMSize);
4348 __ESIMD_API
void slm_init(uint32_t size) { __esimd_slm_init(size); }
4392 #ifndef __ESIMD_GATHER_SCATTER_LLVM_IR
4416 template <typename T, int N, int VS,
4417 typename PropertyListT =
4419 __ESIMD_API std::enable_if_t<
4420 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
simd<T, N>>
4422 simd<T, N> pass_thru, PropertyListT props = {}) {
4423 static_assert(N / VS >= 1 && N % VS == 0,
"N must be divisible by VS");
4426 detail::getPropertyValue<PropertyListT, alignment_key>(
sizeof(T));
4428 "slm_gather() requires at least element-size alignment");
4433 return __ESIMD_DNS::slm_gather_impl<T, VS,
4435 byte_offsets, mask, pass_thru);
4437 if constexpr (
sizeof(T) == 8) {
4439 Res.template bit_cast_view<uint32_t>().template select<N, 2>(0) =
4440 __esimd_slm_gather_ld<uint32_t, N, Alignment>(
4442 (pass_thru.template bit_cast_view<uint32_t>()
4443 .template select<N, 2>(0))
4445 simd<uint32_t, N / VS> Offset = byte_offsets +
sizeof(uint32_t);
4446 Res.template bit_cast_view<uint32_t>().template select<N, 2>(1) =
4447 __esimd_slm_gather_ld<uint32_t, N, sizeof(uint32_t)>(
4448 Offset.data(), mask.
data(),
4449 (pass_thru.template bit_cast_view<uint32_t>()
4450 .template select<N, 2>(1))
4454 using MsgT = detail::__raw_t<T>;
4455 return __esimd_slm_gather_ld<MsgT, N, Alignment>(
4484 template <
typename T,
int N,
int VS,
4485 typename PropertyListT =
4487 __ESIMD_API std::enable_if_t<
4488 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
simd<T, N>>
4490 PropertyListT props = {}) {
4492 detail::getPropertyValue<PropertyListT, alignment_key>(
sizeof(T));
4494 "slm_gather() requires at least element-size alignment");
4499 return detail::slm_gather_impl<T, VS, detail::lsc_data_size::default_size>(
4500 byte_offsets, mask, PassThru);
4502 if constexpr (
sizeof(T) == 8) {
4506 Res.template bit_cast_view<uint32_t>().template select<N, 2>(0) =
4507 __esimd_slm_gather_ld<uint32_t, N, Alignment>(
4508 byte_offsets.
data(), mask.
data(), PassThru.data());
4509 simd<uint32_t, N / VS> Offset = byte_offsets +
sizeof(uint32_t);
4510 Res.template bit_cast_view<uint32_t>().template select<N, 2>(1) =
4511 __esimd_slm_gather_ld<uint32_t, N, sizeof(uint32_t)>(
4512 Offset.data(), mask.
data(), PassThru.data());
4515 using MsgT = detail::__raw_t<T>;
4517 return __esimd_slm_gather_ld<MsgT, N, Alignment>(
4518 byte_offsets.
data(), mask.
data(), PassThru.data());
4521 detail::LocalAccessorMarker acc;
4522 return detail::gather_impl<T, N>(acc, byte_offsets, 0, mask);
4543 template <
typename T,
int N,
int VS,
4544 typename PropertyListT =
4546 __ESIMD_API std::enable_if_t<
4547 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
simd<T, N>>
4550 return slm_gather<T, N, VS>(byte_offsets, Mask, props);
4575 template <
typename T,
int N,
4576 typename PropertyListT =
4578 __ESIMD_API std::enable_if_t<
4579 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
simd<T, N>>
4581 simd<T, N> pass_thru, PropertyListT props = {}) {
4582 constexpr
int VS = 1;
4583 return slm_gather<T, N, VS>(byte_offsets, mask, pass_thru, props);
4605 template <
typename T,
int N,
4606 typename PropertyListT =
4608 __ESIMD_API std::enable_if_t<
4609 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
simd<T, N>>
4611 PropertyListT props = {}) {
4612 constexpr
int VS = 1;
4613 return slm_gather<T, N, VS>(byte_offsets, mask, props);
4630 template <
typename T,
int N,
4631 typename PropertyListT =
4633 __ESIMD_API std::enable_if_t<
4634 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
simd<T, N>>
4636 constexpr
int VS = 1;
4637 return slm_gather<T, N, VS>(byte_offsets, props);
4666 template <
typename T,
int N,
int VS = 1,
typename OffsetSimdViewT,
4667 typename PropertyListT =
4669 __ESIMD_API std::enable_if_t<
4670 detail::is_simd_view_type_v<OffsetSimdViewT> &&
4671 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
4674 simd<T, N> pass_thru, PropertyListT props = {}) {
4675 return slm_gather<T, N, VS>(byte_offsets.read(), mask, pass_thru, props);
4699 template <
typename T,
int N,
int VS = 1,
typename OffsetSimdViewT,
4700 typename PropertyListT =
4702 __ESIMD_API std::enable_if_t<
4703 detail::is_simd_view_type_v<OffsetSimdViewT> &&
4704 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
4707 PropertyListT props = {}) {
4708 return slm_gather<T, N, VS>(byte_offsets.read(), mask, props);
4727 template <
typename T,
int N,
int VS = 1,
typename OffsetSimdViewT,
4728 typename PropertyListT =
4730 __ESIMD_API std::enable_if_t<
4731 detail::is_simd_view_type_v<OffsetSimdViewT> &&
4732 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
4734 slm_gather(OffsetSimdViewT byte_offsets, PropertyListT props = {}) {
4735 return slm_gather<T, N, VS>(byte_offsets.read(), props);
4788 template <
typename T,
int N,
int VS = 1,
4789 typename PropertyListT =
4791 __ESIMD_API std::enable_if_t<
4792 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
4795 static_assert(N / VS >= 1 && N % VS == 0,
"N must be divisible by VS");
4798 detail::getPropertyValue<PropertyListT, alignment_key>(
sizeof(T));
4800 "slm_scatter() requires at least element-size alignment");
4805 __ESIMD_DNS::slm_scatter_impl<T, VS, detail::lsc_data_size::default_size>(
4806 byte_offsets, vals, mask);
4808 if constexpr (
sizeof(T) == 8) {
4809 __esimd_slm_scatter_st<uint32_t, N, Alignment>(
4810 vals.template bit_cast_view<uint32_t>()
4811 .template select<N, 2>(0)
4814 simd<uint32_t, N / VS> Offset = byte_offsets +
sizeof(uint32_t);
4815 __esimd_slm_scatter_st<uint32_t, N, sizeof(uint32_t)>(
4816 vals.template bit_cast_view<uint32_t>()
4817 .template select<N, 2>(1)
4819 Offset.data(), mask.
data());
4822 using MsgT = detail::__raw_t<T>;
4823 __esimd_slm_scatter_st<MsgT, N, Alignment>(
4828 detail::LocalAccessorMarker acc;
4829 detail::scatter_impl<T, N>(acc, vals, byte_offsets, 0, mask);
4849 template <
typename T,
int N,
int VS = 1,
4850 typename PropertyListT =
4852 __ESIMD_API std::enable_if_t<
4853 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
4855 PropertyListT props = {}) {
4857 slm_scatter<T, N, VS>(byte_offsets, vals, Mask, props);
4883 template <
typename T,
int N,
int VS = 1,
typename OffsetSimdViewT,
4884 typename PropertyListT =
4886 __ESIMD_API std::enable_if_t<
4887 detail::is_simd_view_type_v<OffsetSimdViewT> &&
4888 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
4891 slm_scatter<T, N, VS>(byte_offsets.read(), vals, mask, props);
4909 template <
typename T,
int N,
int VS = 1,
typename OffsetSimdViewT,
4910 typename PropertyListT =
4912 __ESIMD_API std::enable_if_t<
4913 detail::is_simd_view_type_v<OffsetSimdViewT> &&
4914 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
4916 PropertyListT props = {}) {
4917 return slm_scatter<T, N, VS>(byte_offsets.read(), vals, props);
4925 template <
typename T>
4940 template <
typename T,
int N, rgba_channel_mask RGBAMask>
4941 __ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && (
sizeof(T) == 4),
4945 return __esimd_gather4_masked_scaled2<T, N, RGBAMask>(
4946 SI, 0 , offsets.
data(), mask.
data());
4959 template <
typename T,
int N, rgba_channel_mask Mask>
4960 __ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && (
sizeof(T) == 4)>
4964 detail::validate_rgba_write_channel_mask<Mask>();
4966 constexpr int16_t Scale = 0;
4967 constexpr
int global_offset = 0;
4968 __esimd_scatter4_scaled<T, N, decltype(si), Mask, Scale>(
4969 mask.data(), si, global_offset, offsets.
data(), vals.
data());
4987 template <
typename T,
int N,
4989 __ESIMD_API std::enable_if_t<is_simd_flag_type_v<Flags>,
simd<T, N>>
4991 constexpr
size_t Align = Flags::template alignment<simd<T, N>>;
4992 return __esimd_slm_block_ld<detail::__raw_t<T>, N, Align>(byte_offset);
5043 template <
typename T,
int N,
5044 typename PropertyListT =
5046 __ESIMD_API std::enable_if_t<
5047 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
simd<T, N>>
5049 constexpr
size_t DefaultAlignment = detail::OperandSize::OWORD;
5051 detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
5052 return __esimd_slm_block_ld<detail::__raw_t<T>, N,
Alignment>(byte_offset);
5081 template <
typename T,
int N,
5082 typename PropertyListT =
5084 __ESIMD_API std::enable_if_t<
5085 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
simd<T, N>>
5087 PropertyListT props = {}) {
5089 constexpr
size_t DefaultAlignment =
sizeof(T) <= 4 ? 4 :
sizeof(T);
5091 detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
5093 (
Alignment >= __ESIMD_DNS::OperandSize::DWORD &&
sizeof(T) <= 4) ||
5094 (
Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
sizeof(T) > 4),
5095 "Incorrect alignment for the data type");
5097 constexpr
int SmallIntFactor64Bit =
sizeof(uint64_t) /
sizeof(T);
5098 constexpr
int SmallIntFactor32Bit =
5099 sizeof(uint32_t) /
sizeof(T) > 1 ?
sizeof(uint32_t) /
sizeof(T) : 1;
5100 static_assert(N > 0 && N % SmallIntFactor32Bit == 0,
5101 "Number of elements is not supported by Transposed load");
5107 constexpr
bool Use64BitData =
5108 Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
5109 (N *
sizeof(T)) %
sizeof(uint64_t) == 0 &&
5110 (
sizeof(T) !=
sizeof(uint32_t) || N *
sizeof(T) > 256);
5111 constexpr
int SmallIntFactor =
5112 Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
5113 constexpr
int FactoredN = N / SmallIntFactor;
5114 detail::check_lsc_vector_size<FactoredN>();
5117 using LoadElemT = __ESIMD_DNS::__raw_t<
5118 std::conditional_t<SmallIntFactor == 1, T,
5119 std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
5121 constexpr uint16_t AddressScale = 1;
5122 constexpr
int ImmOffset = 0;
5125 constexpr
auto VS = detail::to_lsc_vector_size<FactoredN>();
5127 constexpr
int NLanes = 1;
5133 AddressScale, ImmOffset, DS, VS, Transposed, NLanes>(
5134 pred.
data(), Offsets.data());
5135 return Result.template bit_cast_view<T>();
5167 template <
typename T,
int N,
5168 typename PropertyListT =
5170 __ESIMD_API std::enable_if_t<
5171 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
simd<T, N>>
5173 PropertyListT props = {}) {
5175 constexpr
size_t DefaultAlignment =
sizeof(T) <= 4 ? 4 :
sizeof(T);
5177 detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
5179 (
Alignment >= __ESIMD_DNS::OperandSize::DWORD &&
sizeof(T) <= 4) ||
5180 (
Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
sizeof(T) > 4),
5181 "Incorrect alignment for the data type");
5183 constexpr
int SmallIntFactor64Bit =
sizeof(uint64_t) /
sizeof(T);
5184 constexpr
int SmallIntFactor32Bit =
5185 sizeof(uint32_t) /
sizeof(T) > 1 ?
sizeof(uint32_t) /
sizeof(T) : 1;
5186 static_assert(N > 0 && N % SmallIntFactor32Bit == 0,
5187 "Number of elements is not supported by Transposed load");
5193 constexpr
bool Use64BitData =
5194 Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
5195 (N *
sizeof(T)) %
sizeof(uint64_t) == 0 &&
5196 (
sizeof(T) !=
sizeof(uint32_t) || N *
sizeof(T) > 256);
5197 constexpr
int SmallIntFactor =
5198 Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
5199 constexpr
int FactoredN = N / SmallIntFactor;
5200 detail::check_lsc_vector_size<FactoredN>();
5203 using LoadElemT = __ESIMD_DNS::__raw_t<
5204 std::conditional_t<SmallIntFactor == 1, T,
5205 std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
5207 constexpr uint16_t AddressScale = 1;
5208 constexpr
int ImmOffset = 0;
5211 constexpr
auto VS = detail::to_lsc_vector_size<FactoredN>();
5213 constexpr
int NLanes = 1;
5218 pass_thru.template bit_cast_view<LoadElemT>();
5221 AddressScale, ImmOffset, DS, VS, Transposed,
5222 NLanes>(pred.
data(), Offsets.data(),
5224 return Result.template bit_cast_view<T>();
5250 template <
typename T,
int N,
typename AccessorT,
5251 typename PropertyListT =
5253 __ESIMD_API std::enable_if_t<
5254 detail::is_local_accessor_with_v<AccessorT,
5255 detail::accessor_mode_cap::can_read> &&
5256 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
5258 block_load(AccessorT lacc, uint32_t byte_offset, PropertyListT props = {}) {
5259 byte_offset += detail::localAccessorToOffset(lacc);
5260 return slm_block_load<T, N>(byte_offset, props);
5285 template <
typename T,
int N,
typename AccessorT,
5286 typename PropertyListT =
5288 __ESIMD_API std::enable_if_t<
5289 detail::is_local_accessor_with_v<AccessorT,
5290 detail::accessor_mode_cap::can_read> &&
5291 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
5294 return slm_block_load<T, N>(detail::localAccessorToOffset(lacc), props);
5324 template <
typename T,
int N,
typename AccessorT,
5325 typename PropertyListT =
5327 __ESIMD_API std::enable_if_t<
5328 detail::is_local_accessor_with_v<AccessorT,
5329 detail::accessor_mode_cap::can_read> &&
5330 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
5333 PropertyListT props = {}) {
5334 byte_offset += detail::localAccessorToOffset(lacc);
5335 return slm_block_load<T, N>(byte_offset, pred, props);
5363 template <
typename T,
int N,
typename AccessorT,
5364 typename PropertyListT =
5366 __ESIMD_API std::enable_if_t<
5367 detail::is_local_accessor_with_v<AccessorT,
5368 detail::accessor_mode_cap::can_read> &&
5369 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
5372 return slm_block_load<T, N>(detail::localAccessorToOffset(lacc), pred, props);
5402 template <
typename T,
int N,
typename AccessorT,
5403 typename PropertyListT =
5405 __ESIMD_API std::enable_if_t<
5406 detail::is_local_accessor_with_v<AccessorT,
5407 detail::accessor_mode_cap::can_read> &&
5408 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
5411 simd<T, N> pass_thru, PropertyListT props = {}) {
5412 byte_offset += __ESIMD_DNS::localAccessorToOffset(lacc);
5413 return slm_block_load<T, N>(byte_offset, pred, pass_thru, props);
5442 template <
typename T,
int N,
typename AccessorT,
5443 typename PropertyListT =
5445 __ESIMD_API std::enable_if_t<
5446 detail::is_local_accessor_with_v<AccessorT,
5447 detail::accessor_mode_cap::can_read> &&
5448 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
5451 PropertyListT props = {}) {
5452 return slm_block_load<T, N>(__ESIMD_DNS::localAccessorToOffset(lacc), pred,
5471 template <
typename T,
int N,
typename Flags>
5472 __ESIMD_API std::enable_if_t<is_simd_flag_type_v<Flags>>
5474 constexpr
size_t Align = Flags::template alignment<simd<T, N>>;
5475 __esimd_slm_block_st<detail::__raw_t<T>, N, Align>(offset, vals.
data());
5533 template <
typename T,
int N,
5534 typename PropertyListT =
5536 __ESIMD_API std::enable_if_t<
5537 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
5539 PropertyListT props = {}) {
5541 constexpr
size_t DefaultAlignment =
sizeof(T) <= 4 ? 4 :
sizeof(T);
5543 detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
5545 (
Alignment >= __ESIMD_DNS::OperandSize::DWORD &&
sizeof(T) <= 4) ||
5546 (
Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
sizeof(T) > 4),
5547 "Incorrect alignment for the data type");
5549 constexpr
int SmallIntFactor64Bit =
sizeof(uint64_t) /
sizeof(T);
5550 constexpr
int SmallIntFactor32Bit =
5551 sizeof(uint32_t) /
sizeof(T) > 1 ?
sizeof(uint32_t) /
sizeof(T) : 1;
5553 static_assert(N > 0 && N % SmallIntFactor32Bit == 0,
5554 "Number of elements is not supported by Transposed store");
5560 constexpr
bool Use64BitData =
5561 Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
5562 (N *
sizeof(T)) %
sizeof(uint64_t) == 0 &&
5563 (
sizeof(T) !=
sizeof(uint32_t) || N *
sizeof(T) > 256);
5564 constexpr
int SmallIntFactor =
5565 Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
5566 constexpr
int FactoredN = N / SmallIntFactor;
5567 detail::check_lsc_vector_size<FactoredN>();
5570 using StoreElemT = __ESIMD_DNS::__raw_t<
5571 std::conditional_t<SmallIntFactor == 1, T,
5572 std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
5574 constexpr uint16_t AddressScale = 1;
5575 constexpr
int ImmOffset = 0;
5578 constexpr
auto VS = detail::to_lsc_vector_size<FactoredN>();
5580 constexpr
int NLanes = 1;
5585 AddressScale, ImmOffset, DS, VS, Transposed, NLanes>(
5586 pred.
data(), Offsets.data(),
5587 sycl::bit_cast<__ESIMD_DNS::vector_type_t<StoreElemT, FactoredN>>(
5606 template <
typename T,
int N,
5607 typename PropertyListT =
5609 __ESIMD_API std::enable_if_t<
5610 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
5612 PropertyListT props = {}) {
5613 constexpr
size_t DefaultAlignment = detail::OperandSize::OWORD;
5615 detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
5616 using StoreElemT = detail::__raw_t<T>;
5617 __esimd_slm_block_st<StoreElemT, N, Alignment>(
5638 template <
typename T,
int N,
typename AccessorT,
5639 typename PropertyListT =
5641 __ESIMD_API std::enable_if_t<
5642 detail::is_local_accessor_with_v<AccessorT,
5643 detail::accessor_mode_cap::can_write> &&
5644 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
5646 PropertyListT props = {}) {
5647 byte_offset += detail::localAccessorToOffset(lacc);
5648 slm_block_store<T, N>(byte_offset, vals, props);
5666 template <
typename T,
int N,
typename AccessorT,
5667 typename PropertyListT =
5669 __ESIMD_API std::enable_if_t<
5670 detail::is_local_accessor_with_v<AccessorT,
5671 detail::accessor_mode_cap::can_write> &&
5672 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
5674 slm_block_store<T, N>(detail::localAccessorToOffset(lacc), vals, props);
5704 template <
typename T,
int N,
typename AccessorT,
5705 typename PropertyListT =
5707 __ESIMD_API std::enable_if_t<
5708 detail::is_local_accessor_with_v<AccessorT,
5709 detail::accessor_mode_cap::can_write> &&
5710 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
5713 byte_offset += detail::localAccessorToOffset(lacc);
5714 slm_block_store<T, N>(byte_offset, vals, pred, props);
5742 template <
typename T,
int N,
typename AccessorT,
5743 typename PropertyListT =
5745 __ESIMD_API std::enable_if_t<
5746 detail::is_local_accessor_with_v<AccessorT,
5747 detail::accessor_mode_cap::can_write> &&
5748 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
5750 PropertyListT props = {}) {
5751 slm_block_store<T, N>(detail::localAccessorToOffset(lacc), vals, pred, props);
5760 template <
typename T, __ESIMD_NS::atomic_op Op>
5763 __ESIMD_DNS::to_lsc_atomic_op<Op>();
5764 return static_cast<int>(LSCOp);
5781 template <atomic_op Op,
typename T,
int N, lsc_data_size DS>
5782 __ESIMD_API std::enable_if_t<get_num_args<Op>() == 0,
simd<T, N>>
5784 check_lsc_data_size<T, DS>();
5786 constexpr uint16_t AddressScale = 1;
5787 constexpr
int ImmOffset = 0;
5792 constexpr
int IOp = lsc_to_internal_atomic_op<T, Op>();
5795 AddressScale, ImmOffset, EDS, VS, Transposed,
5797 return lsc_format_ret<T>(Tmp);
5814 template <atomic_op Op,
typename T,
int N, lsc_data_size DS>
5815 __ESIMD_API std::enable_if_t<get_num_args<Op>() == 1,
simd<T, N>>
5818 check_lsc_data_size<T, DS>();
5820 constexpr uint16_t AddressScale = 1;
5821 constexpr
int ImmOffset = 0;
5825 constexpr
int IOp = lsc_to_internal_atomic_op<T, Op>();
5826 if constexpr (std::is_same_v<T, double> || std::is_same_v<T, float>) {
5828 AddressScale, ImmOffset, EDS, VS,
5829 Transposed, N>(pred.
data(), offsets.
data(),
5836 AddressScale, ImmOffset, EDS, VS, Transposed,
5839 return lsc_format_ret<T>(Tmp);
5858 template <atomic_op Op,
typename T,
int N, lsc_data_size DS>
5862 check_lsc_data_size<T, DS>();
5864 constexpr uint16_t AddressScale = 1;
5865 constexpr
int ImmOffset = 0;
5869 constexpr
int IOp = lsc_to_internal_atomic_op<T, Op>();
5870 if constexpr (std::is_same_v<T, double> || std::is_same_v<T, float>) {
5872 AddressScale, ImmOffset, EDS, VS,
5873 Transposed, N>(pred.
data(), offsets.
data(),
5881 AddressScale, ImmOffset, EDS, VS, Transposed,
5883 Msg_data0.
data(), Msg_data1.
data());
5884 return lsc_format_ret<T>(Tmp);
5927 template <atomic_op Op,
typename T,
int N>
5928 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 0,
simd<T, N>>
5932 if constexpr (
sizeof(T) == 2 ||
sizeof(T) == 8 ||
5938 if constexpr (std::is_integral_v<T>) {
5939 return slm_atomic_update<atomic_op::bit_or, T, N>(byte_offset,
5942 using Tint = detail::uint_type_t<
sizeof(T)>;
5943 simd<Tint, N> Res = slm_atomic_update<atomic_op::bit_or, Tint, N>(
5945 return Res.template bit_cast_view<T>();
5948 detail::check_atomic<Op, T, N, 0>();
5950 return __esimd_dword_atomic0<Op, T, N>(mask.data(), si, byte_offset.
data());
5962 template <atomic_op Op,
typename T,
int N,
typename AccessorT>
5963 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 0 &&
5964 __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
5968 byte_offset += detail::localAccessorToOffset(lacc);
5969 return slm_atomic_update<Op, T, N>(byte_offset, mask);
6006 template <atomic_op Op,
typename T,
int N>
6007 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1,
simd<T, N>>
6017 byte_offset,
src0, mask);
6019 if constexpr (std::is_integral_v<T>) {
6020 return slm_atomic_update<atomic_op::xchg, T, N>(byte_offset,
src0, mask);
6022 using Tint = detail::uint_type_t<
sizeof(T)>;
6023 simd<Tint, N> Res = slm_atomic_update<atomic_op::xchg, Tint, N>(
6024 byte_offset,
src0.template bit_cast_view<Tint>(), mask);
6025 return Res.template bit_cast_view<T>();
6028 detail::check_atomic<Op, T, N, 1>();
6030 return __esimd_dword_atomic1<Op, T, N>(mask.data(), si, byte_offset.
data(),
6053 template <atomic_op Op,
typename T,
int N,
typename AccessorT>
6054 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1 &&
6055 __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
6059 byte_offset += detail::localAccessorToOffset(lacc);
6060 return slm_atomic_update<Op, T, N>(byte_offset,
src0, mask);
6095 template <atomic_op Op,
typename T,
int N>
6096 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2,
simd<T, N>>
6110 detail::check_atomic<Op, T, N, 2>();
6112 return __esimd_dword_atomic2<Op, T, N>(mask.data(), si, byte_offset.
data(),
6123 template <atomic_op Op,
typename T,
int N,
typename AccessorT>
6124 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
6125 __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
6129 byte_offset += detail::localAccessorToOffset(lacc);
6130 return slm_atomic_update<Op, T, N>(byte_offset,
src0,
src1, mask);
6151 typename PropertyListT,
typename Toffset>
6152 __ESIMD_API std::enable_if_t<get_num_args<Op>() == 0,
simd<T, N>>
6154 static_assert(
sizeof(T) > 1,
"Unsupported data type");
6155 static_assert(std::is_integral_v<Toffset>,
"Unsupported offset type");
6157 check_lsc_data_size<T, DS>();
6158 check_cache_hints<cache_action::atomic, PropertyListT>();
6159 constexpr
auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
6160 constexpr
auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
6161 constexpr uint16_t AddressScale = 1;
6162 constexpr
int ImmOffset = 0;
6167 constexpr
int IOp = lsc_to_internal_atomic_op<T, Op>();
6169 addrs += convert<uintptr_t>(offsets);
6171 __esimd_lsc_xatomic_stateless_0<MsgT, IOp, L1H, L2H, AddressScale,
6172 ImmOffset, EDS, VS, Transposed, N>(
6174 return lsc_format_ret<T>(Tmp);
6192 typename PropertyListT,
typename Toffset>
6193 __ESIMD_API std::enable_if_t<get_num_args<Op>() == 1,
simd<T, N>>
6196 static_assert(
sizeof(T) > 1,
"Unsupported data type");
6197 static_assert(std::is_integral_v<Toffset>,
"Unsupported offset type");
6198 check_lsc_data_size<T, DS>();
6200 check_cache_hints<cache_action::atomic, PropertyListT>();
6201 constexpr
auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
6202 constexpr
auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
6203 constexpr uint16_t AddressScale = 1;
6204 constexpr
int ImmOffset = 0;
6209 constexpr
int IOp = lsc_to_internal_atomic_op<T, Op>();
6212 addrs += convert<uintptr_t>(offsets);
6214 __esimd_lsc_xatomic_stateless_1<MsgT, IOp, L1H, L2H, AddressScale,
6215 ImmOffset, EDS, VS, Transposed, N>(
6217 return lsc_format_ret<T>(Tmp);
6236 typename PropertyListT,
typename Toffset>
6237 __ESIMD_API std::enable_if_t<get_num_args<Op>() == 2,
simd<T, N>>
6240 static_assert(
sizeof(T) > 1,
"Unsupported data type");
6241 static_assert(std::is_integral_v<Toffset>,
"Unsupported offset type");
6242 check_lsc_data_size<T, DS>();
6244 check_cache_hints<cache_action::atomic, PropertyListT>();
6245 constexpr
auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
6246 constexpr
auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
6247 constexpr uint16_t AddressScale = 1;
6248 constexpr
int ImmOffset = 0;
6253 constexpr
int IOp = lsc_to_internal_atomic_op<T, Op>();
6257 addrs += convert<uintptr_t>(offsets);
6259 __esimd_lsc_xatomic_stateless_2<MsgT, IOp, L1H, L2H, AddressScale,
6260 ImmOffset, EDS, VS, Transposed, N>(
6262 return lsc_format_ret<T>(Tmp);
6280 template <
atomic_op Op,
typename T,
int N,
6282 typename PropertyListT,
typename AccessorTy,
typename Toffset>
6284 std::enable_if_t<get_num_args<Op>() == 0 &&
6285 __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy>,
6289 #ifdef __ESIMD_FORCE_STATELESS_MEM
6290 return atomic_update_impl<Op, T, N, DS, PropertyListT>(
6291 accessorToPointer<T>(acc), byte_offsets, pred);
6293 static_assert(
sizeof(T) > 1,
"Unsupported data type");
6294 static_assert(std::is_integral_v<Toffset> &&
sizeof(Toffset) == 4,
6295 "Unsupported offset type");
6296 check_lsc_data_size<T, DS>();
6298 check_cache_hints<cache_action::atomic, PropertyListT>();
6299 constexpr
auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
6300 constexpr
auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
6301 constexpr uint16_t AddressScale = 1;
6302 constexpr
int ImmOffset = 0;
6307 constexpr
int IOp = lsc_to_internal_atomic_op<T, Op>();
6310 __esimd_lsc_xatomic_bti_0<MsgT, IOp, L1H, L2H, AddressScale, ImmOffset,
6311 EDS, VS, Transposed, N>(
6312 pred.
data(), byte_offsets.
data(), si);
6313 return lsc_format_ret<T>(Tmp);
6335 typename PropertyListT,
typename AccessorTy,
typename Toffset>
6337 std::enable_if_t<get_num_args<Op>() == 1 &&
6338 __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy>,
6342 #ifdef __ESIMD_FORCE_STATELESS_MEM
6343 return atomic_update_impl<Op, T, N, DS, PropertyListT>(
6344 accessorToPointer<T>(acc), byte_offset,
src0, pred);
6346 static_assert(
sizeof(T) > 1,
"Unsupported data type");
6347 static_assert(std::is_integral_v<Toffset> &&
sizeof(Toffset) == 4,
6348 "Unsupported offset type");
6349 check_lsc_data_size<T, DS>();
6351 check_cache_hints<cache_action::atomic, PropertyListT>();
6352 constexpr
auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
6353 constexpr
auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
6354 constexpr uint16_t AddressScale = 1;
6355 constexpr
int ImmOffset = 0;
6360 constexpr
int IOp = lsc_to_internal_atomic_op<T, Op>();
6364 __esimd_lsc_xatomic_bti_1<MsgT, IOp, L1H, L2H, AddressScale, ImmOffset,
6365 EDS, VS, Transposed, N>(
6367 return lsc_format_ret<T>(Tmp);
6390 typename PropertyListT,
typename AccessorTy,
typename Toffset>
6392 std::enable_if_t<get_num_args<Op>() == 2 &&
6393 __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy>,
6397 #ifdef __ESIMD_FORCE_STATELESS_MEM
6398 return atomic_update_impl<Op, T, N, DS, PropertyListT>(
6399 __ESIMD_DNS::accessorToPointer<T>(acc), byte_offset,
src0,
src1, pred);
6401 static_assert(std::is_integral_v<Toffset> &&
sizeof(Toffset) == 4,
6402 "Unsupported offset type");
6403 check_lsc_vector_size<1>();
6404 check_lsc_data_size<T, DS>();
6406 check_cache_hints<cache_action::atomic, PropertyListT>();
6407 constexpr
auto L1H = getCacheHintForIntrin<PropertyListT, cache_level::L1>();
6408 constexpr
auto L2H = getCacheHintForIntrin<PropertyListT, cache_level::L2>();
6409 constexpr uint16_t AddressScale = 1;
6410 constexpr
int ImmOffset = 0;
6415 constexpr
int IOp = lsc_to_internal_atomic_op<T, Op>();
6420 __esimd_lsc_xatomic_bti_2<MsgT, IOp, L1H, L2H, AddressScale, ImmOffset,
6421 EDS, VS, Transposed, N>(
6424 return lsc_format_ret<T>(Tmp);
6473 template <
atomic_op Op,
typename T,
int N,
typename Toffset,
6474 typename PropertyListT =
6476 __ESIMD_API std::enable_if_t<
6477 __ESIMD_DNS::get_num_args<Op>() == 0 &&
6478 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
6481 PropertyListT props = {}) {
6482 static_assert(std::is_integral_v<Toffset>,
"Unsupported offset type");
6484 if constexpr (detail::has_cache_hints<PropertyListT>() ||
6488 p, byte_offset, mask);
6489 }
else if constexpr (N == 16 || N == 32) {
6503 for (
int I = 0; I < N; I += 8) {
6506 Res.template select<8, 1>(I) =
6507 atomic_update<Op, T, 8>(p, ByteOffset8, Mask8, props);
6511 if constexpr (std::is_integral_v<T>) {
6512 return atomic_update<atomic_op::bit_or, T, N>(p, byte_offset,
6515 using Tint = detail::uint_type_t<
sizeof(T)>;
6516 simd<Tint, N> Res = atomic_update<atomic_op::bit_or, Tint, N>(
6517 reinterpret_cast<Tint *
>(p), byte_offset,
simd<Tint, N>(0), mask,
6519 return Res.template bit_cast_view<T>();
6522 detail::check_atomic<Op, T, N, 0>();
6526 using Tx =
typename detail::__raw_t<T>;
6527 return __esimd_svm_atomic0<Op, Tx, N>(vAddr.data(), mask.
data());
6549 template <
atomic_op Op,
typename T,
int N,
typename Toffset,
6550 typename PropertyListT =
6552 __ESIMD_API std::enable_if_t<
6553 __ESIMD_DNS::get_num_args<Op>() == 0 &&
6554 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
6558 return atomic_update<Op, T, N>(p, byte_offset, mask, props);
6581 template <
atomic_op Op,
typename T,
int N,
typename OffsetSimdViewT,
6582 typename PropertyListT =
6584 __ESIMD_API std::enable_if_t<
6585 __ESIMD_DNS::get_num_args<Op>() == 0 &&
6586 ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
6587 detail::is_simd_view_type_v<OffsetSimdViewT>,
6590 PropertyListT props = {}) {
6591 return atomic_update<Op, T, N>(p, offsets.read(), mask, props);
6612 template <
atomic_op Op,
typename T,
int N,
typename OffsetSimdViewT,
6613 typename PropertyListT =
6615 __ESIMD_API std::enable_if_t<
6616 __ESIMD_DNS::get_num_args<Op>() == 0 &&
6617 ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
6618 detail::is_simd_view_type_v<OffsetSimdViewT>,
6622 return atomic_update<Op, T, N>(p, byte_offset.read(), mask, props);
6639 template <atomic_op Op,
typename T,
int N,
typename Toffset>
6640 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>,
simd<T, N>>
6693 template <
atomic_op Op,
typename T,
int N,
typename Toffset,
6694 typename PropertyListT =
6696 __ESIMD_API std::enable_if_t<
6697 __ESIMD_DNS::get_num_args<Op>() == 1 &&
6698 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
6702 static_assert(std::is_integral_v<Toffset>,
"Unsupported offset type");
6705 if constexpr (detail::has_cache_hints<PropertyListT>() ||
6711 p, byte_offset,
src0, mask);
6712 }
else if constexpr (N == 16 || N == 32) {
6725 for (
int I = 0; I < N; I += 8) {
6729 Res.template select<8, 1>(I) =
6730 atomic_update<Op, T, 8>(p, ByteOffset8, Src08, Mask8, props);
6734 if constexpr (std::is_integral_v<T>) {
6735 return atomic_update<atomic_op::xchg, T, N>(p, byte_offset,
src0, mask,
6738 using Tint = detail::uint_type_t<
sizeof(T)>;
6739 simd<Tint, N> Res = atomic_update<atomic_op::xchg, Tint, N>(
6740 reinterpret_cast<Tint *
>(p), byte_offset,
6741 src0.template bit_cast_view<Tint>(), mask, props);
6742 return Res.template bit_cast_view<T>();
6745 detail::check_atomic<Op, T, N, 1>();
6750 using Tx =
typename detail::__raw_t<T>;
6751 return __esimd_svm_atomic1<Op, Tx, N>(vAddr.data(),
src0.data(),
6779 template <
atomic_op Op,
typename T,
int N,
typename Toffset,
6780 typename PropertyListT =
6782 __ESIMD_API std::enable_if_t<
6783 __ESIMD_DNS::get_num_args<Op>() == 1 &&
6784 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
6787 PropertyListT props = {}) {
6789 return atomic_update<Op, T, N>(p, byte_offset,
src0, mask, props);
6819 template <
atomic_op Op,
typename T,
int N,
typename OffsetSimdViewT,
6820 typename PropertyListT =
6822 __ESIMD_API std::enable_if_t<
6823 __ESIMD_DNS::get_num_args<Op>() == 1 &&
6824 ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
6825 detail::is_simd_view_type_v<OffsetSimdViewT>,
6828 PropertyListT props = {}) {
6829 return atomic_update<Op, T, N>(p, offsets.read(),
src0, mask, props);
6857 template <
atomic_op Op,
typename T,
int N,
typename OffsetSimdViewT,
6858 typename PropertyListT =
6860 __ESIMD_API std::enable_if_t<
6861 __ESIMD_DNS::get_num_args<Op>() == 1 &&
6862 ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
6863 detail::is_simd_view_type_v<OffsetSimdViewT>,
6866 PropertyListT props = {}) {
6868 return atomic_update<Op, T, N>(p, offsets.read(),
src0, mask, props);
6889 template <atomic_op Op,
typename Tx,
int N,
typename Toffset>
6890 __ESIMD_API std::enable_if_t<
6891 std::is_integral_v<Toffset> &&
6944 template <
atomic_op Op,
typename T,
int N,
typename Toffset,
6945 typename PropertyListT =
6947 __ESIMD_API std::enable_if_t<
6948 __ESIMD_DNS::get_num_args<Op>() == 2 &&
6949 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
6953 static_assert(std::is_integral_v<Toffset>,
"Unsupported offset type");
6958 if constexpr (detail::has_cache_hints<PropertyListT>() ||
6967 }
else if constexpr (N == 16 || N == 32) {
6980 for (
int I = 0; I < N; I += 8) {
6985 Res.template select<8, 1>(I) =
6986 atomic_update<Op, T, 8>(p, ByteOffset8, Src08, Src18, Mask8, props);
6990 detail::check_atomic<Op, T, N, 2>();
6994 using Tx =
typename detail::__raw_t<T>;
6995 return __esimd_svm_atomic2<Op, Tx, N>(vAddr.data(),
src0.data(),
7019 template <
atomic_op Op,
typename T,
int N,
typename Toffset,
7020 typename PropertyListT =
7022 __ESIMD_API std::enable_if_t<
7023 __ESIMD_DNS::get_num_args<Op>() == 2 &&
7024 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7029 return atomic_update<Op, T, N>(p, byte_offset,
src0,
src1, mask, props);
7052 template <
atomic_op Op,
typename T,
int N,
typename OffsetSimdViewT,
7053 typename PropertyListT =
7055 __ESIMD_API std::enable_if_t<
7056 __ESIMD_DNS::get_num_args<Op>() == 2 &&
7057 ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
7058 detail::is_simd_view_type_v<OffsetSimdViewT>,
7062 return atomic_update<Op, T, N>(p, byte_offset.read(),
src0,
src1, mask,
7084 template <
atomic_op Op,
typename T,
int N,
typename OffsetSimdViewT,
7085 typename PropertyListT =
7087 __ESIMD_API std::enable_if_t<
7088 __ESIMD_DNS::get_num_args<Op>() == 2 &&
7089 ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
7090 detail::is_simd_view_type_v<OffsetSimdViewT>,
7095 return atomic_update<Op, T, N>(p, byte_offset.read(),
src0,
src1, mask,
7115 template <atomic_op Op,
typename Tx,
int N,
typename Toffset>
7116 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>,
simd<Tx, N>>
7168 template <
atomic_op Op,
typename T,
int N,
typename Toffset,
7169 typename AccessorTy,
7170 typename PropertyListT =
7172 __ESIMD_API std::enable_if_t<
7173 __ESIMD_DNS::get_num_args<Op>() == 0 &&
7174 __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7175 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7178 PropertyListT props = {}) {
7179 #ifdef __ESIMD_FORCE_STATELESS_MEM
7180 return atomic_update<Op, T, N>(__ESIMD_DNS::accessorToPointer<T>(acc),
7181 byte_offset, mask, props);
7183 static_assert(std::is_integral_v<Toffset>,
"Unsupported offset type");
7185 if constexpr (detail::has_cache_hints<PropertyListT>() ||
7189 acc, byte_offset, mask);
7192 if constexpr (std::is_integral_v<T>) {
7193 return atomic_update<atomic_op::bit_or, T, N>(
7194 acc, byte_offset,
simd<T, N>(0), mask, props);
7196 using Tint = detail::uint_type_t<
sizeof(T)>;
7197 simd<Tint, N> Res = atomic_update<atomic_op::bit_or, Tint, N>(
7199 return Res.template bit_cast_view<T>();
7202 detail::check_atomic<Op, T, N, 0>();
7203 static_assert(
sizeof(Toffset) == 4,
"Only 32 bit offset is supported");
7205 static_assert(
sizeof(T) == 4,
"Only 32 bit data is supported");
7207 using Tx =
typename detail::__raw_t<T>;
7208 return __esimd_dword_atomic0<Op, Tx, N>(mask.
data(), si,
7209 byte_offset.
data());
7235 template <
atomic_op Op,
typename T,
int N,
typename Toffset,
7236 typename AccessorTy,
7237 typename PropertyListT =
7239 __ESIMD_API std::enable_if_t<
7240 __ESIMD_DNS::get_num_args<Op>() == 0 &&
7241 __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7242 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7245 PropertyListT props = {}) {
7247 return atomic_update<Op, T, N>(acc, byte_offset, mask, props);
7274 template <
atomic_op Op,
typename T,
int N,
typename OffsetSimdViewT,
7275 typename AccessorTy,
7276 typename PropertyListT =
7278 __ESIMD_API std::enable_if_t<
7279 __ESIMD_DNS::get_num_args<Op>() == 0 &&
7280 __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7281 ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
7282 detail::is_simd_view_type_v<OffsetSimdViewT>,
7285 PropertyListT props = {}) {
7286 return atomic_update<Op, T, N>(acc, byte_offset.read(), mask, props);
7308 template <
atomic_op Op,
typename T,
int N,
typename OffsetSimdViewT,
7309 typename AccessorTy,
7310 typename PropertyListT =
7312 __ESIMD_API std::enable_if_t<
7313 __ESIMD_DNS::get_num_args<Op>() == 0 &&
7314 __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7315 ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
7316 detail::is_simd_view_type_v<OffsetSimdViewT>,
7319 PropertyListT props = {}) {
7321 return atomic_update<Op, T, N>(acc, byte_offset.read(), mask, props);
7342 template <
atomic_op Op,
typename T,
int N,
typename Toffset,
7343 typename AccessorTy>
7345 std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 0 &&
7346 __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy>,
7370 template <atomic_op Op,
typename T,
int N,
typename AccessorTy>
7372 std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 0 &&
7373 __ESIMD_DNS::is_rw_local_accessor_v<AccessorTy>,
7433 template <
atomic_op Op,
typename T,
int N,
typename Toffset,
7434 typename AccessorTy,
7435 typename PropertyListT =
7437 __ESIMD_API std::enable_if_t<
7438 __ESIMD_DNS::get_num_args<Op>() == 1 &&
7439 __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7440 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7444 #ifdef __ESIMD_FORCE_STATELESS_MEM
7445 return atomic_update<Op, T, N>(__ESIMD_DNS::accessorToPointer<T>(acc),
7446 byte_offset,
src0, mask, props);
7448 static_assert(std::is_integral_v<Toffset>,
"Unsupported offset type");
7449 static_assert(
sizeof(Toffset) == 4,
"Only 32 bit offset is supported");
7451 if constexpr (detail::has_cache_hints<PropertyListT>() ||
7457 acc, byte_offset,
src0, mask);
7459 if constexpr (std::is_integral_v<T>) {
7460 return atomic_update<atomic_op::xchg, T, N>(acc, byte_offset,
src0, mask,
7463 using Tint = detail::uint_type_t<
sizeof(T)>;
7464 simd<Tint, N> Res = atomic_update<atomic_op::xchg, Tint, N>(
7465 acc, byte_offset,
src0.template bit_cast_view<Tint>(), mask, props);
7466 return Res.template bit_cast_view<T>();
7469 detail::check_atomic<Op, T, N, 1>();
7470 static_assert(
sizeof(T) == 4,
"Only 32 bit data is supported");
7472 using Tx =
typename detail::__raw_t<T>;
7473 return __esimd_dword_atomic1<Op, Tx, N>(
7474 mask.
data(), si, byte_offset.
data(),
7511 template <
atomic_op Op,
typename T,
int N,
typename Toffset,
7512 typename AccessorTy,
7513 typename PropertyListT =
7515 __ESIMD_API std::enable_if_t<
7516 __ESIMD_DNS::get_num_args<Op>() == 1 &&
7517 __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7518 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7521 PropertyListT props = {}) {
7523 return atomic_update<Op, T, N>(acc, byte_offset,
src0, mask, props);
7558 template <
atomic_op Op,
typename T,
int N,
typename OffsetSimdViewT,
7559 typename AccessorTy,
7560 typename PropertyListT =
7562 __ESIMD_API std::enable_if_t<
7563 __ESIMD_DNS::get_num_args<Op>() == 1 &&
7564 __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7565 ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
7566 detail::is_simd_view_type_v<OffsetSimdViewT>,
7570 return atomic_update<Op, T, N>(acc, byte_offset.read(),
src0, mask, props);
7603 template <
atomic_op Op,
typename T,
int N,
typename OffsetSimdViewT,
7604 typename AccessorTy,
7605 typename PropertyListT =
7607 __ESIMD_API std::enable_if_t<
7608 __ESIMD_DNS::get_num_args<Op>() == 1 &&
7609 __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7610 ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
7611 detail::is_simd_view_type_v<OffsetSimdViewT>,
7614 PropertyListT props = {}) {
7616 return atomic_update<Op, T, N>(acc, byte_offset.read(),
src0, mask, props);
7640 template <
atomic_op Op,
typename T,
int N,
typename Toffset,
7641 typename AccessorTy>
7642 __ESIMD_API std::enable_if_t<
7643 __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7670 template <atomic_op Op,
typename Tx,
int N,
typename AccessorTy>
7671 __ESIMD_API std::enable_if_t<
7672 __ESIMD_DNS::is_rw_local_accessor_v<AccessorTy> &&
7733 template <
atomic_op Op,
typename T,
int N,
typename Toffset,
7734 typename AccessorTy,
7735 typename PropertyListT =
7737 __ESIMD_API std::enable_if_t<
7738 __ESIMD_DNS::get_num_args<Op>() == 2 && std::is_integral_v<Toffset> &&
7739 __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7740 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7744 #ifdef __ESIMD_FORCE_STATELESS_MEM
7745 return atomic_update<Op, T, N>(__ESIMD_DNS::accessorToPointer<T>(acc),
7746 byte_offset,
src0,
src1, mask, props);
7748 static_assert(std::is_integral_v<Toffset>,
"Unsupported offset type");
7749 static_assert(
sizeof(Toffset) == 4,
"Only 32 bit offset is supported");
7753 if constexpr (detail::has_cache_hints<PropertyListT>() ||
7761 acc, byte_offset,
src1,
src0, mask);
7763 detail::check_atomic<Op, T, N, 2>();
7764 static_assert(
sizeof(T) == 4,
"Only 32 bit data is supported");
7766 using Tx =
typename detail::__raw_t<T>;
7767 return __esimd_dword_atomic2<Op, Tx, N>(
7768 mask.
data(), si, byte_offset.
data(),
7796 template <
atomic_op Op,
typename T,
int N,
typename Toffset,
7797 typename AccessorTy,
7798 typename PropertyListT =
7800 __ESIMD_API std::enable_if_t<
7801 __ESIMD_DNS::get_num_args<Op>() == 2 &&
7802 __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7803 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
7808 return atomic_update<Op, T, N>(acc, byte_offset,
src0,
src1, mask, props);
7834 template <
atomic_op Op,
typename T,
int N,
typename OffsetSimdViewT,
7835 typename AccessorTy,
7836 typename PropertyListT =
7838 __ESIMD_API std::enable_if_t<
7839 __ESIMD_DNS::get_num_args<Op>() == 2 &&
7840 __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7841 ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
7842 detail::is_simd_view_type_v<OffsetSimdViewT>,
7846 return atomic_update<Op, T, N>(acc, byte_offset.read(),
src0,
src1, mask,
7871 template <
atomic_op Op,
typename T,
int N,
typename OffsetSimdViewT,
7872 typename AccessorTy,
7873 typename PropertyListT =
7875 __ESIMD_API std::enable_if_t<
7876 __ESIMD_DNS::get_num_args<Op>() == 2 &&
7877 __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
7878 ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
7879 detail::is_simd_view_type_v<OffsetSimdViewT>,
7884 return atomic_update<Op, T, N>(acc, byte_offset.read(),
src0,
src1, mask,
7908 template <
atomic_op Op,
typename Tx,
int N,
typename Toffset,
7909 typename AccessorTy>
7910 __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_rw_device_accessor_v<AccessorTy>,
7935 template <atomic_op Op,
typename Tx,
int N,
typename AccessorTy>
7936 __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_rw_local_accessor_v<AccessorTy>,
7983 template <u
int8_t cntl> __ESIMD_API
void fence() { __esimd_fence(cntl); }
8001 "SLM fence must have 'none' lsc_fence_op and 'group' scope");
8002 constexpr
int N = 16;
8004 __esimd_lsc_fence<static_cast<uint8_t>(Kind),
static_cast<uint8_t
>(FenceOp),
8005 static_cast<uint8_t
>(Scope), N>(Mask.
data());
8037 template <
typename T,
int m,
int N,
typename AccessorTy,
unsigned plane = 0>
8040 constexpr
unsigned Width = N *
sizeof(T);
8041 static_assert(Width * m <= 256u,
8042 "data does not fit into a single dataport transaction");
8043 static_assert(Width <= 64u,
"valid block width is in range [1, 64]");
8044 static_assert(m <= 64u,
"valid block height is in range [1, 64]");
8045 static_assert(plane <= 3u,
"valid plane index is in range [0, 3]");
8048 using SurfIndTy = decltype(si);
8049 constexpr
unsigned int RoundedWidth =
8050 Width < 4 ? 4 : detail::getNextPowerOf2<Width>();
8051 constexpr
int BlockWidth =
sizeof(T) * N;
8052 constexpr
int Mod = 0;
8054 if constexpr (Width < RoundedWidth) {
8055 constexpr
unsigned int n1 = RoundedWidth /
sizeof(T);
8057 __esimd_media_ld<T, m, n1, Mod, SurfIndTy, (int)plane, BlockWidth>(
8059 return temp.template select<m, 1, N, 1>(0, 0);
8061 return __esimd_media_ld<T, m, N, Mod, SurfIndTy, (int)plane, BlockWidth>(
8078 template <
typename T,
int m,
int N,
typename AccessorTy,
unsigned plane = 0>
8081 constexpr
unsigned Width = N *
sizeof(T);
8082 static_assert(Width * m <= 256u,
8083 "data does not fit into a single dataport transaction");
8084 static_assert(Width <= 64u,
"valid block width is in range [1, 64]");
8085 static_assert(m <= 64u,
"valid block height is in range [1, 64]");
8086 static_assert(plane <= 3u,
"valid plane index is in range [0, 3]");
8088 using SurfIndTy = decltype(si);
8089 constexpr
unsigned int RoundedWidth =
8090 Width < 4 ? 4 : detail::getNextPowerOf2<Width>();
8091 constexpr
unsigned int n1 = RoundedWidth /
sizeof(T);
8092 constexpr
int BlockWidth =
sizeof(T) * N;
8093 constexpr
int Mod = 0;
8095 if constexpr (Width < RoundedWidth) {
8097 auto temp_ref = temp.template bit_cast_view<T, m, n1>();
8098 auto vals_ref = vals.template bit_cast_view<T, m, N>();
8099 temp_ref.template select<m, 1, N, 1>() = vals_ref;
8100 __esimd_media_st<T, m, n1, Mod, SurfIndTy, plane, BlockWidth>(si,
x,
y,
8103 __esimd_media_st<T, m, N, Mod, SurfIndTy, plane, BlockWidth>(si,
x,
y,
8126 template <
typename T,
int N,
typename AccessorTy,
8129 std::enable_if_t<detail::is_local_accessor_with_v<
8130 AccessorTy, detail::accessor_mode_cap::can_read> &&
8131 is_simd_flag_type_v<Flags>,
8134 return slm_block_load<T, N>(byte_offset + detail::localAccessorToOffset(acc),
8155 template <
typename T,
int N,
typename AccessorT,
typename Flags>
8157 std::enable_if_t<detail::is_local_accessor_with_v<
8158 AccessorT, detail::accessor_mode_cap::can_write> &&
8159 is_simd_flag_type_v<Flags>>
8161 slm_block_store<T, N>(offset + __ESIMD_DNS::localAccessorToOffset(acc), vals,
8236 template <
typename T,
int N,
int VS,
typename AccessorT,
8237 typename PropertyListT =
8239 __ESIMD_API std::enable_if_t<
8240 (detail::is_local_accessor_with_v<AccessorT,
8241 detail::accessor_mode_cap::can_read> &&
8242 ext::oneapi::experimental::is_property_list_v<PropertyListT>),
8246 return slm_gather<T, N, VS>(byte_offsets +
8247 __ESIMD_DNS::localAccessorToOffset(acc),
8248 mask, pass_thru, props);
8281 template <
typename T,
int N,
int VS,
typename AccessorT,
8282 typename PropertyListT =
8284 __ESIMD_API std::enable_if_t<
8285 (detail::is_local_accessor_with_v<AccessorT,
8286 detail::accessor_mode_cap::can_read> &&
8287 ext::oneapi::experimental::is_property_list_v<PropertyListT>),
8291 return slm_gather<T, N, VS>(
8292 byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), mask, props);
8319 template <
typename T,
int N,
int VS,
typename AccessorT,
8320 typename PropertyListT =
8322 __ESIMD_API std::enable_if_t<
8323 (detail::is_local_accessor_with_v<AccessorT,
8324 detail::accessor_mode_cap::can_read> &&
8325 ext::oneapi::experimental::is_property_list_v<PropertyListT>),
8328 PropertyListT props = {}) {
8329 return slm_gather<T, N, VS>(
8330 byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), props);
8346 template <
typename T,
int N,
typename AccessorT,
typename MaskT,
8347 typename PropertyListT =
8349 __ESIMD_API std::enable_if_t<
8350 (detail::is_local_accessor_with_v<AccessorT,
8351 detail::accessor_mode_cap::can_read> &&
8352 std::is_same_v<MaskT, simd_mask<N>> &&
8353 ext::oneapi::experimental::is_property_list_v<PropertyListT>),
8356 simd<T, N> pass_thru, PropertyListT props = {}) {
8357 return slm_gather<T, N>(byte_offsets +
8358 __ESIMD_DNS::localAccessorToOffset(acc),
8359 mask, pass_thru, props);
8373 template <
typename T,
int N,
typename AccessorT,
typename MaskT,
8374 typename PropertyListT =
8376 __ESIMD_API std::enable_if_t<
8377 (detail::is_local_accessor_with_v<AccessorT,
8378 detail::accessor_mode_cap::can_read> &&
8379 std::is_same_v<MaskT, simd_mask<N>> &&
8380 ext::oneapi::experimental::is_property_list_v<PropertyListT>),
8383 PropertyListT props = {}) {
8384 return slm_gather<T, N>(
8385 byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), mask, props);
8395 template <
typename T,
int N,
typename AccessorT,
8396 typename PropertyListT =
8398 __ESIMD_API std::enable_if_t<
8399 (detail::is_local_accessor_with_v<AccessorT,
8400 detail::accessor_mode_cap::can_read> &&
8401 ext::oneapi::experimental::is_property_list_v<PropertyListT>),
8404 PropertyListT props = {}) {
8405 return slm_gather<T, N>(
8406 byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), props);
8417 template <
typename T,
int N,
int VS = 1,
typename AccessorT,
8418 typename OffsetSimdViewT,
8419 typename PropertyListT =
8421 __ESIMD_API std::enable_if_t<
8422 (detail::is_local_accessor_with_v<AccessorT,
8423 detail::accessor_mode_cap::can_read> &&
8424 detail::is_simd_view_type_v<OffsetSimdViewT> &&
8425 ext::oneapi::experimental::is_property_list_v<PropertyListT>),
8428 simd<T, N> pass_thru, PropertyListT props = {}) {
8429 return gather<T, N, VS>(acc, byte_offsets.read(), mask, pass_thru, props);
8440 template <
typename T,
int N,
int VS = 1,
typename AccessorT,
8441 typename OffsetSimdViewT,
8442 typename PropertyListT =
8444 __ESIMD_API std::enable_if_t<
8445 (detail::is_local_accessor_with_v<AccessorT,
8446 detail::accessor_mode_cap::can_read> &&
8447 detail::is_simd_view_type_v<OffsetSimdViewT> &&
8448 ext::oneapi::experimental::is_property_list_v<PropertyListT>),
8451 PropertyListT props = {}) {
8452 return gather<T, N, VS>(acc, byte_offsets.read(), mask, props);
8462 template <
typename T,
int N,
int VS = 1,
typename AccessorT,
8463 typename OffsetSimdViewT,
8464 typename PropertyListT =
8466 __ESIMD_API std::enable_if_t<
8467 (detail::is_local_accessor_with_v<AccessorT,
8468 detail::accessor_mode_cap::can_read> &&
8469 detail::is_simd_view_type_v<OffsetSimdViewT> &&
8470 ext::oneapi::experimental::is_property_list_v<PropertyListT>),
8472 gather(AccessorT acc, OffsetSimdViewT byte_offsets, PropertyListT props = {}) {
8473 return gather<T, N, VS>(acc, byte_offsets.read(), props);
8493 template <
typename T,
int N,
typename AccessorTy>
8495 std::enable_if_t<detail::is_local_accessor_with_v<
8496 AccessorTy, detail::accessor_mode_cap::can_read>,
8500 return slm_gather<T, N>(
8501 offsets + glob_offset + __ESIMD_DNS::localAccessorToOffset(acc), mask);
8566 template <
typename T,
int N,
int VS = 1,
typename AccessorT,
8567 typename PropertyListT =
8569 __ESIMD_API std::enable_if_t<
8570 detail::is_local_accessor_with_v<AccessorT,
8571 detail::accessor_mode_cap::can_write> &&
8572 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8575 slm_scatter<T, N, VS>(byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc),
8601 template <
typename T,
int N,
int VS = 1,
typename AccessorT,
8602 typename PropertyListT =
8604 __ESIMD_API std::enable_if_t<
8605 detail::is_local_accessor_with_v<AccessorT,
8606 detail::accessor_mode_cap::can_write> &&
8607 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8609 PropertyListT props = {}) {
8611 scatter<T, N, VS>(acc, byte_offsets, vals, Mask, props);
8641 template <
typename T,
int N,
int VS = 1,
typename OffsetSimdViewT,
8643 typename PropertyListT =
8645 __ESIMD_API std::enable_if_t<
8646 detail::is_local_accessor_with_v<AccessorT,
8647 detail::accessor_mode_cap::can_write> &&
8648 detail::is_simd_view_type_v<OffsetSimdViewT> &&
8649 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8652 scatter<T, N, VS>(acc, byte_offsets.read(), vals, mask, props);
8679 template <
typename T,
int N,
int VS = 1,
typename OffsetSimdViewT,
8681 typename PropertyListT =
8683 __ESIMD_API std::enable_if_t<
8684 detail::is_local_accessor_with_v<AccessorT,
8685 detail::accessor_mode_cap::can_write> &&
8686 detail::is_simd_view_type_v<OffsetSimdViewT> &&
8687 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8689 PropertyListT props = {}) {
8691 scatter<T, N, VS>(acc, byte_offsets.read(), vals, Mask, props);
8712 template <
typename T,
int N,
typename AccessorTy>
8713 __ESIMD_API std::enable_if_t<detail::is_local_accessor_with_v<
8714 AccessorTy, detail::accessor_mode_cap::can_write>>
8717 slm_scatter<T, N>(offsets + glob_offset +
8718 __ESIMD_DNS::localAccessorToOffset(acc),
8785 template <
typename T,
int N,
int VS,
typename OffsetT,
8786 typename PropertyListT =
8788 __ESIMD_API std::enable_if_t<
8789 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8791 PropertyListT props = {}) {
8792 static_assert(N / VS >= 1 && N % VS == 0,
"N must be divisible by VS");
8794 PropertyListT>(p, byte_offsets, mask);
8813 template <
typename T,
int N,
int VS,
typename OffsetT,
8814 typename PropertyListT =
8816 __ESIMD_API std::enable_if_t<
8817 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8819 PropertyListT props = {}) {
8843 template <
typename T,
int N,
typename OffsetT,
8844 typename PropertyListT =
8846 __ESIMD_API std::enable_if_t<
8847 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8849 PropertyListT props = {}) {
8850 constexpr
int VS = 1;
8868 template <
typename T,
int N,
typename OffsetT,
8869 typename PropertyListT =
8871 __ESIMD_API std::enable_if_t<
8872 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8874 constexpr
int VS = 1;
8898 template <
typename T,
int N,
int VS = 1,
typename OffsetSimdViewT,
8899 typename PropertyListT =
8901 __ESIMD_API std::enable_if_t<
8902 detail::is_simd_view_type_v<OffsetSimdViewT> &&
8903 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8905 PropertyListT props = {}) {
8926 template <
typename T,
int N,
int VS = 1,
typename OffsetSimdViewT,
8927 typename PropertyListT =
8929 __ESIMD_API std::enable_if_t<
8930 detail::is_simd_view_type_v<OffsetSimdViewT> &&
8931 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8932 prefetch(
const T *p, OffsetSimdViewT byte_offsets, PropertyListT props = {}) {
8962 template <
typename T,
int VS = 1,
typename OffsetT,
8963 typename PropertyListT =
8965 __ESIMD_API std::enable_if_t<
8966 std::is_integral_v<OffsetT> &&
8967 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8969 PropertyListT props = {}) {
8971 PropertyListT>(p, byte_offset, mask);
8989 template <
typename T,
int VS = 1,
typename OffsetT,
8990 typename PropertyListT =
8992 __ESIMD_API std::enable_if_t<
8993 std::is_integral_v<OffsetT> &&
8994 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
8995 prefetch(
const T *p, OffsetT byte_offset, PropertyListT props = {}) {
9016 template <
typename T,
int VS = 1,
9017 typename PropertyListT =
9019 __ESIMD_API std::enable_if_t<
9020 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
9037 template <
typename T,
int VS = 1,
9038 typename PropertyListT =
9040 __ESIMD_API std::enable_if_t<
9041 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
9111 template <
typename T,
int N,
int VS,
typename AccessorT,
typename OffsetT,
9112 typename PropertyListT =
9114 __ESIMD_API std::enable_if_t<
9115 detail::is_device_accessor_with_v<AccessorT,
9116 detail::accessor_mode_cap::can_read> &&
9117 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
9120 #ifdef __ESIMD_FORCE_STATELESS_MEM
9124 static_assert(N / VS >= 1 && N % VS == 0,
"N must be divisible by VS");
9126 PropertyListT>(acc, byte_offsets, mask);
9147 template <
typename T,
int N,
int VS,
typename AccessorT,
typename OffsetT,
9148 typename PropertyListT =
9150 __ESIMD_API std::enable_if_t<
9151 detail::is_device_accessor_with_v<AccessorT,
9152 detail::accessor_mode_cap::can_read> &&
9153 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
9155 PropertyListT props = {}) {
9180 template <
typename T,
int N,
typename AccessorT,
typename OffsetT,
9181 typename PropertyListT =
9183 __ESIMD_API std::enable_if_t<
9184 detail::is_device_accessor_with_v<AccessorT,
9185 detail::accessor_mode_cap::can_read> &&
9186 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
9188 PropertyListT props = {}) {
9189 constexpr
int VS = 1;
9208 template <
typename T,
int N,
typename AccessorT,
typename OffsetT,
9209 typename PropertyListT =
9211 __ESIMD_API std::enable_if_t<
9212 detail::is_device_accessor_with_v<AccessorT,
9213 detail::accessor_mode_cap::can_read> &&
9214 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
9216 PropertyListT props = {}) {
9217 constexpr
int VS = 1;
9242 template <
typename T,
int N,
int VS = 1,
typename OffsetSimdViewT,
9244 typename PropertyListT =
9246 __ESIMD_API std::enable_if_t<
9247 detail::is_device_accessor_with_v<AccessorT,
9248 detail::accessor_mode_cap::can_read> &&
9249 detail::is_simd_view_type_v<OffsetSimdViewT> &&
9250 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
9252 PropertyListT props = {}) {
9274 template <
typename T,
int N,
int VS = 1,
typename OffsetSimdViewT,
9276 typename PropertyListT =
9278 __ESIMD_API std::enable_if_t<
9279 detail::is_device_accessor_with_v<AccessorT,
9280 detail::accessor_mode_cap::can_read> &&
9281 detail::is_simd_view_type_v<OffsetSimdViewT> &&
9282 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
9284 PropertyListT props = {}) {
9313 template <
typename T,
int VS = 1,
typename AccessorT,
typename OffsetT,
9314 typename PropertyListT =
9316 __ESIMD_API std::enable_if_t<
9317 std::is_integral_v<OffsetT> &&
9318 detail::is_device_accessor_with_v<AccessorT,
9319 detail::accessor_mode_cap::can_read> &&
9320 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
9322 PropertyListT props = {}) {
9323 #ifdef __ESIMD_FORCE_STATELESS_MEM
9324 prefetch<T, VS>(detail::accessorToPointer<T>(acc), byte_offset, mask, props);
9327 PropertyListT>(acc, byte_offset, mask);
9346 template <
typename T,
int VS = 1,
typename AccessorT,
typename OffsetT,
9347 typename PropertyListT =
9349 __ESIMD_API std::enable_if_t<
9350 std::is_integral_v<OffsetT> &&
9351 detail::is_device_accessor_with_v<AccessorT,
9352 detail::accessor_mode_cap::can_read> &&
9353 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
9354 prefetch(AccessorT acc, OffsetT byte_offset, PropertyListT props = {}) {
9375 template <
typename T,
int VS = 1,
typename AccessorT,
9376 typename PropertyListT =
9378 __ESIMD_API std::enable_if_t<
9379 detail::is_device_accessor_with_v<AccessorT,
9380 detail::accessor_mode_cap::can_read> &&
9381 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
9398 template <
typename T,
int VS = 1,
typename AccessorT,
9399 typename PropertyListT =
9401 __ESIMD_API std::enable_if_t<
9402 detail::is_device_accessor_with_v<AccessorT,
9403 detail::accessor_mode_cap::can_read> &&
9404 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
9450 template <
typename T,
int BlockWidth,
int BlockHeight = 1,
int NBlocks = 1,
9451 bool Transposed =
false,
bool Transformed =
false,
9453 T, NBlocks, BlockHeight, BlockWidth, Transposed, Transformed>(),
9455 __ESIMD_API std::enable_if_t<
9456 ext::oneapi::experimental::is_property_list_v<PropertyListT>,
simd<T, N>>
9457 load_2d(
const T *Ptr,
unsigned SurfaceWidth,
unsigned SurfaceHeight,
9458 unsigned SurfacePitch,
int X,
int Y, PropertyListT props = {}) {
9459 return detail::load_2d_impl<T, BlockWidth, BlockHeight, NBlocks, Transposed,
9460 Transformed, PropertyListT>(
9461 Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
9493 template <
typename T,
int BlockWidth,
int BlockHeight = 1,
int NBlocks = 1,
9495 T, NBlocks, BlockHeight, BlockWidth,
false ,
9498 __ESIMD_API std::enable_if_t<
9499 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
9500 prefetch_2d(
const T *Ptr,
unsigned SurfaceWidth,
unsigned SurfaceHeight,
9501 unsigned SurfacePitch,
int X,
int Y, PropertyListT props = {}) {
9502 detail::prefetch_2d_impl<T, BlockWidth, BlockHeight, NBlocks, PropertyListT>(
9503 Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
9529 template <
typename T,
int BlockWidth,
int BlockHeight = 1,
9531 T, 1u, BlockHeight, BlockWidth,
false ,
9534 __ESIMD_API std::enable_if_t<
9535 ext::oneapi::experimental::is_property_list_v<PropertyListT>>
9536 store_2d(T *Ptr,
unsigned SurfaceWidth,
unsigned SurfaceHeight,
9537 unsigned SurfacePitch,
int X,
int Y,
simd<T, N> Vals,
9538 PropertyListT props = {}) {
9539 detail::store_2d_impl<T, BlockWidth, BlockHeight, PropertyListT>(
9540 Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y, Vals);
9568 typename AccessorT,
int N,
9571 std::enable_if_t<detail::is_local_accessor_with_v<
9572 AccessorT, detail::accessor_mode_cap::can_read>,
9576 return slm_gather_rgba<T, N, RGBAMask>(
9577 offsets + global_offset + __ESIMD_DNS::localAccessorToOffset(acc), mask);
9597 typename AccessorT,
int N,
9599 __ESIMD_API std::enable_if_t<detail::is_local_accessor_with_v<
9600 AccessorT, detail::accessor_mode_cap::can_write>>
9604 detail::validate_rgba_write_channel_mask<RGBAMask>();
9605 slm_scatter_rgba<T, N, RGBAMask>(offsets + global_offset +
9606 __ESIMD_DNS::localAccessorToOffset(acc),
9633 template <uint8_t exec_size, uint8_t sfid, uint8_t num_src0, uint8_t num_src1,
9636 typename T2,
int n2,
typename T3,
int n3>
9637 __ESIMD_API __ESIMD_NS::simd<T1, n1>
9638 raw_sends(__ESIMD_NS::simd<T1, n1> msg_dst, __ESIMD_NS::simd<T2, n2> msg_src0,
9639 __ESIMD_NS::simd<T3, n3> msg_src1, uint32_t ex_desc,
9640 uint32_t msg_desc, __ESIMD_NS::simd_mask<exec_size> mask = 1) {
9641 constexpr
unsigned _Width1 = n1 *
sizeof(T1);
9642 static_assert(_Width1 % 32 == 0,
"Invalid size for raw send rspVar");
9643 constexpr
unsigned _Width2 = n2 *
sizeof(T2);
9644 static_assert(_Width2 % 32 == 0,
"Invalid size for raw send msg_src0");
9645 constexpr
unsigned _Width3 = n3 *
sizeof(T3);
9646 static_assert(_Width3 % 32 == 0,
"Invalid size for raw send msg_src1");
9648 using ElemT1 = __ESIMD_DNS::__raw_t<T1>;
9649 using ElemT2 = __ESIMD_DNS::__raw_t<T2>;
9650 using ElemT3 = __ESIMD_DNS::__raw_t<T3>;
9652 constexpr uint8_t modifier =
9655 return __esimd_raw_sends2<ElemT1, n1, ElemT2, n2, ElemT3, n3, exec_size>(
9656 modifier, exec_size, mask.
data(), num_src0, num_src1, num_dst, sfid,
9657 ex_desc, msg_desc, msg_src0.data(), msg_src1.data(), msg_dst.data());
9677 template <uint8_t exec_size, uint8_t sfid, uint8_t num_src0, uint8_t num_dst,
9680 typename T2,
int n2>
9681 __ESIMD_API __ESIMD_NS::simd<T1, n1>
9682 raw_send(__ESIMD_NS::simd<T1, n1> msg_dst, __ESIMD_NS::simd<T2, n2> msg_src0,
9683 uint32_t ex_desc, uint32_t msg_desc,
9684 __ESIMD_NS::simd_mask<exec_size> mask = 1) {
9685 constexpr
unsigned _Width1 = n1 *
sizeof(T1);
9686 static_assert(_Width1 % 32 == 0,
"Invalid size for raw send rspVar");
9687 constexpr
unsigned _Width2 = n2 *
sizeof(T2);
9688 static_assert(_Width2 % 32 == 0,
"Invalid size for raw send msg_src0");
9690 using ElemT1 = __ESIMD_DNS::__raw_t<T1>;
9691 using ElemT2 = __ESIMD_DNS::__raw_t<T2>;
9693 constexpr uint8_t modifier =
9695 return __esimd_raw_send2<ElemT1, n1, ElemT2, n2, exec_size>(
9696 modifier, exec_size, mask.
data(), num_src0, num_dst, sfid, ex_desc,
9697 msg_desc, msg_src0.data(), msg_dst.data());
9717 template <uint8_t exec_size, uint8_t sfid, uint8_t num_src0, uint8_t num_src1,
9720 typename T2,
int n2>
9721 __ESIMD_API
void raw_sends(__ESIMD_NS::simd<T1, n1> msg_src0,
9722 __ESIMD_NS::simd<T2, n2> msg_src1, uint32_t ex_desc,
9724 __ESIMD_NS::simd_mask<exec_size> mask = 1) {
9725 constexpr
unsigned _Width1 = n1 *
sizeof(T1);
9726 static_assert(_Width1 % 32 == 0,
"Invalid size for raw send msg_src0");
9727 constexpr
unsigned _Width2 = n2 *
sizeof(T2);
9728 static_assert(_Width2 % 32 == 0,
"Invalid size for raw send msg_src1");
9730 using ElemT1 = __ESIMD_DNS::__raw_t<T1>;
9731 using ElemT2 = __ESIMD_DNS::__raw_t<T2>;
9733 constexpr uint8_t modifier =
9735 __esimd_raw_sends2_noresult<ElemT1, n1, ElemT2, n2, exec_size>(
9736 modifier, exec_size, mask.
data(), num_src0, num_src1, sfid, ex_desc,
9737 msg_desc, msg_src0.data(), msg_src1.data());
9755 template <uint8_t exec_size, uint8_t sfid, uint8_t num_src0,
9758 __ESIMD_API
void raw_send(__ESIMD_NS::simd<T1, n1> msg_src0, uint32_t ex_desc,
9760 __ESIMD_NS::simd_mask<exec_size> mask = 1) {
9761 constexpr
unsigned _Width1 = n1 *
sizeof(T1);
9762 static_assert(_Width1 % 32 == 0,
"Invalid size for raw send msg_src0");
9763 using ElemT1 = __ESIMD_DNS::__raw_t<T1>;
9764 constexpr uint8_t modifier =
9766 __esimd_raw_send2_noresult<ElemT1, n1, exec_size>(
9767 modifier, exec_size, mask.
data(), num_src0, sfid, ex_desc, msg_desc,
9780 template <
typename T,
int N,
class T1,
class SFINAE>
9781 template <
typename Flags,
int ChunkSize,
typename>
9784 Flags) SYCL_ESIMD_FUNCTION {
9786 constexpr
unsigned Size =
sizeof(T) * N;
9787 constexpr
unsigned Align = Flags::template alignment<T1>;
9789 constexpr
unsigned BlockSize = OperandSize::OWORD * 8;
9790 constexpr
unsigned NumBlocks = Size / BlockSize;
9791 constexpr
unsigned RemSize = Size % BlockSize;
9793 if constexpr (Align >= OperandSize::DWORD && Size % OperandSize::OWORD == 0 &&
9795 if constexpr (NumBlocks > 0) {
9796 constexpr
unsigned BlockN = BlockSize /
sizeof(T);
9797 ForHelper<NumBlocks>::unroll([BlockN, Addr,
this](
unsigned Block) {
9798 select<BlockN, 1>(Block * BlockN) =
9799 block_load<UT, BlockN, Flags>(Addr + (Block * BlockN), Flags{});
9802 if constexpr (RemSize > 0) {
9803 constexpr
unsigned RemN = RemSize /
sizeof(T);
9804 constexpr
unsigned BlockN = BlockSize /
sizeof(T);
9805 select<RemN, 1>(NumBlocks * BlockN) =
9806 block_load<UT, RemN, Flags>(Addr + (NumBlocks * BlockN), Flags{});
9808 }
else if constexpr (
sizeof(T) == 8) {
9810 bit_cast_view<int32_t>() = BC;
9812 constexpr
unsigned NumChunks = N / ChunkSize;
9813 if constexpr (NumChunks > 0) {
9815 ForHelper<NumChunks>::unroll([Addr, &Offsets,
this](
unsigned Block) {
9816 select<ChunkSize, 1>(Block * ChunkSize) =
9817 gather<UT, ChunkSize>(Addr + (Block * ChunkSize), Offsets);
9820 constexpr
unsigned RemN = N % ChunkSize;
9821 if constexpr (RemN > 0) {
9822 if constexpr (RemN == 1) {
9823 select<1, 1>(NumChunks * ChunkSize) = Addr[NumChunks * ChunkSize];
9824 }
else if constexpr (RemN == 8 || RemN == 16) {
9826 select<RemN, 1>(NumChunks * ChunkSize) =
9827 gather<UT, RemN>(Addr + (NumChunks * ChunkSize), Offsets);
9829 constexpr
int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
9830 simd_mask_type<N1> Pred(0);
9831 Pred.template select<RemN, 1>() = 1;
9834 gather<UT, N1>(Addr + (NumChunks * ChunkSize), Offsets, Pred);
9835 select<RemN, 1>(NumChunks * ChunkSize) =
9836 Vals.template select<RemN, 1>();
9842 template <
typename T,
int N,
class T1,
class SFINAE>
9843 template <
int ChunkSize,
typename Flags,
typename AccessorT,
typename TOffset>
9844 ESIMD_INLINE
void simd_obj_impl<T, N, T1, SFINAE>::copy_to_impl(
9845 AccessorT acc, TOffset offset)
const SYCL_ESIMD_FUNCTION {
9847 constexpr
unsigned Size =
sizeof(T) * N;
9848 constexpr
unsigned Align = Flags::template alignment<T1>;
9850 constexpr
unsigned BlockSize = OperandSize::OWORD * 8;
9851 constexpr
unsigned NumBlocks = Size / BlockSize;
9852 constexpr
unsigned RemSize = Size % BlockSize;
9855 if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 &&
9857 if constexpr (NumBlocks > 0) {
9858 constexpr
unsigned BlockN = BlockSize /
sizeof(T);
9859 ForHelper<NumBlocks>::unroll([BlockN, acc, offset, &Tmp](
unsigned Block) {
9860 block_store<UT, BlockN, AccessorT>(
9861 acc, offset + (Block * BlockSize),
9862 Tmp.template select<BlockN, 1>(Block * BlockN));
9865 if constexpr (RemSize > 0) {
9866 constexpr
unsigned RemN = RemSize /
sizeof(T);
9867 constexpr
unsigned BlockN = BlockSize /
sizeof(T);
9868 block_store<UT, RemN, AccessorT>(
9869 acc, offset + (NumBlocks * BlockSize),
9870 Tmp.template select<RemN, 1>(NumBlocks * BlockN));
9872 }
else if constexpr (
sizeof(T) == 8) {
9874 BC.
copy_to(acc, offset, Flags{});
9876 constexpr
unsigned NumChunks = N / ChunkSize;
9877 if constexpr (NumChunks > 0) {
9879 ForHelper<NumChunks>::unroll([acc, offset, &Offsets,
9880 &Tmp](
unsigned Block) {
9881 scatter<UT, ChunkSize, AccessorT>(
9882 acc, Offsets, Tmp.template select<ChunkSize, 1>(Block * ChunkSize),
9883 offset + (Block * ChunkSize *
sizeof(T)));
9886 constexpr
unsigned RemN = N % ChunkSize;
9887 if constexpr (RemN > 0) {
9888 if constexpr (RemN == 1 || RemN == 8 || RemN == 16) {
9890 scatter<UT, RemN, AccessorT>(
9891 acc, Offsets, Tmp.template select<RemN, 1>(NumChunks * ChunkSize),
9892 offset + (NumChunks * ChunkSize *
sizeof(T)));
9894 constexpr
int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
9895 simd_mask_type<N1> Pred(0);
9896 Pred.template select<RemN, 1>() = 1;
9898 Vals.template select<RemN, 1>() =
9899 Tmp.template select<RemN, 1>(NumChunks * ChunkSize);
9901 scatter<UT, N1, AccessorT>(acc, Offsets, Vals,
9902 offset + (NumChunks * ChunkSize *
sizeof(T)),
9909 template <
typename T,
int N,
class T1,
class SFINAE>
9910 template <
int ChunkSize,
typename Flags,
typename AccessorT,
typename TOffset>
9911 ESIMD_INLINE
void simd_obj_impl<T, N, T1, SFINAE>::copy_from_impl(
9912 AccessorT acc, TOffset offset) SYCL_ESIMD_FUNCTION {
9914 static_assert(
sizeof(UT) ==
sizeof(T));
9915 constexpr
unsigned Size =
sizeof(T) * N;
9916 constexpr
unsigned Align = Flags::template alignment<T1>;
9918 constexpr
unsigned BlockSize = OperandSize::OWORD * 8;
9919 constexpr
unsigned NumBlocks = Size / BlockSize;
9920 constexpr
unsigned RemSize = Size % BlockSize;
9922 if constexpr (Align >= OperandSize::DWORD && Size % OperandSize::OWORD == 0 &&
9924 if constexpr (NumBlocks > 0) {
9925 constexpr
unsigned BlockN = BlockSize /
sizeof(T);
9926 ForHelper<NumBlocks>::unroll([BlockN, acc, offset,
this](
unsigned Block) {
9927 select<BlockN, 1>(Block * BlockN) =
9928 block_load<UT, BlockN, AccessorT, Flags>(
9929 acc, offset + (Block * BlockSize), Flags{});
9932 if constexpr (RemSize > 0) {
9933 constexpr
unsigned RemN = RemSize /
sizeof(T);
9934 constexpr
unsigned BlockN = BlockSize /
sizeof(T);
9935 select<RemN, 1>(NumBlocks * BlockN) =
9936 block_load<UT, RemN, AccessorT, Flags>(
9937 acc, offset + (NumBlocks * BlockSize), Flags{});
9939 }
else if constexpr (
sizeof(T) == 8) {
9941 bit_cast_view<int32_t>() = BC;
9943 constexpr
unsigned NumChunks = N / ChunkSize;
9944 if constexpr (NumChunks > 0) {
9946 ForHelper<NumChunks>::unroll(
9947 [acc, offset, &Offsets,
this](
unsigned Block) {
9948 select<ChunkSize, 1>(Block * ChunkSize) =
9949 gather<UT, ChunkSize, AccessorT>(
9950 acc, Offsets, offset + (Block * ChunkSize *
sizeof(T)));
9953 constexpr
unsigned RemN = N % ChunkSize;
9954 if constexpr (RemN > 0) {
9955 if constexpr (RemN == 1 || RemN == 8 || RemN == 16) {
9957 select<RemN, 1>(NumChunks * ChunkSize) = gather<UT, RemN, AccessorT>(
9958 acc, Offsets, offset + (NumChunks * ChunkSize *
sizeof(T)));
9960 constexpr
int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
9961 simd_mask_type<N1> Pred(0);
9962 Pred.template select<RemN, 1>() = 1;
9965 acc, Offsets, offset + (NumChunks * ChunkSize *
sizeof(T)), Pred);
9966 select<RemN, 1>(NumChunks * ChunkSize) =
9967 Vals.template select<RemN, 1>();
9973 template <
typename T,
int N,
class T1,
class SFINAE>
9974 template <
typename AccessorT,
typename Flags,
int ChunkSize,
typename>
9975 ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_read, void>
9978 Flags) SYCL_ESIMD_FUNCTION {
9980 copy_from_impl<ChunkSize, Flags>(acc, offset);
9983 template <
typename T,
int N,
class T1,
class SFINAE>
9984 template <
typename AccessorT,
typename Flags,
int ChunkSize,
typename>
9985 ESIMD_INLINE std::enable_if_t<
9986 detail::is_local_accessor_with_v<AccessorT, accessor_mode_cap::can_read>,
9989 Flags) SYCL_ESIMD_FUNCTION {
9991 copy_from_impl<ChunkSize, Flags>(acc, offset);
9994 template <
typename T,
int N,
class T1,
class SFINAE>
9995 template <
typename Flags,
int ChunkSize,
typename>
9998 Flags)
const SYCL_ESIMD_FUNCTION {
10000 constexpr
unsigned Size =
sizeof(T) * N;
10001 constexpr
unsigned Align = Flags::template alignment<T1>;
10003 constexpr
unsigned BlockSize = OperandSize::OWORD * 8;
10004 constexpr
unsigned NumBlocks = Size / BlockSize;
10005 constexpr
unsigned RemSize = Size % BlockSize;
10008 if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 &&
10010 if constexpr (NumBlocks > 0) {
10011 constexpr
unsigned BlockN = BlockSize /
sizeof(T);
10012 ForHelper<NumBlocks>::unroll([BlockN, Addr, &Tmp](
unsigned Block) {
10013 block_store<UT, BlockN>(Addr + (Block * BlockN),
10014 Tmp.template select<BlockN, 1>(Block * BlockN));
10017 if constexpr (RemSize > 0) {
10018 constexpr
unsigned RemN = RemSize /
sizeof(T);
10019 constexpr
unsigned BlockN = BlockSize /
sizeof(T);
10020 block_store<UT, RemN>(Addr + (NumBlocks * BlockN),
10021 Tmp.template select<RemN, 1>(NumBlocks * BlockN));
10023 }
else if constexpr (
sizeof(T) == 8) {
10025 BC.
copy_to(
reinterpret_cast<int32_t *
>(Addr), Flags{});
10027 constexpr
unsigned NumChunks = N / ChunkSize;
10028 if constexpr (NumChunks > 0) {
10030 ForHelper<NumChunks>::unroll([Addr, &Offsets, &Tmp](
unsigned Block) {
10031 scatter<UT, ChunkSize>(
10032 Addr + (Block * ChunkSize), Offsets,
10033 Tmp.template select<ChunkSize, 1>(Block * ChunkSize));
10036 constexpr
unsigned RemN = N % ChunkSize;
10037 if constexpr (RemN > 0) {
10038 if constexpr (RemN == 1) {
10039 Addr[NumChunks * ChunkSize] = Tmp[NumChunks * ChunkSize];
10040 }
else if constexpr (RemN == 8 || RemN == 16) {
10044 if constexpr (
sizeof(T) == 1 && RemN == 16) {
10045 if constexpr (Align % OperandSize::DWORD > 0) {
10046 ForHelper<RemN>::unroll([Addr, &Tmp](
unsigned Index) {
10047 Addr[Index + NumChunks * ChunkSize] =
10048 Tmp[Index + NumChunks * ChunkSize];
10051 simd_mask_type<8> Pred(0);
10053 Pred.template select<4, 1>() = 1;
10054 Vals.template select<4, 1>() =
10055 Tmp.template bit_cast_view<int32_t>().template select<4, 1>(
10056 NumChunks * ChunkSize);
10059 scatter<int32_t, 8>(
10060 reinterpret_cast<int32_t *
>(Addr + (NumChunks * ChunkSize)),
10061 Offsets, Vals, Pred);
10066 Addr + (NumChunks * ChunkSize), Offsets,
10067 Tmp.template select<RemN, 1>(NumChunks * ChunkSize));
10070 constexpr
int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
10071 simd_mask_type<N1> Pred(0);
10072 Pred.template select<RemN, 1>() = 1;
10074 Vals.template select<RemN, 1>() =
10075 Tmp.template select<RemN, 1>(NumChunks * ChunkSize);
10077 scatter<UT, N1>(Addr + (NumChunks * ChunkSize), Offsets, Vals, Pred);
10083 template <
typename T,
int N,
class T1,
class SFINAE>
10084 template <
typename AccessorT,
typename Flags,
int ChunkSize,
typename>
10085 ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_write, void>
10088 Flags)
const SYCL_ESIMD_FUNCTION {
10089 copy_to_impl<ChunkSize, Flags>(acc, offset);
10092 template <
typename T,
int N,
class T1,
class SFINAE>
10093 template <
typename AccessorT,
typename Flags,
int ChunkSize,
typename>
10094 ESIMD_INLINE std::enable_if_t<
10095 detail::is_local_accessor_with_v<AccessorT, accessor_mode_cap::can_write>,
10098 Flags)
const SYCL_ESIMD_FUNCTION {
10099 copy_to_impl<ChunkSize, Flags>(acc, offset);
const auto & data() const noexcept
std::enable_if_t< __vectorizable< _Up >) &&is_simd_flag_type< _Flags >::value > copy_to(_Up *__buffer, _Flags) const
get_vector_element_type< Derived > element_type
Element type of the derived (user) class.
raw_vector_type data() const
ESIMD_INLINE void copy_from(const Ty *addr, Flags={}) SYCL_ESIMD_FUNCTION
Copy a contiguous block of data from memory into this simd_obj_impl object.
ESIMD_INLINE void copy_to(Ty *addr, Flags={}) const SYCL_ESIMD_FUNCTION
Copy all vector elements of this object into a contiguous block in memory.
The main simd vector class.
typename base_type::raw_vector_type raw_vector_type
#define __ESIMD_FP_ATOMIC_OP_TYPE_CHECK(T)
raw_send_eot
Specify if end of thread should be set.
rgba_channel_mask
Represents a pixel's channel mask - all possible combinations of enabled channels.
raw_send_sendc
Specify if sendc should be used.
unsigned int SurfaceIndex
Surface index type.
constexpr int get_num_channels_enabled(rgba_channel_mask M)
atomic_op
Represents an atomic operation.
@ fsub
ACM/PVC: Subtraction (floating point): *addr = *addr - src0.
@ fmax
ACM/PVC: Minimum (floating point): *addr = min(*addr, src0).
@ fadd
ACM/PVC: Addition (floating point): *addr = *addr + src0.
@ xchg
Exchange. *addr == src0;
@ fmin
ACM/PVC: Maximum (floating point): *addr = max(*addr, src0).
@ fcmpxchg
ACM/PVC: Compare and exchange (floating point).
__ESIMD_API SZ simd< T, SZ > src1
__ESIMD_API std::enable_if_t< ext::oneapi::experimental::is_property_list_v< PropertyListT >, simd< T, N > > block_load(const T *ptr, PropertyListT props={})
Each of the following block load functions loads a contiguous memory block from the address reference...
__ESIMD_API std::enable_if_t< __ESIMD_DNS::get_num_args< Op >)==0 &&__ESIMD_DNS::is_rw_local_accessor_v< AccessorT >, simd< T, N > > atomic_update(AccessorT lacc, simd< uint32_t, N > byte_offset, simd_mask< N > mask=1)
simd<T, N> atomic_update(local_accessor lacc, simd<uint32_t, N> byte_offset, simd_mask<N> pred = 1); ...
__ESIMD_API std::enable_if_t<(N==8||N==16||N==32) &&(sizeof(T)==4)> slm_scatter_rgba(simd< uint32_t, N > offsets, simd< T, N *get_num_channels_enabled(Mask)> vals, simd_mask< N > mask=1)
Gather data from the Shared Local Memory at specified offsets and return it as simd vector.
__ESIMD_API T slm_scalar_load(uint32_t offset)
Load a scalar value from the Shared Local Memory.
__ESIMD_API std::enable_if_t< is_simd_flag_type_v< Flags >, simd< T, N > > slm_block_load(uint32_t byte_offset, Flags)
Loads a contiguous block of SLM memory referenced by the given byte-offset offset,...
__ESIMD_API std::enable_if_t< ext::oneapi::experimental::is_property_list_v< PropertyListT > > slm_scatter(simd< uint32_t, N/VS > byte_offsets, simd< T, N > vals, simd_mask< N/VS > mask, PropertyListT props={})
template <typename T, int N, int VS = 1, typename PropertyListT = empty_properties_t> void slm_scatte...
__ESIMD_API std::enable_if_t< is_simd_flag_type_v< Flags > > slm_block_store(uint32_t offset, simd< T, N > vals, Flags)
Stores elements of the vector vals to a contiguous block of SLM memory at the given byte-offset offse...
__ESIMD_API std::enable_if_t< __ESIMD_DNS::get_num_args< Op >)==0, simd< T, N > > slm_atomic_update(simd< uint32_t, N > byte_offset, simd_mask< N > mask=1)
__ESIMD_API void slm_init()
Declare per-work-group slm size.
__ESIMD_API std::enable_if_t<(N==8||N==16||N==32) &&(sizeof(T)==4), simd< T, N *get_num_channels_enabled(RGBAMask)> > slm_gather_rgba(simd< uint32_t, N > offsets, simd_mask< N > mask=1)
Gather data from the Shared Local Memory at specified offsets and return it as simd vector.
__ESIMD_API std::enable_if_t< ext::oneapi::experimental::is_property_list_v< PropertyListT >, simd< T, N > > slm_gather(simd< uint32_t, N/VS > byte_offsets, simd_mask< N/VS > mask, simd< T, N > pass_thru, PropertyListT props={})
template <typename T, int N, int VS, typename PropertyListT = empty_properties_t> simd<T,...
__ESIMD_API void slm_scalar_store(uint32_t offset, T val)
Store a scalar value into the Shared Local Memory.
__ESIMD_API simd< T, N *get_num_channels_enabled(RGBAMask)> gather_rgba(const T *p, simd< Toffset, N > offsets, simd_mask< N > mask=1)
Gather and transpose pixels from given memory locations defined by the base pointer p and offsets.
__ESIMD_API T scalar_load(AccessorTy acc, detail::DeviceAccessorOffsetT offset)
Load a scalar value from an accessor.
__ESIMD_API std::enable_if_t< is_simd_flag_type_v< Flags > > block_store(Tx *addr, simd< Tx, N > vals, Flags)
Stores elements of the vector vals to a contiguous block of memory at the given address addr.
__ESIMD_API std::enable_if_t< ext::oneapi::experimental::is_property_list_v< PropertyListT > > store_2d(T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, int X, int Y, simd< T, N > Vals, PropertyListT props={})
2D USM pointer block store.
__ESIMD_API void scatter_rgba(T *p, simd< Toffset, N > offsets, simd< T, N *get_num_channels_enabled(RGBAMask)> vals, simd_mask< N > mask=1)
Transpose and scatter pixels to given memory locations defined by the base pointer p and offsets.
__ESIMD_API std::enable_if_t< ext::oneapi::experimental::is_property_list_v< PropertyListT > > prefetch_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, int X, int Y, PropertyListT props={})
template <typename T, int BlockWidth, int BlockHeight = 1, int NBlocks = 1, int N = detail::get_lsc_b...
__ESIMD_API std::enable_if_t< ext::oneapi::experimental::is_property_list_v< PropertyListT > > scatter(T *p, simd< OffsetT, N/VS > byte_offsets, simd< T, N > vals, simd_mask< N/VS > mask, PropertyListT props={})
template <typename T, int N, int VS = 1, typename OffsetT, typename PropertyListT = empty_properties_...
__ESIMD_API void fence()
esimd::fence sets the memory read/write order.
__ESIMD_API void scalar_store(AccessorTy acc, detail::DeviceAccessorOffsetT offset, T val)
Store a scalar value into an accessor.
__ESIMD_API void media_block_store(AccessorTy acc, unsigned x, unsigned y, simd< T, m *N > vals)
Media block store.
__ESIMD_API std::enable_if_t< detail::is_device_accessor_with_v< AccessorT, detail::accessor_mode_cap::can_read > &&ext::oneapi::experimental::is_property_list_v< PropertyListT > > prefetch(AccessorT acc, PropertyListT props={})
template <typename T, int VS = 1, typename AccessorT, typename PropertyListT = empty_properties_t> vo...
fence_mask
Represetns a bit mask to control behavior of esimd::fence.
__ESIMD_API std::enable_if_t< ext::oneapi::experimental::is_property_list_v< PropertyListT >, simd< T, N > > load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, int X, int Y, PropertyListT props={})
template <typename T, int BlockWidth, int BlockHeight = 1, int NBlocks = 1, bool Transposed = false,...
__ESIMD_API SurfaceIndex get_surface_index(AccessorTy acc)
Get surface index corresponding to a SYCL accessor.
__ESIMD_API std::enable_if_t< ext::oneapi::experimental::is_property_list_v< PropertyListT > > prefetch(const T *p, simd< OffsetT, N/VS > byte_offsets, simd_mask< N/VS > mask, PropertyListT props={})
template <typename T, int N, int VS, typename OffsetT, typename PropertyListT = empty_properties_t> v...
__ESIMD_API simd< T, m *N > media_block_load(AccessorTy acc, unsigned x, unsigned y)
Media block load.
__ESIMD_API std::enable_if_t< ext::oneapi::experimental::is_property_list_v< PropertyListT >, simd< T, N > > gather(const T *p, simd< OffsetT, N/VS > byte_offsets, simd_mask< N/VS > mask, simd< T, N > pass_thru, PropertyListT props={})
template <typename T, int N, int VS, typename OffsetT, typename PropertyListT = empty_properties_t> s...
__ESIMD_API void barrier()
Generic work-group barrier.
@ global_coherent_fence
“Commit enable” - wait for fence to complete before continuing.
@ l2_flush_constant_data
Flush constant cache.
@ local_barrier
Issue SLM memory barrier only. If not set, the memory barrier is global.
@ l1_flush_ro_data
Flush L1 read - only data cache.
@ l2_flush_rw_data
Flush constant cache.
@ __SYCL_DEPRECATED
Creates a software (compiler) barrier, which does not generate any instruction and only prevents inst...
@ l2_flush_texture_data
Flush sampler (texture) cache.
@ l2_flush_instructions
Flush the instruction cache.
__ESIMD_API sycl::ext::intel::esimd::simd< T1, n1 > raw_send(sycl::ext::intel::esimd::simd< T1, n1 > msg_dst, sycl::ext::intel::esimd::simd< T2, n2 > msg_src0, uint32_t ex_desc, uint32_t msg_desc, sycl::ext::intel::esimd::simd_mask< exec_size > mask=1)
Raw send.
__ESIMD_API sycl::ext::intel::esimd::simd< T1, n1 > raw_sends(sycl::ext::intel::esimd::simd< T1, n1 > msg_dst, sycl::ext::intel::esimd::simd< T2, n2 > msg_src0, sycl::ext::intel::esimd::simd< T3, n3 > msg_src1, uint32_t ex_desc, uint32_t msg_desc, sycl::ext::intel::esimd::simd_mask< exec_size > mask=1)
Raw sends.
void add(const void *DeviceGlobalPtr, const char *UniqueId)
ESIMD_INLINE simd< T, N > lsc_format_ret(simd< T1, N > Vals)
__ESIMD_API std::enable_if_t< get_num_args< Op >)==0, simd< T, N > > slm_atomic_update_impl(simd< uint32_t, N > offsets, simd_mask< N > pred)
SLM atomic.
constexpr bool isMaskedGatherScatterLLVMAvailable()
static constexpr SurfaceIndex SLM_BTI
constexpr cache_hint getCacheHintForIntrin()
Extracts a cache hint with the given 'Level' to pass it to ESIMD/GENX intrinsics.
constexpr void check_atomic()
Check the legality of an atomic call in terms of size and type.
static void validate_rgba_write_channel_mask()
__ESIMD_API simd< T, N > slm_atomic_update_impl(simd< uint32_t, N > offsets, simd< T, N > src0, simd< T, N > src1, simd_mask< N > pred)
SLM atomic.
__ESIMD_API std::enable_if_t< detail::is_property_list_v< PropertyListT > > block_store_impl(T *p, simd< T, NElts > vals, simd_mask< 1 > pred)
__ESIMD_API simd< T, N *NElts > gather_impl(const T *p, simd< OffsetT, N > offsets, simd_mask< N > pred, simd< T, N *NElts > pass_thru)
USM pointer gather.
lsc_data_size
Data size or format to read or store.
__ESIMD_API void scatter_impl(T *p, simd< Toffset, N > offsets, simd< T, N *NElts > vals, simd_mask< N > pred)
USM pointer scatter.
ESIMD_INLINE simd< RT, N > lsc_format_input(simd< T, N > Vals)
constexpr int lsc_to_internal_atomic_op()
__ESIMD_API std::enable_if_t< get_num_args< Op >)==0, simd< T, N > > atomic_update_impl(T *p, simd< Toffset, N > offsets, simd_mask< N > pred)
USM pointer atomic.
constexpr ESIMD_INLINE bool isPowerOf2(unsigned int n)
Check if a given 32 bit positive integer is a power of 2 at compile time.
__ESIMD_API std::enable_if_t< is_property_list_v< PropertyListT >, simd< T, NElts > > block_load_impl(const T *p, simd_mask< 1 > pred, simd< T, NElts > pass_thru)
USM pointer transposed gather with 1 channel.
constexpr lsc_data_size expand_data_size(lsc_data_size DS)
uint32_t DeviceAccessorOffsetT
constexpr alignment_key::value_t< K > alignment
cache_hint
L1, L2 or L3 cache hints.
fence_scope
The scope that fence() operation should apply to.
@ group
Wait until all previous memory transactions from this thread are observed within the local thread-gro...
fence_flush_op
The cache flush operation to apply to caches after fence() is complete.
memory_kind
The target memory kind for fence() operation.
@ local
image (also known as typed global memory)
constexpr int get_lsc_block_2d_data_size()
prefetch_impl< _B > prefetch
void prefetch_impl(T *ptr, size_t bytes, Properties properties)
properties< std::tuple<> > empty_properties_t
decltype(properties{}) empty_properties_t
std::enable_if_t< detail::is_bf16_storage_type< T >::value, T > fmax(T x, T y)
std::enable_if_t< detail::is_bf16_storage_type< T >::value, T > fmin(T x, T y)
std::bit_xor< T > bit_xor
std::bit_and< T > bit_and
std::enable_if_t< sizeof(To)==sizeof(From) &&std::is_trivially_copyable< From >::value &&std::is_trivially_copyable< To >::value, To > bit_cast(const From &from) noexcept
constexpr stream_manipulator dec
std::conditional_t< sizeof(T)<=4, std::conditional_t< std::is_signed_v< T >, int32_t, uint32_t >, std::conditional_t< std::is_signed_v< T >, int64_t, uint64_t > > type