namespace ext::intel::esimd {

struct LocalAccessorMarker {};

template <typename AccessorTy>
  if constexpr (std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
                sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>) {
    return __esimd_get_surface_index(
        detail::AccessorPrivateProxy::getQualifiedPtrOrImageObj(acc));

template <typename Tx, int N, typename Toffset>
  using T = detail::__raw_t<Tx>;
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
  addrs = addrs + offsets_i;
  if constexpr (sizeof(T) == 1) {
    auto Ret = __esimd_svm_gather<T, N, detail::ElemsPerAddrEncoding<4>(),
                                  detail::ElemsPerAddrEncoding<1>()>(
        addrs.data(), mask.data());
    return __esimd_rdregion<T, N * 4, N, 0, N, 4>(Ret, 0);
  } else if constexpr (sizeof(T) == 2) {
    auto Ret = __esimd_svm_gather<T, N, detail::ElemsPerAddrEncoding<2>(),
                                  detail::ElemsPerAddrEncoding<2>()>(
        addrs.data(), mask.data());
    return __esimd_rdregion<T, N * 2, N, 0, N, 2>(Ret, 0);
  } else {
    return __esimd_svm_gather<T, N, detail::ElemsPerAddrEncoding<1>(),
                              detail::ElemsPerAddrEncoding<1>()>(addrs.data(),

template <typename Tx, int N, typename Toffset,
          typename RegionTy = region1d_t<Toffset, N, 1>>
  return gather<Tx, N>(p, offsets.read(), mask);
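// Usage sketch (illustrative only, not part of the original header): reads 16
// floats from USM memory with per-lane byte offsets. The helper name and the
// calling context (an ESIMD kernel, `ptr` pointing to device-accessible
// memory) are assumptions.
inline simd<float, 16> example_gather_floats(const float *ptr) {
  // Byte offsets of 16 consecutive floats: 0, 4, 8, ..., 60.
  simd<uint32_t, 16> byte_offsets(0, sizeof(float));
  // All lanes enabled by default; a simd_mask argument can disable lanes.
  return gather<float, 16>(ptr, byte_offsets);
}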
template <typename Tx, int N, typename Toffset>
__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>, simd<Tx, N>>

template <typename Tx, int N, typename Toffset>
  using T = detail::__raw_t<Tx>;
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
  addrs = addrs + offsets_i;
  if constexpr (sizeof(T) == 1) {
    D = __esimd_wrregion<T, N * 4, N, 0, N, 4>(D.data(), vals.data(), 0);
    __esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<4>(),
                        detail::ElemsPerAddrEncoding<1>()>(
        addrs.data(), D.data(), mask.data());
  } else if constexpr (sizeof(T) == 2) {
    D = __esimd_wrregion<T, N * 2, N, 0, N, 2>(D.data(), vals.data(), 0);
    __esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<2>(),
                        detail::ElemsPerAddrEncoding<2>()>(
        addrs.data(), D.data(), mask.data());
  } else {
    __esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<1>(),
                        detail::ElemsPerAddrEncoding<1>()>(
        addrs.data(), vals.data(), mask.data());
  }

template <typename Tx, int N, typename Toffset,
          typename RegionTy = region1d_t<Toffset, N, 1>>
  scatter<Tx, N>(p, offsets.read(), vals, mask);
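// Usage sketch (illustrative only, not part of the original header): writes a
// vector to every other element of a USM buffer by masking off the odd lanes.
// The helper name and calling context are assumptions.
inline void example_masked_scatter(int *ptr, simd<int, 8> vals) {
  simd<uint32_t, 8> byte_offsets(0, sizeof(int)); // elements 0..7
  simd_mask<8> mask(0);
  mask.select<4, 2>(0) = 1; // enable lanes 0, 2, 4, 6
  scatter<int, 8>(ptr, byte_offsets, vals, mask);
}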
template <typename Tx, int N, typename Toffset>
__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> && N == 1>

          class T = detail::__raw_t<Tx>,
          typename = std::enable_if_t<is_simd_flag_type_v<Flags>>>
  constexpr unsigned Sz = sizeof(T) * N;
  static_assert(Sz >= detail::OperandSize::OWORD,
                "block size must be at least 1 oword");
  static_assert(Sz % detail::OperandSize::OWORD == 0,
                "block size must be whole number of owords");
                "block must be 1, 2, 4 or 8 owords long");
  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
                "block size must be at most 8 owords");
  uintptr_t Addr = reinterpret_cast<uintptr_t>(addr);
                detail::OperandSize::OWORD) {
    return __esimd_svm_block_ld<T, N>(Addr);
    return __esimd_svm_block_ld_unaligned<T, N>(Addr);

template <typename Tx, int N, typename AccessorTy,
          typename = std::enable_if_t<is_simd_flag_type_v<Flags>>,
          class T = detail::__raw_t<Tx>>
#ifdef __ESIMD_FORCE_STATELESS_MEM
#ifdef __ESIMD_FORCE_STATELESS_MEM
  return block_load<Tx, N>(__ESIMD_DNS::accessorToPointer<Tx>(acc, offset));
  constexpr unsigned Sz = sizeof(T) * N;
  static_assert(Sz >= detail::OperandSize::OWORD,
                "block size must be at least 1 oword");
  static_assert(Sz % detail::OperandSize::OWORD == 0,
                "block size must be whole number of owords");
                "block must be 1, 2, 4 or 8 owords long");
  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
                "block size must be at most 8 owords");
  auto surf_ind = __esimd_get_surface_index(
      detail::AccessorPrivateProxy::getQualifiedPtrOrImageObj(acc));
                detail::OperandSize::OWORD) {
    return __esimd_oword_ld<T, N>(surf_ind, offset >> 4);
    return __esimd_oword_ld_unaligned<T, N>(surf_ind, offset);

template <typename Tx, int N, class T = detail::__raw_t<Tx>>
  constexpr unsigned Sz = sizeof(T) * N;
  static_assert(Sz >= detail::OperandSize::OWORD,
                "block size must be at least 1 oword");
  static_assert(Sz % detail::OperandSize::OWORD == 0,
                "block size must be whole number of owords");
                "block must be 1, 2, 4 or 8 owords long");
  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
                "block size must be at most 8 owords");
  uintptr_t Addr = reinterpret_cast<uintptr_t>(p);
  __esimd_svm_block_st<T, N>(Addr, vals.data());
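// Usage sketch (illustrative only, not part of the original header): block
// loads/stores move a contiguous, oword-granular chunk; here 8 floats =
// 32 bytes = 2 owords. `src`/`dst` and the 16-byte alignment of `src` are
// assumptions.
inline void example_block_copy(const float *src, float *dst) {
  simd<float, 8> chunk = block_load<float, 8>(src, vector_aligned);
  block_store<float, 8>(dst, chunk);
}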
template <typename Tx, int N, typename AccessorTy,
          class T = detail::__raw_t<Tx>>
#ifdef __ESIMD_FORCE_STATELESS_MEM
#ifdef __ESIMD_FORCE_STATELESS_MEM
  block_store<Tx, N>(__ESIMD_DNS::accessorToPointer<Tx>(acc, offset), vals);
  constexpr unsigned Sz = sizeof(T) * N;
  static_assert(Sz >= detail::OperandSize::OWORD,
                "block size must be at least 1 oword");
  static_assert(Sz % detail::OperandSize::OWORD == 0,
                "block size must be whole number of owords");
                "block must be 1, 2, 4 or 8 owords long");
  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
                "block size must be at most 8 owords");
  auto surf_ind = __esimd_get_surface_index(
      detail::AccessorPrivateProxy::getQualifiedPtrOrImageObj(acc));
  __esimd_oword_st<T, N>(surf_ind, offset >> 4, vals.data());

template <typename T, int N, typename AccessorTy>
ESIMD_NODEBUG std::enable_if_t<(sizeof(T) <= 4) &&
                               (N == 1 || N == 8 || N == 16 || N == 32) &&
                               !std::is_pointer<AccessorTy>::value>
  constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
  constexpr int16_t scale = 0;
  if constexpr (sizeof(T) < 4) {
    using Tint = std::conditional_t<std::is_integral_v<T>, T,
                                    detail::uint_type_t<sizeof(T)>>;
    using Treal = __raw_t<T>;
    simd<Tint, N> vals_int = bitcast<Tint, Treal, N>(std::move(vals).data());
    using PromoT = typename std::conditional_t<std::is_signed<Tint>::value,
    const simd<PromoT, N> promo_vals = convert<PromoT>(std::move(vals_int));
    __esimd_scatter_scaled<PromoT, N, decltype(si), TypeSizeLog2, scale>(
        mask.data(), si, glob_offset, offsets.data(), promo_vals.data());
    using Treal = __raw_t<T>;
    if constexpr (!std::is_same_v<Treal, T>) {
      __esimd_scatter_scaled<Treal, N, decltype(si), TypeSizeLog2, scale>(
          mask.data(), si, glob_offset, offsets.data(), Values.data());
      __esimd_scatter_scaled<T, N, decltype(si), TypeSizeLog2, scale>(
          mask.data(), si, glob_offset, offsets.data(), vals.data());

template <typename T, int N, typename AccessorTy>
ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t<
    (sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16 || N == 32) &&
        !std::is_pointer<AccessorTy>::value,
  constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
  constexpr uint32_t scale = 0;
  if constexpr (sizeof(T) < 4) {
    using Tint = std::conditional_t<std::is_integral_v<T>, T,
                                    detail::uint_type_t<sizeof(T)>>;
    using Treal = __raw_t<T>;
    static_assert(std::is_integral<Tint>::value,
                  "only integral 1- & 2-byte types are supported");
    using PromoT = typename std::conditional_t<std::is_signed<Tint>::value,
        __esimd_gather_masked_scaled2<PromoT, N, decltype(si), TypeSizeLog2,
                                      scale>(si, glob_offset, offsets.data(),
    auto Res = convert<Tint>(promo_vals);
    if constexpr (!std::is_same_v<Tint, T>) {
      return detail::bitcast<Treal, Tint, N>(Res.data());
    using Treal = __raw_t<T>;
    simd<Treal, N> Res = __esimd_gather_masked_scaled2<Treal, N, decltype(si),
                                                       TypeSizeLog2, scale>(
        si, glob_offset, offsets.data(), mask.data());
    if constexpr (!std::is_same_v<Treal, T>) {
      return Res.template bit_cast_view<T>();

template <typename T, int N, typename AccessorTy, typename Toffset>
__ESIMD_API std::enable_if_t<
    (sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16 || N == 32) &&
        !std::is_pointer<AccessorTy>::value && std::is_integral_v<Toffset>,
#ifdef __ESIMD_FORCE_STATELESS_MEM
              uint64_t glob_offset = 0,
              uint32_t glob_offset = 0,
#ifdef __ESIMD_FORCE_STATELESS_MEM
  return gather<T, N>(__ESIMD_DNS::accessorToPointer<T>(acc, glob_offset),
  return detail::gather_impl<T, N, AccessorTy>(acc, offsets, glob_offset, mask);

template <typename T, int N, typename AccessorTy, typename Toffset>
__ESIMD_API std::enable_if_t<
    (sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16 || N == 32) &&
    !std::is_pointer<AccessorTy>::value && std::is_integral_v<Toffset>>
#ifdef __ESIMD_FORCE_STATELESS_MEM
               uint64_t glob_offset = 0,
               uint32_t glob_offset = 0,
#ifdef __ESIMD_FORCE_STATELESS_MEM
  scatter<T, N>(__ESIMD_DNS::accessorToPointer<T>(acc, glob_offset), offsets,
  detail::scatter_impl<T, N, AccessorTy>(acc, vals, offsets, glob_offset, mask);
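// Usage sketch (illustrative only, not part of the original header): the
// accessor overloads take per-lane byte offsets plus an optional global byte
// offset. `acc` is assumed to be a device accessor to int data obtained from a
// buffer in command-group scope.
template <typename AccT>
inline void example_accessor_gather_scatter(AccT acc) {
  simd<uint32_t, 16> byte_offsets(0, sizeof(int)); // first 16 elements
  simd<int, 16> vals = gather<int, 16>(acc, byte_offsets, 0 /*glob_offset*/);
  vals += 5;
  scatter<int, 16>(acc, byte_offsets, vals, 0 /*glob_offset*/);
}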
template <typename T, typename AccessorTy>

template <typename T, typename AccessorTy>
__ESIMD_API void scalar_store(AccessorTy acc, uint32_t offset, T val) {

          int N, typename Toffset>
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
  static_assert((N == 8 || N == 16 || N == 32), "Unsupported value of N");
  static_assert(sizeof(T) == 4, "Unsupported size of type T");
  addrs = addrs + offsets_i;
  return __esimd_svm_gather4_scaled<detail::__raw_t<T>, N, RGBAMask>(
      addrs.data(), mask.data());

          int N, typename Toffset,
          typename RegionTy = region1d_t<Toffset, N, 1>>
  return gather_rgba<RGBAMask, T, N>(p, offsets.read(), mask);

          int N, typename Toffset>
__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>,

template <typename T, int N, rgba_channel_mask RGBAMask>
__ESIMD_API std::enable_if_t<
    (N == 8 || N == 16 || N == 32) && sizeof(T) == 4,
                 simd<uint32_t, N> offsets,
  return gather_rgba<RGBAMask>(p, offsets, mask);

      (M == CM::ABGR || M == CM::BGR || M == CM::GR || M == CM::R) &&
          "Only ABGR, BGR, GR, R channel masks are valid in write operations");

          int N, typename Toffset>
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
  static_assert((N == 8 || N == 16 || N == 32), "Unsupported value of N");
  static_assert(sizeof(T) == 4, "Unsupported size of type T");
  detail::validate_rgba_write_channel_mask<RGBAMask>();
  addrs = addrs + offsets_i;
  __esimd_svm_scatter4_scaled<detail::__raw_t<T>, N, RGBAMask>(
      addrs.data(), vals.data(), mask.data());

          int N, typename Toffset,
          typename RegionTy = region1d_t<Toffset, N, 1>>
  scatter_rgba<RGBAMask, T, N>(p, offsets.read(), vals, mask);

          int N, typename Toffset>
__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> && N == 1>

template <typename T, int N, rgba_channel_mask RGBAMask>
    enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4>
    scatter_rgba(T *p, simd<uint32_t, N> offsets,
  scatter_rgba<RGBAMask>(p, offsets, vals, mask);
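// Usage sketch (illustrative only, not part of the original header): gathers
// all four channels of 8 RGBA pixels stored as 32-bit floats and writes them
// back. Offsets address the first channel of each pixel; the result holds
// 8 * 4 values grouped channel by channel.
inline void example_rgba_roundtrip(float *pixels) {
  simd<uint32_t, 8> byte_offsets(0, 4 * sizeof(float)); // one pixel per lane
  simd<float, 8 * 4> texels =
      gather_rgba<rgba_channel_mask::ABGR>(pixels, byte_offsets);
  scatter_rgba<rgba_channel_mask::ABGR>(pixels, byte_offsets, texels);
}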
          typename AccessorT, int N,
          typename T = typename AccessorT::value_type>
__ESIMD_API std::enable_if_t<((N == 8 || N == 16 || N == 32) &&
                              sizeof(T) == 4 && !std::is_pointer_v<AccessorT>),
#ifdef __ESIMD_FORCE_STATELESS_MEM
  return gather_rgba<RGBAMask>(
      __ESIMD_DNS::accessorToPointer<T>(acc, global_offset), offsets, mask);
  constexpr uint32_t Scale = 0;
  return __esimd_gather4_masked_scaled2<detail::__raw_t<T>, N, RGBAMask,
                                        decltype(SI), Scale>(
      SI, global_offset, offsets.data(), mask.data());

          typename AccessorT, int N,
          typename T = typename AccessorT::value_type>
__ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4 &&
                             !std::is_pointer_v<AccessorT>>
  detail::validate_rgba_write_channel_mask<RGBAMask>();
#ifdef __ESIMD_FORCE_STATELESS_MEM
  scatter_rgba<RGBAMask>(__ESIMD_DNS::accessorToPointer<T>(acc, global_offset),
                         offsets, vals, mask);
  constexpr uint32_t Scale = 0;
  __esimd_scatter4_scaled<T, N, decltype(SI), RGBAMask, Scale>(
      mask.data(), SI, global_offset, offsets.data(), vals.data());

template <__ESIMD_NS::atomic_op Op, typename T, int N, unsigned NumSrc>
                "Execution size 1, 2, 4, 8, 16, 32 are supported");
  static_assert(NumSrc == __ESIMD_DNS::get_num_args<Op>(),
                "wrong number of operands");
  constexpr bool IsInt2BytePlus =
      std::is_integral_v<T> && (sizeof(T) >= sizeof(uint16_t));
  if constexpr (Op == __ESIMD_NS::atomic_op::xchg ||
                Op == __ESIMD_NS::atomic_op::cmpxchg ||
                Op == __ESIMD_NS::atomic_op::predec ||
                Op == __ESIMD_NS::atomic_op::inc ||
    static_assert(IsInt2BytePlus, "Integral 16-bit or wider type is expected");
                Op == __ESIMD_NS::atomic_op::fadd ||
                Op == __ESIMD_NS::atomic_op::fsub) {
    static_assert((is_type<T, float, sycl::half, double>()),
                  "float, double or sycl::half type is expected");
                Op == __ESIMD_NS::atomic_op::sub ||
                Op == __ESIMD_NS::atomic_op::minsint ||
                Op == __ESIMD_NS::atomic_op::maxsint) {
    static_assert(IsInt2BytePlus, "Integral 16-bit or wider type is expected");
    constexpr bool IsSignedMinmax = (Op == __ESIMD_NS::atomic_op::minsint) ||
                                    (Op == __ESIMD_NS::atomic_op::maxsint);
    if constexpr (IsSignedMinmax || IsUnsignedMinmax) {
      constexpr bool SignOK = std::is_signed_v<T> == IsSignedMinmax;
      static_assert(SignOK, "Signed/unsigned integer type expected for "
                            "signed/unsigned min/max operation");

template <atomic_op Op, typename Tx, int N, typename Toffset>
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
                (Op == atomic_op::fadd) || (Op == atomic_op::fsub)) {
    return atomic_update<detail::to_lsc_atomic_op<Op>(), Tx, N>(p, offset, src0,
  } else if constexpr (Op == atomic_op::store) {
    if constexpr (std::is_integral_v<Tx>) {
      return atomic_update<atomic_op::xchg, Tx, N>(p, offset, src0, mask);
      using Tint = detail::uint_type_t<sizeof(Tx)>;
      simd<Tint, N> Res = atomic_update<atomic_op::xchg, Tint, N>(
          reinterpret_cast<Tint *>(p), offset,
          src0.template bit_cast_view<Tint>(), mask);
      return Res.template bit_cast_view<Tx>();
  detail::check_atomic<Op, Tx, N, 1>();
  using T = typename detail::__raw_t<Tx>;
  return __esimd_svm_atomic1<Op, T, N>(vAddr.data(), src0.data(),

template <atomic_op Op, typename Tx, int N, typename Toffset,
          typename RegionTy = region1d_t<Toffset, N, 1>>
  return atomic_update<Op, Tx, N>(p, offsets.read(), src0, mask);
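// Usage sketch (illustrative only, not part of the original header):
// atomically adds 1 to 8 consecutive 32-bit counters in USM memory. Offsets
// are byte offsets; the returned vector holds the old values.
inline simd<uint32_t, 8> example_atomic_add(uint32_t *counters) {
  simd<uint32_t, 8> byte_offsets(0, sizeof(uint32_t));
  simd<uint32_t, 8> ones(1);
  return atomic_update<atomic_op::add, uint32_t, 8>(counters, byte_offsets,
                                                    ones, 1);
}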
template <atomic_op Op, typename Tx, int N, typename Toffset>
__ESIMD_API std::enable_if_t<
    std::is_integral_v<Toffset> &&
        ((Op != atomic_op::store && Op != atomic_op::xchg) || N == 1),

template <atomic_op Op, typename Tx, int N, typename Toffset>
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
  if constexpr (Op == atomic_op::load) {
    if constexpr (std::is_integral_v<Tx>) {
      return atomic_update<atomic_op::bit_or, Tx, N>(p, offset, simd<Tx, N>(0),
      using Tint = detail::uint_type_t<sizeof(Tx)>;
      simd<Tint, N> Res = atomic_update<atomic_op::bit_or, Tint, N>(
          reinterpret_cast<Tint *>(p), offset, simd<Tint, N>(0), mask);
      return Res.template bit_cast_view<Tx>();
  detail::check_atomic<Op, Tx, N, 0>();
  using T = typename detail::__raw_t<Tx>;
  return __esimd_svm_atomic0<Op, T, N>(vAddr.data(), mask.data());

template <atomic_op Op, typename Tx, int N, typename Toffset,
          typename RegionTy = region1d_t<Toffset, N, 1>>
  return atomic_update<Op, Tx, N>(p, offsets.read(), mask);
template <atomic_op Op, typename Tx, int N, typename Toffset>
__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>, simd<Tx, N>>

template <atomic_op Op, typename Tx, int N, typename Toffset>
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
  if constexpr (Op == atomic_op::fcmpwr) {
    return atomic_update<detail::to_lsc_atomic_op<Op>(), Tx, N>(p, offset, src0,
  detail::check_atomic<Op, Tx, N, 2>();
  using T = typename detail::__raw_t<Tx>;
  return __esimd_svm_atomic2<Op, T, N>(vAddr.data(), src0.data(), src1.data(),

template <atomic_op Op, typename Tx, int N, typename Toffset,
          typename RegionTy = region1d_t<Toffset, N, 1>>
  return atomic_update<Op, Tx, N>(p, offsets.read(), src0, src1, mask);

template <atomic_op Op, typename Tx, int N, typename Toffset>
__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>, simd<Tx, N>>
template <atomic_op Op, typename Tx, int N, typename Toffset,
          typename AccessorTy>
__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
                                 !std::is_pointer<AccessorTy>::value,
#ifdef __ESIMD_FORCE_STATELESS_MEM
  return atomic_update<Op, Tx, N>(__ESIMD_DNS::accessorToPointer<Tx>(acc),
                                  offset, src0, mask);
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
  static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported");
                (Op == atomic_op::fadd) || (Op == atomic_op::fsub)) {
    return atomic_update<detail::to_lsc_atomic_op<Op>(), Tx, N>(acc, offset,
  } else if constexpr (Op == atomic_op::store) {
    if constexpr (std::is_integral_v<Tx>) {
      return atomic_update<atomic_op::xchg, Tx, N>(acc, offset, src0, mask);
      using Tint = detail::uint_type_t<sizeof(Tx)>;
      simd<Tint, N> Res = atomic_update<atomic_op::xchg, Tint, N>(
          acc, offset, src0.template bit_cast_view<Tint>(), mask);
      return Res.template bit_cast_view<Tx>();
  detail::check_atomic<Op, Tx, N, 1>();
  static_assert(sizeof(Tx) == 4, "Only 32 bit data is supported");
  using T = typename detail::__raw_t<Tx>;
  return __esimd_dword_atomic1<Op, T, N>(mask.data(), si, offset.data(),

template <atomic_op Op, typename Tx, int N, typename Toffset,
          typename AccessorTy, typename RegionTy = region1d_t<Toffset, N, 1>>
__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
                                 !std::is_pointer<AccessorTy>::value,
  return atomic_update<Op, Tx, N>(acc, offsets.read(), src0, mask);

template <atomic_op Op, typename Tx, int N, typename Toffset,
          typename AccessorTy>
__ESIMD_API std::enable_if_t<
    std::is_integral_v<Toffset> && !std::is_pointer<AccessorTy>::value &&
        ((Op != atomic_op::store && Op != atomic_op::xchg) || N == 1),
  return atomic_update<Op, Tx, N>(acc, simd<Toffset, N>(offset), src0, mask);
template <atomic_op Op, typename Tx, int N, typename Toffset,
          typename AccessorTy>
__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
                                 !std::is_pointer<AccessorTy>::value,
#ifdef __ESIMD_FORCE_STATELESS_MEM
  return atomic_update<Op, Tx, N>(__ESIMD_DNS::accessorToPointer<Tx>(acc),
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
  if constexpr (Op == atomic_op::load) {
    if constexpr (std::is_integral_v<Tx>) {
      return atomic_update<atomic_op::bit_or, Tx, N>(acc, offset,
      using Tint = detail::uint_type_t<sizeof(Tx)>;
      simd<Tint, N> Res = atomic_update<atomic_op::bit_or, Tint, N>(
      return Res.template bit_cast_view<Tx>();
  detail::check_atomic<Op, Tx, N, 0>();
  static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported");
  static_assert(sizeof(Tx) == 4, "Only 32 bit data is supported");
  using T = typename detail::__raw_t<Tx>;
  return __esimd_dword_atomic0<Op, T, N>(mask.data(), si, offset.data());

template <atomic_op Op, typename Tx, int N, typename Toffset,
          typename AccessorTy, typename RegionTy = region1d_t<Toffset, N, 1>>
__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
                                 !std::is_pointer<AccessorTy>::value,
  return atomic_update<Op, Tx, N>(acc, offsets.read(), mask);
template <atomic_op Op, typename Tx, int N, typename Toffset,
          typename AccessorTy>
__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
                                 !std::is_pointer<AccessorTy>::value,

template <atomic_op Op, typename Tx, int N, typename Toffset,
          typename AccessorTy>
__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
                                 !std::is_pointer<AccessorTy>::value,
#ifdef __ESIMD_FORCE_STATELESS_MEM
  return atomic_update<Op, Tx, N>(__ESIMD_DNS::accessorToPointer<Tx>(acc),
                                  offset, src0, src1, mask);
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
  static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported");
  if constexpr (Op == atomic_op::fcmpwr) {
    return atomic_update<detail::to_lsc_atomic_op<Op>(), Tx, N>(
        acc, offset, src0, src1, mask);
  detail::check_atomic<Op, Tx, N, 2>();
  static_assert(sizeof(Tx) == 4, "Only 32 bit data is supported");
  using T = typename detail::__raw_t<Tx>;
  return __esimd_dword_atomic2<Op, T, N>(mask.data(), si, offset.data(),
                                         src0.data(), src1.data());

template <atomic_op Op, typename Tx, int N, typename Toffset,
          typename AccessorTy, typename RegionTy = region1d_t<Toffset, N, 1>>
__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
                                 !std::is_pointer<AccessorTy>::value,
  return atomic_update<Op, Tx, N>(acc, offsets.read(), src0, src1, mask);
template <atomic_op Op, typename Tx, int N, typename Toffset,
          typename AccessorTy>
__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
                                 !std::is_pointer<AccessorTy>::value,

template <uint8_t cntl> __ESIMD_API void fence() { __esimd_fence(cntl); }

template <uint32_t SLMSize> __ESIMD_API void slm_init() {
  __esimd_slm_init(SLMSize);
}

__ESIMD_API void slm_init(uint32_t size) { __esimd_slm_init(size); }
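// Usage sketch (illustrative only, not part of the original header): SLM must
// be requested once at kernel start before any slm_* access. The compile-time
// form is shown here; the run-time overload above takes the size as a value.
inline void example_slm_setup() {
  slm_init<4096>(); // reserve 4 KB of shared local memory for this work-group
}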
template <typename T, int N>
    std::enable_if_t<(N == 1 || N == 8 || N == 16 || N == 32), simd<T, N>>
  detail::LocalAccessorMarker acc;
  return detail::gather_impl<T, N>(acc, offsets, 0, mask);

template <typename T, int N>
__ESIMD_API std::enable_if_t<(N == 1 || N == 8 || N == 16 || N == 32) &&
  detail::LocalAccessorMarker acc;
  detail::scatter_impl<T, N>(acc, vals, offsets, 0, mask);
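// Usage sketch (illustrative only, not part of the original header): scatters
// 16 ints into shared local memory and gathers them back. Byte offsets are
// relative to the SLM base; slm_init must have been called earlier.
inline simd<int, 16> example_slm_roundtrip(simd<int, 16> vals) {
  simd<uint32_t, 16> byte_offsets(0, sizeof(int));
  slm_scatter<int, 16>(byte_offsets, vals);
  return slm_gather<int, 16>(byte_offsets);
}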
template <typename T>

template <typename T, int N, rgba_channel_mask RGBAMask>
__ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && (sizeof(T) == 4),
  return __esimd_gather4_masked_scaled2<T, N, RGBAMask>(
      SI, 0, offsets.data(), mask.data());

template <typename T, int N, rgba_channel_mask Mask>
__ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && (sizeof(T) == 4)>
  detail::validate_rgba_write_channel_mask<Mask>();
  constexpr int16_t Scale = 0;
  constexpr int global_offset = 0;
  __esimd_scatter4_scaled<T, N, decltype(si), Mask, Scale>(
      mask.data(), si, global_offset, offsets.data(), vals.data());

template <typename T, int N>
  constexpr unsigned Sz = sizeof(T) * N;
  static_assert(Sz >= detail::OperandSize::OWORD,
                "block size must be at least 1 oword");
  static_assert(Sz % detail::OperandSize::OWORD == 0,
                "block size must be whole number of owords");
                "block must be 1, 2, 4 or 8 owords long");
  static_assert(Sz <= 16 * detail::OperandSize::OWORD,
                "block size must be at most 16 owords");
  return __esimd_oword_ld<detail::__raw_t<T>, N>(si, offset >> 4);

template <typename T, int N>
  constexpr unsigned Sz = sizeof(T) * N;
  static_assert(Sz >= detail::OperandSize::OWORD,
                "block size must be at least 1 oword");
  static_assert(Sz % detail::OperandSize::OWORD == 0,
                "block size must be whole number of owords");
                "block must be 1, 2, 4 or 8 owords long");
  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
                "block size must be at most 8 owords");
  __esimd_oword_st<detail::__raw_t<T>, N>(si, offset >> 4, vals.data());
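// Usage sketch (illustrative only, not part of the original header): block
// form of the SLM access; the byte offset must be oword (16-byte) aligned and
// the block size a whole number of owords (8 floats = 2 owords here).
inline simd<float, 8> example_slm_block_roundtrip(simd<float, 8> vals) {
  slm_block_store<float, 8>(0, vals); // store at SLM byte offset 0
  return slm_block_load<float, 8>(0);
}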
template <atomic_op Op, typename Tx, int N, class T = detail::__raw_t<Tx>>
  detail::check_atomic<Op, T, N, 0>();
  return __esimd_dword_atomic0<Op, T, N>(mask.data(), si, offsets.data());

template <atomic_op Op, typename Tx, int N, class T = detail::__raw_t<Tx>>
  detail::check_atomic<Op, T, N, 1>();
  return __esimd_dword_atomic1<Op, T, N>(mask.data(), si, offsets.data(),

template <atomic_op Op, typename Tx, int N, class T = detail::__raw_t<Tx>>
  detail::check_atomic<Op, T, N, 2>();
  return __esimd_dword_atomic2<Op, T, N>(mask.data(), si, offsets.data(),
                                         src0.data(), src1.data());

#ifndef __ESIMD_FORCE_STATELESS_MEM

template <typename T, int m, int N, typename AccessorTy, unsigned plane = 0>
  constexpr unsigned Width = N * sizeof(T);
  static_assert(Width * m <= 256u,
                "data does not fit into a single dataport transaction");
  static_assert(Width <= 64u, "valid block width is in range [1, 64]");
  static_assert(m <= 64u, "valid block height is in range [1, 64]");
  static_assert(plane <= 3u, "valid plane index is in range [0, 3]");
  using SurfIndTy = decltype(si);
  constexpr unsigned int RoundedWidth =
      Width < 4 ? 4 : detail::getNextPowerOf2<Width>();
  constexpr int BlockWidth = sizeof(T) * N;
  constexpr int Mod = 0;
  if constexpr (Width < RoundedWidth) {
    constexpr unsigned int n1 = RoundedWidth / sizeof(T);
        __esimd_media_ld<T, m, n1, Mod, SurfIndTy, (int)plane, BlockWidth>(
    return temp.template select<m, 1, N, 1>(0, 0);
  return __esimd_media_ld<T, m, N, Mod, SurfIndTy, (int)plane, BlockWidth>(

template <typename T, int m, int N, typename AccessorTy, unsigned plane = 0>
  constexpr unsigned Width = N * sizeof(T);
  static_assert(Width * m <= 256u,
                "data does not fit into a single dataport transaction");
  static_assert(Width <= 64u, "valid block width is in range [1, 64]");
  static_assert(m <= 64u, "valid block height is in range [1, 64]");
  static_assert(plane <= 3u, "valid plane index is in range [0, 3]");
  using SurfIndTy = decltype(si);
  constexpr unsigned int RoundedWidth =
      Width < 4 ? 4 : detail::getNextPowerOf2<Width>();
  constexpr unsigned int n1 = RoundedWidth / sizeof(T);
  constexpr int BlockWidth = sizeof(T) * N;
  constexpr int Mod = 0;
  if constexpr (Width < RoundedWidth) {
    auto temp_ref = temp.template bit_cast_view<T, m, n1>();
    auto vals_ref = vals.template bit_cast_view<T, m, N>();
    temp_ref.template select<m, 1, N, 1>() = vals_ref;
    __esimd_media_st<T, m, n1, Mod, SurfIndTy, plane, BlockWidth>(si, x, y,
    __esimd_media_st<T, m, N, Mod, SurfIndTy, plane, BlockWidth>(si, x, y,

#endif // !__ESIMD_FORCE_STATELESS_MEM
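// Usage sketch (illustrative only, not part of the original header): copies a
// 4 x 8 block of floats between two image accessors. `x` is the horizontal
// offset in bytes, `y` the row index; both accessors are assumed to be 2D
// image accessors created in command-group scope.
template <typename AccT>
inline void example_media_block_copy(AccT in, AccT out, unsigned x,
                                     unsigned y) {
  simd<float, 4 * 8> blk = media_block_load<float, 4, 8>(in, x, y);
  media_block_store<float, 4, 8>(out, x, y, blk);
}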
template <typename T, int N, class T1, class SFINAE>
template <typename Flags, int ChunkSize, typename>
void simd_obj_impl<T, N, T1, SFINAE>::copy_from(
    const simd_obj_impl<T, N, T1, SFINAE>::element_type *Addr,
    Flags) SYCL_ESIMD_FUNCTION {
  using UT = simd_obj_impl<T, N, T1, SFINAE>::element_type;
  constexpr unsigned Size = sizeof(T) * N;
  constexpr unsigned Align = Flags::template alignment<T1>;
  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
  constexpr unsigned NumBlocks = Size / BlockSize;
  constexpr unsigned RemSize = Size % BlockSize;
  if constexpr (Align >= OperandSize::DWORD && Size % OperandSize::OWORD == 0 &&
    if constexpr (NumBlocks > 0) {
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      ForHelper<NumBlocks>::unroll([BlockN, Addr, this](unsigned Block) {
        select<BlockN, 1>(Block * BlockN) =
            block_load<UT, BlockN, Flags>(Addr + (Block * BlockN), Flags{});
    if constexpr (RemSize > 0) {
      constexpr unsigned RemN = RemSize / sizeof(T);
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      select<RemN, 1>(NumBlocks * BlockN) =
          block_load<UT, RemN, Flags>(Addr + (NumBlocks * BlockN), Flags{});
  } else if constexpr (sizeof(T) == 8) {
    bit_cast_view<int32_t>() = BC;
    constexpr unsigned NumChunks = N / ChunkSize;
    if constexpr (NumChunks > 0) {
      ForHelper<NumChunks>::unroll([Addr, &Offsets, this](unsigned Block) {
        select<ChunkSize, 1>(Block * ChunkSize) =
            gather<UT, ChunkSize>(Addr + (Block * ChunkSize), Offsets);
    constexpr unsigned RemN = N % ChunkSize;
    if constexpr (RemN > 0) {
      if constexpr (RemN == 1) {
        select<1, 1>(NumChunks * ChunkSize) = Addr[NumChunks * ChunkSize];
      } else if constexpr (RemN == 8 || RemN == 16) {
        select<RemN, 1>(NumChunks * ChunkSize) =
            gather<UT, RemN>(Addr + (NumChunks * ChunkSize), Offsets);
        constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
        simd_mask_type<N1> Pred(0);
        Pred.template select<RemN, 1>() = 1;
            gather<UT, N1>(Addr + (NumChunks * ChunkSize), Offsets, Pred);
        select<RemN, 1>(NumChunks * ChunkSize) =
            Vals.template select<RemN, 1>();
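// Usage sketch (illustrative only, not part of the original header):
// simd::copy_from / copy_to are the usual way to move a whole vector between
// memory and registers; they pick block or gather/scatter paths as implemented
// above. `src` and `dst` are assumed USM pointers with at least 32 elements.
inline void example_simd_copy(const float *src, float *dst) {
  simd<float, 32> v;
  v.copy_from(src); // reads 32 contiguous floats
  v *= 2.0f;
  v.copy_to(dst); // writes them back out
}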
template <typename T, int N, class T1, class SFINAE>
template <typename AccessorT, typename Flags, int ChunkSize, typename>
ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_read,
                              sycl::access::target::device, void>
simd_obj_impl<T, N, T1, SFINAE>::copy_from(AccessorT acc, uint32_t offset,
                                           Flags) SYCL_ESIMD_FUNCTION {
  using UT = simd_obj_impl<T, N, T1, SFINAE>::element_type;
  static_assert(sizeof(UT) == sizeof(T));
  constexpr unsigned Size = sizeof(T) * N;
  constexpr unsigned Align = Flags::template alignment<T1>;
  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
  constexpr unsigned NumBlocks = Size / BlockSize;
  constexpr unsigned RemSize = Size % BlockSize;
  if constexpr (Align >= OperandSize::DWORD && Size % OperandSize::OWORD == 0 &&
    if constexpr (NumBlocks > 0) {
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      ForHelper<NumBlocks>::unroll([BlockN, acc, offset, this](unsigned Block) {
        select<BlockN, 1>(Block * BlockN) =
            block_load<UT, BlockN, AccessorT, Flags>(
                acc, offset + (Block * BlockSize), Flags{});
    if constexpr (RemSize > 0) {
      constexpr unsigned RemN = RemSize / sizeof(T);
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      select<RemN, 1>(NumBlocks * BlockN) =
          block_load<UT, RemN, AccessorT, Flags>(
              acc, offset + (NumBlocks * BlockSize), Flags{});
  } else if constexpr (sizeof(T) == 8) {
    bit_cast_view<int32_t>() = BC;
    constexpr unsigned NumChunks = N / ChunkSize;
    if constexpr (NumChunks > 0) {
      ForHelper<NumChunks>::unroll(
          [acc, offset, &Offsets, this](unsigned Block) {
            select<ChunkSize, 1>(Block * ChunkSize) =
                gather<UT, ChunkSize, AccessorT>(
                    acc, Offsets, offset + (Block * ChunkSize * sizeof(T)));
    constexpr unsigned RemN = N % ChunkSize;
    if constexpr (RemN > 0) {
      if constexpr (RemN == 1 || RemN == 8 || RemN == 16) {
        select<RemN, 1>(NumChunks * ChunkSize) = gather<UT, RemN, AccessorT>(
            acc, Offsets, offset + (NumChunks * ChunkSize * sizeof(T)));
        constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
        simd_mask_type<N1> Pred(0);
        Pred.template select<RemN, 1>() = 1;
            acc, Offsets, offset + (NumChunks * ChunkSize * sizeof(T)), Pred);
        select<RemN, 1>(NumChunks * ChunkSize) =
            Vals.template select<RemN, 1>();
template <typename T, int N, class T1, class SFINAE>
template <typename Flags, int ChunkSize, typename>
void simd_obj_impl<T, N, T1, SFINAE>::copy_to(
    simd_obj_impl<T, N, T1, SFINAE>::element_type *Addr,
    Flags) const SYCL_ESIMD_FUNCTION {
  using UT = simd_obj_impl<T, N, T1, SFINAE>::element_type;
  constexpr unsigned Size = sizeof(T) * N;
  constexpr unsigned Align = Flags::template alignment<T1>;
  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
  constexpr unsigned NumBlocks = Size / BlockSize;
  constexpr unsigned RemSize = Size % BlockSize;
  if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 &&
    if constexpr (NumBlocks > 0) {
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      ForHelper<NumBlocks>::unroll([BlockN, Addr, &Tmp](unsigned Block) {
        block_store<UT, BlockN>(Addr + (Block * BlockN),
                                Tmp.template select<BlockN, 1>(Block * BlockN));
    if constexpr (RemSize > 0) {
      constexpr unsigned RemN = RemSize / sizeof(T);
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      block_store<UT, RemN>(Addr + (NumBlocks * BlockN),
                            Tmp.template select<RemN, 1>(NumBlocks * BlockN));
  } else if constexpr (sizeof(T) == 8) {
    BC.copy_to(reinterpret_cast<int32_t *>(Addr), Flags{});
    constexpr unsigned NumChunks = N / ChunkSize;
    if constexpr (NumChunks > 0) {
      ForHelper<NumChunks>::unroll([Addr, &Offsets, &Tmp](unsigned Block) {
        scatter<UT, ChunkSize>(
            Addr + (Block * ChunkSize), Offsets,
            Tmp.template select<ChunkSize, 1>(Block * ChunkSize));
    constexpr unsigned RemN = N % ChunkSize;
    if constexpr (RemN > 0) {
      if constexpr (RemN == 1) {
        Addr[NumChunks * ChunkSize] = Tmp[NumChunks * ChunkSize];
      } else if constexpr (RemN == 8 || RemN == 16) {
        if constexpr (sizeof(T) == 1 && RemN == 16) {
          if constexpr (Align % OperandSize::DWORD > 0) {
            ForHelper<RemN>::unroll([Addr, &Tmp](unsigned Index) {
              Addr[Index + NumChunks * ChunkSize] =
                  Tmp[Index + NumChunks * ChunkSize];
            simd_mask_type<8> Pred(0);
            Pred.template select<4, 1>() = 1;
            Vals.template select<4, 1>() =
                Tmp.template bit_cast_view<int32_t>().template select<4, 1>(
                    NumChunks * ChunkSize);
            scatter<int32_t, 8>(
                reinterpret_cast<int32_t *>(Addr + (NumChunks * ChunkSize)),
                Offsets, Vals, Pred);
            Addr + (NumChunks * ChunkSize), Offsets,
            Tmp.template select<RemN, 1>(NumChunks * ChunkSize));
        constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
        simd_mask_type<N1> Pred(0);
        Pred.template select<RemN, 1>() = 1;
        Vals.template select<RemN, 1>() =
            Tmp.template select<RemN, 1>(NumChunks * ChunkSize);
        scatter<UT, N1>(Addr + (NumChunks * ChunkSize), Offsets, Vals, Pred);
template <typename T, int N, class T1, class SFINAE>
template <typename AccessorT, typename Flags, int ChunkSize, typename>
ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_write,
                              sycl::access::target::device, void>
simd_obj_impl<T, N, T1, SFINAE>::copy_to(AccessorT acc, uint32_t offset,
                                         Flags) const SYCL_ESIMD_FUNCTION {
  using UT = simd_obj_impl<T, N, T1, SFINAE>::element_type;
  constexpr unsigned Size = sizeof(T) * N;
  constexpr unsigned Align = Flags::template alignment<T1>;
  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
  constexpr unsigned NumBlocks = Size / BlockSize;
  constexpr unsigned RemSize = Size % BlockSize;
  if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 &&
    if constexpr (NumBlocks > 0) {
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      ForHelper<NumBlocks>::unroll([BlockN, acc, offset, &Tmp](unsigned Block) {
        block_store<UT, BlockN, AccessorT>(
            acc, offset + (Block * BlockSize),
            Tmp.template select<BlockN, 1>(Block * BlockN));
    if constexpr (RemSize > 0) {
      constexpr unsigned RemN = RemSize / sizeof(T);
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      block_store<UT, RemN, AccessorT>(
          acc, offset + (NumBlocks * BlockSize),
          Tmp.template select<RemN, 1>(NumBlocks * BlockN));
  } else if constexpr (sizeof(T) == 8) {
    BC.copy_to(acc, offset, Flags{});
    constexpr unsigned NumChunks = N / ChunkSize;
    if constexpr (NumChunks > 0) {
      ForHelper<NumChunks>::unroll([acc, offset, &Offsets,
                                    &Tmp](unsigned Block) {
        scatter<UT, ChunkSize, AccessorT>(
            acc, Offsets, Tmp.template select<ChunkSize, 1>(Block * ChunkSize),
            offset + (Block * ChunkSize * sizeof(T)));
    constexpr unsigned RemN = N % ChunkSize;
    if constexpr (RemN > 0) {
      if constexpr (RemN == 1 || RemN == 8 || RemN == 16) {
        scatter<UT, RemN, AccessorT>(
            acc, Offsets, Tmp.template select<RemN, 1>(NumChunks * ChunkSize),
            offset + (NumChunks * ChunkSize * sizeof(T)));
        constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
        simd_mask_type<N1> Pred(0);
        Pred.template select<RemN, 1>() = 1;
        Vals.template select<RemN, 1>() =
            Tmp.template select<RemN, 1>(NumChunks * ChunkSize);
        scatter<UT, N1, AccessorT>(acc, Offsets, Vals,
                                   offset + (NumChunks * ChunkSize * sizeof(T)),