namespace __ESIMD_NS {

namespace detail {
// Tag type used to dispatch shared-local-memory (SLM) accesses through the
// generic accessor-based implementations below.
struct LocalAccessorMarker {};
} // namespace detail

// Returns the surface index (binding table index) bound to the given
// accessor; the SLM marker maps to the dedicated SLM index.
template <typename AccessorTy>
__ESIMD_API SurfaceIndex get_surface_index(AccessorTy acc) {
  if constexpr (std::is_same_v<detail::LocalAccessorMarker, AccessorTy>) {
    return detail::SLM_BTI;
  } else {
    return __esimd_get_surface_index(
        detail::AccessorPrivateProxy::getNativeImageObj(acc));
  }
}

#define __ESIMD_GET_SURF_HANDLE(acc) get_surface_index(acc)
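// Usage sketch (illustrative only; `acc` stands for a hypothetical device
// accessor):
//   SurfaceIndex si = get_surface_index(acc);
// The index can then be passed to the low-level __esimd_* intrinsics that
// address memory through the binding table, as the functions below do.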
// USM gather: loads N elements of type Tx from p + offsets[i] for every
// unmasked lane. 1- and 2-byte elements are gathered through a wider
// per-address access and then compacted with rdregion.
template <typename Tx, int N, class T = detail::__raw_t<Tx>>
__ESIMD_API simd<Tx, N> gather(const Tx *p, simd<uint32_t, N> offsets,
                               simd_mask<N> mask = 1) {
  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
  addrs = addrs + offsets_i;

  if constexpr (sizeof(T) == 1) {
    auto Ret = __esimd_svm_gather<T, N, detail::ElemsPerAddrEncoding<4>(),
                                  detail::ElemsPerAddrEncoding<1>()>(
        addrs.data(), mask.data());
    return __esimd_rdregion<T, N * 4, N, /*VS*/ 0, N, 4>(Ret, 0);
  } else if constexpr (sizeof(T) == 2) {
    auto Ret = __esimd_svm_gather<T, N, detail::ElemsPerAddrEncoding<2>(),
                                  detail::ElemsPerAddrEncoding<2>()>(
        addrs.data(), mask.data());
    return __esimd_rdregion<T, N * 2, N, /*VS*/ 0, N, 2>(Ret, 0);
  } else {
    return __esimd_svm_gather<T, N, detail::ElemsPerAddrEncoding<1>(),
                              detail::ElemsPerAddrEncoding<1>()>(addrs.data(),
                                                                 mask.data());
  }
}
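// Usage sketch (illustrative only; `src` stands for a hypothetical USM
// pointer to at least 16 floats):
//   simd<uint32_t, 16> byte_off(0, sizeof(float)); // 0, 4, 8, ..., 60
//   simd<float, 16> v = gather<float, 16>(src, byte_off); // all lanes on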
// USM scatter: stores vals[i] to p + offsets[i] for every unmasked lane.
// 1- and 2-byte elements are first spread into a wider region with wrregion
// to match the layout expected by the hardware message.
template <typename Tx, int N, class T = detail::__raw_t<Tx>>
__ESIMD_API void scatter(Tx *p, simd<uint32_t, N> offsets, simd<Tx, N> vals,
                         simd_mask<N> mask = 1) {
  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
  addrs = addrs + offsets_i;
  if constexpr (sizeof(T) == 1) {
    simd<T, N * 4> D;
    D = __esimd_wrregion<T, N * 4, N, /*VS*/ 0, N, 4>(D.data(), vals.data(), 0);
    __esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<4>(),
                        detail::ElemsPerAddrEncoding<1>()>(
        addrs.data(), D.data(), mask.data());
  } else if constexpr (sizeof(T) == 2) {
    simd<T, N * 2> D;
    D = __esimd_wrregion<T, N * 2, N, /*VS*/ 0, N, 2>(D.data(), vals.data(), 0);
    __esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<2>(),
                        detail::ElemsPerAddrEncoding<2>()>(
        addrs.data(), D.data(), mask.data());
  } else {
    __esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<1>(),
                        detail::ElemsPerAddrEncoding<1>()>(
        addrs.data(), vals.data(), mask.data());
  }
}
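// Usage sketch (illustrative only; `dst` stands for a hypothetical USM
// pointer):
//   simd<uint32_t, 8> byte_off(0, sizeof(int)); // 0, 4, ..., 28
//   simd<int, 8> vals(0, 1);                    // 0, 1, ..., 7
//   scatter(dst, byte_off, vals);               // mask defaults to all-on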
// USM block load: reads a contiguous block into a simd object. The block
// must be a power-of-two number of owords (16-byte units), at most 8.
template <typename Tx, int N, typename Flags = vector_aligned_tag,
          class T = detail::__raw_t<Tx>,
          typename = std::enable_if_t<is_simd_flag_type_v<Flags>>>
__ESIMD_API simd<Tx, N> block_load(const Tx *addr, Flags = {}) {
  constexpr unsigned Sz = sizeof(T) * N;
  static_assert(Sz >= detail::OperandSize::OWORD,
                "block size must be at least 1 oword");
  static_assert(Sz % detail::OperandSize::OWORD == 0,
                "block size must be whole number of owords");
  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
                "block must be 1, 2, 4 or 8 owords long");
  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
                "block size must be at most 8 owords");

  uintptr_t Addr = reinterpret_cast<uintptr_t>(addr);
  if constexpr (Flags::template alignment<simd<T, N>> >=
                detail::OperandSize::OWORD) {
    return __esimd_svm_block_ld<T, N>(Addr);
  } else {
    return __esimd_svm_block_ld_unaligned<T, N>(Addr);
  }
}
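// Usage sketch (illustrative only; `src` stands for a hypothetical
// oword-aligned pointer to 32 floats, i.e. the maximum 8-oword block):
//   simd<float, 32> blk = block_load<float, 32>(src, vector_aligned);
// Passing element_aligned instead routes to the unaligned intrinsic.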
// Accessor-based block load; offset is in bytes, oword-aligned for the
// aligned path.
template <typename Tx, int N, typename AccessorTy,
          typename Flags = vector_aligned_tag,
          typename = std::enable_if_t<is_simd_flag_type_v<Flags>>,
          class T = detail::__raw_t<Tx>>
__ESIMD_API simd<Tx, N> block_load(AccessorTy acc, uint32_t offset,
                                   Flags = {}) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  return block_load<Tx, N>(__ESIMD_DNS::accessorToPointer<Tx>(acc, offset));
#else
  constexpr unsigned Sz = sizeof(T) * N;
  static_assert(Sz >= detail::OperandSize::OWORD,
                "block size must be at least 1 oword");
  static_assert(Sz % detail::OperandSize::OWORD == 0,
                "block size must be whole number of owords");
  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
                "block must be 1, 2, 4 or 8 owords long");
  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
                "block size must be at most 8 owords");

  auto surf_ind = __esimd_get_surface_index(
      detail::AccessorPrivateProxy::getNativeImageObj(acc));

  if constexpr (Flags::template alignment<simd<T, N>> >=
                detail::OperandSize::OWORD) {
    return __esimd_oword_ld<T, N>(surf_ind, offset >> 4);
  } else {
    return __esimd_oword_ld_unaligned<T, N>(surf_ind, offset);
  }
#endif
}
// USM block store: stores a whole simd object to a contiguous memory block.
// The address must be oword-aligned; there is no unaligned form.
template <typename Tx, int N, class T = detail::__raw_t<Tx>>
__ESIMD_API void block_store(Tx *p, simd<Tx, N> vals) {
  constexpr unsigned Sz = sizeof(T) * N;
  static_assert(Sz >= detail::OperandSize::OWORD,
                "block size must be at least 1 oword");
  static_assert(Sz % detail::OperandSize::OWORD == 0,
                "block size must be whole number of owords");
  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
                "block must be 1, 2, 4 or 8 owords long");
  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
                "block size must be at most 8 owords");

  uintptr_t Addr = reinterpret_cast<uintptr_t>(p);
  __esimd_svm_block_st<T, N>(Addr, vals.data());
}
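// Usage sketch (illustrative only; `dst` stands for a hypothetical
// oword-aligned USM pointer):
//   simd<float, 16> vals(0.0f);
//   block_store(dst, vals); // 16 * 4 bytes = 4 owords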
// Accessor-based block store; offset is in bytes and must be oword-aligned.
template <typename Tx, int N, typename AccessorTy,
          class T = detail::__raw_t<Tx>>
__ESIMD_API void block_store(AccessorTy acc, uint32_t offset,
                             simd<Tx, N> vals) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  block_store<Tx, N>(__ESIMD_DNS::accessorToPointer<Tx>(acc, offset), vals);
#else
  constexpr unsigned Sz = sizeof(T) * N;
  static_assert(Sz >= detail::OperandSize::OWORD,
                "block size must be at least 1 oword");
  static_assert(Sz % detail::OperandSize::OWORD == 0,
                "block size must be whole number of owords");
  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
                "block must be 1, 2, 4 or 8 owords long");
  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
                "block size must be at most 8 owords");

  auto surf_ind = __esimd_get_surface_index(
      detail::AccessorPrivateProxy::getNativeImageObj(acc));
  __esimd_oword_st<T, N>(surf_ind, offset >> 4, vals.data());
#endif
}
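// Usage sketch (illustrative only; `acc` stands for a hypothetical device
// accessor over a float buffer): load 8 owords at byte offset 0 and store
// them back at byte offset 512 (offsets must be oword-aligned).
//   simd<float, 32> blk = block_load<float, 32>(acc, 0);
//   block_store(acc, 512, blk);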
namespace detail {
// Common implementation for accessor-based scatter. Elements narrower than
// a dword are promoted to 32 bits before the scaled-scatter message.
template <typename T, int N, typename AccessorTy>
__ESIMD_API std::enable_if_t<(sizeof(T) <= 4) &&
                             (N == 1 || N == 8 || N == 16 || N == 32) &&
                             !std::is_pointer<AccessorTy>::value>
scatter_impl(AccessorTy acc, simd<T, N> vals, simd<uint32_t, N> offsets,
             uint32_t glob_offset, simd_mask<N> mask) {
  constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
  // TODO (performance): use the hardware-supported scale once the backend
  // supports it.
  constexpr int16_t scale = 0;
  const auto si = __ESIMD_GET_SURF_HANDLE(acc);

  if constexpr (sizeof(T) < 4) {
    using Tint = std::conditional_t<std::is_integral_v<T>, T,
                                    detail::uint_type_t<sizeof(T)>>;
    using Treal = __raw_t<T>;
    simd<Tint, N> vals_int = bitcast<Tint, Treal, N>(std::move(vals).data());
    using PromoT = typename std::conditional_t<std::is_signed<Tint>::value,
                                               int32_t, uint32_t>;
    const simd<PromoT, N> promo_vals = convert<PromoT>(std::move(vals_int));
    __esimd_scatter_scaled<PromoT, N, decltype(si), TypeSizeLog2, scale>(
        mask.data(), si, glob_offset, offsets.data(), promo_vals.data());
  } else {
    __esimd_scatter_scaled<T, N, decltype(si), TypeSizeLog2, scale>(
        mask.data(), si, glob_offset, offsets.data(), vals.data());
  }
}
// Common implementation for accessor-based gather. Sub-dword elements are
// read as 32-bit values and converted/bitcast back to T.
template <typename T, int N, typename AccessorTy>
__ESIMD_API std::enable_if_t<(sizeof(T) <= 4) &&
                                 (N == 1 || N == 8 || N == 16 || N == 32) &&
                                 !std::is_pointer<AccessorTy>::value,
                             simd<T, N>>
gather_impl(AccessorTy acc, simd<uint32_t, N> offsets, uint32_t glob_offset,
            simd_mask<N> mask) {
  constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
  // TODO (performance): use the hardware-supported scale once the backend
  // supports it.
  constexpr uint32_t scale = 0;
  const auto si = __ESIMD_GET_SURF_HANDLE(acc);

  if constexpr (sizeof(T) < 4) {
    using Tint = std::conditional_t<std::is_integral_v<T>, T,
                                    detail::uint_type_t<sizeof(T)>>;
    using Treal = __raw_t<T>;
    static_assert(std::is_integral<Tint>::value,
                  "only integral 1- & 2-byte types are supported");
    using PromoT = typename std::conditional_t<std::is_signed<Tint>::value,
                                               int32_t, uint32_t>;
    const simd<PromoT, N> promo_vals =
        __esimd_gather_masked_scaled2<PromoT, N, decltype(si), TypeSizeLog2,
                                      scale>(si, glob_offset, offsets.data(),
                                             mask.data());
    auto Res = convert<Tint>(promo_vals);

    if constexpr (!std::is_same_v<Tint, T>) {
      return detail::bitcast<Treal, Tint, N>(Res.data());
    } else {
      return Res;
    }
  } else {
    return __esimd_gather_masked_scaled2<T, N, decltype(si), TypeSizeLog2,
                                         scale>(si, glob_offset,
                                                offsets.data(), mask.data());
  }
}
} // namespace detail
// Accessor-based gather: loads N elements of type T located at
// glob_offset + offsets[i] bytes from the accessor's base.
template <typename T, int N, typename AccessorTy>
__ESIMD_API std::enable_if_t<(sizeof(T) <= 4) &&
                                 (N == 1 || N == 8 || N == 16 || N == 32) &&
                                 !std::is_pointer<AccessorTy>::value,
                             simd<T, N>>
gather(AccessorTy acc, simd<uint32_t, N> offsets, uint32_t glob_offset = 0,
       simd_mask<N> mask = 1) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  return gather<T, N>(__ESIMD_DNS::accessorToPointer<T>(acc, glob_offset),
                      offsets, mask);
#else
  return detail::gather_impl<T, N, AccessorTy>(acc, offsets, glob_offset, mask);
#endif
}
// Accessor-based scatter: stores N elements of type T at
// glob_offset + offsets[i] bytes from the accessor's base.
template <typename T, int N, typename AccessorTy>
__ESIMD_API std::enable_if_t<(sizeof(T) <= 4) &&
                             (N == 1 || N == 8 || N == 16 || N == 32) &&
                             !std::is_pointer<AccessorTy>::value>
scatter(AccessorTy acc, simd<uint32_t, N> offsets, simd<T, N> vals,
        uint32_t glob_offset = 0, simd_mask<N> mask = 1) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  scatter<T, N>(__ESIMD_DNS::accessorToPointer<T>(acc, glob_offset), offsets,
                vals, mask);
#else
  detail::scatter_impl<T, N, AccessorTy>(acc, vals, offsets, glob_offset, mask);
#endif
}
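// Usage sketch (illustrative only; `acc` stands for a hypothetical device
// accessor over an int buffer): increment 16 consecutive ints that start at
// global byte offset 256.
//   simd<uint32_t, 16> off(0, sizeof(int));
//   simd<int, 16> v = gather<int, 16>(acc, off, 256);
//   scatter<int, 16>(acc, off, v + 1, 256);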
// Scalar convenience wrappers built on one-lane gather/scatter.
template <typename T, typename AccessorTy>
__ESIMD_API T scalar_load(AccessorTy acc, uint32_t offset) {
  const simd<T, 1> Res =
      gather<T, 1, AccessorTy>(acc, simd<uint32_t, 1>(offset));
  return Res[0];
}

template <typename T, typename AccessorTy>
__ESIMD_API void scalar_store(AccessorTy acc, uint32_t offset, T val) {
  scatter<T, 1, AccessorTy>(acc, simd<uint32_t, 1>(offset), simd<T, 1>(val));
}
// USM gather_rgba: for each enabled channel in RGBAMask, loads one dword per
// lane from the pixel at p + offsets[i]; the result is laid out channel-major.
template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
          int N>
__ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4,
                             simd<T, N * get_num_channels_enabled(RGBAMask)>>
gather_rgba(const T *p, simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
  addrs = addrs + offsets_i;
  return __esimd_svm_gather4_scaled<detail::__raw_t<T>, N, RGBAMask>(
      addrs.data(), mask.data());
}
// Deprecated overload kept for source compatibility.
template <typename T, int N, rgba_channel_mask RGBAMask>
__SYCL_DEPRECATED("use gather_rgba<rgba_channel_mask>()")
__ESIMD_API std::enable_if_t<
    (N == 8 || N == 16 || N == 32) && sizeof(T) == 4,
    simd<T, N * get_num_channels_enabled(RGBAMask)>>
gather_rgba(const T *p, simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
  return gather_rgba<RGBAMask>(p, offsets, mask);
}
namespace detail {
template <rgba_channel_mask M> static void validate_rgba_write_channel_mask() {
  using CM = rgba_channel_mask;
  static_assert(
      (M == CM::ABGR || M == CM::BGR || M == CM::GR || M == CM::R) &&
      "Only ABGR, BGR, GR, R channel masks are valid in write operations");
}
} // namespace detail
// USM scatter_rgba: for each enabled channel in RGBAMask, stores one dword
// per lane to the pixel at p + offsets[i]. Writes accept only the "dense"
// ABGR, BGR, GR and R masks.
template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
          int N>
__ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4>
scatter_rgba(T *p, simd<uint32_t, N> offsets,
             simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
             simd_mask<N> mask = 1) {
  detail::validate_rgba_write_channel_mask<RGBAMask>();
  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
  addrs = addrs + offsets_i;
  __esimd_svm_scatter4_scaled<detail::__raw_t<T>, N, RGBAMask>(
      addrs.data(), vals.data(), mask.data());
}
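// Usage sketch (illustrative only; `pix` stands for a hypothetical USM
// pointer to pixels stored as four dwords each): round-trip the G and R
// channels of 8 pixels; offsets step over whole pixels.
//   simd<uint32_t, 8> off(0, 4 * sizeof(float));
//   auto gr = gather_rgba<rgba_channel_mask::GR>(pix, off); // simd<float, 16>
//   scatter_rgba<rgba_channel_mask::GR>(pix, off, gr);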
// Deprecated overload kept for source compatibility.
template <typename T, int N, rgba_channel_mask RGBAMask>
__SYCL_DEPRECATED("use scatter_rgba<rgba_channel_mask>()")
__ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4>
scatter_rgba(T *p, simd<uint32_t, N> offsets,
             simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
             simd_mask<N> mask = 1) {
  scatter_rgba<RGBAMask>(p, offsets, vals, mask);
}
// Accessor-based gather_rgba.
template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
          typename AccessorT, int N,
          typename T = typename AccessorT::value_type>
__ESIMD_API std::enable_if_t<((N == 8 || N == 16 || N == 32) &&
                              sizeof(T) == 4 && !std::is_pointer_v<AccessorT>),
                             simd<T, N * get_num_channels_enabled(RGBAMask)>>
gather_rgba(AccessorT acc, simd<uint32_t, N> offsets,
            uint32_t global_offset = 0, simd_mask<N> mask = 1) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  return gather_rgba<RGBAMask>(
      __ESIMD_DNS::accessorToPointer<T>(acc, global_offset), offsets, mask);
#else
  // TODO (performance): use the hardware-supported scale once the backend
  // supports it.
  constexpr uint32_t Scale = 0;
  const auto SI = __ESIMD_GET_SURF_HANDLE(acc);
  return __esimd_gather4_masked_scaled2<detail::__raw_t<T>, N, RGBAMask,
                                        decltype(SI), Scale>(
      SI, global_offset, offsets.data(), mask.data());
#endif
}
// Accessor-based scatter_rgba.
template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
          typename AccessorT, int N,
          typename T = typename AccessorT::value_type>
__ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) &&
                             sizeof(T) == 4 && !std::is_pointer_v<AccessorT>>
scatter_rgba(AccessorT acc, simd<uint32_t, N> offsets,
             simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
             uint32_t global_offset = 0, simd_mask<N> mask = 1) {
  detail::validate_rgba_write_channel_mask<RGBAMask>();
#ifdef __ESIMD_FORCE_STATELESS_MEM
  scatter_rgba<RGBAMask>(__ESIMD_DNS::accessorToPointer<T>(acc, global_offset),
                         offsets, vals, mask);
#else
  // TODO (performance): use the hardware-supported scale once the backend
  // supports it.
  constexpr uint32_t Scale = 0;
  const auto SI = __ESIMD_GET_SURF_HANDLE(acc);
  __esimd_scatter4_scaled<T, N, decltype(SI), RGBAMask, Scale>(
      mask.data(), SI, global_offset, offsets.data(), vals.data());
#endif
}
namespace detail {
// Compile-time validation of an atomic operation's execution size, element
// type and number of source operands. Each failing branch repeats its
// condition inside a static_assert to produce a readable diagnostic.
template <atomic_op Op, typename T, int N, unsigned NumSrc>
constexpr bool check_atomic() {
  if constexpr (!detail::isPowerOf2(N, 32)) {
    static_assert((detail::isPowerOf2(N, 32)),
                  "Execution size 1, 2, 4, 8, 16, 32 are supported");
    return false;
  }

  // No source operands.
  if constexpr (Op == atomic_op::inc || Op == atomic_op::dec) {
    if constexpr (NumSrc != 0) {
      static_assert(NumSrc == 0, "No source operands are expected");
      return false;
    }
    if constexpr (!is_type<T, uint16_t, uint32_t, uint64_t>()) {
      static_assert((is_type<T, uint16_t, uint32_t, uint64_t>()),
                    "Type UW, UD or UQ is expected");
      return false;
    }
    return true;
  }

  // One source operand.
  if constexpr (Op == atomic_op::add || Op == atomic_op::sub ||
                Op == atomic_op::min || Op == atomic_op::max ||
                Op == atomic_op::xchg || Op == atomic_op::bit_and ||
                Op == atomic_op::bit_or || Op == atomic_op::bit_xor ||
                Op == atomic_op::minsint || Op == atomic_op::maxsint) {
    if constexpr (NumSrc != 1) {
      static_assert(NumSrc == 1, "One source operand is expected");
      return false;
    }
    if constexpr ((Op != atomic_op::minsint && Op != atomic_op::maxsint) &&
                  !is_type<T, uint16_t, uint32_t, uint64_t>()) {
      static_assert((is_type<T, uint16_t, uint32_t, uint64_t>()),
                    "Type UW, UD or UQ is expected");
      return false;
    }
    if constexpr ((Op == atomic_op::minsint || Op == atomic_op::maxsint) &&
                  !is_type<T, int16_t, int32_t, int64_t>()) {
      static_assert((is_type<T, int16_t, int32_t, int64_t>()),
                    "Type W, D or Q is expected");
      return false;
    }
    return true;
  }

  // One source operand, floating point.
  if constexpr (Op == atomic_op::fmax || Op == atomic_op::fmin ||
                Op == atomic_op::fadd || Op == atomic_op::fsub) {
    if constexpr (NumSrc != 1) {
      static_assert(NumSrc == 1, "One source operand is expected");
      return false;
    }
    if constexpr (!is_type<T, float, sycl::half>()) {
      static_assert((is_type<T, float, sycl::half>()),
                    "Type F or HF is expected");
      return false;
    }
    return true;
  }

  // Two source operands.
  if constexpr (Op == atomic_op::cmpxchg || Op == atomic_op::fcmpwr) {
    if constexpr (NumSrc != 2) {
      static_assert(NumSrc == 2, "Two source operands are expected");
      return false;
    }
    if constexpr (Op == atomic_op::cmpxchg &&
                  !is_type<T, uint16_t, uint32_t, uint64_t>()) {
      static_assert((is_type<T, uint16_t, uint32_t, uint64_t>()),
                    "Type UW, UD or UQ is expected");
      return false;
    }
    if constexpr (Op == atomic_op::fcmpwr && !is_type<T, float, sycl::half>()) {
      static_assert((is_type<T, float, sycl::half>()),
                    "Type F or HF is expected");
      return false;
    }
    return true;
  }
  return false;
}
} // namespace detail
// USM atomic update, no source operands (inc/dec). Returns the old values.
template <atomic_op Op, typename Tx, int N, class T = detail::__raw_t<Tx>>
__ESIMD_API std::enable_if_t<detail::check_atomic<Op, Tx, N, 0>(), simd<Tx, N>>
atomic_update(Tx *p, simd<unsigned, N> offset, simd_mask<N> mask) {
  simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
  simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
  vAddr += offset_i1;
  return __esimd_svm_atomic0<Op, T, N>(vAddr.data(), mask.data());
}
// USM atomic update with one source operand (add, min, xchg, ...).
template <atomic_op Op, typename Tx, int N, class T = detail::__raw_t<Tx>>
__ESIMD_API std::enable_if_t<detail::check_atomic<Op, Tx, N, 1>(), simd<Tx, N>>
atomic_update(Tx *p, simd<unsigned, N> offset, simd<Tx, N> src0,
              simd_mask<N> mask) {
  simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
  simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
  vAddr += offset_i1;
  return __esimd_svm_atomic1<Op, T, N>(vAddr.data(), src0.data(), mask.data());
}
// USM atomic update with two source operands (cmpxchg/fcmpwr).
template <atomic_op Op, typename Tx, int N, class T = detail::__raw_t<Tx>>
__ESIMD_API std::enable_if_t<detail::check_atomic<Op, Tx, N, 2>(), simd<Tx, N>>
atomic_update(Tx *p, simd<unsigned, N> offset, simd<Tx, N> src0,
              simd<Tx, N> src1, simd_mask<N> mask) {
  simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
  simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
  vAddr += offset_i1;
  return __esimd_svm_atomic2<Op, T, N>(vAddr.data(), src0.data(), src1.data(),
                                       mask.data());
}
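// Usage sketch (illustrative only; `ctr` stands for a hypothetical USM
// pointer to 8 uint32_t counters):
//   simd<unsigned, 8> off(0, sizeof(uint32_t));
//   simd_mask<8> pred = 1;
//   auto old0 = atomic_update<atomic_op::inc>(ctr, off, pred);
//   simd<uint32_t, 8> delta(5);
//   auto old1 = atomic_update<atomic_op::add>(ctr, off, delta, pred);
//   auto old2 = atomic_update<atomic_op::cmpxchg>(ctr, off, delta, old1, pred);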
template <uint8_t cntl> __ESIMD_API void fence() { __esimd_fence(cntl); }
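// Usage sketch (illustrative only; assumes the fence_mask bit constants
// defined elsewhere in the ESIMD headers, e.g. global_coherent_fence):
//   fence<fence_mask::global_coherent_fence>();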
template <uint32_t SLMSize> __ESIMD_API void slm_init() {
  __esimd_slm_init(SLMSize);
}

__ESIMD_API void slm_init(uint32_t size) { __esimd_slm_init(size); }
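// Usage sketch (illustrative only): reserve 4 KB of SLM per work-group at
// kernel start, before any slm_* access below is issued.
//   slm_init<4096>(); // or, with a run-time size: slm_init(4096);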
// SLM gather: loads N elements of type T from shared local memory at the
// given byte offsets.
template <typename T, int N>
__ESIMD_API std::enable_if_t<(N == 1 || N == 8 || N == 16 || N == 32) &&
                                 (sizeof(T) <= 4),
                             simd<T, N>>
slm_gather(simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
  detail::LocalAccessorMarker acc;
  return detail::gather_impl<T, N>(acc, offsets, 0, mask);
}
// SLM scatter: stores N elements of type T to shared local memory at the
// given byte offsets.
template <typename T, int N>
__ESIMD_API std::enable_if_t<(N == 1 || N == 8 || N == 16 || N == 32) &&
                             (sizeof(T) <= 4)>
slm_scatter(simd<uint32_t, N> offsets, simd<T, N> vals, simd_mask<N> mask = 1) {
  detail::LocalAccessorMarker acc;
  detail::scatter_impl<T, N>(acc, vals, offsets, 0, mask);
}
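// Usage sketch (illustrative only): stage 16 floats in SLM and read them
// back.
//   simd<uint32_t, 16> off(0, sizeof(float));
//   simd<float, 16> vals(1.0f);
//   slm_scatter<float, 16>(off, vals);
//   simd<float, 16> back = slm_gather<float, 16>(off);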
// Scalar convenience wrapper over a one-lane SLM gather.
template <typename T>
__ESIMD_API T slm_scalar_load(uint32_t offset) {
  const simd<T, 1> Res = slm_gather<T, 1>(simd<uint32_t, 1>(offset));
  return Res[0];
}
// SLM gather_rgba: gathers the enabled channels of N pixels from SLM.
template <typename T, int N, rgba_channel_mask RGBAMask>
__ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && (sizeof(T) == 4),
                             simd<T, N * get_num_channels_enabled(RGBAMask)>>
slm_gather_rgba(simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
  const auto SI = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker());
  return __esimd_gather4_masked_scaled2<T, N, RGBAMask>(
      SI, 0 /*global_offset*/, offsets.data(), mask.data());
}
// SLM scatter_rgba: scatters the enabled channels of N pixels to SLM.
template <typename T, int N, rgba_channel_mask Mask>
__ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && (sizeof(T) == 4)>
slm_scatter_rgba(simd<uint32_t, N> offsets,
                 simd<T, N * get_num_channels_enabled(Mask)> vals,
                 simd_mask<N> mask = 1) {
  detail::validate_rgba_write_channel_mask<Mask>();
  const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker());
  constexpr int16_t Scale = 0;
  constexpr int global_offset = 0;
  __esimd_scatter4_scaled<T, N, decltype(si), Mask, Scale>(
      mask.data(), si, global_offset, offsets.data(), vals.data());
}
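// Usage sketch (illustrative only): round-trip all four channels of 8 SLM
// pixels stored as four dwords each.
//   simd<uint32_t, 8> off(0, 4 * sizeof(int));
//   auto px = slm_gather_rgba<int, 8, rgba_channel_mask::ABGR>(off);
//   slm_scatter_rgba<int, 8, rgba_channel_mask::ABGR>(off, px);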
// SLM block load; unlike the global-memory variant, blocks of up to
// 16 owords are supported.
template <typename T, int N>
__ESIMD_API simd<T, N> slm_block_load(uint32_t offset) {
  constexpr unsigned Sz = sizeof(T) * N;
  static_assert(Sz >= detail::OperandSize::OWORD,
                "block size must be at least 1 oword");
  static_assert(Sz % detail::OperandSize::OWORD == 0,
                "block size must be whole number of owords");
  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
                "block must be 1, 2, 4, 8 or 16 owords long");
  static_assert(Sz <= 16 * detail::OperandSize::OWORD,
                "block size must be at most 16 owords");

  const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker());
  return __esimd_oword_ld<detail::__raw_t<T>, N>(si, offset >> 4);
}
// SLM block store; store blocks are limited to 8 owords.
template <typename T, int N>
__ESIMD_API void slm_block_store(uint32_t offset, simd<T, N> vals) {
  constexpr unsigned Sz = sizeof(T) * N;
  static_assert(Sz >= detail::OperandSize::OWORD,
                "block size must be at least 1 oword");
  static_assert(Sz % detail::OperandSize::OWORD == 0,
                "block size must be whole number of owords");
  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
                "block must be 1, 2, 4 or 8 owords long");
  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
                "block size must be at most 8 owords");

  const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker());
  __esimd_oword_st<detail::__raw_t<T>, N>(si, offset >> 4, vals.data());
}
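// Usage sketch (illustrative only): copy an 8-oword block within SLM.
//   simd<float, 32> blk = slm_block_load<float, 32>(0); // bytes [0, 128)
//   slm_block_store<float, 32>(128, blk);               // bytes [128, 256)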
// SLM atomic update, no source operands (inc/dec).
template <atomic_op Op, typename Tx, int N, class T = detail::__raw_t<Tx>>
__ESIMD_API std::enable_if_t<detail::check_atomic<Op, T, N, 0>(), simd<Tx, N>>
slm_atomic_update(simd<uint32_t, N> offsets, simd_mask<N> mask) {
  const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker());
  return __esimd_dword_atomic0<Op, T, N>(mask.data(), si, offsets.data());
}

// SLM atomic update with one source operand.
template <atomic_op Op, typename Tx, int N, class T = detail::__raw_t<Tx>>
__ESIMD_API std::enable_if_t<detail::check_atomic<Op, T, N, 1>(), simd<Tx, N>>
slm_atomic_update(simd<uint32_t, N> offsets, simd<Tx, N> src0,
                  simd_mask<N> mask) {
  const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker());
  return __esimd_dword_atomic1<Op, T, N>(mask.data(), si, offsets.data(),
                                         src0.data());
}

// SLM atomic update with two source operands (cmpxchg/fcmpwr).
template <atomic_op Op, typename Tx, int N, class T = detail::__raw_t<Tx>>
__ESIMD_API std::enable_if_t<detail::check_atomic<Op, T, N, 2>(), simd<Tx, N>>
slm_atomic_update(simd<uint32_t, N> offsets, simd<Tx, N> src0,
                  simd<Tx, N> src1, simd_mask<N> mask) {
  const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker());
  return __esimd_dword_atomic2<Op, T, N>(mask.data(), si, offsets.data(),
                                         src0.data(), src1.data());
}
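// Usage sketch (illustrative only): bump 8 per-lane histogram bins in SLM.
//   simd<uint32_t, 8> bin_off(0, sizeof(uint32_t));
//   simd_mask<8> pred = 1;
//   auto old = slm_atomic_update<atomic_op::inc, uint32_t>(bin_off, pred);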
#ifndef __ESIMD_FORCE_STATELESS_MEM
// Media block load: reads an m-row by N-element 2D block from an
// image-backed accessor at block position (x, y). Widths below the
// hardware minimum are rounded up and the extra columns discarded.
template <typename T, int m, int N, typename AccessorTy, unsigned plane = 0>
__ESIMD_API simd<T, m * N> media_block_load(AccessorTy acc, unsigned x,
                                            unsigned y) {
  constexpr unsigned Width = N * sizeof(T);
  static_assert(Width * m <= 256u,
                "data does not fit into a single dataport transaction");
  static_assert(Width <= 64u, "valid block width is in range [1, 64]");
  static_assert(m <= 64u, "valid block height is in range [1, 64]");
  static_assert(plane <= 3u, "valid plane index is in range [0, 3]");

  const auto si = __ESIMD_GET_SURF_HANDLE(acc);
  using SurfIndTy = decltype(si);
  constexpr unsigned int RoundedWidth =
      Width < 4 ? 4 : detail::getNextPowerOf2<Width>();
  constexpr int BlockWidth = sizeof(T) * N;
  constexpr int Mod = 0;

  if constexpr (Width < RoundedWidth) {
    constexpr unsigned int n1 = RoundedWidth / sizeof(T);
    simd<T, m * n1> temp =
        __esimd_media_ld<T, m, n1, Mod, SurfIndTy, (int)plane, BlockWidth>(
            si, x, y);
    return temp.template select<m, 1, N, 1>(0, 0);
  } else {
    return __esimd_media_ld<T, m, N, Mod, SurfIndTy, (int)plane, BlockWidth>(
        si, x, y);
  }
}
// Media block store: writes an m-row by N-element 2D block to an
// image-backed accessor at block position (x, y).
template <typename T, int m, int N, typename AccessorTy, unsigned plane = 0>
__ESIMD_API void media_block_store(AccessorTy acc, unsigned x, unsigned y,
                                   simd<T, m * N> vals) {
  constexpr unsigned Width = N * sizeof(T);
  static_assert(Width * m <= 256u,
                "data does not fit into a single dataport transaction");
  static_assert(Width <= 64u, "valid block width is in range [1, 64]");
  static_assert(m <= 64u, "valid block height is in range [1, 64]");
  static_assert(plane <= 3u, "valid plane index is in range [0, 3]");

  const auto si = __ESIMD_GET_SURF_HANDLE(acc);
  using SurfIndTy = decltype(si);
  constexpr unsigned int RoundedWidth =
      Width < 4 ? 4 : detail::getNextPowerOf2<Width>();
  constexpr unsigned int n1 = RoundedWidth / sizeof(T);
  constexpr int BlockWidth = sizeof(T) * N;
  constexpr int Mod = 0;

  if constexpr (Width < RoundedWidth) {
    simd<T, m * n1> temp;
    auto temp_ref = temp.template bit_cast_view<T, m, n1>();
    auto vals_ref = vals.template bit_cast_view<T, m, N>();
    temp_ref.template select<m, 1, N, 1>() = vals_ref;
    __esimd_media_st<T, m, n1, Mod, SurfIndTy, plane, BlockWidth>(si, x, y,
                                                                  temp.data());
  } else {
    __esimd_media_st<T, m, N, Mod, SurfIndTy, plane, BlockWidth>(si, x, y,
                                                                 vals.data());
  }
}
#endif // !__ESIMD_FORCE_STATELESS_MEM
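// Usage sketch (illustrative only; `img` stands for a hypothetical 2D image
// accessor, and `x` is assumed to be a byte offset within a row): copy an
// 8-row by 8-float tile 64 bytes to the right.
//   simd<float, 8 * 8> tile = media_block_load<float, 8, 8>(img, 0, 0);
//   media_block_store<float, 8, 8>(img, 64, 0, tile);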
#undef __ESIMD_GET_SURF_HANDLE
// simd_obj_impl::copy_from (USM): reads the object from memory, using block
// loads when alignment and size allow, and falling back to chunked gathers.
template <typename T, int N, class T1, class SFINAE>
template <typename Flags, int ChunkSize, typename>
void simd_obj_impl<T, N, T1, SFINAE>::copy_from(
    const simd_obj_impl::element_type *Addr, Flags) SYCL_ESIMD_FUNCTION {
  using UT = simd_obj_impl::element_type;
  constexpr unsigned Size = sizeof(T) * N;
  constexpr unsigned Align = Flags::template alignment<T1>;

  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
  constexpr unsigned NumBlocks = Size / BlockSize;
  constexpr unsigned RemSize = Size % BlockSize;

  if constexpr (Align >= OperandSize::DWORD && Size % OperandSize::OWORD == 0 &&
                detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
    if constexpr (NumBlocks > 0) {
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      ForHelper<NumBlocks>::unroll([BlockN, Addr, this](unsigned Block) {
        select<BlockN, 1>(Block * BlockN) =
            block_load<UT, BlockN, Flags>(Addr + (Block * BlockN), Flags{});
      });
    }
    if constexpr (RemSize > 0) {
      constexpr unsigned RemN = RemSize / sizeof(T);
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      select<RemN, 1>(NumBlocks * BlockN) =
          block_load<UT, RemN, Flags>(Addr + (NumBlocks * BlockN), Flags{});
    }
  } else if constexpr (sizeof(T) == 8) {
    // 64-bit elements are copied through a twice-as-long 32-bit view.
    simd<int32_t, N * 2> BC(reinterpret_cast<const int32_t *>(Addr), Flags{});
    bit_cast_view<int32_t>() = BC;
  } else {
    constexpr unsigned NumChunks = N / ChunkSize;
    if constexpr (NumChunks > 0) {
      simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
      ForHelper<NumChunks>::unroll([Addr, &Offsets, this](unsigned Block) {
        select<ChunkSize, 1>(Block * ChunkSize) =
            gather<UT, ChunkSize>(Addr + (Block * ChunkSize), Offsets);
      });
    }
    constexpr unsigned RemN = N % ChunkSize;
    if constexpr (RemN > 0) {
      if constexpr (RemN == 1) {
        select<1, 1>(NumChunks * ChunkSize) = Addr[NumChunks * ChunkSize];
      } else if constexpr (RemN == 8 || RemN == 16) {
        simd<uint32_t, RemN> Offsets(0u, sizeof(T));
        select<RemN, 1>(NumChunks * ChunkSize) =
            gather<UT, RemN>(Addr + (NumChunks * ChunkSize), Offsets);
      } else {
        // Round the remainder up to a valid gather size and mask off the
        // extra lanes.
        constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
        simd_mask_type<N1> Pred(0);
        Pred.template select<RemN, 1>() = 1;
        simd<uint32_t, N1> Offsets(0u, sizeof(T));
        simd<UT, N1> Vals =
            gather<UT, N1>(Addr + (NumChunks * ChunkSize), Offsets, Pred);
        select<RemN, 1>(NumChunks * ChunkSize) =
            Vals.template select<RemN, 1>();
      }
    }
  }
}
// simd_obj_impl::copy_from (accessor): same block/gather strategy, but
// addressing memory through an accessor and a byte offset.
template <typename T, int N, class T1, class SFINAE>
template <typename AccessorT, typename Flags, int ChunkSize, typename>
ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_read,
                              sycl::access::target::device, void>
simd_obj_impl<T, N, T1, SFINAE>::copy_from(AccessorT acc, uint32_t offset,
                                           Flags) SYCL_ESIMD_FUNCTION {
  using UT = simd_obj_impl::element_type;
  static_assert(sizeof(UT) == sizeof(T));
  constexpr unsigned Size = sizeof(T) * N;
  constexpr unsigned Align = Flags::template alignment<T1>;

  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
  constexpr unsigned NumBlocks = Size / BlockSize;
  constexpr unsigned RemSize = Size % BlockSize;

  if constexpr (Align >= OperandSize::DWORD && Size % OperandSize::OWORD == 0 &&
                detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
    if constexpr (NumBlocks > 0) {
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      ForHelper<NumBlocks>::unroll([BlockN, acc, offset, this](unsigned Block) {
        select<BlockN, 1>(Block * BlockN) =
            block_load<UT, BlockN, AccessorT, Flags>(
                acc, offset + (Block * BlockSize), Flags{});
      });
    }
    if constexpr (RemSize > 0) {
      constexpr unsigned RemN = RemSize / sizeof(T);
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      select<RemN, 1>(NumBlocks * BlockN) =
          block_load<UT, RemN, AccessorT, Flags>(
              acc, offset + (NumBlocks * BlockSize), Flags{});
    }
  } else if constexpr (sizeof(T) == 8) {
    // 64-bit elements are copied through a twice-as-long 32-bit view.
    simd<int32_t, N * 2> BC(acc, offset, Flags{});
    bit_cast_view<int32_t>() = BC;
  } else {
    constexpr unsigned NumChunks = N / ChunkSize;
    if constexpr (NumChunks > 0) {
      simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
      ForHelper<NumChunks>::unroll(
          [acc, offset, &Offsets, this](unsigned Block) {
            select<ChunkSize, 1>(Block * ChunkSize) =
                gather<UT, ChunkSize, AccessorT>(
                    acc, Offsets, offset + (Block * ChunkSize * sizeof(T)));
          });
    }
    constexpr unsigned RemN = N % ChunkSize;
    if constexpr (RemN > 0) {
      if constexpr (RemN == 1 || RemN == 8 || RemN == 16) {
        simd<uint32_t, RemN> Offsets(0u, sizeof(T));
        select<RemN, 1>(NumChunks * ChunkSize) = gather<UT, RemN, AccessorT>(
            acc, Offsets, offset + (NumChunks * ChunkSize * sizeof(T)));
      } else {
        // Round the remainder up to a valid gather size and mask off the
        // extra lanes.
        constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
        simd_mask_type<N1> Pred(0);
        Pred.template select<RemN, 1>() = 1;
        simd<uint32_t, N1> Offsets(0u, sizeof(T));
        simd<UT, N1> Vals = gather<UT, N1, AccessorT>(
            acc, Offsets, offset + (NumChunks * ChunkSize * sizeof(T)), Pred);
        select<RemN, 1>(NumChunks * ChunkSize) =
            Vals.template select<RemN, 1>();
      }
    }
  }
}
// simd_obj_impl::copy_to (USM): writes the object to memory, using block
// stores when alignment and size allow, and falling back to chunked scatters.
template <typename T, int N, class T1, class SFINAE>
template <typename Flags, int ChunkSize, typename>
void simd_obj_impl<T, N, T1, SFINAE>::copy_to(
    simd_obj_impl::element_type *Addr, Flags) const SYCL_ESIMD_FUNCTION {
  using UT = simd_obj_impl::element_type;
  constexpr unsigned Size = sizeof(T) * N;
  constexpr unsigned Align = Flags::template alignment<T1>;

  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
  constexpr unsigned NumBlocks = Size / BlockSize;
  constexpr unsigned RemSize = Size % BlockSize;

  simd<UT, N> Tmp{data()};
  if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 &&
                detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
    if constexpr (NumBlocks > 0) {
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      ForHelper<NumBlocks>::unroll([BlockN, Addr, &Tmp](unsigned Block) {
        block_store<UT, BlockN>(Addr + (Block * BlockN),
                                Tmp.template select<BlockN, 1>(Block * BlockN));
      });
    }
    if constexpr (RemSize > 0) {
      constexpr unsigned RemN = RemSize / sizeof(T);
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      block_store<UT, RemN>(Addr + (NumBlocks * BlockN),
                            Tmp.template select<RemN, 1>(NumBlocks * BlockN));
    }
  } else if constexpr (sizeof(T) == 8) {
    // 64-bit elements are copied through a twice-as-long 32-bit view.
    simd<int32_t, N * 2> BC = Tmp.template bit_cast_view<int32_t>();
    BC.copy_to(reinterpret_cast<int32_t *>(Addr), Flags{});
  } else {
    constexpr unsigned NumChunks = N / ChunkSize;
    if constexpr (NumChunks > 0) {
      simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
      ForHelper<NumChunks>::unroll([Addr, &Offsets, &Tmp](unsigned Block) {
        scatter<UT, ChunkSize>(
            Addr + (Block * ChunkSize), Offsets,
            Tmp.template select<ChunkSize, 1>(Block * ChunkSize));
      });
    }
    constexpr unsigned RemN = N % ChunkSize;
    if constexpr (RemN > 0) {
      if constexpr (RemN == 1) {
        Addr[NumChunks * ChunkSize] = Tmp[NumChunks * ChunkSize];
      } else if constexpr (RemN == 8 || RemN == 16) {
        simd<uint32_t, RemN> Offsets(0u, sizeof(T));
        scatter<UT, RemN>(Addr + (NumChunks * ChunkSize), Offsets,
                          Tmp.template select<RemN, 1>(NumChunks * ChunkSize));
      } else {
        // Round the remainder up to a valid scatter size and mask off the
        // extra lanes.
        constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
        simd_mask_type<N1> Pred(0);
        Pred.template select<RemN, 1>() = 1;
        simd<UT, N1> Vals;
        Vals.template select<RemN, 1>() =
            Tmp.template select<RemN, 1>(NumChunks * ChunkSize);
        simd<uint32_t, N1> Offsets(0u, sizeof(T));
        scatter<UT, N1>(Addr + (NumChunks * ChunkSize), Offsets, Vals, Pred);
      }
    }
  }
}
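// Usage sketch (illustrative only; `src` and `dst` stand for hypothetical
// USM pointers to at least 64 floats): round-trip a simd object through
// memory; the alignment tag lets copy_from/copy_to take the block path.
//   simd<float, 64> v;
//   v.copy_from(src, vector_aligned);
//   v *= 2.0f;
//   v.copy_to(dst, vector_aligned);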
// simd_obj_impl::copy_to (accessor): same block/scatter strategy, but
// addressing memory through an accessor and a byte offset.
template <typename T, int N, class T1, class SFINAE>
template <typename AccessorT, typename Flags, int ChunkSize, typename>
ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_write,
                              sycl::access::target::device, void>
simd_obj_impl<T, N, T1, SFINAE>::copy_to(AccessorT acc, uint32_t offset,
                                         Flags) const SYCL_ESIMD_FUNCTION {
  using UT = simd_obj_impl::element_type;
  constexpr unsigned Size = sizeof(T) * N;
  constexpr unsigned Align = Flags::template alignment<T1>;

  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
  constexpr unsigned NumBlocks = Size / BlockSize;
  constexpr unsigned RemSize = Size % BlockSize;

  simd<UT, N> Tmp{data()};
  if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 &&
                detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
    if constexpr (NumBlocks > 0) {
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      ForHelper<NumBlocks>::unroll([BlockN, acc, offset, &Tmp](unsigned Block) {
        block_store<UT, BlockN, AccessorT>(
            acc, offset + (Block * BlockSize),
            Tmp.template select<BlockN, 1>(Block * BlockN));
      });
    }
    if constexpr (RemSize > 0) {
      constexpr unsigned RemN = RemSize / sizeof(T);
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      block_store<UT, RemN, AccessorT>(
          acc, offset + (NumBlocks * BlockSize),
          Tmp.template select<RemN, 1>(NumBlocks * BlockN));
    }
  } else if constexpr (sizeof(T) == 8) {
    // 64-bit elements are copied through a twice-as-long 32-bit view.
    simd<int32_t, N * 2> BC = Tmp.template bit_cast_view<int32_t>();
    BC.copy_to(acc, offset, Flags{});
  } else {
    constexpr unsigned NumChunks = N / ChunkSize;
    if constexpr (NumChunks > 0) {
      simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
      ForHelper<NumChunks>::unroll([acc, offset, &Offsets,
                                    &Tmp](unsigned Block) {
        scatter<UT, ChunkSize, AccessorT>(
            acc, Offsets, Tmp.template select<ChunkSize, 1>(Block * ChunkSize),
            offset + (Block * ChunkSize * sizeof(T)));
      });
    }
    constexpr unsigned RemN = N % ChunkSize;
    if constexpr (RemN > 0) {
      if constexpr (RemN == 1 || RemN == 8 || RemN == 16) {
        simd<uint32_t, RemN> Offsets(0u, sizeof(T));
        scatter<UT, RemN, AccessorT>(
            acc, Offsets, Tmp.template select<RemN, 1>(NumChunks * ChunkSize),
            offset + (NumChunks * ChunkSize * sizeof(T)));
      } else {
        // Round the remainder up to a valid scatter size and mask off the
        // extra lanes.
        constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
        simd_mask_type<N1> Pred(0);
        Pred.template select<RemN, 1>() = 1;
        simd<UT, N1> Vals;
        Vals.template select<RemN, 1>() =
            Tmp.template select<RemN, 1>(NumChunks * ChunkSize);
        simd<uint32_t, N1> Offsets(0u, sizeof(T));
        scatter<UT, N1, AccessorT>(acc, Offsets, Vals,
                                   offset + (NumChunks * ChunkSize * sizeof(T)),
                                   Pred);
      }
    }
  }
}