namespace ext::intel::esimd {

namespace detail {
struct LocalAccessorMarker {};
} // namespace detail

/// Get surface index corresponding to a SYCL accessor.
template <typename AccessorTy>
__ESIMD_API SurfaceIndex get_surface_index(AccessorTy acc) {
  if constexpr (std::is_same_v<detail::LocalAccessorMarker, AccessorTy>) {
    return detail::SLM_BTI;
  } else {
    return __esimd_get_surface_index(
        detail::AccessorPrivateProxy::getNativeImageObj(acc));
  }
}
/// USM-pointer gather: loads up to N elements from (p + offsets[i]) under mask.
template <typename Tx, int N, typename Toffset>
__ESIMD_API simd<Tx, N> gather(const Tx *p, simd<Toffset, N> offsets,
                               simd_mask<N> mask = 1) {
  using T = detail::__raw_t<Tx>;
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
  addrs = addrs + offsets_i;

  // 1- and 2-byte elements are gathered with a wider per-address stride and
  // then compacted with a region read.
  if constexpr (sizeof(T) == 1) {
    auto Ret = __esimd_svm_gather<T, N, detail::ElemsPerAddrEncoding<4>(),
                                  detail::ElemsPerAddrEncoding<1>()>(
        addrs.data(), mask.data());
    return __esimd_rdregion<T, N * 4, N, 0, N, 4>(Ret, 0);
  } else if constexpr (sizeof(T) == 2) {
    auto Ret = __esimd_svm_gather<T, N, detail::ElemsPerAddrEncoding<2>(),
                                  detail::ElemsPerAddrEncoding<2>()>(
        addrs.data(), mask.data());
    return __esimd_rdregion<T, N * 2, N, 0, N, 2>(Ret, 0);
  } else {
    return __esimd_svm_gather<T, N, detail::ElemsPerAddrEncoding<1>(),
                              detail::ElemsPerAddrEncoding<1>()>(addrs.data(),
                                                                 mask.data());
  }
}
/// Overload that accepts offsets as a simd_view.
template <typename Tx, int N, typename Toffset,
          typename RegionTy = region1d_t<Toffset, N, 1>>
__ESIMD_API simd<Tx, N> gather(const Tx *p,
                               simd_view<Toffset, RegionTy> offsets,
                               simd_mask<N> mask = 1) {
  using T = detail::__raw_t<Tx>;
  using Ty = typename RegionTy::element_type;
  return gather<Tx, N>(p, simd<Ty, N>(offsets), mask);
}
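// Usage sketch (illustrative, not from the original header): gather 16 floats
// at a 4-byte stride from a USM pointer `p`, assumed to point at valid device
// memory, with all lanes enabled:
//
//   simd<uint32_t, 16> byte_offsets(0, sizeof(float)); // 0, 4, 8, ..., 60
//   simd<float, 16> v = gather<float, 16>(p, byte_offsets);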
/// USM-pointer scatter: stores up to N elements to (p + offsets[i]) under mask.
template <typename Tx, int N, typename Toffset>
__ESIMD_API void scatter(Tx *p, simd<Toffset, N> offsets, simd<Tx, N> vals,
                         simd_mask<N> mask = 1) {
  using T = detail::__raw_t<Tx>;
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
  addrs = addrs + offsets_i;
  // 1- and 2-byte elements are spread into a wider payload before scattering.
  if constexpr (sizeof(T) == 1) {
    simd<T, N * 4> D;
    D = __esimd_wrregion<T, N * 4, N, 0, N, 4>(D.data(), vals.data(), 0);
    __esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<4>(),
                        detail::ElemsPerAddrEncoding<1>()>(
        addrs.data(), D.data(), mask.data());
  } else if constexpr (sizeof(T) == 2) {
    simd<T, N * 2> D;
    D = __esimd_wrregion<T, N * 2, N, 0, N, 2>(D.data(), vals.data(), 0);
    __esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<2>(),
                        detail::ElemsPerAddrEncoding<2>()>(
        addrs.data(), D.data(), mask.data());
  } else {
    __esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<1>(),
                        detail::ElemsPerAddrEncoding<1>()>(
        addrs.data(), vals.data(), mask.data());
  }
}
/// Overload that accepts offsets as a simd_view.
template <typename Tx, int N, typename Toffset,
          typename RegionTy = region1d_t<Toffset, N, 1>>
__ESIMD_API void scatter(Tx *p, simd_view<Toffset, RegionTy> offsets,
                         simd<Tx, N> vals, simd_mask<N> mask = 1) {
  using T = detail::__raw_t<Tx>;
  using Ty = typename RegionTy::element_type;
  scatter<Tx, N>(p, simd<Ty, N>(offsets), vals, mask);
}
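// Usage sketch (illustrative, not from the original header): masked scatter of
// 8 ints to every other element of a USM buffer `p`:
//
//   simd<int, 8> vals(0, 1);                            // 0, 1, ..., 7
//   simd<uint32_t, 8> byte_offsets(0, 2 * sizeof(int)); // stride 8 bytes
//   simd_mask<8> m = 1;                                 // all lanes on
//   scatter(p, byte_offsets, vals, m);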
/// Loads a contiguous block of memory from the given USM pointer.
template <typename Tx, int N, typename Flags = vector_aligned_tag,
          class T = detail::__raw_t<Tx>,
          typename = std::enable_if_t<is_simd_flag_type_v<Flags>>>
__ESIMD_API simd<Tx, N> block_load(const Tx *addr, Flags = {}) {
  constexpr unsigned Sz = sizeof(T) * N;
  static_assert(Sz >= detail::OperandSize::OWORD,
                "block size must be at least 1 oword");
  static_assert(Sz % detail::OperandSize::OWORD == 0,
                "block size must be whole number of owords");
  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
                "block must be 1, 2, 4 or 8 owords long");
  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
                "block size must be at most 8 owords");

  uintptr_t Addr = reinterpret_cast<uintptr_t>(addr);
  if constexpr (Flags::template alignment<simd<T, N>> >=
                detail::OperandSize::OWORD) {
    return __esimd_svm_block_ld<T, N>(Addr);
  } else {
    return __esimd_svm_block_ld_unaligned<T, N>(Addr);
  }
}
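// Usage sketch (illustrative, not from the original header): the alignment
// flag selects the aligned vs. unaligned load path above. `p` is an assumed
// float* in USM:
//
//   simd<float, 16> a = block_load<float, 16>(p); // default: vector-aligned
//   simd<float, 16> b = block_load<float, 16>(p + 1, element_aligned);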
/// Loads a contiguous block of memory from the given accessor and byte offset.
template <typename Tx, int N, typename AccessorTy,
          typename Flags = vector_aligned_tag,
          typename = std::enable_if_t<is_simd_flag_type_v<Flags>>,
          class T = detail::__raw_t<Tx>>
__ESIMD_API simd<Tx, N> block_load(AccessorTy acc, uint32_t offset,
                                   Flags = {}) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  return block_load<Tx, N>(__ESIMD_DNS::accessorToPointer<Tx>(acc, offset));
#else
  constexpr unsigned Sz = sizeof(T) * N;
  static_assert(Sz >= detail::OperandSize::OWORD,
                "block size must be at least 1 oword");
  static_assert(Sz % detail::OperandSize::OWORD == 0,
                "block size must be whole number of owords");
  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
                "block must be 1, 2, 4 or 8 owords long");
  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
                "block size must be at most 8 owords");

  auto surf_ind = __esimd_get_surface_index(
      detail::AccessorPrivateProxy::getNativeImageObj(acc));

  if constexpr (Flags::template alignment<simd<T, N>> >=
                detail::OperandSize::OWORD) {
    return __esimd_oword_ld<T, N>(surf_ind, offset >> 4);
  } else {
    return __esimd_oword_ld_unaligned<T, N>(surf_ind, offset);
  }
#endif
}
/// Stores elements of a vector to a contiguous block of USM memory.
template <typename Tx, int N, class T = detail::__raw_t<Tx>>
__ESIMD_API void block_store(Tx *p, simd<Tx, N> vals) {
  constexpr unsigned Sz = sizeof(T) * N;
  static_assert(Sz >= detail::OperandSize::OWORD,
                "block size must be at least 1 oword");
  static_assert(Sz % detail::OperandSize::OWORD == 0,
                "block size must be whole number of owords");
  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
                "block must be 1, 2, 4 or 8 owords long");
  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
                "block size must be at most 8 owords");

  uintptr_t Addr = reinterpret_cast<uintptr_t>(p);
  __esimd_svm_block_st<T, N>(Addr, vals.data());
}
/// Stores elements of a vector to a contiguous block of memory behind an
/// accessor, at the given byte offset.
template <typename Tx, int N, typename AccessorTy,
          class T = detail::__raw_t<Tx>>
__ESIMD_API void block_store(AccessorTy acc, uint32_t offset,
                             simd<Tx, N> vals) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  block_store<Tx, N>(__ESIMD_DNS::accessorToPointer<Tx>(acc, offset), vals);
#else
  constexpr unsigned Sz = sizeof(T) * N;
  static_assert(Sz >= detail::OperandSize::OWORD,
                "block size must be at least 1 oword");
  static_assert(Sz % detail::OperandSize::OWORD == 0,
                "block size must be whole number of owords");
  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
                "block must be 1, 2, 4 or 8 owords long");
  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
                "block size must be at most 8 owords");

  auto surf_ind = __esimd_get_surface_index(
      detail::AccessorPrivateProxy::getNativeImageObj(acc));
  __esimd_oword_st<T, N>(surf_ind, offset >> 4, vals.data());
#endif
}
namespace detail {

template <typename T, int N, typename AccessorTy>
ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t<
    (sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16 || N == 32) &&
    !std::is_pointer<AccessorTy>::value>
scatter_impl(AccessorTy acc, simd<T, N> vals, simd<uint32_t, N> offsets,
             uint32_t glob_offset, simd_mask<N> mask) {
  constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
  constexpr int16_t scale = 0;
  const auto si = __ESIMD_NS::get_surface_index(acc);

  if constexpr (sizeof(T) < 4) {
    // 1- and 2-byte values are promoted to 32-bit before scattering.
    using Tint = std::conditional_t<std::is_integral_v<T>, T,
                                    detail::uint_type_t<sizeof(T)>>;
    using Treal = __raw_t<T>;
    simd<Tint, N> vals_int = bitcast<Tint, Treal, N>(std::move(vals).data());
    using PromoT = typename std::conditional_t<std::is_signed<Tint>::value,
                                               int32_t, uint32_t>;
    const simd<PromoT, N> promo_vals = convert<PromoT>(std::move(vals_int));
    __esimd_scatter_scaled<PromoT, N, decltype(si), TypeSizeLog2, scale>(
        mask.data(), si, glob_offset, offsets.data(), promo_vals.data());
  } else {
    using Treal = __raw_t<T>;
    if constexpr (!std::is_same_v<Treal, T>) {
      simd<Treal, N> Values = vals.template bit_cast_view<Treal>();
      __esimd_scatter_scaled<Treal, N, decltype(si), TypeSizeLog2, scale>(
          mask.data(), si, glob_offset, offsets.data(), Values.data());
    } else {
      __esimd_scatter_scaled<T, N, decltype(si), TypeSizeLog2, scale>(
          mask.data(), si, glob_offset, offsets.data(), vals.data());
    }
  }
}
template <typename T, int N, typename AccessorTy>
ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t<
    (sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16 || N == 32) &&
        !std::is_pointer<AccessorTy>::value,
    simd<T, N>>
gather_impl(AccessorTy acc, simd<uint32_t, N> offsets, uint32_t glob_offset,
            simd_mask<N> mask) {
  constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
  constexpr uint32_t scale = 0;
  const auto si = __ESIMD_NS::get_surface_index(acc);

  if constexpr (sizeof(T) < 4) {
    // 1- and 2-byte values are gathered as promoted 32-bit values and
    // converted back afterwards.
    using Tint = std::conditional_t<std::is_integral_v<T>, T,
                                    detail::uint_type_t<sizeof(T)>>;
    using Treal = __raw_t<T>;
    static_assert(std::is_integral<Tint>::value,
                  "only integral 1- & 2-byte types are supported");
    using PromoT = typename std::conditional_t<std::is_signed<Tint>::value,
                                               int32_t, uint32_t>;
    const simd<PromoT, N> promo_vals =
        __esimd_gather_masked_scaled2<PromoT, N, decltype(si), TypeSizeLog2,
                                      scale>(si, glob_offset, offsets.data(),
                                             mask.data());
    auto Res = convert<Tint>(promo_vals);

    if constexpr (!std::is_same_v<Tint, T>) {
      return detail::bitcast<Treal, Tint, N>(Res.data());
    } else {
      return Res;
    }
  } else {
    using Treal = __raw_t<T>;
    simd<Treal, N> Res = __esimd_gather_masked_scaled2<Treal, N, decltype(si),
                                                       TypeSizeLog2, scale>(
        si, glob_offset, offsets.data(), mask.data());
    if constexpr (!std::is_same_v<Treal, T>) {
      return Res.template bit_cast_view<T>();
    } else {
      return Res;
    }
  }
}

} // namespace detail
/// Accessor-based gather, enabled for element types of size <= 4 bytes.
template <typename T, int N, typename AccessorTy>
__ESIMD_API std::enable_if_t<
    (sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16 || N == 32) &&
        !std::is_pointer<AccessorTy>::value,
    simd<T, N>>
gather(AccessorTy acc, simd<uint32_t, N> offsets, uint32_t glob_offset = 0,
       simd_mask<N> mask = 1) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  return gather<T, N>(__ESIMD_DNS::accessorToPointer<T>(acc, glob_offset),
                      offsets, mask);
#else
  return detail::gather_impl<T, N, AccessorTy>(acc, offsets, glob_offset, mask);
#endif
}
/// Accessor-based scatter, enabled for element types of size <= 4 bytes.
template <typename T, int N, typename AccessorTy>
__ESIMD_API std::enable_if_t<
    (sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16 || N == 32) &&
    !std::is_pointer<AccessorTy>::value>
scatter(AccessorTy acc, simd<uint32_t, N> offsets, simd<T, N> vals,
        uint32_t glob_offset = 0, simd_mask<N> mask = 1) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  scatter<T, N>(__ESIMD_DNS::accessorToPointer<T>(acc, glob_offset), offsets,
                vals, mask);
#else
  detail::scatter_impl<T, N, AccessorTy>(acc, vals, offsets, glob_offset, mask);
#endif
}
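// Usage sketch (illustrative, not from the original header): `acc` is an
// assumed device accessor over a buffer of at least 16 ints. Offsets are in
// bytes, relative to glob_offset:
//
//   simd<uint32_t, 16> byte_offsets(0, sizeof(int));
//   simd<int, 16> v = gather<int, 16>(acc, byte_offsets);
//   scatter<int, 16>(acc, byte_offsets, v + 1);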
/// Load a scalar value from an accessor.
template <typename T, typename AccessorTy>
__ESIMD_API T scalar_load(AccessorTy acc, uint32_t offset) {
  const simd<T, 1> Res =
      gather<T, 1, AccessorTy>(acc, simd<uint32_t, 1>(offset));
  return Res[0];
}

/// Store a scalar value into an accessor.
template <typename T, typename AccessorTy>
__ESIMD_API void scalar_store(AccessorTy acc, uint32_t offset, T val) {
  scatter<T, 1, AccessorTy>(acc, simd<uint32_t, 1>(offset), simd<T, 1>(val));
}
/// USM-pointer gather4: loads and transposes pixels (up to 4 channels each)
/// from the given memory locations.
template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
          int N, typename Toffset>
__ESIMD_API simd<T, N * get_num_channels_enabled(RGBAMask)>
gather_rgba(const T *p, simd<Toffset, N> offsets, simd_mask<N> mask = 1) {
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
  static_assert((N == 8 || N == 16 || N == 32), "Unsupported value of N");
  static_assert(sizeof(T) == 4, "Unsupported size of type T");
  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
  addrs = addrs + offsets_i;
  return __esimd_svm_gather4_scaled<detail::__raw_t<T>, N, RGBAMask>(
      addrs.data(), mask.data());
}
/// Overload that accepts offsets as a simd_view.
template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
          int N, typename Toffset,
          typename RegionTy = region1d_t<Toffset, N, 1>>
__ESIMD_API simd<T, N * get_num_channels_enabled(RGBAMask)>
gather_rgba(const T *p, simd_view<Toffset, RegionTy> offsets,
            simd_mask<N> mask = 1) {
  using Ty = typename RegionTy::element_type;
  return gather_rgba<RGBAMask, T, N>(p, simd<Ty, N>(offsets), mask);
}
template <typename T, int N, rgba_channel_mask RGBAMask>
__SYCL_DEPRECATED("use gather_rgba<rgba_channel_mask>()")
__ESIMD_API std::enable_if_t<
    (N == 8 || N == 16 || N == 32) && sizeof(T) == 4,
    simd<T, N * get_num_channels_enabled(RGBAMask)>>
gather_rgba(const T *p, simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
  return gather_rgba<RGBAMask>(p, offsets, mask);
}
namespace detail {
template <rgba_channel_mask M>
static void validate_rgba_write_channel_mask() {
  using CM = rgba_channel_mask;
  static_assert(
      (M == CM::ABGR || M == CM::BGR || M == CM::GR || M == CM::R) &&
      "Only ABGR, BGR, GR, R channel masks are valid in write operations");
}
} // namespace detail
/// USM-pointer scatter4: transposes and stores pixels (up to 4 channels each)
/// to the given memory locations.
template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
          int N, typename Toffset>
__ESIMD_API void
scatter_rgba(T *p, simd<Toffset, N> offsets,
             simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
             simd_mask<N> mask = 1) {
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
  static_assert((N == 8 || N == 16 || N == 32), "Unsupported value of N");
  static_assert(sizeof(T) == 4, "Unsupported size of type T");
  detail::validate_rgba_write_channel_mask<RGBAMask>();
  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
  addrs = addrs + offsets_i;
  __esimd_svm_scatter4_scaled<detail::__raw_t<T>, N, RGBAMask>(
      addrs.data(), vals.data(), mask.data());
}
/// Overload that accepts offsets as a simd_view.
template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
          int N, typename Toffset,
          typename RegionTy = region1d_t<Toffset, N, 1>>
__ESIMD_API void
scatter_rgba(T *p, simd_view<Toffset, RegionTy> offsets,
             simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
             simd_mask<N> mask = 1) {
  using Ty = typename RegionTy::element_type;
  scatter_rgba<RGBAMask, T, N>(p, simd<Ty, N>(offsets), vals, mask);
}
template <typename T, int N, rgba_channel_mask RGBAMask>
__SYCL_DEPRECATED("use scatter_rgba<rgba_channel_mask>()")
__ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4>
scatter_rgba(T *p, simd<uint32_t, N> offsets,
             simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
             simd_mask<N> mask = 1) {
  scatter_rgba<RGBAMask>(p, offsets, vals, mask);
}
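// Usage sketch (illustrative, not from the original header): gather the R and
// G channels of 8 RGBA pixels with 4-byte channels from an assumed float*
// `pixels`. With channel mask GR each pixel contributes 2 channels, so the
// result holds 8 * 2 elements grouped channel-by-channel (all R values, then
// all G values):
//
//   simd<uint32_t, 8> byte_offsets(0, 4 * sizeof(float)); // one pixel stride
//   auto rg = gather_rgba<rgba_channel_mask::GR>(pixels, byte_offsets);
//   // rg is simd<float, 16>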
/// Accessor-based gather4 of RGBA pixel channels.
template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
          typename AccessorT, int N,
          typename T = typename AccessorT::value_type>
__ESIMD_API std::enable_if_t<((N == 8 || N == 16 || N == 32) &&
                              sizeof(T) == 4 && !std::is_pointer_v<AccessorT>),
                             simd<T, N * get_num_channels_enabled(RGBAMask)>>
gather_rgba(AccessorT acc, simd<uint32_t, N> offsets,
            uint32_t global_offset = 0, simd_mask<N> mask = 1) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
  return gather_rgba<RGBAMask>(
      __ESIMD_DNS::accessorToPointer<T>(acc, global_offset), offsets, mask);
#else
  constexpr uint32_t Scale = 0;
  const auto SI = get_surface_index(acc);
  return __esimd_gather4_masked_scaled2<detail::__raw_t<T>, N, RGBAMask,
                                        decltype(SI), Scale>(
      SI, global_offset, offsets.data(), mask.data());
#endif
}
/// Accessor-based scatter4 of RGBA pixel channels.
template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
          typename AccessorT, int N,
          typename T = typename AccessorT::value_type>
__ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) &&
                             sizeof(T) == 4 && !std::is_pointer_v<AccessorT>>
scatter_rgba(AccessorT acc, simd<uint32_t, N> offsets,
             simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
             uint32_t global_offset = 0, simd_mask<N> mask = 1) {
  detail::validate_rgba_write_channel_mask<RGBAMask>();
#ifdef __ESIMD_FORCE_STATELESS_MEM
  scatter_rgba<RGBAMask>(__ESIMD_DNS::accessorToPointer<T>(acc, global_offset),
                         offsets, vals, mask);
#else
  constexpr uint32_t Scale = 0;
  const auto SI = get_surface_index(acc);
  __esimd_scatter4_scaled<T, N, decltype(SI), RGBAMask, Scale>(
      mask.data(), SI, global_offset, offsets.data(), vals.data());
#endif
}
namespace detail {

/// Check the legality of an atomic call in terms of size and type.
template <__ESIMD_NS::atomic_op Op, typename T, int N, unsigned NumSrc>
constexpr void check_atomic() {
  static_assert((detail::isPowerOf2(N, 32)),
                "Execution size 1, 2, 4, 8, 16, 32 are supported");
  static_assert(NumSrc == __ESIMD_DNS::get_num_args<Op>(),
                "wrong number of operands");
  constexpr bool IsInt2BytePlus =
      std::is_integral_v<T> && (sizeof(T) >= sizeof(uint16_t));

  if constexpr (Op == __ESIMD_NS::atomic_op::xchg ||
                Op == __ESIMD_NS::atomic_op::cmpxchg ||
                Op == __ESIMD_NS::atomic_op::predec ||
                Op == __ESIMD_NS::atomic_op::inc ||
                Op == __ESIMD_NS::atomic_op::dec ||
                Op == __ESIMD_NS::atomic_op::load) {
    static_assert(IsInt2BytePlus, "Integral 16-bit or wider type is expected");
  }
  if constexpr (Op == __ESIMD_NS::atomic_op::fmax ||
                Op == __ESIMD_NS::atomic_op::fmin ||
                Op == __ESIMD_NS::atomic_op::fadd ||
                Op == __ESIMD_NS::atomic_op::fsub) {
    static_assert((is_type<T, float, sycl::half>()),
                  "Type F or HF is expected");
  }
  if constexpr (Op == __ESIMD_NS::atomic_op::add ||
                Op == __ESIMD_NS::atomic_op::sub ||
                Op == __ESIMD_NS::atomic_op::min ||
                Op == __ESIMD_NS::atomic_op::max ||
                Op == __ESIMD_NS::atomic_op::bit_and ||
                Op == __ESIMD_NS::atomic_op::bit_or ||
                Op == __ESIMD_NS::atomic_op::bit_xor ||
                Op == __ESIMD_NS::atomic_op::minsint ||
                Op == __ESIMD_NS::atomic_op::maxsint) {
    static_assert(IsInt2BytePlus, "Integral 16-bit or wider type is expected");
    constexpr bool IsSignedMinmax = (Op == __ESIMD_NS::atomic_op::minsint) ||
                                    (Op == __ESIMD_NS::atomic_op::maxsint);
    constexpr bool IsUnsignedMinmax = (Op == __ESIMD_NS::atomic_op::min) ||
                                      (Op == __ESIMD_NS::atomic_op::max);

    if constexpr (IsSignedMinmax || IsUnsignedMinmax) {
      constexpr bool SignOK = std::is_signed_v<T> == IsSignedMinmax;
      static_assert(SignOK, "Signed/unsigned integer type expected for "
                            "signed/unsigned min/max operation");
    }
  }
}

} // namespace detail
/// USM-pointer atomic update with one source operand (e.g. add, max, xchg).
template <atomic_op Op, typename Tx, int N, typename Toffset>
__ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<Toffset, N> offset,
                                      simd<Tx, N> src0, simd_mask<N> mask) {
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
  detail::check_atomic<Op, Tx, N, 1>();
  if constexpr ((Op == atomic_op::fmin) || (Op == atomic_op::fmax) ||
                (Op == atomic_op::fadd) || (Op == atomic_op::fsub)) {
    // Auto-convert FP atomics to the LSC flavor.
    return atomic_update<detail::to_lsc_atomic_op<Op>(), Tx, N>(p, offset, src0,
                                                                mask);
  } else if constexpr (Op == atomic_op::store) {
    return atomic_update<atomic_op::xchg, Tx, N>(p, offset, src0, mask);
  } else {
    simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
    simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
    vAddr += offset_i1;
    using T = typename detail::__raw_t<Tx>;
    return __esimd_svm_atomic1<Op, T, N>(vAddr.data(), src0.data(),
                                         mask.data());
  }
}
/// USM-pointer atomic update with no source operands (e.g. inc, dec, load).
template <atomic_op Op, typename Tx, int N, typename Toffset>
__ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<Toffset, N> offset,
                                      simd_mask<N> mask) {
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
  detail::check_atomic<Op, Tx, N, 0>();
  if constexpr (Op == atomic_op::load) {
    // Implement load as `bit_or` with zero.
    return atomic_update<atomic_op::bit_or, Tx, N>(p, offset, simd<Tx, N>(0),
                                                   mask);
  } else {
    simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
    simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
    vAddr += offset_i1;
    using T = typename detail::__raw_t<Tx>;
    return __esimd_svm_atomic0<Op, T, N>(vAddr.data(), mask.data());
  }
}
/// Overload that accepts offsets as a simd_view.
template <atomic_op Op, typename Tx, int N, typename Toffset,
          typename RegionTy = region1d_t<Toffset, N, 1>>
__ESIMD_API simd<Tx, N> atomic_update(Tx *p,
                                      simd_view<Toffset, RegionTy> offsets,
                                      simd_mask<N> mask = 1) {
  using Ty = typename RegionTy::element_type;
  return atomic_update<Op, Tx, N>(p, simd<Ty, N>(offsets), mask);
}
/// Overload that accepts offsets as a simd_view.
template <atomic_op Op, typename Tx, int N, typename Toffset,
          typename RegionTy = region1d_t<Toffset, N, 1>>
__ESIMD_API simd<Tx, N> atomic_update(Tx *p,
                                      simd_view<Toffset, RegionTy> offsets,
                                      simd<Tx, N> src0, simd_mask<N> mask = 1) {
  using Ty = typename RegionTy::element_type;
  return atomic_update<Op, Tx, N>(p, simd<Ty, N>(offsets), src0, mask);
}
/// USM-pointer atomic update with two source operands (cmpxchg / fcmpwr).
template <atomic_op Op, typename Tx, int N, typename Toffset>
__ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<Toffset, N> offset,
                                      simd<Tx, N> src0, simd<Tx, N> src1,
                                      simd_mask<N> mask) {
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
  detail::check_atomic<Op, Tx, N, 2>();
  if constexpr (Op == atomic_op::fcmpwr) {
    // Auto-convert FP atomics to the LSC flavor.
    return atomic_update<detail::to_lsc_atomic_op<Op>(), Tx, N>(p, offset, src0,
                                                                src1, mask);
  } else {
    simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
    simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
    vAddr += offset_i1;
    using T = typename detail::__raw_t<Tx>;
    return __esimd_svm_atomic2<Op, T, N>(vAddr.data(), src0.data(),
                                         src1.data(), mask.data());
  }
}
/// Overload that accepts offsets as a simd_view.
template <atomic_op Op, typename Tx, int N, typename Toffset,
          typename RegionTy = region1d_t<Toffset, N, 1>>
__ESIMD_API simd<Tx, N> atomic_update(Tx *p,
                                      simd_view<Toffset, RegionTy> offsets,
                                      simd<Tx, N> src0, simd<Tx, N> src1,
                                      simd_mask<N> mask = 1) {
  using Ty = typename RegionTy::element_type;
  return atomic_update<Op, Tx, N>(p, simd<Ty, N>(offsets), src0, src1, mask);
}
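// Usage sketch (illustrative, not from the original header): per-lane atomic
// increment of 8 histogram bins; `bins` is an assumed uint32_t* in USM and
// `idx` holds per-lane element indices:
//
//   simd<uint32_t, 8> idx(0, 1);
//   simd<uint32_t, 8> byte_off = idx * sizeof(uint32_t);
//   auto old = atomic_update<atomic_op::inc, uint32_t, 8>(
//       bins, byte_off, simd_mask<8>(1));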
/// Memory fence; `cntl` is a fence_mask bit mask controlling its behavior.
template <uint8_t cntl> __ESIMD_API void fence() { __esimd_fence(cntl); }

__ESIMD_API void fence(fence_mask cntl) { __esimd_fence(cntl); }

/// Generic work-group barrier.
__ESIMD_API void barrier() {
  __esimd_fence(fence_mask::global_coherent_fence | fence_mask::local_barrier);
  __esimd_barrier();
}

/// Declare per-work-group SLM size (compile-time constant variant).
template <uint32_t SLMSize> __ESIMD_API void slm_init() {
  __esimd_slm_init(SLMSize);
}

/// Declare per-work-group SLM size.
__ESIMD_API void slm_init(uint32_t size) { __esimd_slm_init(size); }
/// Gather from the Shared Local Memory (SLM) at the given byte offsets.
template <typename T, int N>
__ESIMD_API std::enable_if_t<(N == 1 || N == 8 || N == 16 || N == 32),
                             simd<T, N>>
slm_gather(simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
  detail::LocalAccessorMarker acc;
  return detail::gather_impl<T, N>(acc, offsets, 0, mask);
}
/// Scatter to the Shared Local Memory (SLM) at the given byte offsets.
template <typename T, int N>
__ESIMD_API std::enable_if_t<(N == 1 || N == 8 || N == 16 || N == 32) &&
                             (sizeof(T) <= 4)>
slm_scatter(simd<uint32_t, N> offsets, simd<T, N> vals, simd_mask<N> mask = 1) {
  detail::LocalAccessorMarker acc;
  detail::scatter_impl<T, N>(acc, vals, offsets, 0, mask);
}
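// Usage sketch (illustrative, not from the original header; `some_vals` stands
// for any simd<float, 16> value): stage 16 floats in SLM and read them back.
// slm_init must have reserved the SLM first:
//
//   slm_init<16 * sizeof(float)>();
//   simd<uint32_t, 16> off(0, sizeof(float));
//   slm_scatter<float, 16>(off, some_vals);
//   barrier();
//   simd<float, 16> v = slm_gather<float, 16>(off);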
/// Load a scalar value from the Shared Local Memory.
template <typename T> __ESIMD_API T slm_scalar_load(uint32_t offset) {
  const simd<T, 1> Res = slm_gather<T, 1>(simd<uint32_t, 1>(offset));
  return Res[0];
}

/// Store a scalar value into the Shared Local Memory.
template <typename T>
__ESIMD_API void slm_scalar_store(uint32_t offset, T val) {
  slm_scatter<T, 1>(simd<uint32_t, 1>(offset), simd<T, 1>(val), 1);
}
/// Gather RGBA pixel channels from SLM at the given offsets.
template <typename T, int N, rgba_channel_mask RGBAMask>
__ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && (sizeof(T) == 4),
                             simd<T, N * get_num_channels_enabled(RGBAMask)>>
slm_gather_rgba(simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
  const auto SI = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
  return __esimd_gather4_masked_scaled2<T, N, RGBAMask>(
      SI, 0 /*global_offset*/, offsets.data(), mask.data());
}
/// Scatter RGBA pixel channels to SLM at the given offsets.
template <typename T, int N, rgba_channel_mask Mask>
__ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && (sizeof(T) == 4)>
slm_scatter_rgba(simd<uint32_t, N> offsets,
                 simd<T, N * get_num_channels_enabled(Mask)> vals,
                 simd_mask<N> mask = 1) {
  detail::validate_rgba_write_channel_mask<Mask>();
  const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
  constexpr int16_t Scale = 0;
  constexpr int global_offset = 0;
  __esimd_scatter4_scaled<T, N, decltype(si), Mask, Scale>(
      mask.data(), si, global_offset, offsets.data(), vals.data());
}
/// Loads a contiguous block of SLM at the given offset.
template <typename T, int N>
__ESIMD_API simd<T, N> slm_block_load(uint32_t offset) {
  constexpr unsigned Sz = sizeof(T) * N;
  static_assert(Sz >= detail::OperandSize::OWORD,
                "block size must be at least 1 oword");
  static_assert(Sz % detail::OperandSize::OWORD == 0,
                "block size must be whole number of owords");
  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
                "block must be 1, 2, 4 or 8 owords long");
  static_assert(Sz <= 16 * detail::OperandSize::OWORD,
                "block size must be at most 16 owords");

  const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
  return __esimd_oword_ld<detail::__raw_t<T>, N>(si, offset >> 4);
}
/// Stores elements of a vector to a contiguous block of SLM at the given
/// offset.
template <typename T, int N>
__ESIMD_API void slm_block_store(uint32_t offset, simd<T, N> vals) {
  constexpr unsigned Sz = sizeof(T) * N;
  static_assert(Sz >= detail::OperandSize::OWORD,
                "block size must be at least 1 oword");
  static_assert(Sz % detail::OperandSize::OWORD == 0,
                "block size must be whole number of owords");
  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
                "block must be 1, 2, 4 or 8 owords long");
  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
                "block size must be at most 8 owords");

  const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
  __esimd_oword_st<detail::__raw_t<T>, N>(si, offset >> 4, vals.data());
}
/// Atomic update on SLM, no source operands.
template <atomic_op Op, typename Tx, int N, class T = detail::__raw_t<Tx>>
__ESIMD_API simd<Tx, N> slm_atomic_update(simd<uint32_t, N> offsets,
                                          simd_mask<N> mask) {
  detail::check_atomic<Op, T, N, 0>();
  const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
  return __esimd_dword_atomic0<Op, T, N>(mask.data(), si, offsets.data());
}
/// Atomic update on SLM, one source operand.
template <atomic_op Op, typename Tx, int N, class T = detail::__raw_t<Tx>>
__ESIMD_API simd<Tx, N> slm_atomic_update(simd<uint32_t, N> offsets,
                                          simd<Tx, N> src0, simd_mask<N> mask) {
  detail::check_atomic<Op, T, N, 1>();
  const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
  return __esimd_dword_atomic1<Op, T, N>(mask.data(), si, offsets.data(),
                                         src0.data());
}
/// Atomic update on SLM, two source operands.
template <atomic_op Op, typename Tx, int N, class T = detail::__raw_t<Tx>>
__ESIMD_API simd<Tx, N> slm_atomic_update(simd<uint32_t, N> offsets,
                                          simd<Tx, N> src0, simd<Tx, N> src1,
                                          simd_mask<N> mask) {
  detail::check_atomic<Op, T, N, 2>();
  const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
  return __esimd_dword_atomic2<Op, T, N>(mask.data(), si, offsets.data(),
                                         src0.data(), src1.data());
}
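// Usage sketch (illustrative, not from the original header): atomic add of 1
// to 8 dword counters kept in SLM:
//
//   simd<uint32_t, 8> off(0, sizeof(uint32_t));
//   simd<uint32_t, 8> ones(1);
//   slm_atomic_update<atomic_op::add, uint32_t, 8>(off, ones, 1);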
#ifndef __ESIMD_FORCE_STATELESS_MEM
/// Media block load: reads an m x N 2D block at position (x, y) from an image
/// represented by the accessor.
template <typename T, int m, int N, typename AccessorTy, unsigned plane = 0>
__ESIMD_API simd<T, m * N> media_block_load(AccessorTy acc, unsigned x,
                                            unsigned y) {
  constexpr unsigned Width = N * sizeof(T);
  static_assert(Width * m <= 256u,
                "data does not fit into a single dataport transaction");
  static_assert(Width <= 64u, "valid block width is in range [1, 64]");
  static_assert(m <= 64u, "valid block height is in range [1, 64]");
  static_assert(plane <= 3u, "valid plane index is in range [0, 3]");

  const auto si = __ESIMD_NS::get_surface_index(acc);
  using SurfIndTy = decltype(si);
  constexpr unsigned int RoundedWidth =
      Width < 4 ? 4 : detail::getNextPowerOf2<Width>();
  constexpr int BlockWidth = sizeof(T) * N;
  constexpr int Mod = 0;

  if constexpr (Width < RoundedWidth) {
    // Round each row up to a supported width, then select the requested
    // sub-block.
    constexpr unsigned int n1 = RoundedWidth / sizeof(T);
    simd<T, m * n1> temp =
        __esimd_media_ld<T, m, n1, Mod, SurfIndTy, (int)plane, BlockWidth>(
            si, x, y);
    return temp.template select<m, 1, N, 1>(0, 0);
  } else {
    return __esimd_media_ld<T, m, N, Mod, SurfIndTy, (int)plane, BlockWidth>(
        si, x, y);
  }
}
/// Media block store: writes an m x N 2D block at position (x, y) into an
/// image represented by the accessor.
template <typename T, int m, int N, typename AccessorTy, unsigned plane = 0>
__ESIMD_API void media_block_store(AccessorTy acc, unsigned x, unsigned y,
                                   simd<T, m * N> vals) {
  constexpr unsigned Width = N * sizeof(T);
  static_assert(Width * m <= 256u,
                "data does not fit into a single dataport transaction");
  static_assert(Width <= 64u, "valid block width is in range [1, 64]");
  static_assert(m <= 64u, "valid block height is in range [1, 64]");
  static_assert(plane <= 3u, "valid plane index is in range [0, 3]");

  const auto si = __ESIMD_NS::get_surface_index(acc);
  using SurfIndTy = decltype(si);
  constexpr unsigned int RoundedWidth =
      Width < 4 ? 4 : detail::getNextPowerOf2<Width>();
  constexpr unsigned int n1 = RoundedWidth / sizeof(T);
  constexpr int BlockWidth = sizeof(T) * N;
  constexpr int Mod = 0;

  if constexpr (Width < RoundedWidth) {
    // Pad each row up to a supported width before storing.
    simd<T, m * n1> temp;
    auto temp_ref = temp.template bit_cast_view<T, m, n1>();
    auto vals_ref = vals.template bit_cast_view<T, m, N>();
    temp_ref.template select<m, 1, N, 1>() = vals_ref;
    __esimd_media_st<T, m, n1, Mod, SurfIndTy, plane, BlockWidth>(si, x, y,
                                                                  temp.data());
  } else {
    __esimd_media_st<T, m, N, Mod, SurfIndTy, plane, BlockWidth>(si, x, y,
                                                                 vals.data());
  }
}
#endif // !__ESIMD_FORCE_STATELESS_MEM
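// Usage sketch (illustrative, not from the original header): load an 8 x 8
// block of uchar pixels at byte position (16, 0) from an assumed image
// accessor `img_acc` and store it back shifted right by 4 pixels:
//
//   auto blk = media_block_load<unsigned char, 8, 8>(img_acc, 16, 0);
//   media_block_store<unsigned char, 8, 8>(img_acc, 20, 0, blk);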
namespace detail {

/// Copies data from memory at Addr into this simd object, choosing the most
/// efficient path for the given size and alignment: oword block loads when
/// possible, otherwise gathers in ChunkSize pieces.
template <typename T, int N, class T1, class SFINAE>
template <typename Flags, int ChunkSize, typename>
void simd_obj_impl<T, N, T1, SFINAE>::copy_from(
    const simd_obj_impl<T, N, T1, SFINAE>::element_type *Addr,
    Flags) SYCL_ESIMD_FUNCTION {
  using UT = simd_obj_impl<T, N, T1, SFINAE>::element_type;
  constexpr unsigned Size = sizeof(T) * N;
  constexpr unsigned Align = Flags::template alignment<T1>;

  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
  constexpr unsigned NumBlocks = Size / BlockSize;
  constexpr unsigned RemSize = Size % BlockSize;

  if constexpr (Align >= OperandSize::DWORD && Size % OperandSize::OWORD == 0 &&
                detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
    if constexpr (NumBlocks > 0) {
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      ForHelper<NumBlocks>::unroll([BlockN, Addr, this](unsigned Block) {
        select<BlockN, 1>(Block * BlockN) =
            block_load<UT, BlockN, Flags>(Addr + (Block * BlockN), Flags{});
      });
    }
    if constexpr (RemSize > 0) {
      constexpr unsigned RemN = RemSize / sizeof(T);
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      select<RemN, 1>(NumBlocks * BlockN) =
          block_load<UT, RemN, Flags>(Addr + (NumBlocks * BlockN), Flags{});
    }
  } else if constexpr (sizeof(T) == 8) {
    // 64-bit elements are copied as twice as many 32-bit ones.
    simd<int32_t, N * 2> BC(reinterpret_cast<const int32_t *>(Addr), Flags{});
    bit_cast_view<int32_t>() = BC;
  } else {
    constexpr unsigned NumChunks = N / ChunkSize;
    if constexpr (NumChunks > 0) {
      simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
      ForHelper<NumChunks>::unroll([Addr, &Offsets, this](unsigned Block) {
        select<ChunkSize, 1>(Block * ChunkSize) =
            gather<UT, ChunkSize>(Addr + (Block * ChunkSize), Offsets);
      });
    }
    constexpr unsigned RemN = N % ChunkSize;
    if constexpr (RemN > 0) {
      if constexpr (RemN == 1) {
        select<1, 1>(NumChunks * ChunkSize) = Addr[NumChunks * ChunkSize];
      } else if constexpr (RemN == 8 || RemN == 16) {
        simd<uint32_t, RemN> Offsets(0u, sizeof(T));
        select<RemN, 1>(NumChunks * ChunkSize) =
            gather<UT, RemN>(Addr + (NumChunks * ChunkSize), Offsets);
      } else {
        // Round the remainder up to a supported execution size and disable
        // the extra lanes with a predicate.
        constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
        simd_mask_type<N1> Pred(0);
        Pred.template select<RemN, 1>() = 1;
        simd<uint32_t, N1> Offsets(0u, sizeof(T));
        simd<UT, N1> Vals =
            gather<UT, N1>(Addr + (NumChunks * ChunkSize), Offsets, Pred);
        select<RemN, 1>(NumChunks * ChunkSize) =
            Vals.template select<RemN, 1>();
      }
    }
  }
}
/// Accessor-based variant of copy_from.
template <typename T, int N, class T1, class SFINAE>
template <typename AccessorT, typename Flags, int ChunkSize, typename>
ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_read,
                              sycl::access::target::device, void>
simd_obj_impl<T, N, T1, SFINAE>::copy_from(AccessorT acc, uint32_t offset,
                                           Flags) SYCL_ESIMD_FUNCTION {
  using UT = simd_obj_impl<T, N, T1, SFINAE>::element_type;
  static_assert(sizeof(UT) == sizeof(T));
  constexpr unsigned Size = sizeof(T) * N;
  constexpr unsigned Align = Flags::template alignment<T1>;

  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
  constexpr unsigned NumBlocks = Size / BlockSize;
  constexpr unsigned RemSize = Size % BlockSize;

  if constexpr (Align >= OperandSize::DWORD && Size % OperandSize::OWORD == 0 &&
                detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
    if constexpr (NumBlocks > 0) {
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      ForHelper<NumBlocks>::unroll([BlockN, acc, offset, this](unsigned Block) {
        select<BlockN, 1>(Block * BlockN) =
            block_load<UT, BlockN, AccessorT, Flags>(
                acc, offset + (Block * BlockSize), Flags{});
      });
    }
    if constexpr (RemSize > 0) {
      constexpr unsigned RemN = RemSize / sizeof(T);
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      select<RemN, 1>(NumBlocks * BlockN) =
          block_load<UT, RemN, AccessorT, Flags>(
              acc, offset + (NumBlocks * BlockSize), Flags{});
    }
  } else if constexpr (sizeof(T) == 8) {
    // 64-bit elements are copied as twice as many 32-bit ones.
    simd<int32_t, N * 2> BC(acc, offset, Flags{});
    bit_cast_view<int32_t>() = BC;
  } else {
    constexpr unsigned NumChunks = N / ChunkSize;
    if constexpr (NumChunks > 0) {
      simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
      ForHelper<NumChunks>::unroll(
          [acc, offset, &Offsets, this](unsigned Block) {
            select<ChunkSize, 1>(Block * ChunkSize) =
                gather<UT, ChunkSize, AccessorT>(
                    acc, Offsets, offset + (Block * ChunkSize * sizeof(T)));
          });
    }
    constexpr unsigned RemN = N % ChunkSize;
    if constexpr (RemN > 0) {
      if constexpr (RemN == 1 || RemN == 8 || RemN == 16) {
        simd<uint32_t, RemN> Offsets(0u, sizeof(T));
        select<RemN, 1>(NumChunks * ChunkSize) = gather<UT, RemN, AccessorT>(
            acc, Offsets, offset + (NumChunks * ChunkSize * sizeof(T)));
      } else {
        constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
        simd_mask_type<N1> Pred(0);
        Pred.template select<RemN, 1>() = 1;
        simd<uint32_t, N1> Offsets(0u, sizeof(T));
        simd<UT, N1> Vals = gather<UT, N1>(
            acc, Offsets, offset + (NumChunks * ChunkSize * sizeof(T)), Pred);
        select<RemN, 1>(NumChunks * ChunkSize) =
            Vals.template select<RemN, 1>();
      }
    }
  }
}
/// Copies the contents of this simd object to memory at Addr, mirroring the
/// strategy of copy_from: oword block stores when possible, otherwise
/// scatters in ChunkSize pieces.
template <typename T, int N, class T1, class SFINAE>
template <typename Flags, int ChunkSize, typename>
void simd_obj_impl<T, N, T1, SFINAE>::copy_to(
    simd_obj_impl<T, N, T1, SFINAE>::element_type *Addr,
    Flags) const SYCL_ESIMD_FUNCTION {
  using UT = simd_obj_impl<T, N, T1, SFINAE>::element_type;
  constexpr unsigned Size = sizeof(T) * N;
  constexpr unsigned Align = Flags::template alignment<T1>;

  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
  constexpr unsigned NumBlocks = Size / BlockSize;
  constexpr unsigned RemSize = Size % BlockSize;

  simd<UT, N> Tmp{data()};
  if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 &&
                detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
    if constexpr (NumBlocks > 0) {
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      ForHelper<NumBlocks>::unroll([BlockN, Addr, &Tmp](unsigned Block) {
        block_store<UT, BlockN>(Addr + (Block * BlockN),
                                Tmp.template select<BlockN, 1>(Block * BlockN));
      });
    }
    if constexpr (RemSize > 0) {
      constexpr unsigned RemN = RemSize / sizeof(T);
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      block_store<UT, RemN>(Addr + (NumBlocks * BlockN),
                            Tmp.template select<RemN, 1>(NumBlocks * BlockN));
    }
  } else if constexpr (sizeof(T) == 8) {
    simd<int32_t, N * 2> BC = Tmp.template bit_cast_view<int32_t>();
    BC.copy_to(reinterpret_cast<int32_t *>(Addr), Flags{});
  } else {
    constexpr unsigned NumChunks = N / ChunkSize;
    if constexpr (NumChunks > 0) {
      simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
      ForHelper<NumChunks>::unroll([Addr, &Offsets, &Tmp](unsigned Block) {
        scatter<UT, ChunkSize>(
            Addr + (Block * ChunkSize), Offsets,
            Tmp.template select<ChunkSize, 1>(Block * ChunkSize));
      });
    }
    constexpr unsigned RemN = N % ChunkSize;
    if constexpr (RemN > 0) {
      if constexpr (RemN == 1) {
        Addr[NumChunks * ChunkSize] = Tmp[NumChunks * ChunkSize];
      } else if constexpr (RemN == 8 || RemN == 16) {
        // 16 one-byte remainders with sub-dword alignment are stored
        // element-by-element, or as 4 dwords when dword-aligned, to work
        // around scatter limitations for 1-byte elements.
        if constexpr (sizeof(T) == 1 && RemN == 16) {
          if constexpr (Align % OperandSize::DWORD > 0) {
            ForHelper<RemN>::unroll([Addr, &Tmp](unsigned Index) {
              Addr[Index + NumChunks * ChunkSize] =
                  Tmp[Index + NumChunks * ChunkSize];
            });
          } else {
            simd_mask_type<8> Pred(0);
            simd<int32_t, 8> Vals;
            Pred.template select<4, 1>() = 1;
            Vals.template select<4, 1>() =
                Tmp.template bit_cast_view<int32_t>().template select<4, 1>(
                    NumChunks * ChunkSize);

            simd<uint32_t, 8> Offsets(0u, sizeof(int32_t));
            scatter<int32_t, 8>(
                reinterpret_cast<int32_t *>(Addr + (NumChunks * ChunkSize)),
                Offsets, Vals, Pred);
          }
        } else {
          simd<uint32_t, RemN> Offsets(0u, sizeof(T));
          scatter<UT, RemN>(
              Addr + (NumChunks * ChunkSize), Offsets,
              Tmp.template select<RemN, 1>(NumChunks * ChunkSize));
        }
      } else {
        constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
        simd_mask_type<N1> Pred(0);
        Pred.template select<RemN, 1>() = 1;
        simd<UT, N1> Vals;
        Vals.template select<RemN, 1>() =
            Tmp.template select<RemN, 1>(NumChunks * ChunkSize);
        simd<uint32_t, N1> Offsets(0u, sizeof(T));
        scatter<UT, N1>(Addr + (NumChunks * ChunkSize), Offsets, Vals, Pred);
      }
    }
  }
}
/// Accessor-based variant of copy_to.
template <typename T, int N, class T1, class SFINAE>
template <typename AccessorT, typename Flags, int ChunkSize, typename>
ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_write,
                              sycl::access::target::device, void>
simd_obj_impl<T, N, T1, SFINAE>::copy_to(AccessorT acc, uint32_t offset,
                                         Flags) const SYCL_ESIMD_FUNCTION {
  using UT = simd_obj_impl<T, N, T1, SFINAE>::element_type;
  constexpr unsigned Size = sizeof(T) * N;
  constexpr unsigned Align = Flags::template alignment<T1>;

  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
  constexpr unsigned NumBlocks = Size / BlockSize;
  constexpr unsigned RemSize = Size % BlockSize;

  simd<UT, N> Tmp{data()};
  if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 &&
                detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
    if constexpr (NumBlocks > 0) {
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      ForHelper<NumBlocks>::unroll([BlockN, acc, offset,
                                    &Tmp](unsigned Block) {
        block_store<UT, BlockN, AccessorT>(
            acc, offset + (Block * BlockSize),
            Tmp.template select<BlockN, 1>(Block * BlockN));
      });
    }
    if constexpr (RemSize > 0) {
      constexpr unsigned RemN = RemSize / sizeof(T);
      constexpr unsigned BlockN = BlockSize / sizeof(T);
      block_store<UT, RemN, AccessorT>(
          acc, offset + (NumBlocks * BlockSize),
          Tmp.template select<RemN, 1>(NumBlocks * BlockN));
    }
  } else if constexpr (sizeof(T) == 8) {
    simd<int32_t, N * 2> BC = Tmp.template bit_cast_view<int32_t>();
    BC.copy_to(acc, offset, Flags{});
  } else {
    constexpr unsigned NumChunks = N / ChunkSize;
    if constexpr (NumChunks > 0) {
      simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
      ForHelper<NumChunks>::unroll([acc, offset, &Offsets,
                                    &Tmp](unsigned Block) {
        scatter<UT, ChunkSize, AccessorT>(
            acc, Offsets, Tmp.template select<ChunkSize, 1>(Block * ChunkSize),
            offset + (Block * ChunkSize * sizeof(T)));
      });
    }
    constexpr unsigned RemN = N % ChunkSize;
    if constexpr (RemN > 0) {
      if constexpr (RemN == 1 || RemN == 8 || RemN == 16) {
        simd<uint32_t, RemN> Offsets(0u, sizeof(T));
        scatter<UT, RemN, AccessorT>(
            acc, Offsets, Tmp.template select<RemN, 1>(NumChunks * ChunkSize),
            offset + (NumChunks * ChunkSize * sizeof(T)));
      } else {
        constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
        simd_mask_type<N1> Pred(0);
        Pred.template select<RemN, 1>() = 1;
        simd<UT, N1> Vals;
        Vals.template select<RemN, 1>() =
            Tmp.template select<RemN, 1>(NumChunks * ChunkSize);
        simd<uint32_t, N1> Offsets(0u, sizeof(T));
        scatter<UT, N1, AccessorT>(acc, Offsets, Vals,
                                   offset + (NumChunks * ChunkSize * sizeof(T)),
                                   Pred);
      }
    }
  }
}

} // namespace detail

} // namespace ext::intel::esimd
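// Usage sketch (illustrative, not from the original header): round-trip a
// 32-float vector through an assumed USM pointer `p`:
//
//   simd<float, 32> v;
//   v.copy_from(p, element_aligned);
//   v += 1.0f;
//   v.copy_to(p, element_aligned);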