DPC++ Runtime
Runtime libraries for oneAPI DPC++
memory.hpp
1 //==-------------- memory.hpp - DPC++ Explicit SIMD API --------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // Implement Explicit SIMD memory-access APIs.
9 //===----------------------------------------------------------------------===//
10 
11 #pragma once
12 
18 #include <sycl/half_type.hpp>
19 
20 #include <cstdint>
21 
22 namespace sycl {
23 __SYCL_INLINE_VER_NAMESPACE(_V1) {
24 namespace ext::intel::esimd {
25 
28 
34 
36 
38 
40 
41 namespace detail {
42 // Type used in internal functions to designate SLM access by providing a
43 // dummy accessor of this type. Used to make it possible to delegate
44 // implementations of SLM memory accesses to general surface-based memory
45 // accesses and thus reuse validity checks etc.
46 struct LocalAccessorMarker {};
47 
48 } // namespace detail
49 
51 
54 
60 template <typename AccessorTy>
61 __ESIMD_API SurfaceIndex get_surface_index(AccessorTy acc) {
62  if constexpr (std::is_same_v<detail::LocalAccessorMarker, AccessorTy>) {
63  return detail::SLM_BTI;
64  } else {
65  return __esimd_get_surface_index(
66  detail::AccessorPrivateProxy::getNativeImageObj(acc));
67  }
68 }
69 
70 // TODO @Pennycook
71 // {quote}
72 // ...I'd like us to think more about what we can do to make these interfaces
73 // more user-friendly. A user providing cache hints has to provide a lot more
74 // template arguments than required. Could we make this nicer by providing the
75 // hints as tag-type arguments?
76 // ...
77 // // Without cache hints, type and length can be deduced from offsets
78 // float* p;
79 // simd<uint32_t, 16> offsets;
80 // auto result = flat_load(p, offsets);
81 //
82 // // With cache hints as templates, verbosity increases significantly:
83 // //   - Providing any cache hint forces the user to specify the type and length
84 // float* p; simd<uint32_t, 16> offsets;
85 // auto result = flat_load<uint32_t, 16, 1, CacheHint::Foo, CacheHint::Bar>(p, offsets);
86 //
87 // // With cache hints as tag types, verbosity is reduced:
88 // // - Providing a cache hint does not prevent deduction of type and length
89 // float* p;
90 // simd<uint32_t, 16> offsets;
91 // auto result = flat_load(p, offsets, CacheHint::Foo{});
92 //
93 // Note also that the templated form prevents a developer from specifying an L3
94 // hint without also explicitly specifying an L1 hint. If flat_load accepted a
95 // list of hints, it might be possible to refactor the hints to specify them in
96 // any order, and it may be more extensible to future cache hints:
97 // {/quote}
98 //
99 // TODO @keryell
100 // {quote}
101 // An approach a la https://github.com/chriskohlhoff/propria from
102 // @chriskohlhoff would be to add a property to the pointer, such as
103 //
104 // auto result = flat_load(p, offsets);
105 // auto result = flat_load(decorate<CacheHint::Foo, CacheHint::Bar>(p),
106 // offsets);
107 // The advantage is that you do not have to change all your API, and all the
108 // uses of this decorated pointer will benefit from this. decorate is to be
109 // bikeshedded accordingly.
110 // {/quote}
111 //
112 
128 template <typename Tx, int N, class T = detail::__raw_t<Tx>>
129 __ESIMD_API std::enable_if_t<detail::isPowerOf2(N, 32), simd<Tx, N>>
130 gather(const Tx *p, simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
131  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
132  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
133  addrs = addrs + offsets_i;
134 
135  if constexpr (sizeof(T) == 1) {
136  auto Ret = __esimd_svm_gather<T, N, detail::ElemsPerAddrEncoding<4>(),
137  detail::ElemsPerAddrEncoding<1>()>(
138  addrs.data(), mask.data());
139  return __esimd_rdregion<T, N * 4, N, /*VS*/ 0, N, 4>(Ret, 0);
140  } else if constexpr (sizeof(T) == 2) {
141  auto Ret = __esimd_svm_gather<T, N, detail::ElemsPerAddrEncoding<2>(),
142  detail::ElemsPerAddrEncoding<2>()>(
143  addrs.data(), mask.data());
144  return __esimd_rdregion<T, N * 2, N, /*VS*/ 0, N, 2>(Ret, 0);
145  } else
146  return __esimd_svm_gather<T, N, detail::ElemsPerAddrEncoding<1>(),
147  detail::ElemsPerAddrEncoding<1>()>(addrs.data(),
148  mask.data());
149 }
150 
164 template <typename Tx, int N, class T = detail::__raw_t<Tx>>
165 __ESIMD_API std::enable_if_t<detail::isPowerOf2(N, 32)>
166 scatter(Tx *p, simd<uint32_t, N> offsets, simd<Tx, N> vals,
167  simd_mask<N> mask = 1) {
168  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
169  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
170  addrs = addrs + offsets_i;
171  if constexpr (sizeof(T) == 1) {
172  simd<T, N * 4> D;
173  D = __esimd_wrregion<T, N * 4, N, /*VS*/ 0, N, 4>(D.data(), vals.data(), 0);
174  __esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<4>(),
175  detail::ElemsPerAddrEncoding<1>()>(
176  addrs.data(), D.data(), mask.data());
177  } else if constexpr (sizeof(T) == 2) {
178  simd<T, N * 2> D;
179  D = __esimd_wrregion<T, N * 2, N, /*VS*/ 0, N, 2>(D.data(), vals.data(), 0);
180  __esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<2>(),
181  detail::ElemsPerAddrEncoding<2>()>(
182  addrs.data(), D.data(), mask.data());
183  } else
184  __esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<1>(),
185  detail::ElemsPerAddrEncoding<1>()>(
186  addrs.data(), vals.data(), mask.data());
187 }
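// Usage sketch (illustrative, not part of this header): a gather/scatter round
// trip over a USM allocation. Assumes execution inside an ESIMD kernel and that
// `buf` is a device-accessible float pointer with at least 16 elements; the
// per-lane offsets are expressed in bytes.
//
//   simd<uint32_t, 16> byte_offsets(0, sizeof(float)); // 0, 4, 8, ..., 60
//   simd<float, 16> v = gather<float, 16>(buf, byte_offsets);
//   scatter<float, 16>(buf, byte_offsets, v * 2.0f);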
188 
202 template <typename Tx, int N, typename Flags = vector_aligned_tag,
203  class T = detail::__raw_t<Tx>,
204  typename = std::enable_if_t<is_simd_flag_type_v<Flags>>>
205 __ESIMD_API simd<Tx, N> block_load(const Tx *addr, Flags = {}) {
206  constexpr unsigned Sz = sizeof(T) * N;
207  static_assert(Sz >= detail::OperandSize::OWORD,
208  "block size must be at least 1 oword");
209  static_assert(Sz % detail::OperandSize::OWORD == 0,
210  "block size must be whole number of owords");
211  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
212  "block must be 1, 2, 4 or 8 owords long");
213  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
214  "block size must be at most 8 owords");
215 
216  uintptr_t Addr = reinterpret_cast<uintptr_t>(addr);
217  if constexpr (Flags::template alignment<simd<T, N>> >=
218  detail::OperandSize::OWORD) {
219  return __esimd_svm_block_ld<T, N>(Addr);
220  } else {
221  return __esimd_svm_block_ld_unaligned<T, N>(Addr);
222  }
223 }
224 
240 template <typename Tx, int N, typename AccessorTy,
241  typename Flags = vector_aligned_tag,
242  typename = std::enable_if_t<is_simd_flag_type_v<Flags>>,
243  class T = detail::__raw_t<Tx>>
244 __ESIMD_API simd<Tx, N> block_load(AccessorTy acc, uint32_t offset,
245  Flags = {}) {
246 #ifdef __ESIMD_FORCE_STATELESS_MEM
247  return block_load<Tx, N>(__ESIMD_DNS::accessorToPointer<Tx>(acc, offset));
248 #else
249  constexpr unsigned Sz = sizeof(T) * N;
250  static_assert(Sz >= detail::OperandSize::OWORD,
251  "block size must be at least 1 oword");
252  static_assert(Sz % detail::OperandSize::OWORD == 0,
253  "block size must be whole number of owords");
254  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
255  "block must be 1, 2, 4 or 8 owords long");
256  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
257  "block size must be at most 8 owords");
258 
259  auto surf_ind = __esimd_get_surface_index(
260  detail::AccessorPrivateProxy::getNativeImageObj(acc));
261 
262  if constexpr (Flags::template alignment<simd<T, N>> >=
263  detail::OperandSize::OWORD) {
264  return __esimd_oword_ld<T, N>(surf_ind, offset >> 4);
265  } else {
266  return __esimd_oword_ld_unaligned<T, N>(surf_ind, offset);
267  }
268 #endif
269 }
270 
279 template <typename Tx, int N, class T = detail::__raw_t<Tx>>
280 __ESIMD_API void block_store(Tx *p, simd<Tx, N> vals) {
281  constexpr unsigned Sz = sizeof(T) * N;
282  static_assert(Sz >= detail::OperandSize::OWORD,
283  "block size must be at least 1 oword");
284  static_assert(Sz % detail::OperandSize::OWORD == 0,
285  "block size must be whole number of owords");
286  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
287  "block must be 1, 2, 4 or 8 owords long");
288  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
289  "block size must be at most 8 owords");
290 
291  uintptr_t Addr = reinterpret_cast<uintptr_t>(p);
292  __esimd_svm_block_st<T, N>(Addr, vals.data());
293 }
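// Usage sketch (illustrative, not part of this header): block load/store of a
// 32-float (8-oword) chunk. `src` and `dst` are assumed device-accessible float
// pointers; the default flag assumes vector alignment, while element_aligned
// selects the unaligned-load path when only element alignment is guaranteed.
//
//   simd<float, 32> a = block_load<float, 32>(src);                  // src assumed suitably aligned
//   simd<float, 32> b = block_load<float, 32>(src, element_aligned); // only 4-byte alignment assumed
//   block_store<float, 32>(dst, a + b);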
294 
306 template <typename Tx, int N, typename AccessorTy,
307  class T = detail::__raw_t<Tx>>
308 __ESIMD_API void block_store(AccessorTy acc, uint32_t offset,
309  simd<Tx, N> vals) {
310 #ifdef __ESIMD_FORCE_STATELESS_MEM
311  block_store<Tx, N>(__ESIMD_DNS::accessorToPointer<Tx>(acc, offset), vals);
312 #else
313  constexpr unsigned Sz = sizeof(T) * N;
314  static_assert(Sz >= detail::OperandSize::OWORD,
315  "block size must be at least 1 oword");
316  static_assert(Sz % detail::OperandSize::OWORD == 0,
317  "block size must be whole number of owords");
318  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
319  "block must be 1, 2, 4 or 8 owords long");
320  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
321  "block size must be at most 8 owords");
322 
323  auto surf_ind = __esimd_get_surface_index(
324  detail::AccessorPrivateProxy::getNativeImageObj(acc));
325  __esimd_oword_st<T, N>(surf_ind, offset >> 4, vals.data());
326 #endif
327 }
328 
330 
332 
333 // Implementations of accessor-based gather and scatter functions
334 namespace detail {
335 template <typename T, int N, typename AccessorTy>
336 ESIMD_INLINE
337  ESIMD_NODEBUG std::enable_if_t<(sizeof(T) <= 4) &&
338  (N == 1 || N == 8 || N == 16 || N == 32) &&
339  !std::is_pointer<AccessorTy>::value>
340  scatter_impl(AccessorTy acc, simd<T, N> vals, simd<uint32_t, N> offsets,
341  uint32_t glob_offset, simd_mask<N> mask) {
342 
343  constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
344  // TODO (performance) use hardware-supported scale once BE supports it
345  constexpr int16_t scale = 0;
346  const auto si = __ESIMD_NS::get_surface_index(acc);
347 
348  if constexpr (sizeof(T) < 4) {
349  using Tint = std::conditional_t<std::is_integral_v<T>, T,
350  detail::uint_type_t<sizeof(T)>>;
351  using Treal = __raw_t<T>;
352  simd<Tint, N> vals_int = bitcast<Tint, Treal, N>(std::move(vals).data());
353  using PromoT =
354  std::conditional_t<std::is_signed<Tint>::value,
355  int32_t, uint32_t>;
356  const simd<PromoT, N> promo_vals = convert<PromoT>(std::move(vals_int));
357  __esimd_scatter_scaled<PromoT, N, decltype(si), TypeSizeLog2, scale>(
358  mask.data(), si, glob_offset, offsets.data(), promo_vals.data());
359  } else {
360  using Treal = __raw_t<T>;
361  if constexpr (!std::is_same_v<Treal, T>) {
362  simd<Treal, N> Values = vals.template bit_cast_view<Treal>();
363  __esimd_scatter_scaled<Treal, N, decltype(si), TypeSizeLog2, scale>(
364  mask.data(), si, glob_offset, offsets.data(), Values.data());
365  } else {
366  __esimd_scatter_scaled<T, N, decltype(si), TypeSizeLog2, scale>(
367  mask.data(), si, glob_offset, offsets.data(), vals.data());
368  }
369  }
370 }
371 
372 template <typename T, int N, typename AccessorTy>
373 ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t<
374  (sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16 || N == 32) &&
375  !std::is_pointer<AccessorTy>::value,
376  simd<T, N>>
377 gather_impl(AccessorTy acc, simd<uint32_t, N> offsets, uint32_t glob_offset,
378  simd_mask<N> mask) {
379 
380  constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
381  // TODO (performance) use hardware-supported scale once BE supports it
382  constexpr uint32_t scale = 0;
383  const auto si = get_surface_index(acc);
384 
385  if constexpr (sizeof(T) < 4) {
386  using Tint = std::conditional_t<std::is_integral_v<T>, T,
387  detail::uint_type_t<sizeof(T)>>;
388  using Treal = __raw_t<T>;
389  static_assert(std::is_integral<Tint>::value,
390  "only integral 1- & 2-byte types are supported");
391  using PromoT =
392  std::conditional_t<std::is_signed<Tint>::value,
393  int32_t, uint32_t>;
394  const simd<PromoT, N> promo_vals =
395  __esimd_gather_masked_scaled2<PromoT, N, decltype(si), TypeSizeLog2,
396  scale>(si, glob_offset, offsets.data(),
397  mask.data());
398  auto Res = convert<Tint>(promo_vals);
399 
400  if constexpr (!std::is_same_v<Tint, T>) {
401  return detail::bitcast<Treal, Tint, N>(Res.data());
402  } else {
403  return Res;
404  }
405  } else {
406  using Treal = __raw_t<T>;
407  simd<Treal, N> Res = __esimd_gather_masked_scaled2<Treal, N, decltype(si),
408  TypeSizeLog2, scale>(
409  si, glob_offset, offsets.data(), mask.data());
410  if constexpr (!std::is_same_v<Treal, T>) {
411  return Res.template bit_cast_view<T>();
412  } else {
413  return Res;
414  }
415  }
416 }
417 
418 } // namespace detail
419 
421 
424 
442 template <typename T, int N, typename AccessorTy>
443 __ESIMD_API std::enable_if_t<(sizeof(T) <= 4) &&
444  (N == 1 || N == 8 || N == 16 || N == 32) &&
445  !std::is_pointer<AccessorTy>::value,
446  simd<T, N>>
447 gather(AccessorTy acc, simd<uint32_t, N> offsets, uint32_t glob_offset = 0,
448  simd_mask<N> mask = 1) {
449 #ifdef __ESIMD_FORCE_STATELESS_MEM
450  return gather<T, N>(__ESIMD_DNS::accessorToPointer<T>(acc, glob_offset),
451  offsets, mask);
452 #else
453  return detail::gather_impl<T, N, AccessorTy>(acc, offsets, glob_offset, mask);
454 #endif
455 }
456 
476 template <typename T, int N, typename AccessorTy>
477 __ESIMD_API std::enable_if_t<(sizeof(T) <= 4) &&
478  (N == 1 || N == 8 || N == 16 || N == 32) &&
479  !std::is_pointer<AccessorTy>::value>
480 scatter(AccessorTy acc, simd<uint32_t, N> offsets, simd<T, N> vals,
481  uint32_t glob_offset = 0, simd_mask<N> mask = 1) {
482 #ifdef __ESIMD_FORCE_STATELESS_MEM
483  scatter<T, N>(__ESIMD_DNS::accessorToPointer<T>(acc, glob_offset), offsets,
484  vals, mask);
485 #else
486  detail::scatter_impl<T, N, AccessorTy>(acc, vals, offsets, glob_offset, mask);
487 #endif
488 }
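// Usage sketch (illustrative, not part of this header): accessor-based gather and
// scatter. Assumes `acc` is a SYCL device accessor over int data used inside an
// ESIMD kernel; both the per-lane offsets and the global offset are in bytes.
//
//   simd<uint32_t, 8> offs(0, sizeof(int));       // 8 consecutive ints
//   simd<int, 8> v = gather<int, 8>(acc, offs, /*glob_offset=*/0);
//   scatter<int, 8>(acc, offs, v + 1, /*glob_offset=*/0);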
489 
497 template <typename T, typename AccessorTy>
498 __ESIMD_API T scalar_load(AccessorTy acc, uint32_t offset) {
499  const simd<T, 1> Res =
500  gather<T, 1, AccessorTy>(acc, simd<uint32_t, 1>(offset));
501  return Res[0];
502 }
503 
511 template <typename T, typename AccessorTy>
512 __ESIMD_API void scalar_store(AccessorTy acc, uint32_t offset, T val) {
513  scatter<T, 1, AccessorTy>(acc, simd<uint32_t, 1>(offset), simd<T, 1>(val));
514 }
515 
548 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
549  int N>
550 __ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4,
551  simd<T, N * get_num_channels_enabled(RGBAMask)>>
552 gather_rgba(const T *p, simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
553  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
554  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
555  addrs = addrs + offsets_i;
556  return __esimd_svm_gather4_scaled<detail::__raw_t<T>, N, RGBAMask>(
557  addrs.data(), mask.data());
558 }
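// Usage sketch (illustrative, not part of this header): gathering the R and G
// channels of 8 RGBA pixels stored as 4 floats per pixel. `pix` is an assumed
// device-accessible float pointer; each offset addresses the first channel of a
// pixel, and the result holds the enabled channels grouped per channel.
//
//   simd<uint32_t, 8> pix_offsets(0, 4 * sizeof(float)); // byte offset of each pixel
//   // 2 enabled channels x 8 pixels -> simd<float, 16>
//   auto rg = gather_rgba<rgba_channel_mask::GR>(pix, pix_offsets);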
559 
560 template <typename T, int N, rgba_channel_mask RGBAMask>
561 __SYCL_DEPRECATED("use gather_rgba<rgba_channel_mask>()")
562 __ESIMD_API std::enable_if_t<
563  (N == 8 || N == 16 || N == 32) && sizeof(T) == 4,
564  simd<T, N * get_num_channels_enabled(
565  RGBAMask)>> gather_rgba(const T *p,
566  simd<uint32_t, N> offsets,
567  simd_mask<N> mask = 1) {
568  return gather_rgba<RGBAMask>(p, offsets, mask);
569 }
570 
571 namespace detail {
572 template <rgba_channel_mask M> static void validate_rgba_write_channel_mask() {
573  using CM = rgba_channel_mask;
574  static_assert(
575  (M == CM::ABGR || M == CM::BGR || M == CM::GR || M == CM::R) &&
576  "Only ABGR, BGR, GR, R channel masks are valid in write operations");
577 }
578 } // namespace detail
579 
600 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
601  int N>
602 __ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4>
603 scatter_rgba(T *p, simd<uint32_t, N> offsets,
604  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
605  simd_mask<N> mask = 1) {
606  detail::validate_rgba_write_channel_mask<RGBAMask>();
607  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
608  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
609  addrs = addrs + offsets_i;
610  __esimd_svm_scatter4_scaled<detail::__raw_t<T>, N, RGBAMask>(
611  addrs.data(), vals.data(), mask.data());
612 }
613 
614 template <typename T, int N, rgba_channel_mask RGBAMask>
615 __SYCL_DEPRECATED("use scatter_rgba<rgba_channel_mask>()")
616 __ESIMD_API std::
617  enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4> scatter_rgba(
618  T *p, simd<uint32_t, N> offsets,
619  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
620  simd_mask<N> mask = 1) {
621  scatter_rgba<RGBAMask>(p, offsets, vals, mask);
622 }
623 
646 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
647  typename AccessorT, int N,
648  typename T = typename AccessorT::value_type>
649 __ESIMD_API std::enable_if_t<((N == 8 || N == 16 || N == 32) &&
650  sizeof(T) == 4 && !std::is_pointer_v<AccessorT>),
651  simd<T, N * get_num_channels_enabled(RGBAMask)>>
652 gather_rgba(AccessorT acc, simd<uint32_t, N> offsets,
653  uint32_t global_offset = 0, simd_mask<N> mask = 1) {
654 #ifdef __ESIMD_FORCE_STATELESS_MEM
655  return gather_rgba<RGBAMask>(
656  __ESIMD_DNS::accessorToPointer<T>(acc, global_offset), offsets, mask);
657 #else
658  // TODO (performance) use hardware-supported scale once BE supports it
659  constexpr uint32_t Scale = 0;
660  const auto SI = get_surface_index(acc);
661  return __esimd_gather4_masked_scaled2<detail::__raw_t<T>, N, RGBAMask,
662  decltype(SI), Scale>(
663  SI, global_offset, offsets.data(), mask.data());
664 #endif
665 }
666 
681 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
682  typename AccessorT, int N,
683  typename T = typename AccessorT::value_type>
684 __ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4 &&
685  !std::is_pointer_v<AccessorT>>
686 scatter_rgba(AccessorT acc, simd<uint32_t, N> offsets,
687  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
688  uint32_t global_offset = 0, simd_mask<N> mask = 1) {
689  detail::validate_rgba_write_channel_mask<RGBAMask>();
690 #ifdef __ESIMD_FORCE_STATELESS_MEM
691  scatter_rgba<RGBAMask>(__ESIMD_DNS::accessorToPointer<T>(acc, global_offset),
692  offsets, vals, mask);
693 #else
694  // TODO (performance) use hardware-supported scale once BE supports it
695  constexpr uint32_t Scale = 0;
696  const auto SI = get_surface_index(acc);
697  __esimd_scatter4_scaled<T, N, decltype(SI), RGBAMask, Scale>(
698  mask.data(), SI, global_offset, offsets.data(), vals.data());
699 #endif
700 }
701 
703 
704 namespace detail {
707 template <__ESIMD_NS::atomic_op Op, typename T, int N, unsigned NumSrc>
708 constexpr void check_atomic() {
709  static_assert((detail::isPowerOf2(N, 32)),
710  "Execution sizes 1, 2, 4, 8, 16, 32 are supported");
711  static_assert(NumSrc == __ESIMD_DNS::get_num_args<Op>(),
712  "wrong number of operands");
713  constexpr bool IsInt2BytePlus =
714  std::is_integral_v<T> && (sizeof(T) >= sizeof(uint16_t));
715 
716  if constexpr (Op == __ESIMD_NS::atomic_op::xchg ||
717  Op == __ESIMD_NS::atomic_op::cmpxchg ||
718  Op == __ESIMD_NS::atomic_op::predec ||
719  Op == __ESIMD_NS::atomic_op::inc ||
720  Op == __ESIMD_NS::atomic_op::dec ||
721  Op == __ESIMD_NS::atomic_op::load) {
722 
723  static_assert(IsInt2BytePlus, "Integral 16-bit or wider type is expected");
724  }
725  // FP ops are always delegated to native::lsc::<Op>.
726  if constexpr (Op == __ESIMD_NS::atomic_op::fmax ||
727  Op == __ESIMD_NS::atomic_op::fmin ||
728  Op == __ESIMD_NS::atomic_op::fadd ||
729  Op == __ESIMD_NS::atomic_op::fsub) {
730  static_assert((is_type<T, float, sycl::half>()),
731  "Type F or HF is expected");
732  }
733  if constexpr (Op == __ESIMD_NS::atomic_op::add ||
734  Op == __ESIMD_NS::atomic_op::sub ||
735  Op == __ESIMD_NS::atomic_op::min ||
736  Op == __ESIMD_NS::atomic_op::max ||
737  Op == __ESIMD_NS::atomic_op::bit_and ||
738  Op == __ESIMD_NS::atomic_op::bit_or ||
739  Op == __ESIMD_NS::atomic_op::bit_xor ||
740  Op == __ESIMD_NS::atomic_op::minsint ||
741  Op == __ESIMD_NS::atomic_op::maxsint) {
742  static_assert(IsInt2BytePlus, "Integral 16-bit or wider type is expected");
743  constexpr bool IsSignedMinmax = (Op == __ESIMD_NS::atomic_op::minsint) ||
744  (Op == __ESIMD_NS::atomic_op::maxsint);
745  constexpr bool IsUnsignedMinmax = (Op == __ESIMD_NS::atomic_op::min) ||
746  (Op == __ESIMD_NS::atomic_op::max);
747 
748  if constexpr (IsSignedMinmax || IsUnsignedMinmax) {
749  constexpr bool SignOK = std::is_signed_v<T> == IsSignedMinmax;
750  static_assert(SignOK, "Signed/unsigned integer type expected for "
751  "signed/unsigned min/max operation");
752  }
753  }
754 }
755 } // namespace detail
756 
759 
779 template <atomic_op Op, typename Tx, int N>
780 __ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<unsigned, N> offset,
781  simd_mask<N> mask) {
782  detail::check_atomic<Op, Tx, N, 0>();
783  simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
784  simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
785  vAddr += offset_i1;
786  using T = typename detail::__raw_t<Tx>;
787  return __esimd_svm_atomic0<Op, T, N>(vAddr.data(), mask.data());
788 }
789 
813 template <atomic_op Op, typename Tx, int N>
814 __ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<unsigned, N> offset,
815  simd<Tx, N> src0, simd_mask<N> mask) {
816  if constexpr ((Op == atomic_op::fmin) || (Op == atomic_op::fmax) ||
817  (Op == atomic_op::fadd) || (Op == atomic_op::fsub)) {
818  // Auto-convert FP atomics to LSC version. Warning is given - see enum.
819  return atomic_update<detail::to_lsc_atomic_op<Op>(), Tx, N>(p, offset, src0,
820  mask);
821  } else {
822  detail::check_atomic<Op, Tx, N, 1>();
823  simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
824  simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
825  vAddr += offset_i1;
826  using T = typename detail::__raw_t<Tx>;
827  return __esimd_svm_atomic1<Op, T, N>(vAddr.data(), src0.data(),
828  mask.data());
829  }
830 }
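// Usage sketch (illustrative, not part of this header): an 8-lane atomic add of 1
// to consecutive uint32_t counters. `counters` is an assumed device-accessible
// pointer; offsets are in bytes, all lanes are enabled, and the old values are
// returned per lane.
//
//   simd<unsigned, 8> offs(0, sizeof(uint32_t));
//   simd<uint32_t, 8> old_vals =
//       atomic_update<atomic_op::add, uint32_t, 8>(counters, offs, simd<uint32_t, 8>(1), 1);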
831 
851 template <atomic_op Op, typename Tx, int N>
852 __ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<unsigned, N> offset,
853  simd<Tx, N> src0, simd<Tx, N> src1,
854  simd_mask<N> mask) {
855  if constexpr (Op == atomic_op::fcmpwr) {
856  // Auto-convert FP atomics to LSC version. Warning is given - see enum.
857  return atomic_update<detail::to_lsc_atomic_op<Op>(), Tx, N>(p, offset, src0,
858  src1, mask);
859  } else {
860  detail::check_atomic<Op, Tx, N, 2>();
861  simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
862  simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
863  vAddr += offset_i1;
864  using T = typename detail::__raw_t<Tx>;
865  return __esimd_svm_atomic2<Op, T, N>(vAddr.data(), src0.data(), src1.data(),
866  mask.data());
867  }
868 }
869 
871 
874 
877 enum fence_mask : uint8_t {
878  /// "Commit enable" - wait for fence to complete before continuing.
879  global_coherent_fence = 0x1,
880  /// Flush the instruction cache.
881  l3_flush_instructions = 0x2,
882  /// Flush sampler (texture) cache.
883  l3_flush_texture_data = 0x4,
884  /// Flush constant cache.
885  l3_flush_constant_data = 0x8,
886  /// Flush constant cache.
887  l3_flush_rw_data = 0x10,
888  /// Issue SLM memory barrier only. If not set, the memory barrier is global.
889  local_barrier = 0x20,
890  /// Flush L1 read-only data cache.
891  l1_flush_ro_data = 0x40,
892  /// Creates a software (compiler) barrier, which does not generate any
893  /// instruction and only prevents the instruction scheduler from reordering
894  /// instructions across this barrier at compile time.
895  sw_barrier = 0x80
896 };
897 
901 template <uint8_t cntl> __ESIMD_API void fence() { __esimd_fence(cntl); }
902 
903 __SYCL_DEPRECATED("use fence<fence_mask>()")
904 __ESIMD_API void fence(fence_mask cntl) { __esimd_fence(cntl); }
905 
914 __ESIMD_API void barrier() {
915  __esimd_fence(fence_mask::global_coherent_fence | fence_mask::local_barrier);
916  __esimd_barrier();
917 }
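// Usage sketch (illustrative, not part of this header): barrier() issues a memory
// fence before the thread-group synchronization, so a plain store / barrier() /
// load sequence suffices for intra-group data exchange. An explicit fence<...>()
// is only needed when a particular fence_mask combination is required:
//
//   fence<fence_mask::global_coherent_fence>();
//   barrier();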
918 
920 
923 
926 template <uint32_t SLMSize> __ESIMD_API void slm_init() {
927  __esimd_slm_init(SLMSize);
928 }
929 
933 __ESIMD_API void slm_init(uint32_t size) { __esimd_slm_init(size); }
934 
940 template <typename T, int N>
941 __ESIMD_API
942  std::enable_if_t<(N == 1 || N == 8 || N == 16 || N == 32), simd<T, N>>
943  slm_gather(simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
944  detail::LocalAccessorMarker acc;
945  return detail::gather_impl<T, N>(acc, offsets, 0, mask);
946 }
947 
953 template <typename T> __ESIMD_API T slm_scalar_load(uint32_t offset) {
954  const simd<T, 1> Res = slm_gather<T, 1>(simd<uint32_t, 1>(offset));
955  return Res[0];
956 }
957 
963 template <typename T, int N>
964 __ESIMD_API std::enable_if_t<(N == 1 || N == 8 || N == 16 || N == 32) &&
965  (sizeof(T) <= 4)>
966 slm_scatter(simd<uint32_t, N> offsets, simd<T, N> vals, simd_mask<N> mask = 1) {
967  detail::LocalAccessorMarker acc;
968  detail::scatter_impl<T, N>(acc, vals, offsets, 0, mask);
969 }
970 
976 template <typename T>
977 __ESIMD_API void slm_scalar_store(uint32_t offset, T val) {
978  slm_scatter<T, 1>(simd<uint32_t, 1>(offset), simd<T, 1>(val), 1);
979 }
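// Usage sketch (illustrative, not part of this header): basic SLM usage. Assumes
// the kernel reserved enough shared local memory via slm_init and runs as an
// ESIMD kernel; all offsets are byte offsets into the work-group's SLM.
//
//   slm_init<1024>();                                 // reserve 1 KiB of SLM
//   simd<uint32_t, 16> offs(0, sizeof(int));          // 16 consecutive ints at offset 0
//   slm_scatter<int, 16>(offs, simd<int, 16>(0, 1));  // store 0, 1, ..., 15
//   simd<int, 16> v = slm_gather<int, 16>(offs);
//   slm_scalar_store<int>(64, 42);                    // single int at byte offset 64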
980 
991 template <typename T, int N, rgba_channel_mask RGBAMask>
992 __ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && (sizeof(T) == 4),
993  simd<T, N * get_num_channels_enabled(RGBAMask)>>
994 slm_gather_rgba(simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
995 
996  const auto SI = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
997  return __esimd_gather4_masked_scaled2<T, N, RGBAMask>(
998  SI, 0 /*global_offset*/, offsets.data(), mask.data());
999 }
1000 
1011 template <typename T, int N, rgba_channel_mask Mask>
1012 __ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && (sizeof(T) == 4)>
1013 slm_scatter_rgba(simd<uint32_t, N> offsets,
1014  simd<T, N * get_num_channels_enabled(Mask)> vals,
1015  simd_mask<N> mask = 1) {
1016  detail::validate_rgba_write_channel_mask<Mask>();
1017  const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
1018  constexpr int16_t Scale = 0;
1019  constexpr int global_offset = 0;
1020  __esimd_scatter4_scaled<T, N, decltype(si), Mask, Scale>(
1021  mask.data(), si, global_offset, offsets.data(), vals.data());
1022 }
1023 
1032 template <typename T, int N>
1033 __ESIMD_API simd<T, N> slm_block_load(uint32_t offset) {
1034  constexpr unsigned Sz = sizeof(T) * N;
1035  static_assert(Sz >= detail::OperandSize::OWORD,
1036  "block size must be at least 1 oword");
1037  static_assert(Sz % detail::OperandSize::OWORD == 0,
1038  "block size must be whole number of owords");
1039  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
1040  "block must be 1, 2, 4, 8 or 16 owords long");
1041  static_assert(Sz <= 16 * detail::OperandSize::OWORD,
1042  "block size must be at most 16 owords");
1043 
1044  const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
1045  return __esimd_oword_ld<detail::__raw_t<T>, N>(si, offset >> 4);
1046 }
1047 
1056 template <typename T, int N>
1057 __ESIMD_API void slm_block_store(uint32_t offset, simd<T, N> vals) {
1058  constexpr unsigned Sz = sizeof(T) * N;
1059  static_assert(Sz >= detail::OperandSize::OWORD,
1060  "block size must be at least 1 oword");
1061  static_assert(Sz % detail::OperandSize::OWORD == 0,
1062  "block size must be whole number of owords");
1063  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
1064  "block must be 1, 2, 4 or 8 owords long");
1065  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
1066  "block size must be at most 8 owords");
1067  const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
1068  // offset in genx.oword.st is in owords
1069  __esimd_oword_st<detail::__raw_t<T>, N>(si, offset >> 4, vals.data());
1070 }
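// Usage sketch (illustrative, not part of this header): contiguous SLM block
// transfer. Offsets are byte offsets with oword (16-byte) granularity, and the
// block sizes must satisfy the static_asserts above.
//
//   simd<float, 16> t = slm_block_load<float, 16>(0); // 64 bytes from SLM offset 0
//   slm_block_store<float, 16>(256, t);               // copy them to SLM offset 256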
1071 
1075 template <atomic_op Op, typename Tx, int N, class T = detail::__raw_t<Tx>>
1076 __ESIMD_API simd<Tx, N> slm_atomic_update(simd<uint32_t, N> offsets,
1077  simd_mask<N> mask) {
1078  detail::check_atomic<Op, T, N, 0>();
1079  const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
1080  return __esimd_dword_atomic0<Op, T, N>(mask.data(), si, offsets.data());
1081 }
1082 
1086 template <atomic_op Op, typename Tx, int N, class T = detail::__raw_t<Tx>>
1087 __ESIMD_API simd<Tx, N> slm_atomic_update(simd<uint32_t, N> offsets,
1088  simd<Tx, N> src0, simd_mask<N> mask) {
1089  detail::check_atomic<Op, T, N, 1>();
1090  const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
1091  return __esimd_dword_atomic1<Op, T, N>(mask.data(), si, offsets.data(),
1092  src0.data());
1093 }
1094 
1098 template <atomic_op Op, typename Tx, int N, class T = detail::__raw_t<Tx>>
1099 __ESIMD_API simd<Tx, N> slm_atomic_update(simd<uint32_t, N> offsets,
1100  simd<Tx, N> src0, simd<Tx, N> src1,
1101  simd_mask<N> mask) {
1102  detail::check_atomic<Op, T, N, 2>();
1103  const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
1104  return __esimd_dword_atomic2<Op, T, N>(mask.data(), si, offsets.data(),
1105  src0.data(), src1.data());
1106 }
1107 
1109 
1110 #ifndef __ESIMD_FORCE_STATELESS_MEM
1111 
1126 template <typename T, int m, int N, typename AccessorTy, unsigned plane = 0>
1127 __ESIMD_API simd<T, m * N> media_block_load(AccessorTy acc, unsigned x,
1128  unsigned y) {
1129  constexpr unsigned Width = N * sizeof(T);
1130  static_assert(Width * m <= 256u,
1131  "data does not fit into a single dataport transaction");
1132  static_assert(Width <= 64u, "valid block width is in range [1, 64]");
1133  static_assert(m <= 64u, "valid block height is in range [1, 64]");
1134  static_assert(plane <= 3u, "valid plane index is in range [0, 3]");
1135 
1136  const auto si = __ESIMD_NS::get_surface_index(acc);
1137  using SurfIndTy = decltype(si);
1138  constexpr unsigned int RoundedWidth =
1139  Width < 4 ? 4 : detail::getNextPowerOf2<Width>();
1140  constexpr int BlockWidth = sizeof(T) * N;
1141  constexpr int Mod = 0;
1142 
1143  if constexpr (Width < RoundedWidth) {
1144  constexpr unsigned int n1 = RoundedWidth / sizeof(T);
1145  simd<T, m *n1> temp =
1146  __esimd_media_ld<T, m, n1, Mod, SurfIndTy, (int)plane, BlockWidth>(
1147  si, x, y);
1148  return temp.template select<m, 1, N, 1>(0, 0);
1149  } else {
1150  return __esimd_media_ld<T, m, N, Mod, SurfIndTy, (int)plane, BlockWidth>(
1151  si, x, y);
1152  }
1153 }
1154 
1167 template <typename T, int m, int N, typename AccessorTy, unsigned plane = 0>
1168 __ESIMD_API void media_block_store(AccessorTy acc, unsigned x, unsigned y,
1169  simd<T, m * N> vals) {
1170  constexpr unsigned Width = N * sizeof(T);
1171  static_assert(Width * m <= 256u,
1172  "data does not fit into a single dataport transaction");
1173  static_assert(Width <= 64u, "valid block width is in range [1, 64]");
1174  static_assert(m <= 64u, "valid block height is in range [1, 64]");
1175  static_assert(plane <= 3u, "valid plane index is in range [0, 3]");
1176  const auto si = __ESIMD_NS::get_surface_index(acc);
1177  using SurfIndTy = decltype(si);
1178  constexpr unsigned int RoundedWidth =
1179  Width < 4 ? 4 : detail::getNextPowerOf2<Width>();
1180  constexpr unsigned int n1 = RoundedWidth / sizeof(T);
1181  constexpr int BlockWidth = sizeof(T) * N;
1182  constexpr int Mod = 0;
1183 
1184  if constexpr (Width < RoundedWidth) {
1185  simd<T, m * n1> temp;
1186  auto temp_ref = temp.template bit_cast_view<T, m, n1>();
1187  auto vals_ref = vals.template bit_cast_view<T, m, N>();
1188  temp_ref.template select<m, 1, N, 1>() = vals_ref;
1189  __esimd_media_st<T, m, n1, Mod, SurfIndTy, plane, BlockWidth>(si, x, y,
1190  temp.data());
1191  } else {
1192  __esimd_media_st<T, m, N, Mod, SurfIndTy, plane, BlockWidth>(si, x, y,
1193  vals.data());
1194  }
1195 }
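// Usage sketch (illustrative, not part of this header): copying an 8x8 tile of
// floats between two image accessors. `in_acc` and `out_acc` are assumed 2D image
// accessors; `x_bytes` is the horizontal offset in bytes and `y` the row index.
//
//   auto tile = media_block_load<float, 8, 8>(in_acc, x_bytes, y);   // simd<float, 64>
//   media_block_store<float, 8, 8>(out_acc, x_bytes, y, tile + 1.0f);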
1196 #endif // !__ESIMD_FORCE_STATELESS_MEM
1197 
1199 
1201 
1202 namespace detail {
1203 
1204 // ----- Outlined implementations of simd_obj_impl class memory access APIs.
1205 
1206 template <typename T, int N, class T1, class SFINAE>
1207 template <typename Flags, int ChunkSize, typename>
1208 void simd_obj_impl<T, N, T1, SFINAE>::copy_from(
1209  const simd_obj_impl<T, N, T1, SFINAE>::element_type *Addr,
1210  Flags) SYCL_ESIMD_FUNCTION {
1211  using UT = simd_obj_impl<T, N, T1, SFINAE>::element_type;
1212  constexpr unsigned Size = sizeof(T) * N;
1213  constexpr unsigned Align = Flags::template alignment<T1>;
1214 
1215  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
1216  constexpr unsigned NumBlocks = Size / BlockSize;
1217  constexpr unsigned RemSize = Size % BlockSize;
1218 
1219  if constexpr (Align >= OperandSize::DWORD && Size % OperandSize::OWORD == 0 &&
1220  detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
1221  if constexpr (NumBlocks > 0) {
1222  constexpr unsigned BlockN = BlockSize / sizeof(T);
1223  ForHelper<NumBlocks>::unroll([BlockN, Addr, this](unsigned Block) {
1224  select<BlockN, 1>(Block * BlockN) =
1225  block_load<UT, BlockN, Flags>(Addr + (Block * BlockN), Flags{});
1226  });
1227  }
1228  if constexpr (RemSize > 0) {
1229  constexpr unsigned RemN = RemSize / sizeof(T);
1230  constexpr unsigned BlockN = BlockSize / sizeof(T);
1231  select<RemN, 1>(NumBlocks * BlockN) =
1232  block_load<UT, RemN, Flags>(Addr + (NumBlocks * BlockN), Flags{});
1233  }
1234  } else if constexpr (sizeof(T) == 8) {
1235  simd<int32_t, N * 2> BC(reinterpret_cast<const int32_t *>(Addr), Flags{});
1236  bit_cast_view<int32_t>() = BC;
1237  } else {
1238  constexpr unsigned NumChunks = N / ChunkSize;
1239  if constexpr (NumChunks > 0) {
1240  simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
1241  ForHelper<NumChunks>::unroll([Addr, &Offsets, this](unsigned Block) {
1242  select<ChunkSize, 1>(Block * ChunkSize) =
1243  gather<UT, ChunkSize>(Addr + (Block * ChunkSize), Offsets);
1244  });
1245  }
1246  constexpr unsigned RemN = N % ChunkSize;
1247  if constexpr (RemN > 0) {
1248  if constexpr (RemN == 1) {
1249  select<1, 1>(NumChunks * ChunkSize) = Addr[NumChunks * ChunkSize];
1250  } else if constexpr (RemN == 8 || RemN == 16) {
1251  simd<uint32_t, RemN> Offsets(0u, sizeof(T));
1252  select<RemN, 1>(NumChunks * ChunkSize) =
1253  gather<UT, RemN>(Addr + (NumChunks * ChunkSize), Offsets);
1254  } else {
1255  constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
1256  simd_mask_type<N1> Pred(0);
1257  Pred.template select<RemN, 1>() = 1;
1258  simd<uint32_t, N1> Offsets(0u, sizeof(T));
1259  simd<UT, N1> Vals =
1260  gather<UT, N1>(Addr + (NumChunks * ChunkSize), Offsets, Pred);
1261  select<RemN, 1>(NumChunks * ChunkSize) =
1262  Vals.template select<RemN, 1>();
1263  }
1264  }
1265  }
1266 }
1267 
1268 template <typename T, int N, class T1, class SFINAE>
1269 template <typename AccessorT, typename Flags, int ChunkSize, typename>
1270 ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_read,
1271  sycl::access::target::device, void>
1272 simd_obj_impl<T, N, T1, SFINAE>::copy_from(AccessorT acc, uint32_t offset,
1273  Flags) SYCL_ESIMD_FUNCTION {
1274  using UT = simd_obj_impl<T, N, T1, SFINAE>::element_type;
1275  static_assert(sizeof(UT) == sizeof(T));
1276  constexpr unsigned Size = sizeof(T) * N;
1277  constexpr unsigned Align = Flags::template alignment<T1>;
1278 
1279  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
1280  constexpr unsigned NumBlocks = Size / BlockSize;
1281  constexpr unsigned RemSize = Size % BlockSize;
1282 
1283  if constexpr (Align >= OperandSize::DWORD && Size % OperandSize::OWORD == 0 &&
1284  detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
1285  if constexpr (NumBlocks > 0) {
1286  constexpr unsigned BlockN = BlockSize / sizeof(T);
1287  ForHelper<NumBlocks>::unroll([BlockN, acc, offset, this](unsigned Block) {
1288  select<BlockN, 1>(Block * BlockN) =
1289  block_load<UT, BlockN, AccessorT, Flags>(
1290  acc, offset + (Block * BlockSize), Flags{});
1291  });
1292  }
1293  if constexpr (RemSize > 0) {
1294  constexpr unsigned RemN = RemSize / sizeof(T);
1295  constexpr unsigned BlockN = BlockSize / sizeof(T);
1296  select<RemN, 1>(NumBlocks * BlockN) =
1297  block_load<UT, RemN, AccessorT, Flags>(
1298  acc, offset + (NumBlocks * BlockSize), Flags{});
1299  }
1300  } else if constexpr (sizeof(T) == 8) {
1301  simd<int32_t, N * 2> BC(acc, offset, Flags{});
1302  bit_cast_view<int32_t>() = BC;
1303  } else {
1304  constexpr unsigned NumChunks = N / ChunkSize;
1305  if constexpr (NumChunks > 0) {
1306  simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
1307  ForHelper<NumChunks>::unroll(
1308  [acc, offset, &Offsets, this](unsigned Block) {
1309  select<ChunkSize, 1>(Block * ChunkSize) =
1310  gather<UT, ChunkSize, AccessorT>(
1311  acc, Offsets, offset + (Block * ChunkSize * sizeof(T)));
1312  });
1313  }
1314  constexpr unsigned RemN = N % ChunkSize;
1315  if constexpr (RemN > 0) {
1316  if constexpr (RemN == 1 || RemN == 8 || RemN == 16) {
1317  simd<uint32_t, RemN> Offsets(0u, sizeof(T));
1318  select<RemN, 1>(NumChunks * ChunkSize) = gather<UT, RemN, AccessorT>(
1319  acc, Offsets, offset + (NumChunks * ChunkSize * sizeof(T)));
1320  } else {
1321  constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
1322  simd_mask_type<N1> Pred(0);
1323  Pred.template select<RemN, 1>() = 1;
1324  simd<uint32_t, N1> Offsets(0u, sizeof(T));
1325  simd<UT, N1> Vals = gather<UT, N1>(
1326  acc, Offsets, offset + (NumChunks * ChunkSize * sizeof(T)), Pred);
1327  select<RemN, 1>(NumChunks * ChunkSize) =
1328  Vals.template select<RemN, 1>();
1329  }
1330  }
1331  }
1332 }
1333 
1334 template <typename T, int N, class T1, class SFINAE>
1335 template <typename Flags, int ChunkSize, typename>
1336 void simd_obj_impl<T, N, T1, SFINAE>::copy_to(
1337  simd_obj_impl<T, N, T1, SFINAE>::element_type *Addr,
1338  Flags) const SYCL_ESIMD_FUNCTION {
1339  using UT = simd_obj_impl<T, N, T1, SFINAE>::element_type;
1340  constexpr unsigned Size = sizeof(T) * N;
1341  constexpr unsigned Align = Flags::template alignment<T1>;
1342 
1343  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
1344  constexpr unsigned NumBlocks = Size / BlockSize;
1345  constexpr unsigned RemSize = Size % BlockSize;
1346 
1347  simd<UT, N> Tmp{data()};
1348  if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 &&
1349  detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
1350  if constexpr (NumBlocks > 0) {
1351  constexpr unsigned BlockN = BlockSize / sizeof(T);
1352  ForHelper<NumBlocks>::unroll([BlockN, Addr, &Tmp](unsigned Block) {
1353  block_store<UT, BlockN>(Addr + (Block * BlockN),
1354  Tmp.template select<BlockN, 1>(Block * BlockN));
1355  });
1356  }
1357  if constexpr (RemSize > 0) {
1358  constexpr unsigned RemN = RemSize / sizeof(T);
1359  constexpr unsigned BlockN = BlockSize / sizeof(T);
1360  block_store<UT, RemN>(Addr + (NumBlocks * BlockN),
1361  Tmp.template select<RemN, 1>(NumBlocks * BlockN));
1362  }
1363  } else if constexpr (sizeof(T) == 8) {
1364  simd<int32_t, N * 2> BC = Tmp.template bit_cast_view<int32_t>();
1365  BC.copy_to(reinterpret_cast<int32_t *>(Addr), Flags{});
1366  } else {
1367  constexpr unsigned NumChunks = N / ChunkSize;
1368  if constexpr (NumChunks > 0) {
1369  simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
1370  ForHelper<NumChunks>::unroll([Addr, &Offsets, &Tmp](unsigned Block) {
1371  scatter<UT, ChunkSize>(
1372  Addr + (Block * ChunkSize), Offsets,
1373  Tmp.template select<ChunkSize, 1>(Block * ChunkSize));
1374  });
1375  }
1376  constexpr unsigned RemN = N % ChunkSize;
1377  if constexpr (RemN > 0) {
1378  if constexpr (RemN == 1) {
1379  Addr[NumChunks * ChunkSize] = Tmp[NumChunks * ChunkSize];
1380  } else if constexpr (RemN == 8 || RemN == 16) {
1381  // TODO: GPU runtime may handle scatter of 16 one-byte elements incorrectly.
1382  // The code below is a workaround which must be deleted once the GPU runtime
1383  // is fixed.
1384  if constexpr (sizeof(T) == 1 && RemN == 16) {
1385  if constexpr (Align % OperandSize::DWORD > 0) {
1386  ForHelper<RemN>::unroll([Addr, &Tmp](unsigned Index) {
1387  Addr[Index + NumChunks * ChunkSize] =
1388  Tmp[Index + NumChunks * ChunkSize];
1389  });
1390  } else {
1391  simd_mask_type<8> Pred(0);
1392  simd<int32_t, 8> Vals;
1393  Pred.template select<4, 1>() = 1;
1394  Vals.template select<4, 1>() =
1395  Tmp.template bit_cast_view<int32_t>().template select<4, 1>(
1396  NumChunks * ChunkSize);
1397 
1398  simd<uint32_t, 8> Offsets(0u, sizeof(int32_t));
1399  scatter<int32_t, 8>(
1400  reinterpret_cast<int32_t *>(Addr + (NumChunks * ChunkSize)),
1401  Offsets, Vals, Pred);
1402  }
1403  } else {
1404  simd<uint32_t, RemN> Offsets(0u, sizeof(T));
1405  scatter<UT, RemN>(
1406  Addr + (NumChunks * ChunkSize), Offsets,
1407  Tmp.template select<RemN, 1>(NumChunks * ChunkSize));
1408  }
1409  } else {
1410  constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
1411  simd_mask_type<N1> Pred(0);
1412  Pred.template select<RemN, 1>() = 1;
1413  simd<UT, N1> Vals;
1414  Vals.template select<RemN, 1>() =
1415  Tmp.template select<RemN, 1>(NumChunks * ChunkSize);
1416  simd<uint32_t, N1> Offsets(0u, sizeof(T));
1417  scatter<UT, N1>(Addr + (NumChunks * ChunkSize), Offsets, Vals, Pred);
1418  }
1419  }
1420  }
1421 }
1422 
1423 template <typename T, int N, class T1, class SFINAE>
1424 template <typename AccessorT, typename Flags, int ChunkSize, typename>
1425 ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_write,
1426  sycl::access::target::device, void>
1427 simd_obj_impl<T, N, T1, SFINAE>::copy_to(AccessorT acc, uint32_t offset,
1428  Flags) const SYCL_ESIMD_FUNCTION {
1429  using UT = simd_obj_impl<T, N, T1, SFINAE>::element_type;
1430  constexpr unsigned Size = sizeof(T) * N;
1431  constexpr unsigned Align = Flags::template alignment<T1>;
1432 
1433  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
1434  constexpr unsigned NumBlocks = Size / BlockSize;
1435  constexpr unsigned RemSize = Size % BlockSize;
1436 
1437  simd<UT, N> Tmp{data()};
1438 
1439  if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 &&
1440  detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
1441  if constexpr (NumBlocks > 0) {
1442  constexpr unsigned BlockN = BlockSize / sizeof(T);
1443  ForHelper<NumBlocks>::unroll([BlockN, acc, offset, &Tmp](unsigned Block) {
1444  block_store<UT, BlockN, AccessorT>(
1445  acc, offset + (Block * BlockSize),
1446  Tmp.template select<BlockN, 1>(Block * BlockN));
1447  });
1448  }
1449  if constexpr (RemSize > 0) {
1450  constexpr unsigned RemN = RemSize / sizeof(T);
1451  constexpr unsigned BlockN = BlockSize / sizeof(T);
1452  block_store<UT, RemN, AccessorT>(
1453  acc, offset + (NumBlocks * BlockSize),
1454  Tmp.template select<RemN, 1>(NumBlocks * BlockN));
1455  }
1456  } else if constexpr (sizeof(T) == 8) {
1457  simd<int32_t, N * 2> BC = Tmp.template bit_cast_view<int32_t>();
1458  BC.copy_to(acc, offset, Flags{});
1459  } else {
1460  constexpr unsigned NumChunks = N / ChunkSize;
1461  if constexpr (NumChunks > 0) {
1462  simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
1463  ForHelper<NumChunks>::unroll([acc, offset, &Offsets,
1464  &Tmp](unsigned Block) {
1465  scatter<UT, ChunkSize, AccessorT>(
1466  acc, Offsets, Tmp.template select<ChunkSize, 1>(Block * ChunkSize),
1467  offset + (Block * ChunkSize * sizeof(T)));
1468  });
1469  }
1470  constexpr unsigned RemN = N % ChunkSize;
1471  if constexpr (RemN > 0) {
1472  if constexpr (RemN == 1 || RemN == 8 || RemN == 16) {
1473  simd<uint32_t, RemN> Offsets(0u, sizeof(T));
1474  scatter<UT, RemN, AccessorT>(
1475  acc, Offsets, Tmp.template select<RemN, 1>(NumChunks * ChunkSize),
1476  offset + (NumChunks * ChunkSize * sizeof(T)));
1477  } else {
1478  constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
1479  simd_mask_type<N1> Pred(0);
1480  Pred.template select<RemN, 1>() = 1;
1481  simd<UT, N1> Vals;
1482  Vals.template select<RemN, 1>() =
1483  Tmp.template select<RemN, 1>(NumChunks * ChunkSize);
1484  simd<uint32_t, N1> Offsets(0u, sizeof(T));
1485  scatter<UT, N1, AccessorT>(acc, Offsets, Vals,
1486  offset + (NumChunks * ChunkSize * sizeof(T)),
1487  Pred);
1488  }
1489  }
1490  }
1491 }
1492 
1493 } // namespace detail
1495 
1496 } // namespace ext::intel::esimd
1497 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
1498 } // namespace sycl