DPC++ Runtime
Runtime libraries for oneAPI DPC++
memory.hpp
1 //==-------------- memory.hpp - DPC++ Explicit SIMD API --------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // Implement Explicit SIMD memory-access APIs.
9 //===----------------------------------------------------------------------===//
10 
11 #pragma once
12 
19 #include <sycl/half_type.hpp>
20 
21 #include <cstdint>
22 
23 namespace sycl {
24 __SYCL_INLINE_VER_NAMESPACE(_V1) {
25 namespace ext::intel::esimd {
26 
29 
35 
37 
39 
41 
42 namespace detail {
43 // Type used in internal functions to designate SLM access by
44 // providing a dummy accessor of this type. Used to make it possible to
45 // delegate implementations of SLM memory accesses to general surface-based
46 // memory accesses and thus reuse validity checks etc.
47 struct LocalAccessorMarker {};
48 
49 } // namespace detail
50 
52 
55 
61 template <typename AccessorTy>
62 __ESIMD_API SurfaceIndex get_surface_index(AccessorTy acc) {
63  if constexpr (std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
64  sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>) {
65  return detail::SLM_BTI;
66  } else {
67  return __esimd_get_surface_index(
68  detail::AccessorPrivateProxy::getQualifiedPtrOrImageObj(acc));
69  }
70 }
71 
72 // TODO @Pennycook
73 // {quote}
74 // ...I'd like us to think more about what we can do to make these interfaces
75 // more user-friendly. A user providing cache hints has to provide a lot more
76 // template arguments than required. Could we make this nicer by providing the
77 // hints as tag-type arguments?
78 // ...
79 // // Without cache hints, type and length can be deduced from offsets
80 // float* p;
81 // simd<uint32_t, 16> offsets;
82 // auto result = flat_load(p, offsets);
83 //
84 // // With cache hints as templates, verbosity increases significantly:
85 // // - Providing any cache hint forces the user to specify the type and length
86 // float* p; simd<uint32_t, 16> offsets;
87 // auto result = flat_load<uint32_t, 16, 1, CacheHint::Foo, CacheHint::Bar>(p, offsets);
88 //
89 // // With cache hints as tag types, verbosity is reduced:
90 // // - Providing a cache hint does not prevent deduction of type and length
91 // float* p;
92 // simd <uint32_t, 16> offsets;
93 // auto result = flat_load(p, offsets, CacheHint::Foo{});
94 //
95 // Note also that the templated form prevents a developer from specifying an L3
96 // hint without also explicitly specifying an L1 hint. If flat_load accepted a
97 // list of hints, it might be possible to refactor the hints to specify them in
98 // any order, and it may be more extensible to future cache hints:
99 // {/quote}
100 //
101 // TODO @keryell
102 // {quote}
103 // An approach a la https://github.com/chriskohlhoff/propria from
104 // @chriskohlhoff would be to add a property to the pointer, such as
105 //
106 // auto result = flat_load(p, offsets);
107 // auto result = flat_load(decorate<CacheHint::Foo, CacheHint::Bar>(p),
108 // offsets);
109 // The advantage is that you do not have to change all your API, and all the uses
110 // of this decorated pointer will benefit from this. decorate is to be bikeshed
111 // accordingly.
112 // {/quote}
113 //
114 
130 template <typename Tx, int N, typename Toffset>
131 __ESIMD_API simd<Tx, N> gather(const Tx *p, simd<Toffset, N> offsets,
132  simd_mask<N> mask = 1) {
133  using T = detail::__raw_t<Tx>;
134  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
135  static_assert(detail::isPowerOf2(N, 32), "Unsupported value of N");
136  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
137  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
138  addrs = addrs + offsets_i;
139 
140  if constexpr (sizeof(T) == 1) {
141  auto Ret = __esimd_svm_gather<T, N, detail::ElemsPerAddrEncoding<4>(),
142  detail::ElemsPerAddrEncoding<1>()>(
143  addrs.data(), mask.data());
144  return __esimd_rdregion<T, N * 4, N, /*VS*/ 0, N, 4>(Ret, 0);
145  } else if constexpr (sizeof(T) == 2) {
146  auto Ret = __esimd_svm_gather<T, N, detail::ElemsPerAddrEncoding<2>(),
147  detail::ElemsPerAddrEncoding<2>()>(
148  addrs.data(), mask.data());
149  return __esimd_rdregion<T, N * 2, N, /*VS*/ 0, N, 2>(Ret, 0);
150  } else
151  return __esimd_svm_gather<T, N, detail::ElemsPerAddrEncoding<1>(),
152  detail::ElemsPerAddrEncoding<1>()>(addrs.data(),
153  mask.data());
154 }
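// Example (sketch): a possible use of the USM-based gather inside an ESIMD
// kernel. `buf` is an assumed device-accessible float pointer; offsets are
// expressed in bytes.
//   float *buf = ...;                                  // USM allocation
//   simd<uint32_t, 16> byte_offsets(0, sizeof(float)); // 0, 4, 8, ...
//   simd<float, 16> vals = gather<float, 16>(buf, byte_offsets);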
155 
169 template <typename Tx, int N, typename Toffset,
170  typename RegionTy = region1d_t<Toffset, N, 1>>
171 __ESIMD_API simd<Tx, N> gather(const Tx *p,
172  simd_view<Toffset, RegionTy> offsets,
173  simd_mask<N> mask = 1) {
174  return gather<Tx, N>(p, offsets.read(), mask);
175 }
176 
189 template <typename Tx, int N, typename Toffset>
190 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>, simd<Tx, N>>
191 gather(const Tx *p, Toffset offset, simd_mask<N> mask = 1) {
192  return gather<Tx, N>(p, simd<Toffset, N>(offset), mask);
193 }
194 
208 template <typename Tx, int N, typename Toffset>
209 __ESIMD_API void scatter(Tx *p, simd<Toffset, N> offsets, simd<Tx, N> vals,
210  simd_mask<N> mask = 1) {
211  using T = detail::__raw_t<Tx>;
212  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
213  static_assert(detail::isPowerOf2(N, 32), "Unsupported value of N");
214  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
215  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
216  addrs = addrs + offsets_i;
217  if constexpr (sizeof(T) == 1) {
218  simd<T, N * 4> D;
219  D = __esimd_wrregion<T, N * 4, N, /*VS*/ 0, N, 4>(D.data(), vals.data(), 0);
220  __esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<4>(),
221  detail::ElemsPerAddrEncoding<1>()>(
222  addrs.data(), D.data(), mask.data());
223  } else if constexpr (sizeof(T) == 2) {
224  simd<T, N * 2> D;
225  D = __esimd_wrregion<T, N * 2, N, /*VS*/ 0, N, 2>(D.data(), vals.data(), 0);
226  __esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<2>(),
227  detail::ElemsPerAddrEncoding<2>()>(
228  addrs.data(), D.data(), mask.data());
229  } else
230  __esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<1>(),
231  detail::ElemsPerAddrEncoding<1>()>(
232  addrs.data(), vals.data(), mask.data());
233 }
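// Example (sketch): the matching USM-based scatter, continuing the gather
// sketch above; a mask limits the write to the first 8 lanes.
//   simd_mask<16> pred(0);
//   pred.select<8, 1>(0) = 1;            // enable the first 8 lanes only
//   scatter<float, 16>(buf, byte_offsets, vals, pred);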
234 
247 template <typename Tx, int N, typename Toffset,
248  typename RegionTy = region1d_t<Toffset, N, 1>>
249 __ESIMD_API void scatter(Tx *p, simd_view<Toffset, RegionTy> offsets,
250  simd<Tx, N> vals, simd_mask<N> mask = 1) {
251  scatter<Tx, N>(p, offsets.read(), vals, mask);
252 }
253 
265 template <typename Tx, int N, typename Toffset>
266 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> && N == 1>
267 scatter(Tx *p, Toffset offset, simd<Tx, N> vals, simd_mask<N> mask = 1) {
268  scatter<Tx, N>(p, simd<Toffset, N>(offset), vals, mask);
269 }
270 
284 template <typename Tx, int N, typename Flags = vector_aligned_tag,
285  class T = detail::__raw_t<Tx>,
286  typename = std::enable_if_t<is_simd_flag_type_v<Flags>>>
287 __ESIMD_API simd<Tx, N> block_load(const Tx *addr, Flags = {}) {
288  constexpr unsigned Sz = sizeof(T) * N;
289  static_assert(Sz >= detail::OperandSize::OWORD,
290  "block size must be at least 1 oword");
291  static_assert(Sz % detail::OperandSize::OWORD == 0,
292  "block size must be whole number of owords");
293  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
294  "block must be 1, 2, 4 or 8 owords long");
295  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
296  "block size must be at most 8 owords");
297 
298  uintptr_t Addr = reinterpret_cast<uintptr_t>(addr);
299  if constexpr (Flags::template alignment<simd<T, N>> >=
300  detail::OperandSize::OWORD) {
301  return __esimd_svm_block_ld<T, N>(Addr);
302  } else {
303  return __esimd_svm_block_ld_unaligned<T, N>(Addr);
304  }
305 }
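// Example (sketch): loading a contiguous block of 8 floats (2 owords), with
// `buf` the assumed float pointer from the gather sketch above. By default the
// address is expected to be oword-aligned; element_aligned may be passed when
// only element alignment is guaranteed.
//   simd<float, 8> blk = block_load<float, 8>(buf);
//   simd<float, 8> blk2 = block_load<float, 8>(buf + 1, element_aligned);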
306 
322 template <typename Tx, int N, typename AccessorTy,
323  typename Flags = vector_aligned_tag,
324  typename = std::enable_if_t<is_simd_flag_type_v<Flags>>,
325  class T = detail::__raw_t<Tx>>
326 __ESIMD_API simd<Tx, N> block_load(AccessorTy acc,
327 #ifdef __ESIMD_FORCE_STATELESS_MEM
328  uint64_t offset,
329 #else
330  uint32_t offset,
331 #endif
332  Flags = {}) {
333 #ifdef __ESIMD_FORCE_STATELESS_MEM
334  return block_load<Tx, N>(__ESIMD_DNS::accessorToPointer<Tx>(acc, offset));
335 #else
336  constexpr unsigned Sz = sizeof(T) * N;
337  static_assert(Sz >= detail::OperandSize::OWORD,
338  "block size must be at least 1 oword");
339  static_assert(Sz % detail::OperandSize::OWORD == 0,
340  "block size must be whole number of owords");
341  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
342  "block must be 1, 2, 4 or 8 owords long");
343  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
344  "block size must be at most 8 owords");
345 
346  auto surf_ind = __esimd_get_surface_index(
347  detail::AccessorPrivateProxy::getQualifiedPtrOrImageObj(acc));
348 
349  if constexpr (Flags::template alignment<simd<T, N>> >=
350  detail::OperandSize::OWORD) {
351  return __esimd_oword_ld<T, N>(surf_ind, offset >> 4);
352  } else {
353  return __esimd_oword_ld_unaligned<T, N>(surf_ind, offset);
354  }
355 #endif
356 }
357 
366 template <typename Tx, int N, class T = detail::__raw_t<Tx>>
367 __ESIMD_API void block_store(Tx *p, simd<Tx, N> vals) {
368  constexpr unsigned Sz = sizeof(T) * N;
369  static_assert(Sz >= detail::OperandSize::OWORD,
370  "block size must be at least 1 oword");
371  static_assert(Sz % detail::OperandSize::OWORD == 0,
372  "block size must be whole number of owords");
373  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
374  "block must be 1, 2, 4 or 8 owords long");
375  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
376  "block size must be at most 8 owords");
377 
378  uintptr_t Addr = reinterpret_cast<uintptr_t>(p);
379  __esimd_svm_block_st<T, N>(Addr, vals.data());
380 }
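// Example (sketch): storing the block back through the USM pointer; note that
// block_store has no unaligned variant, so the destination is expected to be
// oword-aligned.
//   block_store<float, 8>(buf, blk);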
381 
393 template <typename Tx, int N, typename AccessorTy,
394  class T = detail::__raw_t<Tx>>
395 __ESIMD_API void block_store(AccessorTy acc,
396 #ifdef __ESIMD_FORCE_STATELESS_MEM
397  uint64_t offset,
398 #else
399  uint32_t offset,
400 #endif
401  simd<Tx, N> vals) {
402 #ifdef __ESIMD_FORCE_STATELESS_MEM
403  block_store<Tx, N>(__ESIMD_DNS::accessorToPointer<Tx>(acc, offset), vals);
404 #else
405  constexpr unsigned Sz = sizeof(T) * N;
406  static_assert(Sz >= detail::OperandSize::OWORD,
407  "block size must be at least 1 oword");
408  static_assert(Sz % detail::OperandSize::OWORD == 0,
409  "block size must be whole number of owords");
410  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
411  "block must be 1, 2, 4 or 8 owords long");
412  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
413  "block size must be at most 8 owords");
414 
415  auto surf_ind = __esimd_get_surface_index(
416  detail::AccessorPrivateProxy::getQualifiedPtrOrImageObj(acc));
417  __esimd_oword_st<T, N>(surf_ind, offset >> 4, vals.data());
418 #endif
419 }
420 
422 
424 
425 // Implementations of accessor-based gather and scatter functions
426 namespace detail {
427 template <typename T, int N, typename AccessorTy>
428 ESIMD_INLINE
429  ESIMD_NODEBUG std::enable_if_t<(sizeof(T) <= 4) &&
430  (N == 1 || N == 8 || N == 16 || N == 32) &&
431  !std::is_pointer<AccessorTy>::value>
432  scatter_impl(AccessorTy acc, simd<T, N> vals, simd<uint32_t, N> offsets,
433  uint32_t glob_offset, simd_mask<N> mask) {
434 
435  constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
436  // TODO (performance) use hardware-supported scale once BE supports it
437  constexpr int16_t scale = 0;
438  const auto si = __ESIMD_NS::get_surface_index(acc);
439 
440  if constexpr (sizeof(T) < 4) {
441  using Tint = std::conditional_t<std::is_integral_v<T>, T,
442  detail::uint_type_t<sizeof(T)>>;
443  using Treal = __raw_t<T>;
444  simd<Tint, N> vals_int = bitcast<Tint, Treal, N>(std::move(vals).data());
445  using PromoT = typename std::conditional_t<std::is_signed<Tint>::value,
446  int32_t, uint32_t>;
447  const simd<PromoT, N> promo_vals = convert<PromoT>(std::move(vals_int));
448  __esimd_scatter_scaled<PromoT, N, decltype(si), TypeSizeLog2, scale>(
449  mask.data(), si, glob_offset, offsets.data(), promo_vals.data());
450  } else {
451  using Treal = __raw_t<T>;
452  if constexpr (!std::is_same_v<Treal, T>) {
453  simd<Treal, N> Values = vals.template bit_cast_view<Treal>();
454  __esimd_scatter_scaled<Treal, N, decltype(si), TypeSizeLog2, scale>(
455  mask.data(), si, glob_offset, offsets.data(), Values.data());
456  } else {
457  __esimd_scatter_scaled<T, N, decltype(si), TypeSizeLog2, scale>(
458  mask.data(), si, glob_offset, offsets.data(), vals.data());
459  }
460  }
461 }
462 
463 template <typename T, int N, typename AccessorTy>
464 ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t<
465  (sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16 || N == 32) &&
466  !std::is_pointer<AccessorTy>::value,
467  simd<T, N>>
468 gather_impl(AccessorTy acc, simd<uint32_t, N> offsets, uint32_t glob_offset,
469  simd_mask<N> mask) {
470 
471  constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
472  // TODO (performance) use hardware-supported scale once BE supports it
473  constexpr uint32_t scale = 0;
474  const auto si = get_surface_index(acc);
475 
476  if constexpr (sizeof(T) < 4) {
477  using Tint = std::conditional_t<std::is_integral_v<T>, T,
478  detail::uint_type_t<sizeof(T)>>;
479  using Treal = __raw_t<T>;
480  static_assert(std::is_integral<Tint>::value,
481  "only integral 1- & 2-byte types are supported");
482  using PromoT = typename std::conditional_t<std::is_signed<Tint>::value,
483  int32_t, uint32_t>;
484  const simd<PromoT, N> promo_vals =
485  __esimd_gather_masked_scaled2<PromoT, N, decltype(si), TypeSizeLog2,
486  scale>(si, glob_offset, offsets.data(),
487  mask.data());
488  auto Res = convert<Tint>(promo_vals);
489 
490  if constexpr (!std::is_same_v<Tint, T>) {
491  return detail::bitcast<Treal, Tint, N>(Res.data());
492  } else {
493  return Res;
494  }
495  } else {
496  using Treal = __raw_t<T>;
497  simd<Treal, N> Res = __esimd_gather_masked_scaled2<Treal, N, decltype(si),
498  TypeSizeLog2, scale>(
499  si, glob_offset, offsets.data(), mask.data());
500  if constexpr (!std::is_same_v<Treal, T>) {
501  return Res.template bit_cast_view<T>();
502  } else {
503  return Res;
504  }
505  }
506 }
507 
508 } // namespace detail
509 
511 
514 
533 template <typename T, int N, typename AccessorTy, typename Toffset>
534 __ESIMD_API std::enable_if_t<
535  (sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16 || N == 32) &&
536  !std::is_pointer<AccessorTy>::value && std::is_integral_v<Toffset>,
537  simd<T, N>>
538 gather(AccessorTy acc, simd<Toffset, N> offsets,
539 #ifdef __ESIMD_FORCE_STATELESS_MEM
540  uint64_t glob_offset = 0,
541 #else
542  uint32_t glob_offset = 0,
543 #endif
544  simd_mask<N> mask = 1) {
545 #ifdef __ESIMD_FORCE_STATELESS_MEM
546  return gather<T, N>(__ESIMD_DNS::accessorToPointer<T>(acc, glob_offset),
547  offsets, mask);
548 #else
549  return detail::gather_impl<T, N, AccessorTy>(acc, offsets, glob_offset, mask);
550 #endif
551 }
552 
573 template <typename T, int N, typename AccessorTy, typename Toffset>
574 __ESIMD_API std::enable_if_t<
575  (sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16 || N == 32) &&
576  !std::is_pointer<AccessorTy>::value && std::is_integral_v<Toffset>>
577 scatter(AccessorTy acc, simd<Toffset, N> offsets, simd<T, N> vals,
578 #ifdef __ESIMD_FORCE_STATELESS_MEM
579  uint64_t glob_offset = 0,
580 #else
581  uint32_t glob_offset = 0,
582 #endif
583  simd_mask<N> mask = 1) {
584 #ifdef __ESIMD_FORCE_STATELESS_MEM
585  scatter<T, N>(__ESIMD_DNS::accessorToPointer<T>(acc, glob_offset), offsets,
586  vals, mask);
587 #else
588  detail::scatter_impl<T, N, AccessorTy>(acc, vals, offsets, glob_offset, mask);
589 #endif
590 }
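// Example (sketch): accessor-based gather/scatter inside an ESIMD kernel,
// assuming `acc` is a device accessor to a buffer of int obtained in the
// command group. Offsets are in bytes.
//   simd<uint32_t, 16> byte_offsets(0, sizeof(int));
//   simd<int, 16> v = gather<int, 16>(acc, byte_offsets);
//   scatter<int, 16>(acc, byte_offsets, v + 1);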
591 
599 template <typename T, typename AccessorTy>
600 __ESIMD_API T scalar_load(AccessorTy acc, uint32_t offset) {
601  const simd<T, 1> Res =
602  gather<T, 1, AccessorTy>(acc, simd<uint32_t, 1>(offset));
603  return Res[0];
604 }
605 
613 template <typename T, typename AccessorTy>
614 __ESIMD_API void scalar_store(AccessorTy acc, uint32_t offset, T val) {
615  scatter<T, 1, AccessorTy>(acc, simd<uint32_t, 1>(offset), simd<T, 1>(val));
616 }
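// Example (sketch): scalar accessor access, reusing the assumed `acc` from the
// sketch above; offsets are byte offsets.
//   int first = scalar_load<int>(acc, 0);
//   scalar_store<int>(acc, 4 /* byte offset */, first + 1);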
617 
651 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
652  int N, typename Toffset>
653 __ESIMD_API simd<T, N * get_num_channels_enabled(RGBAMask)>
654 gather_rgba(const T *p, simd<Toffset, N> offsets, simd_mask<N> mask = 1) {
655  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
656  static_assert((N == 8 || N == 16 || N == 32), "Unsupported value of N");
657  static_assert(sizeof(T) == 4, "Unsupported size of type T");
658  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
659  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
660  addrs = addrs + offsets_i;
661  return __esimd_svm_gather4_scaled<detail::__raw_t<T>, N, RGBAMask>(
662  addrs.data(), mask.data());
663 }
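// Example (sketch): gathering all four channels of 16 pixels from an assumed
// USM pointer `rgba` to 32-bit float pixel data; with the default ABGR mask
// the result packs one sub-vector per enabled channel.
//   float *rgba = ...;
//   simd<uint32_t, 16> pix_offsets(0, 4 * sizeof(float)); // one pixel apart
//   simd<float, 16 * 4> pixels = gather_rgba(rgba, pix_offsets);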
664 
680 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
681  int N, typename Toffset,
682  typename RegionTy = region1d_t<Toffset, N, 1>>
683 __ESIMD_API simd<T, N * get_num_channels_enabled(RGBAMask)>
684 gather_rgba(const T *p, simd_view<Toffset, RegionTy> offsets,
685  simd_mask<N> mask = 1) {
686  return gather_rgba<RGBAMask, T, N>(p, offsets.read(), mask);
687 }
688 
704 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
705  int N, typename Toffset>
706 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>,
707  simd<T, N * get_num_channels_enabled(RGBAMask)>>
708 gather_rgba(const T *p, Toffset offset, simd_mask<N> mask = 1) {
709  return gather_rgba<RGBAMask, T, N>(p, simd<Toffset, N>(offset), mask);
710 }
711 
712 template <typename T, int N, rgba_channel_mask RGBAMask>
713 __SYCL_DEPRECATED("use gather_rgba<rgba_channel_mask>()")
714 __ESIMD_API std::enable_if_t<
715  (N == 8 || N == 16 || N == 32) && sizeof(T) == 4,
716  simd<T, N * get_num_channels_enabled(RGBAMask)>>
717 gather_rgba(const T *p,
718  simd<uint32_t, N> offsets,
719  simd_mask<N> mask = 1) {
720  return gather_rgba<RGBAMask>(p, offsets, mask);
721 }
722 
723 namespace detail {
724 template <rgba_channel_mask M> static void validate_rgba_write_channel_mask() {
725  using CM = rgba_channel_mask;
726  static_assert(
727  (M == CM::ABGR || M == CM::BGR || M == CM::GR || M == CM::R) &&
728  "Only ABGR, BGR, GR, R channel masks are valid in write operations");
729 }
730 } // namespace detail
731 
753 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
754  int N, typename Toffset>
755 __ESIMD_API void
756 scatter_rgba(T *p, simd<Toffset, N> offsets,
757  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
758  simd_mask<N> mask = 1) {
759  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
760  static_assert((N == 8 || N == 16 || N == 32), "Unsupported value of N");
761  static_assert(sizeof(T) == 4, "Unsupported size of type T");
762  detail::validate_rgba_write_channel_mask<RGBAMask>();
763  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
764  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
765  addrs = addrs + offsets_i;
766  __esimd_svm_scatter4_scaled<detail::__raw_t<T>, N, RGBAMask>(
767  addrs.data(), vals.data(), mask.data());
768 }
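// Example (sketch): writing the gathered pixels back, continuing the
// gather_rgba sketch above. Only the ABGR, BGR, GR and R channel masks are
// accepted by write operations (see validate_rgba_write_channel_mask).
//   scatter_rgba(rgba, pix_offsets, pixels);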
769 
785 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
786  int N, typename Toffset,
787  typename RegionTy = region1d_t<Toffset, N, 1>>
788 __ESIMD_API void
789 scatter_rgba(T *p, simd_view<Toffset, RegionTy> offsets,
790  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
791  simd_mask<N> mask = 1) {
792  scatter_rgba<RGBAMask, T, N>(p, offsets.read(), vals, mask);
793 }
794 
810 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
811  int N, typename Toffset>
812 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> && N == 1>
813 scatter_rgba(T *p, Toffset offset,
814  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
815  simd_mask<N> mask = 1) {
816  scatter_rgba<RGBAMask, T, N>(p, simd<Toffset, N>(offset), vals, mask);
817 }
818 
819 template <typename T, int N, rgba_channel_mask RGBAMask>
820 __SYCL_DEPRECATED("use scatter_rgba<rgba_channel_mask>()")
821 __ESIMD_API std::
822  enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4> scatter_rgba(
823  T *p, simd<uint32_t, N> offsets,
824  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
825  simd_mask<N> mask = 1) {
826  scatter_rgba<RGBAMask>(p, offsets, vals, mask);
827 }
828 
851 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
852  typename AccessorT, int N,
853  typename T = typename AccessorT::value_type>
854 __ESIMD_API std::enable_if_t<((N == 8 || N == 16 || N == 32) &&
855  sizeof(T) == 4 && !std::is_pointer_v<AccessorT>),
856  simd<T, N * get_num_channels_enabled(RGBAMask)>>
857 gather_rgba(AccessorT acc, simd<uint32_t, N> offsets,
858  uint32_t global_offset = 0, simd_mask<N> mask = 1) {
859 #ifdef __ESIMD_FORCE_STATELESS_MEM
860  return gather_rgba<RGBAMask>(
861  __ESIMD_DNS::accessorToPointer<T>(acc, global_offset), offsets, mask);
862 #else
863  // TODO (performance) use hardware-supported scale once BE supports it
864  constexpr uint32_t Scale = 0;
865  const auto SI = get_surface_index(acc);
866  return __esimd_gather4_masked_scaled2<detail::__raw_t<T>, N, RGBAMask,
867  decltype(SI), Scale>(
868  SI, global_offset, offsets.data(), mask.data());
869 #endif
870 }
871 
886 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
887  typename AccessorT, int N,
888  typename T = typename AccessorT::value_type>
889 __ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4 &&
890  !std::is_pointer_v<AccessorT>>
891 scatter_rgba(AccessorT acc, simd<uint32_t, N> offsets,
892  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
893  uint32_t global_offset = 0, simd_mask<N> mask = 1) {
894  detail::validate_rgba_write_channel_mask<RGBAMask>();
895 #ifdef __ESIMD_FORCE_STATELESS_MEM
896  scatter_rgba<RGBAMask>(__ESIMD_DNS::accessorToPointer<T>(acc, global_offset),
897  offsets, vals, mask);
898 #else
899  // TODO (performance) use hardware-supported scale once BE supports it
900  constexpr uint32_t Scale = 0;
901  const auto SI = get_surface_index(acc);
902  __esimd_scatter4_scaled<T, N, decltype(SI), RGBAMask, Scale>(
903  mask.data(), SI, global_offset, offsets.data(), vals.data());
904 #endif
905 }
906 
908 
909 namespace detail {
912 template <__ESIMD_NS::atomic_op Op, typename T, int N, unsigned NumSrc>
913 constexpr void check_atomic() {
914 
915  static_assert((detail::isPowerOf2(N, 32)),
916  "Execution size 1, 2, 4, 8, 16, 32 are supported");
917 
918  static_assert(NumSrc == __ESIMD_DNS::get_num_args<Op>(),
919  "wrong number of operands");
920  constexpr bool IsInt2BytePlus =
921  std::is_integral_v<T> && (sizeof(T) >= sizeof(uint16_t));
922 
923  if constexpr (Op == __ESIMD_NS::atomic_op::xchg ||
924  Op == __ESIMD_NS::atomic_op::cmpxchg ||
925  Op == __ESIMD_NS::atomic_op::predec ||
926  Op == __ESIMD_NS::atomic_op::inc ||
927  Op == __ESIMD_NS::atomic_op::dec) {
928 
929  static_assert(IsInt2BytePlus, "Integral 16-bit or wider type is expected");
930  }
931  // FP ops (are always delegated to native::lsc::<Op>)
932  if constexpr (Op == __ESIMD_NS::atomic_op::fmax ||
933  Op == __ESIMD_NS::atomic_op::fmin ||
934  Op == __ESIMD_NS::atomic_op::fadd ||
935  Op == __ESIMD_NS::atomic_op::fsub) {
936  static_assert((is_type<T, float, sycl::half, double>()),
937  "float, double or sycl::half type is expected");
938  }
939  if constexpr (Op == __ESIMD_NS::atomic_op::add ||
940  Op == __ESIMD_NS::atomic_op::sub ||
941  Op == __ESIMD_NS::atomic_op::min ||
942  Op == __ESIMD_NS::atomic_op::max ||
943  Op == __ESIMD_NS::atomic_op::bit_and ||
944  Op == __ESIMD_NS::atomic_op::bit_or ||
945  Op == __ESIMD_NS::atomic_op::bit_xor ||
946  Op == __ESIMD_NS::atomic_op::minsint ||
947  Op == __ESIMD_NS::atomic_op::maxsint) {
948  static_assert(IsInt2BytePlus, "Integral 16-bit or wider type is expected");
949  constexpr bool IsSignedMinmax = (Op == __ESIMD_NS::atomic_op::minsint) ||
950  (Op == __ESIMD_NS::atomic_op::maxsint);
951  constexpr bool IsUnsignedMinmax = (Op == __ESIMD_NS::atomic_op::min) ||
952  (Op == __ESIMD_NS::atomic_op::max);
953 
954  if constexpr (IsSignedMinmax || IsUnsignedMinmax) {
955  constexpr bool SignOK = std::is_signed_v<T> == IsSignedMinmax;
956  static_assert(SignOK, "Signed/unsigned integer type expected for "
957  "signed/unsigned min/max operation");
958  }
959  }
960 }
961 } // namespace detail
962 
965 
989 template <atomic_op Op, typename Tx, int N, typename Toffset>
990 __ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<Toffset, N> offset,
991  simd<Tx, N> src0, simd_mask<N> mask) {
992  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
993  if constexpr ((Op == atomic_op::fmin) || (Op == atomic_op::fmax) ||
994  (Op == atomic_op::fadd) || (Op == atomic_op::fsub)) {
995  // Auto-convert FP atomics to LSC version. Warning is given - see enum.
996  return atomic_update<detail::to_lsc_atomic_op<Op>(), Tx, N>(p, offset, src0,
997  mask);
998  } else if constexpr (Op == atomic_op::store) {
999  if constexpr (std::is_integral_v<Tx>) {
1000  return atomic_update<atomic_op::xchg, Tx, N>(p, offset, src0, mask);
1001  } else {
1002  using Tint = detail::uint_type_t<sizeof(Tx)>;
1003  simd<Tint, N> Res = atomic_update<atomic_op::xchg, Tint, N>(
1004  reinterpret_cast<Tint *>(p), offset,
1005  src0.template bit_cast_view<Tint>(), mask);
1006  return Res.template bit_cast_view<Tx>();
1007  }
1008  } else {
1009  detail::check_atomic<Op, Tx, N, 1>();
1010  simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
1011  simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
1012  vAddr += offset_i1;
1013 
1014  using T = typename detail::__raw_t<Tx>;
1015  return __esimd_svm_atomic1<Op, T, N>(vAddr.data(), src0.data(),
1016  mask.data());
1017  }
1018 }
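// Example (sketch): per-lane atomic add on an assumed USM counter array; the
// returned vector holds the values observed before the update. Offsets are in
// bytes.
//   int *counters = ...;
//   simd<uint32_t, 8> offs(0, sizeof(int)); // 8 consecutive counters
//   simd<int, 8> old =
//       atomic_update<atomic_op::add>(counters, offs, simd<int, 8>(1), 1);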
1019 
1039 template <atomic_op Op, typename Tx, int N, typename Toffset,
1040  typename RegionTy = region1d_t<Toffset, N, 1>>
1041 __ESIMD_API simd<Tx, N> atomic_update(Tx *p,
1042  simd_view<Toffset, RegionTy> offsets,
1043  simd<Tx, N> src0, simd_mask<N> mask) {
1044  return atomic_update<Op, Tx, N>(p, offsets.read(), src0, mask);
1045 }
1046 
1065 template <atomic_op Op, typename Tx, int N, typename Toffset>
1066 __ESIMD_API std::enable_if_t<
1067  std::is_integral_v<Toffset> &&
1068  ((Op != atomic_op::store && Op != atomic_op::xchg) || N == 1),
1069  simd<Tx, N>>
1070 atomic_update(Tx *p, Toffset offset, simd<Tx, N> src0, simd_mask<N> mask) {
1071  return atomic_update<Op, Tx, N>(p, simd<Toffset, N>(offset), src0, mask);
1072 }
1073 
1093 template <atomic_op Op, typename Tx, int N, typename Toffset>
1094 __ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<Toffset, N> offset,
1095  simd_mask<N> mask) {
1096  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
1097  if constexpr (Op == atomic_op::load) {
1098  if constexpr (std::is_integral_v<Tx>) {
1099  return atomic_update<atomic_op::bit_or, Tx, N>(p, offset, simd<Tx, N>(0),
1100  mask);
1101  } else {
1102  using Tint = detail::uint_type_t<sizeof(Tx)>;
1103  simd<Tint, N> Res = atomic_update<atomic_op::bit_or, Tint, N>(
1104  reinterpret_cast<Tint *>(p), offset, simd<Tint, N>(0), mask);
1105  return Res.template bit_cast_view<Tx>();
1106  }
1107  } else {
1108  detail::check_atomic<Op, Tx, N, 0>();
1109 
1110  simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
1111  simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
1112  vAddr += offset_i1;
1113  using T = typename detail::__raw_t<Tx>;
1114  return __esimd_svm_atomic0<Op, T, N>(vAddr.data(), mask.data());
1115  }
1116 }
1117 
1132 template <atomic_op Op, typename Tx, int N, typename Toffset,
1133  typename RegionTy = region1d_t<Toffset, N, 1>>
1134 __ESIMD_API simd<Tx, N> atomic_update(Tx *p,
1135  simd_view<Toffset, RegionTy> offsets,
1136  simd_mask<N> mask = 1) {
1137  return atomic_update<Op, Tx, N>(p, offsets.read(), mask);
1138 }
1139 
1154 template <atomic_op Op, typename Tx, int N, typename Toffset>
1155 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>, simd<Tx, N>>
1156 atomic_update(Tx *p, Toffset offset, simd_mask<N> mask = 1) {
1157  return atomic_update<Op, Tx, N>(p, simd<Toffset, N>(offset), mask);
1158 }
1159 
1179 template <atomic_op Op, typename Tx, int N, typename Toffset>
1180 __ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<Toffset, N> offset,
1181  simd<Tx, N> src0, simd<Tx, N> src1,
1182  simd_mask<N> mask) {
1183  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
1184  if constexpr (Op == atomic_op::fcmpwr) {
1185  // Auto-convert FP atomics to LSC version. Warning is given - see enum.
1186  return atomic_update<detail::to_lsc_atomic_op<Op>(), Tx, N>(p, offset, src0,
1187  src1, mask);
1188  } else {
1189  detail::check_atomic<Op, Tx, N, 2>();
1190  simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
1191  simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
1192  vAddr += offset_i1;
1193  using T = typename detail::__raw_t<Tx>;
1194  return __esimd_svm_atomic2<Op, T, N>(vAddr.data(), src0.data(), src1.data(),
1195  mask.data());
1196  }
1197 }
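// Example (sketch): a two-source atomic (compare-and-exchange), reusing
// `counters` and `offs` from the sketch above. The meaning of src0 vs. src1
// (expected vs. new value) is defined by atomic_op::cmpxchg in common.hpp and
// is not restated here.
//   simd<int, 8> arg0(1), arg1(0);
//   simd<int, 8> prev =
//       atomic_update<atomic_op::cmpxchg>(counters, offs, arg0, arg1, 1);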
1198 
1215 template <atomic_op Op, typename Tx, int N, typename Toffset,
1216  typename RegionTy = region1d_t<Toffset, N, 1>>
1217 __ESIMD_API simd<Tx, N>
1218 atomic_update(Tx *p, simd_view<Toffset, RegionTy> offsets, simd<Tx, N> src0,
1219  simd<Tx, N> src1, simd_mask<N> mask) {
1220  return atomic_update<Op, Tx, N>(p, offsets.read(), src0, src1, mask);
1221 }
1222 
1239 template <atomic_op Op, typename Tx, int N, typename Toffset>
1240 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>, simd<Tx, N>>
1241 atomic_update(Tx *p, Toffset offset, simd<Tx, N> src0, simd<Tx, N> src1,
1242  simd_mask<N> mask) {
1243  return atomic_update<Op, Tx, N>(p, simd<Toffset, N>(offset), src0, src1,
1244  mask);
1245 }
1246 
1273 template <atomic_op Op, typename Tx, int N, typename Toffset,
1274  typename AccessorTy>
1275 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
1276  !std::is_pointer<AccessorTy>::value,
1277  simd<Tx, N>>
1278 atomic_update(AccessorTy acc, simd<Toffset, N> offset, simd<Tx, N> src0,
1279  simd_mask<N> mask) {
1280 #ifdef __ESIMD_FORCE_STATELESS_MEM
1281  return atomic_update<Op, Tx, N>(__ESIMD_DNS::accessorToPointer<Tx>(acc),
1282  offset, src0, mask);
1283 #else
1284  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
1285  static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported");
1286  if constexpr ((Op == atomic_op::fmin) || (Op == atomic_op::fmax) ||
1287  (Op == atomic_op::fadd) || (Op == atomic_op::fsub)) {
1288  // Auto-convert FP atomics to LSC version. Warning is given - see enum.
1289  return atomic_update<detail::to_lsc_atomic_op<Op>(), Tx, N>(acc, offset,
1290  src0, mask);
1291  } else if constexpr (Op == atomic_op::store) {
1292  if constexpr (std::is_integral_v<Tx>) {
1293  return atomic_update<atomic_op::xchg, Tx, N>(acc, offset, src0, mask);
1294  } else {
1295  using Tint = detail::uint_type_t<sizeof(Tx)>;
1296  simd<Tint, N> Res = atomic_update<atomic_op::xchg, Tint, N>(
1297  acc, offset, src0.template bit_cast_view<Tint>(), mask);
1298  return Res.template bit_cast_view<Tx>();
1299  }
1300  } else {
1301  detail::check_atomic<Op, Tx, N, 1>();
1302  static_assert(sizeof(Tx) == 4, "Only 32 bit data is supported");
1303  const auto si = __ESIMD_NS::get_surface_index(acc);
1304  using T = typename detail::__raw_t<Tx>;
1305  return __esimd_dword_atomic1<Op, T, N>(mask.data(), si, offset.data(),
1306  src0.data());
1307  }
1308 #endif
1309 }
1310 
1333 template <atomic_op Op, typename Tx, int N, typename Toffset,
1334  typename AccessorTy, typename RegionTy = region1d_t<Toffset, N, 1>>
1335 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
1336  !std::is_pointer<AccessorTy>::value,
1337  simd<Tx, N>>
1338 atomic_update(AccessorTy acc, simd_view<Toffset, RegionTy> offsets,
1339  simd<Tx, N> src0, simd_mask<N> mask) {
1340  return atomic_update<Op, Tx, N>(acc, offsets.read(), src0, mask);
1341 }
1342 
1364 template <atomic_op Op, typename Tx, int N, typename Toffset,
1365  typename AccessorTy>
1366 __ESIMD_API std::enable_if_t<
1367  std::is_integral_v<Toffset> && !std::is_pointer<AccessorTy>::value &&
1368  ((Op != atomic_op::store && Op != atomic_op::xchg) || N == 1),
1369  simd<Tx, N>>
1370 atomic_update(AccessorTy acc, Toffset offset, simd<Tx, N> src0,
1371  simd_mask<N> mask) {
1372  return atomic_update<Op, Tx, N>(acc, simd<Toffset, N>(offset), src0, mask);
1373 }
1374 
1397 template <atomic_op Op, typename Tx, int N, typename Toffset,
1398  typename AccessorTy>
1399 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
1401  !std::is_pointer<AccessorTy>::value,
1402  simd<Tx, N>>
1403  atomic_update(AccessorTy acc, simd<Toffset, N> offset, simd_mask<N> mask) {
1404 #ifdef __ESIMD_FORCE_STATELESS_MEM
1405  return atomic_update<Op, Tx, N>(__ESIMD_DNS::accessorToPointer<Tx>(acc),
1406  offset, mask);
1407 #else
1408  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
1409  if constexpr (Op == atomic_op::load) {
1410  if constexpr (std::is_integral_v<Tx>) {
1411  return atomic_update<atomic_op::bit_or, Tx, N>(acc, offset,
1412  simd<Tx, N>(0), mask);
1413  } else {
1414  using Tint = detail::uint_type_t<sizeof(Tx)>;
1415  simd<Tint, N> Res = atomic_update<atomic_op::bit_or, Tint, N>(
1416  acc, offset, simd<Tint, N>(0), mask);
1417  return Res.template bit_cast_view<Tx>();
1418  }
1419  } else {
1420  detail::check_atomic<Op, Tx, N, 0>();
1421  static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported");
1422 
1423  static_assert(sizeof(Tx) == 4, "Only 32 bit data is supported");
1424  const auto si = __ESIMD_NS::get_surface_index(acc);
1425  using T = typename detail::__raw_t<Tx>;
1426  return __esimd_dword_atomic0<Op, T, N>(mask.data(), si, offset.data());
1427  }
1428 #endif
1429 }
1430 
1448 template <atomic_op Op, typename Tx, int N, typename Toffset,
1449  typename AccessorTy, typename RegionTy = region1d_t<Toffset, N, 1>>
1450 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
1451  !std::is_pointer<AccessorTy>::value,
1452  simd<Tx, N>>
1453 atomic_update(AccessorTy acc, simd_view<Toffset, RegionTy> offsets,
1454  simd_mask<N> mask) {
1455  return atomic_update<Op, Tx, N>(acc, offsets.read(), mask);
1456 }
1457 
1475 template <atomic_op Op, typename Tx, int N, typename Toffset,
1476  typename AccessorTy>
1477 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
1478  !std::is_pointer<AccessorTy>::value,
1479  simd<Tx, N>>
1480 atomic_update(AccessorTy acc, Toffset offset, simd_mask<N> mask) {
1481  return atomic_update<Op, Tx, N>(acc, simd<Toffset, N>(offset), mask);
1482 }
1483 
1506 template <atomic_op Op, typename Tx, int N, typename Toffset,
1507  typename AccessorTy>
1508 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
1509  !std::is_pointer<AccessorTy>::value,
1510  simd<Tx, N>>
1511 atomic_update(AccessorTy acc, simd<Toffset, N> offset, simd<Tx, N> src0,
1512  simd<Tx, N> src1, simd_mask<N> mask) {
1513 #ifdef __ESIMD_FORCE_STATELESS_MEM
1514  return atomic_update<Op, Tx, N>(__ESIMD_DNS::accessorToPointer<Tx>(acc),
1515  offset, src0, src1, mask);
1516 #else
1517  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
1518  static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported");
1519  if constexpr (Op == atomic_op::fcmpwr) {
1520  // Auto-convert FP atomics to LSC version. Warning is given - see enum.
1521  return atomic_update<detail::to_lsc_atomic_op<Op>(), Tx, N>(
1522  acc, offset, src0, src1, mask);
1523  } else {
1524  detail::check_atomic<Op, Tx, N, 2>();
1525  static_assert(sizeof(Tx) == 4, "Only 32 bit data is supported");
1526  const auto si = __ESIMD_NS::get_surface_index(acc);
1527  using T = typename detail::__raw_t<Tx>;
1528  return __esimd_dword_atomic2<Op, T, N>(mask.data(), si, offset.data(),
1529  src0.data(), src1.data());
1530  }
1531 #endif
1532 }
1533 
1553 template <atomic_op Op, typename Tx, int N, typename Toffset,
1554  typename AccessorTy, typename RegionTy = region1d_t<Toffset, N, 1>>
1555 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
1556  !std::is_pointer<AccessorTy>::value,
1557  simd<Tx, N>>
1558 atomic_update(AccessorTy acc, simd_view<Toffset, RegionTy> offsets,
1559  simd<Tx, N> src0, simd<Tx, N> src1, simd_mask<N> mask) {
1560  return atomic_update<Op, Tx, N>(acc, offsets.read(), src0, src1, mask);
1561 }
1562 
1582 template <atomic_op Op, typename Tx, int N, typename Toffset,
1583  typename AccessorTy>
1584 __ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
1585  !std::is_pointer<AccessorTy>::value,
1586  simd<Tx, N>>
1587 atomic_update(AccessorTy acc, Toffset offset, simd<Tx, N> src0,
1588  simd<Tx, N> src1, simd_mask<N> mask) {
1589  return atomic_update<Op, Tx, N>(acc, simd<Toffset, N>(offset), src0, src1,
1590  mask);
1591 }
1592 
1594 
1597 
1600 enum fence_mask : uint8_t {
1618  sw_barrier = 0x80
1619 };
1620 
1624 template <uint8_t cntl> __ESIMD_API void fence() { __esimd_fence(cntl); }
1625 
1626 __SYCL_DEPRECATED("use fence<fence_mask>()")
1627 __ESIMD_API void fence(fence_mask cntl) { __esimd_fence(cntl); }
1628 
1637 __ESIMD_API void barrier() {
1639  __esimd_barrier();
1640 }
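// Example (sketch): a typical pattern - issue a memory fence so prior SLM
// writes become visible, then synchronize the work-group. The fence_mask
// enumerators other than sw_barrier are elided from this listing;
// local_barrier is assumed to be one of them.
//   fence<fence_mask::local_barrier>();
//   barrier();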
1641 
1643 
1646 
1659 template <uint32_t SLMSize> __ESIMD_API void slm_init() {
1660  __esimd_slm_init(SLMSize);
1661 }
1662 
1668 // with esimd::slm_allocator() class.
1671 __ESIMD_API void slm_init(uint32_t size) { __esimd_slm_init(size); }
1672 
1678 template <typename T, int N>
1679 __ESIMD_API
1680  std::enable_if_t<(N == 1 || N == 8 || N == 16 || N == 32), simd<T, N>>
1681 slm_gather(simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
1682  detail::LocalAccessorMarker acc;
1683  return detail::gather_impl<T, N>(acc, offsets, 0, mask);
1684 }
1685 
1691 template <typename T> __ESIMD_API T slm_scalar_load(uint32_t offset) {
1692  const simd<T, 1> Res = slm_gather<T, 1>(simd<uint32_t, 1>(offset));
1693  return Res[0];
1694 }
1695 
1701 template <typename T, int N>
1702 __ESIMD_API std::enable_if_t<(N == 1 || N == 8 || N == 16 || N == 32) &&
1703  (sizeof(T) <= 4)>
1704 slm_scatter(simd<uint32_t, N> offsets, simd<T, N> vals, simd_mask<N> mask = 1) {
1705  detail::LocalAccessorMarker acc;
1706  detail::scatter_impl<T, N>(acc, vals, offsets, 0, mask);
1707 }
1708 
1714 template <typename T>
1715 __ESIMD_API void slm_scalar_store(uint32_t offset, T val) {
1716  slm_scatter<T, 1>(simd<uint32_t, 1>(offset), simd<T, 1>(val), 1);
1717 }
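// Example (sketch): basic SLM usage - reserve SLM for the work-group, then
// scatter to it and gather from it. Offsets are byte offsets from the start of
// the SLM region.
//   slm_init<16 * 1024>();                        // 16 KB of SLM
//   simd<uint32_t, 16> slm_offs(0, sizeof(float));
//   slm_scatter<float, 16>(slm_offs, simd<float, 16>(0.f));
//   simd<float, 16> cached = slm_gather<float, 16>(slm_offs);
//   slm_scalar_store<float>(64, 3.14f);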
1718 
1729 template <typename T, int N, rgba_channel_mask RGBAMask>
1730 __ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && (sizeof(T) == 4),
1731  simd<T, N * get_num_channels_enabled(RGBAMask)>>
1732 slm_gather_rgba(simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
1733  const auto SI = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
1734  return __esimd_gather4_masked_scaled2<T, N, RGBAMask>(
1735  SI, 0 /*global_offset*/, offsets.data(), mask.data());
1736 }
1737 
1748 template <typename T, int N, rgba_channel_mask Mask>
1749 __ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && (sizeof(T) == 4)>
1750 slm_scatter_rgba(simd<uint32_t, N> offsets,
1751  simd<T, N * get_num_channels_enabled(Mask)> vals,
1752  simd_mask<N> mask = 1) {
1753  detail::validate_rgba_write_channel_mask<Mask>();
1754  const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
1755  constexpr int16_t Scale = 0;
1756  constexpr int global_offset = 0;
1757  __esimd_scatter4_scaled<T, N, decltype(si), Mask, Scale>(
1758  mask.data(), si, global_offset, offsets.data(), vals.data());
1759 }
1760 
1769 template <typename T, int N>
1770 __ESIMD_API simd<T, N> slm_block_load(uint32_t offset) {
1771  constexpr unsigned Sz = sizeof(T) * N;
1772  static_assert(Sz >= detail::OperandSize::OWORD,
1773  "block size must be at least 1 oword");
1774  static_assert(Sz % detail::OperandSize::OWORD == 0,
1775  "block size must be whole number of owords");
1776  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
1777  "block must be 1, 2, 4 or 8 owords long");
1778  static_assert(Sz <= 16 * detail::OperandSize::OWORD,
1779  "block size must be at most 16 owords");
1780 
1781  const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
1782  return __esimd_oword_ld<detail::__raw_t<T>, N>(si, offset >> 4);
1783 }
1784 
1793 template <typename T, int N>
1794 __ESIMD_API void slm_block_store(uint32_t offset, simd<T, N> vals) {
1795  constexpr unsigned Sz = sizeof(T) * N;
1796  static_assert(Sz >= detail::OperandSize::OWORD,
1797  "block size must be at least 1 oword");
1798  static_assert(Sz % detail::OperandSize::OWORD == 0,
1799  "block size must be whole number of owords");
1800  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
1801  "block must be 1, 2, 4 or 8 owords long");
1802  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
1803  "block size must be at most 8 owords");
1804  const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
1805  // offset in genx.oword.st is in owords
1806  __esimd_oword_st<detail::__raw_t<T>, N>(si, offset >> 4, vals.data());
1807 }
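// Example (sketch): contiguous SLM block accesses; the byte offset should be
// oword (16-byte) aligned, since it is converted to an oword index internally.
//   slm_block_store<int, 16>(0, simd<int, 16>(0));  // 64 bytes at offset 0
//   simd<int, 16> blk = slm_block_load<int, 16>(0);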
1808 
1812 template <atomic_op Op, typename Tx, int N, class T = detail::__raw_t<Tx>>
1813 __ESIMD_API simd<Tx, N> slm_atomic_update(simd<uint32_t, N> offsets,
1814  simd_mask<N> mask) {
1815  detail::check_atomic<Op, T, N, 0>();
1816  const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
1817  return __esimd_dword_atomic0<Op, T, N>(mask.data(), si, offsets.data());
1818 }
1819 
1823 template <atomic_op Op, typename Tx, int N, class T = detail::__raw_t<Tx>>
1824 __ESIMD_API simd<Tx, N> slm_atomic_update(simd<uint32_t, N> offsets,
1825  simd<Tx, N> src0, simd_mask<N> mask) {
1826  detail::check_atomic<Op, T, N, 1>();
1827  const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
1828  return __esimd_dword_atomic1<Op, T, N>(mask.data(), si, offsets.data(),
1829  src0.data());
1830 }
1831 
1835 template <atomic_op Op, typename Tx, int N, class T = detail::__raw_t<Tx>>
1836 __ESIMD_API simd<Tx, N> slm_atomic_update(simd<uint32_t, N> offsets,
1837  simd<Tx, N> src0, simd<Tx, N> src1,
1838  simd_mask<N> mask) {
1839  detail::check_atomic<Op, T, N, 2>();
1840  const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
1841  return __esimd_dword_atomic2<Op, T, N>(mask.data(), si, offsets.data(),
1842  src0.data(), src1.data());
1843 }
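// Example (sketch): zero-source atomic increment of dword counters kept in
// SLM, under the same SLM layout assumptions as the sketches above.
//   simd<uint32_t, 8> cnt_offs(0, sizeof(uint32_t));
//   auto prev = slm_atomic_update<atomic_op::inc, uint32_t, 8>(cnt_offs, 1);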
1844 
1846 
1847 #ifndef __ESIMD_FORCE_STATELESS_MEM
1848 
1863 template <typename T, int m, int N, typename AccessorTy, unsigned plane = 0>
1864 __ESIMD_API simd<T, m * N> media_block_load(AccessorTy acc, unsigned x,
1865  unsigned y) {
1866  constexpr unsigned Width = N * sizeof(T);
1867  static_assert(Width * m <= 256u,
1868  "data does not fit into a single dataport transaction");
1869  static_assert(Width <= 64u, "valid block width is in range [1, 64]");
1870  static_assert(m <= 64u, "valid block height is in range [1, 64]");
1871  static_assert(plane <= 3u, "valid plane index is in range [0, 3]");
1872 
1873  const auto si = __ESIMD_NS::get_surface_index(acc);
1874  using SurfIndTy = decltype(si);
1875  constexpr unsigned int RoundedWidth =
1876  Width < 4 ? 4 : detail::getNextPowerOf2<Width>();
1877  constexpr int BlockWidth = sizeof(T) * N;
1878  constexpr int Mod = 0;
1879 
1880  if constexpr (Width < RoundedWidth) {
1881  constexpr unsigned int n1 = RoundedWidth / sizeof(T);
1882  simd<T, m *n1> temp =
1883  __esimd_media_ld<T, m, n1, Mod, SurfIndTy, (int)plane, BlockWidth>(
1884  si, x, y);
1885  return temp.template select<m, 1, N, 1>(0, 0);
1886  } else {
1887  return __esimd_media_ld<T, m, N, Mod, SurfIndTy, (int)plane, BlockWidth>(
1888  si, x, y);
1889  }
1890 }
1891 
1904 template <typename T, int m, int N, typename AccessorTy, unsigned plane = 0>
1905 __ESIMD_API void media_block_store(AccessorTy acc, unsigned x, unsigned y,
1906  simd<T, m * N> vals) {
1907  constexpr unsigned Width = N * sizeof(T);
1908  static_assert(Width * m <= 256u,
1909  "data does not fit into a single dataport transaction");
1910  static_assert(Width <= 64u, "valid block width is in range [1, 64]");
1911  static_assert(m <= 64u, "valid block height is in range [1, 64]");
1912  static_assert(plane <= 3u, "valid plane index is in range [0, 3]");
1913  const auto si = __ESIMD_NS::get_surface_index(acc);
1914  using SurfIndTy = decltype(si);
1915  constexpr unsigned int RoundedWidth =
1916  Width < 4 ? 4 : detail::getNextPowerOf2<Width>();
1917  constexpr unsigned int n1 = RoundedWidth / sizeof(T);
1918  constexpr int BlockWidth = sizeof(T) * N;
1919  constexpr int Mod = 0;
1920 
1921  if constexpr (Width < RoundedWidth) {
1922  simd<T, m * n1> temp;
1923  auto temp_ref = temp.template bit_cast_view<T, m, n1>();
1924  auto vals_ref = vals.template bit_cast_view<T, m, N>();
1925  temp_ref.template select<m, 1, N, 1>() = vals_ref;
1926  __esimd_media_st<T, m, n1, Mod, SurfIndTy, plane, BlockWidth>(si, x, y,
1927  temp.data());
1928  } else {
1929  __esimd_media_st<T, m, N, Mod, SurfIndTy, plane, BlockWidth>(si, x, y,
1930  vals.data());
1931  }
1932 }
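// Example (sketch): 2D media-block access, assuming `img` is an image accessor
// for a 2D image of 32-bit pixels created in the command group, `x` is a byte
// offset within a row and `y` a row index. An 8x8 block of ints is read,
// modified and written back.
//   auto blk = media_block_load<int, 8, 8>(img, x, y);
//   media_block_store<int, 8, 8>(img, x, y, blk + 1);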
1933 #endif // !__ESIMD_FORCE_STATELESS_MEM
1934 
1936 
1938 
1939 namespace detail {
1940 // ----- Outlined implementations of simd_obj_impl class memory access APIs.
1941 
1942 template <typename T, int N, class T1, class SFINAE>
1943 template <typename Flags, int ChunkSize, typename>
1944 void simd_obj_impl<T, N, T1, SFINAE>::copy_from(
1945  const simd_obj_impl<T, N, T1, SFINAE>::element_type *Addr,
1946  Flags) SYCL_ESIMD_FUNCTION {
1947  using UT = simd_obj_impl<T, N, T1, SFINAE>::element_type;
1948  constexpr unsigned Size = sizeof(T) * N;
1949  constexpr unsigned Align = Flags::template alignment<T1>;
1950 
1951  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
1952  constexpr unsigned NumBlocks = Size / BlockSize;
1953  constexpr unsigned RemSize = Size % BlockSize;
1954 
1955  if constexpr (Align >= OperandSize::DWORD && Size % OperandSize::OWORD == 0 &&
1956  detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
1957  if constexpr (NumBlocks > 0) {
1958  constexpr unsigned BlockN = BlockSize / sizeof(T);
1959  ForHelper<NumBlocks>::unroll([BlockN, Addr, this](unsigned Block) {
1960  select<BlockN, 1>(Block * BlockN) =
1961  block_load<UT, BlockN, Flags>(Addr + (Block * BlockN), Flags{});
1962  });
1963  }
1964  if constexpr (RemSize > 0) {
1965  constexpr unsigned RemN = RemSize / sizeof(T);
1966  constexpr unsigned BlockN = BlockSize / sizeof(T);
1967  select<RemN, 1>(NumBlocks * BlockN) =
1968  block_load<UT, RemN, Flags>(Addr + (NumBlocks * BlockN), Flags{});
1969  }
1970  } else if constexpr (sizeof(T) == 8) {
1971  simd<int32_t, N * 2> BC(reinterpret_cast<const int32_t *>(Addr), Flags{});
1972  bit_cast_view<int32_t>() = BC;
1973  } else {
1974  constexpr unsigned NumChunks = N / ChunkSize;
1975  if constexpr (NumChunks > 0) {
1976  simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
1977  ForHelper<NumChunks>::unroll([Addr, &Offsets, this](unsigned Block) {
1978  select<ChunkSize, 1>(Block * ChunkSize) =
1979  gather<UT, ChunkSize>(Addr + (Block * ChunkSize), Offsets);
1980  });
1981  }
1982  constexpr unsigned RemN = N % ChunkSize;
1983  if constexpr (RemN > 0) {
1984  if constexpr (RemN == 1) {
1985  select<1, 1>(NumChunks * ChunkSize) = Addr[NumChunks * ChunkSize];
1986  } else if constexpr (RemN == 8 || RemN == 16) {
1987  simd<uint32_t, RemN> Offsets(0u, sizeof(T));
1988  select<RemN, 1>(NumChunks * ChunkSize) =
1989  gather<UT, RemN>(Addr + (NumChunks * ChunkSize), Offsets);
1990  } else {
1991  constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
1992  simd_mask_type<N1> Pred(0);
1993  Pred.template select<RemN, 1>() = 1;
1994  simd<uint32_t, N1> Offsets(0u, sizeof(T));
1995  simd<UT, N1> Vals =
1996  gather<UT, N1>(Addr + (NumChunks * ChunkSize), Offsets, Pred);
1997  select<RemN, 1>(NumChunks * ChunkSize) =
1998  Vals.template select<RemN, 1>();
1999  }
2000  }
2001  }
2002 }
2003 
2004 template <typename T, int N, class T1, class SFINAE>
2005 template <typename AccessorT, typename Flags, int ChunkSize, typename>
2006 ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_read,
2007  sycl::access::target::device, void>
2008 simd_obj_impl<T, N, T1, SFINAE>::copy_from(AccessorT acc, uint32_t offset,
2009  Flags) SYCL_ESIMD_FUNCTION {
2010  using UT = simd_obj_impl<T, N, T1, SFINAE>::element_type;
2011  static_assert(sizeof(UT) == sizeof(T));
2012  constexpr unsigned Size = sizeof(T) * N;
2013  constexpr unsigned Align = Flags::template alignment<T1>;
2014 
2015  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
2016  constexpr unsigned NumBlocks = Size / BlockSize;
2017  constexpr unsigned RemSize = Size % BlockSize;
2018 
2019  if constexpr (Align >= OperandSize::DWORD && Size % OperandSize::OWORD == 0 &&
2020  detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
2021  if constexpr (NumBlocks > 0) {
2022  constexpr unsigned BlockN = BlockSize / sizeof(T);
2023  ForHelper<NumBlocks>::unroll([BlockN, acc, offset, this](unsigned Block) {
2024  select<BlockN, 1>(Block * BlockN) =
2025  block_load<UT, BlockN, AccessorT, Flags>(
2026  acc, offset + (Block * BlockSize), Flags{});
2027  });
2028  }
2029  if constexpr (RemSize > 0) {
2030  constexpr unsigned RemN = RemSize / sizeof(T);
2031  constexpr unsigned BlockN = BlockSize / sizeof(T);
2032  select<RemN, 1>(NumBlocks * BlockN) =
2033  block_load<UT, RemN, AccessorT, Flags>(
2034  acc, offset + (NumBlocks * BlockSize), Flags{});
2035  }
2036  } else if constexpr (sizeof(T) == 8) {
2037  simd<int32_t, N * 2> BC(acc, offset, Flags{});
2038  bit_cast_view<int32_t>() = BC;
2039  } else {
2040  constexpr unsigned NumChunks = N / ChunkSize;
2041  if constexpr (NumChunks > 0) {
2042  simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
2043  ForHelper<NumChunks>::unroll(
2044  [acc, offset, &Offsets, this](unsigned Block) {
2045  select<ChunkSize, 1>(Block * ChunkSize) =
2046  gather<UT, ChunkSize, AccessorT>(
2047  acc, Offsets, offset + (Block * ChunkSize * sizeof(T)));
2048  });
2049  }
2050  constexpr unsigned RemN = N % ChunkSize;
2051  if constexpr (RemN > 0) {
2052  if constexpr (RemN == 1 || RemN == 8 || RemN == 16) {
2053  simd<uint32_t, RemN> Offsets(0u, sizeof(T));
2054  select<RemN, 1>(NumChunks * ChunkSize) = gather<UT, RemN, AccessorT>(
2055  acc, Offsets, offset + (NumChunks * ChunkSize * sizeof(T)));
2056  } else {
2057  constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
2058  simd_mask_type<N1> Pred(0);
2059  Pred.template select<RemN, 1>() = 1;
2060  simd<uint32_t, N1> Offsets(0u, sizeof(T));
2061  simd<UT, N1> Vals = gather<UT, N1>(
2062  acc, Offsets, offset + (NumChunks * ChunkSize * sizeof(T)), Pred);
2063  select<RemN, 1>(NumChunks * ChunkSize) =
2064  Vals.template select<RemN, 1>();
2065  }
2066  }
2067  }
2068 }
2069 
2070 template <typename T, int N, class T1, class SFINAE>
2071 template <typename Flags, int ChunkSize, typename>
2072 void simd_obj_impl<T, N, T1, SFINAE>::copy_to(
2073  simd_obj_impl<T, N, T1, SFINAE>::element_type *Addr,
2074  Flags) const SYCL_ESIMD_FUNCTION {
2075  using UT = simd_obj_impl<T, N, T1, SFINAE>::element_type;
2076  constexpr unsigned Size = sizeof(T) * N;
2077  constexpr unsigned Align = Flags::template alignment<T1>;
2078 
2079  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
2080  constexpr unsigned NumBlocks = Size / BlockSize;
2081  constexpr unsigned RemSize = Size % BlockSize;
2082 
2083  simd<UT, N> Tmp{data()};
2084  if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 &&
2085  detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
2086  if constexpr (NumBlocks > 0) {
2087  constexpr unsigned BlockN = BlockSize / sizeof(T);
2088  ForHelper<NumBlocks>::unroll([BlockN, Addr, &Tmp](unsigned Block) {
2089  block_store<UT, BlockN>(Addr + (Block * BlockN),
2090  Tmp.template select<BlockN, 1>(Block * BlockN));
2091  });
2092  }
2093  if constexpr (RemSize > 0) {
2094  constexpr unsigned RemN = RemSize / sizeof(T);
2095  constexpr unsigned BlockN = BlockSize / sizeof(T);
2096  block_store<UT, RemN>(Addr + (NumBlocks * BlockN),
2097  Tmp.template select<RemN, 1>(NumBlocks * BlockN));
2098  }
2099  } else if constexpr (sizeof(T) == 8) {
2100  simd<int32_t, N * 2> BC = Tmp.template bit_cast_view<int32_t>();
2101  BC.copy_to(reinterpret_cast<int32_t *>(Addr), Flags{});
2102  } else {
2103  constexpr unsigned NumChunks = N / ChunkSize;
2104  if constexpr (NumChunks > 0) {
2105  simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
2106  ForHelper<NumChunks>::unroll([Addr, &Offsets, &Tmp](unsigned Block) {
2107  scatter<UT, ChunkSize>(
2108  Addr + (Block * ChunkSize), Offsets,
2109  Tmp.template select<ChunkSize, 1>(Block * ChunkSize));
2110  });
2111  }
2112  constexpr unsigned RemN = N % ChunkSize;
2113  if constexpr (RemN > 0) {
2114  if constexpr (RemN == 1) {
2115  Addr[NumChunks * ChunkSize] = Tmp[NumChunks * ChunkSize];
2116  } else if constexpr (RemN == 8 || RemN == 16) {
2117  // TODO: GPU runtime may handle scatter of 16 byte elements
2118  // incorrectly. The code below is a workaround which must be deleted
2119  // once GPU runtime is fixed.
2120  if constexpr (sizeof(T) == 1 && RemN == 16) {
2121  if constexpr (Align % OperandSize::DWORD > 0) {
2122  ForHelper<RemN>::unroll([Addr, &Tmp](unsigned Index) {
2123  Addr[Index + NumChunks * ChunkSize] =
2124  Tmp[Index + NumChunks * ChunkSize];
2125  });
2126  } else {
2127  simd_mask_type<8> Pred(0);
2128  simd<int32_t, 8> Vals;
2129  Pred.template select<4, 1>() = 1;
2130  Vals.template select<4, 1>() =
2131  Tmp.template bit_cast_view<int32_t>().template select<4, 1>(
2132  NumChunks * ChunkSize);
2133 
2134  simd<uint32_t, 8> Offsets(0u, sizeof(int32_t));
2135  scatter<int32_t, 8>(
2136  reinterpret_cast<int32_t *>(Addr + (NumChunks * ChunkSize)),
2137  Offsets, Vals, Pred);
2138  }
2139  } else {
2140  simd<uint32_t, RemN> Offsets(0u, sizeof(T));
2141  scatter<UT, RemN>(
2142  Addr + (NumChunks * ChunkSize), Offsets,
2143  Tmp.template select<RemN, 1>(NumChunks * ChunkSize));
2144  }
2145  } else {
2146  constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
2147  simd_mask_type<N1> Pred(0);
2148  Pred.template select<RemN, 1>() = 1;
2149  simd<UT, N1> Vals;
2150  Vals.template select<RemN, 1>() =
2151  Tmp.template select<RemN, 1>(NumChunks * ChunkSize);
2152  simd<uint32_t, N1> Offsets(0u, sizeof(T));
2153  scatter<UT, N1>(Addr + (NumChunks * ChunkSize), Offsets, Vals, Pred);
2154  }
2155  }
2156  }
2157 }
2158 
2159 template <typename T, int N, class T1, class SFINAE>
2160 template <typename AccessorT, typename Flags, int ChunkSize, typename>
2161 ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_write,
2162  sycl::access::target::device, void>
2163 simd_obj_impl<T, N, T1, SFINAE>::copy_to(AccessorT acc, uint32_t offset,
2164  Flags) const SYCL_ESIMD_FUNCTION {
2165  using UT = simd_obj_impl<T, N, T1, SFINAE>::element_type;
2166  constexpr unsigned Size = sizeof(T) * N;
2167  constexpr unsigned Align = Flags::template alignment<T1>;
2168 
2169  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
2170  constexpr unsigned NumBlocks = Size / BlockSize;
2171  constexpr unsigned RemSize = Size % BlockSize;
2172 
2173  simd<UT, N> Tmp{data()};
2174 
2175  if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 &&
2176  detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
2177  if constexpr (NumBlocks > 0) {
2178  constexpr unsigned BlockN = BlockSize / sizeof(T);
2179  ForHelper<NumBlocks>::unroll([BlockN, acc, offset, &Tmp](unsigned Block) {
2180  block_store<UT, BlockN, AccessorT>(
2181  acc, offset + (Block * BlockSize),
2182  Tmp.template select<BlockN, 1>(Block * BlockN));
2183  });
2184  }
2185  if constexpr (RemSize > 0) {
2186  constexpr unsigned RemN = RemSize / sizeof(T);
2187  constexpr unsigned BlockN = BlockSize / sizeof(T);
2188  block_store<UT, RemN, AccessorT>(
2189  acc, offset + (NumBlocks * BlockSize),
2190  Tmp.template select<RemN, 1>(NumBlocks * BlockN));
2191  }
2192  } else if constexpr (sizeof(T) == 8) {
2193  simd<int32_t, N * 2> BC = Tmp.template bit_cast_view<int32_t>();
2194  BC.copy_to(acc, offset, Flags{});
2195  } else {
2196  constexpr unsigned NumChunks = N / ChunkSize;
2197  if constexpr (NumChunks > 0) {
2198  simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
2199  ForHelper<NumChunks>::unroll([acc, offset, &Offsets,
2200  &Tmp](unsigned Block) {
2201  scatter<UT, ChunkSize, AccessorT>(
2202  acc, Offsets, Tmp.template select<ChunkSize, 1>(Block * ChunkSize),
2203  offset + (Block * ChunkSize * sizeof(T)));
2204  });
2205  }
2206  constexpr unsigned RemN = N % ChunkSize;
2207  if constexpr (RemN > 0) {
2208  if constexpr (RemN == 1 || RemN == 8 || RemN == 16) {
2209  simd<uint32_t, RemN> Offsets(0u, sizeof(T));
2210  scatter<UT, RemN, AccessorT>(
2211  acc, Offsets, Tmp.template select<RemN, 1>(NumChunks * ChunkSize),
2212  offset + (NumChunks * ChunkSize * sizeof(T)));
2213  } else {
2214  constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
2215  simd_mask_type<N1> Pred(0);
2216  Pred.template select<RemN, 1>() = 1;
2217  simd<UT, N1> Vals;
2218  Vals.template select<RemN, 1>() =
2219  Tmp.template select<RemN, 1>(NumChunks * ChunkSize);
2220  simd<uint32_t, N1> Offsets(0u, sizeof(T));
2221  scatter<UT, N1, AccessorT>(acc, Offsets, Vals,
2222  offset + (NumChunks * ChunkSize * sizeof(T)),
2223  Pred);
2224  }
2225  }
2226  }
2227 }
2228 
2229 } // namespace detail
2231 
2232 } // namespace ext::intel::esimd
2233 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
2234 } // namespace sycl
sycl::_V1::ext::intel::esimd::rgba_channel_mask
rgba_channel_mask
Represents a pixel's channel mask - all possible combinations of enabled channels.
Definition: common.hpp:105
sycl::_V1::ext::intel::esimd::block_store
__ESIMD_API void block_store(AccessorTy acc, uint32_t offset, simd< Tx, N > vals)
Stores elements of a vector to a contiguous block of memory represented by an accessor and an offset within that accessor.
Definition: memory.hpp:395
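A minimal sketch of a block load/store round trip (not from this file); acc is assumed to be a read-write device accessor over float data and byte_off an oword-aligned byte offset:

  // Hypothetical helper, to be called from ESIMD kernel code.
  template <typename AccT>
  ESIMD_INLINE void scale_block(AccT acc, uint32_t byte_off) SYCL_ESIMD_FUNCTION {
    simd<float, 16> v = block_load<float, 16>(acc, byte_off); // contiguous load
    block_store(acc, byte_off, v * 2.0f);                     // contiguous store
  }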
simd_mask
Definition: simd.hpp:1029
sycl::_V1::ext::intel::esimd::slm_init
__ESIMD_API void slm_init(uint32_t size)
Declare per-work-group slm size.
Definition: memory.hpp:1671
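A minimal SLM sketch (illustrative sizes, not from this header): reserve work-group local memory once, then move a vector through it with the SLM block operations documented below.

  // Assumed to run inside an ESIMD kernel launched over an nd_range.
  ESIMD_INLINE void slm_round_trip() SYCL_ESIMD_FUNCTION {
    slm_init(16 * sizeof(float));                     // reserve 64 bytes of SLM
    simd<float, 16> v(0.0f, 1.0f);                    // 0, 1, ..., 15
    slm_block_store(0, v);                            // contiguous store at SLM offset 0
    simd<float, 16> r = slm_block_load<float, 16>(0); // read the block back
  }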
sycl::_V1::ext::oneapi::bit_and
std::bit_and< T > bit_and
Definition: functional.hpp:24
sycl::_V1::ext::intel::esimd::l3_flush_constant_data
@ l3_flush_constant_data
Flush constant cache.
Definition: memory.hpp:1608
T
sycl::_V1::ext::intel::esimd::scalar_store
__ESIMD_API void scalar_store(AccessorTy acc, uint32_t offset, T val)
Store a scalar value into an accessor.
Definition: memory.hpp:614
vector_aligned_tag
Definition: simd.hpp:1032
sycl::_V1::ext::intel::esimd::fence
__ESIMD_API void fence(fence_mask cntl)
Definition: memory.hpp:1627
simd_view.hpp
common.hpp
__SYCL_INLINE_VER_NAMESPACE
#define __SYCL_INLINE_VER_NAMESPACE(X)
Definition: defines_elementary.hpp:11
sycl::_V1::ext::intel::esimd::slm_scalar_store
__ESIMD_API void slm_scalar_store(uint32_t offset, T val)
Store a scalar value into the Shared Local Memory.
Definition: memory.hpp:1715
sycl::_V1::ext::oneapi::bit_xor
std::bit_xor< T > bit_xor
Definition: functional.hpp:23
sycl::_V1::ext::intel::esimd::atomic_update
__ESIMD_API std::enable_if_t< std::is_integral_v< Toffset > && __ESIMD_DNS::get_num_args< Op >() == 2 && !std::is_pointer< AccessorTy >::value, sycl::ext::intel::esimd::simd< T, N > > atomic_update(AccessorTy acc, Toffset offset, simd< T, N > src0, simd< T, N > src1, simd_mask< N > mask)
Definition: memory.hpp:3515
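A sketch of the simpler one-operand form over USM (an illustrative assumption; the overload shown above is the accessor-based two-operand variant): each enabled lane atomically adds 1 to a separate counter.

  // `counters` is a hypothetical USM pointer to at least 8 uint32_t values.
  ESIMD_INLINE void bump_counters(uint32_t *counters) SYCL_ESIMD_FUNCTION {
    simd<uint32_t, 8> byte_offs(0u, sizeof(uint32_t)); // one counter per lane
    simd<uint32_t, 8> ones(1u);
    simd_mask<8> pred(1);                              // all lanes enabled
    auto old = atomic_update<atomic_op::add>(counters, byte_offs, ones, pred);
    (void)old; // previous values, one per lane
  }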
sycl::_V1::ext::oneapi::experimental::alignment
constexpr alignment_key::value_t< K > alignment
Definition: properties.hpp:349
sycl::_V1::ext::intel::esimd::simd
The main simd vector class.
Definition: types.hpp:34
sycl::_V1::ext::intel::esimd::slm_scatter
__ESIMD_API std::enable_if_t<(N==1||N==8||N==16||N==32) &&(sizeof(T)<=4)> slm_scatter(simd< uint32_t, N > offsets, simd< T, N > vals, simd_mask< N > mask=1)
Scatter operation over the Shared Local Memory.
Definition: memory.hpp:1704
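A short sketch pairing slm_scatter with slm_gather (assumes slm_init has already reserved enough space):

  ESIMD_INLINE void slm_element_wise() SYCL_ESIMD_FUNCTION {
    simd<uint32_t, 16> offs(0u, sizeof(int));    // byte offsets 0, 4, 8, ...
    simd<int, 16> v = slm_gather<int, 16>(offs); // per-lane SLM loads
    slm_scatter(offs, v + 1);                    // per-lane SLM stores
  }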
sycl::_V1::ext::intel::esimd::slm_block_store
__ESIMD_API void slm_block_store(uint32_t offset, simd< T, N > vals)
Stores elements of a vector to a contiguous block of SLM at given offset.
Definition: memory.hpp:1794
sycl
Error handling, matching OpenCL plugin semantics.
Definition: access.hpp:14
sycl::_V1::ext::intel::esimd::simd_view
This class represents a reference to a sub-region of a base simd object.
Definition: types.hpp:35
max
simd< _Tp, _Abi > max(const simd< _Tp, _Abi > &, const simd< _Tp, _Abi > &) noexcept
sycl::_V1::ext::intel::esimd::slm_scalar_load
__ESIMD_API T slm_scalar_load(uint32_t offset)
Load a scalar value from the Shared Local Memory.
Definition: memory.hpp:1691
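A single-element SLM sketch (the offset and value are illustrative, and slm_init is assumed to have been called) that runs inside an ESIMD kernel:

  slm_scalar_store(0, 42);         // store one int at SLM byte offset 0
  int x = slm_scalar_load<int>(0); // read it back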
sycl::_V1::ext::intel::esimd::detail::validate_rgba_write_channel_mask
static void validate_rgba_write_channel_mask()
Definition: memory.hpp:724
sycl::_V1::ext::intel::esimd::sw_barrier
@ sw_barrier
Creates a software (compiler) barrier, which does not generate any instruction and only prevents the compiler from reordering instructions across it.
Definition: memory.hpp:1618
sycl::_V1::ext::intel::esimd::local_barrier
@ local_barrier
Issue SLM memory barrier only. If not set, the memory barrier is global.
Definition: memory.hpp:1612
sycl::_V1::ext::intel::esimd::l1_flush_ro_data
@ l1_flush_ro_data
Flush L1 read-only data cache.
Definition: memory.hpp:1614
sycl::_V1::ext::oneapi::fmax
std::enable_if_t< detail::is_bf16_storage_type< T >::value, T > fmax(T x, T y)
Definition: bf16_storage_builtins.hpp:60
__SYCL_DEPRECATED
#define __SYCL_DEPRECATED(message)
Definition: defines_elementary.hpp:46
sycl::_V1::ext::intel::esimd::l3_flush_rw_data
@ l3_flush_rw_data
Flush read-write data cache.
Definition: memory.hpp:1610
sycl::_V1::ext::intel::esimd::barrier
__ESIMD_API void barrier()
Generic work-group barrier.
Definition: memory.hpp:1637
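A sketch of the usual publish/sync/consume pattern around barrier; local_id, group_size and my_value are hypothetical names for illustration, and slm_init is assumed to have reserved group_size floats:

  slm_scalar_store(local_id * sizeof(float), my_value); // publish this work-item's value
  barrier();                                            // all work-items reach this point
  float next = slm_scalar_load<float>(((local_id + 1) % group_size) * sizeof(float));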
sycl::_V1::ext::intel::esimd::scatter
__ESIMD_API std::enable_if_t<(sizeof(T)<=4) &&(N==1||N==8||N==16||N==32) &&!std::is_pointer< AccessorTy >::value &&std::is_integral_v< Toffset > > scatter(AccessorTy acc, simd< Toffset, N > offsets, simd< T, N > vals, uint32_t glob_offset=0, simd_mask< N > mask=1)
Definition: memory.hpp:577
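A masked accessor scatter sketch (not from this file); acc is assumed to be a device accessor over at least 16 floats, and offsets are byte offsets:

  template <typename AccT>
  ESIMD_INLINE void scatter_even_lanes(AccT acc) SYCL_ESIMD_FUNCTION {
    simd<uint32_t, 16> offs(0u, sizeof(float));
    simd<float, 16> vals(1.0f);
    simd_mask<16> pred(0);
    pred.select<8, 2>(0) = 1;          // enable lanes 0, 2, 4, ...
    scatter(acc, offs, vals, 0, pred); // only enabled lanes write
  }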
sycl::_V1::ext::intel::esimd::fence_mask
fence_mask
Represents a bit mask to control the behavior of esimd::fence.
Definition: memory.hpp:1600
sycl::_V1::ext::intel::esimd::scalar_load
__ESIMD_API T scalar_load(AccessorTy acc, uint32_t offset)
Load a scalar value from an accessor.
Definition: memory.hpp:600
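A scalar accessor round-trip sketch; acc is assumed to be a read-write device accessor over int data:

  template <typename AccT>
  ESIMD_INLINE int increment_first(AccT acc) SYCL_ESIMD_FUNCTION {
    int v = scalar_load<int>(acc, 0); // one element at byte offset 0
    scalar_store<int>(acc, 0, v + 1);
    return v;
  }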
sycl::_V1::ext::intel::esimd::detail::check_atomic
constexpr void check_atomic()
Check the legality of an atomic call in terms of size and type.
Definition: memory.hpp:913
sycl::_V1::ext::intel::esimd::SurfaceIndex
unsigned int SurfaceIndex
Surface index type.
Definition: common.hpp:64
sycl::_V1::ext::intel::esimd::slm_scatter_rgba
__ESIMD_API std::enable_if_t<(N==8||N==16||N==32) &&(sizeof(T)==4)> slm_scatter_rgba(simd< uint32_t, N > offsets, simd< T, N *get_num_channels_enabled(Mask)> vals, simd_mask< N > mask=1)
Scatter pixel data to the Shared Local Memory at the specified offsets.
Definition: memory.hpp:1750
types.hpp
sycl::_V1::ext::intel::esimd::detail::isPowerOf2
constexpr ESIMD_INLINE bool isPowerOf2(unsigned int n)
Check if a given 32 bit positive integer is a power of 2 at compile time.
Definition: common.hpp:79
sycl::_V1::ext::oneapi::fmin
std::enable_if_t< detail::is_bf16_storage_type< T >::value, T > fmin(T x, T y)
Definition: bf16_storage_builtins.hpp:49
sycl::_V1::ext::intel::esimd::l3_flush_instructions
@ l3_flush_instructions
Flush the instruction cache.
Definition: memory.hpp:1604
sycl::_V1::ext::intel::esimd::media_block_store
__ESIMD_API void media_block_store(AccessorTy acc, unsigned x, unsigned y, simd< T, m *N > vals)
Media block store.
Definition: memory.hpp:1905
sycl::_V1::ext::intel::esimd::get_surface_index
__ESIMD_API SurfaceIndex get_surface_index(AccessorTy acc)
Get surface index corresponding to a SYCL accessor.
Definition: memory.hpp:62
simd.hpp
util.hpp
sycl::_V1::ext::intel::esimd::detail::SLM_BTI
static constexpr SurfaceIndex SLM_BTI
Definition: common.hpp:98
simd::copy_to
std::enable_if_t< __vectorizable< _Up >() && is_simd_flag_type< _Flags >::value > copy_to(_Up *__buffer, _Flags) const
Definition: simd.hpp:1523
sycl::_V1::dec
constexpr stream_manipulator dec
Definition: stream.hpp:744
sycl::_V1::ext::intel::esimd::slm_block_load
__ESIMD_API simd< T, N > slm_block_load(uint32_t offset)
Loads a contiguous block of memory from the SLM at the given offset and returns the loaded data as a vector.
Definition: memory.hpp:1770
simd
Definition: simd.hpp:1027
sycl::_V1::ext::intel::esimd::slm_atomic_update
__ESIMD_API simd< Tx, N > slm_atomic_update(simd< uint32_t, N > offsets, simd< Tx, N > src0, simd< Tx, N > src1, simd_mask< N > mask)
Atomic update operation performed on SLM.
Definition: memory.hpp:1836
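A sketch using the one-operand form for a histogram-style SLM update (the overload shown above is the two-operand variant); the bin offsets and values are illustrative and slm_init is assumed:

  simd<uint32_t, 16> bin_offs(0u, sizeof(uint32_t)); // one SLM bin per lane
  simd<uint32_t, 16> ones(1u);
  simd_mask<16> pred(1);
  auto old = slm_atomic_update<atomic_op::add>(bin_offs, ones, pred);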
std
Definition: accessor.hpp:3910
sycl::_V1::ext::intel::esimd::block_load
__ESIMD_API simd< Tx, N > block_load(AccessorTy acc, uint32_t offset, Flags={})
Loads a contiguous block of memory from the given accessor and offset and returns the loaded data as a vector.
Definition: memory.hpp:326
sycl::_V1::ext::intel::esimd::media_block_load
__ESIMD_API simd< T, m *N > media_block_load(AccessorTy acc, unsigned x, unsigned y)
Media block load.
Definition: memory.hpp:1864
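A media block sketch; img_acc is assumed to be an image accessor to a 2-D sycl::image, x a byte offset within a row and y a row index:

  auto tile = media_block_load<int, 4, 8>(img_acc, x, y); // 4 rows x 8 ints
  media_block_store<int, 4, 8>(img_acc, x, y, tile + 1);  // store the modified tile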
half_type.hpp
sycl::_V1::ext::intel::esimd::scatter_rgba
__ESIMD_API std::enable_if_t<(N==8||N==16||N==32) &&sizeof(T)==4 &&!std::is_pointer_v< AccessorT > > scatter_rgba(AccessorT acc, simd< uint32_t, N > offsets, simd< T, N *get_num_channels_enabled(RGBAMask)> vals, uint32_t global_offset=0, simd_mask< N > mask=1)
Transpose and scatter pixel data to the memory addressed by accessor acc, a global offset common for all elements, and per-element offsets.
Definition: memory.hpp:891
sycl::_V1::ext::intel::esimd::gather
__ESIMD_API std::enable_if_t<(sizeof(T)<=4) &&(N==1||N==8||N==16||N==32) &&!std::is_pointer< AccessorTy >::value &&std::is_integral_v< Toffset >, simd< T, N > > gather(AccessorTy acc, simd< Toffset, N > offsets, uint32_t glob_offset=0, simd_mask< N > mask=1)
Definition: memory.hpp:538
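A strided accessor gather sketch; acc is assumed to view at least 32 floats:

  template <typename AccT>
  ESIMD_INLINE simd<float, 16> load_every_other(AccT acc) SYCL_ESIMD_FUNCTION {
    simd<uint32_t, 16> offs(0u, 2 * sizeof(float)); // byte offsets with stride 8
    return gather<float, 16>(acc, offs);            // one element per lane
  }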
sycl::_V1::detail::device_global_map::add
void add(const void *DeviceGlobalPtr, const char *UniqueId)
Definition: device_global_map.cpp:15
memory_intrin.hpp
sycl::_V1::ext::intel::esimd::slm_gather
__ESIMD_API std::enable_if_t<(N==1||N==8||N==16||N==32), simd< T, N > > slm_gather(simd< uint32_t, N > offsets, simd_mask< N > mask=1)
Gather operation over the Shared Local Memory.
Definition: memory.hpp:1681
sycl::_V1::ext::intel::esimd::l3_flush_texture_data
@ l3_flush_texture_data
Flush sampler (texture) cache.
Definition: memory.hpp:1606
sycl::_V1::ext::intel::esimd::get_num_channels_enabled
constexpr int get_num_channels_enabled(rgba_channel_mask M)
Definition: common.hpp:128
sycl::_V1::ext::intel::esimd::gather_rgba
__ESIMD_API std::enable_if_t<((N==8||N==16||N==32) &&sizeof(T)==4 &&!std::is_pointer_v< AccessorT >), simd< T, N *get_num_channels_enabled(RGBAMask)> > gather_rgba(AccessorT acc, simd< uint32_t, N > offsets, uint32_t global_offset=0, simd_mask< N > mask=1)
Gather and transpose pixels from the given memory locations defined by the base specified by acc,...
Definition: memory.hpp:857
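An RGBA gather/scatter sketch over USM (the entry above is the accessor form); pixels is a hypothetical pointer to packed float RGBA data, and the default channel mask reads all four channels:

  ESIMD_INLINE void touch_pixels(float *pixels) SYCL_ESIMD_FUNCTION {
    simd<uint32_t, 16> offs(0u, 4 * sizeof(float)); // byte offset of each pixel
    auto px = gather_rgba(pixels, offs);            // simd<float, 64>: 16 pixels x 4 channels
    scatter_rgba(pixels, offs, px);                 // transpose back and store
  }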
sycl::_V1::ext::intel::esimd::atomic_op
atomic_op
Represents an atomic operation.
Definition: common.hpp:145
sycl::_V1::ext::intel::esimd::global_coherent_fence
@ global_coherent_fence
“Commit enable” - wait for fence to complete before continuing.
Definition: memory.hpp:1602
min
simd< _Tp, _Abi > min(const simd< _Tp, _Abi > &, const simd< _Tp, _Abi > &) noexcept
sycl::_V1::ext::oneapi::bit_or
std::bit_or< T > bit_or
Definition: functional.hpp:22
sycl::_V1::ext::intel::esimd::slm_gather_rgba
__ESIMD_API std::enable_if_t<(N==8||N==16||N==32) &&(sizeof(T)==4), simd< T, N *get_num_channels_enabled(RGBAMask)> > slm_gather_rgba(simd< uint32_t, N > offsets, simd_mask< N > mask=1)
Gather data from the Shared Local Memory at specified offsets and return it as simd vector.
Definition: memory.hpp:1732