DPC++ Runtime
Runtime libraries for oneAPI DPC++
memory.hpp
1 //==-------------- memory.hpp - DPC++ Explicit SIMD API --------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // Implement Explicit SIMD memory-access APIs.
9 //===----------------------------------------------------------------------===//
10 
11 #pragma once
12 
19 #include <sycl/half_type.hpp>
20 
21 #include <cstdint>
22 
23 namespace sycl {
24 __SYCL_INLINE_VER_NAMESPACE(_V1) {
25 namespace ext::intel::esimd {
26 
29 
35 
37 
39 
41 
42 namespace detail {
43 // Type used in internal functions to designate SLM access by
44 // providing a dummy accessor of this type. Used to make it possible to delegate
45 // implementations of SLM memory accesses to general surface-based memory
46 // accesses and thus reuse validity checks etc.
47 struct LocalAccessorMarker {};
48 
49 } // namespace detail
50 
52 
55 
61 template <typename AccessorTy>
62 __ESIMD_API SurfaceIndex get_surface_index(AccessorTy acc) {
63  if constexpr (std::is_same_v<detail::LocalAccessorMarker, AccessorTy>) {
64  return detail::SLM_BTI;
65  } else {
66  return __esimd_get_surface_index(
67  detail::AccessorPrivateProxy::getNativeImageObj(acc));
68  }
69 }
70 
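// Illustrative usage sketch, not part of the original header: querying the
// surface index of a SYCL buffer accessor from ESIMD code. The helper name
// and its accessor parameter are hypothetical.
template <typename AccessorTy>
ESIMD_INLINE SurfaceIndex example_get_surface_index(AccessorTy acc) {
  // The returned index identifies the surface in subsequent surface-based
  // (stateful) memory operations.
  return get_surface_index(acc);
}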
71 // TODO @Pennycook
72 // {quote}
73 // ...I'd like us to think more about what we can do to make these interfaces
74 // more user-friendly. A user providing cache hints has to provide a lot more
75 // template arguments than required. Could we make this nicer by providing the
76 // hints as tag-type arguments?
77 // ...
78 // // Without cache hints, type and length can be deduced from offsets
79 // float* p;
80 // simd<uint32_t, 16> offsets;
81 // auto result = flat_load(p, offsets);
82 //
83 // // With cache hints as templates, verbosity increases significantly:
84 // // - Providing any cache hint forces the user to specify the type and length
85 // float* p; simd<uint32_t, 16> offsets;
86 // auto result = flat_load<uint32_t, 16, 1, CacheHint::Foo, CacheHint::Bar>(p, offsets);
87 //
88 // // With cache hints as tag types, verbosity is reduced:
89 // // - Providing a cache hint does not prevent deduction of type and length
90 // float* p;
91 // simd <uint32_t, 16> offsets;
92 // auto result = flat_load(p, offsets, CacheHint::Foo{});
93 //
94 // Note also that the templated form prevents a developer from specifying an L3
95 // hint without also explicitly specifying an L1 hint. If flat_load accepted a
96 // list of hints, it might be possible to refactor the hints to specify them in
97 // any order, and it may be more extensible to future cache hints:
98 // {/quote}
99 //
100 // TODO @keryell
101 // {quote}
102 // An approach a la https://github.com/chriskohlhoff/propria from
103 // @chriskohlhoff would be to add a property to the pointer, such as
104 //
105 // auto result = flat_load(p, offsets);
106 // auto result = flat_load(decorate<CacheHint::Foo, CacheHint::Bar>(p),
107 // offsets);
108 // The advantage is that you do not have to change all your API and all the uses
109 // of this decorated pointer will benefit from this. decorate is to be bikeshed
110 // accordingly.
111 // {/quote}
112 //
113 
129 template <typename Tx, int N, typename Toffset>
130 __ESIMD_API simd<Tx, N> gather(const Tx *p, simd<Toffset, N> offsets,
131  simd_mask<N> mask = 1) {
132  using T = detail::__raw_t<Tx>;
133  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
134  static_assert(detail::isPowerOf2(N, 32), "Unsupported value of N");
135  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
136  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
137  addrs = addrs + offsets_i;
138 
139  if constexpr (sizeof(T) == 1) {
140  auto Ret = __esimd_svm_gather<T, N, detail::ElemsPerAddrEncoding<4>(),
141  detail::ElemsPerAddrEncoding<1>()>(
142  addrs.data(), mask.data());
143  return __esimd_rdregion<T, N * 4, N, /*VS*/ 0, N, 4>(Ret, 0);
144  } else if constexpr (sizeof(T) == 2) {
145  auto Ret = __esimd_svm_gather<T, N, detail::ElemsPerAddrEncoding<2>(),
146  detail::ElemsPerAddrEncoding<2>()>(
147  addrs.data(), mask.data());
148  return __esimd_rdregion<T, N * 2, N, /*VS*/ 0, N, 2>(Ret, 0);
149  } else
150  return __esimd_svm_gather<T, N, detail::ElemsPerAddrEncoding<1>(),
151  detail::ElemsPerAddrEncoding<1>()>(addrs.data(),
152  mask.data());
153 }
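// Illustrative usage sketch, not part of the original header: gather 16
// floats from USM memory at consecutive byte offsets with the default
// all-enabled mask. The helper name is hypothetical; `p` is assumed to be a
// valid USM pointer dereferenceable from the kernel.
ESIMD_INLINE simd<float, 16> example_usm_gather(const float *p) {
  simd<uint32_t, 16> byte_offsets(0u, sizeof(float)); // 0, 4, 8, ..., 60
  return gather<float, 16>(p, byte_offsets);
}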
154 
168 template <typename Tx, int N, typename Toffset,
169  typename RegionTy = region1d_t<Toffset, N, 1>>
170 __ESIMD_API simd<Tx, N> gather(const Tx *p,
171  simd_view<Toffset, RegionTy> offsets,
172  simd_mask<N> mask = 1) {
173  using T = detail::__raw_t<Tx>;
174  using Ty = typename simd_view<Toffset, RegionTy>::element_type;
175  return gather<Tx, N>(p, simd<Ty, N>(offsets), mask);
176 }
177 
191 template <typename Tx, int N, typename Toffset>
192 __ESIMD_API void scatter(Tx *p, simd<Toffset, N> offsets, simd<Tx, N> vals,
193  simd_mask<N> mask = 1) {
194  using T = detail::__raw_t<Tx>;
195  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
196  static_assert(detail::isPowerOf2(N, 32), "Unsupported value of N");
197  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
198  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
199  addrs = addrs + offsets_i;
200  if constexpr (sizeof(T) == 1) {
201  simd<T, N * 4> D;
202  D = __esimd_wrregion<T, N * 4, N, /*VS*/ 0, N, 4>(D.data(), vals.data(), 0);
203  __esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<4>(),
204  detail::ElemsPerAddrEncoding<1>()>(
205  addrs.data(), D.data(), mask.data());
206  } else if constexpr (sizeof(T) == 2) {
207  simd<T, N * 2> D;
208  D = __esimd_wrregion<T, N * 2, N, /*VS*/ 0, N, 2>(D.data(), vals.data(), 0);
209  __esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<2>(),
210  detail::ElemsPerAddrEncoding<2>()>(
211  addrs.data(), D.data(), mask.data());
212  } else
213  __esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<1>(),
214  detail::ElemsPerAddrEncoding<1>()>(
215  addrs.data(), vals.data(), mask.data());
216 }
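// Illustrative usage sketch, not part of the original header: scatter 8 ints
// to every other element of a USM array. The helper name is hypothetical;
// `p` must point to at least 16 writable ints.
ESIMD_INLINE void example_usm_scatter(int *p, simd<int, 8> vals) {
  simd<uint32_t, 8> byte_offsets(0u, 2 * sizeof(int)); // stride of two elements
  scatter<int, 8>(p, byte_offsets, vals);
}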
217 
230 template <typename Tx, int N, typename Toffset,
231  typename RegionTy = region1d_t<Toffset, N, 1>>
232 __ESIMD_API void scatter(Tx *p, simd_view<Toffset, RegionTy> offsets,
233  simd<Tx, N> vals, simd_mask<N> mask = 1) {
234  using T = detail::__raw_t<Tx>;
235  using Ty = typename simd_view<Toffset, RegionTy>::element_type;
236  scatter<Tx, N>(p, simd<Ty, N>(offsets), vals, mask);
237 }
238 
252 template <typename Tx, int N, typename Flags = vector_aligned_tag,
253  class T = detail::__raw_t<Tx>,
254  typename = std::enable_if_t<is_simd_flag_type_v<Flags>>>
255 __ESIMD_API simd<Tx, N> block_load(const Tx *addr, Flags = {}) {
256  constexpr unsigned Sz = sizeof(T) * N;
257  static_assert(Sz >= detail::OperandSize::OWORD,
258  "block size must be at least 1 oword");
259  static_assert(Sz % detail::OperandSize::OWORD == 0,
260  "block size must be whole number of owords");
261  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
262  "block must be 1, 2, 4 or 8 owords long");
263  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
264  "block size must be at most 8 owords");
265 
266  uintptr_t Addr = reinterpret_cast<uintptr_t>(addr);
267  if constexpr (Flags::template alignment<simd<T, N>> >=
268  detail::OperandSize::OWORD) {
269  return __esimd_svm_block_ld<T, N>(Addr);
270  } else {
271  return __esimd_svm_block_ld_unaligned<T, N>(Addr);
272  }
273 }
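// Illustrative usage sketch, not part of the original header: load one
// contiguous block of 8 floats (two owords) from USM memory. Passing the
// element_aligned tag (assumed to be provided by the ESIMD simd-flags
// definitions) selects the unaligned-load path; the helper name is made up.
ESIMD_INLINE simd<float, 8> example_block_load(const float *p) {
  return block_load<float, 8>(p, element_aligned);
}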
274 
290 template <typename Tx, int N, typename AccessorTy,
291  typename Flags = vector_aligned_tag,
292  typename = std::enable_if_t<is_simd_flag_type_v<Flags>>,
293  class T = detail::__raw_t<Tx>>
294 __ESIMD_API simd<Tx, N> block_load(AccessorTy acc, uint32_t offset,
295  Flags = {}) {
296 #ifdef __ESIMD_FORCE_STATELESS_MEM
297  return block_load<Tx, N>(__ESIMD_DNS::accessorToPointer<Tx>(acc, offset));
298 #else
299  constexpr unsigned Sz = sizeof(T) * N;
300  static_assert(Sz >= detail::OperandSize::OWORD,
301  "block size must be at least 1 oword");
302  static_assert(Sz % detail::OperandSize::OWORD == 0,
303  "block size must be whole number of owords");
304  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
305  "block must be 1, 2, 4 or 8 owords long");
306  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
307  "block size must be at most 8 owords");
308 
309  auto surf_ind = __esimd_get_surface_index(
310  detail::AccessorPrivateProxy::getNativeImageObj(acc));
311 
312  if constexpr (Flags::template alignment<simd<T, N>> >=
313  detail::OperandSize::OWORD) {
314  return __esimd_oword_ld<T, N>(surf_ind, offset >> 4);
315  } else {
316  return __esimd_oword_ld_unaligned<T, N>(surf_ind, offset);
317  }
318 #endif
319 }
320 
329 template <typename Tx, int N, class T = detail::__raw_t<Tx>>
330 __ESIMD_API void block_store(Tx *p, simd<Tx, N> vals) {
331  constexpr unsigned Sz = sizeof(T) * N;
332  static_assert(Sz >= detail::OperandSize::OWORD,
333  "block size must be at least 1 oword");
334  static_assert(Sz % detail::OperandSize::OWORD == 0,
335  "block size must be whole number of owords");
336  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
337  "block must be 1, 2, 4 or 8 owords long");
338  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
339  "block size must be at most 8 owords");
340 
341  uintptr_t Addr = reinterpret_cast<uintptr_t>(p);
342  __esimd_svm_block_st<T, N>(Addr, vals.data());
343 }
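// Illustrative usage sketch, not part of the original header: store 8 floats
// as one contiguous block. The destination is assumed to be oword (16-byte)
// aligned, as required by the underlying block-store intrinsic; the helper
// name is hypothetical.
ESIMD_INLINE void example_block_store(float *p, simd<float, 8> vals) {
  block_store<float, 8>(p, vals);
}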
344 
356 template <typename Tx, int N, typename AccessorTy,
357  class T = detail::__raw_t<Tx>>
358 __ESIMD_API void block_store(AccessorTy acc, uint32_t offset,
359  simd<Tx, N> vals) {
360 #ifdef __ESIMD_FORCE_STATELESS_MEM
361  block_store<Tx, N>(__ESIMD_DNS::accessorToPointer<Tx>(acc, offset), vals);
362 #else
363  constexpr unsigned Sz = sizeof(T) * N;
364  static_assert(Sz >= detail::OperandSize::OWORD,
365  "block size must be at least 1 oword");
366  static_assert(Sz % detail::OperandSize::OWORD == 0,
367  "block size must be whole number of owords");
368  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
369  "block must be 1, 2, 4 or 8 owords long");
370  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
371  "block size must be at most 8 owords");
372 
373  auto surf_ind = __esimd_get_surface_index(
374  detail::AccessorPrivateProxy::getNativeImageObj(acc));
375  __esimd_oword_st<T, N>(surf_ind, offset >> 4, vals.data());
376 #endif
377 }
378 
380 
382 
383 // Implementations of accessor-based gather and scatter functions
384 namespace detail {
385 template <typename T, int N, typename AccessorTy>
386 ESIMD_INLINE
387  ESIMD_NODEBUG std::enable_if_t<(sizeof(T) <= 4) &&
388  (N == 1 || N == 8 || N == 16 || N == 32) &&
389  !std::is_pointer<AccessorTy>::value>
390  scatter_impl(AccessorTy acc, simd<T, N> vals, simd<uint32_t, N> offsets,
391  uint32_t glob_offset, simd_mask<N> mask) {
392 
393  constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
394  // TODO (performance) use hardware-supported scale once BE supports it
395  constexpr int16_t scale = 0;
396  const auto si = __ESIMD_NS::get_surface_index(acc);
397 
398  if constexpr (sizeof(T) < 4) {
399  using Tint = std::conditional_t<std::is_integral_v<T>, T,
400  detail::uint_type_t<sizeof(T)>>;
401  using Treal = __raw_t<T>;
402  simd<Tint, N> vals_int = bitcast<Tint, Treal, N>(std::move(vals).data());
403  using PromoT =
404  std::conditional_t<std::is_signed_v<Tint>,
405  int32_t, uint32_t>;
406  const simd<PromoT, N> promo_vals = convert<PromoT>(std::move(vals_int));
407  __esimd_scatter_scaled<PromoT, N, decltype(si), TypeSizeLog2, scale>(
408  mask.data(), si, glob_offset, offsets.data(), promo_vals.data());
409  } else {
410  using Treal = __raw_t<T>;
411  if constexpr (!std::is_same_v<Treal, T>) {
412  simd<Treal, N> Values = vals.template bit_cast_view<Treal>();
413  __esimd_scatter_scaled<Treal, N, decltype(si), TypeSizeLog2, scale>(
414  mask.data(), si, glob_offset, offsets.data(), Values.data());
415  } else {
416  __esimd_scatter_scaled<T, N, decltype(si), TypeSizeLog2, scale>(
417  mask.data(), si, glob_offset, offsets.data(), vals.data());
418  }
419  }
420 }
421 
422 template <typename T, int N, typename AccessorTy>
423 ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t<
424  (sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16 || N == 32) &&
425  !std::is_pointer<AccessorTy>::value,
426  simd<T, N>>
427 gather_impl(AccessorTy acc, simd<uint32_t, N> offsets, uint32_t glob_offset,
428  simd_mask<N> mask) {
429 
430  constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
431  // TODO (performance) use hardware-supported scale once BE supports it
432  constexpr uint32_t scale = 0;
433  const auto si = get_surface_index(acc);
434 
435  if constexpr (sizeof(T) < 4) {
436  using Tint = std::conditional_t<std::is_integral_v<T>, T,
437  detail::uint_type_t<sizeof(T)>>;
438  using Treal = __raw_t<T>;
439  static_assert(std::is_integral<Tint>::value,
440  "only integral 1- & 2-byte types are supported");
441  using PromoT =
442  std::conditional_t<std::is_signed_v<Tint>,
443  int32_t, uint32_t>;
444  const simd<PromoT, N> promo_vals =
445  __esimd_gather_masked_scaled2<PromoT, N, decltype(si), TypeSizeLog2,
446  scale>(si, glob_offset, offsets.data(),
447  mask.data());
448  auto Res = convert<Tint>(promo_vals);
449 
450  if constexpr (!std::is_same_v<Tint, T>) {
451  return detail::bitcast<Treal, Tint, N>(Res.data());
452  } else {
453  return Res;
454  }
455  } else {
456  using Treal = __raw_t<T>;
457  simd<Treal, N> Res = __esimd_gather_masked_scaled2<Treal, N, decltype(si),
458  TypeSizeLog2, scale>(
459  si, glob_offset, offsets.data(), mask.data());
460  if constexpr (!std::is_same_v<Treal, T>) {
461  return Res.template bit_cast_view<T>();
462  } else {
463  return Res;
464  }
465  }
466 }
467 
468 } // namespace detail
469 
471 
474 
492 template <typename T, int N, typename AccessorTy>
493 __ESIMD_API std::enable_if_t<(sizeof(T) <= 4) &&
494  (N == 1 || N == 8 || N == 16 || N == 32) &&
495  !std::is_pointer<AccessorTy>::value,
496  simd<T, N>>
497 gather(AccessorTy acc, simd<uint32_t, N> offsets, uint32_t glob_offset = 0,
498  simd_mask<N> mask = 1) {
499 #ifdef __ESIMD_FORCE_STATELESS_MEM
500  return gather<T, N>(__ESIMD_DNS::accessorToPointer<T>(acc, glob_offset),
501  offsets, mask);
502 #else
503  return detail::gather_impl<T, N, AccessorTy>(acc, offsets, glob_offset, mask);
504 #endif
505 }
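// Illustrative usage sketch, not part of the original header: gather 16
// dwords through a SYCL buffer accessor starting at a common byte offset.
// The helper and its parameters are hypothetical.
template <typename AccessorTy>
ESIMD_INLINE simd<int, 16> example_acc_gather(AccessorTy acc,
                                              uint32_t glob_offset) {
  simd<uint32_t, 16> byte_offsets(0u, sizeof(int)); // element-strided offsets
  return gather<int, 16>(acc, byte_offsets, glob_offset);
}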
506 
526 template <typename T, int N, typename AccessorTy>
527 __ESIMD_API std::enable_if_t<(sizeof(T) <= 4) &&
528  (N == 1 || N == 8 || N == 16 || N == 32) &&
529  !std::is_pointer<AccessorTy>::value>
530 scatter(AccessorTy acc, simd<uint32_t, N> offsets, simd<T, N> vals,
531  uint32_t glob_offset = 0, simd_mask<N> mask = 1) {
532 #ifdef __ESIMD_FORCE_STATELESS_MEM
533  scatter<T, N>(__ESIMD_DNS::accessorToPointer<T>(acc, glob_offset), offsets,
534  vals, mask);
535 #else
536  detail::scatter_impl<T, N, AccessorTy>(acc, vals, offsets, glob_offset, mask);
537 #endif
538 }
539 
547 template <typename T, typename AccessorTy>
548 __ESIMD_API T scalar_load(AccessorTy acc, uint32_t offset) {
549  const simd<T, 1> Res =
550  gather<T, 1, AccessorTy>(acc, simd<uint32_t, 1>(offset));
551  return Res[0];
552 }
553 
561 template <typename T, typename AccessorTy>
562 __ESIMD_API void scalar_store(AccessorTy acc, uint32_t offset, T val) {
563  scatter<T, 1, AccessorTy>(acc, simd<uint32_t, 1>(offset), simd<T, 1>(val));
564 }
565 
599 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
600  int N, typename Toffset>
601 __ESIMD_API simd<T, N * get_num_channels_enabled(RGBAMask)>
602 gather_rgba(const T *p, simd<Toffset, N> offsets, simd_mask<N> mask = 1) {
603  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
604  static_assert((N == 8 || N == 16 || N == 32), "Unsupported value of N");
605  static_assert(sizeof(T) == 4, "Unsupported size of type T");
606  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
607  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
608  addrs = addrs + offsets_i;
609  return __esimd_svm_gather4_scaled<detail::__raw_t<T>, N, RGBAMask>(
610  addrs.data(), mask.data());
611 }
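// Illustrative usage sketch, not part of the original header: gather all four
// channels of 8 RGBA pixels stored as 32-bit floats. With the default ABGR
// mask the result holds 8 * 4 = 32 elements grouped by channel. The helper
// name is hypothetical; `p` must point to pixel data in USM.
ESIMD_INLINE simd<float, 8 * 4> example_gather_rgba(const float *p) {
  simd<uint32_t, 8> pixel_offsets(0u, 4 * sizeof(float)); // one pixel = 4 floats
  return gather_rgba(p, pixel_offsets);
}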
612 
628 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
629  int N, typename Toffset,
630  typename RegionTy = region1d_t<Toffset, N, 1>>
631 __ESIMD_API simd<T, N * get_num_channels_enabled(RGBAMask)>
632 gather_rgba(const T *p, simd_view<Toffset, RegionTy> offsets,
633  simd_mask<N> mask = 1) {
634  using Ty = typename simd_view<Toffset, RegionTy>::element_type;
635  return gather_rgba<RGBAMask, T, N>(p, simd<Ty, N>(offsets), mask);
636 }
637 
638 template <typename T, int N, rgba_channel_mask RGBAMask>
639 __SYCL_DEPRECATED("use gather_rgba<rgba_channel_mask>()")
640 __ESIMD_API std::enable_if_t<
641  (N == 8 || N == 16 || N == 32) && sizeof(T) == 4,
642  simd<T, N * get_num_channels_enabled(
643  RGBAMask)>> gather_rgba(const T *p,
644  simd<uint32_t, N> offsets,
645  simd_mask<N> mask = 1) {
646  return gather_rgba<RGBAMask>(p, offsets, mask);
647 }
648 
649 namespace detail {
650 template <rgba_channel_mask M> static void validate_rgba_write_channel_mask() {
651  using CM = rgba_channel_mask;
652  static_assert(
653  (M == CM::ABGR || M == CM::BGR || M == CM::GR || M == CM::R) &&
654  "Only ABGR, BGR, GR, R channel masks are valid in write operations");
655 }
656 } // namespace detail
657 
679 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
680  int N, typename Toffset>
681 __ESIMD_API void
682 scatter_rgba(T *p, simd<Toffset, N> offsets,
683  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
684  simd_mask<N> mask = 1) {
685  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
686  static_assert((N == 8 || N == 16 || N == 32), "Unsupported value of N");
687  static_assert(sizeof(T) == 4, "Unsupported size of type T");
688  detail::validate_rgba_write_channel_mask<RGBAMask>();
689  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
690  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
691  addrs = addrs + offsets_i;
692  __esimd_svm_scatter4_scaled<detail::__raw_t<T>, N, RGBAMask>(
693  addrs.data(), vals.data(), mask.data());
694 }
695 
711 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
712  int N, typename Toffset,
713  typename RegionTy = region1d_t<Toffset, N, 1>>
714 __ESIMD_API void
715 scatter_rgba(T *p, simd_view<Toffset, RegionTy> offsets,
716  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
717  simd_mask<N> mask = 1) {
718  using Ty = typename simd_view<Toffset, RegionTy>::element_type;
719  scatter_rgba<RGBAMask, T, N>(p, simd<Ty, N>(offsets), vals, mask);
720 }
721 
722 template <typename T, int N, rgba_channel_mask RGBAMask>
723 __SYCL_DEPRECATED("use scatter_rgba<rgba_channel_mask>()")
724 __ESIMD_API std::
725  enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4> scatter_rgba(
726  T *p, simd<uint32_t, N> offsets,
727  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
728  simd_mask<N> mask = 1) {
729  scatter_rgba<RGBAMask>(p, offsets, vals, mask);
730 }
731 
754 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
755  typename AccessorT, int N,
756  typename T = typename AccessorT::value_type>
757 __ESIMD_API std::enable_if_t<((N == 8 || N == 16 || N == 32) &&
758  sizeof(T) == 4 && !std::is_pointer_v<AccessorT>),
759  simd<T, N * get_num_channels_enabled(RGBAMask)>>
760 gather_rgba(AccessorT acc, simd<uint32_t, N> offsets,
761  uint32_t global_offset = 0, simd_mask<N> mask = 1) {
762 #ifdef __ESIMD_FORCE_STATELESS_MEM
763  return gather_rgba<RGBAMask>(
764  __ESIMD_DNS::accessorToPointer<T>(acc, global_offset), offsets, mask);
765 #else
766  // TODO (performance) use hardware-supported scale once BE supports it
767  constexpr uint32_t Scale = 0;
768  const auto SI = get_surface_index(acc);
769  return __esimd_gather4_masked_scaled2<detail::__raw_t<T>, N, RGBAMask,
770  decltype(SI), Scale>(
771  SI, global_offset, offsets.data(), mask.data());
772 #endif
773 }
774 
789 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
790  typename AccessorT, int N,
791  typename T = typename AccessorT::value_type>
792 __ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4 &&
793  !std::is_pointer_v<AccessorT>>
794 scatter_rgba(AccessorT acc, simd<uint32_t, N> offsets,
795  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
796  uint32_t global_offset = 0, simd_mask<N> mask = 1) {
797  detail::validate_rgba_write_channel_mask<RGBAMask>();
798 #ifdef __ESIMD_FORCE_STATELESS_MEM
799  scatter_rgba<RGBAMask>(__ESIMD_DNS::accessorToPointer<T>(acc, global_offset),
800  offsets, vals, mask);
801 #else
802  // TODO (performance) use hardware-supported scale once BE supports it
803  constexpr uint32_t Scale = 0;
804  const auto SI = get_surface_index(acc);
805  __esimd_scatter4_scaled<T, N, decltype(SI), RGBAMask, Scale>(
806  mask.data(), SI, global_offset, offsets.data(), vals.data());
807 #endif
808 }
809 
811 
812 namespace detail {
815 template <__ESIMD_NS::atomic_op Op, typename T, int N, unsigned NumSrc>
816 constexpr void check_atomic() {
817  static_assert((detail::isPowerOf2(N, 32)),
818  "Execution size 1, 2, 4, 8, 16, 32 are supported");
819  static_assert(NumSrc == __ESIMD_DNS::get_num_args<Op>(),
820  "wrong number of operands");
821  constexpr bool IsInt2BytePlus =
822  std::is_integral_v<T> && (sizeof(T) >= sizeof(uint16_t));
823 
824  if constexpr (Op == __ESIMD_NS::atomic_op::xchg ||
825  Op == __ESIMD_NS::atomic_op::cmpxchg ||
826  Op == __ESIMD_NS::atomic_op::predec ||
827  Op == __ESIMD_NS::atomic_op::inc ||
828  Op == __ESIMD_NS::atomic_op::dec ||
829  Op == __ESIMD_NS::atomic_op::load) {
830 
831  static_assert(IsInt2BytePlus, "Integral 16-bit or wider type is expected");
832  }
833  // FP ops (are always delegated to native::lsc::<Op>)
834  if constexpr (Op == __ESIMD_NS::atomic_op::fmax ||
835  Op == __ESIMD_NS::atomic_op::fmin ||
836  Op == __ESIMD_NS::atomic_op::fadd ||
837  Op == __ESIMD_NS::atomic_op::fsub) {
838  static_assert((is_type<T, float, sycl::half>()),
839  "Type F or HF is expected");
840  }
841  if constexpr (Op == __ESIMD_NS::atomic_op::add ||
842  Op == __ESIMD_NS::atomic_op::sub ||
848  Op == __ESIMD_NS::atomic_op::minsint ||
849  Op == __ESIMD_NS::atomic_op::maxsint) {
850  static_assert(IsInt2BytePlus, "Integral 16-bit or wider type is expected");
851  constexpr bool IsSignedMinmax = (Op == __ESIMD_NS::atomic_op::minsint) ||
852  (Op == __ESIMD_NS::atomic_op::maxsint);
853  constexpr bool IsUnsignedMinmax = (Op == __ESIMD_NS::atomic_op::min) ||
854  (Op == __ESIMD_NS::atomic_op::max);
855 
856  if constexpr (IsSignedMinmax || IsUnsignedMinmax) {
857  constexpr bool SignOK = std::is_signed_v<T> == IsSignedMinmax;
858  static_assert(SignOK, "Signed/unsigned integer type expected for "
859  "signed/unsigned min/max operation");
860  }
861  }
862 }
863 } // namespace detail
864 
867 
891 template <atomic_op Op, typename Tx, int N, typename Toffset>
892 __ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<Toffset, N> offset,
893  simd<Tx, N> src0, simd_mask<N> mask) {
894  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
895  detail::check_atomic<Op, Tx, N, 1>();
896  if constexpr ((Op == atomic_op::fmin) || (Op == atomic_op::fmax) ||
897  (Op == atomic_op::fadd) || (Op == atomic_op::fsub)) {
898  // Auto-convert FP atomics to LSC version. Warning is given - see enum.
899  return atomic_update<detail::to_lsc_atomic_op<Op>(), Tx, N>(p, offset, src0,
900  mask);
901  } else if constexpr (Op == atomic_op::store) {
902  return atomic_update<atomic_op::xchg, Tx, N>(p, offset, src0, mask);
903  } else {
904  simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
905  simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
906  vAddr += offset_i1;
907 
908  using T = typename detail::__raw_t<Tx>;
909  return __esimd_svm_atomic1<Op, T, N>(vAddr.data(), src0.data(),
910  mask.data());
911  }
912 }
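// Illustrative usage sketch, not part of the original header: atomically add
// 1 to eight uint32_t counters at consecutive USM addresses and return their
// previous values. The helper name is hypothetical.
ESIMD_INLINE simd<uint32_t, 8> example_atomic_add(uint32_t *counters) {
  simd<uint32_t, 8> byte_offsets(0u, sizeof(uint32_t));
  simd<uint32_t, 8> ones(1u);
  simd_mask<8> mask = 1; // update all 8 lanes
  return atomic_update<atomic_op::add>(counters, byte_offsets, ones, mask);
}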
913 
933 template <atomic_op Op, typename Tx, int N, typename Toffset>
934 __ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<Toffset, N> offset,
935  simd_mask<N> mask) {
936  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
937  detail::check_atomic<Op, Tx, N, 0>();
938  if constexpr (Op == atomic_op::load) {
939  return atomic_update<atomic_op::bit_or, Tx, N>(p, offset, simd<Tx, N>(0),
940  mask);
941  } else {
942  simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
943  simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
944  vAddr += offset_i1;
945  using T = typename detail::__raw_t<Tx>;
946  return __esimd_svm_atomic0<Op, T, N>(vAddr.data(), mask.data());
947  }
948 }
949 
964 template <atomic_op Op, typename Tx, int N, typename Toffset,
965  typename RegionTy = region1d_t<Toffset, N, 1>>
966 __ESIMD_API simd<Tx, N> atomic_update(Tx *p,
967  simd_view<Toffset, RegionTy> offsets,
968  simd_mask<N> mask = 1) {
969  using Ty = typename simd_view<Toffset, RegionTy>::element_type;
970  return atomic_update<Op, Tx, N>(p, simd<Ty, N>(offsets), mask);
971 }
972 
992 template <atomic_op Op, typename Tx, int N, typename Toffset,
993  typename RegionTy = region1d_t<Toffset, N, 1>>
994 __ESIMD_API simd<Tx, N> atomic_update(Tx *p,
995  simd_view<Toffset, RegionTy> offsets,
996  simd<Tx, N> src0, simd_mask<N> mask) {
997  using Ty = typename simd_view<Toffset, RegionTy>::element_type;
998  return atomic_update<Op, Tx, N>(p, simd<Ty, N>(offsets), src0, mask);
999 }
1000 
1020 template <atomic_op Op, typename Tx, int N, typename Toffset>
1021 __ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<Toffset, N> offset,
1022  simd<Tx, N> src0, simd<Tx, N> src1,
1023  simd_mask<N> mask) {
1024  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
1025  detail::check_atomic<Op, Tx, N, 2>();
1026  if constexpr (Op == atomic_op::fcmpwr) {
1027  // Auto-convert FP atomics to LSC version. Warning is given - see enum.
1028  return atomic_update<detail::to_lsc_atomic_op<Op>(), Tx, N>(p, offset, src0,
1029  src1, mask);
1030  } else {
1031  simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
1032  simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
1033  vAddr += offset_i1;
1034  using T = typename detail::__raw_t<Tx>;
1035  return __esimd_svm_atomic2<Op, T, N>(vAddr.data(), src0.data(), src1.data(),
1036  mask.data());
1037  }
1038 }
1039 
1056 template <atomic_op Op, typename Tx, int N, typename Toffset,
1057  typename RegionTy = region1d_t<Toffset, N, 1>>
1058 __ESIMD_API simd<Tx, N>
1059 atomic_update(Tx *p, simd_view<Toffset, RegionTy> offsets, simd<Tx, N> src0,
1060  simd<Tx, N> src1, simd_mask<N> mask) {
1061  using Ty = typename simd_view<Toffset, RegionTy>::element_type;
1062  return atomic_update<Op, Tx, N>(p, simd<Ty, N>(offsets), src0, src1, mask);
1063 }
1064 
1066 
1069 
1072 enum fence_mask : uint8_t {
1090  sw_barrier = 0x80
1091 };
1092 
1096 template <uint8_t cntl> __ESIMD_API void fence() { __esimd_fence(cntl); }
1097 
1098 __SYCL_DEPRECATED("use fence<fence_mask>()")
1099 __ESIMD_API void fence(fence_mask cntl) { __esimd_fence(cntl); }
1100 
1109 __ESIMD_API void barrier() {
1111  __esimd_barrier();
1112 }
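// Illustrative usage sketch, not part of the original header: a typical
// synchronization step after producing data in SLM. fence_mask::local_barrier
// is assumed to be one of the fence_mask bits elided from this listing; the
// helper name is hypothetical.
ESIMD_INLINE void example_wg_sync() {
  fence<fence_mask::local_barrier>(); // order SLM accesses within the group
  barrier();                          // then synchronize the work-group
}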
1113 
1115 
1118 
1121 template <uint32_t SLMSize> __ESIMD_API void slm_init() {
1122  __esimd_slm_init(SLMSize);
1123 }
1124 
1128 __ESIMD_API void slm_init(uint32_t size) { __esimd_slm_init(size); }
1129 
1135 template <typename T, int N>
1136 __ESIMD_API
1137  std::enable_if_t<(N == 1 || N == 8 || N == 16 || N == 32), simd<T, N>>
1138 slm_gather(simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
1139  detail::LocalAccessorMarker acc;
1140  return detail::gather_impl<T, N>(acc, offsets, 0, mask);
1141 }
1142 
1148 template <typename T> __ESIMD_API T slm_scalar_load(uint32_t offset) {
1149  const simd<T, 1> Res = slm_gather<T, 1>(simd<uint32_t, 1>(offset));
1150  return Res[0];
1151 }
1152 
1158 template <typename T, int N>
1159 __ESIMD_API std::enable_if_t<(N == 1 || N == 8 || N == 16 || N == 32) &&
1160  (sizeof(T) <= 4)>
1161 slm_scatter(simd<uint32_t, N> offsets, simd<T, N> vals, simd_mask<N> mask = 1) {
1162  detail::LocalAccessorMarker acc;
1163  detail::scatter_impl<T, N>(acc, vals, offsets, 0, mask);
1164 }
1165 
1171 template <typename T>
1172 __ESIMD_API void slm_scalar_store(uint32_t offset, T val) {
1173  slm_scatter<T, 1>(simd<uint32_t, 1>(offset), simd<T, 1>(val), 1);
1174 }
1175 
1186 template <typename T, int N, rgba_channel_mask RGBAMask>
1187 __ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && (sizeof(T) == 4),
1188  simd<T, N * get_num_channels_enabled(RGBAMask)>>
1189 slm_gather_rgba(simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
1190 
1191  const auto SI = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
1192  return __esimd_gather4_masked_scaled2<T, N, RGBAMask>(
1193  SI, 0 /*global_offset*/, offsets.data(), mask.data());
1194 }
1195 
1206 template <typename T, int N, rgba_channel_mask Mask>
1207 __ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && (sizeof(T) == 4)>
1208 slm_scatter_rgba(simd<uint32_t, N> offsets,
1209  simd<T, N * get_num_channels_enabled(Mask)> vals,
1210  simd_mask<N> mask = 1) {
1211  detail::validate_rgba_write_channel_mask<Mask>();
1212  const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
1213  constexpr int16_t Scale = 0;
1214  constexpr int global_offset = 0;
1215  __esimd_scatter4_scaled<T, N, decltype(si), Mask, Scale>(
1216  mask.data(), si, global_offset, offsets.data(), vals.data());
1217 }
1218 
1227 template <typename T, int N>
1228 __ESIMD_API simd<T, N> slm_block_load(uint32_t offset) {
1229  constexpr unsigned Sz = sizeof(T) * N;
1230  static_assert(Sz >= detail::OperandSize::OWORD,
1231  "block size must be at least 1 oword");
1232  static_assert(Sz % detail::OperandSize::OWORD == 0,
1233  "block size must be whole number of owords");
1234  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
1235  "block must be 1, 2, 4 or 8 owords long");
1236  static_assert(Sz <= 16 * detail::OperandSize::OWORD,
1237  "block size must be at most 16 owords");
1238 
1239  const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
1240  return __esimd_oword_ld<detail::__raw_t<T>, N>(si, offset >> 4);
1241 }
1242 
1251 template <typename T, int N>
1252 __ESIMD_API void slm_block_store(uint32_t offset, simd<T, N> vals) {
1253  constexpr unsigned Sz = sizeof(T) * N;
1254  static_assert(Sz >= detail::OperandSize::OWORD,
1255  "block size must be at least 1 oword");
1256  static_assert(Sz % detail::OperandSize::OWORD == 0,
1257  "block size must be whole number of owords");
1258  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
1259  "block must be 1, 2, 4 or 8 owords long");
1260  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
1261  "block size must be at most 8 owords");
1262  const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
1263  // offset in genx.oword.st is in owords
1264  __esimd_oword_st<detail::__raw_t<T>, N>(si, offset >> 4, vals.data());
1265 }
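// Illustrative usage sketch, not part of the original header: stage a vector
// in SLM and read it back, assuming slm_init has already reserved enough SLM
// for the work-group. The helper name and the 256-byte offset are made up;
// the offset must be oword (16-byte) aligned.
ESIMD_INLINE simd<float, 16> example_slm_round_trip(simd<float, 16> vals) {
  constexpr uint32_t byte_offset = 256;
  slm_block_store<float, 16>(byte_offset, vals);
  return slm_block_load<float, 16>(byte_offset);
}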
1266 
1270 template <atomic_op Op, typename Tx, int N, class T = detail::__raw_t<Tx>>
1271 __ESIMD_API simd<Tx, N> slm_atomic_update(simd<uint32_t, N> offsets,
1272  simd_mask<N> mask) {
1273  detail::check_atomic<Op, T, N, 0>();
1274  const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
1275  return __esimd_dword_atomic0<Op, T, N>(mask.data(), si, offsets.data());
1276 }
1277 
1281 template <atomic_op Op, typename Tx, int N, class T = detail::__raw_t<Tx>>
1282 __ESIMD_API simd<Tx, N> slm_atomic_update(simd<uint32_t, N> offsets,
1283  simd<Tx, N> src0, simd_mask<N> mask) {
1284  detail::check_atomic<Op, T, N, 1>();
1285  const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
1286  return __esimd_dword_atomic1<Op, T, N>(mask.data(), si, offsets.data(),
1287  src0.data());
1288 }
1289 
1293 template <atomic_op Op, typename Tx, int N, class T = detail::__raw_t<Tx>>
1294 __ESIMD_API simd<Tx, N> slm_atomic_update(simd<uint32_t, N> offsets,
1295  simd<Tx, N> src0, simd<Tx, N> src1,
1296  simd_mask<N> mask) {
1297  detail::check_atomic<Op, T, N, 2>();
1298  const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
1299  return __esimd_dword_atomic2<Op, T, N>(mask.data(), si, offsets.data(),
1300  src0.data(), src1.data());
1301 }
1302 
1304 
1305 #ifndef __ESIMD_FORCE_STATELESS_MEM
1308 
1321 template <typename T, int m, int N, typename AccessorTy, unsigned plane = 0>
1322 __ESIMD_API simd<T, m * N> media_block_load(AccessorTy acc, unsigned x,
1323  unsigned y) {
1324  constexpr unsigned Width = N * sizeof(T);
1325  static_assert(Width * m <= 256u,
1326  "data does not fit into a single dataport transaction");
1327  static_assert(Width <= 64u, "valid block width is in range [1, 64]");
1328  static_assert(m <= 64u, "valid block height is in range [1, 64]");
1329  static_assert(plane <= 3u, "valid plane index is in range [0, 3]");
1330 
1331  const auto si = __ESIMD_NS::get_surface_index(acc);
1332  using SurfIndTy = decltype(si);
1333  constexpr unsigned int RoundedWidth =
1334  Width < 4 ? 4 : detail::getNextPowerOf2<Width>();
1335  constexpr int BlockWidth = sizeof(T) * N;
1336  constexpr int Mod = 0;
1337 
1338  if constexpr (Width < RoundedWidth) {
1339  constexpr unsigned int n1 = RoundedWidth / sizeof(T);
1340  simd<T, m *n1> temp =
1341  __esimd_media_ld<T, m, n1, Mod, SurfIndTy, (int)plane, BlockWidth>(
1342  si, x, y);
1343  return temp.template select<m, 1, N, 1>(0, 0);
1344  } else {
1345  return __esimd_media_ld<T, m, N, Mod, SurfIndTy, (int)plane, BlockWidth>(
1346  si, x, y);
1347  }
1348 }
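// Illustrative usage sketch, not part of the original header: read a 4x8
// block of 32-bit pixels from a 2D image accessor at the given x/y position.
// The helper and its parameters are hypothetical.
template <typename AccessorTy>
ESIMD_INLINE simd<int, 4 * 8> example_media_load(AccessorTy img, unsigned x,
                                                 unsigned y) {
  return media_block_load<int, 4, 8>(img, x, y);
}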
1349 
1362 template <typename T, int m, int N, typename AccessorTy, unsigned plane = 0>
1363 __ESIMD_API void media_block_store(AccessorTy acc, unsigned x, unsigned y,
1364  simd<T, m * N> vals) {
1365  constexpr unsigned Width = N * sizeof(T);
1366  static_assert(Width * m <= 256u,
1367  "data does not fit into a single dataport transaction");
1368  static_assert(Width <= 64u, "valid block width is in range [1, 64]");
1369  static_assert(m <= 64u, "valid block height is in range [1, 64]");
1370  static_assert(plane <= 3u, "valid plane index is in range [0, 3]");
1371  const auto si = __ESIMD_NS::get_surface_index(acc);
1372  using SurfIndTy = decltype(si);
1373  constexpr unsigned int RoundedWidth =
1374  Width < 4 ? 4 : detail::getNextPowerOf2<Width>();
1375  constexpr unsigned int n1 = RoundedWidth / sizeof(T);
1376  constexpr int BlockWidth = sizeof(T) * N;
1377  constexpr int Mod = 0;
1378 
1379  if constexpr (Width < RoundedWidth) {
1380  simd<T, m * n1> temp;
1381  auto temp_ref = temp.template bit_cast_view<T, m, n1>();
1382  auto vals_ref = vals.template bit_cast_view<T, m, N>();
1383  temp_ref.template select<m, 1, N, 1>() = vals_ref;
1384  __esimd_media_st<T, m, n1, Mod, SurfIndTy, plane, BlockWidth>(si, x, y,
1385  temp.data());
1386  } else {
1387  __esimd_media_st<T, m, N, Mod, SurfIndTy, plane, BlockWidth>(si, x, y,
1388  vals.data());
1389  }
1390 }
1391 #endif // !__ESIMD_FORCE_STATELESS_MEM
1392 
1394 
1396 
1397 namespace detail {
1398 
1399 // ----- Outlined implementations of simd_obj_impl class memory access APIs.
1400 
1401 template <typename T, int N, class T1, class SFINAE>
1402 template <typename Flags, int ChunkSize, typename>
1403 void simd_obj_impl<T, N, T1, SFINAE>::copy_from(
1404  const simd_obj_impl<T, N, T1, SFINAE>::element_type *Addr,
1405  Flags) SYCL_ESIMD_FUNCTION {
1406  using UT = simd_obj_impl<T, N, T1, SFINAE>::element_type;
1407  constexpr unsigned Size = sizeof(T) * N;
1408  constexpr unsigned Align = Flags::template alignment<T1>;
1409 
1410  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
1411  constexpr unsigned NumBlocks = Size / BlockSize;
1412  constexpr unsigned RemSize = Size % BlockSize;
1413 
1414  if constexpr (Align >= OperandSize::DWORD && Size % OperandSize::OWORD == 0 &&
1415  detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
1416  if constexpr (NumBlocks > 0) {
1417  constexpr unsigned BlockN = BlockSize / sizeof(T);
1418  ForHelper<NumBlocks>::unroll([BlockN, Addr, this](unsigned Block) {
1419  select<BlockN, 1>(Block * BlockN) =
1420  block_load<UT, BlockN, Flags>(Addr + (Block * BlockN), Flags{});
1421  });
1422  }
1423  if constexpr (RemSize > 0) {
1424  constexpr unsigned RemN = RemSize / sizeof(T);
1425  constexpr unsigned BlockN = BlockSize / sizeof(T);
1426  select<RemN, 1>(NumBlocks * BlockN) =
1427  block_load<UT, RemN, Flags>(Addr + (NumBlocks * BlockN), Flags{});
1428  }
1429  } else if constexpr (sizeof(T) == 8) {
1430  simd<int32_t, N * 2> BC(reinterpret_cast<const int32_t *>(Addr), Flags{});
1431  bit_cast_view<int32_t>() = BC;
1432  } else {
1433  constexpr unsigned NumChunks = N / ChunkSize;
1434  if constexpr (NumChunks > 0) {
1435  simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
1436  ForHelper<NumChunks>::unroll([Addr, &Offsets, this](unsigned Block) {
1437  select<ChunkSize, 1>(Block * ChunkSize) =
1438  gather<UT, ChunkSize>(Addr + (Block * ChunkSize), Offsets);
1439  });
1440  }
1441  constexpr unsigned RemN = N % ChunkSize;
1442  if constexpr (RemN > 0) {
1443  if constexpr (RemN == 1) {
1444  select<1, 1>(NumChunks * ChunkSize) = Addr[NumChunks * ChunkSize];
1445  } else if constexpr (RemN == 8 || RemN == 16) {
1446  simd<uint32_t, RemN> Offsets(0u, sizeof(T));
1447  select<RemN, 1>(NumChunks * ChunkSize) =
1448  gather<UT, RemN>(Addr + (NumChunks * ChunkSize), Offsets);
1449  } else {
1450  constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
1451  simd_mask_type<N1> Pred(0);
1452  Pred.template select<RemN, 1>() = 1;
1453  simd<uint32_t, N1> Offsets(0u, sizeof(T));
1454  simd<UT, N1> Vals =
1455  gather<UT, N1>(Addr + (NumChunks * ChunkSize), Offsets, Pred);
1456  select<RemN, 1>(NumChunks * ChunkSize) =
1457  Vals.template select<RemN, 1>();
1458  }
1459  }
1460  }
1461 }
1462 
1463 template <typename T, int N, class T1, class SFINAE>
1464 template <typename AccessorT, typename Flags, int ChunkSize, typename>
1465 ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_read,
1466  sycl::access::target::device, void>
1467 simd_obj_impl<T, N, T1, SFINAE>::copy_from(AccessorT acc, uint32_t offset,
1468  Flags) SYCL_ESIMD_FUNCTION {
1469  using UT = simd_obj_impl<T, N, T1, SFINAE>::element_type;
1470  static_assert(sizeof(UT) == sizeof(T));
1471  constexpr unsigned Size = sizeof(T) * N;
1472  constexpr unsigned Align = Flags::template alignment<T1>;
1473 
1474  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
1475  constexpr unsigned NumBlocks = Size / BlockSize;
1476  constexpr unsigned RemSize = Size % BlockSize;
1477 
1478  if constexpr (Align >= OperandSize::DWORD && Size % OperandSize::OWORD == 0 &&
1479  detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
1480  if constexpr (NumBlocks > 0) {
1481  constexpr unsigned BlockN = BlockSize / sizeof(T);
1482  ForHelper<NumBlocks>::unroll([BlockN, acc, offset, this](unsigned Block) {
1483  select<BlockN, 1>(Block * BlockN) =
1484  block_load<UT, BlockN, AccessorT, Flags>(
1485  acc, offset + (Block * BlockSize), Flags{});
1486  });
1487  }
1488  if constexpr (RemSize > 0) {
1489  constexpr unsigned RemN = RemSize / sizeof(T);
1490  constexpr unsigned BlockN = BlockSize / sizeof(T);
1491  select<RemN, 1>(NumBlocks * BlockN) =
1492  block_load<UT, RemN, AccessorT, Flags>(
1493  acc, offset + (NumBlocks * BlockSize), Flags{});
1494  }
1495  } else if constexpr (sizeof(T) == 8) {
1496  simd<int32_t, N * 2> BC(acc, offset, Flags{});
1497  bit_cast_view<int32_t>() = BC;
1498  } else {
1499  constexpr unsigned NumChunks = N / ChunkSize;
1500  if constexpr (NumChunks > 0) {
1501  simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
1502  ForHelper<NumChunks>::unroll(
1503  [acc, offset, &Offsets, this](unsigned Block) {
1504  select<ChunkSize, 1>(Block * ChunkSize) =
1505  gather<UT, ChunkSize, AccessorT>(
1506  acc, Offsets, offset + (Block * ChunkSize * sizeof(T)));
1507  });
1508  }
1509  constexpr unsigned RemN = N % ChunkSize;
1510  if constexpr (RemN > 0) {
1511  if constexpr (RemN == 1 || RemN == 8 || RemN == 16) {
1512  simd<uint32_t, RemN> Offsets(0u, sizeof(T));
1513  select<RemN, 1>(NumChunks * ChunkSize) = gather<UT, RemN, AccessorT>(
1514  acc, Offsets, offset + (NumChunks * ChunkSize * sizeof(T)));
1515  } else {
1516  constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
1517  simd_mask_type<N1> Pred(0);
1518  Pred.template select<RemN, 1>() = 1;
1519  simd<uint32_t, N1> Offsets(0u, sizeof(T));
1520  simd<UT, N1> Vals = gather<UT, N1>(
1521  acc, Offsets, offset + (NumChunks * ChunkSize * sizeof(T)), Pred);
1522  select<RemN, 1>(NumChunks * ChunkSize) =
1523  Vals.template select<RemN, 1>();
1524  }
1525  }
1526  }
1527 }
1528 
1529 template <typename T, int N, class T1, class SFINAE>
1530 template <typename Flags, int ChunkSize, typename>
1531 void simd_obj_impl<T, N, T1, SFINAE>::copy_to(
1532  simd_obj_impl<T, N, T1, SFINAE>::element_type *Addr,
1533  Flags) const SYCL_ESIMD_FUNCTION {
1534  using UT = simd_obj_impl<T, N, T1, SFINAE>::element_type;
1535  constexpr unsigned Size = sizeof(T) * N;
1536  constexpr unsigned Align = Flags::template alignment<T1>;
1537 
1538  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
1539  constexpr unsigned NumBlocks = Size / BlockSize;
1540  constexpr unsigned RemSize = Size % BlockSize;
1541 
1542  simd<UT, N> Tmp{data()};
1543  if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 &&
1544  detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
1545  if constexpr (NumBlocks > 0) {
1546  constexpr unsigned BlockN = BlockSize / sizeof(T);
1547  ForHelper<NumBlocks>::unroll([BlockN, Addr, &Tmp](unsigned Block) {
1548  block_store<UT, BlockN>(Addr + (Block * BlockN),
1549  Tmp.template select<BlockN, 1>(Block * BlockN));
1550  });
1551  }
1552  if constexpr (RemSize > 0) {
1553  constexpr unsigned RemN = RemSize / sizeof(T);
1554  constexpr unsigned BlockN = BlockSize / sizeof(T);
1555  block_store<UT, RemN>(Addr + (NumBlocks * BlockN),
1556  Tmp.template select<RemN, 1>(NumBlocks * BlockN));
1557  }
1558  } else if constexpr (sizeof(T) == 8) {
1559  simd<int32_t, N * 2> BC = Tmp.template bit_cast_view<int32_t>();
1560  BC.copy_to(reinterpret_cast<int32_t *>(Addr), Flags{});
1561  } else {
1562  constexpr unsigned NumChunks = N / ChunkSize;
1563  if constexpr (NumChunks > 0) {
1564  simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
1565  ForHelper<NumChunks>::unroll([Addr, &Offsets, &Tmp](unsigned Block) {
1566  scatter<UT, ChunkSize>(
1567  Addr + (Block * ChunkSize), Offsets,
1568  Tmp.template select<ChunkSize, 1>(Block * ChunkSize));
1569  });
1570  }
1571  constexpr unsigned RemN = N % ChunkSize;
1572  if constexpr (RemN > 0) {
1573  if constexpr (RemN == 1) {
1574  Addr[NumChunks * ChunkSize] = Tmp[NumChunks * ChunkSize];
1575  } else if constexpr (RemN == 8 || RemN == 16) {
1576  // TODO: GPU runtime may handle scatter of 16 byte elements
1577  // incorrectly. The code below is a workaround which must be deleted
1578  // once GPU runtime is fixed.
1579  if constexpr (sizeof(T) == 1 && RemN == 16) {
1580  if constexpr (Align % OperandSize::DWORD > 0) {
1581  ForHelper<RemN>::unroll([Addr, &Tmp](unsigned Index) {
1582  Addr[Index + NumChunks * ChunkSize] =
1583  Tmp[Index + NumChunks * ChunkSize];
1584  });
1585  } else {
1586  simd_mask_type<8> Pred(0);
1587  simd<int32_t, 8> Vals;
1588  Pred.template select<4, 1>() = 1;
1589  Vals.template select<4, 1>() =
1590  Tmp.template bit_cast_view<int32_t>().template select<4, 1>(
1591  NumChunks * ChunkSize);
1592 
1593  simd<uint32_t, 8> Offsets(0u, sizeof(int32_t));
1594  scatter<int32_t, 8>(
1595  reinterpret_cast<int32_t *>(Addr + (NumChunks * ChunkSize)),
1596  Offsets, Vals, Pred);
1597  }
1598  } else {
1599  simd<uint32_t, RemN> Offsets(0u, sizeof(T));
1600  scatter<UT, RemN>(
1601  Addr + (NumChunks * ChunkSize), Offsets,
1602  Tmp.template select<RemN, 1>(NumChunks * ChunkSize));
1603  }
1604  } else {
1605  constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
1606  simd_mask_type<N1> Pred(0);
1607  Pred.template select<RemN, 1>() = 1;
1608  simd<UT, N1> Vals;
1609  Vals.template select<RemN, 1>() =
1610  Tmp.template select<RemN, 1>(NumChunks * ChunkSize);
1611  simd<uint32_t, N1> Offsets(0u, sizeof(T));
1612  scatter<UT, N1>(Addr + (NumChunks * ChunkSize), Offsets, Vals, Pred);
1613  }
1614  }
1615  }
1616 }
1617 
1618 template <typename T, int N, class T1, class SFINAE>
1619 template <typename AccessorT, typename Flags, int ChunkSize, typename>
1620 ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_write,
1621  sycl::access::target::device, void>
1622 simd_obj_impl<T, N, T1, SFINAE>::copy_to(AccessorT acc, uint32_t offset,
1623  Flags) const SYCL_ESIMD_FUNCTION {
1624  using UT = simd_obj_impl<T, N, T1, SFINAE>::element_type;
1625  constexpr unsigned Size = sizeof(T) * N;
1626  constexpr unsigned Align = Flags::template alignment<T1>;
1627 
1628  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
1629  constexpr unsigned NumBlocks = Size / BlockSize;
1630  constexpr unsigned RemSize = Size % BlockSize;
1631 
1632  simd<UT, N> Tmp{data()};
1633 
1634  if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 &&
1635  detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
1636  if constexpr (NumBlocks > 0) {
1637  constexpr unsigned BlockN = BlockSize / sizeof(T);
1638  ForHelper<NumBlocks>::unroll([BlockN, acc, offset, &Tmp](unsigned Block) {
1639  block_store<UT, BlockN, AccessorT>(
1640  acc, offset + (Block * BlockSize),
1641  Tmp.template select<BlockN, 1>(Block * BlockN));
1642  });
1643  }
1644  if constexpr (RemSize > 0) {
1645  constexpr unsigned RemN = RemSize / sizeof(T);
1646  constexpr unsigned BlockN = BlockSize / sizeof(T);
1647  block_store<UT, RemN, AccessorT>(
1648  acc, offset + (NumBlocks * BlockSize),
1649  Tmp.template select<RemN, 1>(NumBlocks * BlockN));
1650  }
1651  } else if constexpr (sizeof(T) == 8) {
1652  simd<int32_t, N * 2> BC = Tmp.template bit_cast_view<int32_t>();
1653  BC.copy_to(acc, offset, Flags{});
1654  } else {
1655  constexpr unsigned NumChunks = N / ChunkSize;
1656  if constexpr (NumChunks > 0) {
1657  simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
1658  ForHelper<NumChunks>::unroll([acc, offset, &Offsets,
1659  &Tmp](unsigned Block) {
1660  scatter<UT, ChunkSize, AccessorT>(
1661  acc, Offsets, Tmp.template select<ChunkSize, 1>(Block * ChunkSize),
1662  offset + (Block * ChunkSize * sizeof(T)));
1663  });
1664  }
1665  constexpr unsigned RemN = N % ChunkSize;
1666  if constexpr (RemN > 0) {
1667  if constexpr (RemN == 1 || RemN == 8 || RemN == 16) {
1668  simd<uint32_t, RemN> Offsets(0u, sizeof(T));
1669  scatter<UT, RemN, AccessorT>(
1670  acc, Offsets, Tmp.template select<RemN, 1>(NumChunks * ChunkSize),
1671  offset + (NumChunks * ChunkSize * sizeof(T)));
1672  } else {
1673  constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
1674  simd_mask_type<N1> Pred(0);
1675  Pred.template select<RemN, 1>() = 1;
1676  simd<UT, N1> Vals;
1677  Vals.template select<RemN, 1>() =
1678  Tmp.template select<RemN, 1>(NumChunks * ChunkSize);
1679  simd<uint32_t, N1> Offsets(0u, sizeof(T));
1680  scatter<UT, N1, AccessorT>(acc, Offsets, Vals,
1681  offset + (NumChunks * ChunkSize * sizeof(T)),
1682  Pred);
1683  }
1684  }
1685  }
1686 }
1687 
1688 } // namespace detail
1690 
1691 } // namespace ext::intel::esimd
1692 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
1693 } // namespace sycl