DPC++ Runtime
Runtime libraries for oneAPI DPC++
memory.hpp
1 //==-------------- memory.hpp - DPC++ Explicit SIMD API --------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // Implement Explicit SIMD memory-access APIs.
9 //===----------------------------------------------------------------------===//
10 
11 #pragma once
12 
13 #include <CL/sycl/half_type.hpp>
19 
20 #include <cstdint>
21 
22 __SYCL_INLINE_NAMESPACE(cl) {
23 namespace __ESIMD_NS {
24 
27 
33 
35 
37 
39 
40 namespace detail {
41 // Type used in internal functions to designate SLM access by
42 // providing a dummy accessor of this type. Used to make it possible to delegate
43 // implementations of SLM memory accesses to general surface-based memory
44 // accesses and thus reuse validity checks etc.
45 struct LocalAccessorMarker {};
46 
47 } // namespace detail
48 
50 
53 
59 template <typename AccessorTy>
60 __ESIMD_API SurfaceIndex get_surface_index(AccessorTy acc) {
61  if constexpr (std::is_same_v<detail::LocalAccessorMarker, AccessorTy>) {
62  return detail::SLM_BTI;
63  } else {
64  return __esimd_get_surface_index(
65  detail::AccessorPrivateProxy::getNativeImageObj(acc));
66  }
67 }
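The surface index is a plain integer handle, so a common pattern is to query it once and forward it to surface-based intrinsics. A minimal illustrative sketch (not part of the header; the helper name is hypothetical and names are assumed to come from this header's namespace):

    // Hypothetical helper callable from ESIMD device code: returns the
    // surface index ("binding table index") backing a SYCL accessor, or the
    // SLM index for the dummy SLM marker type.
    template <typename AccessorTy>
    SurfaceIndex query_surface(AccessorTy acc) SYCL_ESIMD_FUNCTION {
      return get_surface_index(acc);
    }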
68 
69 #define __ESIMD_GET_SURF_HANDLE(acc) get_surface_index(acc)
70 
71 // TODO @Pennycook
72 // {quote}
73 // ...I'd like us to think more about what we can do to make these interfaces
74 // more user-friendly. A user providing cache hints has to provide a lot more
75 // template arguments than required. Could we make this nicer by providing the
76 // hints as tag-type arguments?
77 // ...
78 // // Without cache hints, type and length can be deduced from offsets
79 // float* p;
80 // simd<uint32_t, 16> offsets;
81 // auto result = flat_load(p, offsets);
82 //
83 // // With cache hints as templates, verbosity increases significantly:
84 // // - Providing any cache hint forces the user to specify the type and length
85 // float* p; simd<uint32_t, 16> offsets;
86 // auto result = flat_load<uint32_t, 16, 1, CacheHint::Foo, CacheHint::Bar>(p, offsets);
87 //
88 // // With cache hints as tag types, verbosity is reduced:
89 // // - Providing a cache hint does not prevent deduction of type and length
90 // float* p;
91 // simd <uint32_t, 16> offsets;
92 // auto result = flat_load(p, offsets, CacheHint::Foo{});
93 //
94 // Note also that the templated form prevents a developer from specifying an L3
95 // hint without also explicitly specifying an L1 hint. If flat_load accepted a
96 // list of hints, it might be possible to refactor the hints to specify them in
97 // any order, and it may be more extensible to future cache hints:
98 // {/quote}
99 //
100 // TODO @keryell
101 // {quote}
102 // An approach a la https://github.com/chriskohlhoff/propria from
103 // @chriskohlhoff would be to add a property to the pointer, such as
104 //
105 // auto result = flat_load(p, offsets);
106 // auto result = flat_load(decorate<CacheHint::Foo, CacheHint::Bar>(p),
107 // offsets);
108 // The advantage is that you do not have to change all your API and all the uses
109 // of this decorated pointer will benefit from this. decorate is to be bikeshed
110 // accordingly.
111 // {/quote}
112 //
113 
129 template <typename Tx, int N, class T = detail::__raw_t<Tx>>
130 __ESIMD_API std::enable_if_t<detail::isPowerOf2(N, 32), simd<Tx, N>>
131 gather(const Tx *p, simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
132  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
133  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
134  addrs = addrs + offsets_i;
135 
136  if constexpr (sizeof(T) == 1) {
137  auto Ret = __esimd_svm_gather<T, N, detail::ElemsPerAddrEncoding<4>(),
138  detail::ElemsPerAddrEncoding<1>()>(
139  addrs.data(), mask.data());
140  return __esimd_rdregion<T, N * 4, N, /*VS*/ 0, N, 4>(Ret, 0);
141  } else if constexpr (sizeof(T) == 2) {
142  auto Ret = __esimd_svm_gather<T, N, detail::ElemsPerAddrEncoding<2>(),
143  detail::ElemsPerAddrEncoding<2>()>(
144  addrs.data(), mask.data());
145  return __esimd_rdregion<T, N * 2, N, /*VS*/ 0, N, 2>(Ret, 0);
146  } else
147  return __esimd_svm_gather<T, N, detail::ElemsPerAddrEncoding<1>(),
148  detail::ElemsPerAddrEncoding<1>()>(addrs.data(),
149  mask.data());
150 }
151 
165 template <typename Tx, int N, class T = detail::__raw_t<Tx>>
166 __ESIMD_API std::enable_if_t<detail::isPowerOf2(N, 32)>
167 scatter(Tx *p, simd<uint32_t, N> offsets, simd<Tx, N> vals,
168  simd_mask<N> mask = 1) {
169  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
170  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
171  addrs = addrs + offsets_i;
172  if constexpr (sizeof(T) == 1) {
173  simd<T, N * 4> D;
174  D = __esimd_wrregion<T, N * 4, N, /*VS*/ 0, N, 4>(D.data(), vals.data(), 0);
175  __esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<4>(),
176  detail::ElemsPerAddrEncoding<1>()>(
177  addrs.data(), D.data(), mask.data());
178  } else if constexpr (sizeof(T) == 2) {
179  simd<T, N * 2> D;
180  D = __esimd_wrregion<T, N * 2, N, /*VS*/ 0, N, 2>(D.data(), vals.data(), 0);
181  __esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<2>(),
182  detail::ElemsPerAddrEncoding<2>()>(
183  addrs.data(), D.data(), mask.data());
184  } else
185  __esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<1>(),
186  detail::ElemsPerAddrEncoding<1>()>(
187  addrs.data(), vals.data(), mask.data());
188 }
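An illustrative round trip through the pointer-based gather and scatter (not part of the header; assumes the function is called from ESIMD device code and that buf points to at least 16 floats of device-visible memory). Offsets are byte offsets, and the mask defaults to all lanes enabled:

    // Illustrative sketch: read 16 floats, double them, write them back.
    inline void scale16(float *buf) SYCL_ESIMD_FUNCTION {
      simd<uint32_t, 16> byte_offsets(0, sizeof(float)); // 0, 4, 8, ..., 60
      simd<float, 16> v = gather<float, 16>(buf, byte_offsets);
      v = v * 2.0f;
      scatter<float, 16>(buf, byte_offsets, v);
    }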
189 
203 template <typename Tx, int N, typename Flags = vector_aligned_tag,
204  class T = detail::__raw_t<Tx>,
205  typename = std::enable_if_t<is_simd_flag_type_v<Flags>>>
206 __ESIMD_API simd<Tx, N> block_load(const Tx *addr, Flags = {}) {
207  constexpr unsigned Sz = sizeof(T) * N;
208  static_assert(Sz >= detail::OperandSize::OWORD,
209  "block size must be at least 1 oword");
210  static_assert(Sz % detail::OperandSize::OWORD == 0,
211  "block size must be whole number of owords");
212  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
213  "block must be 1, 2, 4 or 8 owords long");
214  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
215  "block size must be at most 8 owords");
216 
217  uintptr_t Addr = reinterpret_cast<uintptr_t>(addr);
218  if constexpr (Flags::template alignment<simd<T, N>> >=
219  detail::OperandSize::OWORD) {
220  return __esimd_svm_block_ld<T, N>(Addr);
221  } else {
222  return __esimd_svm_block_ld_unaligned<T, N>(Addr);
223  }
224 }
225 
241 template <typename Tx, int N, typename AccessorTy,
242  typename Flags = vector_aligned_tag,
243  typename = std::enable_if_t<is_simd_flag_type_v<Flags>>,
244  class T = detail::__raw_t<Tx>>
245 __ESIMD_API simd<Tx, N> block_load(AccessorTy acc, uint32_t offset,
246  Flags = {}) {
247 #ifdef __ESIMD_FORCE_STATELESS_MEM
248  return block_load<Tx, N>(__ESIMD_DNS::accessorToPointer<Tx>(acc, offset));
249 #else
250  constexpr unsigned Sz = sizeof(T) * N;
251  static_assert(Sz >= detail::OperandSize::OWORD,
252  "block size must be at least 1 oword");
253  static_assert(Sz % detail::OperandSize::OWORD == 0,
254  "block size must be whole number of owords");
255  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
256  "block must be 1, 2, 4 or 8 owords long");
257  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
258  "block size must be at most 8 owords");
259 
260  auto surf_ind = __esimd_get_surface_index(
261  detail::AccessorPrivateProxy::getNativeImageObj(acc));
262 
263  if constexpr (Flags::template alignment<simd<T, N>> >=
264  detail::OperandSize::OWORD) {
265  return __esimd_oword_ld<T, N>(surf_ind, offset >> 4);
266  } else {
267  return __esimd_oword_ld_unaligned<T, N>(surf_ind, offset);
268  }
269 #endif
270 }
271 
280 template <typename Tx, int N, class T = detail::__raw_t<Tx>>
281 __ESIMD_API void block_store(Tx *p, simd<Tx, N> vals) {
282  constexpr unsigned Sz = sizeof(T) * N;
283  static_assert(Sz >= detail::OperandSize::OWORD,
284  "block size must be at least 1 oword");
285  static_assert(Sz % detail::OperandSize::OWORD == 0,
286  "block size must be whole number of owords");
287  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
288  "block must be 1, 2, 4 or 8 owords long");
289  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
290  "block size must be at most 8 owords");
291 
292  uintptr_t Addr = reinterpret_cast<uintptr_t>(p);
293  __esimd_svm_block_st<T, N>(Addr, vals.data());
294 }
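The pointer-based block_load/block_store pair gives the most efficient contiguous accesses when the size and alignment restrictions above are met. A small sketch (illustrative; assumes src and dst are oword-aligned device pointers to at least 8 floats, which satisfies the default vector_aligned_tag):

    // Illustrative sketch: copy one 2-oword (32-byte) block.
    inline void copy_block8(const float *src, float *dst) SYCL_ESIMD_FUNCTION {
      simd<float, 8> blk = block_load<float, 8>(src); // default flag assumes alignment
      block_store<float, 8>(dst, blk);
    }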
295 
307 template <typename Tx, int N, typename AccessorTy,
308  class T = detail::__raw_t<Tx>>
309 __ESIMD_API void block_store(AccessorTy acc, uint32_t offset,
310  simd<Tx, N> vals) {
311 #ifdef __ESIMD_FORCE_STATELESS_MEM
312  block_store<Tx, N>(__ESIMD_DNS::accessorToPointer<Tx>(acc, offset), vals);
313 #else
314  constexpr unsigned Sz = sizeof(T) * N;
315  static_assert(Sz >= detail::OperandSize::OWORD,
316  "block size must be at least 1 oword");
317  static_assert(Sz % detail::OperandSize::OWORD == 0,
318  "block size must be whole number of owords");
319  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
320  "block must be 1, 2, 4 or 8 owords long");
321  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
322  "block size must be at most 8 owords");
323 
324  auto surf_ind = __esimd_get_surface_index(
325  detail::AccessorPrivateProxy::getNativeImageObj(acc));
326  __esimd_oword_st<T, N>(surf_ind, offset >> 4, vals.data());
327 #endif
328 }
329 
331 
333 
334 // Implementations of accessor-based gather and scatter functions
335 namespace detail {
336 template <typename T, int N, typename AccessorTy>
337 ESIMD_INLINE
338  ESIMD_NODEBUG std::enable_if_t<(sizeof(T) <= 4) &&
339  (N == 1 || N == 8 || N == 16 || N == 32) &&
340  !std::is_pointer<AccessorTy>::value>
341  scatter_impl(AccessorTy acc, simd<T, N> vals, simd<uint32_t, N> offsets,
342  uint32_t glob_offset, simd_mask<N> mask) {
343 
344  constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
345  // TODO (performance) use hardware-supported scale once BE supports it
346  constexpr int16_t scale = 0;
347  const auto si = __ESIMD_GET_SURF_HANDLE(acc);
348 
349  if constexpr (sizeof(T) < 4) {
350  using Tint = std::conditional_t<std::is_integral_v<T>, T,
351  detail::uint_type_t<sizeof(T)>>;
352  using Treal = __raw_t<T>;
353  simd<Tint, N> vals_int = bitcast<Tint, Treal, N>(std::move(vals).data());
354  using PromoT =
355  typename sycl::detail::conditional_t<std::is_signed<Tint>::value,
356  int32_t, uint32_t>;
357  const simd<PromoT, N> promo_vals = convert<PromoT>(std::move(vals_int));
358  __esimd_scatter_scaled<PromoT, N, decltype(si), TypeSizeLog2, scale>(
359  mask.data(), si, glob_offset, offsets.data(), promo_vals.data());
360  } else {
361  __esimd_scatter_scaled<T, N, decltype(si), TypeSizeLog2, scale>(
362  mask.data(), si, glob_offset, offsets.data(), vals.data());
363  }
364 }
365 
366 template <typename T, int N, typename AccessorTy>
367 ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t<
368  (sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16 || N == 32) &&
369  !std::is_pointer<AccessorTy>::value,
370  simd<T, N>>
371 gather_impl(AccessorTy acc, simd<uint32_t, N> offsets, uint32_t glob_offset,
372  simd_mask<N> mask) {
373 
374  constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
375  // TODO (performance) use hardware-supported scale once BE supports it
376  constexpr uint32_t scale = 0;
377  const auto si = get_surface_index(acc);
378 
379  if constexpr (sizeof(T) < 4) {
380  using Tint = std::conditional_t<std::is_integral_v<T>, T,
381  detail::uint_type_t<sizeof(T)>>;
382  using Treal = __raw_t<T>;
383  static_assert(std::is_integral<Tint>::value,
384  "only integral 1- & 2-byte types are supported");
385  using PromoT =
386  typename sycl::detail::conditional_t<std::is_signed<Tint>::value,
387  int32_t, uint32_t>;
388  const simd<PromoT, N> promo_vals =
389  __esimd_gather_masked_scaled2<PromoT, N, decltype(si), TypeSizeLog2,
390  scale>(si, glob_offset, offsets.data(),
391  mask.data());
392  auto Res = convert<Tint>(promo_vals);
393 
394  if constexpr (!std::is_same_v<Tint, T>) {
395  return detail::bitcast<Treal, Tint, N>(Res.data());
396  } else {
397  return Res;
398  }
399  } else {
400  return __esimd_gather_masked_scaled2<T, N, decltype(si), TypeSizeLog2,
401  scale>(si, glob_offset, offsets.data(),
402  mask.data());
403  }
404 }
405 
406 } // namespace detail
407 
409 
412 
430 template <typename T, int N, typename AccessorTy>
431 __ESIMD_API std::enable_if_t<(sizeof(T) <= 4) &&
432  (N == 1 || N == 8 || N == 16 || N == 32) &&
433  !std::is_pointer<AccessorTy>::value,
434  simd<T, N>>
435 gather(AccessorTy acc, simd<uint32_t, N> offsets, uint32_t glob_offset = 0,
436  simd_mask<N> mask = 1) {
437 #ifdef __ESIMD_FORCE_STATELESS_MEM
438  return gather<T, N>(__ESIMD_DNS::accessorToPointer<T>(acc, glob_offset),
439  offsets, mask);
440 #else
441  return detail::gather_impl<T, N, AccessorTy>(acc, offsets, glob_offset, mask);
442 #endif
443 }
444 
464 template <typename T, int N, typename AccessorTy>
465 __ESIMD_API std::enable_if_t<(sizeof(T) <= 4) &&
466  (N == 1 || N == 8 || N == 16 || N == 32) &&
467  !std::is_pointer<AccessorTy>::value>
468 scatter(AccessorTy acc, simd<uint32_t, N> offsets, simd<T, N> vals,
469  uint32_t glob_offset = 0, simd_mask<N> mask = 1) {
470 #ifdef __ESIMD_FORCE_STATELESS_MEM
471  scatter<T, N>(__ESIMD_DNS::accessorToPointer<T>(acc, glob_offset), offsets,
472  vals, mask);
473 #else
474  detail::scatter_impl<T, N, AccessorTy>(acc, vals, offsets, glob_offset, mask);
475 #endif
476 }
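An end-to-end sketch of the accessor-based forms (illustrative, not from the header; q, buf and the element count are assumptions, and names are assumed to come from this header's namespace). The accessor is captured by an ESIMD kernel, and offsets are byte offsets relative to glob_offset:

    // Host side: assumes sycl::queue q and sycl::buffer<int, 1> buf with >= 16 elements.
    q.submit([&](sycl::handler &cgh) {
      auto acc = buf.get_access<sycl::access::mode::read_write>(cgh);
      cgh.single_task([=]() SYCL_ESIMD_KERNEL {
        simd<uint32_t, 16> offsets(0, sizeof(int)); // byte offsets into the buffer
        simd<int, 16> v = gather<int, 16>(acc, offsets);
        scatter<int, 16>(acc, offsets, v * 2); // double the first 16 elements
      });
    });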
477 
485 template <typename T, typename AccessorTy>
486 __ESIMD_API T scalar_load(AccessorTy acc, uint32_t offset) {
487  const simd<T, 1> Res =
488  gather<T, 1, AccessorTy>(acc, simd<uint32_t, 1>(offset));
489  return Res[0];
490 }
491 
499 template <typename T, typename AccessorTy>
500 __ESIMD_API void scalar_store(AccessorTy acc, uint32_t offset, T val) {
501  scatter<T, 1, AccessorTy>(acc, simd<uint32_t, 1>(offset), simd<T, 1>(val));
502 }
503 
536 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
537  int N>
538 __ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4,
539  simd<T, N * get_num_channels_enabled(RGBAMask)>>
540 gather_rgba(const T *p, simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
541  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
542  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
543  addrs = addrs + offsets_i;
544  return __esimd_svm_gather4_scaled<detail::__raw_t<T>, N, RGBAMask>(
545  addrs.data(), mask.data());
546 }
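An illustrative sketch (assumes pix points to pixels stored as four 32-bit floats each): with the default ABGR mask all four channels of the 8 addressed pixels are returned, so the result holds 8 * 4 = 32 elements grouped channel by channel.

    // Illustrative sketch: gather 8 whole RGBA pixels.
    inline simd<float, 32> load8_pixels(const float *pix) SYCL_ESIMD_FUNCTION {
      simd<uint32_t, 8> pixel_offsets(0, 4 * sizeof(float)); // byte offset of each pixel
      return gather_rgba(pix, pixel_offsets);
    }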
547 
548 template <typename T, int N, rgba_channel_mask RGBAMask>
549 __SYCL_DEPRECATED("use gather_rgba<rgba_channel_mask>()")
550 __ESIMD_API std::enable_if_t<
551  (N == 8 || N == 16 || N == 32) && sizeof(T) == 4,
552  simd<T, N * get_num_channels_enabled(
553  RGBAMask)>> gather_rgba(const T *p,
554  simd<uint32_t, N> offsets,
555  simd_mask<N> mask = 1) {
556  return gather_rgba<RGBAMask>(p, offsets, mask);
557 }
558 
559 namespace detail {
560 template <rgba_channel_mask M> static void validate_rgba_write_channel_mask() {
561  using CM = rgba_channel_mask;
562  static_assert(
563  (M == CM::ABGR || M == CM::BGR || M == CM::GR || M == CM::R) &&
564  "Only ABGR, BGR, GR, R channel masks are valid in write operations");
565 }
566 } // namespace detail
567 
588 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
589  int N>
590 __ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4>
591 scatter_rgba(T *p, simd<uint32_t, N> offsets,
592  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
593  simd_mask<N> mask = 1) {
594  detail::validate_rgba_write_channel_mask<RGBAMask>();
595  simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
596  simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
597  addrs = addrs + offsets_i;
598  __esimd_svm_scatter4_scaled<detail::__raw_t<T>, N, RGBAMask>(
599  addrs.data(), vals.data(), mask.data());
600 }
601 
602 template <typename T, int N, rgba_channel_mask RGBAMask>
603 __SYCL_DEPRECATED("use scatter_rgba<rgba_channel_mask>()")
604 __ESIMD_API std::
605  enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4> scatter_rgba(
606  T *p, simd<uint32_t, N> offsets,
607  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
608  simd_mask<N> mask = 1) {
609  scatter_rgba<RGBAMask>(p, offsets, vals, mask);
610 }
611 
634 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
635  typename AccessorT, int N,
636  typename T = typename AccessorT::value_type>
637 __ESIMD_API std::enable_if_t<((N == 8 || N == 16 || N == 32) &&
638  sizeof(T) == 4 && !std::is_pointer_v<AccessorT>),
639  simd<T, N * get_num_channels_enabled(RGBAMask)>>
640 gather_rgba(AccessorT acc, simd<uint32_t, N> offsets,
641  uint32_t global_offset = 0, simd_mask<N> mask = 1) {
642 #ifdef __ESIMD_FORCE_STATELESS_MEM
643  return gather_rgba<RGBAMask>(
644  __ESIMD_DNS::accessorToPointer<T>(acc, global_offset), offsets, mask);
645 #else
646  // TODO (performance) use hardware-supported scale once BE supports it
647  constexpr uint32_t Scale = 0;
648  const auto SI = get_surface_index(acc);
649  return __esimd_gather4_masked_scaled2<detail::__raw_t<T>, N, RGBAMask,
650  decltype(SI), Scale>(
651  SI, global_offset, offsets.data(), mask.data());
652 #endif
653 }
654 
669 template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR,
670  typename AccessorT, int N,
671  typename T = typename AccessorT::value_type>
672 __ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4 &&
673  !std::is_pointer_v<AccessorT>>
674 scatter_rgba(AccessorT acc, simd<uint32_t, N> offsets,
675  simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
676  uint32_t global_offset = 0, simd_mask<N> mask = 1) {
677  detail::validate_rgba_write_channel_mask<RGBAMask>();
678 #ifdef __ESIMD_FORCE_STATELESS_MEM
679  scatter_rgba<RGBAMask>(__ESIMD_DNS::accessorToPointer<T>(acc, global_offset),
680  offsets, vals, mask);
681 #else
682  // TODO (performance) use hardware-supported scale once BE supports it
683  constexpr uint32_t Scale = 0;
684  const auto SI = get_surface_index(acc);
685  __esimd_scatter4_scaled<T, N, decltype(SI), RGBAMask, Scale>(
686  mask.data(), SI, global_offset, offsets.data(), vals.data());
687 #endif
688 }
689 
691 
693 
694 namespace detail {
697 template <atomic_op Op, typename T, int N, unsigned NumSrc>
698 constexpr bool check_atomic() {
699  if constexpr (!detail::isPowerOf2(N, 32)) {
700  static_assert((detail::isPowerOf2(N, 32)),
701  "Execution size 1, 2, 4, 8, 16, 32 are supported");
702  return false;
703  }
704 
705  // No source operands.
706  if constexpr (Op == atomic_op::inc || Op == atomic_op::dec) {
707  if constexpr (NumSrc != 0) {
708  static_assert(NumSrc == 0, "No source operands are expected");
709  return false;
710  }
711  if constexpr (!is_type<T, uint16_t, uint32_t, uint64_t>()) {
712  static_assert((is_type<T, uint16_t, uint32_t, uint64_t>()),
713  "Type UW, UD or UQ is expected");
714  return false;
715  }
716  return true;
717  }
718 
719  // One source integer operand.
720  if constexpr (Op == atomic_op::add || Op == atomic_op::sub ||
721  Op == atomic_op::min || Op == atomic_op::max ||
722  Op == atomic_op::xchg || Op == atomic_op::bit_and ||
723  Op == atomic_op::bit_or || Op == atomic_op::bit_xor ||
724  Op == atomic_op::minsint || Op == atomic_op::maxsint) {
725  if constexpr (NumSrc != 1) {
726  static_assert(NumSrc == 1, "One source operand is expected");
727  return false;
728  }
729  if constexpr ((Op != atomic_op::minsint && Op != atomic_op::maxsint) &&
730  !is_type<T, uint16_t, uint32_t, uint64_t>()) {
731  static_assert((is_type<T, uint16_t, uint32_t, uint64_t>()),
732  "Type UW, UD or UQ is expected");
733  return false;
734  }
735  if constexpr ((Op == atomic_op::minsint || Op == atomic_op::maxsint) &&
736  !is_type<T, int16_t, int32_t, int64_t>()) {
737  static_assert((is_type<T, int16_t, int32_t, int64_t>()),
738  "Type W, D or Q is expected");
739  return false;
740  }
741  return true;
742  }
743 
744  // One source float operand.
745  if constexpr (Op == atomic_op::fmax || Op == atomic_op::fmin ||
746  Op == atomic_op::fadd || Op == atomic_op::fsub) {
747  if constexpr (NumSrc != 1) {
748  static_assert(NumSrc == 1, "One source operand is expected");
749  return false;
750  }
751  if constexpr (!is_type<T, float, sycl::half>()) {
752  static_assert((is_type<T, float, sycl::half>()),
753  "Type F or HF is expected");
754  return false;
755  }
756  return true;
757  }
758 
759  // Two source operands.
760  if constexpr (Op == atomic_op::cmpxchg || Op == atomic_op::fcmpwr) {
761  if constexpr (NumSrc != 2) {
762  static_assert(NumSrc == 2, "Two source operands are expected");
763  return false;
764  }
765  if constexpr (Op == atomic_op::cmpxchg &&
766  !is_type<T, uint16_t, uint32_t, uint64_t>()) {
767  static_assert((is_type<T, uint16_t, uint32_t, uint64_t>()),
768  "Type UW, UD or UQ is expected");
769  return false;
770  }
771  if constexpr (Op == atomic_op::fcmpwr && !is_type<T, float, sycl::half>()) {
772  static_assert((is_type<T, float, sycl::half>()),
773  "Type F or HF is expected");
774  return false;
775  }
776  return true;
777  }
778  // Unsupported svm atomic Op.
779  return false;
780 }
781 } // namespace detail
782 
784 
787 
805 template <atomic_op Op, typename Tx, int N, class T = detail::__raw_t<Tx>>
806 __ESIMD_API std::enable_if_t<detail::check_atomic<Op, Tx, N, 0>(), simd<Tx, N>>
807 atomic_update(Tx *p, simd<unsigned, N> offset, simd_mask<N> mask) {
808  simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
809  simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
810  vAddr += offset_i1;
811  return __esimd_svm_atomic0<Op, T, N>(vAddr.data(), mask.data());
812 }
813 
835 template <atomic_op Op, typename Tx, int N, class T = detail::__raw_t<Tx>>
836 __ESIMD_API std::enable_if_t<detail::check_atomic<Op, Tx, N, 1>(), simd<Tx, N>>
837 atomic_update(Tx *p, simd<unsigned, N> offset, simd<Tx, N> src0,
838  simd_mask<N> mask) {
839  simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
840  simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
841  vAddr += offset_i1;
842  return __esimd_svm_atomic1<Op, T, N>(vAddr.data(), src0.data(), mask.data());
843 }
844 
864 template <atomic_op Op, typename Tx, int N, class T = detail::__raw_t<Tx>>
865 __ESIMD_API std::enable_if_t<detail::check_atomic<Op, Tx, N, 2>(), simd<Tx, N>>
866 atomic_update(Tx *p, simd<unsigned, N> offset, simd<Tx, N> src0,
867  simd<Tx, N> src1, simd_mask<N> mask) {
868  simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
869  simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
870  vAddr += offset_i1;
871  return __esimd_svm_atomic2<Op, T, N>(vAddr.data(), src0.data(), src1.data(),
872  mask.data());
873 }
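A usage sketch for the zero- and one-source forms above (illustrative; assumes ctrs points to at least 8 uint32_t counters in device-visible memory and the code runs inside an ESIMD kernel; offsets are byte offsets):

    // Illustrative sketch: per-lane atomic increment and atomic add.
    inline void bump_counters(uint32_t *ctrs, simd<uint32_t, 8> deltas) SYCL_ESIMD_FUNCTION {
      simd<unsigned, 8> offsets(0, sizeof(uint32_t));
      simd_mask<8> mask = 1;
      atomic_update<atomic_op::inc>(ctrs, offsets, mask);         // ctrs[i] += 1
      atomic_update<atomic_op::add>(ctrs, offsets, deltas, mask); // ctrs[i] += deltas[i]
    }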
874 
876 
879 
882 enum fence_mask : uint8_t {
883  /// “Commit enable” - wait for fence to complete before continuing.
884  global_coherent_fence = 0x1,
885  /// Flush the instruction cache.
886  l3_flush_instructions = 0x2,
887  /// Flush sampler (texture) cache.
888  l3_flush_texture_data = 0x4,
889  /// Flush constant cache.
890  l3_flush_constant_data = 0x8,
891  /// Flush constant cache.
892  l3_flush_rw_data = 0x10,
893  /// Issue SLM memory barrier only. If not set, the memory barrier is global.
894  local_barrier = 0x20,
895  /// Flush L1 read-only data cache.
896  l1_flush_ro_data = 0x40,
897  /// Enable thread scheduling barrier.
898  sw_barrier = 0x80
899 };
900 
904 template <uint8_t cntl> __ESIMD_API void fence() { __esimd_fence(cntl); }
905 
906 __SYCL_DEPRECATED("use fence<fence_mask>()")
907 __ESIMD_API void fence(fence_mask cntl) { __esimd_fence(cntl); }
908 
917 __ESIMD_API void barrier() {
918  __esimd_fence(fence_mask::global_coherent_fence | fence_mask::local_barrier);
919  __esimd_barrier();
920 }
921 
923 
926 
929 template <uint32_t SLMSize> __ESIMD_API void slm_init() {
930  __esimd_slm_init(SLMSize);
931 }
932 
936 __ESIMD_API void slm_init(uint32_t size) { __esimd_slm_init(size); }
937 
943 template <typename T, int N>
944 __ESIMD_API
945  std::enable_if_t<(N == 1 || N == 8 || N == 16 || N == 32), simd<T, N>>
946 slm_gather(simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
947  detail::LocalAccessorMarker acc;
948  return detail::gather_impl<T, N>(acc, offsets, 0, mask);
949 }
950 
956 template <typename T> __ESIMD_API T slm_scalar_load(uint32_t offset) {
957  const simd<T, 1> Res = slm_gather<T, 1>(simd<uint32_t, 1>(offset));
958  return Res[0];
959 }
960 
966 template <typename T, int N>
967 __ESIMD_API std::enable_if_t<(N == 1 || N == 8 || N == 16 || N == 32) &&
968  (sizeof(T) <= 4)>
969 slm_scatter(simd<uint32_t, N> offsets, simd<T, N> vals, simd_mask<N> mask = 1) {
970  detail::LocalAccessorMarker acc;
971  detail::scatter_impl<T, N>(acc, vals, offsets, 0, mask);
972 }
973 
979 template <typename T>
980 __ESIMD_API void slm_scalar_store(uint32_t offset, T val) {
981  slm_scatter<T, 1>(simd<uint32_t, 1>(offset), simd<T, 1>(val), 1);
982 }
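An illustrative SLM round trip combining several of the SLM APIs above (a sketch of an ESIMD kernel body; sizes are assumptions). slm_init must be called before the first SLM access, and barrier makes the writes visible to the rest of the work-group:

    // Inside an ESIMD kernel (illustrative):
    slm_init(16 * sizeof(int));                      // reserve SLM for the work-group
    simd<uint32_t, 16> offsets(0, sizeof(int));      // byte offsets into SLM
    simd<int, 16> v(0, 1);                           // 0, 1, 2, ..., 15
    slm_scatter<int, 16>(offsets, v);                // write to SLM
    barrier();                                       // synchronize the work-group
    simd<int, 16> r = slm_gather<int, 16>(offsets);  // read back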
983 
994 template <typename T, int N, rgba_channel_mask RGBAMask>
995 __ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && (sizeof(T) == 4),
996  simd<T, N * get_num_channels_enabled(RGBAMask)>>
997 slm_gather_rgba(simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
998 
999  const auto SI = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker());
1000  return __esimd_gather4_masked_scaled2<T, N, RGBAMask>(
1001  SI, 0 /*global_offset*/, offsets.data(), mask.data());
1002 }
1003 
1014 template <typename T, int N, rgba_channel_mask Mask>
1015 __ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && (sizeof(T) == 4)>
1016 slm_scatter_rgba(simd<uint32_t, N> offsets,
1017  simd<T, N * get_num_channels_enabled(Mask)> vals,
1018  simd_mask<N> mask = 1) {
1019  detail::validate_rgba_write_channel_mask<Mask>();
1020  const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker());
1021  constexpr int16_t Scale = 0;
1022  constexpr int global_offset = 0;
1023  __esimd_scatter4_scaled<T, N, decltype(si), Mask, Scale>(
1024  mask.data(), si, global_offset, offsets.data(), vals.data());
1025 }
1026 
1035 template <typename T, int N>
1036 __ESIMD_API simd<T, N> slm_block_load(uint32_t offset) {
1037  constexpr unsigned Sz = sizeof(T) * N;
1038  static_assert(Sz >= detail::OperandSize::OWORD,
1039  "block size must be at least 1 oword");
1040  static_assert(Sz % detail::OperandSize::OWORD == 0,
1041  "block size must be whole number of owords");
1042  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
1043  "block must be 1, 2, 4 or 8 owords long");
1044  static_assert(Sz <= 16 * detail::OperandSize::OWORD,
1045  "block size must be at most 16 owords");
1046 
1047  const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker());
1048  return __esimd_oword_ld<detail::__raw_t<T>, N>(si, offset >> 4);
1049 }
1050 
1059 template <typename T, int N>
1060 __ESIMD_API void slm_block_store(uint32_t offset, simd<T, N> vals) {
1061  constexpr unsigned Sz = sizeof(T) * N;
1062  static_assert(Sz >= detail::OperandSize::OWORD,
1063  "block size must be at least 1 oword");
1064  static_assert(Sz % detail::OperandSize::OWORD == 0,
1065  "block size must be whole number of owords");
1066  static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD),
1067  "block must be 1, 2, 4 or 8 owords long");
1068  static_assert(Sz <= 8 * detail::OperandSize::OWORD,
1069  "block size must be at most 8 owords");
1070  const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker());
1071  // offset in genx.oword.st is in owords
1072  __esimd_oword_st<detail::__raw_t<T>, N>(si, offset >> 4, vals.data());
1073 }
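For contiguous SLM traffic the block forms are preferred; the offset is in bytes and must be oword (16-byte) aligned. A brief sketch (illustrative, continuing the previous kernel-body assumptions):

    // Illustrative sketch: 64-byte block store and load at SLM offset 0.
    slm_block_store<float, 16>(0, simd<float, 16>(1.0f));
    simd<float, 16> blk = slm_block_load<float, 16>(0);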
1074 
1078 template <atomic_op Op, typename Tx, int N, class T = detail::__raw_t<Tx>>
1079 __ESIMD_API std::enable_if_t<detail::check_atomic<Op, T, N, 0>(), simd<Tx, N>>
1080 slm_atomic_update(simd<uint32_t, N> offsets, simd_mask<N> mask) {
1081  const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker());
1082  return __esimd_dword_atomic0<Op, T, N>(mask.data(), si, offsets.data());
1083 }
1084 
1088 template <atomic_op Op, typename Tx, int N, class T = detail::__raw_t<Tx>>
1089 __ESIMD_API std::enable_if_t<detail::check_atomic<Op, T, N, 1>(), simd<Tx, N>>
1090 slm_atomic_update(simd<uint32_t, N> offsets, simd<Tx, N> src0,
1091  simd_mask<N> mask) {
1092  const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker());
1093  return __esimd_dword_atomic1<Op, T, N>(mask.data(), si, offsets.data(),
1094  src0.data());
1095 }
1096 
1100 template <atomic_op Op, typename Tx, int N, class T = detail::__raw_t<Tx>>
1101 __ESIMD_API std::enable_if_t<detail::check_atomic<Op, T, N, 2>(), simd<Tx, N>>
1102 slm_atomic_update(simd<uint32_t, N> offsets, simd<Tx, N> src0, simd<Tx, N> src1,
1103  simd_mask<N> mask) {
1104  const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker());
1105  return __esimd_dword_atomic2<Op, T, N>(mask.data(), si, offsets.data(),
1106  src0.data(), src1.data());
1107 }
1108 
1110 
1111 #ifndef __ESIMD_FORCE_STATELESS_MEM
1112 
1127 template <typename T, int m, int N, typename AccessorTy, unsigned plane = 0>
1128 __ESIMD_API simd<T, m * N> media_block_load(AccessorTy acc, unsigned x,
1129  unsigned y) {
1130  constexpr unsigned Width = N * sizeof(T);
1131  static_assert(Width * m <= 256u,
1132  "data does not fit into a single dataport transaction");
1133  static_assert(Width <= 64u, "valid block width is in range [1, 64]");
1134  static_assert(m <= 64u, "valid block height is in range [1, 64]");
1135  static_assert(plane <= 3u, "valid plane index is in range [0, 3]");
1136 
1137  const auto si = __ESIMD_GET_SURF_HANDLE(acc);
1138  using SurfIndTy = decltype(si);
1139  constexpr unsigned int RoundedWidth =
1140  Width < 4 ? 4 : detail::getNextPowerOf2<Width>();
1141  constexpr int BlockWidth = sizeof(T) * N;
1142  constexpr int Mod = 0;
1143 
1144  if constexpr (Width < RoundedWidth) {
1145  constexpr unsigned int n1 = RoundedWidth / sizeof(T);
1146  simd<T, m *n1> temp =
1147  __esimd_media_ld<T, m, n1, Mod, SurfIndTy, (int)plane, BlockWidth>(
1148  si, x, y);
1149  return temp.template select<m, 1, N, 1>(0, 0);
1150  } else {
1151  return __esimd_media_ld<T, m, N, Mod, SurfIndTy, (int)plane, BlockWidth>(
1152  si, x, y);
1153  }
1154 }
1155 
1168 template <typename T, int m, int N, typename AccessorTy, unsigned plane = 0>
1169 __ESIMD_API void media_block_store(AccessorTy acc, unsigned x, unsigned y,
1170  simd<T, m * N> vals) {
1171  constexpr unsigned Width = N * sizeof(T);
1172  static_assert(Width * m <= 256u,
1173  "data does not fit into a single dataport transaction");
1174  static_assert(Width <= 64u, "valid block width is in range [1, 64]");
1175  static_assert(m <= 64u, "valid block height is in range [1, 64]");
1176  static_assert(plane <= 3u, "valid plane index is in range [0, 3]");
1177  const auto si = __ESIMD_GET_SURF_HANDLE(acc);
1178  using SurfIndTy = decltype(si);
1179  constexpr unsigned int RoundedWidth =
1180  Width < 4 ? 4 : detail::getNextPowerOf2<Width>();
1181  constexpr unsigned int n1 = RoundedWidth / sizeof(T);
1182  constexpr int BlockWidth = sizeof(T) * N;
1183  constexpr int Mod = 0;
1184 
1185  if constexpr (Width < RoundedWidth) {
1186  simd<T, m * n1> temp;
1187  auto temp_ref = temp.template bit_cast_view<T, m, n1>();
1188  auto vals_ref = vals.template bit_cast_view<T, m, N>();
1189  temp_ref.template select<m, 1, N, 1>() = vals_ref;
1190  __esimd_media_st<T, m, n1, Mod, SurfIndTy, plane, BlockWidth>(si, x, y,
1191  temp.data());
1192  } else {
1193  __esimd_media_st<T, m, N, Mod, SurfIndTy, plane, BlockWidth>(si, x, y,
1194  vals.data());
1195  }
1196 }
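An illustrative 2D tile copy using the media block APIs (not from the header; assumes img_in and img_out are 2D image accessors over 32-bit pixels captured inside an ESIMD kernel, x is a byte offset within a row and y a row index):

    // Illustrative sketch: copy an 8x8 tile of 32-bit pixels between two images.
    template <typename AccIn, typename AccOut>
    void copy_tile_8x8(AccIn img_in, AccOut img_out, unsigned x, unsigned y) SYCL_ESIMD_FUNCTION {
      simd<unsigned, 8 * 8> tile = media_block_load<unsigned, 8, 8>(img_in, x, y);
      media_block_store<unsigned, 8, 8>(img_out, x, y, tile);
    }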
1197 #endif // !__ESIMD_FORCE_STATELESS_MEM
1198 
1200 
1201 #undef __ESIMD_GET_SURF_HANDLE
1202 
1204 
1205 namespace detail {
1206 
1207 // ----- Outlined implementations of simd_obj_impl class memory access APIs.
1208 
1209 template <typename T, int N, class T1, class SFINAE>
1210 template <typename Flags, int ChunkSize, typename>
1213  Flags) SYCL_ESIMD_FUNCTION {
1215  constexpr unsigned Size = sizeof(T) * N;
1216  constexpr unsigned Align = Flags::template alignment<T1>;
1217 
1218  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
1219  constexpr unsigned NumBlocks = Size / BlockSize;
1220  constexpr unsigned RemSize = Size % BlockSize;
1221 
1222  if constexpr (Align >= OperandSize::DWORD && Size % OperandSize::OWORD == 0 &&
1223  detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
1224  if constexpr (NumBlocks > 0) {
1225  constexpr unsigned BlockN = BlockSize / sizeof(T);
1226  ForHelper<NumBlocks>::unroll([BlockN, Addr, this](unsigned Block) {
1227  select<BlockN, 1>(Block * BlockN) =
1228  block_load<UT, BlockN, Flags>(Addr + (Block * BlockN), Flags{});
1229  });
1230  }
1231  if constexpr (RemSize > 0) {
1232  constexpr unsigned RemN = RemSize / sizeof(T);
1233  constexpr unsigned BlockN = BlockSize / sizeof(T);
1234  select<RemN, 1>(NumBlocks * BlockN) =
1235  block_load<UT, RemN, Flags>(Addr + (NumBlocks * BlockN), Flags{});
1236  }
1237  } else if constexpr (sizeof(T) == 8) {
1238  simd<int32_t, N * 2> BC(reinterpret_cast<const int32_t *>(Addr), Flags{});
1239  bit_cast_view<int32_t>() = BC;
1240  } else {
1241  constexpr unsigned NumChunks = N / ChunkSize;
1242  if constexpr (NumChunks > 0) {
1243  simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
1244  ForHelper<NumChunks>::unroll([Addr, &Offsets, this](unsigned Block) {
1245  select<ChunkSize, 1>(Block * ChunkSize) =
1246  gather<UT, ChunkSize>(Addr + (Block * ChunkSize), Offsets);
1247  });
1248  }
1249  constexpr unsigned RemN = N % ChunkSize;
1250  if constexpr (RemN > 0) {
1251  if constexpr (RemN == 1) {
1252  select<1, 1>(NumChunks * ChunkSize) = Addr[NumChunks * ChunkSize];
1253  } else if constexpr (RemN == 8 || RemN == 16) {
1254  simd<uint32_t, RemN> Offsets(0u, sizeof(T));
1255  select<RemN, 1>(NumChunks * ChunkSize) =
1256  gather<UT, RemN>(Addr + (NumChunks * ChunkSize), Offsets);
1257  } else {
1258  constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
1259  simd_mask_type<N1> Pred(0);
1260  Pred.template select<RemN, 1>() = 1;
1261  simd<uint32_t, N1> Offsets(0u, sizeof(T));
1262  simd<UT, N1> Vals =
1263  gather<UT, N1>(Addr + (NumChunks * ChunkSize), Offsets, Pred);
1264  select<RemN, 1>(NumChunks * ChunkSize) =
1265  Vals.template select<RemN, 1>();
1266  }
1267  }
1268  }
1269 }
1270 
1271 template <typename T, int N, class T1, class SFINAE>
1272 template <typename AccessorT, typename Flags, int ChunkSize, typename>
1273 ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_read,
1275 simd_obj_impl<T, N, T1, SFINAE>::copy_from(AccessorT acc, uint32_t offset,
1276  Flags) SYCL_ESIMD_FUNCTION {
1278  static_assert(sizeof(UT) == sizeof(T));
1279  constexpr unsigned Size = sizeof(T) * N;
1280  constexpr unsigned Align = Flags::template alignment<T1>;
1281 
1282  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
1283  constexpr unsigned NumBlocks = Size / BlockSize;
1284  constexpr unsigned RemSize = Size % BlockSize;
1285 
1286  if constexpr (Align >= OperandSize::DWORD && Size % OperandSize::OWORD == 0 &&
1287  detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
1288  if constexpr (NumBlocks > 0) {
1289  constexpr unsigned BlockN = BlockSize / sizeof(T);
1290  ForHelper<NumBlocks>::unroll([BlockN, acc, offset, this](unsigned Block) {
1291  select<BlockN, 1>(Block * BlockN) =
1292  block_load<UT, BlockN, AccessorT, Flags>(
1293  acc, offset + (Block * BlockSize), Flags{});
1294  });
1295  }
1296  if constexpr (RemSize > 0) {
1297  constexpr unsigned RemN = RemSize / sizeof(T);
1298  constexpr unsigned BlockN = BlockSize / sizeof(T);
1299  select<RemN, 1>(NumBlocks * BlockN) =
1300  block_load<UT, RemN, AccessorT, Flags>(
1301  acc, offset + (NumBlocks * BlockSize), Flags{});
1302  }
1303  } else if constexpr (sizeof(T) == 8) {
1304  simd<int32_t, N * 2> BC(acc, offset, Flags{});
1305  bit_cast_view<int32_t>() = BC;
1306  } else {
1307  constexpr unsigned NumChunks = N / ChunkSize;
1308  if constexpr (NumChunks > 0) {
1309  simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
1310  ForHelper<NumChunks>::unroll(
1311  [acc, offset, &Offsets, this](unsigned Block) {
1312  select<ChunkSize, 1>(Block * ChunkSize) =
1313  gather<UT, ChunkSize, AccessorT>(
1314  acc, Offsets, offset + (Block * ChunkSize * sizeof(T)));
1315  });
1316  }
1317  constexpr unsigned RemN = N % ChunkSize;
1318  if constexpr (RemN > 0) {
1319  if constexpr (RemN == 1 || RemN == 8 || RemN == 16) {
1320  simd<uint32_t, RemN> Offsets(0u, sizeof(T));
1321  select<RemN, 1>(NumChunks * ChunkSize) = gather<UT, RemN, AccessorT>(
1322  acc, Offsets, offset + (NumChunks * ChunkSize * sizeof(T)));
1323  } else {
1324  constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
1325  simd_mask_type<N1> Pred(0);
1326  Pred.template select<RemN, 1>() = 1;
1327  simd<uint32_t, N1> Offsets(0u, sizeof(T));
1328  simd<UT, N1> Vals = gather<UT, N1>(
1329  acc, Offsets, offset + (NumChunks * ChunkSize * sizeof(T)), Pred);
1330  select<RemN, 1>(NumChunks * ChunkSize) =
1331  Vals.template select<RemN, 1>();
1332  }
1333  }
1334  }
1335 }
1336 
1337 template <typename T, int N, class T1, class SFINAE>
1338 template <typename Flags, int ChunkSize, typename>
1341  Flags) const SYCL_ESIMD_FUNCTION {
1343  constexpr unsigned Size = sizeof(T) * N;
1344  constexpr unsigned Align = Flags::template alignment<T1>;
1345 
1346  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
1347  constexpr unsigned NumBlocks = Size / BlockSize;
1348  constexpr unsigned RemSize = Size % BlockSize;
1349 
1350  simd<UT, N> Tmp{data()};
1351  if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 &&
1352  detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
1353  if constexpr (NumBlocks > 0) {
1354  constexpr unsigned BlockN = BlockSize / sizeof(T);
1355  ForHelper<NumBlocks>::unroll([BlockN, Addr, &Tmp](unsigned Block) {
1356  block_store<UT, BlockN>(Addr + (Block * BlockN),
1357  Tmp.template select<BlockN, 1>(Block * BlockN));
1358  });
1359  }
1360  if constexpr (RemSize > 0) {
1361  constexpr unsigned RemN = RemSize / sizeof(T);
1362  constexpr unsigned BlockN = BlockSize / sizeof(T);
1363  block_store<UT, RemN>(Addr + (NumBlocks * BlockN),
1364  Tmp.template select<RemN, 1>(NumBlocks * BlockN));
1365  }
1366  } else if constexpr (sizeof(T) == 8) {
1367  simd<int32_t, N * 2> BC = Tmp.template bit_cast_view<int32_t>();
1368  BC.copy_to(reinterpret_cast<int32_t *>(Addr), Flags{});
1369  } else {
1370  constexpr unsigned NumChunks = N / ChunkSize;
1371  if constexpr (NumChunks > 0) {
1372  simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
1373  ForHelper<NumChunks>::unroll([Addr, &Offsets, &Tmp](unsigned Block) {
1374  scatter<UT, ChunkSize>(
1375  Addr + (Block * ChunkSize), Offsets,
1376  Tmp.template select<ChunkSize, 1>(Block * ChunkSize));
1377  });
1378  }
1379  constexpr unsigned RemN = N % ChunkSize;
1380  if constexpr (RemN > 0) {
1381  if constexpr (RemN == 1) {
1382  Addr[NumChunks * ChunkSize] = Tmp[NumChunks * ChunkSize];
1383  } else if constexpr (RemN == 8 || RemN == 16) {
1384  simd<uint32_t, RemN> Offsets(0u, sizeof(T));
1385  scatter<UT, RemN>(Addr + (NumChunks * ChunkSize), Offsets,
1386  Tmp.template select<RemN, 1>(NumChunks * ChunkSize));
1387  } else {
1388  constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
1389  simd_mask_type<N1> Pred(0);
1390  Pred.template select<RemN, 1>() = 1;
1391  simd<UT, N1> Vals;
1392  Vals.template select<RemN, 1>() =
1393  Tmp.template select<RemN, 1>(NumChunks * ChunkSize);
1394  simd<uint32_t, N1> Offsets(0u, sizeof(T));
1395  scatter<UT, N1>(Addr + (NumChunks * ChunkSize), Offsets, Vals, Pred);
1396  }
1397  }
1398  }
1399 }
1400 
1401 template <typename T, int N, class T1, class SFINAE>
1402 template <typename AccessorT, typename Flags, int ChunkSize, typename>
1403 ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_write,
1405 simd_obj_impl<T, N, T1, SFINAE>::copy_to(AccessorT acc, uint32_t offset,
1406  Flags) const SYCL_ESIMD_FUNCTION {
1408  constexpr unsigned Size = sizeof(T) * N;
1409  constexpr unsigned Align = Flags::template alignment<T1>;
1410 
1411  constexpr unsigned BlockSize = OperandSize::OWORD * 8;
1412  constexpr unsigned NumBlocks = Size / BlockSize;
1413  constexpr unsigned RemSize = Size % BlockSize;
1414 
1415  simd<UT, N> Tmp{data()};
1416 
1417  if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 &&
1418  detail::isPowerOf2(RemSize / OperandSize::OWORD)) {
1419  if constexpr (NumBlocks > 0) {
1420  constexpr unsigned BlockN = BlockSize / sizeof(T);
1421  ForHelper<NumBlocks>::unroll([BlockN, acc, offset, &Tmp](unsigned Block) {
1422  block_store<UT, BlockN, AccessorT>(
1423  acc, offset + (Block * BlockSize),
1424  Tmp.template select<BlockN, 1>(Block * BlockN));
1425  });
1426  }
1427  if constexpr (RemSize > 0) {
1428  constexpr unsigned RemN = RemSize / sizeof(T);
1429  constexpr unsigned BlockN = BlockSize / sizeof(T);
1430  block_store<UT, RemN, AccessorT>(
1431  acc, offset + (NumBlocks * BlockSize),
1432  Tmp.template select<RemN, 1>(NumBlocks * BlockN));
1433  }
1434  } else if constexpr (sizeof(T) == 8) {
1435  simd<int32_t, N * 2> BC = Tmp.template bit_cast_view<int32_t>();
1436  BC.copy_to(acc, offset, Flags{});
1437  } else {
1438  constexpr unsigned NumChunks = N / ChunkSize;
1439  if constexpr (NumChunks > 0) {
1440  simd<uint32_t, ChunkSize> Offsets(0u, sizeof(T));
1441  ForHelper<NumChunks>::unroll([acc, offset, &Offsets,
1442  &Tmp](unsigned Block) {
1443  scatter<UT, ChunkSize, AccessorT>(
1444  acc, Offsets, Tmp.template select<ChunkSize, 1>(Block * ChunkSize),
1445  offset + (Block * ChunkSize * sizeof(T)));
1446  });
1447  }
1448  constexpr unsigned RemN = N % ChunkSize;
1449  if constexpr (RemN > 0) {
1450  if constexpr (RemN == 1 || RemN == 8 || RemN == 16) {
1451  simd<uint32_t, RemN> Offsets(0u, sizeof(T));
1452  scatter<UT, RemN, AccessorT>(
1453  acc, Offsets, Tmp.template select<RemN, 1>(NumChunks * ChunkSize),
1454  offset + (NumChunks * ChunkSize * sizeof(T)));
1455  } else {
1456  constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32;
1457  simd_mask_type<N1> Pred(0);
1458  Pred.template select<RemN, 1>() = 1;
1459  simd<UT, N1> Vals;
1460  Vals.template select<RemN, 1>() =
1461  Tmp.template select<RemN, 1>(NumChunks * ChunkSize);
1462  simd<uint32_t, N1> Offsets(0u, sizeof(T));
1463  scatter<UT, N1, AccessorT>(acc, Offsets, Vals,
1464  offset + (NumChunks * ChunkSize * sizeof(T)),
1465  Pred);
1466  }
1467  }
1468  }
1469 }
1470 
1471 } // namespace detail
1473 
1474 } // namespace __ESIMD_NS
1475 } // __SYCL_INLINE_NAMESPACE(cl)