DPC++ Runtime
Runtime libraries for oneAPI DPC++
intrin.hpp
Go to the documentation of this file.
1 //==------------ intrin.hpp - DPC++ Explicit SIMD API --------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // Declares the Explicit SIMD intrinsics used to implement operations on
9 // objects of the SIMD classes.
10 //===----------------------------------------------------------------------===//
11 
12 #pragma once
13 
15 
19 
20 #include <assert.h>
21 #include <cstdint>
22 
23 // \brief __esimd_rdregion: region access intrinsic.
24 //
25 // @param T the element data type, one of i8, i16, i32, i64, half, float,
26 // double. In particular bool (i1) and pointer types are not allowed.
27 //
28 // @param N the input vector size.
29 //
30 // @param M the return vector size.
31 //
32 // @param VStride the vertical stride in elements between rows.
33 //
34 // @param Width the size or each row, non-zero and even divides `M`.
35 //
36 // @param Stride horizontal stride in elements within each row.
37 //
38 // @param ParentWidth the width of the input vector when viewed as a 2D
39 // matrix. Ignored if offset is a constant.
40 //
41 // @param Input the input vector
42 //
43 // @param Offset the starting offset in bytes.
44 //
45 // @return the region extracted.
46 //
47 // This intrinsic computes a vector Result:
48 //
49 // \code{.cpp}
50 // uint16_t EltOffset = Offset / sizeof(T);
51 // assert(Offset % sizeof(T) == 0);
52 //
53 // int NumRows = M / Width;
54 // assert(M % Width == 0);
55 //
56 // int Index = 0;
57 // for (int i = 0; i < NumRows; ++i) {
58 // for (int j = 0; j < Width; ++j) {
59 // Result[Index++] = Input[i * VStride + j * Stride +
60 // EltOffset];
61 // }
62 // }
63 // \endcode
64 //
65 template <typename T, int N, int M, int VStride, int Width, int Stride,
66  int ParentWidth = 0>
67 __ESIMD_INTRIN __ESIMD_DNS::vector_type_t<T, M>
68 __esimd_rdregion(__ESIMD_DNS::vector_type_t<T, N> Input, uint16_t Offset);
69 
70 template <typename T, int N, int M, int ParentWidth = 0>
71 __ESIMD_INTRIN __ESIMD_DNS::vector_type_t<T, M>
72 __esimd_rdindirect(__ESIMD_DNS::vector_type_t<T, N> Input,
73  __ESIMD_DNS::vector_type_t<uint16_t, M> Offset);
74 
75 // __esimd_wrregion returns the updated vector with the region updated.
76 //
77 // @param T the element data type, one of i8, i16, i32, i64, half, float,
78 // double. In particular bool (i1) and pointer types are not allowed.
79 //
80 // @param N the return vector size.
81 //
82 // @param M the vector size to write.
83 //
84 // @param VStride the vertical stride in elements between rows.
85 //
86 // @param Width the size or each row, non-zero and even divides `M`.
87 //
88 // @param Stride horizontal stride in elements within each row.
89 //
90 // @param ParentWidth the width of the input vector when viewed as a 2D
91 // matrix. Ignored if offset is a constant.
92 //
93 // @param OldVal the vector to write region into.
94 //
95 // @param NewVal the vector to write.
96 //
97 // @param Offset the starting offset in bytes.
98 //
99 // @return the updated vector with the region modifided.
100 //
101 // This intrinsic computes a vector Result:
102 //
103 // \code{.cpp}
104 // uint16_t EltOffset = Offset / sizeof(T);
105 // assert(Offset % sizeof(T) == 0);
106 //
107 // int NumRows = M / Width;
108 // assert(M % Width == 0);
109 //
110 // Result = OldValue;
111 // int Index = 0;
112 // for (int i = 0; i < NumRows; ++i) {
113 // for (int j = 0; j < Width; ++j) {
114 // if (Mask[Index])
115 // Result[i * VStride + j * Stride + EltOffset] = NewVal[Index];
116 // ++Index;
117 // }
118 // }
119 // \endcode
120 //
121 template <typename T, int N, int M, int VStride, int Width, int Stride,
122  int ParentWidth = 0>
123 __ESIMD_INTRIN std::enable_if_t<M <= N, __ESIMD_DNS::vector_type_t<T, N>>
124 __esimd_wrregion(__ESIMD_DNS::vector_type_t<T, N> OldVal,
125  __ESIMD_DNS::vector_type_t<T, M> NewVal, uint16_t Offset,
126  __ESIMD_DNS::simd_mask_storage_t<M> Mask = 1);
127 
128 template <typename T, int N, int M, int ParentWidth = 0>
129 __ESIMD_INTRIN std::enable_if_t<M <= N, __ESIMD_DNS::vector_type_t<T, N>>
130 __esimd_wrindirect(__ESIMD_DNS::vector_type_t<T, N> OldVal,
131  __ESIMD_DNS::vector_type_t<T, M> NewVal,
132  __ESIMD_DNS::vector_type_t<uint16_t, M> Offset,
133  __ESIMD_DNS::simd_mask_storage_t<M> Mask = 1);
134 
135 namespace sycl {
136 inline namespace _V1 {
137 namespace ext::intel::esimd::detail {
138 
139 template <class T> using __st = __raw_t<T>;
140 
142 template <typename BT, int BN, typename RTy>
143 __ESIMD_DNS::vector_type_t<__st<typename RTy::element_type>, RTy::length>
144  ESIMD_INLINE readRegion(
145  const __ESIMD_DNS::vector_type_t<__st<BT>, BN> &Base, RTy Region) {
146  using ElemTy = __st<typename RTy::element_type>;
147  auto Base1 = bitcast<ElemTy, __st<BT>, BN>(Base);
148  constexpr int Bytes = BN * sizeof(BT);
149  if constexpr (Bytes == RTy::Size_in_bytes)
150  // This is a no-op format.
151  return Base1;
152  else {
153  static_assert(!RTy::Is_2D);
154  constexpr int N = Bytes / sizeof(ElemTy);
155  // Access the region information.
156  constexpr int M = RTy::Size_x;
157  constexpr int Stride = RTy::Stride_x;
158  int16_t Offset = static_cast<int16_t>(Region.M_offset_x * sizeof(ElemTy));
159  // read-region
160  check_rdregion_params<N, M, /*VS*/ 0, M, Stride>();
161  return __esimd_rdregion<ElemTy, N, M, /*VS*/ 0, M, Stride>(Base1, Offset);
162  }
163 }
164 
166 template <typename BT, int BN, typename T, typename U>
167 ESIMD_INLINE
168  __ESIMD_DNS::vector_type_t<__st<typename T::element_type>, T::length>
169  readRegion(const __ESIMD_DNS::vector_type_t<__st<BT>, BN> &Base,
170  std::pair<T, U> Region) {
171  // parent-region type
172  using PaTy = typename shape_type<U>::type;
173  constexpr int BN1 = PaTy::length;
174  using BT1 = typename PaTy::element_type;
175  using ElemTy = __st<typename T::element_type>;
176  // Recursively read the base
177  auto Base1 = readRegion<BT, BN>(Base, Region.second);
178  if constexpr (!T::Is_2D || BN1 * sizeof(BT1) == T::Size_in_bytes)
179  // 1-D region or format
180  return readRegion<BT1, BN1>(Base1, Region.first);
181  else {
182  static_assert(T::Is_2D);
183  static_assert(std::is_same_v<ElemTy, __st<BT1>>);
184  // To read a 2D region, we need the parent region
185  // Read full rows with non-trivial vertical and horizontal stride = 1.
186  constexpr int M = T::Size_y * PaTy::Size_x;
187  constexpr int VS = T::Stride_y * PaTy::Size_x;
188  constexpr int W = PaTy::Size_x;
189  constexpr int HS = 1;
190  constexpr int ParentWidth = PaTy::Size_x;
191  uint16_t Offset = static_cast<uint16_t>(Region.first.M_offset_y *
192  PaTy::Size_x * sizeof(ElemTy));
193  check_rdregion_params<BN1, M, VS, W, HS>();
194  auto R =
195  __esimd_rdregion<ElemTy, BN1, M, VS, W, HS, ParentWidth>(Base1, Offset);
196 
197  // Read columns with non-trivial horizontal stride.
198  constexpr int N1 = M;
199  constexpr int M1 = T::length;
200  constexpr int VS1 = PaTy::Size_x;
201  constexpr int W1 = T::Size_x;
202  constexpr int HS1 = T::Stride_x;
203  uint16_t Offset1 =
204  static_cast<uint16_t>(Region.first.M_offset_x * sizeof(ElemTy));
205  check_rdregion_params<N1, M1, VS1, W1, HS1>();
206 
207  return __esimd_rdregion<ElemTy, N1, M1, VS1, W1, HS1, ParentWidth>(R,
208  Offset1);
209  }
210 }
211 
212 } // namespace ext::intel::esimd::detail
213 } // namespace _V1
214 } // namespace sycl
215 
216 // vload
217 //
218 // map to the backend vload intrinsic, used by compiler to control
219 // optimization on simd object
220 //
221 template <typename T, int N>
222 __ESIMD_INTRIN __ESIMD_DNS::vector_type_t<T, N>
223 __esimd_vload(const __ESIMD_DNS::vector_type_t<T, N> *ptr);
224 
225 // vstore
226 //
227 // map to the backend vstore intrinsic, used by compiler to control
228 // optimization on simd object
229 template <typename T, int N>
230 __ESIMD_INTRIN void __esimd_vstore(__ESIMD_DNS::vector_type_t<T, N> *ptr,
231  __ESIMD_DNS::vector_type_t<T, N> vals);
232 
233 template <typename T, int N>
234 __ESIMD_INTRIN uint16_t __esimd_any(__ESIMD_DNS::vector_type_t<T, N> src)
235 #ifdef __SYCL_DEVICE_ONLY__
236  ;
237 #else
238 {
239  for (unsigned int i = 0; i != N; i++) {
240  if (src[i] != 0)
241  return 1;
242  }
243  return 0;
244 }
245 #endif // __SYCL_DEVICE_ONLY__
246 
247 template <typename T, int N>
248 __ESIMD_INTRIN uint16_t __esimd_all(__ESIMD_DNS::vector_type_t<T, N> src)
249 #ifdef __SYCL_DEVICE_ONLY__
250  ;
251 #else
252 {
253  for (unsigned int i = 0; i != N; i++) {
254  if (src[i] == 0)
255  return 0;
256  }
257  return 1;
258 }
259 #endif // __SYCL_DEVICE_ONLY__
260 
261 #ifndef __SYCL_DEVICE_ONLY__
262 
263 // Implementations of ESIMD intrinsics for the SYCL host device
264 template <typename T, int N, int M, int VStride, int Width, int Stride,
265  int ParentWidth>
266 __ESIMD_INTRIN __ESIMD_DNS::vector_type_t<T, M>
267 __esimd_rdregion(__ESIMD_DNS::vector_type_t<T, N> Input, uint16_t Offset) {
268  uint16_t EltOffset = Offset / sizeof(T);
269  assert(Offset % sizeof(T) == 0);
270 
271  int NumRows = M / Width;
272  assert(M % Width == 0);
273 
274  __ESIMD_DNS::vector_type_t<T, M> Result;
275  int Index = 0;
276  for (int i = 0; i < NumRows; ++i) {
277  for (int j = 0; j < Width; ++j) {
278  Result[Index++] = Input[i * VStride + j * Stride + EltOffset];
279  }
280  }
281  return Result;
282 }
283 
284 template <typename T, int N, int M, int ParentWidth>
285 __ESIMD_INTRIN __ESIMD_DNS::vector_type_t<T, M>
286 __esimd_rdindirect(__ESIMD_DNS::vector_type_t<T, N> Input,
287  __ESIMD_DNS::vector_type_t<uint16_t, M> Offset) {
288  __ESIMD_DNS::vector_type_t<T, M> Result;
289  for (int i = 0; i < M; ++i) {
290  uint16_t EltOffset = Offset[i] / sizeof(T);
291  assert(Offset[i] % sizeof(T) == 0);
292  assert(EltOffset < N);
293  Result[i] = Input[EltOffset];
294  }
295  return Result;
296 }
297 
298 template <typename T, int N, int M, int VStride, int Width, int Stride,
299  int ParentWidth>
300 __ESIMD_INTRIN std::enable_if_t<M <= N, __ESIMD_DNS::vector_type_t<T, N>>
301 __esimd_wrregion(__ESIMD_DNS::vector_type_t<T, N> OldVal,
302  __ESIMD_DNS::vector_type_t<T, M> NewVal, uint16_t Offset,
303  __ESIMD_DNS::simd_mask_storage_t<M> Mask) {
304  uint16_t EltOffset = Offset / sizeof(T);
305  assert(Offset % sizeof(T) == 0);
306 
307  int NumRows = M / Width;
308  assert(M % Width == 0);
309 
310  __ESIMD_DNS::vector_type_t<T, N> Result = OldVal;
311  int Index = 0;
312  for (int i = 0; i < NumRows; ++i) {
313  for (int j = 0; j < Width; ++j) {
314  if (Mask[Index])
315  Result[i * VStride + j * Stride + EltOffset] = NewVal[Index];
316  ++Index;
317  }
318  }
319  return Result;
320 }
321 
322 template <typename T, int N, int M, int ParentWidth>
323 __ESIMD_INTRIN std::enable_if_t<M <= N, __ESIMD_DNS::vector_type_t<T, N>>
324 __esimd_wrindirect(__ESIMD_DNS::vector_type_t<T, N> OldVal,
325  __ESIMD_DNS::vector_type_t<T, M> NewVal,
326  __ESIMD_DNS::vector_type_t<uint16_t, M> Offset,
327  __ESIMD_DNS::simd_mask_storage_t<M> Mask) {
328  __ESIMD_DNS::vector_type_t<T, N> Result = OldVal;
329  for (int i = 0; i < M; ++i) {
330  if (Mask[i]) {
331  uint16_t EltOffset = Offset[i] / sizeof(T);
332  assert(Offset[i] % sizeof(T) == 0);
333  assert(EltOffset < N);
334  Result[EltOffset] = NewVal[i];
335  }
336  }
337  return Result;
338 }
339 #endif // __SYCL_DEVICE_ONLY__
340 
#ifdef __SYCL_DEVICE_ONLY__
// __esimd_bf_cvt: device-only conversion intrinsic from a vector of `From`
// to a vector of `To`.
//
// One of the two types must be _Float16. That type does not exist on host,
// so the intrinsic cannot be represented there and callers must emulate it.
template <typename To, typename From, int N>
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<To, N>
__esimd_bf_cvt(__ESIMD_DNS::vector_type_t<From, N> Val);
#endif // __SYCL_DEVICE_ONLY__
348 
#ifdef __SYCL_DEVICE_ONLY__
// __esimd_tf32_cvt: device-only conversion intrinsic from a vector of `From`
// to a vector of `To`. No host counterpart is declared in this file.
// NOTE(review): presumably a tf32 conversion, judging by the name - confirm
// against the backend intrinsic documentation.
template <typename To, typename From, int N>
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<To, N>
__esimd_tf32_cvt(__ESIMD_DNS::vector_type_t<From, N> Val);
#endif // __SYCL_DEVICE_ONLY__
Definition: access.hpp:18
ValueT length(const ValueT *a, const int len)
Calculate the square root of the input array.
Definition: math.hpp:436