DPC++ Runtime
Runtime libraries for oneAPI DPC++
intrin.hpp
Go to the documentation of this file.
1 //==------------ intrin.hpp - DPC++ Explicit SIMD API --------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // Declares Explicit SIMD intrinsics used to implement working with
9 // the SIMD classes objects.
10 //===----------------------------------------------------------------------===//
11 
12 #pragma once
13 
15 
19 
20 #include <assert.h>
21 #include <cstdint>
22 
23 // \brief __esimd_rdregion: region access intrinsic.
24 //
25 // @param T the element data type, one of i8, i16, i32, i64, half, float,
26 // double. In particular bool (i1) and pointer types are not allowed.
27 //
28 // @param N the input vector size.
29 //
30 // @param M the return vector size.
31 //
32 // @param VStride the vertical stride in elements between rows.
33 //
34 // @param Width the size or each row, non-zero and even divides `M`.
35 //
36 // @param Stride horizontal stride in elements within each row.
37 //
38 // @param ParentWidth the width of the input vector when viewed as a 2D
39 // matrix. Ignored if offset is a constant.
40 //
41 // @param Input the input vector
42 //
43 // @param Offset the starting offset in bytes.
44 //
45 // @return the region extracted.
46 //
47 // This intrinsic computes a vector Result:
48 //
49 // \code{.cpp}
50 // uint16_t EltOffset = Offset / sizeof(T);
51 // assert(Offset % sizeof(T) == 0);
52 //
53 // int NumRows = M / Width;
54 // assert(M % Width == 0);
55 //
56 // int Index = 0;
57 // for (int i = 0; i < NumRows; ++i) {
58 // for (int j = 0; j < Width; ++j) {
59 // Result[Index++] = Input[i * VStride + j * Stride +
60 // EltOffset];
61 // }
62 // }
63 // \endcode
64 //
65 template <typename T, int N, int M, int VStride, int Width, int Stride,
66  int ParentWidth = 0>
67 __ESIMD_INTRIN std::enable_if_t<(Width > 0) && M % Width == 0,
68  __ESIMD_DNS::vector_type_t<T, M>>
69 __esimd_rdregion(__ESIMD_DNS::vector_type_t<T, N> Input, uint16_t Offset);
70 
71 template <typename T, int N, int M, int ParentWidth = 0>
72 __ESIMD_INTRIN __ESIMD_DNS::vector_type_t<T, M>
73 __esimd_rdindirect(__ESIMD_DNS::vector_type_t<T, N> Input,
74  __ESIMD_DNS::vector_type_t<uint16_t, M> Offset);
75 
76 // __esimd_wrregion returns the updated vector with the region updated.
77 //
78 // @param T the element data type, one of i8, i16, i32, i64, half, float,
79 // double. In particular bool (i1) and pointer types are not allowed.
80 //
81 // @param N the return vector size.
82 //
83 // @param M the vector size to write.
84 //
85 // @param VStride the vertical stride in elements between rows.
86 //
87 // @param Width the size or each row, non-zero and even divides `M`.
88 //
89 // @param Stride horizontal stride in elements within each row.
90 //
91 // @param ParentWidth the width of the input vector when viewed as a 2D
92 // matrix. Ignored if offset is a constant.
93 //
94 // @param OldVal the vector to write region into.
95 //
96 // @param NewVal the vector to write.
97 //
98 // @param Offset the starting offset in bytes.
99 //
100 // @return the updated vector with the region modifided.
101 //
102 // This intrinsic computes a vector Result:
103 //
104 // \code{.cpp}
105 // uint16_t EltOffset = Offset / sizeof(T);
106 // assert(Offset % sizeof(T) == 0);
107 //
108 // int NumRows = M / Width;
109 // assert(M % Width == 0);
110 //
111 // Result = OldValue;
112 // int Index = 0;
113 // for (int i = 0; i < NumRows; ++i) {
114 // for (int j = 0; j < Width; ++j) {
115 // if (Mask[Index])
116 // Result[i * VStride + j * Stride + EltOffset] = NewVal[Index];
117 // ++Index;
118 // }
119 // }
120 // \endcode
121 //
122 template <typename T, int N, int M, int VStride, int Width, int Stride,
123  int ParentWidth = 0>
124 __ESIMD_INTRIN std::enable_if_t<M <= N && (Width > 0) && M % Width == 0,
125  __ESIMD_DNS::vector_type_t<T, N>>
126 __esimd_wrregion(__ESIMD_DNS::vector_type_t<T, N> OldVal,
127  __ESIMD_DNS::vector_type_t<T, M> NewVal, uint16_t Offset,
128  __ESIMD_DNS::simd_mask_storage_t<M> Mask = 1);
129 
130 template <typename T, int N, int M, int ParentWidth = 0>
131 __ESIMD_INTRIN std::enable_if_t<M <= N, __ESIMD_DNS::vector_type_t<T, N>>
132 __esimd_wrindirect(__ESIMD_DNS::vector_type_t<T, N> OldVal,
133  __ESIMD_DNS::vector_type_t<T, M> NewVal,
134  __ESIMD_DNS::vector_type_t<uint16_t, M> Offset,
135  __ESIMD_DNS::simd_mask_storage_t<M> Mask = 1);
136 
137 namespace sycl {
138 inline namespace _V1 {
139 namespace ext::intel::esimd::detail {
140 
141 template <class T> using __st = __raw_t<T>;
142 
144 template <typename BT, int BN, typename RTy>
145 __ESIMD_DNS::vector_type_t<__st<typename RTy::element_type>,
146  RTy::length> ESIMD_INLINE
147 readRegion(const __ESIMD_DNS::vector_type_t<__st<BT>, BN> &Base, RTy Region) {
148  using ElemTy = __st<typename RTy::element_type>;
149  auto Base1 = bitcast<ElemTy, __st<BT>, BN>(Base);
150  constexpr int Bytes = BN * sizeof(BT);
151  if constexpr (Bytes == RTy::Size_in_bytes)
152  // This is a no-op format.
153  return Base1;
154  else {
155  static_assert(!RTy::Is_2D);
156  constexpr int N = Bytes / sizeof(ElemTy);
157  // Access the region information.
158  constexpr int M = RTy::Size_x;
159  constexpr int Stride = RTy::Stride_x;
160  int16_t Offset = static_cast<int16_t>(Region.M_offset_x * sizeof(ElemTy));
161  // read-region
162  return __esimd_rdregion<ElemTy, N, M, /*VS*/ 0, M, Stride>(Base1, Offset);
163  }
164 }
165 
167 template <typename BT, int BN, typename T, typename U>
168 ESIMD_INLINE
169  __ESIMD_DNS::vector_type_t<__st<typename T::element_type>, T::length>
170  readRegion(const __ESIMD_DNS::vector_type_t<__st<BT>, BN> &Base,
171  std::pair<T, U> Region) {
172  // parent-region type
173  using PaTy = typename shape_type<U>::type;
174  constexpr int BN1 = PaTy::length;
175  using BT1 = typename PaTy::element_type;
176  using ElemTy = __st<typename T::element_type>;
177  // Recursively read the base
178  auto Base1 = readRegion<BT, BN>(Base, Region.second);
179  if constexpr (!T::Is_2D || BN1 * sizeof(BT1) == T::Size_in_bytes)
180  // 1-D region or format
181  return readRegion<BT1, BN1>(Base1, Region.first);
182  else {
183  static_assert(T::Is_2D);
184  static_assert(std::is_same_v<ElemTy, __st<BT1>>);
185  // To read a 2D region, we need the parent region
186  // Read full rows with non-trivial vertical and horizontal stride = 1.
187  constexpr int M = T::Size_y * PaTy::Size_x;
188  constexpr int VS = T::Stride_y * PaTy::Size_x;
189  constexpr int W = PaTy::Size_x;
190  constexpr int HS = 1;
191  constexpr int ParentWidth = PaTy::Size_x;
192  uint16_t Offset = static_cast<uint16_t>(Region.first.M_offset_y *
193  PaTy::Size_x * sizeof(ElemTy));
194 
195  auto R =
196  __esimd_rdregion<ElemTy, BN1, M, VS, W, HS, ParentWidth>(Base1, Offset);
197 
198  // Read columns with non-trivial horizontal stride.
199  constexpr int N1 = M;
200  constexpr int M1 = T::length;
201  constexpr int VS1 = PaTy::Size_x;
202  constexpr int W1 = T::Size_x;
203  constexpr int HS1 = T::Stride_x;
204  uint16_t Offset1 =
205  static_cast<uint16_t>(Region.first.M_offset_x * sizeof(ElemTy));
206 
207  return __esimd_rdregion<ElemTy, N1, M1, VS1, W1, HS1, ParentWidth>(R,
208  Offset1);
209  }
210 }
211 
212 } // namespace ext::intel::esimd::detail
213 } // namespace _V1
214 } // namespace sycl
215 
216 // vload
217 //
218 // map to the backend vload intrinsic, used by compiler to control
219 // optimization on simd object
220 //
221 template <typename T, int N>
222 __ESIMD_INTRIN __ESIMD_DNS::vector_type_t<T, N>
223 __esimd_vload(const __ESIMD_DNS::vector_type_t<T, N> *ptr);
224 
225 // vstore
226 //
227 // map to the backend vstore intrinsic, used by compiler to control
228 // optimization on simd object
229 template <typename T, int N>
230 __ESIMD_INTRIN void __esimd_vstore(__ESIMD_DNS::vector_type_t<T, N> *ptr,
231  __ESIMD_DNS::vector_type_t<T, N> vals);
232 
233 template <typename T, int N>
234 __ESIMD_INTRIN uint16_t __esimd_any(__ESIMD_DNS::vector_type_t<T, N> src)
235 #ifdef __SYCL_DEVICE_ONLY__
236  ;
237 #else
238 {
239  for (unsigned int i = 0; i != N; i++) {
240  if (src[i] != 0)
241  return 1;
242  }
243  return 0;
244 }
245 #endif // __SYCL_DEVICE_ONLY__
246 
247 template <typename T, int N>
248 __ESIMD_INTRIN uint16_t __esimd_all(__ESIMD_DNS::vector_type_t<T, N> src)
249 #ifdef __SYCL_DEVICE_ONLY__
250  ;
251 #else
252 {
253  for (unsigned int i = 0; i != N; i++) {
254  if (src[i] == 0)
255  return 0;
256  }
257  return 1;
258 }
259 #endif // __SYCL_DEVICE_ONLY__
260 
261 #ifndef __SYCL_DEVICE_ONLY__
262 
263 // Implementations of ESIMD intrinsics for the SYCL host device
264 template <typename T, int N, int M, int VStride, int Width, int Stride,
265  int ParentWidth>
266 __ESIMD_INTRIN std::enable_if_t<(Width > 0) && M % Width == 0,
267  __ESIMD_DNS::vector_type_t<T, M>>
268 __esimd_rdregion(__ESIMD_DNS::vector_type_t<T, N> Input, uint16_t Offset) {
269  uint16_t EltOffset = Offset / sizeof(T);
270  assert(Offset % sizeof(T) == 0);
271 
272  int NumRows = M / Width;
273  assert(M % Width == 0);
274 
275  __ESIMD_DNS::vector_type_t<T, M> Result;
276  int Index = 0;
277  for (int i = 0; i < NumRows; ++i) {
278  for (int j = 0; j < Width; ++j) {
279  Result[Index++] = Input[i * VStride + j * Stride + EltOffset];
280  }
281  }
282  return Result;
283 }
284 
285 template <typename T, int N, int M, int ParentWidth>
286 __ESIMD_INTRIN __ESIMD_DNS::vector_type_t<T, M>
287 __esimd_rdindirect(__ESIMD_DNS::vector_type_t<T, N> Input,
288  __ESIMD_DNS::vector_type_t<uint16_t, M> Offset) {
289  __ESIMD_DNS::vector_type_t<T, M> Result;
290  for (int i = 0; i < M; ++i) {
291  uint16_t EltOffset = Offset[i] / sizeof(T);
292  assert(Offset[i] % sizeof(T) == 0);
293  assert(EltOffset < N);
294  Result[i] = Input[EltOffset];
295  }
296  return Result;
297 }
298 
299 template <typename T, int N, int M, int VStride, int Width, int Stride,
300  int ParentWidth>
301 __ESIMD_INTRIN std::enable_if_t<M <= N && (Width > 0) && M % Width == 0,
302  __ESIMD_DNS::vector_type_t<T, N>>
303 __esimd_wrregion(__ESIMD_DNS::vector_type_t<T, N> OldVal,
304  __ESIMD_DNS::vector_type_t<T, M> NewVal, uint16_t Offset,
305  __ESIMD_DNS::simd_mask_storage_t<M> Mask) {
306  uint16_t EltOffset = Offset / sizeof(T);
307  assert(Offset % sizeof(T) == 0);
308 
309  int NumRows = M / Width;
310  assert(M % Width == 0);
311 
312  __ESIMD_DNS::vector_type_t<T, N> Result = OldVal;
313  int Index = 0;
314  for (int i = 0; i < NumRows; ++i) {
315  for (int j = 0; j < Width; ++j) {
316  if (Mask[Index])
317  Result[i * VStride + j * Stride + EltOffset] = NewVal[Index];
318  ++Index;
319  }
320  }
321  return Result;
322 }
323 
324 template <typename T, int N, int M, int ParentWidth>
325 __ESIMD_INTRIN std::enable_if_t<M <= N, __ESIMD_DNS::vector_type_t<T, N>>
326 __esimd_wrindirect(__ESIMD_DNS::vector_type_t<T, N> OldVal,
327  __ESIMD_DNS::vector_type_t<T, M> NewVal,
328  __ESIMD_DNS::vector_type_t<uint16_t, M> Offset,
329  __ESIMD_DNS::simd_mask_storage_t<M> Mask) {
330  __ESIMD_DNS::vector_type_t<T, N> Result = OldVal;
331  for (int i = 0; i < M; ++i) {
332  if (Mask[i]) {
333  uint16_t EltOffset = Offset[i] / sizeof(T);
334  assert(Offset[i] % sizeof(T) == 0);
335  assert(EltOffset < N);
336  Result[EltOffset] = NewVal[i];
337  }
338  }
339  return Result;
340 }
341 #endif // __SYCL_DEVICE_ONLY__
342 
#ifdef __SYCL_DEVICE_ONLY__
// Device-only conversion intrinsic. One of the two types has to be _Float16,
// which does not exist on the host, so there is no host declaration; callers
// must emulate the conversion there.
//
// @param Val the vector of From elements to convert.
// @return the vector of converted To elements.
template <typename To, typename From, int N>
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<To, N>
__esimd_bf_cvt(__ESIMD_DNS::vector_type_t<From, N> Val);
#endif // __SYCL_DEVICE_ONLY__
350 
#ifdef __SYCL_DEVICE_ONLY__
// Device-only conversion intrinsic; it is declared only when compiling for
// the device, so this file provides no host version.
// NOTE(review): presumably the TF32 counterpart of __esimd_bf_cvt - confirm.
//
// @param Val the vector of From elements to convert.
// @return the vector of converted To elements.
template <typename To, typename From, int N>
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<To, N>
__esimd_tf32_cvt(__ESIMD_DNS::vector_type_t<From, N> Val);
#endif // __SYCL_DEVICE_ONLY__
Definition: access.hpp:18
ValueT length(const ValueT *a, const int len)
Calculate the square root of the input array.
Definition: math.hpp:161