DPC++ Runtime
Runtime libraries for oneAPI DPC++
math.hpp
Go to the documentation of this file.
1 //==-------------- math.hpp - DPC++ Explicit SIMD API --------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // Implement experimental Explicit SIMD math APIs.
9 //===----------------------------------------------------------------------===//
10 
11 #pragma once
12 
17 
19 namespace __ESIMD_ENS {
20 
23 
34 template <typename T0, typename T1, int SZ, typename U,
35  class Sat = __ESIMD_NS::saturation_off_tag>
36 __ESIMD_API std::enable_if_t<std::is_integral<T0>::value &&
37  std::is_integral<T1>::value &&
38  std::is_integral<U>::value,
39  __ESIMD_NS::simd<T0, SZ>>
40 shl(__ESIMD_NS::simd<T1, SZ> src0, U src1, Sat sat = {}) {
41  using ComputationTy = __ESIMD_DNS::computation_type_t<decltype(src0), U>;
42  typename __ESIMD_DNS::simd_type<ComputationTy>::type Src0 = src0;
43  typename __ESIMD_DNS::simd_type<ComputationTy>::type Src1 = src1;
44 
45  if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_on_tag>) {
46  if constexpr (std::is_unsigned<T0>::value) {
47  if constexpr (std::is_unsigned<T1>::value)
48  return __esimd_uushl_sat<T0, T1, SZ>(Src0.data(), Src1.data());
49  else
50  return __esimd_usshl_sat<T0, T1, SZ>(Src0.data(), Src1.data());
51  } else {
52  if constexpr (std::is_signed<T1>::value)
53  return __esimd_sushl_sat<T0, T1, SZ>(Src0.data(), Src1.data());
54  else
55  return __esimd_ssshl_sat<T0, T1, SZ>(Src0.data(), Src1.data());
56  }
57  } else {
58  if constexpr (std::is_unsigned<T0>::value) {
59  if constexpr (std::is_unsigned<T1>::value)
60  return __esimd_uushl<T0, T1, SZ>(Src0.data(), Src1.data());
61  else
62  return __esimd_usshl<T0, T1, SZ>(Src0.data(), Src1.data());
63  } else {
64  if constexpr (std::is_signed<T1>::value)
65  return __esimd_sushl<T0, T1, SZ>(Src0.data(), Src1.data());
66  else
67  return __esimd_ssshl<T0, T1, SZ>(Src0.data(), Src1.data());
68  }
69  }
70 }
71 
81 template <typename T0, typename T1, typename T2,
82  class Sat = __ESIMD_NS::saturation_off_tag>
83 __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<T0>::value &&
84  __ESIMD_DNS::is_esimd_scalar<T1>::value &&
85  __ESIMD_DNS::is_esimd_scalar<T2>::value &&
86  std::is_integral<T0>::value &&
87  std::is_integral<T1>::value &&
88  std::is_integral<T2>::value,
89  std::remove_const_t<T0>>
90 shl(T1 src0, T2 src1, Sat sat = {}) {
91  using ComputationTy = __ESIMD_DNS::computation_type_t<T1, T2>;
92  typename __ESIMD_DNS::simd_type<ComputationTy>::type Src0 = src0;
93  typename __ESIMD_DNS::simd_type<ComputationTy>::type Src1 = src1;
94  __ESIMD_NS::simd<T0, 1> Result = esimd::shl<T0>(Src0, Src1, sat);
95  return Result[0];
96 }
97 
108 template <typename T0, typename T1, int SZ, typename U,
109  class Sat = __ESIMD_NS::saturation_off_tag>
110 __ESIMD_API std::enable_if_t<std::is_integral<T0>::value &&
111  std::is_integral<T1>::value &&
112  std::is_integral<U>::value,
113  __ESIMD_NS::simd<T0, SZ>>
114 shr(__ESIMD_NS::simd<T1, SZ> src0, U src1, Sat sat = {}) {
115  using ComputationTy = __ESIMD_DNS::computation_type_t<decltype(src0), U>;
116  typename __ESIMD_DNS::simd_type<ComputationTy>::type Src0 = src0;
117  typename __ESIMD_DNS::simd_type<ComputationTy>::type Src1 = src1;
118  // TODO H/W supports saturation with this op - map to more efficient version.
119  typename __ESIMD_DNS::simd_type<ComputationTy>::type Result =
120  Src0.data() >> Src1.data();
121 
122  if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
123  return Result;
124  else
125  return __ESIMD_NS::saturate<T0>(Result);
126 }
127 
137 template <typename T0, typename T1, typename T2,
138  class Sat = __ESIMD_NS::saturation_off_tag>
139 __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<T0>::value &&
140  __ESIMD_DNS::is_esimd_scalar<T1>::value &&
141  __ESIMD_DNS::is_esimd_scalar<T2>::value &&
142  std::is_integral<T0>::value &&
143  std::is_integral<T1>::value &&
144  std::is_integral<T2>::value,
145  std::remove_const_t<T0>>
146 shr(T1 src0, T2 src1, Sat sat = {}) {
147  using ComputationTy = __ESIMD_DNS::computation_type_t<T1, T2>;
148  typename __ESIMD_DNS::simd_type<ComputationTy>::type Src0 = src0;
149  typename __ESIMD_DNS::simd_type<ComputationTy>::type Src1 = src1;
150  __ESIMD_NS::simd<T0, 1> Result = esimd::shr<T0>(Src0, Src1, sat);
151  return Result[0];
152 }
153 
162 template <typename T0, typename T1, int SZ>
163 __ESIMD_API
164  std::enable_if_t<std::is_integral<T0>::value && std::is_integral<T1>::value,
165  __ESIMD_NS::simd<T0, SZ>>
166  rol(__ESIMD_NS::simd<T1, SZ> src0, __ESIMD_NS::simd<T1, SZ> src1) {
167  return __esimd_rol<T0, T1, SZ>(src0.data(), src1.data());
168 }
169 
178 template <typename T0, typename T1, int SZ, typename U>
179 __ESIMD_API std::enable_if_t<std::is_integral<T0>::value &&
180  std::is_integral<T1>::value &&
181  std::is_integral<U>::value,
182  __ESIMD_NS::simd<T0, SZ>>
183 rol(__ESIMD_NS::simd<T1, SZ> src0, U src1) {
184  using ComputationTy = __ESIMD_DNS::computation_type_t<decltype(src0), U>;
185  typename __ESIMD_DNS::simd_type<ComputationTy>::type Src0 = src0;
186  typename __ESIMD_DNS::simd_type<ComputationTy>::type Src1 = src1;
187  return __esimd_rol<T0>(Src0.data(), Src1.data());
188 }
189 
197 template <typename T0, typename T1, typename T2>
198 __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<T0>::value &&
199  __ESIMD_DNS::is_esimd_scalar<T1>::value &&
200  __ESIMD_DNS::is_esimd_scalar<T2>::value &&
201  std::is_integral<T0>::value &&
202  std::is_integral<T1>::value &&
203  std::is_integral<T2>::value,
204  std::remove_const_t<T0>>
205 rol(T1 src0, T2 src1) {
206  using ComputationTy = __ESIMD_DNS::computation_type_t<T1, T2>;
207  typename __ESIMD_DNS::simd_type<ComputationTy>::type Src0 = src0;
208  typename __ESIMD_DNS::simd_type<ComputationTy>::type Src1 = src1;
209  __ESIMD_NS::simd<T0, 1> Result = esimd::rol<T0>(Src0, Src1);
210  return Result[0];
211 }
212 
221 template <typename T0, typename T1, int SZ>
222 __ESIMD_API
223  std::enable_if_t<std::is_integral<T0>::value && std::is_integral<T1>::value,
224  __ESIMD_NS::simd<T0, SZ>>
225  ror(__ESIMD_NS::simd<T1, SZ> src0, __ESIMD_NS::simd<T1, SZ> src1) {
226  return __esimd_ror<T0, T1, SZ>(src0.data(), src1.data());
227 }
228 
237 template <typename T0, typename T1, int SZ, typename U>
238 __ESIMD_API std::enable_if_t<std::is_integral<T0>::value &&
239  std::is_integral<T1>::value &&
240  std::is_integral<U>::value,
241  __ESIMD_NS::simd<T0, SZ>>
242 ror(__ESIMD_NS::simd<T1, SZ> src0, U src1) {
243  using ComputationTy = __ESIMD_DNS::computation_type_t<decltype(src0), U>;
244  typename __ESIMD_DNS::simd_type<ComputationTy>::type Src0 = src0;
245  typename __ESIMD_DNS::simd_type<ComputationTy>::type Src1 = src1;
246  return __esimd_ror<T0>(Src0.data(), Src1.data());
247 }
248 
256 template <typename T0, typename T1, typename T2>
257 __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<T0>::value &&
258  __ESIMD_DNS::is_esimd_scalar<T1>::value &&
259  __ESIMD_DNS::is_esimd_scalar<T2>::value &&
260  std::is_integral<T0>::value &&
261  std::is_integral<T1>::value &&
262  std::is_integral<T2>::value,
263  std::remove_const_t<T0>>
264 ror(T1 src0, T2 src1) {
265  using ComputationTy = __ESIMD_DNS::computation_type_t<T1, T2>;
266  typename __ESIMD_DNS::simd_type<ComputationTy>::type Src0 = src0;
267  typename __ESIMD_DNS::simd_type<ComputationTy>::type Src1 = src1;
268  __ESIMD_NS::simd<T0, 1> Result = esimd::ror<T0>(Src0, Src1);
269  return Result[0];
270 }
271 
282 template <typename T0, typename T1, int SZ, typename U,
283  class Sat = __ESIMD_NS::saturation_off_tag>
284 __ESIMD_API std::enable_if_t<std::is_integral<T0>::value &&
285  std::is_integral<T1>::value &&
286  std::is_integral<U>::value,
287  __ESIMD_NS::simd<T0, SZ>>
288 lsr(__ESIMD_NS::simd<T1, SZ> src0, U src1, Sat sat = {}) {
289  using IntermedTy = __ESIMD_DNS::computation_type_t<T1, T1>;
290  typedef typename std::make_unsigned<IntermedTy>::type ComputationTy;
291  __ESIMD_NS::simd<ComputationTy, SZ> Src0 = src0;
292  // TODO H/W supports saturation with this op - map to more efficient version.
293  __ESIMD_NS::simd<ComputationTy, SZ> Result = Src0.data() >> src1.data();
294 
295  if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
296  return Result;
297  else
298  return __ESIMD_NS::saturate<T0>(Result);
299 }
300 
311 template <typename T0, typename T1, typename T2,
312  class Sat = __ESIMD_NS::saturation_off_tag>
313 __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<T0>::value &&
314  __ESIMD_DNS::is_esimd_scalar<T1>::value &&
315  __ESIMD_DNS::is_esimd_scalar<T2>::value &&
316  std::is_integral<T0>::value &&
317  std::is_integral<T1>::value &&
318  std::is_integral<T2>::value,
319  std::remove_const_t<T0>>
320 lsr(T1 src0, T2 src1, Sat sat = {}) {
321  using ComputationTy = __ESIMD_DNS::computation_type_t<T1, T2>;
322  typename __ESIMD_DNS::simd_type<ComputationTy>::type Src0 = src0;
323  typename __ESIMD_DNS::simd_type<ComputationTy>::type Src1 = src1;
324  __ESIMD_NS::simd<T0, 1> Result = esimd::lsr<T0>(Src0, Src1, sat);
325  return Result[0];
326 }
327 
338 template <typename T0, typename T1, int SZ, typename U,
339  class Sat = __ESIMD_NS::saturation_off_tag>
340 __ESIMD_API std::enable_if_t<std::is_integral<T0>::value &&
341  std::is_integral<T1>::value &&
342  std::is_integral<U>::value,
343  __ESIMD_NS::simd<T0, SZ>>
344 asr(__ESIMD_NS::simd<T1, SZ> src0, U src1, Sat sat = {}) {
345  using IntermedTy = __ESIMD_DNS::computation_type_t<T1, T1>;
346  typedef typename std::make_signed<IntermedTy>::type ComputationTy;
347  __ESIMD_NS::simd<ComputationTy, SZ> Src0 = src0;
348  // TODO H/W supports saturation with this op - map to more efficient version.
349  __ESIMD_NS::simd<ComputationTy, SZ> Result = Src0 >> src1;
350 
351  if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
352  return Result;
353  else
354  return __ESIMD_NS::saturate<T0>(Result);
355 }
356 
367 template <typename T0, typename T1, typename T2,
368  class Sat = __ESIMD_NS::saturation_off_tag>
369 __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<T0>::value &&
370  __ESIMD_DNS::is_esimd_scalar<T1>::value &&
371  __ESIMD_DNS::is_esimd_scalar<T2>::value &&
372  std::is_integral<T0>::value &&
373  std::is_integral<T1>::value &&
374  std::is_integral<T2>::value,
375  std::remove_const_t<T0>>
376 asr(T1 src0, T2 src1, Sat sat = {}) {
377  using ComputationTy = __ESIMD_DNS::computation_type_t<T1, T2>;
378  typename __ESIMD_DNS::simd_type<ComputationTy>::type Src0 = src0;
379  typename __ESIMD_DNS::simd_type<ComputationTy>::type Src1 = src1;
380  __ESIMD_NS::simd<T0, 1> Result = esimd::asr<T0>(Src0, Src1, sat);
381  return Result[0];
382 }
384 
387 
388 // imul
389 #ifndef ESIMD_HAS_LONG_LONG
390 // use mulh instruction for high half
391 template <typename T0, typename T1, typename U, int SZ>
392 ESIMD_NODEBUG
393  ESIMD_INLINE std::enable_if_t<__ESIMD_DNS::is_dword_type<T0>::value &&
394  __ESIMD_DNS::is_dword_type<T1>::value &&
395  __ESIMD_DNS::is_dword_type<U>::value,
396  __ESIMD_NS::simd<T0, SZ>>
397  imul(__ESIMD_NS::simd<T0, SZ> &rmd, __ESIMD_NS::simd<T1, SZ> src0, U src1) {
398  using ComputationTy = __ESIMD_DNS::computation_type_t<decltype(src0), U>;
399  typename __ESIMD_DNS::simd_type<ComputationTy>::type Src0 = src0;
400  typename __ESIMD_DNS::simd_type<ComputationTy>::type Src1 = src1;
401  rmd = Src0 * Src1;
402  if constexpr (std::is_unsigned<T0>::value)
403  return __esimd_umulh(Src0.data(), Src1.data());
404  else
405  return __esimd_smulh(Src0.data(), Src1.data());
406 }
407 
408 #else
409 // imul bdw+ version: use qw=dw*dw multiply.
410 // We need to special case SZ==1 to avoid "error: when select size is 1, the
411 // stride must also be 1" on the selects.
412 template <typename T0, typename T1, typename U, int SZ>
413 __ESIMD_API
414  std::enable_if_t<__ESIMD_DNS::is_dword_type<T0>::value &&
415  __ESIMD_DNS::is_dword_type<T1>::value &&
416  __ESIMD_DNS::is_dword_type<U>::value && SZ == 1,
417  __ESIMD_NS::simd<T0, SZ>>
418  imul(__ESIMD_NS::simd<T0, SZ> &rmd, __ESIMD_NS::simd<T1, SZ> src0, U src1) {
419  using ComputationTy =
420  __ESIMD_DNS::computation_type_t<decltype(rmd), long long>;
421  ComputationTy Product = convert<long long>(src0);
422  Product *= src1;
423  rmd = Product.bit_cast_view<T0>().select<1, 1>[0];
424  return Product.bit_cast_view<T0>().select<1, 1>[1];
425 }
426 
427 template <typename T0, typename T1, typename U, int SZ>
428 __ESIMD_API
429  std::enable_if_t<__ESIMD_DNS::is_dword_type<T0>::value &&
430  __ESIMD_DNS::is_dword_type<T1>::value &&
431  __ESIMD_DNS::is_dword_type<U>::value && SZ != 1,
432  __ESIMD_NS::simd<T0, SZ>>
433  imul(__ESIMD_NS::simd<T0, SZ> &rmd, __ESIMD_NS::simd<T1, SZ> src0, U src1) {
434  using ComputationTy =
435  __ESIMD_DNS::computation_type_t<decltype(rmd), long long>;
436  ComputationTy Product = convert<long long>(src0);
437  Product *= src1;
438  rmd = Product.bit_cast_view<T0>().select<SZ, 2>(0);
439  return Product.bit_cast_view<T0>().select<SZ, 2>(1);
440 }
441 #endif
442 
443 // TODO: document
444 template <typename T0, typename T1, typename U, int SZ>
445 __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<U>::value,
446  __ESIMD_NS::simd<T0, SZ>>
447 imul(__ESIMD_NS::simd<T0, SZ> &rmd, U src0, __ESIMD_NS::simd<T1, SZ> src1) {
448  return esimd::imul(rmd, src1, src0);
449 }
450 
451 // TODO: document
452 template <typename T0, typename T, typename U>
453 ESIMD_NODEBUG
454  ESIMD_INLINE std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<T>::value &&
455  __ESIMD_DNS::is_esimd_scalar<U>::value &&
456  __ESIMD_DNS::is_esimd_scalar<T0>::value,
457  T0>
458  imul(__ESIMD_NS::simd<T0, 1> &rmd, T src0, U src1) {
459  __ESIMD_NS::simd<T, 1> src_0 = src0;
460  __ESIMD_NS::simd<U, 1> src_1 = src1;
461  __ESIMD_NS::simd<T0, 1> res =
462  esimd::imul(rmd, src_0.select_all(), src_1.select_all());
463  return res[0];
464 }
465 
473 template <typename T, int SZ, typename U>
474 __ESIMD_API
475  std::enable_if_t<std::is_integral<T>::value && std::is_integral<U>::value,
476  __ESIMD_NS::simd<T, SZ>>
477  quot(__ESIMD_NS::simd<T, SZ> src0, U src1) {
478  return src0 / src1;
479 }
480 
487 template <typename T0, typename T1>
488 __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<T0>::value &&
489  __ESIMD_DNS::is_esimd_scalar<T1>::value &&
490  std::is_integral<T0>::value &&
491  std::is_integral<T1>::value,
492  std::remove_const_t<T0>>
493 quot(T0 src0, T1 src1) {
494  return src0 / src1;
495 }
496 
504 template <typename T, int SZ, typename U>
505 __ESIMD_API
506  std::enable_if_t<std::is_integral<T>::value && std::is_integral<U>::value,
507  __ESIMD_NS::simd<T, SZ>>
508  mod(__ESIMD_NS::simd<T, SZ> src0, U src1) {
509  return src0 % src1;
510 }
511 
518 template <typename T0, typename T1>
519 __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<T0>::value &&
520  __ESIMD_DNS::is_esimd_scalar<T1>::value &&
521  std::is_integral<T0>::value &&
522  std::is_integral<T1>::value,
523  std::remove_const_t<T0>>
524 mod(T0 src0, T1 src1) {
525  return src0 % src1;
526 }
527 
537 template <typename T, int SZ, typename U>
538 __ESIMD_API
539  std::enable_if_t<std::is_integral<T>::value && std::is_integral<U>::value,
540  __ESIMD_NS::simd<T, SZ>>
541  div(__ESIMD_NS::simd<T, SZ> &remainder, __ESIMD_NS::simd<T, SZ> src0,
542  U src1) {
543  remainder = src0 % src1;
544  return src0 / src1;
545 }
546 
556 template <typename T, int SZ, typename U>
557 __ESIMD_API
558  std::enable_if_t<std::is_integral<T>::value && std::is_integral<U>::value &&
559  __ESIMD_DNS::is_esimd_scalar<U>::value,
560  __ESIMD_NS::simd<T, SZ>>
561  div(__ESIMD_NS::simd<T, SZ> &remainder, U src0,
562  __ESIMD_NS::simd<T, SZ> src1) {
563  remainder = src0 % src1;
564  return src0 / src1;
565 }
566 
576 template <typename RT, typename T0, typename T1>
577 ESIMD_NODEBUG
578  ESIMD_INLINE std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<RT>::value &&
579  __ESIMD_DNS::is_esimd_scalar<T0>::value &&
580  __ESIMD_DNS::is_esimd_scalar<T1>::value,
581  std::remove_const_t<RT>>
582  div(__ESIMD_NS::simd<std::remove_const_t<RT>, 1> &remainder, T0 src0,
583  T1 src1) {
584  remainder[0] = src0 % src1;
585  return src0 / src1;
586 }
587 
588 // Dot product builtins
589 #if defined(ESIMD_GEN7_5) || defined(ESIMD_GEN8) || defined(ESIMD_GEN8_5) || \
590  defined(ESIMD_GEN9) || defined(ESIMD_GEN9_5)
591 
602 template <typename T0, typename T1, int SZ, typename U,
603  class Sat = __ESIMD_NS::saturation_off_tag>
604 __ESIMD_API __ESIMD_NS::simd<T0, SZ> dp2(__ESIMD_NS::simd<T1, SZ> src0, U src1,
605  Sat sat = {}) {
606  static_assert(SZ % 4 == 0, "result size is not a multiple of 4");
607  __ESIMD_NS::simd<float, SZ> Src0 = src0;
608  __ESIMD_NS::simd<float, SZ> Src1 = src1;
609  __ESIMD_NS::simd<float, SZ> Result = __esimd_dp2(Src0.data(), Src1.data());
610  if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
611  return Result;
612  else
613  return __ESIMD_NS::saturate<T0>(Result);
614 }
615 
626 template <typename T0, typename T1, int SZ, typename U,
627  class Sat = __ESIMD_NS::saturation_off_tag>
628 __ESIMD_API __ESIMD_NS::simd<T0, SZ> dp3(__ESIMD_NS::simd<T1, SZ> src0, U src1,
629  Sat sat = {}) {
630  static_assert(SZ % 4 == 0, "result size is not a multiple of 4");
631  __ESIMD_NS::simd<float, SZ> Src0 = src0;
632  __ESIMD_NS::simd<float, SZ> Src1 = src1;
633  __ESIMD_NS::simd<float, SZ> Result = __esimd_dp3(Src0.data(), Src1.data());
634  if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
635  return Result;
636  else
637  return __ESIMD_NS::saturate<T0>(Result);
638 }
639 
650 template <typename T0, typename T1, int SZ, typename U,
651  class Sat = __ESIMD_NS::saturation_off_tag>
652 __ESIMD_API __ESIMD_NS::simd<T0, SZ> dp4(__ESIMD_NS::simd<T1, SZ> src0, U src1,
653  Sat sat = {}) {
654  static_assert(SZ % 4 == 0, "result size is not a multiple of 4");
655  __ESIMD_NS::simd<float, SZ> Src0 = src0;
656  __ESIMD_NS::simd<float, SZ> Src1 = src1;
657  __ESIMD_NS::simd<float, SZ> Result = __esimd_dp4(Src0.data(), Src1.data());
658  if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
659  return Result;
660  else
661  return __ESIMD_NS::saturate<T0>(Result);
662 }
663 
674 template <typename T0, typename T1, typename U, int SZ,
675  class Sat = __ESIMD_NS::saturation_off_tag>
676 __ESIMD_API __ESIMD_NS::simd<T0, SZ> dph(__ESIMD_NS::simd<T1, SZ> src0, U src1,
677  Sat sat = {}) {
678  static_assert(SZ % 4 == 0, "result size is not a multiple of 4");
679  __ESIMD_NS::simd<float, SZ> Src0 = src0;
680  __ESIMD_NS::simd<float, SZ> Src1 = src1;
681  __ESIMD_NS::simd<float, SZ> Result = __esimd_dph(Src0.data(), Src1.data());
682  if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
683  return Result;
684  else
685  return __ESIMD_NS::saturate<T0>(Result);
686 }
687 
699 template <typename RT, typename T1, typename T2, int SZ,
700  class Sat = __ESIMD_NS::saturation_off_tag>
701 __ESIMD_API __ESIMD_NS::simd<RT, SZ> line(__ESIMD_NS::simd<T1, 4> src0,
702  __ESIMD_NS::simd<T2, SZ> src1,
703  Sat sat = {}) {
704  static_assert(SZ % 4 == 0, "result size is not a multiple of 4");
705 
706  __ESIMD_NS::simd<float, 4> Src0 = src0;
707  __ESIMD_NS::simd<float, SZ> Src1 = src1;
708  __ESIMD_NS::simd<float, SZ> Result = __esimd_line(Src0.data(), Src1.data());
709 
710  __ESIMD_NS::simd<RT, SZ> Result;
711  if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
712  return Result;
713  else
714  return __ESIMD_NS::saturate<RT>(Result);
715 }
716 
728 template <typename RT, typename T, int SZ,
729  class Sat = __ESIMD_NS::saturation_off_tag>
730 __ESIMD_API __ESIMD_NS::simd<RT, SZ>
731 line(float P, float Q, __ESIMD_NS::simd<T, SZ> src1, Sat sat = {}) {
732  __ESIMD_NS::simd<float, 4> Src0 = P;
733  Src0(3) = Q;
734  return esimd::line<RT>(Src0, src1, sat);
735 }
736 
737 #else
738 // The old implementation is to generate vISA IRs for dp2/dp3/dp4/dph/line.
739 // Now We change to use direct mul/add, and hope to generate mad instructions
740 // at the end, to still get the performance as good as HW solution.
741 // We rely on "pragma unroll" to get better code.
742 // The only input and return types for these APIs are floats.
743 // In order to be able to use the old emu code, we keep the template argument
744 // for the type, although the type "T" can only be float.
745 // We use enable_if to force the float type only.
746 // If the gen is not specified we warn the programmer that they are potentially
747 // using a less efficient implementation if not on GEN10 or above.
748 
759 template <typename T0, typename T1, int SZ, typename U,
760  class Sat = __ESIMD_NS::saturation_off_tag>
761 ESIMD_NODEBUG ESIMD_INLINE
762  std::enable_if_t<__ESIMD_DNS::is_fp_or_dword_type<T1>::value &&
763  std::is_floating_point<T1>::value &&
764  __ESIMD_DNS::is_fp_or_dword_type<U>::value &&
765  std::is_floating_point<U>::value,
766  __ESIMD_NS::simd<T0, SZ>>
767  dp2(__ESIMD_NS::simd<T1, SZ> src0, U src1, Sat sat = {}) {
768  static_assert(SZ % 4 == 0, "result size is not a multiple of 4");
769 
770  __ESIMD_NS::simd<float, SZ> Src1 = src1;
771  __ESIMD_NS::simd<float, SZ> Result;
772 #pragma unroll
773  for (int i = 0; i < SZ; i += 4) {
774  Result.select<4, 1>(i) = src0[i] * Src1[i] + src0[i + 1] * Src1[i + 1];
775  }
776  if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
777  return Result;
778  else
779  return __ESIMD_NS::saturate<T1>(Result);
780 }
781 
792 template <typename T0, typename T1, int SZ, typename U,
793  class Sat = __ESIMD_NS::saturation_off_tag>
794 ESIMD_NODEBUG ESIMD_INLINE
795  std::enable_if_t<__ESIMD_DNS::is_fp_or_dword_type<T1>::value &&
796  std::is_floating_point<T1>::value &&
797  __ESIMD_DNS::is_fp_or_dword_type<U>::value &&
798  std::is_floating_point<U>::value,
799  __ESIMD_NS::simd<T0, SZ>>
800  dp3(__ESIMD_NS::simd<T1, SZ> src0, U src1, Sat sat = {}) {
801  static_assert(SZ % 4 == 0, "result size is not a multiple of 4");
802 
803  __ESIMD_NS::simd<float, SZ> Src1 = src1;
804  __ESIMD_NS::simd<float, SZ> Result;
805 #pragma unroll
806  for (int i = 0; i < SZ; i += 4) {
807  Result.select<4, 1>(i) = src0[i] * Src1[i] + src0[i + 1] * Src1[i + 1] +
808  src0[i + 2] * Src1[i + 2];
809  }
810  if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
811  return Result;
812  else
813  return __ESIMD_NS::saturate<T1>(Result);
814 }
815 
826 template <typename T0, typename T1, int SZ, typename U,
827  class Sat = __ESIMD_NS::saturation_off_tag>
828 ESIMD_NODEBUG ESIMD_INLINE
829  std::enable_if_t<__ESIMD_DNS::is_fp_or_dword_type<T1>::value &&
830  std::is_floating_point<T1>::value &&
831  __ESIMD_DNS::is_fp_or_dword_type<U>::value &&
832  std::is_floating_point<U>::value,
833  __ESIMD_NS::simd<T0, SZ>>
834  dp4(__ESIMD_NS::simd<T1, SZ> src0, U src1, Sat sat = {}) {
835  static_assert(SZ % 4 == 0, "result size is not a multiple of 4");
836 
837  __ESIMD_NS::simd<T1, SZ> Src1 = src1;
838  __ESIMD_NS::simd<float, SZ> Result;
839 #pragma unroll
840  for (int i = 0; i < SZ; i += 4) {
841  Result.select<4, 1>(i) = src0[i] * Src1[i] + src0[i + 1] * Src1[i + 1] +
842  src0[i + 2] * Src1[i + 2] +
843  src0[i + 3] * Src1[i + 3];
844  }
845  if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
846  return Result;
847  else
848  return __ESIMD_NS::saturate<T1>(Result);
849 }
850 
861 template <typename T, typename U, int SZ,
862  class Sat = __ESIMD_NS::saturation_off_tag>
863 ESIMD_NODEBUG ESIMD_INLINE
864  std::enable_if_t<__ESIMD_DNS::is_fp_or_dword_type<T>::value &&
865  std::is_floating_point<T>::value &&
866  __ESIMD_DNS::is_fp_or_dword_type<U>::value &&
867  std::is_floating_point<U>::value,
868  __ESIMD_NS::simd<T, SZ>>
869  dph(__ESIMD_NS::simd<T, SZ> src0, U src1, Sat sat = {}) {
870  static_assert(SZ % 4 == 0, "result size is not a multiple of 4");
871 
872  __ESIMD_NS::simd<float, SZ> Src1 = src1;
873  __ESIMD_NS::simd<float, SZ> Result;
874 #pragma unroll
875  for (int i = 0; i < SZ; i += 4) {
876  Result.select<4, 1>(i) = src0[i] * Src1[i] + src0[i + 1] * Src1[i + 1] +
877  src0[i + 2] * Src1[i + 2] + 1.0 * Src1[i + 3];
878  }
879  if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
880  return Result;
881  else
882  return __ESIMD_NS::saturate<T>(Result);
883 }
884 
895 template <typename T, int SZ, class Sat = __ESIMD_NS::saturation_off_tag>
896 ESIMD_NODEBUG
897  ESIMD_INLINE std::enable_if_t<__ESIMD_DNS::is_fp_or_dword_type<T>::value &&
898  std::is_floating_point<T>::value,
899  __ESIMD_NS::simd<T, SZ>>
900  line(__ESIMD_NS::simd<T, 4> src0, __ESIMD_NS::simd<T, SZ> src1,
901  Sat sat = {}) {
902  static_assert(SZ % 4 == 0, "result size is not a multiple of 4");
903 
904  __ESIMD_NS::simd<T, SZ> Src1 = src1;
905  __ESIMD_NS::simd<T, SZ> Result;
906 #pragma unroll
907  for (int i = 0; i < SZ; i += 4) {
908  Result.select<4, 1>(i) = src0[0] * src1[i] + src0[3];
909  }
910 
911  if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
912  return Result;
913  else
914  return __ESIMD_NS::saturate<T>(Result);
915 }
916 
928 template <typename T, int SZ, class Sat = __ESIMD_NS::saturation_off_tag>
929 ESIMD_NODEBUG
930  ESIMD_INLINE std::enable_if_t<__ESIMD_DNS::is_fp_or_dword_type<T>::value &&
931  std::is_floating_point<T>::value,
932  __ESIMD_NS::simd<T, SZ>>
933  line(float P, float Q, __ESIMD_NS::simd<T, SZ> src1, Sat sat = {}) {
934  __ESIMD_NS::simd<T, 4> Src0 = P;
935  Src0(3) = Q;
936  return esimd::line<T>(Src0, src1, sat);
937 }
938 
939 #endif
940 
947 template <typename T, int SZ>
948 __ESIMD_API __ESIMD_NS::simd<T, SZ> frc(__ESIMD_NS::simd<T, SZ> src0) {
949  __ESIMD_NS::simd<float, SZ> Src0 = src0;
950  return __esimd_frc(Src0.data());
951 }
952 
958 template <typename T> __ESIMD_API T frc(T src0) {
959  __ESIMD_NS::simd<T, 1> Src0 = src0;
960  __ESIMD_NS::simd<T, 1> Result = esimd::frc<T>(Src0);
961  return Result[0];
962 }
963 
964 // lzd
965 template <typename RT, typename T0, int SZ,
966  class Sat = __ESIMD_NS::saturation_off_tag>
967 __ESIMD_API __ESIMD_NS::simd<RT, SZ> lzd(__ESIMD_NS::simd<T0, SZ> src0,
968  Sat sat = {}) {
969  // Saturation parameter ignored
970  __ESIMD_NS::simd<uint, SZ> Src0 = src0;
971  return __esimd_lzd<uint>(Src0.data());
972 }
973 
974 template <typename RT, typename T0, class Sat = __ESIMD_NS::saturation_off_tag>
975 ESIMD_NODEBUG
976  ESIMD_INLINE std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<RT>::value &&
977  __ESIMD_DNS::is_esimd_scalar<T0>::value,
978  std::remove_const_t<RT>>
979  lzd(T0 src0, Sat sat = {}) {
980  __ESIMD_NS::simd<T0, 1> Src0 = src0;
981  __ESIMD_NS::simd<RT, 1> Result = esimd::lzd<RT>(Src0);
982  return Result[0];
983 }
984 
985 // lrp
986 #if defined(ESIMD_GEN7_5) || defined(ESIMD_GEN8) || defined(ESIMD_GEN8_5) || \
987  defined(ESIMD_GEN9) || defined(ESIMD_GEN9_5)
988 
989 template <int SZ, typename U, typename V,
990  class Sat = __ESIMD_NS::saturation_off_tag>
991 __ESIMD_API __ESIMD_NS::simd<float, SZ> lrp(__ESIMD_NS::simd<float, SZ> src0,
992  U src1, V src2, Sat sat = {}) {
993  static_assert(SZ >= 4 && (SZ & 0x3) == 0,
994  "vector size must be a multiple of 4");
995  __ESIMD_NS::simd<float, SZ> Src1 = src1;
996  __ESIMD_NS::simd<float, SZ> Src2 = src2;
997  __ESIMD_NS::simd<float, SZ> Result =
998  __esimd_lrp<SZ>(src0.data(), Src1.data(), Src2.data());
999 
1000  if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
1001  return Result;
1002  else
1003  return __ESIMD_NS::saturate<float>(Result);
1004 }
1005 
1006 #else
1007 
1008 // The old implementation is to generate vISA IRs for lrp.
1009 // Now We change to use direct mul/add, and hope to generate mad instructions
1010 // at the end, to still get the performance as good as HW solution.
1011 // The only input and return types for these APIs are floats.
1012 // In order to be able to use the old emu code, we keep the template argument
1013 // for the type, although the type "T" can only be float.
1014 // We use enable_if to force the float type only.
1015 // If the gen is not specified we warn the programmer that they are potentially
1016 // using less efficient implementation.
1017 template <typename T, int SZ, typename U, typename V,
1018  class Sat = __ESIMD_NS::saturation_off_tag>
1019 ESIMD_NODEBUG ESIMD_INLINE
1020  std::enable_if_t<__ESIMD_DNS::is_fp_or_dword_type<T>::value &&
1021  std::is_floating_point<T>::value &&
1022  __ESIMD_DNS::is_fp_or_dword_type<U>::value &&
1023  std::is_floating_point<U>::value,
1024  __ESIMD_NS::simd<T, SZ>>
1025  lrp(__ESIMD_NS::simd<T, SZ> src0, U src1, V src2, Sat sat = {}) {
1026 
1027  __ESIMD_NS::simd<float, SZ> Src1 = src1;
1028  __ESIMD_NS::simd<float, SZ> Src2 = src2;
1029  __ESIMD_NS::simd<float, SZ> Result;
1030  Result = Src1 * src0 + Src2 * (1.0f - src0);
1031  if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
1032  return Result;
1033  else
1034  return __ESIMD_NS::saturate<T>(Result);
1035 }
1036 #endif
1037 
1039 
1042 
1044 template <typename T0, typename T1, int SZ>
1045 __ESIMD_API __ESIMD_NS::simd<T0, SZ> bf_reverse(__ESIMD_NS::simd<T1, SZ> src0) {
1046  __ESIMD_NS::simd<unsigned, SZ> Src0 = src0;
1047  return __esimd_bfrev<unsigned>(Src0.data());
1048 }
1049 
1051 template <typename T0, typename T1>
1052 ESIMD_NODEBUG
1053  ESIMD_INLINE std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<T0>::value &&
1054  __ESIMD_DNS::is_esimd_scalar<T1>::value,
1055  std::remove_const_t<T0>>
1056  bf_reverse(T1 src0) {
1057  __ESIMD_NS::simd<T1, 1> Src0 = src0;
1058  __ESIMD_NS::simd<T0, 1> Result = esimd::bf_reverse<T0>(Src0);
1059  return Result[0];
1060 }
1061 
1063 template <typename T0, typename T1, int SZ, typename U, typename V, typename W>
1064 ESIMD_NODEBUG ESIMD_INLINE
1065  std::enable_if_t<std::is_integral<T1>::value, __ESIMD_NS::simd<T0, SZ>>
1066  bf_insert(U src0, V src1, W src2, __ESIMD_NS::simd<T1, SZ> src3) {
1067  typedef typename __ESIMD_DNS::dword_type<T1> DT1;
1068  static_assert(std::is_integral<DT1>::value && sizeof(DT1) == sizeof(int),
1069  "operand conversion failed");
1070  __ESIMD_NS::simd<DT1, SZ> Src0 = src0;
1071  __ESIMD_NS::simd<DT1, SZ> Src1 = src1;
1072  __ESIMD_NS::simd<DT1, SZ> Src2 = src2;
1073  __ESIMD_NS::simd<DT1, SZ> Src3 = src3;
1074 
1075  return __esimd_bfi<DT1>(Src0.data(), Src1.data(), Src2.data(), Src3.data());
1076 }
1077 
1079 template <typename T0, typename T1, typename T2, typename T3, typename T4>
1080 ESIMD_NODEBUG
1081  ESIMD_INLINE std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<T0>::value &&
1082  __ESIMD_DNS::is_esimd_scalar<T4>::value,
1083  std::remove_const_t<T0>>
1084  bf_insert(T1 src0, T2 src1, T3 src2, T4 src3) {
1085  __ESIMD_NS::simd<T4, 1> Src3 = src3;
1086  __ESIMD_NS::simd<T0, 1> Result = esimd::bf_insert<T0>(src0, src1, src2, Src3);
1087  return Result[0];
1088 }
1089 
1091 template <typename T0, typename T1, int SZ, typename U, typename V>
1092 ESIMD_NODEBUG ESIMD_INLINE
1093  std::enable_if_t<std::is_integral<T1>::value, __ESIMD_NS::simd<T0, SZ>>
1094  bf_extract(U src0, V src1, __ESIMD_NS::simd<T1, SZ> src2) {
1095  typedef typename __ESIMD_DNS::dword_type<T1> DT1;
1096  static_assert(std::is_integral<DT1>::value && sizeof(DT1) == sizeof(int),
1097  "operand conversion failed");
1098  __ESIMD_NS::simd<DT1, SZ> Src0 = src0;
1099  __ESIMD_NS::simd<DT1, SZ> Src1 = src1;
1100  __ESIMD_NS::simd<DT1, SZ> Src2 = src2;
1101 
1102  return __esimd_sbfe<DT1>(Src0.data(), Src1.data(), Src2.data());
1103 }
1104 
1106 template <typename T0, typename T1, typename T2, typename T3>
1107 ESIMD_NODEBUG
1108  ESIMD_INLINE std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<T0>::value &&
1109  __ESIMD_DNS::is_esimd_scalar<T3>::value,
1110  std::remove_const_t<T0>>
1111  bf_extract(T1 src0, T2 src1, T3 src2) {
1112  __ESIMD_NS::simd<T3, 1> Src2 = src2;
1113  __ESIMD_NS::simd<T0, 1> Result = esimd::bf_extract<T0>(src0, src1, Src2);
1114  return Result[0];
1115 }
1116 
1118 
1121 
1122 // sincos
1123 template <int SZ, typename U, class Sat = __ESIMD_NS::saturation_off_tag>
1124 __ESIMD_API __ESIMD_NS::simd<float, SZ>
1125 sincos(__ESIMD_NS::simd<float, SZ> &dstcos, U src0, Sat sat = {}) {
1126  dstcos = __ESIMD_NS::cos(src0, sat);
1127  return __ESIMD_NS::sin(src0, sat);
1128 }
1129 
1130 // atan
1131 
1133 namespace detail {
1134 constexpr double HDR_CONST_PI = 3.1415926535897932384626433832795;
1135 } // namespace detail
1137 
1138 template <typename T, int SZ>
1139 ESIMD_NODEBUG ESIMD_INLINE
1140  std::enable_if_t<std::is_floating_point<T>::value, __ESIMD_NS::simd<T, SZ>>
1141  atan(__ESIMD_NS::simd<T, SZ> src0) {
1142  __ESIMD_NS::simd<T, SZ> Src0 = __ESIMD_NS::abs(src0);
1143 
1144  __ESIMD_NS::simd_mask<SZ> Neg = src0 < T(0.0);
1145  __ESIMD_NS::simd_mask<SZ> Gt1 = Src0 > T(1.0);
1146 
1147  Src0.merge(__ESIMD_NS::inv(Src0), Gt1);
1148 
1149  __ESIMD_NS::simd<T, SZ> Src0P2 = Src0 * Src0;
1150  __ESIMD_NS::simd<T, SZ> Src0P4 = Src0P2 * Src0P2;
1151 
1152  __ESIMD_NS::simd<T, SZ> Result =
1153  (Src0P4 * T(0.185696) + ((Src0 * T(0.787997) + T(0.63693)) * Src0P2) +
1154  Src0) /
1155  (((((Src0 * -T(0.000121387) + T(0.00202308)) * Src0P2) +
1156  (Src0 * -T(0.0149145)) + T(0.182569)) *
1157  Src0P4) +
1158  ((Src0 * T(0.395889) + T(1.12158)) * Src0P2) + (Src0 * T(0.636918)) +
1159  T(1.0));
1160 
1161  Result.merge(Result - T(detail::HDR_CONST_PI / 2.0), Gt1);
1162  Result.merge(Result, Neg);
1163  return Result;
1164 }
1165 
1166 template <typename T>
1167 __ESIMD_API std::enable_if_t<std::is_floating_point<T>::value, T> atan(T src0) {
1168  __ESIMD_NS::simd<T, 1> Src0 = src0;
1169  __ESIMD_NS::simd<T, 1> Result = esimd::atan(Src0);
1170  return Result[0];
1171 }
1172 
1173 // acos
1174 
1175 template <typename T, int SZ>
1176 ESIMD_NODEBUG ESIMD_INLINE
1177  std::enable_if_t<std::is_floating_point<T>::value, __ESIMD_NS::simd<T, SZ>>
1178  acos(__ESIMD_NS::simd<T, SZ> src0) {
1179  __ESIMD_NS::simd<T, SZ> Src0 = __ESIMD_NS::abs(src0);
1180 
1181  __ESIMD_NS::simd_mask<SZ> Neg = src0 < T(0.0);
1182  __ESIMD_NS::simd_mask<SZ> TooBig = Src0 >= T(0.999998);
1183 
1184  // Replace oversized values to ensure no possibility of sqrt of
1185  // a negative value later
1186  Src0.merge(T(0.0), TooBig);
1187 
1188  __ESIMD_NS::simd<T, SZ> Src01m = T(1.0) - Src0;
1189 
1190  __ESIMD_NS::simd<T, SZ> Src0P2 = Src01m * Src01m;
1191  __ESIMD_NS::simd<T, SZ> Src0P4 = Src0P2 * Src0P2;
1192 
1193  __ESIMD_NS::simd<T, SZ> Result =
1194  (((Src01m * T(0.015098965761299077) - T(0.005516443930088506)) * Src0P4) +
1195  ((Src01m * T(0.047654245891495528) + T(0.163910606547823220)) * Src0P2) +
1196  Src01m * T(2.000291665285952400) - T(0.000007239283986332)) *
1197  __ESIMD_NS::rsqrt(Src01m * T(2.0));
1198 
1199  Result.merge(T(0.0), TooBig);
1200  Result.merge(T(detail::HDR_CONST_PI) - Result, Neg);
1201  return Result;
1202 }
1203 
1204 template <typename T>
1205 __ESIMD_API std::enable_if_t<std::is_floating_point<T>::value, T> acos(T src0) {
1206  __ESIMD_NS::simd<T, 1> Src0 = src0;
1207  __ESIMD_NS::simd<T, 1> Result = esimd::acos(Src0);
1208  return Result[0];
1209 }
1210 
1211 // asin
1212 
1213 template <typename T, int SZ>
1214 ESIMD_NODEBUG ESIMD_INLINE
1215  std::enable_if_t<std::is_floating_point<T>::value, __ESIMD_NS::simd<T, SZ>>
1216  asin(__ESIMD_NS::simd<T, SZ> src0) {
1217  __ESIMD_NS::simd_mask<SZ> Neg = src0 < T(0.0);
1218 
1219  __ESIMD_NS::simd<T, SZ> Result =
1220  T(detail::HDR_CONST_PI / 2.0) - esimd::acos(__ESIMD_NS::abs(src0));
1221 
1222  Result.merge(-Result, Neg);
1223  return Result;
1224 }
1225 
1226 template <typename T>
1227 __ESIMD_API std::enable_if_t<std::is_floating_point<T>::value, T> asin(T src0) {
1228  __ESIMD_NS::simd<T, 1> Src0 = src0;
1229  __ESIMD_NS::simd<T, 1> Result = esimd::asin(Src0);
1230  return Result[0];
1231 }
1233 
1236 
1237 /* atan2_fast - a fast atan2 implementation */
1238 /* vector input */
1239 template <int N>
1240 __ESIMD_NS::simd<float, N> atan2_fast(__ESIMD_NS::simd<float, N> y,
1241  __ESIMD_NS::simd<float, N> x);
1242 /* scalar input */
1243 template <typename T> float atan2_fast(T y, T x);
1244 
1245 /* atan2 - atan2 implementation */
1246 /* For Vector input */
1247 template <int N>
1248 __ESIMD_NS::simd<float, N> atan2(__ESIMD_NS::simd<float, N> y,
1249  __ESIMD_NS::simd<float, N> x);
1250 /* scalar Input */
1251 template <typename T> float atan2(T y, T x);
1252 
1253 /* fmod: */
1254 /* vector input */
1255 template <int N>
1256 __ESIMD_NS::simd<float, N> fmod(__ESIMD_NS::simd<float, N> y,
1257  __ESIMD_NS::simd<float, N> x);
1258 /* scalar Input */
1259 template <typename T> float fmod(T y, T x);
1260 
1261 /* sin_emu - EU emulation for sin(x) */
1262 /* For Vector input */
1263 template <int N>
1264 __ESIMD_NS::simd<float, N> sin_emu(__ESIMD_NS::simd<float, N> x);
1265 /* scalar Input */
1266 template <typename T> float sin_emu(T x);
1267 
1268 /* cos_emu - EU emulation for cos(x) */
1269 /* For Vector input */
1270 template <int N>
1271 __ESIMD_NS::simd<float, N> cos_emu(__ESIMD_NS::simd<float, N> x);
1272 
1273 /* scalar Input */
1274 template <typename T> float cos_emu(T x);
1275 
1276 /* tanh_cody_waite - Cody-Waite implementation for tanh(x) */
1277 /* float input */
1278 float tanh_cody_waite(float x);
1279 /* vector input */
1280 template <int N>
1281 __ESIMD_NS::simd<float, N> tanh_cody_waite(__ESIMD_NS::simd<float, N> x);
1282 /* tanh - opencl like implementation for tanh(x) */
1283 /* float input */
1284 float tanh(float x);
1285 /* vector input */
1286 template <int N> __ESIMD_NS::simd<float, N> tanh(__ESIMD_NS::simd<float, N> x);
1287 
1288 /* ------------------------- Extended Math Routines
1289  * -------------------------------------------------*/
1290 
1292 
1293 namespace detail {
1294 static auto constexpr CONST_PI = 3.14159f;
1295 static auto constexpr CMPI = 3.14159265f;
1296 } // namespace detail
1297 
1299 
1300 // For vector input
1301 template <int N>
1302 ESIMD_INLINE __ESIMD_NS::simd<float, N>
1303 atan2_fast(__ESIMD_NS::simd<float, N> y, __ESIMD_NS::simd<float, N> x) {
1304  __ESIMD_NS::simd<float, N> a0;
1305  __ESIMD_NS::simd<float, N> a1;
1306  __ESIMD_NS::simd<float, N> atan2;
1307 
1308  __ESIMD_NS::simd_mask<N> mask = (y >= 0.0f);
1309  a0.merge(detail::CONST_PI * 0.5f, detail::CONST_PI * 1.5f, mask);
1310  a1.merge(0, detail::CONST_PI * 2.0f, mask);
1311 
1312  a1.merge(detail::CONST_PI, x < 0.0f);
1313 
1314  __ESIMD_NS::simd<float, N> xy = x * y;
1315  __ESIMD_NS::simd<float, N> x2 = x * x;
1316  __ESIMD_NS::simd<float, N> y2 = y * y;
1317 
1318  /* smallest such that 1.0+CONST_DBL_EPSILON != 1.0 */
1319  constexpr auto CONST_DBL_EPSILON = 0.00001f;
1320 
1321  a0 -= (xy / (y2 + x2 * 0.28f + CONST_DBL_EPSILON));
1322  a1 += (xy / (x2 + y2 * 0.28f + CONST_DBL_EPSILON));
1323 
1324  atan2.merge(a1, a0, y2 <= x2);
1325  return atan2;
1326 }
1327 
1328 // For Scalar Input
1329 template <> ESIMD_INLINE float atan2_fast(float y, float x) {
1330  __ESIMD_NS::simd<float, 1> vy = y;
1331  __ESIMD_NS::simd<float, 1> vx = x;
1332  __ESIMD_NS::simd<float, 1> atan2 = esimd::atan2_fast(vy, vx);
1333  return atan2[0];
1334 }
1335 
1336 // atan2
1337 // For Vector input
1338 template <int N>
1339 ESIMD_INLINE __ESIMD_NS::simd<float, N> atan2(__ESIMD_NS::simd<float, N> y,
1340  __ESIMD_NS::simd<float, N> x) {
1341  __ESIMD_NS::simd<float, N> v_distance;
1342  __ESIMD_NS::simd<float, N> v_y0;
1343  __ESIMD_NS::simd<float, N> atan2;
1344  __ESIMD_NS::simd_mask<N> mask;
1345 
1346  mask = (x < 0);
1347  v_y0.merge(detail::CONST_PI, 0, mask);
1348  v_distance = __ESIMD_NS::sqrt(x * x + y * y);
1349  mask = (__ESIMD_NS::abs<float>(y) < 0.000001f);
1350  atan2.merge(v_y0, (2 * esimd::atan((v_distance - x) / y)), mask);
1351  return atan2;
1352 }
1353 
1354 // For Scalar Input
1355 template <> ESIMD_INLINE float atan2(float y, float x) {
1356  float v_distance;
1357  float v_y0;
1358  __ESIMD_NS::simd<float, 1> atan2;
1359  __ESIMD_NS::simd_mask<1> mask;
1360 
1361  mask = (x < 0);
1362  v_y0 = mask[0] ? detail::CONST_PI : 0;
1363  v_distance = __ESIMD_NS::sqrt<float>(x * x + y * y);
1364  mask = (__ESIMD_NS::abs<float>(y) < 0.000001f);
1365  atan2.merge(v_y0, (2 * esimd::atan((v_distance - x) / y)), mask);
1366  return atan2[0];
1367 }
1368 
1369 // fmod:
1370 // For Vector input
1371 template <int N>
1372 ESIMD_INLINE __ESIMD_NS::simd<float, N> fmod(__ESIMD_NS::simd<float, N> y,
1373  __ESIMD_NS::simd<float, N> x) {
1374  __ESIMD_NS::simd<float, N> abs_x = __ESIMD_NS::abs(x);
1375  __ESIMD_NS::simd<float, N> abs_y = __ESIMD_NS::abs(y);
1376  auto fmod_sign_mask = (y.template bit_cast_view<int32_t>()) & 0x80000000;
1377 
1378  __ESIMD_NS::simd<float, N> reminder =
1379  abs_y - abs_x * __ESIMD_NS::trunc<float>(abs_y / abs_x);
1380 
1381  abs_x.merge(0.0, reminder >= 0);
1382  __ESIMD_NS::simd<float, N> fmod = reminder + abs_x;
1383  __ESIMD_NS::simd<float, N> fmod_abs = __ESIMD_NS::abs(fmod);
1384 
1385  auto fmod_bits =
1386  (fmod_abs.template bit_cast_view<int32_t>()) | fmod_sign_mask;
1387  return fmod_bits.template bit_cast_view<float>();
1388 }
1389 
1390 // For Scalar Input
1391 template <> ESIMD_INLINE float fmod(float y, float x) {
1392  return fmod(__ESIMD_NS::simd<float, 1>(y), __ESIMD_NS::simd<float, 1>(x))[0];
1393 }
1394 
1395 // sin_emu - EU emulation for sin(x)
1396 // For Vector input
1397 template <int N>
1398 ESIMD_INLINE __ESIMD_NS::simd<float, N> sin_emu(__ESIMD_NS::simd<float, N> x) {
1399  __ESIMD_NS::simd<float, N> x1;
1400  __ESIMD_NS::simd<float, N> x2;
1401  __ESIMD_NS::simd<float, N> t3;
1402 
1403  __ESIMD_NS::simd<float, N> sign;
1404  __ESIMD_NS::simd<float, N> fTrig;
1405  __ESIMD_NS::simd<float, N> TwoPI(6.2831853f);
1406  __ESIMD_NS::simd<float, N> CmpI(detail::CMPI);
1407  __ESIMD_NS::simd<float, N> OneP(1.f);
1408  __ESIMD_NS::simd<float, N> OneN(-1.f);
1409 
1410  x = esimd::fmod(x, TwoPI);
1411 
1412  x1.merge(CmpI - x, x - CmpI, (x <= detail::CMPI));
1413  x1.merge(x, (x <= detail::CMPI * 0.5f));
1414  x1.merge(CmpI * 2 - x, (x > detail::CMPI * 1.5f));
1415 
1416  sign.merge(OneN, OneP, (x > detail::CMPI));
1417 
1418  x2 = x1 * x1;
1419  t3 = x2 * x1 * 0.1666667f;
1420 
1421  fTrig =
1422  x1 + t3 * (OneN + x2 * 0.05f *
1423  (OneP + x2 * 0.0238095f *
1424  (OneN + x2 * 0.0138889f *
1425  (OneP - x2 * 0.0090909f))));
1426  fTrig *= sign;
1427  return fTrig;
1428 }
1429 
1430 // scalar Input
1431 template <typename T> ESIMD_INLINE float sin_emu(T x0) {
1432  __ESIMD_NS::simd<float, 1> x1;
1433  __ESIMD_NS::simd<float, 1> x2;
1434  __ESIMD_NS::simd<float, 1> t3;
1435 
1436  __ESIMD_NS::simd<float, 1> sign;
1437  __ESIMD_NS::simd<float, 1> fTrig;
1438  float TwoPI = detail::CMPI * 2.0f;
1439 
1440  __ESIMD_NS::simd<float, 1> x = esimd::fmod(x0, TwoPI);
1441 
1442  __ESIMD_NS::simd<float, 1> CmpI(detail::CMPI);
1443  __ESIMD_NS::simd<float, 1> OneP(1.f);
1444  __ESIMD_NS::simd<float, 1> OneN(-1.f);
1445 
1446  x1.merge(CmpI - x, x - CmpI, (x <= detail::CMPI));
1447  x1.merge(x, (x <= detail::CMPI * 0.5f));
1448  x1.merge(CmpI * 2.0f - x, (x > detail::CMPI * 1.5f));
1449 
1450  sign.merge(OneN, OneP, (x > detail::CMPI));
1451 
1452  x2 = x1 * x1;
1453  t3 = x2 * x1 * 0.1666667f;
1454 
1455  fTrig =
1456  x1 + t3 * (OneN + x2 * 0.05f *
1457  (OneP + x2 * 0.0238095f *
1458  (OneN + x2 * 0.0138889f *
1459  (OneP - x2 * 0.0090909f))));
1460  fTrig *= sign;
1461  return fTrig[0];
1462 }
1463 
1464 // cos_emu - EU emulation for sin(x)
1465 // For Vector input
1466 template <int N>
1467 ESIMD_INLINE __ESIMD_NS::simd<float, N> cos_emu(__ESIMD_NS::simd<float, N> x) {
1468  __ESIMD_NS::simd<float, N> x1;
1469  __ESIMD_NS::simd<float, N> x2;
1470  __ESIMD_NS::simd<float, N> t2;
1471  __ESIMD_NS::simd<float, N> t3;
1472 
1473  __ESIMD_NS::simd<float, N> sign;
1474  __ESIMD_NS::simd<float, N> fTrig;
1475  __ESIMD_NS::simd<float, N> TwoPI(6.2831853f);
1476  __ESIMD_NS::simd<float, N> CmpI(detail::CMPI);
1477  __ESIMD_NS::simd<float, N> OneP(1.f);
1478  __ESIMD_NS::simd<float, N> OneN(-1.f);
1479 
1480  x = esimd::fmod(x, TwoPI);
1481 
1482  x1.merge(x - detail::CMPI * 0.5f, CmpI * 1.5f - x, (x <= detail::CMPI));
1483  x1.merge(CmpI * 0.5f - x, (x <= detail::CMPI * 0.5f));
1484  x1.merge(x - detail::CMPI * 1.5f, (x > detail::CMPI * 1.5f));
1485 
1486  sign.merge(1, -1, ((x < detail::CMPI * 0.5f) | (x >= detail::CMPI * 1.5f)));
1487 
1488  x2 = x1 * x1;
1489  t3 = x2 * x1 * 0.1666667f;
1490  fTrig =
1491  x1 + t3 * (OneN + x2 * 0.05f *
1492  (OneP + x2 * 0.0238095f *
1493  (OneN + x2 * 0.0138889f *
1494  (OneP - x2 * 0.0090909f))));
1495  fTrig *= sign;
1496  return fTrig;
1497 }
1498 
1499 // scalar Input
1500 template <typename T> ESIMD_INLINE float cos_emu(T x0) {
1501  __ESIMD_NS::simd<float, 1> x1;
1502  __ESIMD_NS::simd<float, 1> x2;
1503  __ESIMD_NS::simd<float, 1> t3;
1504 
1505  __ESIMD_NS::simd<float, 1> sign;
1506  __ESIMD_NS::simd<float, 1> fTrig;
1507  float TwoPI = detail::CMPI * 2.0f;
1508 
1509  __ESIMD_NS::simd<float, 1> x = esimd::fmod(x0, TwoPI);
1510 
1511  __ESIMD_NS::simd<float, 1> CmpI(detail::CMPI);
1512  __ESIMD_NS::simd<float, 1> OneP(1.f);
1513  __ESIMD_NS::simd<float, 1> OneN(-1.f);
1514 
1515  x1.merge(x - detail::CMPI * 0.5f, CmpI * 1.5f - x, (x <= detail::CMPI));
1516  x1.merge(CmpI * 0.5f - x, (x <= detail::CMPI * 0.5f));
1517  x1.merge(x - detail::CMPI * 1.5f, (x > detail::CMPI * 1.5f));
1518 
1519  sign.merge(OneP, OneN,
1520  ((x < detail::CMPI * 0.5f) | (x >= detail::CMPI * 1.5f)));
1521 
1522  x2 = x1 * x1;
1523  t3 = x2 * x1 * 0.1666667f;
1524  fTrig =
1525  x1 + t3 * (OneN + x2 * 0.05f *
1526  (OneP + x2 * 0.0238095f *
1527  (OneN + x2 * 0.0138889f *
1528  (OneP - x2 * 0.0090909f))));
1529  fTrig *= sign;
1530  return fTrig[0];
1531 }
1532 
1534 namespace detail {
1535 
1536 template <int N>
1537 ESIMD_INLINE __ESIMD_NS::simd<float, N>
1538 tanh_cody_waite_impl(__ESIMD_NS::simd<float, N> x) {
1539  /*
1540  * 0 x_small x_medium x_large
1541  * | x | rational polynomial | 1 - 2/(1 + exp(2*x)) | 1
1542  *
1543  * rational polynomial for single precision = x + x * (g * (p[1] * g + p[0]) /
1544  * (g + q[0]) g = x^2 p0 = -0.82377 28127 E+00 p1 = -0.38310 10665 E-02 q0 =
1545  * 0.24713 19654 E+01 q1 = 1.00000 00000 E+00
1546  *
1547  */
1548 
1549  constexpr float p0 = -0.8237728127E+00f;
1550  constexpr float p1 = -0.3831010665E-02f;
1551  constexpr float q0 = 0.2471319654E+01f;
1552  constexpr float q1 = 1.0000000000E+00f;
1553  constexpr float xsmall = 4.22863966691620432990E-04f;
1554  constexpr float xmedium = 0.54930614433405484570f;
1555  constexpr float xlarge = 8.66433975699931636772f;
1556  constexpr float log2E = 1.442695f; // same as esimd::log(e)
1557 
1558  using RT = __ESIMD_NS::simd<float, N>;
1559 
1560  RT absX = __ESIMD_NS::abs(x);
1561  RT g = absX * absX;
1562 
1563  RT sign;
1564  sign.merge(-1.f, 1.f, x < 0.f);
1565 
1566  auto isLarge = absX > xlarge;
1567  auto minor = absX <= xlarge;
1568  auto isGtMed = minor & (absX > xmedium);
1569  auto isGtSmall = (absX > xsmall) & (absX <= xmedium);
1570 
1571  RT res;
1572  res.merge(sign, x, isLarge);
1573  auto temp = __ESIMD_NS::exp(absX * 2.0f * log2E) + 1.f;
1574  temp = ((temp - 2.f) / temp) * sign;
1575  res.merge(temp, isGtMed);
1576  res.merge((absX + absX * g * (g * p1 + p0) / (g + q0)) * sign, isGtSmall);
1577 
1578  return res;
1579 }
1580 
1581 template <int N>
1582 ESIMD_INLINE __ESIMD_NS::simd<float, N>
1583 tanh_impl(__ESIMD_NS::simd<float, N> x) {
1584  /*
1585  * 0 x_small x_large
1586  * | x | ( exp(x) - exp(-x) ) / ( exp(x) + exp(-x) ) | 1
1587  *
1588  */
1589 
1590  constexpr float xsmall = 0.000045f; // same as exp(-10.0f)
1591  constexpr float xlarge = 88.f;
1592  constexpr float log2E = 1.442695f; // same as esimd::log(e)
1593 
1594  using RT = __ESIMD_NS::simd<float, N>;
1595 
1596  RT absX = __ESIMD_NS::abs(x);
1597 
1598  RT sign;
1599  sign.merge(-1.f, 1.f, x < 0.f);
1600 
1601  auto isLarge = (absX > xlarge);
1602  auto isLessE = (absX <= xlarge);
1603 
1604  RT res;
1605  res.merge(sign, x, isLarge);
1606 
1607  RT exp;
1608  exp = __ESIMD_NS::exp(absX * 2.f * log2E);
1609 
1610  res.merge(((exp - 1.f) / (exp + 1.f)) * sign, (absX > xsmall) & isLessE);
1611 
1612  return res;
1613 }
1614 } // namespace detail
1616 
1617 /* tanh_cody_waite - Cody-Waite implementation for tanh(x) */
1618 /* float input */
1619 ESIMD_INLINE float tanh_cody_waite(float x) {
1620  return detail::tanh_cody_waite_impl(__ESIMD_NS::simd<float, 1>(x))[0];
1621 }
1622 /* vector input */
1623 template <int N>
1624 ESIMD_INLINE __ESIMD_NS::simd<float, N>
1625 tanh_cody_waite(__ESIMD_NS::simd<float, N> x) {
1626  return detail::tanh_cody_waite_impl(x);
1627 }
1628 
1629 /* tanh - opencl like implementation for tanh(x) */
1630 /* float input */
1631 ESIMD_INLINE float tanh(float x) {
1632  return esimd::detail::tanh_impl(__ESIMD_NS::simd<float, 1>(x))[0];
1633 }
1634 /* vector input */
1635 template <int N>
1636 ESIMD_INLINE __ESIMD_NS::simd<float, N> tanh(__ESIMD_NS::simd<float, N> x) {
1637  return esimd::detail::tanh_impl(x);
1638 }
1639 
1640 template <typename T, int N>
1641 __ESIMD_NS::simd<T, N> dp4(__ESIMD_NS::simd<T, N> v1,
1642  __ESIMD_NS::simd<T, N> v2) {
1643  auto retv = __esimd_dp4<T, N>(v1.data(), v2.data());
1644  return retv;
1645 }
1646 
1648 
1650 // dpas helpers
1651 namespace detail {
1652 
1653 enum class dpas_ops_per_channel : unsigned {
1654  OP1 = 1u,
1655  OP2 = 2u,
1656  OP4 = 4u,
1657  OP8 = 8u,
1658  INVALID = 0xffffffffu
1659 };
1660 constexpr dpas_ops_per_channel
1661 get_ops_per_channel(argument_type src1_precision,
1662  argument_type src2_precision) {
1663  if ((src1_precision == argument_type::U8) ||
1664  (src1_precision == argument_type::S8)) {
1665  if ((src2_precision == argument_type::U8) ||
1666  (src2_precision == argument_type::S8) ||
1667  (src2_precision == argument_type::U4) ||
1668  (src2_precision == argument_type::S4) ||
1669  (src2_precision == argument_type::U2) ||
1670  (src2_precision == argument_type::S2)) {
1671  return dpas_ops_per_channel::OP4;
1672  }
1673  } else if ((src1_precision == argument_type::U4) ||
1674  (src1_precision == argument_type::S4) ||
1675  (src1_precision == argument_type::U2) ||
1676  (src1_precision == argument_type::S2)) {
1677  if ((src2_precision == argument_type::U8) ||
1678  (src2_precision == argument_type::S8)) {
1679  return dpas_ops_per_channel::OP4;
1680  } else if ((src2_precision == argument_type::U4) ||
1681  (src2_precision == argument_type::S4) ||
1682  (src2_precision == argument_type::U2) ||
1683  (src2_precision == argument_type::S2)) {
1684  return dpas_ops_per_channel::OP8;
1685  }
1686  } else if ((src1_precision == argument_type::BF16) &&
1687  (src2_precision == argument_type::BF16)) {
1688  return dpas_ops_per_channel::OP2;
1689  } else if ((src1_precision == argument_type::FP16) &&
1690  (src2_precision == argument_type::FP16)) {
1691  return dpas_ops_per_channel::OP2;
1692  } else if ((src1_precision == argument_type::TF32) &&
1693  (src2_precision == argument_type::TF32)) {
1694  return dpas_ops_per_channel::OP1;
1695  }
1696  return dpas_ops_per_channel::INVALID;
1697 }
1698 
1699 constexpr unsigned get_precision_bits(argument_type src_precision) {
1700  if ((src_precision == argument_type::U8) ||
1701  (src_precision == argument_type::S8)) {
1702  return 8;
1703  } else if ((src_precision == argument_type::U4) ||
1704  (src_precision == argument_type::S4)) {
1705  return 4;
1706  } else if ((src_precision == argument_type::U2) ||
1707  (src_precision == argument_type::S2)) {
1708  return 2;
1709  } else if ((src_precision == argument_type::BF16) ||
1710  (src_precision == argument_type::FP16)) {
1711  return 16;
1712  } else if (src_precision == argument_type::TF32) {
1713  return 32;
1714  }
1715  return 0;
1716 }
1717 
1718 } // namespace detail
1720 
1724 
1737 template <argument_type src1_precision, argument_type src2_precision,
1738  typename T, int systolic_depth, int repeat_count, typename T0,
1739  typename T1, typename T2, int N, int N1, int N2,
1740  typename Sat = __ESIMD_NS::saturation_off_tag>
1741 __ESIMD_API __ESIMD_NS::simd<T, N>
1742 dpas(__ESIMD_NS::simd<T0, N> src0, __ESIMD_NS::simd<T1, N1> src1,
1743  __ESIMD_NS::simd<T2, N2> src2, Sat sat = {}) {
1744  // types: dst, src0, src1, src2
1745  // ud, d | ud, d | ub, b | ub, b
1746  // ud, d | ud, d | u4, s4, u2, s2 | ub, b
1747  // ud, d | ud, d | ub, b | u4, s4, u2, s2
1748  // ud, d | ud, d | u4, s4, u2, s2 | u4, s4, u2, s2
1749  constexpr bool check_integer =
1750  detail::is_one_of_v<T, unsigned int, int> &&
1751  detail::is_one_of_v<T0, unsigned int, int> &&
1752  detail::is_one_of_enum_v<argument_type, src1_precision, argument_type::S8,
1753  argument_type::U8, argument_type::U4,
1754  argument_type::S4, argument_type::U2,
1755  argument_type::S2> &&
1756  detail::is_one_of_enum_v<argument_type, src2_precision, argument_type::S8,
1757  argument_type::U8, argument_type::U4,
1758  argument_type::S4, argument_type::U2,
1759  argument_type::S2>;
1760  // f, bf | f, bf | bf | bf
1761  constexpr bool check_bf16 =
1762  detail::is_one_of_v<T, float, short> &&
1763  detail::is_one_of_v<T0, float, short> &&
1764  detail::is_one_of_enum_v<argument_type, src1_precision,
1765  argument_type::BF16> &&
1766  detail::is_one_of_enum_v<argument_type, src2_precision,
1767  argument_type::BF16>;
1768  // f,hf | f, hf | hf | hf
1769  constexpr bool check_hf =
1770  detail::is_one_of_v<T, float, half> &&
1771  detail::is_one_of_v<T0, float, half> &&
1772  detail::is_one_of_enum_v<argument_type, src1_precision,
1773  argument_type::FP16> &&
1774  detail::is_one_of_enum_v<argument_type, src2_precision,
1775  argument_type::FP16>;
1776 
1777 #if defined(ESIMD_XE_HPC) || defined(ESIMD_XE_HPG)
1778  // f | f | tf32 | tf32
1779  constexpr bool check_tf32 =
1780  detail::is_one_of_v<T, float> && detail::is_one_of_v<T0, float> &&
1781  detail::is_one_of_enum_v<argument_type, src1_precision,
1782  argument_type::TF32> &&
1783  detail::is_one_of_enum_v<argument_type, src2_precision,
1784  argument_type::TF32>;
1785 #endif // defined(ESIMD_XE_HPC) || defined(ESIMD_XE_HPG)
1786 
1787 #if defined(ESIMD_XE_HPC) || defined(ESIMD_XE_HPG)
1788  constexpr bool check_passed =
1789  (check_integer || check_hf || check_bf16 || check_tf32);
1790  static_assert(check_passed,
1791  "unsupported dpas type! The supported types are:\n"
1792  " dst | src0 | src1 | src2 \n"
1793  " ud, d | ud, d | ub, b | ub, b \n"
1794  " ud, d | ud, d | u4, s4, u2, s2 | u4, s4, u2, s2 \n"
1795  " f, bf | f, bf | bf | bf \n"
1796  " f, hf | f, hf | hf | hf \n"
1797  " f | f | tf32 | tf32 \n");
1798 #else // else defined(ESIMD_XE_HPC) || defined(ESIMD_XE_HPG)
1799  constexpr bool check_passed = (check_integer || check_hf || check_bf16);
1800  static_assert(check_passed,
1801  "unsupported dpas type! The supported types are:\n"
1802  " dst | src0 | src1 | src2 \n"
1803  " ud, d | ud, d | ub, b | ub, b \n"
1804  " ud, d | ud, d | u4, s4, u2, s2 | u4, s4, u2, s2 \n"
1805  " f, bf | f, bf | bf | bf \n"
1806  " f, hf | f, hf | hf | hf \n");
1807 #endif // end else defined(ESIMD_XE_HPC) || defined(ESIMD_XE_HPG)
1808 
1809  static_assert(__ESIMD_DNS::is_dword_type<T1>::value,
1810  "Src1 must be DWORD type");
1811  static_assert(__ESIMD_DNS::is_dword_type<T2>::value,
1812  "Src2 must be DWORD type");
1813 
1814 #if defined(ESIMD_XE_HPC) || defined(ESIMD_XE_HPG)
1815  static_assert((N == 16 * repeat_count), "Execution size on PVC must be 16");
1816 #else
1817  static_assert((N == 8 * repeat_count), "Execution size must be 8");
1818 #endif
1819 
1820  static_assert((systolic_depth == 8) || (systolic_depth == 4),
1821  "systolic_depth must be 8 or 4");
1822 
1823  static_assert((repeat_count >= 1) && (repeat_count <= 8),
1824  "repeat_count must be within 1 to 8");
1825 
1826  constexpr auto en_ops_per_channel =
1827  detail::get_ops_per_channel(src1_precision, src2_precision);
1828  static_assert(en_ops_per_channel != detail::dpas_ops_per_channel::INVALID,
1829  "invalid combination of Src1/Src2 precision");
1830  constexpr auto ops_per_channel = static_cast<unsigned>(en_ops_per_channel);
1831 
1832  constexpr auto src1_precision_bits =
1833  detail::get_precision_bits(src1_precision);
1834  static_assert(
1835  N1 == ((src1_precision_bits * systolic_depth * ops_per_channel * N) /
1836  (repeat_count * sizeof(T1) * 8)),
1837  "invalid size for Src1");
1838 
1839  constexpr auto src2_precision_bits =
1840  detail::get_precision_bits(src2_precision);
1841  static_assert(N2 == ((src2_precision_bits * systolic_depth * ops_per_channel *
1842  repeat_count) /
1843  (sizeof(T2) * 8)),
1844  "invalid size for Src2");
1845 
1846 #if defined(__SYCL_DEVICE_ONLY__)
1847  constexpr int dst_signed = std::is_signed<T>::value;
1848  constexpr int src0_signed = std::is_signed<T0>::value;
1849  __ESIMD_NS::simd<T, N> result = __esimd_dpas<T, T0, T1, T2, N, N1, N2>(
1850  src0.data(), src1.data(), src2.data(), (int)src1_precision + 1,
1851  (int)src2_precision + 1, systolic_depth, repeat_count, dst_signed,
1852  src0_signed);
1853 
1854 #else
1855  __ESIMD_NS::simd<T, N> result =
1856  __esimd_dpas<src1_precision, src2_precision, systolic_depth, repeat_count,
1857  T, T0, T1, T2, N, N1, N2>(src0.data(), src1.data(),
1858  src2.data());
1859 #endif // __SYCL_DEVICE_ONLY__
1860 
1861  if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
1862  return result;
1863  else
1864  return __ESIMD_NS::saturate<T>(result);
1865 }
1866 
1877 template <argument_type src1_precision, argument_type src2_precision,
1878  int systolic_depth, int repeat_count, typename T, typename T1,
1879  typename T2, int N, int N1, int N2,
1880  typename Sat = __ESIMD_NS::saturation_off_tag>
1881 __ESIMD_API __ESIMD_NS::simd<T, N>
1882 dpas(__ESIMD_NS::simd<T, N> src0, __ESIMD_NS::simd<T1, N1> src1,
1883  __ESIMD_NS::simd<T2, N2> src2, Sat sat = {}) {
1884  return dpas<src1_precision, src2_precision, T, systolic_depth, repeat_count>(
1885  src0, src1, src2, sat);
1886 }
1887 
1896 template <argument_type src1_precision, argument_type src2_precision,
1897  int systolic_depth, int repeat_count, typename T, typename T1,
1898  typename T2, int N, int N1, int N2,
1899  typename Sat = __ESIMD_NS::saturation_off_tag>
1900 __ESIMD_API __ESIMD_NS::simd<T, N> dpas(__ESIMD_NS::simd<T1, N1> src1,
1901  __ESIMD_NS::simd<T2, N2> src2,
1902  Sat sat = {}) {
1903 
1904  static_assert(__ESIMD_DNS::is_fp_or_dword_type<T>::value,
1905  "Dst must be FP or DWORD type");
1906 
1907  static_assert(__ESIMD_DNS::is_dword_type<T1>::value,
1908  "Src1 must be DWORD type");
1909 
1910  static_assert(__ESIMD_DNS::is_dword_type<T2>::value,
1911  "Src2 must be DWORD type");
1912 
1913  static_assert((N == 8 * repeat_count) || (N == 16 * repeat_count),
1914  "Execution size must be 8 or 16");
1915 
1916  static_assert((systolic_depth == 8) || (systolic_depth == 4),
1917  "systolic_depth must be 8 or 4");
1918 
1919  static_assert((repeat_count >= 1) && (repeat_count <= 8),
1920  "repeat_count must be within 1 to 8");
1921 
1922  constexpr auto en_ops_per_channel =
1923  detail::get_ops_per_channel(src1_precision, src2_precision);
1924  static_assert(en_ops_per_channel != detail::dpas_ops_per_channel::INVALID,
1925  "invalid combination of Src1/Src2 precision");
1926  constexpr auto ops_per_channel = static_cast<unsigned>(en_ops_per_channel);
1927 
1928  constexpr auto src1_precision_bits =
1929  detail::get_precision_bits(src1_precision);
1930  static_assert(
1931  N1 == ((src1_precision_bits * systolic_depth * ops_per_channel * N) /
1932  (repeat_count * sizeof(T1) * 8)),
1933  "invalid size for Src1");
1934 
1935  constexpr auto src2_precision_bits =
1936  detail::get_precision_bits(src2_precision);
1937  static_assert(N2 == ((src2_precision_bits * systolic_depth * ops_per_channel *
1938  repeat_count) /
1939  (sizeof(T2) * 8)),
1940  "invalid size for Src2");
1941 
1942 #if defined(__SYCL_DEVICE_ONLY__)
1943  int dpas_info = (repeat_count << 24) + (systolic_depth << 16) +
1944  (((int)src2_precision + 1) << 8) + ((int)src1_precision + 1);
1945  __ESIMD_NS::simd<T, N> result =
1946  __esimd_dpas2<T, T1, T2, N, N1, N2>(src1.data(), src2.data(), dpas_info);
1947 #else
1948  __ESIMD_NS::simd<T, N> result =
1949  __esimd_dpas2<src1_precision, src2_precision, systolic_depth,
1950  repeat_count, T, T1, T2, N, N1, N2>(src1.data(),
1951  src2.data());
1952 #endif // __SYCL_DEVICE_ONLY__
1953 
1954  if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
1955  return result;
1956  else
1957  return __ESIMD_NS::saturate<T>(result);
1958 }
1959 
1970 template <argument_type src1_precision, argument_type src2_precision,
1971  int systolic_depth, int repeat_count, typename T, typename T1,
1972  typename T2, int N, int N1, int N2,
1973  typename Sat = __ESIMD_NS::saturation_off_tag>
1974 __ESIMD_API __ESIMD_NS::simd<T, N>
1975 dpasw(__ESIMD_NS::simd<T, N> src0, __ESIMD_NS::simd<T1, N1> src1,
1976  __ESIMD_NS::simd<T2, N2> src2, Sat sat = {}) {
1977  constexpr bool is_4xhf =
1978  (__ESIMD_DNS::is_type<T, cl::sycl::detail::half_impl::StorageT>()) &&
1979  src1_precision == src2_precision && src1_precision == argument_type::FP16;
1980 
1981  constexpr bool is_4xbf = __ESIMD_DNS::is_word_type<T>::value &&
1982  src1_precision == src2_precision &&
1983  src1_precision == argument_type::BF16;
1984 
1985  constexpr bool is_common_dpas = __ESIMD_DNS::is_fp_or_dword_type<T>::value;
1986 
1987  static_assert((is_4xhf || is_4xbf || is_common_dpas),
1988  "unsupported dpas type");
1989 
1990  static_assert(__ESIMD_DNS::is_dword_type<T1>::value,
1991  "Src1 must be DWORD type");
1992 
1993  static_assert(__ESIMD_DNS::is_dword_type<T2>::value,
1994  "Src2 must be DWORD type");
1995 
1996  static_assert((N == 8 * repeat_count) || (N == 16 * repeat_count),
1997  "Execution size must be 8 or 16");
1998 
1999  static_assert((systolic_depth == 8) || (systolic_depth == 4),
2000  "systolic_depth must be 8 or 4");
2001 
2002  static_assert((repeat_count >= 1) && (repeat_count <= 8),
2003  "repeat_count must be within 1 to 8");
2004 
2005  constexpr auto en_ops_per_channel =
2006  detail::get_ops_per_channel(src1_precision, src2_precision);
2007  static_assert(en_ops_per_channel != detail::dpas_ops_per_channel::INVALID,
2008  "invalid combination of Src1/Src2 precision");
2009  constexpr auto ops_per_channel = static_cast<unsigned>(en_ops_per_channel);
2010 
2011  constexpr auto src1_precision_bits =
2012  detail::get_precision_bits(src1_precision);
2013  static_assert(
2014  N1 == ((src1_precision_bits * systolic_depth * ops_per_channel * N) /
2015  (repeat_count * sizeof(T1) * 8)),
2016  "invalid size for Src1");
2017 
2018  constexpr auto src2_precision_bits =
2019  detail::get_precision_bits(src2_precision);
2020  static_assert(N2 == ((src2_precision_bits * systolic_depth * ops_per_channel *
2021  ((repeat_count + 1) / 2)) /
2022  (sizeof(T2) * 8)),
2023  "invalid size for Src2");
2024 
2025 #if defined(__SYCL_DEVICE_ONLY__)
2026  int dpas_info = (repeat_count << 24) + (systolic_depth << 16) +
2027  (((int)src2_precision + 1) << 8) + ((int)src1_precision + 1);
2028  __ESIMD_NS::simd<T, N> result = __esimd_dpasw<T, T1, T2, N, N1, N2>(
2029  src0.data(), src1.data(), src2.data(), dpas_info);
2030 #else
2031  __ESIMD_NS::simd<T, N> result =
2032  __esimd_dpasw<src1_precision, src2_precision, systolic_depth,
2033  repeat_count, T, T1, T2, N, N1, N2>(
2034  src0.data(), src1.data(), src2.data());
2035 #endif // __SYCL_DEVICE_ONLY__
2036 
2037  if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
2038  return result;
2039  else
2040  return __ESIMD_NS::saturate<T>(result);
2041 }
2042 
2051 template <argument_type src1_precision, argument_type src2_precision,
2052  int systolic_depth, int repeat_count, typename T, typename T1,
2053  typename T2, int N, int N1, int N2,
2054  typename Sat = __ESIMD_NS::saturation_off_tag>
2055 __ESIMD_API __ESIMD_NS::simd<T, N> dpasw2(__ESIMD_NS::simd<T1, N1> src1,
2056  __ESIMD_NS::simd<T2, N2> src2,
2057  Sat sat = {}) {
2058  constexpr bool is_4xhf =
2059  (__ESIMD_DNS::is_type<T, cl::sycl::detail::half_impl::StorageT>()) &&
2060  src1_precision == src2_precision && src1_precision == argument_type::FP16;
2061 
2062  constexpr bool is_4xbf = __ESIMD_DNS::is_word_type<T>::value &&
2063  src1_precision == src2_precision &&
2064  src1_precision == argument_type::BF16;
2065 
2066  constexpr bool is_common_dpas = __ESIMD_DNS::is_fp_or_dword_type<T>::value;
2067 
2068  static_assert((is_4xhf || is_4xbf || is_common_dpas),
2069  "unsupported dpas type");
2070 
2071  static_assert(__ESIMD_DNS::is_dword_type<T1>::value,
2072  "Src1 must be DWORD type");
2073 
2074  static_assert(__ESIMD_DNS::is_dword_type<T2>::value,
2075  "Src2 must be DWORD type");
2076 
2077  static_assert((N == 8 * repeat_count) || (N == 16 * repeat_count),
2078  "Execution size must be 8 or 16");
2079 
2080  static_assert((systolic_depth == 8) || (systolic_depth == 4),
2081  "systolic_depth must be 8 or 4");
2082 
2083  static_assert((repeat_count >= 1) && (repeat_count <= 8),
2084  "repeat_count must be within 1 to 8");
2085 
2086  constexpr auto en_ops_per_channel =
2087  detail::get_ops_per_channel(src1_precision, src2_precision);
2088  static_assert(en_ops_per_channel != detail::dpas_ops_per_channel::INVALID,
2089  "invalid combination of Src1/Src2 precision");
2090  constexpr auto ops_per_channel = static_cast<unsigned>(en_ops_per_channel);
2091 
2092  constexpr auto src1_precision_bits =
2093  detail::get_precision_bits(src1_precision);
2094  static_assert(
2095  N1 == ((src1_precision_bits * systolic_depth * ops_per_channel * N) /
2096  (repeat_count * sizeof(T1) * 8)),
2097  "invalid size for Src1");
2098 
2099  constexpr auto src2_precision_bits =
2100  detail::get_precision_bits(src2_precision);
2101  static_assert(N2 == ((src2_precision_bits * systolic_depth * ops_per_channel *
2102  ((repeat_count + 1) / 2)) /
2103  (sizeof(T2) * 8)),
2104  "invalid size for Src2");
2105 
2106 #if defined(__SYCL_DEVICE_ONLY__)
2107  int dpas_info = (repeat_count << 24) + (systolic_depth << 16) +
2108  (((int)src2_precision + 1) << 8) + ((int)src1_precision + 1);
2109  __ESIMD_NS::simd<T, N> result =
2110  __esimd_dpasw2<T, T1, T2, N, N1, N2>(src1.data(), src2.data(), dpas_info);
2111 #else
2112  __ESIMD_NS::simd<T, N> result =
2113  __esimd_dpasw2<src1_precision, src2_precision, systolic_depth,
2114  repeat_count, T, T1, T2, N, N1, N2>(src1.data(),
2115  src2.data());
2116 #endif // __SYCL_DEVICE_ONLY__
2117 
2118  if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
2119  return result;
2120  else
2121  return __ESIMD_NS::saturate<T>(result);
2122 }
2124 
2125 } // namespace __ESIMD_ENS
2126 } // __SYCL_INLINE_NAMESPACE(cl)
math.hpp
cl::sycl::ext::intel::experimental::esimd::dpas
__ESIMD_API sycl::ext::intel::esimd::simd< T, N > dpas(sycl::ext::intel::esimd::simd< T1, N1 > src1, sycl::ext::intel::esimd::simd< T2, N2 > src2, Sat sat={})
DPAS.
Definition: math.hpp:1900
math_intrin.hpp
cl::sycl::ext::intel::experimental::esimd::bf_insert
ESIMD_NODEBUG ESIMD_INLINE std::enable_if_t< __ESIMD_DNS::is_esimd_scalar< T0 >::value &&__ESIMD_DNS::is_esimd_scalar< T4 >::value, std::remove_const_t< T0 > > bf_insert(T1 src0, T2 src1, T3 src2, T4 src3)
bf_insert
Definition: math.hpp:1084
T
cl::sycl::ext::intel::experimental::esimd::quot
__ESIMD_API std::enable_if_t< __ESIMD_DNS::is_esimd_scalar< T0 >::value &&__ESIMD_DNS::is_esimd_scalar< T1 >::value &&std::is_integral< T0 >::value &&std::is_integral< T1 >::value, std::remove_const_t< T0 > > quot(T0 src0, T1 src1)
Integral quotient (scalar version)
Definition: math.hpp:493
cl::sycl::ext::intel::experimental::esimd::atan2_fast
ESIMD_INLINE float atan2_fast(float y, float x)
Definition: math.hpp:1329
cl::sycl::ext::intel::experimental::esimd::asin
__ESIMD_API std::enable_if_t< std::is_floating_point< T >::value, T > asin(T src0)
Definition: math.hpp:1227
cl::sycl::ext::intel::experimental::esimd::fmod
ESIMD_INLINE float fmod(float y, float x)
Definition: math.hpp:1391
cl::sycl::ext::intel::esimd::cos
__ESIMD_API T cos(T src, Sat sat={})
Scalar version.
Definition: math.hpp:407
cl::sycl::ext::intel::esimd::abs
__ESIMD_API std::enable_if_t< detail::is_esimd_scalar< T1 >::value, std::remove_const_t< T1 > > abs(T1 src0)
Get absolute value (scalar version).
Definition: math.hpp:168
cl::sycl::ext::intel::experimental::esimd::div
ESIMD_NODEBUG ESIMD_INLINE std::enable_if_t< __ESIMD_DNS::is_esimd_scalar< RT >::value &&__ESIMD_DNS::is_esimd_scalar< T0 >::value &&__ESIMD_DNS::is_esimd_scalar< T1 >::value, std::remove_const_t< RT > > div(sycl::ext::intel::esimd::simd< std::remove_const_t< RT >, 1 > &remainder, T0 src0, T1 src1)
Integral division (scalar version).
Definition: math.hpp:582
cl::sycl::ext::intel::experimental::esimd::argument_type
argument_type
Definition: common.hpp:29
cl::sycl::ext::intel::experimental::esimd::rol
__ESIMD_API std::enable_if_t< __ESIMD_DNS::is_esimd_scalar< T0 >::value &&__ESIMD_DNS::is_esimd_scalar< T1 >::value &&__ESIMD_DNS::is_esimd_scalar< T2 >::value &&std::is_integral< T0 >::value &&std::is_integral< T1 >::value &&std::is_integral< T2 >::value, std::remove_const_t< T0 > > rol(T1 src0, T2 src1)
Rotate left operation with two scalar inputs.
Definition: math.hpp:205
util.hpp
cl::sycl::ext::intel::experimental::esimd::sincos
__ESIMD_API sycl::ext::intel::esimd::simd< float, SZ > sincos(sycl::ext::intel::esimd::simd< float, SZ > &dstcos, U src0, Sat sat={})
Definition: math.hpp:1125
cl::sycl::ext::intel::experimental::esimd::mod
__ESIMD_API std::enable_if_t< __ESIMD_DNS::is_esimd_scalar< T0 >::value &&__ESIMD_DNS::is_esimd_scalar< T1 >::value &&std::is_integral< T0 >::value &&std::is_integral< T1 >::value, std::remove_const_t< T0 > > mod(T0 src0, T1 src1)
Modulo (scalar version)
Definition: math.hpp:524
cl::sycl::ext::intel::experimental::esimd::tanh
ESIMD_INLINE sycl::ext::intel::esimd::simd< float, N > tanh(sycl::ext::intel::esimd::simd< float, N > x)
Definition: math.hpp:1636
cl::sycl::ext::intel::esimd::inv
__ESIMD_API T inv(T src, Sat sat={})
Scalar version.
Definition: math.hpp:377
cl::sycl::ext::intel::experimental::esimd::dpasw2
__ESIMD_API sycl::ext::intel::esimd::simd< T, N > dpasw2(sycl::ext::intel::esimd::simd< T1, N1 > src1, sycl::ext::intel::esimd::simd< T2, N2 > src2, Sat sat={})
DPASW2.
Definition: math.hpp:2055
cl
We provide new interfaces for matrix muliply in this patch:
Definition: access.hpp:13
cl::sycl::ext::intel::experimental::esimd::shl
__ESIMD_API std::enable_if_t< __ESIMD_DNS::is_esimd_scalar< T0 >::value &&__ESIMD_DNS::is_esimd_scalar< T1 >::value &&__ESIMD_DNS::is_esimd_scalar< T2 >::value &&std::is_integral< T0 >::value &&std::is_integral< T1 >::value &&std::is_integral< T2 >::value, std::remove_const_t< T0 > > shl(T1 src0, T2 src1, Sat sat={})
Shift left operation (scalar version)
Definition: math.hpp:90
cl::sycl::sign
detail::enable_if_t< detail::is_genfloat< T >::value, T > sign(T x) __NOEXC
Definition: builtins.hpp:622
cl::sycl::ext::intel::experimental::esimd::cos_emu
ESIMD_INLINE float cos_emu(T x0)
Definition: math.hpp:1500
cl::sycl::ext::intel::esimd::rsqrt
__ESIMD_API T rsqrt(T src, Sat sat={})
Scalar version.
Definition: math.hpp:399
cl::sycl::ext::intel::experimental::esimd::line
ESIMD_NODEBUG ESIMD_INLINE std::enable_if_t< __ESIMD_DNS::is_fp_or_dword_type< T >::value &&std::is_floating_point< T >::value, sycl::ext::intel::esimd::simd< T, SZ > > line(float P, float Q, sycl::ext::intel::esimd::simd< T, SZ > src1, Sat sat={})
Linear equation.
Definition: math.hpp:933
cl::sycl::ext::intel::experimental::esimd::dpasw
__ESIMD_API sycl::ext::intel::esimd::simd< T, N > dpasw(sycl::ext::intel::esimd::simd< T, N > src0, sycl::ext::intel::esimd::simd< T1, N1 > src1, sycl::ext::intel::esimd::simd< T2, N2 > src2, Sat sat={})
DPASW.
Definition: math.hpp:1975
cl::sycl::ext::intel::experimental::esimd::ror
__ESIMD_API std::enable_if_t< __ESIMD_DNS::is_esimd_scalar< T0 >::value &&__ESIMD_DNS::is_esimd_scalar< T1 >::value &&__ESIMD_DNS::is_esimd_scalar< T2 >::value &&std::is_integral< T0 >::value &&std::is_integral< T1 >::value &&std::is_integral< T2 >::value, std::remove_const_t< T0 > > ror(T1 src0, T2 src1)
Rotate right operation with two scalar inputs.
Definition: math.hpp:264
sycl::ext::oneapi::experimental::simd
std::experimental::simd< T, simd_abi::native_fixed_size< T, N > > simd
Definition: invoke_simd.hpp:85
cl::sycl::ext::intel::experimental::esimd::atan2
ESIMD_INLINE float atan2(float y, float x)
Definition: math.hpp:1355
cl::sycl::detail::pi
Definition: backend_traits_opencl.hpp:193
cl::sycl::ext::intel::experimental::esimd::shr
__ESIMD_API std::enable_if_t< __ESIMD_DNS::is_esimd_scalar< T0 >::value &&__ESIMD_DNS::is_esimd_scalar< T1 >::value &&__ESIMD_DNS::is_esimd_scalar< T2 >::value &&std::is_integral< T0 >::value &&std::is_integral< T1 >::value &&std::is_integral< T2 >::value, std::remove_const_t< T0 > > shr(T1 src0, T2 src1, Sat sat={})
Shift right operation (scalar version)
Definition: math.hpp:146
cl::sycl::ext::intel::experimental::esimd::atan
__ESIMD_API std::enable_if_t< std::is_floating_point< T >::value, T > atan(T src0)
Definition: math.hpp:1167
cl::sycl::ext::intel::experimental::esimd::dp2
ESIMD_NODEBUG ESIMD_INLINE std::enable_if_t< __ESIMD_DNS::is_fp_or_dword_type< T1 >::value &&std::is_floating_point< T1 >::value &&__ESIMD_DNS::is_fp_or_dword_type< U >::value &&std::is_floating_point< U >::value, sycl::ext::intel::esimd::simd< T0, SZ > > dp2(sycl::ext::intel::esimd::simd< T1, SZ > src0, U src1, Sat sat={})
Dot product on groups of 4 elements.
Definition: math.hpp:767
cl::sycl::ext::intel::experimental::esimd::lrp
ESIMD_NODEBUG ESIMD_INLINE std::enable_if_t< __ESIMD_DNS::is_fp_or_dword_type< T >::value &&std::is_floating_point< T >::value &&__ESIMD_DNS::is_fp_or_dword_type< U >::value &&std::is_floating_point< U >::value, sycl::ext::intel::esimd::simd< T, SZ > > lrp(sycl::ext::intel::esimd::simd< T, SZ > src0, U src1, V src2, Sat sat={})
Definition: math.hpp:1025
cl::sycl::ext::intel::experimental::esimd::frc
__ESIMD_API T frc(T src0)
Performs truncate-to-minus-infinity fraction operation of src0.
Definition: math.hpp:958
cl::sycl::ext::intel::esimd::sin
__ESIMD_API T sin(T src, Sat sat={})
Scalar version.
Definition: math.hpp:403
cl::sycl::ext::intel::esimd::exp
ESIMD_NODEBUG ESIMD_INLINE T exp(T src0, Sat sat={})
Definition: math.hpp:499
cl::sycl::ext::intel::experimental::esimd::lsr
__ESIMD_API std::enable_if_t< __ESIMD_DNS::is_esimd_scalar< T0 >::value &&__ESIMD_DNS::is_esimd_scalar< T1 >::value &&__ESIMD_DNS::is_esimd_scalar< T2 >::value &&std::is_integral< T0 >::value &&std::is_integral< T1 >::value &&std::is_integral< T2 >::value, std::remove_const_t< T0 > > lsr(T1 src0, T2 src1, Sat sat={})
Logical Shift Right (scalar version)
Definition: math.hpp:320
cl::sycl::ext::intel::experimental::esimd::acos
__ESIMD_API std::enable_if_t< std::is_floating_point< T >::value, T > acos(T src0)
Definition: math.hpp:1205
cl::sycl::ext::intel::experimental::esimd::sin_emu
ESIMD_INLINE float sin_emu(T x0)
Definition: math.hpp:1431
cl::sycl::ext::intel::experimental::esimd::bf_reverse
ESIMD_NODEBUG ESIMD_INLINE std::enable_if_t< __ESIMD_DNS::is_esimd_scalar< T0 >::value &&__ESIMD_DNS::is_esimd_scalar< T1 >::value, std::remove_const_t< T0 > > bf_reverse(T1 src0)
bf_reverse
Definition: math.hpp:1056
cl::sycl::ext::intel::experimental::esimd::dph
ESIMD_NODEBUG ESIMD_INLINE std::enable_if_t< __ESIMD_DNS::is_fp_or_dword_type< T >::value &&std::is_floating_point< T >::value &&__ESIMD_DNS::is_fp_or_dword_type< U >::value &&std::is_floating_point< U >::value, sycl::ext::intel::esimd::simd< T, SZ > > dph(sycl::ext::intel::esimd::simd< T, SZ > src0, U src1, Sat sat={})
Dot product on groups of 4 elements.
Definition: math.hpp:869
cl::sycl::ext::intel::esimd::sqrt
__ESIMD_API T sqrt(T src, Sat sat={})
Scalar version.
Definition: math.hpp:391
cl::sycl::ext::intel::experimental::esimd::bf_extract
ESIMD_NODEBUG ESIMD_INLINE std::enable_if_t< __ESIMD_DNS::is_esimd_scalar< T0 >::value &&__ESIMD_DNS::is_esimd_scalar< T3 >::value, std::remove_const_t< T0 > > bf_extract(T1 src0, T2 src1, T3 src2)
bf_extract
Definition: math.hpp:1111
cl::sycl::remainder
detail::enable_if_t< detail::is_genfloat< T >::value, T > remainder(T x, T y) __NOEXC
Definition: builtins.hpp:399
cl::sycl::ext::intel::experimental::esimd::asr
__ESIMD_API std::enable_if_t< __ESIMD_DNS::is_esimd_scalar< T0 >::value &&__ESIMD_DNS::is_esimd_scalar< T1 >::value &&__ESIMD_DNS::is_esimd_scalar< T2 >::value &&std::is_integral< T0 >::value &&std::is_integral< T1 >::value &&std::is_integral< T2 >::value, std::remove_const_t< T0 > > asr(T1 src0, T2 src1, Sat sat={})
Arithmetical Shift Right (scalar version)
Definition: math.hpp:376
cl::sycl::ext::intel::experimental::esimd::lzd
ESIMD_NODEBUG ESIMD_INLINE std::enable_if_t< __ESIMD_DNS::is_esimd_scalar< RT >::value &&__ESIMD_DNS::is_esimd_scalar< T0 >::value, std::remove_const_t< RT > > lzd(T0 src0, Sat sat={})
Definition: math.hpp:979
cl::sycl::ext::intel::experimental::esimd::tanh_cody_waite
ESIMD_INLINE sycl::ext::intel::esimd::simd< float, N > tanh_cody_waite(sycl::ext::intel::esimd::simd< float, N > x)
Definition: math.hpp:1625
common.hpp
cl::sycl::ext::intel::experimental::esimd::dp3
ESIMD_NODEBUG ESIMD_INLINE std::enable_if_t< __ESIMD_DNS::is_fp_or_dword_type< T1 >::value &&std::is_floating_point< T1 >::value &&__ESIMD_DNS::is_fp_or_dword_type< U >::value &&std::is_floating_point< U >::value, sycl::ext::intel::esimd::simd< T0, SZ > > dp3(sycl::ext::intel::esimd::simd< T1, SZ > src0, U src1, Sat sat={})
Dot product on groups of 4 elements.
Definition: math.hpp:800
cl::sycl::ext::intel::experimental::esimd::imul
ESIMD_NODEBUG ESIMD_INLINE std::enable_if_t< __ESIMD_DNS::is_esimd_scalar< T >::value &&__ESIMD_DNS::is_esimd_scalar< U >::value &&__ESIMD_DNS::is_esimd_scalar< T0 >::value, T0 > imul(sycl::ext::intel::esimd::simd< T0, 1 > &rmd, T src0, U src1)
Definition: math.hpp:458
cl::sycl::ext::intel::experimental::esimd::dp4
sycl::ext::intel::esimd::simd< T, N > dp4(sycl::ext::intel::esimd::simd< T, N > v1, sycl::ext::intel::esimd::simd< T, N > v2)
Definition: math.hpp:1641
__SYCL_INLINE_NAMESPACE
#define __SYCL_INLINE_NAMESPACE(X)
Definition: defines_elementary.hpp:12