19 namespace __ESIMD_ENS {
34 template <
typename T0,
typename T1,
int SZ,
typename U,
35 class Sat = __ESIMD_NS::saturation_off_tag>
36 __ESIMD_API std::enable_if_t<std::is_integral<T0>::value &&
37 std::is_integral<T1>::value &&
38 std::is_integral<U>::value,
39 __ESIMD_NS::simd<T0, SZ>>
40 shl(__ESIMD_NS::simd<T1, SZ> src0, U src1, Sat sat = {}) {
41 using ComputationTy = __ESIMD_DNS::computation_type_t<decltype(src0), U>;
42 typename __ESIMD_DNS::simd_type<ComputationTy>::type Src0 = src0;
43 typename __ESIMD_DNS::simd_type<ComputationTy>::type Src1 = src1;
45 if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_on_tag>) {
46 if constexpr (std::is_unsigned<T0>::value) {
47 if constexpr (std::is_unsigned<T1>::value)
48 return __esimd_uushl_sat<T0, T1, SZ>(Src0.data(), Src1.data());
50 return __esimd_usshl_sat<T0, T1, SZ>(Src0.data(), Src1.data());
52 if constexpr (std::is_signed<T1>::value)
53 return __esimd_sushl_sat<T0, T1, SZ>(Src0.data(), Src1.data());
55 return __esimd_ssshl_sat<T0, T1, SZ>(Src0.data(), Src1.data());
58 if constexpr (std::is_unsigned<T0>::value) {
59 if constexpr (std::is_unsigned<T1>::value)
60 return __esimd_uushl<T0, T1, SZ>(Src0.data(), Src1.data());
62 return __esimd_usshl<T0, T1, SZ>(Src0.data(), Src1.data());
64 if constexpr (std::is_signed<T1>::value)
65 return __esimd_sushl<T0, T1, SZ>(Src0.data(), Src1.data());
67 return __esimd_ssshl<T0, T1, SZ>(Src0.data(), Src1.data());
81 template <
typename T0,
typename T1,
typename T2,
82 class Sat = __ESIMD_NS::saturation_off_tag>
83 __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<T0>::value &&
84 __ESIMD_DNS::is_esimd_scalar<T1>::value &&
85 __ESIMD_DNS::is_esimd_scalar<T2>::value &&
86 std::is_integral<T0>::value &&
87 std::is_integral<T1>::value &&
88 std::is_integral<T2>::value,
89 std::remove_const_t<T0>>
90 shl(T1 src0, T2 src1, Sat sat = {}) {
91 using ComputationTy = __ESIMD_DNS::computation_type_t<T1, T2>;
92 typename __ESIMD_DNS::simd_type<ComputationTy>::type Src0 = src0;
93 typename __ESIMD_DNS::simd_type<ComputationTy>::type Src1 = src1;
94 __ESIMD_NS::simd<T0, 1> Result = esimd::shl<T0>(Src0, Src1, sat);
108 template <
typename T0,
typename T1,
int SZ,
typename U,
109 class Sat = __ESIMD_NS::saturation_off_tag>
110 __ESIMD_API std::enable_if_t<std::is_integral<T0>::value &&
111 std::is_integral<T1>::value &&
112 std::is_integral<U>::value,
113 __ESIMD_NS::simd<T0, SZ>>
114 shr(__ESIMD_NS::simd<T1, SZ> src0, U src1, Sat sat = {}) {
115 using ComputationTy = __ESIMD_DNS::computation_type_t<decltype(src0), U>;
116 typename __ESIMD_DNS::simd_type<ComputationTy>::type Src0 = src0;
117 typename __ESIMD_DNS::simd_type<ComputationTy>::type Src1 = src1;
119 typename __ESIMD_DNS::simd_type<ComputationTy>::type Result =
120 Src0.data() >> Src1.data();
122 if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
125 return __ESIMD_NS::saturate<T0>(Result);
137 template <
typename T0,
typename T1,
typename T2,
138 class Sat = __ESIMD_NS::saturation_off_tag>
139 __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<T0>::value &&
140 __ESIMD_DNS::is_esimd_scalar<T1>::value &&
141 __ESIMD_DNS::is_esimd_scalar<T2>::value &&
142 std::is_integral<T0>::value &&
143 std::is_integral<T1>::value &&
144 std::is_integral<T2>::value,
145 std::remove_const_t<T0>>
146 shr(T1 src0, T2 src1, Sat sat = {}) {
147 using ComputationTy = __ESIMD_DNS::computation_type_t<T1, T2>;
148 typename __ESIMD_DNS::simd_type<ComputationTy>::type Src0 = src0;
149 typename __ESIMD_DNS::simd_type<ComputationTy>::type Src1 = src1;
150 __ESIMD_NS::simd<T0, 1> Result = esimd::shr<T0>(Src0, Src1, sat);
162 template <
typename T0,
typename T1,
int SZ>
164 std::enable_if_t<std::is_integral<T0>::value && std::is_integral<T1>::value,
165 __ESIMD_NS::simd<T0, SZ>>
166 rol(__ESIMD_NS::simd<T1, SZ> src0, __ESIMD_NS::simd<T1, SZ> src1) {
167 return __esimd_rol<T0, T1, SZ>(src0.data(), src1.data());
178 template <
typename T0,
typename T1,
int SZ,
typename U>
179 __ESIMD_API std::enable_if_t<std::is_integral<T0>::value &&
180 std::is_integral<T1>::value &&
181 std::is_integral<U>::value,
182 __ESIMD_NS::simd<T0, SZ>>
183 rol(__ESIMD_NS::simd<T1, SZ> src0, U src1) {
184 using ComputationTy = __ESIMD_DNS::computation_type_t<decltype(src0), U>;
185 typename __ESIMD_DNS::simd_type<ComputationTy>::type Src0 = src0;
186 typename __ESIMD_DNS::simd_type<ComputationTy>::type Src1 = src1;
187 return __esimd_rol<T0>(Src0.data(), Src1.data());
197 template <
typename T0,
typename T1,
typename T2>
198 __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<T0>::value &&
199 __ESIMD_DNS::is_esimd_scalar<T1>::value &&
200 __ESIMD_DNS::is_esimd_scalar<T2>::value &&
201 std::is_integral<T0>::value &&
202 std::is_integral<T1>::value &&
203 std::is_integral<T2>::value,
204 std::remove_const_t<T0>>
206 using ComputationTy = __ESIMD_DNS::computation_type_t<T1, T2>;
207 typename __ESIMD_DNS::simd_type<ComputationTy>::type Src0 = src0;
208 typename __ESIMD_DNS::simd_type<ComputationTy>::type Src1 = src1;
209 __ESIMD_NS::simd<T0, 1> Result = esimd::rol<T0>(Src0, Src1);
221 template <
typename T0,
typename T1,
int SZ>
223 std::enable_if_t<std::is_integral<T0>::value && std::is_integral<T1>::value,
224 __ESIMD_NS::simd<T0, SZ>>
225 ror(__ESIMD_NS::simd<T1, SZ> src0, __ESIMD_NS::simd<T1, SZ> src1) {
226 return __esimd_ror<T0, T1, SZ>(src0.data(), src1.data());
237 template <
typename T0,
typename T1,
int SZ,
typename U>
238 __ESIMD_API std::enable_if_t<std::is_integral<T0>::value &&
239 std::is_integral<T1>::value &&
240 std::is_integral<U>::value,
241 __ESIMD_NS::simd<T0, SZ>>
242 ror(__ESIMD_NS::simd<T1, SZ> src0, U src1) {
243 using ComputationTy = __ESIMD_DNS::computation_type_t<decltype(src0), U>;
244 typename __ESIMD_DNS::simd_type<ComputationTy>::type Src0 = src0;
245 typename __ESIMD_DNS::simd_type<ComputationTy>::type Src1 = src1;
246 return __esimd_ror<T0>(Src0.data(), Src1.data());
256 template <
typename T0,
typename T1,
typename T2>
257 __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<T0>::value &&
258 __ESIMD_DNS::is_esimd_scalar<T1>::value &&
259 __ESIMD_DNS::is_esimd_scalar<T2>::value &&
260 std::is_integral<T0>::value &&
261 std::is_integral<T1>::value &&
262 std::is_integral<T2>::value,
263 std::remove_const_t<T0>>
265 using ComputationTy = __ESIMD_DNS::computation_type_t<T1, T2>;
266 typename __ESIMD_DNS::simd_type<ComputationTy>::type Src0 = src0;
267 typename __ESIMD_DNS::simd_type<ComputationTy>::type Src1 = src1;
268 __ESIMD_NS::simd<T0, 1> Result = esimd::ror<T0>(Src0, Src1);
282 template <
typename T0,
typename T1,
int SZ,
typename U,
283 class Sat = __ESIMD_NS::saturation_off_tag>
284 __ESIMD_API std::enable_if_t<std::is_integral<T0>::value &&
285 std::is_integral<T1>::value &&
286 std::is_integral<U>::value,
287 __ESIMD_NS::simd<T0, SZ>>
288 lsr(__ESIMD_NS::simd<T1, SZ> src0, U src1, Sat sat = {}) {
289 using IntermedTy = __ESIMD_DNS::computation_type_t<T1, T1>;
290 typedef typename std::make_unsigned<IntermedTy>::type ComputationTy;
291 __ESIMD_NS::simd<ComputationTy, SZ> Src0 = src0;
293 __ESIMD_NS::simd<ComputationTy, SZ> Result = Src0.data() >> src1.data();
295 if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
298 return __ESIMD_NS::saturate<T0>(Result);
311 template <
typename T0,
typename T1,
typename T2,
312 class Sat = __ESIMD_NS::saturation_off_tag>
313 __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<T0>::value &&
314 __ESIMD_DNS::is_esimd_scalar<T1>::value &&
315 __ESIMD_DNS::is_esimd_scalar<T2>::value &&
316 std::is_integral<T0>::value &&
317 std::is_integral<T1>::value &&
318 std::is_integral<T2>::value,
319 std::remove_const_t<T0>>
320 lsr(T1 src0, T2 src1, Sat sat = {}) {
321 using ComputationTy = __ESIMD_DNS::computation_type_t<T1, T2>;
322 typename __ESIMD_DNS::simd_type<ComputationTy>::type Src0 = src0;
323 typename __ESIMD_DNS::simd_type<ComputationTy>::type Src1 = src1;
324 __ESIMD_NS::simd<T0, 1> Result = esimd::lsr<T0>(Src0, Src1, sat);
338 template <
typename T0,
typename T1,
int SZ,
typename U,
339 class Sat = __ESIMD_NS::saturation_off_tag>
340 __ESIMD_API std::enable_if_t<std::is_integral<T0>::value &&
341 std::is_integral<T1>::value &&
342 std::is_integral<U>::value,
343 __ESIMD_NS::simd<T0, SZ>>
344 asr(__ESIMD_NS::simd<T1, SZ> src0, U src1, Sat sat = {}) {
345 using IntermedTy = __ESIMD_DNS::computation_type_t<T1, T1>;
346 typedef typename std::make_signed<IntermedTy>::type ComputationTy;
347 __ESIMD_NS::simd<ComputationTy, SZ> Src0 = src0;
349 __ESIMD_NS::simd<ComputationTy, SZ> Result = Src0 >> src1;
351 if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
354 return __ESIMD_NS::saturate<T0>(Result);
367 template <
typename T0,
typename T1,
typename T2,
368 class Sat = __ESIMD_NS::saturation_off_tag>
369 __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<T0>::value &&
370 __ESIMD_DNS::is_esimd_scalar<T1>::value &&
371 __ESIMD_DNS::is_esimd_scalar<T2>::value &&
372 std::is_integral<T0>::value &&
373 std::is_integral<T1>::value &&
374 std::is_integral<T2>::value,
375 std::remove_const_t<T0>>
376 asr(T1 src0, T2 src1, Sat sat = {}) {
377 using ComputationTy = __ESIMD_DNS::computation_type_t<T1, T2>;
378 typename __ESIMD_DNS::simd_type<ComputationTy>::type Src0 = src0;
379 typename __ESIMD_DNS::simd_type<ComputationTy>::type Src1 = src1;
380 __ESIMD_NS::simd<T0, 1> Result = esimd::asr<T0>(Src0, Src1, sat);
389 #ifndef ESIMD_HAS_LONG_LONG
391 template <
typename T0,
typename T1,
typename U,
int SZ>
393 ESIMD_INLINE std::enable_if_t<__ESIMD_DNS::is_dword_type<T0>::value &&
394 __ESIMD_DNS::is_dword_type<T1>::value &&
395 __ESIMD_DNS::is_dword_type<U>::value,
396 __ESIMD_NS::simd<T0, SZ>>
397 imul(__ESIMD_NS::simd<T0, SZ> &rmd, __ESIMD_NS::simd<T1, SZ> src0, U src1) {
398 using ComputationTy = __ESIMD_DNS::computation_type_t<decltype(src0), U>;
399 typename __ESIMD_DNS::simd_type<ComputationTy>::type Src0 = src0;
400 typename __ESIMD_DNS::simd_type<ComputationTy>::type Src1 = src1;
402 if constexpr (std::is_unsigned<T0>::value)
403 return __esimd_umulh(Src0.data(), Src1.data());
405 return __esimd_smulh(Src0.data(), Src1.data());
412 template <
typename T0,
typename T1,
typename U,
int SZ>
414 std::enable_if_t<__ESIMD_DNS::is_dword_type<T0>::value &&
415 __ESIMD_DNS::is_dword_type<T1>::value &&
416 __ESIMD_DNS::is_dword_type<U>::value && SZ == 1,
417 __ESIMD_NS::simd<T0, SZ>>
418 imul(__ESIMD_NS::simd<T0, SZ> &rmd, __ESIMD_NS::simd<T1, SZ> src0, U src1) {
419 using ComputationTy =
420 __ESIMD_DNS::computation_type_t<decltype(rmd),
long long>;
421 ComputationTy Product = convert<long long>(src0);
423 rmd = Product.bit_cast_view<T0>().select<1, 1>[0];
424 return Product.bit_cast_view<T0>().select<1, 1>[1];
427 template <
typename T0,
typename T1,
typename U,
int SZ>
429 std::enable_if_t<__ESIMD_DNS::is_dword_type<T0>::value &&
430 __ESIMD_DNS::is_dword_type<T1>::value &&
431 __ESIMD_DNS::is_dword_type<U>::value && SZ != 1,
432 __ESIMD_NS::simd<T0, SZ>>
433 imul(__ESIMD_NS::simd<T0, SZ> &rmd, __ESIMD_NS::simd<T1, SZ> src0, U src1) {
434 using ComputationTy =
435 __ESIMD_DNS::computation_type_t<decltype(rmd),
long long>;
436 ComputationTy Product = convert<long long>(src0);
438 rmd = Product.bit_cast_view<T0>().select<SZ, 2>(0);
439 return Product.bit_cast_view<T0>().select<SZ, 2>(1);
444 template <
typename T0,
typename T1,
typename U,
int SZ>
445 __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<U>::value,
446 __ESIMD_NS::simd<T0, SZ>>
447 imul(__ESIMD_NS::simd<T0, SZ> &rmd, U src0, __ESIMD_NS::simd<T1, SZ> src1) {
452 template <
typename T0,
typename T,
typename U>
454 ESIMD_INLINE std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<T>::value &&
455 __ESIMD_DNS::is_esimd_scalar<U>::value &&
456 __ESIMD_DNS::is_esimd_scalar<T0>::value,
458 imul(__ESIMD_NS::simd<T0, 1> &rmd, T src0, U src1) {
459 __ESIMD_NS::simd<T, 1> src_0 = src0;
460 __ESIMD_NS::simd<U, 1> src_1 = src1;
461 __ESIMD_NS::simd<T0, 1> res =
462 esimd::imul(rmd, src_0.select_all(), src_1.select_all());
473 template <
typename T,
int SZ,
typename U>
475 std::enable_if_t<std::is_integral<T>::value && std::is_integral<U>::value,
476 __ESIMD_NS::simd<T, SZ>>
477 quot(__ESIMD_NS::simd<T, SZ> src0, U src1) {
487 template <
typename T0,
typename T1>
488 __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<T0>::value &&
489 __ESIMD_DNS::is_esimd_scalar<T1>::value &&
490 std::is_integral<T0>::value &&
491 std::is_integral<T1>::value,
492 std::remove_const_t<T0>>
504 template <
typename T,
int SZ,
typename U>
506 std::enable_if_t<std::is_integral<T>::value && std::is_integral<U>::value,
507 __ESIMD_NS::simd<T, SZ>>
508 mod(__ESIMD_NS::simd<T, SZ> src0, U src1) {
518 template <
typename T0,
typename T1>
519 __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<T0>::value &&
520 __ESIMD_DNS::is_esimd_scalar<T1>::value &&
521 std::is_integral<T0>::value &&
522 std::is_integral<T1>::value,
523 std::remove_const_t<T0>>
537 template <
typename T,
int SZ,
typename U>
539 std::enable_if_t<std::is_integral<T>::value && std::is_integral<U>::value,
540 __ESIMD_NS::simd<T, SZ>>
556 template <
typename T,
int SZ,
typename U>
558 std::enable_if_t<std::is_integral<T>::value && std::is_integral<U>::value &&
559 __ESIMD_DNS::is_esimd_scalar<U>::value,
560 __ESIMD_NS::simd<T, SZ>>
562 __ESIMD_NS::simd<T, SZ> src1) {
576 template <
typename RT,
typename T0,
typename T1>
578 ESIMD_INLINE std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<RT>::value &&
579 __ESIMD_DNS::is_esimd_scalar<T0>::value &&
580 __ESIMD_DNS::is_esimd_scalar<T1>::value,
581 std::remove_const_t<RT>>
589 #if defined(ESIMD_GEN7_5) || defined(ESIMD_GEN8) || defined(ESIMD_GEN8_5) || \
590 defined(ESIMD_GEN9) || defined(ESIMD_GEN9_5)
602 template <
typename T0,
typename T1,
int SZ,
typename U,
603 class Sat = __ESIMD_NS::saturation_off_tag>
604 __ESIMD_API __ESIMD_NS::simd<T0, SZ>
dp2(__ESIMD_NS::simd<T1, SZ> src0, U src1,
606 static_assert(SZ % 4 == 0,
"result size is not a multiple of 4");
607 __ESIMD_NS::simd<float, SZ> Src0 = src0;
608 __ESIMD_NS::simd<float, SZ> Src1 = src1;
609 __ESIMD_NS::simd<float, SZ> Result = __esimd_dp2(Src0.data(), Src1.data());
610 if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
613 return __ESIMD_NS::saturate<T0>(Result);
626 template <
typename T0,
typename T1,
int SZ,
typename U,
627 class Sat = __ESIMD_NS::saturation_off_tag>
628 __ESIMD_API __ESIMD_NS::simd<T0, SZ>
dp3(__ESIMD_NS::simd<T1, SZ> src0, U src1,
630 static_assert(SZ % 4 == 0,
"result size is not a multiple of 4");
631 __ESIMD_NS::simd<float, SZ> Src0 = src0;
632 __ESIMD_NS::simd<float, SZ> Src1 = src1;
633 __ESIMD_NS::simd<float, SZ> Result = __esimd_dp3(Src0.data(), Src1.data());
634 if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
637 return __ESIMD_NS::saturate<T0>(Result);
650 template <
typename T0,
typename T1,
int SZ,
typename U,
651 class Sat = __ESIMD_NS::saturation_off_tag>
652 __ESIMD_API __ESIMD_NS::simd<T0, SZ>
dp4(__ESIMD_NS::simd<T1, SZ> src0, U src1,
654 static_assert(SZ % 4 == 0,
"result size is not a multiple of 4");
655 __ESIMD_NS::simd<float, SZ> Src0 = src0;
656 __ESIMD_NS::simd<float, SZ> Src1 = src1;
657 __ESIMD_NS::simd<float, SZ> Result = __esimd_dp4(Src0.data(), Src1.data());
658 if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
661 return __ESIMD_NS::saturate<T0>(Result);
674 template <
typename T0,
typename T1,
typename U,
int SZ,
675 class Sat = __ESIMD_NS::saturation_off_tag>
676 __ESIMD_API __ESIMD_NS::simd<T0, SZ>
dph(__ESIMD_NS::simd<T1, SZ> src0, U src1,
678 static_assert(SZ % 4 == 0,
"result size is not a multiple of 4");
679 __ESIMD_NS::simd<float, SZ> Src0 = src0;
680 __ESIMD_NS::simd<float, SZ> Src1 = src1;
681 __ESIMD_NS::simd<float, SZ> Result = __esimd_dph(Src0.data(), Src1.data());
682 if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
685 return __ESIMD_NS::saturate<T0>(Result);
699 template <
typename RT,
typename T1,
typename T2,
int SZ,
700 class Sat = __ESIMD_NS::saturation_off_tag>
701 __ESIMD_API __ESIMD_NS::simd<RT, SZ>
line(__ESIMD_NS::simd<T1, 4> src0,
702 __ESIMD_NS::simd<T2, SZ> src1,
704 static_assert(SZ % 4 == 0,
"result size is not a multiple of 4");
706 __ESIMD_NS::simd<float, 4> Src0 = src0;
707 __ESIMD_NS::simd<float, SZ> Src1 = src1;
708 __ESIMD_NS::simd<float, SZ> Result = __esimd_line(Src0.data(), Src1.data());
710 __ESIMD_NS::simd<RT, SZ> Result;
711 if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
714 return __ESIMD_NS::saturate<RT>(Result);
728 template <
typename RT,
typename T,
int SZ,
729 class Sat = __ESIMD_NS::saturation_off_tag>
730 __ESIMD_API __ESIMD_NS::simd<RT, SZ>
731 line(
float P,
float Q, __ESIMD_NS::simd<T, SZ> src1, Sat sat = {}) {
732 __ESIMD_NS::simd<float, 4> Src0 = P;
734 return esimd::line<RT>(Src0, src1, sat);
759 template <
typename T0,
typename T1,
int SZ,
typename U,
760 class Sat = __ESIMD_NS::saturation_off_tag>
761 ESIMD_NODEBUG ESIMD_INLINE
762 std::enable_if_t<__ESIMD_DNS::is_fp_or_dword_type<T1>::value &&
763 std::is_floating_point<T1>::value &&
764 __ESIMD_DNS::is_fp_or_dword_type<U>::value &&
765 std::is_floating_point<U>::value,
766 __ESIMD_NS::simd<T0, SZ>>
767 dp2(__ESIMD_NS::simd<T1, SZ> src0, U src1, Sat sat = {}) {
768 static_assert(SZ % 4 == 0,
"result size is not a multiple of 4");
770 __ESIMD_NS::simd<float, SZ> Src1 = src1;
771 __ESIMD_NS::simd<float, SZ> Result;
773 for (
int i = 0; i < SZ; i += 4) {
774 Result.select<4, 1>(i) = src0[i] * Src1[i] + src0[i + 1] * Src1[i + 1];
776 if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
779 return __ESIMD_NS::saturate<T1>(Result);
792 template <
typename T0,
typename T1,
int SZ,
typename U,
793 class Sat = __ESIMD_NS::saturation_off_tag>
794 ESIMD_NODEBUG ESIMD_INLINE
795 std::enable_if_t<__ESIMD_DNS::is_fp_or_dword_type<T1>::value &&
796 std::is_floating_point<T1>::value &&
797 __ESIMD_DNS::is_fp_or_dword_type<U>::value &&
798 std::is_floating_point<U>::value,
799 __ESIMD_NS::simd<T0, SZ>>
800 dp3(__ESIMD_NS::simd<T1, SZ> src0, U src1, Sat sat = {}) {
801 static_assert(SZ % 4 == 0,
"result size is not a multiple of 4");
803 __ESIMD_NS::simd<float, SZ> Src1 = src1;
804 __ESIMD_NS::simd<float, SZ> Result;
806 for (
int i = 0; i < SZ; i += 4) {
807 Result.select<4, 1>(i) = src0[i] * Src1[i] + src0[i + 1] * Src1[i + 1] +
808 src0[i + 2] * Src1[i + 2];
810 if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
813 return __ESIMD_NS::saturate<T1>(Result);
826 template <
typename T0,
typename T1,
int SZ,
typename U,
827 class Sat = __ESIMD_NS::saturation_off_tag>
828 ESIMD_NODEBUG ESIMD_INLINE
829 std::enable_if_t<__ESIMD_DNS::is_fp_or_dword_type<T1>::value &&
830 std::is_floating_point<T1>::value &&
831 __ESIMD_DNS::is_fp_or_dword_type<U>::value &&
832 std::is_floating_point<U>::value,
833 __ESIMD_NS::simd<T0, SZ>>
834 dp4(__ESIMD_NS::simd<T1, SZ> src0, U src1, Sat sat = {}) {
835 static_assert(SZ % 4 == 0,
"result size is not a multiple of 4");
837 __ESIMD_NS::simd<T1, SZ> Src1 = src1;
838 __ESIMD_NS::simd<float, SZ> Result;
840 for (
int i = 0; i < SZ; i += 4) {
841 Result.select<4, 1>(i) = src0[i] * Src1[i] + src0[i + 1] * Src1[i + 1] +
842 src0[i + 2] * Src1[i + 2] +
843 src0[i + 3] * Src1[i + 3];
845 if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
848 return __ESIMD_NS::saturate<T1>(Result);
861 template <
typename T,
typename U,
int SZ,
862 class Sat = __ESIMD_NS::saturation_off_tag>
863 ESIMD_NODEBUG ESIMD_INLINE
864 std::enable_if_t<__ESIMD_DNS::is_fp_or_dword_type<T>::value &&
865 std::is_floating_point<T>::value &&
866 __ESIMD_DNS::is_fp_or_dword_type<U>::value &&
867 std::is_floating_point<U>::value,
868 __ESIMD_NS::simd<T, SZ>>
869 dph(__ESIMD_NS::simd<T, SZ> src0, U src1, Sat sat = {}) {
870 static_assert(SZ % 4 == 0,
"result size is not a multiple of 4");
872 __ESIMD_NS::simd<float, SZ> Src1 = src1;
873 __ESIMD_NS::simd<float, SZ> Result;
875 for (
int i = 0; i < SZ; i += 4) {
876 Result.select<4, 1>(i) = src0[i] * Src1[i] + src0[i + 1] * Src1[i + 1] +
877 src0[i + 2] * Src1[i + 2] + 1.0 * Src1[i + 3];
879 if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
882 return __ESIMD_NS::saturate<T>(Result);
895 template <
typename T,
int SZ,
class Sat = __ESIMD_NS::saturation_off_tag>
897 ESIMD_INLINE std::enable_if_t<__ESIMD_DNS::is_fp_or_dword_type<T>::value &&
898 std::is_floating_point<T>::value,
899 __ESIMD_NS::simd<T, SZ>>
900 line(__ESIMD_NS::simd<T, 4> src0, __ESIMD_NS::simd<T, SZ> src1,
902 static_assert(SZ % 4 == 0,
"result size is not a multiple of 4");
904 __ESIMD_NS::simd<T, SZ> Src1 = src1;
905 __ESIMD_NS::simd<T, SZ> Result;
907 for (
int i = 0; i < SZ; i += 4) {
908 Result.select<4, 1>(i) = src0[0] * src1[i] + src0[3];
911 if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
914 return __ESIMD_NS::saturate<T>(Result);
928 template <
typename T,
int SZ,
class Sat = __ESIMD_NS::saturation_off_tag>
930 ESIMD_INLINE std::enable_if_t<__ESIMD_DNS::is_fp_or_dword_type<T>::value &&
931 std::is_floating_point<T>::value,
932 __ESIMD_NS::simd<T, SZ>>
933 line(
float P,
float Q, __ESIMD_NS::simd<T, SZ> src1, Sat sat = {}) {
934 __ESIMD_NS::simd<T, 4> Src0 = P;
936 return esimd::line<T>(Src0, src1, sat);
947 template <
typename T,
int SZ>
948 __ESIMD_API __ESIMD_NS::simd<T, SZ>
frc(__ESIMD_NS::simd<T, SZ> src0) {
949 __ESIMD_NS::simd<float, SZ> Src0 = src0;
950 return __esimd_frc(Src0.data());
958 template <
typename T> __ESIMD_API T
frc(T src0) {
959 __ESIMD_NS::simd<T, 1> Src0 = src0;
960 __ESIMD_NS::simd<T, 1> Result = esimd::frc<T>(Src0);
965 template <
typename RT,
typename T0,
int SZ,
966 class Sat = __ESIMD_NS::saturation_off_tag>
967 __ESIMD_API __ESIMD_NS::simd<RT, SZ>
lzd(__ESIMD_NS::simd<T0, SZ> src0,
970 __ESIMD_NS::simd<uint, SZ> Src0 = src0;
971 return __esimd_lzd<uint>(Src0.data());
974 template <
typename RT,
typename T0,
class Sat = __ESIMD_NS::saturation_off_tag>
976 ESIMD_INLINE std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<RT>::value &&
977 __ESIMD_DNS::is_esimd_scalar<T0>::value,
978 std::remove_const_t<RT>>
979 lzd(T0 src0, Sat sat = {}) {
980 __ESIMD_NS::simd<T0, 1> Src0 = src0;
981 __ESIMD_NS::simd<RT, 1> Result = esimd::lzd<RT>(Src0);
986 #if defined(ESIMD_GEN7_5) || defined(ESIMD_GEN8) || defined(ESIMD_GEN8_5) || \
987 defined(ESIMD_GEN9) || defined(ESIMD_GEN9_5)
989 template <
int SZ,
typename U,
typename V,
990 class Sat = __ESIMD_NS::saturation_off_tag>
991 __ESIMD_API __ESIMD_NS::simd<float, SZ>
lrp(__ESIMD_NS::simd<float, SZ> src0,
992 U src1, V src2, Sat sat = {}) {
993 static_assert(SZ >= 4 && (SZ & 0x3) == 0,
994 "vector size must be a multiple of 4");
995 __ESIMD_NS::simd<float, SZ> Src1 = src1;
996 __ESIMD_NS::simd<float, SZ> Src2 = src2;
997 __ESIMD_NS::simd<float, SZ> Result =
998 __esimd_lrp<SZ>(src0.data(), Src1.data(), Src2.data());
1000 if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
1003 return __ESIMD_NS::saturate<float>(Result);
1017 template <
typename T,
int SZ,
typename U,
typename V,
1018 class Sat = __ESIMD_NS::saturation_off_tag>
1019 ESIMD_NODEBUG ESIMD_INLINE
1020 std::enable_if_t<__ESIMD_DNS::is_fp_or_dword_type<T>::value &&
1021 std::is_floating_point<T>::value &&
1022 __ESIMD_DNS::is_fp_or_dword_type<U>::value &&
1023 std::is_floating_point<U>::value,
1024 __ESIMD_NS::simd<T, SZ>>
1025 lrp(__ESIMD_NS::simd<T, SZ> src0, U src1, V src2, Sat sat = {}) {
1027 __ESIMD_NS::simd<float, SZ> Src1 = src1;
1028 __ESIMD_NS::simd<float, SZ> Src2 = src2;
1029 __ESIMD_NS::simd<float, SZ> Result;
1030 Result = Src1 * src0 + Src2 * (1.0f - src0);
1031 if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
1034 return __ESIMD_NS::saturate<T>(Result);
1044 template <
typename T0,
typename T1,
int SZ>
1045 __ESIMD_API __ESIMD_NS::simd<T0, SZ>
bf_reverse(__ESIMD_NS::simd<T1, SZ> src0) {
1046 __ESIMD_NS::simd<unsigned, SZ> Src0 = src0;
1047 return __esimd_bfrev<unsigned>(Src0.data());
1051 template <
typename T0,
typename T1>
1053 ESIMD_INLINE std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<T0>::value &&
1054 __ESIMD_DNS::is_esimd_scalar<T1>::value,
1055 std::remove_const_t<T0>>
1057 __ESIMD_NS::simd<T1, 1> Src0 = src0;
1058 __ESIMD_NS::simd<T0, 1> Result = esimd::bf_reverse<T0>(Src0);
1063 template <
typename T0,
typename T1,
int SZ,
typename U,
typename V,
typename W>
1064 ESIMD_NODEBUG ESIMD_INLINE
1065 std::enable_if_t<std::is_integral<T1>::value, __ESIMD_NS::simd<T0, SZ>>
1066 bf_insert(U src0, V src1, W src2, __ESIMD_NS::simd<T1, SZ> src3) {
1067 typedef typename __ESIMD_DNS::dword_type<T1> DT1;
1068 static_assert(std::is_integral<DT1>::value &&
sizeof(DT1) ==
sizeof(
int),
1069 "operand conversion failed");
1070 __ESIMD_NS::simd<DT1, SZ> Src0 = src0;
1071 __ESIMD_NS::simd<DT1, SZ> Src1 = src1;
1072 __ESIMD_NS::simd<DT1, SZ> Src2 = src2;
1073 __ESIMD_NS::simd<DT1, SZ> Src3 = src3;
1075 return __esimd_bfi<DT1>(Src0.data(), Src1.data(), Src2.data(), Src3.data());
1079 template <
typename T0,
typename T1,
typename T2,
typename T3,
typename T4>
1081 ESIMD_INLINE std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<T0>::value &&
1082 __ESIMD_DNS::is_esimd_scalar<T4>::value,
1083 std::remove_const_t<T0>>
1085 __ESIMD_NS::simd<T4, 1> Src3 = src3;
1086 __ESIMD_NS::simd<T0, 1> Result = esimd::bf_insert<T0>(src0, src1, src2, Src3);
1091 template <
typename T0,
typename T1,
int SZ,
typename U,
typename V>
1092 ESIMD_NODEBUG ESIMD_INLINE
1093 std::enable_if_t<std::is_integral<T1>::value, __ESIMD_NS::simd<T0, SZ>>
1095 typedef typename __ESIMD_DNS::dword_type<T1> DT1;
1096 static_assert(std::is_integral<DT1>::value &&
sizeof(DT1) ==
sizeof(
int),
1097 "operand conversion failed");
1098 __ESIMD_NS::simd<DT1, SZ> Src0 = src0;
1099 __ESIMD_NS::simd<DT1, SZ> Src1 = src1;
1100 __ESIMD_NS::simd<DT1, SZ> Src2 = src2;
1102 return __esimd_sbfe<DT1>(Src0.data(), Src1.data(), Src2.data());
1106 template <
typename T0,
typename T1,
typename T2,
typename T3>
1108 ESIMD_INLINE std::enable_if_t<__ESIMD_DNS::is_esimd_scalar<T0>::value &&
1109 __ESIMD_DNS::is_esimd_scalar<T3>::value,
1110 std::remove_const_t<T0>>
1112 __ESIMD_NS::simd<T3, 1> Src2 = src2;
1113 __ESIMD_NS::simd<T0, 1> Result = esimd::bf_extract<T0>(src0, src1, Src2);
1123 template <
int SZ,
typename U,
class Sat = __ESIMD_NS::saturation_off_tag>
1124 __ESIMD_API __ESIMD_NS::simd<float, SZ>
1125 sincos(__ESIMD_NS::simd<float, SZ> &dstcos, U src0, Sat sat = {}) {
1134 constexpr
double HDR_CONST_PI = 3.1415926535897932384626433832795;
1138 template <
typename T,
int SZ>
1139 ESIMD_NODEBUG ESIMD_INLINE
1140 std::enable_if_t<std::is_floating_point<T>::value, __ESIMD_NS::simd<T, SZ>>
1141 atan(__ESIMD_NS::simd<T, SZ> src0) {
1144 __ESIMD_NS::simd_mask<SZ> Neg = src0 < T(0.0);
1145 __ESIMD_NS::simd_mask<SZ> Gt1 = Src0 > T(1.0);
1149 __ESIMD_NS::simd<T, SZ> Src0P2 = Src0 * Src0;
1150 __ESIMD_NS::simd<T, SZ> Src0P4 = Src0P2 * Src0P2;
1152 __ESIMD_NS::simd<T, SZ> Result =
1153 (Src0P4 * T(0.185696) + ((Src0 * T(0.787997) + T(0.63693)) * Src0P2) +
1155 (((((Src0 * -T(0.000121387) + T(0.00202308)) * Src0P2) +
1156 (Src0 * -T(0.0149145)) + T(0.182569)) *
1158 ((Src0 * T(0.395889) + T(1.12158)) * Src0P2) + (Src0 * T(0.636918)) +
1161 Result.merge(Result - T(detail::HDR_CONST_PI / 2.0), Gt1);
1162 Result.merge(Result, Neg);
1166 template <
typename T>
1167 __ESIMD_API std::enable_if_t<std::is_floating_point<T>::value, T>
atan(T src0) {
1168 __ESIMD_NS::simd<T, 1> Src0 = src0;
1169 __ESIMD_NS::simd<T, 1> Result =
esimd::atan(Src0);
1175 template <
typename T,
int SZ>
1176 ESIMD_NODEBUG ESIMD_INLINE
1177 std::enable_if_t<std::is_floating_point<T>::value, __ESIMD_NS::simd<T, SZ>>
1178 acos(__ESIMD_NS::simd<T, SZ> src0) {
1181 __ESIMD_NS::simd_mask<SZ> Neg = src0 < T(0.0);
1182 __ESIMD_NS::simd_mask<SZ> TooBig = Src0 >= T(0.999998);
1186 Src0.merge(T(0.0), TooBig);
1188 __ESIMD_NS::simd<T, SZ> Src01m = T(1.0) - Src0;
1190 __ESIMD_NS::simd<T, SZ> Src0P2 = Src01m * Src01m;
1191 __ESIMD_NS::simd<T, SZ> Src0P4 = Src0P2 * Src0P2;
1193 __ESIMD_NS::simd<T, SZ> Result =
1194 (((Src01m * T(0.015098965761299077) - T(0.005516443930088506)) * Src0P4) +
1195 ((Src01m * T(0.047654245891495528) + T(0.163910606547823220)) * Src0P2) +
1196 Src01m * T(2.000291665285952400) - T(0.000007239283986332)) *
1199 Result.merge(T(0.0), TooBig);
1200 Result.merge(T(detail::HDR_CONST_PI) - Result, Neg);
1204 template <
typename T>
1205 __ESIMD_API std::enable_if_t<std::is_floating_point<T>::value, T>
acos(T src0) {
1206 __ESIMD_NS::simd<T, 1> Src0 = src0;
1207 __ESIMD_NS::simd<T, 1> Result =
esimd::acos(Src0);
1213 template <
typename T,
int SZ>
1214 ESIMD_NODEBUG ESIMD_INLINE
1215 std::enable_if_t<std::is_floating_point<T>::value, __ESIMD_NS::simd<T, SZ>>
1216 asin(__ESIMD_NS::simd<T, SZ> src0) {
1217 __ESIMD_NS::simd_mask<SZ> Neg = src0 < T(0.0);
1219 __ESIMD_NS::simd<T, SZ> Result =
1222 Result.merge(-Result, Neg);
1226 template <
typename T>
1227 __ESIMD_API std::enable_if_t<std::is_floating_point<T>::value, T>
asin(T src0) {
1228 __ESIMD_NS::simd<T, 1> Src0 = src0;
1229 __ESIMD_NS::simd<T, 1> Result =
esimd::asin(Src0);
1240 __ESIMD_NS::simd<float, N>
atan2_fast(__ESIMD_NS::simd<float, N> y,
1241 __ESIMD_NS::simd<float, N> x);
1243 template <
typename T>
float atan2_fast(T y, T x);
1248 __ESIMD_NS::simd<float, N>
atan2(__ESIMD_NS::simd<float, N> y,
1249 __ESIMD_NS::simd<float, N> x);
1251 template <
typename T>
float atan2(T y, T x);
1256 __ESIMD_NS::simd<float, N>
fmod(__ESIMD_NS::simd<float, N> y,
1257 __ESIMD_NS::simd<float, N> x);
1259 template <
typename T>
float fmod(T y, T x);
1264 __ESIMD_NS::simd<float, N>
sin_emu(__ESIMD_NS::simd<float, N> x);
1266 template <
typename T>
float sin_emu(T x);
1271 __ESIMD_NS::simd<float, N>
cos_emu(__ESIMD_NS::simd<float, N> x);
1274 template <
typename T>
float cos_emu(T x);
1281 __ESIMD_NS::simd<float, N>
tanh_cody_waite(__ESIMD_NS::simd<float, N> x);
1284 float tanh(
float x);
1286 template <
int N> __ESIMD_NS::simd<float, N>
tanh(__ESIMD_NS::simd<float, N> x);
1294 static auto constexpr CONST_PI = 3.14159f;
1295 static auto constexpr CMPI = 3.14159265f;
1302 ESIMD_INLINE __ESIMD_NS::simd<float, N>
1303 atan2_fast(__ESIMD_NS::simd<float, N> y, __ESIMD_NS::simd<float, N> x) {
1304 __ESIMD_NS::simd<float, N> a0;
1305 __ESIMD_NS::simd<float, N> a1;
1306 __ESIMD_NS::simd<float, N>
atan2;
1308 __ESIMD_NS::simd_mask<N> mask = (y >= 0.0f);
1309 a0.merge(detail::CONST_PI * 0.5f, detail::CONST_PI * 1.5f, mask);
1310 a1.merge(0, detail::CONST_PI * 2.0f, mask);
1312 a1.merge(detail::CONST_PI, x < 0.0f);
1314 __ESIMD_NS::simd<float, N> xy = x * y;
1315 __ESIMD_NS::simd<float, N> x2 = x * x;
1316 __ESIMD_NS::simd<float, N> y2 = y * y;
1319 constexpr
auto CONST_DBL_EPSILON = 0.00001f;
1321 a0 -= (xy / (y2 + x2 * 0.28f + CONST_DBL_EPSILON));
1322 a1 += (xy / (x2 + y2 * 0.28f + CONST_DBL_EPSILON));
1324 atan2.merge(a1, a0, y2 <= x2);
1330 __ESIMD_NS::simd<float, 1> vy = y;
1331 __ESIMD_NS::simd<float, 1> vx = x;
1339 ESIMD_INLINE __ESIMD_NS::simd<float, N>
atan2(__ESIMD_NS::simd<float, N> y,
1340 __ESIMD_NS::simd<float, N> x) {
1341 __ESIMD_NS::simd<float, N> v_distance;
1342 __ESIMD_NS::simd<float, N> v_y0;
1343 __ESIMD_NS::simd<float, N>
atan2;
1344 __ESIMD_NS::simd_mask<N> mask;
1347 v_y0.merge(detail::CONST_PI, 0, mask);
1349 mask = (__ESIMD_NS::abs<float>(y) < 0.000001f);
1355 template <> ESIMD_INLINE
float atan2(
float y,
float x) {
1358 __ESIMD_NS::simd<float, 1>
atan2;
1359 __ESIMD_NS::simd_mask<1> mask;
1362 v_y0 = mask[0] ? detail::CONST_PI : 0;
1363 v_distance = __ESIMD_NS::sqrt<float>(x * x + y * y);
1364 mask = (__ESIMD_NS::abs<float>(y) < 0.000001f);
1372 ESIMD_INLINE __ESIMD_NS::simd<float, N>
fmod(__ESIMD_NS::simd<float, N> y,
1373 __ESIMD_NS::simd<float, N> x) {
1376 auto fmod_sign_mask = (y.template bit_cast_view<int32_t>()) & 0x80000000;
1378 __ESIMD_NS::simd<float, N> reminder =
1379 abs_y - abs_x * __ESIMD_NS::trunc<float>(abs_y / abs_x);
1381 abs_x.merge(0.0, reminder >= 0);
1382 __ESIMD_NS::simd<float, N>
fmod = reminder + abs_x;
1386 (fmod_abs.template bit_cast_view<int32_t>()) | fmod_sign_mask;
1387 return fmod_bits.template bit_cast_view<float>();
1391 template <> ESIMD_INLINE
float fmod(
float y,
float x) {
1392 return fmod(__ESIMD_NS::simd<float, 1>(y), __ESIMD_NS::simd<float, 1>(x))[0];
1398 ESIMD_INLINE __ESIMD_NS::simd<float, N>
sin_emu(__ESIMD_NS::simd<float, N> x) {
1399 __ESIMD_NS::simd<float, N> x1;
1400 __ESIMD_NS::simd<float, N> x2;
1401 __ESIMD_NS::simd<float, N> t3;
1403 __ESIMD_NS::simd<float, N>
sign;
1404 __ESIMD_NS::simd<float, N> fTrig;
1405 __ESIMD_NS::simd<float, N> TwoPI(6.2831853f);
1406 __ESIMD_NS::simd<float, N> CmpI(detail::CMPI);
1407 __ESIMD_NS::simd<float, N> OneP(1.f);
1408 __ESIMD_NS::simd<float, N> OneN(-1.f);
1412 x1.merge(CmpI - x, x - CmpI, (x <= detail::CMPI));
1413 x1.merge(x, (x <= detail::CMPI * 0.5f));
1414 x1.merge(CmpI * 2 - x, (x > detail::CMPI * 1.5f));
1416 sign.merge(OneN, OneP, (x > detail::CMPI));
1419 t3 = x2 * x1 * 0.1666667f;
1422 x1 + t3 * (OneN + x2 * 0.05f *
1423 (OneP + x2 * 0.0238095f *
1424 (OneN + x2 * 0.0138889f *
1425 (OneP - x2 * 0.0090909f))));
1431 template <
typename T> ESIMD_INLINE
float sin_emu(T x0) {
1432 __ESIMD_NS::simd<float, 1> x1;
1433 __ESIMD_NS::simd<float, 1> x2;
1434 __ESIMD_NS::simd<float, 1> t3;
1436 __ESIMD_NS::simd<float, 1>
sign;
1437 __ESIMD_NS::simd<float, 1> fTrig;
1438 float TwoPI = detail::CMPI * 2.0f;
1440 __ESIMD_NS::simd<float, 1> x =
esimd::fmod(x0, TwoPI);
1442 __ESIMD_NS::simd<float, 1> CmpI(detail::CMPI);
1443 __ESIMD_NS::simd<float, 1> OneP(1.f);
1444 __ESIMD_NS::simd<float, 1> OneN(-1.f);
1446 x1.merge(CmpI - x, x - CmpI, (x <= detail::CMPI));
1447 x1.merge(x, (x <= detail::CMPI * 0.5f));
1448 x1.merge(CmpI * 2.0f - x, (x > detail::CMPI * 1.5f));
1450 sign.merge(OneN, OneP, (x > detail::CMPI));
1453 t3 = x2 * x1 * 0.1666667f;
1456 x1 + t3 * (OneN + x2 * 0.05f *
1457 (OneP + x2 * 0.0238095f *
1458 (OneN + x2 * 0.0138889f *
1459 (OneP - x2 * 0.0090909f))));
1467 ESIMD_INLINE __ESIMD_NS::simd<float, N>
cos_emu(__ESIMD_NS::simd<float, N> x) {
1468 __ESIMD_NS::simd<float, N> x1;
1469 __ESIMD_NS::simd<float, N> x2;
1470 __ESIMD_NS::simd<float, N> t2;
1471 __ESIMD_NS::simd<float, N> t3;
1473 __ESIMD_NS::simd<float, N>
sign;
1474 __ESIMD_NS::simd<float, N> fTrig;
1475 __ESIMD_NS::simd<float, N> TwoPI(6.2831853f);
1476 __ESIMD_NS::simd<float, N> CmpI(detail::CMPI);
1477 __ESIMD_NS::simd<float, N> OneP(1.f);
1478 __ESIMD_NS::simd<float, N> OneN(-1.f);
1482 x1.merge(x - detail::CMPI * 0.5f, CmpI * 1.5f - x, (x <= detail::CMPI));
1483 x1.merge(CmpI * 0.5f - x, (x <= detail::CMPI * 0.5f));
1484 x1.merge(x - detail::CMPI * 1.5f, (x > detail::CMPI * 1.5f));
1486 sign.merge(1, -1, ((x < detail::CMPI * 0.5f) | (x >= detail::CMPI * 1.5f)));
1489 t3 = x2 * x1 * 0.1666667f;
1491 x1 + t3 * (OneN + x2 * 0.05f *
1492 (OneP + x2 * 0.0238095f *
1493 (OneN + x2 * 0.0138889f *
1494 (OneP - x2 * 0.0090909f))));
1500 template <
typename T> ESIMD_INLINE
float cos_emu(T x0) {
1501 __ESIMD_NS::simd<float, 1> x1;
1502 __ESIMD_NS::simd<float, 1> x2;
1503 __ESIMD_NS::simd<float, 1> t3;
1505 __ESIMD_NS::simd<float, 1>
sign;
1506 __ESIMD_NS::simd<float, 1> fTrig;
1507 float TwoPI = detail::CMPI * 2.0f;
1509 __ESIMD_NS::simd<float, 1> x =
esimd::fmod(x0, TwoPI);
1511 __ESIMD_NS::simd<float, 1> CmpI(detail::CMPI);
1512 __ESIMD_NS::simd<float, 1> OneP(1.f);
1513 __ESIMD_NS::simd<float, 1> OneN(-1.f);
1515 x1.merge(x - detail::CMPI * 0.5f, CmpI * 1.5f - x, (x <= detail::CMPI));
1516 x1.merge(CmpI * 0.5f - x, (x <= detail::CMPI * 0.5f));
1517 x1.merge(x - detail::CMPI * 1.5f, (x > detail::CMPI * 1.5f));
1519 sign.merge(OneP, OneN,
1520 ((x < detail::CMPI * 0.5f) | (x >= detail::CMPI * 1.5f)));
1523 t3 = x2 * x1 * 0.1666667f;
1525 x1 + t3 * (OneN + x2 * 0.05f *
1526 (OneP + x2 * 0.0238095f *
1527 (OneN + x2 * 0.0138889f *
1528 (OneP - x2 * 0.0090909f))));
1537 ESIMD_INLINE __ESIMD_NS::simd<float, N>
1538 tanh_cody_waite_impl(__ESIMD_NS::simd<float, N> x) {
1549 constexpr
float p0 = -0.8237728127E+00f;
1550 constexpr
float p1 = -0.3831010665E-02f;
1551 constexpr
float q0 = 0.2471319654E+01f;
1552 constexpr
float q1 = 1.0000000000E+00f;
1553 constexpr
float xsmall = 4.22863966691620432990E-04f;
1554 constexpr
float xmedium = 0.54930614433405484570f;
1555 constexpr
float xlarge = 8.66433975699931636772f;
1556 constexpr
float log2E = 1.442695f;
1558 using RT = __ESIMD_NS::simd<float, N>;
1564 sign.merge(-1.f, 1.f, x < 0.f);
1566 auto isLarge = absX > xlarge;
1567 auto minor = absX <= xlarge;
1568 auto isGtMed = minor & (absX > xmedium);
1569 auto isGtSmall = (absX > xsmall) & (absX <= xmedium);
1572 res.merge(
sign, x, isLarge);
1574 temp = ((temp - 2.f) / temp) *
sign;
1575 res.merge(temp, isGtMed);
1576 res.merge((absX + absX * g * (g * p1 + p0) / (g + q0)) *
sign, isGtSmall);
1582 ESIMD_INLINE __ESIMD_NS::simd<float, N>
1583 tanh_impl(__ESIMD_NS::simd<float, N> x) {
1590 constexpr
float xsmall = 0.000045f;
1591 constexpr
float xlarge = 88.f;
1592 constexpr
float log2E = 1.442695f;
1594 using RT = __ESIMD_NS::simd<float, N>;
1599 sign.merge(-1.f, 1.f, x < 0.f);
1601 auto isLarge = (absX > xlarge);
1602 auto isLessE = (absX <= xlarge);
1605 res.merge(
sign, x, isLarge);
1610 res.merge(((
exp - 1.f) / (
exp + 1.f)) *
sign, (absX > xsmall) & isLessE);
1620 return detail::tanh_cody_waite_impl(__ESIMD_NS::simd<float, 1>(x))[0];
1624 ESIMD_INLINE __ESIMD_NS::simd<float, N>
1626 return detail::tanh_cody_waite_impl(x);
1632 return esimd::detail::tanh_impl(__ESIMD_NS::simd<float, 1>(x))[0];
1636 ESIMD_INLINE __ESIMD_NS::simd<float, N>
tanh(__ESIMD_NS::simd<float, N> x) {
1637 return esimd::detail::tanh_impl(x);
1640 template <
typename T,
int N>
1641 __ESIMD_NS::simd<T, N>
dp4(__ESIMD_NS::simd<T, N> v1,
1642 __ESIMD_NS::simd<T, N> v2) {
1643 auto retv = __esimd_dp4<T, N>(v1.data(), v2.data());
1653 enum class dpas_ops_per_channel : unsigned {
1658 INVALID = 0xffffffffu
1660 constexpr dpas_ops_per_channel
1663 if ((src1_precision == argument_type::U8) ||
1664 (src1_precision == argument_type::S8)) {
1665 if ((src2_precision == argument_type::U8) ||
1666 (src2_precision == argument_type::S8) ||
1667 (src2_precision == argument_type::U4) ||
1668 (src2_precision == argument_type::S4) ||
1669 (src2_precision == argument_type::U2) ||
1670 (src2_precision == argument_type::S2)) {
1671 return dpas_ops_per_channel::OP4;
1673 }
else if ((src1_precision == argument_type::U4) ||
1674 (src1_precision == argument_type::S4) ||
1675 (src1_precision == argument_type::U2) ||
1676 (src1_precision == argument_type::S2)) {
1677 if ((src2_precision == argument_type::U8) ||
1678 (src2_precision == argument_type::S8)) {
1679 return dpas_ops_per_channel::OP4;
1680 }
else if ((src2_precision == argument_type::U4) ||
1681 (src2_precision == argument_type::S4) ||
1682 (src2_precision == argument_type::U2) ||
1683 (src2_precision == argument_type::S2)) {
1684 return dpas_ops_per_channel::OP8;
1686 }
else if ((src1_precision == argument_type::BF16) &&
1687 (src2_precision == argument_type::BF16)) {
1688 return dpas_ops_per_channel::OP2;
1689 }
else if ((src1_precision == argument_type::FP16) &&
1690 (src2_precision == argument_type::FP16)) {
1691 return dpas_ops_per_channel::OP2;
1692 }
else if ((src1_precision == argument_type::TF32) &&
1693 (src2_precision == argument_type::TF32)) {
1694 return dpas_ops_per_channel::OP1;
1696 return dpas_ops_per_channel::INVALID;
1699 constexpr
unsigned get_precision_bits(
argument_type src_precision) {
1700 if ((src_precision == argument_type::U8) ||
1701 (src_precision == argument_type::S8)) {
1703 }
else if ((src_precision == argument_type::U4) ||
1704 (src_precision == argument_type::S4)) {
1706 }
else if ((src_precision == argument_type::U2) ||
1707 (src_precision == argument_type::S2)) {
1709 }
else if ((src_precision == argument_type::BF16) ||
1710 (src_precision == argument_type::FP16)) {
1712 }
else if (src_precision == argument_type::TF32) {
1738 typename T,
int systolic_depth,
int repeat_count,
typename T0,
1739 typename T1,
typename T2,
int N,
int N1,
int N2,
1740 typename Sat = __ESIMD_NS::saturation_off_tag>
1741 __ESIMD_API __ESIMD_NS::simd<T, N>
1742 dpas(__ESIMD_NS::simd<T0, N> src0, __ESIMD_NS::simd<T1, N1> src1,
1743 __ESIMD_NS::simd<T2, N2> src2, Sat sat = {}) {
1749 constexpr
bool check_integer =
1750 detail::is_one_of_v<T, unsigned int, int> &&
1751 detail::is_one_of_v<T0, unsigned int, int> &&
1752 detail::is_one_of_enum_v<
argument_type, src1_precision, argument_type::S8,
1753 argument_type::U8, argument_type::U4,
1754 argument_type::S4, argument_type::U2,
1755 argument_type::S2> &&
1756 detail::is_one_of_enum_v<
argument_type, src2_precision, argument_type::S8,
1757 argument_type::U8, argument_type::U4,
1758 argument_type::S4, argument_type::U2,
1761 constexpr
bool check_bf16 =
1762 detail::is_one_of_v<T, float, short> &&
1763 detail::is_one_of_v<T0, float, short> &&
1765 argument_type::BF16> &&
1767 argument_type::BF16>;
1769 constexpr
bool check_hf =
1770 detail::is_one_of_v<T, float, half> &&
1771 detail::is_one_of_v<T0, float, half> &&
1773 argument_type::FP16> &&
1775 argument_type::FP16>;
1777 #if defined(ESIMD_XE_HPC) || defined(ESIMD_XE_HPG)
1779 constexpr
bool check_tf32 =
1780 detail::is_one_of_v<T, float> && detail::is_one_of_v<T0, float> &&
1782 argument_type::TF32> &&
1784 argument_type::TF32>;
1785 #endif // defined(ESIMD_XE_HPC) || defined(ESIMD_XE_HPG)
1787 #if defined(ESIMD_XE_HPC) || defined(ESIMD_XE_HPG)
1788 constexpr
bool check_passed =
1789 (check_integer || check_hf || check_bf16 || check_tf32);
1790 static_assert(check_passed,
1791 "unsupported dpas type! The supported types are:\n"
1792 " dst | src0 | src1 | src2 \n"
1793 " ud, d | ud, d | ub, b | ub, b \n"
1794 " ud, d | ud, d | u4, s4, u2, s2 | u4, s4, u2, s2 \n"
1795 " f, bf | f, bf | bf | bf \n"
1796 " f, hf | f, hf | hf | hf \n"
1797 " f | f | tf32 | tf32 \n");
1798 #else // else defined(ESIMD_XE_HPC) || defined(ESIMD_XE_HPG)
1799 constexpr
bool check_passed = (check_integer || check_hf || check_bf16);
1800 static_assert(check_passed,
1801 "unsupported dpas type! The supported types are:\n"
1802 " dst | src0 | src1 | src2 \n"
1803 " ud, d | ud, d | ub, b | ub, b \n"
1804 " ud, d | ud, d | u4, s4, u2, s2 | u4, s4, u2, s2 \n"
1805 " f, bf | f, bf | bf | bf \n"
1806 " f, hf | f, hf | hf | hf \n");
1807 #endif // end else defined(ESIMD_XE_HPC) || defined(ESIMD_XE_HPG)
1809 static_assert(__ESIMD_DNS::is_dword_type<T1>::value,
1810 "Src1 must be DWORD type");
1811 static_assert(__ESIMD_DNS::is_dword_type<T2>::value,
1812 "Src2 must be DWORD type");
1814 #if defined(ESIMD_XE_HPC) || defined(ESIMD_XE_HPG)
1815 static_assert((N == 16 * repeat_count),
"Execution size on PVC must be 16");
1817 static_assert((N == 8 * repeat_count),
"Execution size must be 8");
1820 static_assert((systolic_depth == 8) || (systolic_depth == 4),
1821 "systolic_depth must be 8 or 4");
1823 static_assert((repeat_count >= 1) && (repeat_count <= 8),
1824 "repeat_count must be within 1 to 8");
1826 constexpr
auto en_ops_per_channel =
1827 detail::get_ops_per_channel(src1_precision, src2_precision);
1828 static_assert(en_ops_per_channel != detail::dpas_ops_per_channel::INVALID,
1829 "invalid combination of Src1/Src2 precision");
1830 constexpr
auto ops_per_channel =
static_cast<unsigned>(en_ops_per_channel);
1832 constexpr
auto src1_precision_bits =
1833 detail::get_precision_bits(src1_precision);
1835 N1 == ((src1_precision_bits * systolic_depth * ops_per_channel * N) /
1836 (repeat_count *
sizeof(T1) * 8)),
1837 "invalid size for Src1");
1839 constexpr
auto src2_precision_bits =
1840 detail::get_precision_bits(src2_precision);
1841 static_assert(N2 == ((src2_precision_bits * systolic_depth * ops_per_channel *
1844 "invalid size for Src2");
1846 #if defined(__SYCL_DEVICE_ONLY__)
1847 constexpr
int dst_signed = std::is_signed<T>::value;
1848 constexpr
int src0_signed = std::is_signed<T0>::value;
1849 __ESIMD_NS::simd<T, N> result = __esimd_dpas<T, T0, T1, T2, N, N1, N2>(
1850 src0.data(), src1.data(), src2.data(), (
int)src1_precision + 1,
1851 (
int)src2_precision + 1, systolic_depth, repeat_count, dst_signed,
1855 __ESIMD_NS::simd<T, N> result =
1856 __esimd_dpas<src1_precision, src2_precision, systolic_depth, repeat_count,
1857 T, T0, T1, T2, N, N1, N2>(src0.data(), src1.data(),
1859 #endif // __SYCL_DEVICE_ONLY__
1861 if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
1864 return __ESIMD_NS::saturate<T>(result);
1878 int systolic_depth,
int repeat_count,
typename T,
typename T1,
1879 typename T2,
int N,
int N1,
int N2,
1880 typename Sat = __ESIMD_NS::saturation_off_tag>
1881 __ESIMD_API __ESIMD_NS::simd<T, N>
1882 dpas(__ESIMD_NS::simd<T, N> src0, __ESIMD_NS::simd<T1, N1> src1,
1883 __ESIMD_NS::simd<T2, N2> src2, Sat sat = {}) {
1884 return dpas<src1_precision, src2_precision, T, systolic_depth, repeat_count>(
1885 src0, src1, src2, sat);
1897 int systolic_depth,
int repeat_count,
typename T,
typename T1,
1898 typename T2,
int N,
int N1,
int N2,
1899 typename Sat = __ESIMD_NS::saturation_off_tag>
1900 __ESIMD_API __ESIMD_NS::simd<T, N>
dpas(__ESIMD_NS::simd<T1, N1> src1,
1901 __ESIMD_NS::simd<T2, N2> src2,
1904 static_assert(__ESIMD_DNS::is_fp_or_dword_type<T>::value,
1905 "Dst must be FP or DWORD type");
1907 static_assert(__ESIMD_DNS::is_dword_type<T1>::value,
1908 "Src1 must be DWORD type");
1910 static_assert(__ESIMD_DNS::is_dword_type<T2>::value,
1911 "Src2 must be DWORD type");
1913 static_assert((N == 8 * repeat_count) || (N == 16 * repeat_count),
1914 "Execution size must be 8 or 16");
1916 static_assert((systolic_depth == 8) || (systolic_depth == 4),
1917 "systolic_depth must be 8 or 4");
1919 static_assert((repeat_count >= 1) && (repeat_count <= 8),
1920 "repeat_count must be within 1 to 8");
1922 constexpr
auto en_ops_per_channel =
1923 detail::get_ops_per_channel(src1_precision, src2_precision);
1924 static_assert(en_ops_per_channel != detail::dpas_ops_per_channel::INVALID,
1925 "invalid combination of Src1/Src2 precision");
1926 constexpr
auto ops_per_channel =
static_cast<unsigned>(en_ops_per_channel);
1928 constexpr
auto src1_precision_bits =
1929 detail::get_precision_bits(src1_precision);
1931 N1 == ((src1_precision_bits * systolic_depth * ops_per_channel * N) /
1932 (repeat_count *
sizeof(T1) * 8)),
1933 "invalid size for Src1");
1935 constexpr
auto src2_precision_bits =
1936 detail::get_precision_bits(src2_precision);
1937 static_assert(N2 == ((src2_precision_bits * systolic_depth * ops_per_channel *
1940 "invalid size for Src2");
1942 #if defined(__SYCL_DEVICE_ONLY__)
1943 int dpas_info = (repeat_count << 24) + (systolic_depth << 16) +
1944 (((int)src2_precision + 1) << 8) + ((
int)src1_precision + 1);
1945 __ESIMD_NS::simd<T, N> result =
1946 __esimd_dpas2<T, T1, T2, N, N1, N2>(src1.data(), src2.data(), dpas_info);
1948 __ESIMD_NS::simd<T, N> result =
1949 __esimd_dpas2<src1_precision, src2_precision, systolic_depth,
1950 repeat_count, T, T1, T2, N, N1, N2>(src1.data(),
1952 #endif // __SYCL_DEVICE_ONLY__
1954 if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
1957 return __ESIMD_NS::saturate<T>(result);
1971 int systolic_depth,
int repeat_count,
typename T,
typename T1,
1972 typename T2,
int N,
int N1,
int N2,
1973 typename Sat = __ESIMD_NS::saturation_off_tag>
1974 __ESIMD_API __ESIMD_NS::simd<T, N>
1975 dpasw(__ESIMD_NS::simd<T, N> src0, __ESIMD_NS::simd<T1, N1> src1,
1976 __ESIMD_NS::simd<T2, N2> src2, Sat sat = {}) {
1977 constexpr
bool is_4xhf =
1978 (__ESIMD_DNS::is_type<T, cl::sycl::detail::half_impl::StorageT>()) &&
1979 src1_precision == src2_precision && src1_precision == argument_type::FP16;
1981 constexpr
bool is_4xbf = __ESIMD_DNS::is_word_type<T>::value &&
1982 src1_precision == src2_precision &&
1983 src1_precision == argument_type::BF16;
1985 constexpr
bool is_common_dpas = __ESIMD_DNS::is_fp_or_dword_type<T>::value;
1987 static_assert((is_4xhf || is_4xbf || is_common_dpas),
1988 "unsupported dpas type");
1990 static_assert(__ESIMD_DNS::is_dword_type<T1>::value,
1991 "Src1 must be DWORD type");
1993 static_assert(__ESIMD_DNS::is_dword_type<T2>::value,
1994 "Src2 must be DWORD type");
1996 static_assert((N == 8 * repeat_count) || (N == 16 * repeat_count),
1997 "Execution size must be 8 or 16");
1999 static_assert((systolic_depth == 8) || (systolic_depth == 4),
2000 "systolic_depth must be 8 or 4");
2002 static_assert((repeat_count >= 1) && (repeat_count <= 8),
2003 "repeat_count must be within 1 to 8");
2005 constexpr
auto en_ops_per_channel =
2006 detail::get_ops_per_channel(src1_precision, src2_precision);
2007 static_assert(en_ops_per_channel != detail::dpas_ops_per_channel::INVALID,
2008 "invalid combination of Src1/Src2 precision");
2009 constexpr
auto ops_per_channel =
static_cast<unsigned>(en_ops_per_channel);
2011 constexpr
auto src1_precision_bits =
2012 detail::get_precision_bits(src1_precision);
2014 N1 == ((src1_precision_bits * systolic_depth * ops_per_channel * N) /
2015 (repeat_count *
sizeof(T1) * 8)),
2016 "invalid size for Src1");
2018 constexpr
auto src2_precision_bits =
2019 detail::get_precision_bits(src2_precision);
2020 static_assert(N2 == ((src2_precision_bits * systolic_depth * ops_per_channel *
2021 ((repeat_count + 1) / 2)) /
2023 "invalid size for Src2");
2025 #if defined(__SYCL_DEVICE_ONLY__)
2026 int dpas_info = (repeat_count << 24) + (systolic_depth << 16) +
2027 (((int)src2_precision + 1) << 8) + ((
int)src1_precision + 1);
2028 __ESIMD_NS::simd<T, N> result = __esimd_dpasw<T, T1, T2, N, N1, N2>(
2029 src0.data(), src1.data(), src2.data(), dpas_info);
2031 __ESIMD_NS::simd<T, N> result =
2032 __esimd_dpasw<src1_precision, src2_precision, systolic_depth,
2033 repeat_count, T, T1, T2, N, N1, N2>(
2034 src0.data(), src1.data(), src2.data());
2035 #endif // __SYCL_DEVICE_ONLY__
2037 if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
2040 return __ESIMD_NS::saturate<T>(result);
2052 int systolic_depth,
int repeat_count,
typename T,
typename T1,
2053 typename T2,
int N,
int N1,
int N2,
2054 typename Sat = __ESIMD_NS::saturation_off_tag>
2055 __ESIMD_API __ESIMD_NS::simd<T, N>
dpasw2(__ESIMD_NS::simd<T1, N1> src1,
2056 __ESIMD_NS::simd<T2, N2> src2,
2058 constexpr
bool is_4xhf =
2059 (__ESIMD_DNS::is_type<T, cl::sycl::detail::half_impl::StorageT>()) &&
2060 src1_precision == src2_precision && src1_precision == argument_type::FP16;
2062 constexpr
bool is_4xbf = __ESIMD_DNS::is_word_type<T>::value &&
2063 src1_precision == src2_precision &&
2064 src1_precision == argument_type::BF16;
2066 constexpr
bool is_common_dpas = __ESIMD_DNS::is_fp_or_dword_type<T>::value;
2068 static_assert((is_4xhf || is_4xbf || is_common_dpas),
2069 "unsupported dpas type");
2071 static_assert(__ESIMD_DNS::is_dword_type<T1>::value,
2072 "Src1 must be DWORD type");
2074 static_assert(__ESIMD_DNS::is_dword_type<T2>::value,
2075 "Src2 must be DWORD type");
2077 static_assert((N == 8 * repeat_count) || (N == 16 * repeat_count),
2078 "Execution size must be 8 or 16");
2080 static_assert((systolic_depth == 8) || (systolic_depth == 4),
2081 "systolic_depth must be 8 or 4");
2083 static_assert((repeat_count >= 1) && (repeat_count <= 8),
2084 "repeat_count must be within 1 to 8");
2086 constexpr
auto en_ops_per_channel =
2087 detail::get_ops_per_channel(src1_precision, src2_precision);
2088 static_assert(en_ops_per_channel != detail::dpas_ops_per_channel::INVALID,
2089 "invalid combination of Src1/Src2 precision");
2090 constexpr
auto ops_per_channel =
static_cast<unsigned>(en_ops_per_channel);
2092 constexpr
auto src1_precision_bits =
2093 detail::get_precision_bits(src1_precision);
2095 N1 == ((src1_precision_bits * systolic_depth * ops_per_channel * N) /
2096 (repeat_count *
sizeof(T1) * 8)),
2097 "invalid size for Src1");
2099 constexpr
auto src2_precision_bits =
2100 detail::get_precision_bits(src2_precision);
2101 static_assert(N2 == ((src2_precision_bits * systolic_depth * ops_per_channel *
2102 ((repeat_count + 1) / 2)) /
2104 "invalid size for Src2");
2106 #if defined(__SYCL_DEVICE_ONLY__)
2107 int dpas_info = (repeat_count << 24) + (systolic_depth << 16) +
2108 (((int)src2_precision + 1) << 8) + ((
int)src1_precision + 1);
2109 __ESIMD_NS::simd<T, N> result =
2110 __esimd_dpasw2<T, T1, T2, N, N1, N2>(src1.data(), src2.data(), dpas_info);
2112 __ESIMD_NS::simd<T, N> result =
2113 __esimd_dpasw2<src1_precision, src2_precision, systolic_depth,
2114 repeat_count, T, T1, T2, N, N1, N2>(src1.data(),
2116 #endif // __SYCL_DEVICE_ONLY__
2118 if constexpr (std::is_same_v<Sat, __ESIMD_NS::saturation_off_tag>)
2121 return __ESIMD_NS::saturate<T>(result);