16 #include <type_traits>
19 namespace d = s::detail;
22 namespace __host_std {
/// Absolute difference of two integral values.
///
/// The subtraction is carried out on the unsigned counterpart of T, so the
/// distance between operands of opposite sign (for example the largest and
/// the smallest value of a signed type) is obtained with well-defined modular
/// arithmetic instead of overflowing a signed type. The result is converted
/// back to T; for signed T a distance larger than the maximum of T keeps its
/// bit pattern and is meant to be read as an unsigned value by the caller.
///
/// \param x first operand.
/// \param y second operand.
/// \return |x - y| reduced modulo 2^N, where N is the bit width of T.
template <typename T> inline T __abs_diff(T x, T y) {
  static_assert(std::is_integral<T>::value,
                "Only integral types are supported");
  using UT = typename std::make_unsigned<T>::type;
  const UT ux = static_cast<UT>(x);
  const UT uy = static_cast<UT>(y);
  // The comparison stays in T so the sign of the operands is honoured.
  return static_cast<T>((x > y) ? (ux - uy) : (uy - ux));
}
31 template <
typename T>
inline T __u_add_sat(
T x,
T y) {
32 return (x < (d::max_v<T>() - y) ? x + y : d::max_v<T>());
// Saturating addition helper for signed operands.
// NOTE(review): the two return statements below appear back to back in this
// listing; whatever selects between them is not visible here - confirm
// against the complete file before relying on these comments.
35 template <
typename T>
inline T __s_add_sat(
T x,
T y) {
// Yields x + y while x is below (d::max_v<T>() - y), else d::max_v<T>().
37 return (x < (d::max_v<T>() - y) ? (x + y) : d::max_v<T>());
// Yields x + y while x is above (d::min_v<T>() - y), else d::min_v<T>().
39 return (x > (d::min_v<T>() - y) ? (x + y) : d::min_v<T>());
// Halving-add helper: adds x and y after shifting each right by `one`, then
// adds the `one`-masked AND of both operands.
// NOTE(review): `one` is not declared in the visible lines; presumably the
// constant 1 of type T - confirm against the complete file.
43 template <
typename T>
inline T __hadd(
T x,
T y) {
45 return (x >> one) + (y >> one) + ((y & x) & one);
// Rounded halving-add helper: adds x and y after shifting each right by
// `one`, then adds the `one`-masked OR of both operands.
// NOTE(review): `one` is not declared in the visible lines; presumably the
// constant 1 of type T - confirm against the complete file.
48 template <
typename T>
inline T __rhadd(
T x,
T y) {
50 return (x >> one) + (y >> one) + ((y | x) & one);
/// Restricts x to the range bounded by minval and maxval.
///
/// The lower bound is applied first and the upper bound second, so when
/// minval is greater than maxval the result is maxval.
///
/// \param x value to restrict.
/// \param minval lower bound.
/// \param maxval upper bound.
template <typename T> inline T __clamp(T x, T minval, T maxval) {
  const T raised = (x < minval) ? minval : x;
  return (maxval < raised) ? maxval : raised;
}
/// Recursive worker for counting leading zero bits.
///
/// Starting from the probe mask m, counts how many single right shifts of the
/// mask are needed before it overlaps a set bit of x.
///
/// \param x value to inspect (expected to be of an unsigned type).
/// \param m probe mask for the current step.
/// \param n number of steps taken so far.
/// \return n plus the number of further steps taken; when the mask is shifted
///         out before any set bit of x is met, the number of steps taken up
///         to that point.
template <typename T> inline constexpr T __clz_impl(T x, T m, T n = 0) {
  // Stop once the mask is exhausted: x has no set bit at or below the starting
  // mask position and recursing further would never terminate.
  return (m == T(0) || (x & m)) ? n : __clz_impl(x, T(m >> 1), T(n + 1));
}
61 template <
typename T>
inline constexpr
T __clz(
T x) {
62 using UT =
typename std::make_unsigned<T>::type;
63 return (x ==
T(0)) ?
sizeof(
T) * 8 : __clz_impl<UT>(x, d::msbMask<UT>(x));
/// Recursive worker for counting trailing zero bits.
///
/// Starting from the probe mask m, counts how many single left shifts of the
/// mask are needed before it overlaps a set bit of x.
///
/// \param x value to inspect (expected to be of an unsigned type).
/// \param m probe mask for the current step.
/// \param n number of steps taken so far.
/// \return n plus the number of further steps taken; when the mask is shifted
///         out before any set bit of x is met, the number of steps taken up
///         to that point.
template <typename T> inline constexpr T __ctz_impl(T x, T m, T n = 0) {
  // Stop once the mask is exhausted: x has no set bit at or above the starting
  // mask position and recursing further would never terminate.
  return (m == T(0) || (x & m)) ? n : __ctz_impl(x, T(m << 1), T(n + 1));
}
70 template <
typename T>
inline constexpr
T __ctz(
T x) {
71 using UT =
typename std::make_unsigned<T>::type;
72 return (x ==
T(0)) ?
sizeof(
T) * 8 : __ctz_impl<UT>(x, 1);
// Returns `mul` shifted right by the bit width of T.
// NOTE(review): `mul` is not declared in the visible lines; presumably the
// product of a and b held in UPT - confirm against the complete file.
75 template <
typename T>
T __mul_hi(
T a,
T b) {
// UPT is whatever type d::make_larger maps T to.
76 using UPT =
typename d::make_larger<T>::type;
// Drop the low sizeof(T) * 8 bits and return what remains, converted to T.
80 return (mul >> (
sizeof(
T) * 8));
// Combines four values of type T into one result:
//   a1b1 + (__hadd(a1b0, a0b1 + (a0b0 >> halfsize)) >> (halfsize - 1)).
// NOTE(review): judging by the parameter names these are presumably the four
// half-width partial products of a multiplication (see __get_half_products)
// and the result its upper half - confirm against the callers.
84 template <
typename T>
inline T __get_high_half(
T a0b0,
T a0b1,
T a1b0,
T a1b1) {
// Half of the bit width of T.
85 constexpr
int halfsize = (
sizeof(
T) * 8) / 2;
// The shift is halfsize - 1 rather than halfsize.
// NOTE(review): presumably because __hadd already halves its sum - confirm.
90 return a1b1 + (__hadd(a1b0, (a0b1 + (a0b0 >> halfsize))) >> (halfsize - 1));
// Takes a and b by value and four T references (a0b0, a0b1, a1b0, a1b1).
// NOTE(review): the template header and the statements that write the four
// references are not visible in this listing; presumably the references
// receive the half-width partial products of a and b - confirm.
95 inline void __get_half_products(
T a,
T b,
T &a0b0,
T &a0b1,
T &a1b0,
T &a1b1) {
// Half of the bit width of T.
96 constexpr
s::cl_int halfsize = (
sizeof(
T) * 8) / 2;
// Shift left then right by halfsize.
// NOTE(review): for an unsigned T this presumably keeps the low half - confirm.
98 T a0 = (a << halfsize) >> halfsize;
// Same left/right shift pair applied to b.
100 T b0 = (b << halfsize) >> halfsize;
// Feeds a and b through __get_half_products and passes the four outputs to
// __get_high_half.
// NOTE(review): the statement that returns `result` is not visible in this
// listing - confirm against the complete file.
112 template <
typename T>
inline T __u_long_mul_hi(
T a,
T b) {
// Filled in by __get_half_products through its reference parameters.
113 T a0b0, a0b1, a1b0, a1b1;
114 __get_half_products(a, b, a0b0, a0b1, a1b0, a1b1);
115 T result = __get_high_half(a0b0, a0b1, a1b0, a1b1);
// Signed variant of the long multiply-high helper: runs __get_half_products
// and __get_high_half on absA/absB, then enters a branch when exactly one of
// a and b is negative.
// NOTE(review): the declarations of absA/absB, the body of the negative
// branch past `low`, and the return are not visible in this listing;
// presumably absA/absB are the magnitudes of a and b - confirm.
119 template <
typename T>
inline T __s_long_mul_hi(
T a,
T b) {
120 using UT =
typename std::make_unsigned<T>::type;
// Filled in by __get_half_products through its reference parameters.
124 UT a0b0, a0b1, a1b0, a1b1;
125 __get_half_products(absA, absB, a0b0, a0b1, a1b0, a1b1);
126 T result = __get_high_half(a0b0, a0b1, a1b0, a1b1);
// True when the signs of a and b differ.
128 bool isResultNegative = (
a < 0) != (b < 0);
129 if (isResultNegative) {
// Half of the bit width of T.
133 constexpr
int halfsize = (
sizeof(
T) * 8) / 2;
// a0b0 plus the sum of the two mixed terms shifted up by halfsize.
// NOTE(review): presumably the low half of the product - confirm.
134 UT low = a0b0 + ((a0b1 + a1b0) << halfsize);
/// Multiply-high followed by an add: __mul_hi(a, b) + c.
template <typename T> inline T __mad_hi(T a, T b, T c) {
  const T product_hi = __mul_hi(a, b);
  return product_hi + c;
}
/// Multiply-high followed by an add, built on __u_long_mul_hi:
/// __u_long_mul_hi(a, b) + c.
template <typename T> inline T __u_long_mad_hi(T a, T b, T c) {
  const T product_hi = __u_long_mul_hi(a, b);
  return product_hi + c;
}
/// Multiply-high followed by an add, built on __s_long_mul_hi:
/// __s_long_mul_hi(a, b) + c.
template <typename T> inline T __s_long_mad_hi(T a, T b, T c) {
  const T product_hi = __s_long_mul_hi(a, b);
  return product_hi + c;
}
// Computes a * b + c in the type d::make_larger maps T to, then limits the
// result to the range [d::min_v<T>(), d::max_v<T>()].
// NOTE(review): the statement that returns `res` is not visible in this
// listing - confirm against the complete file.
154 template <
typename T>
inline T __s_mad_sat(
T a,
T b,
T c) {
155 using UPT =
typename d::make_larger<T>::type;
// All arithmetic below is carried out in UPT.
156 UPT mul = UPT(a) * UPT(b);
157 UPT res = mul + UPT(c);
// Bounds of T, held in UPT so they can be compared against res.
158 const UPT
max = d::max_v<T>();
159 const UPT
min = d::min_v<T>();
// Raise to at least min, then lower to at most max.
160 res = std::min(std::max(res,
min),
max);
// Saturating multiply-add built on __s_long_mul_hi and __s_add_sat.
// NOTE(review): some lines between the mulhi computation and the checks are
// not visible in this listing - confirm against the complete file.
164 template <
typename T>
inline T __s_long_mad_sat(
T a,
T b,
T c) {
// True when the signs of a and b differ.
165 bool neg_prod = (
a < 0) ^ (b < 0);
166 T mulhi = __s_long_mul_hi(a, b);
// Same-sign operands with a non-zero high part: return d::max_v<T>().
171 if (!neg_prod && mulhi != 0)
172 return d::max_v<T>();
// Opposite-sign operands with a high part other than -1: return d::min_v<T>().
173 if (neg_prod && mulhi != -1)
174 return d::min_v<T>();
// Otherwise add c to the product with saturation.
// NOTE(review): a * b is evaluated in T (or int after promotion); presumably
// the checks above are meant to rule out overflow here - confirm.
175 return __s_add_sat(
T(a * b), c);
178 template <
typename T>
inline T __u_mad_sat(
T a,
T b,
T c) {
179 using UPT =
typename d::make_larger<T>::type;
180 UPT mul = UPT(a) * UPT(b);
181 const UPT
min = d::min_v<T>();
182 const UPT
max = d::max_v<T>();
183 mul = std::min(std::max(mul,
min),
max);
184 return __u_add_sat(
T(mul), c);
// Saturating multiply-add built on __u_long_mul_hi and __u_add_sat.
// NOTE(review): the two return statements below appear back to back in this
// listing; the condition guarding the first one is not visible - presumably
// a test on mulhi - confirm against the complete file.
187 template <
typename T>
inline T __u_long_mad_sat(
T a,
T b,
T c) {
188 T mulhi = __u_long_mul_hi(a, b);
// Saturated result.
191 return d::max_v<T>();
// Add c to the product (evaluated in T) with saturation.
192 return __u_add_sat(
T(a * b), c);
/// Rotates the bits of x to the left by n positions.
///
/// Both operands are converted to the unsigned counterpart of T and the
/// rotation count is reduced modulo the bit width of T, so any n is accepted.
///
/// \param x value to rotate.
/// \param n rotation count; only its value modulo the bit width is used.
/// \return x rotated left, converted back to T.
template <typename T> inline T __rotate(T x, T n) {
  using UT = typename std::make_unsigned<T>::type;
  constexpr UT size = sizeof(x) * 8;
  const UT xu = static_cast<UT>(x);
  const UT nu = static_cast<UT>(n) & (size - 1);
  // A right shift by the full width is undefined, which is what `size - nu`
  // asks for when nu == 0; masking turns that case into a shift by 0.
  return (xu << nu) | (xu >> ((size - nu) & (size - 1)));
}
205 template <
typename T>
inline T __u_sub_sat(
T x,
T y) {
206 return (y < (x - d::min_v<T>())) ? (x - y) : d::min_v<T>();
// Saturating subtraction helper for signed operands.
// NOTE(review): the statement that returns `result` is not visible in this
// listing - confirm against the complete file.
209 template <
typename T>
inline T __s_sub_sat(
T x,
T y) {
210 using UT =
typename std::make_unsigned<T>::type;
// Subtract in the unsigned type, then store the difference back in T.
211 T result = UT(x) - UT(y);
// Operands of different sign whose difference has a different sign than x:
// replace the result by d::max_v<T>() when it came out negative, else by
// d::min_v<T>().
213 if (((x < 0) ^ (y < 0)) && ((x < 0) ^ (result < 0)))
214 result = result < 0 ? d::max_v<T>() : d::
min_v<
T>();
218 template <
typename T1,
typename T2>
219 typename d::make_larger<T1>::type
inline __upsample(T1 hi, T2 lo) {
220 using UT =
typename d::make_larger<T1>::type;
221 return (UT(hi) << (
sizeof(T1) * 8)) | lo;
/// Counts the set bits of x, starting the tally at n.
///
/// The count is taken on the unsigned bit pattern of x, so a negative signed
/// input terminates as well (an arithmetic right shift would keep the sign
/// bit set forever).
///
/// \param x value whose set bits are counted.
/// \param n initial tally; the number of set bits is added to it.
/// \return n plus the number of set bits in x, converted to T.
template <typename T> inline constexpr T __popcount_impl(T x, size_t n = 0) {
  using UT = typename std::make_unsigned<T>::type;
  for (UT bits = static_cast<UT>(x); bits != UT(0); bits = UT(bits >> 1)) {
    if (bits & UT(1))
      ++n;
  }
  return n;
}
228 template <
typename T>
inline constexpr
T __popcount(
T x) {
229 using UT =
typename d::make_unsigned<T>::type;
230 return __popcount_impl(UT(x));
/// Multiply-add: returns x * y + z.
///
/// The arithmetic is done in an unsigned type at least as wide as `unsigned`,
/// so a result that does not fit in T wraps modulo 2^N instead of being
/// undefined signed overflow. As the name suggests the factors are meant to
/// be 24-bit quantities, whose product can need up to 48 bits.
///
/// \param x first factor.
/// \param y second factor.
/// \param z addend.
/// \return the low bits of x * y + z, converted back to T.
template <typename T> inline T __mad24(T x, T y, T z) {
  using Wide = typename std::common_type<typename std::make_unsigned<T>::type,
                                         unsigned>::type;
  return static_cast<T>(static_cast<Wide>(x) * static_cast<Wide>(y) +
                        static_cast<Wide>(z));
}
/// Multiply: returns x * y.
///
/// The multiplication is done in an unsigned type at least as wide as
/// `unsigned`, so a product that does not fit in T wraps modulo 2^N instead
/// of being undefined signed overflow. As the name suggests the factors are
/// meant to be 24-bit quantities, whose product can need up to 48 bits.
///
/// \param x first factor.
/// \param y second factor.
/// \return the low bits of x * y, converted back to T.
template <typename T> inline T __mul24(T x, T y) {
  using Wide = typename std::common_type<typename std::make_unsigned<T>::type,
                                         unsigned>::type;
  return static_cast<T>(static_cast<Wide>(x) * static_cast<Wide>(y));
}
// Forwarding return statements: each one passes its arguments straight to the
// named helper (or to std::max / std::min).
// NOTE(review): the signatures of the enclosing functions are not visible in
// this listing, so argument and return types cannot be stated here.

// Forward to __abs_diff.
262 return __abs_diff(x, y);
265 return __abs_diff(x, y);
268 return __abs_diff(x, y);
271 return __abs_diff(x, y);
281 return __abs_diff(x, y);
284 return __abs_diff(x, y);
287 return __abs_diff(x, y);
290 return __abs_diff(x, y);
// Forward to __u_add_sat.
299 return __u_add_sat(x, y);
302 return __u_add_sat(x, y);
305 return __u_add_sat(x, y);
308 return __u_add_sat(x, y);
// Forward to __s_add_sat.
317 return __s_add_sat(x, y);
320 return __s_add_sat(x, y);
323 return __s_add_sat(x, y);
326 return __s_add_sat(x, y);
// Forward to __rhadd.
371 return __rhadd(x, y);
374 return __rhadd(x, y);
377 return __rhadd(x, y);
380 return __rhadd(x, y);
389 return __rhadd(x, y);
392 return __rhadd(x, y);
395 return __rhadd(x, y);
398 return __rhadd(x, y);
// Forward to __clamp.
408 return __clamp(x, minval, maxval);
412 return __clamp(x, minval, maxval);
416 return __clamp(x, minval, maxval);
420 return __clamp(x, minval, maxval);
434 return __clamp(x, minval, maxval);
438 return __clamp(x, minval, maxval);
442 return __clamp(x, minval, maxval);
446 return __clamp(x, minval, maxval);
// Forward to __mul_hi, or to the __s_long_/__u_long_ variants.
495 return __mul_hi(a, b);
498 return __mul_hi(a, b);
501 return __mul_hi(a, b);
504 return __s_long_mul_hi(x, y);
513 return __mul_hi(a, b);
516 return __mul_hi(a, b);
519 return __mul_hi(a, b);
522 return __u_long_mul_hi(x, y);
// Forward to __mad_hi, or to the __s_long_/__u_long_ variants.
// NOTE(review): the argument names minval/maxval match the __clamp wrappers
// rather than a multiply-add; presumably a naming carry-over - confirm.
532 return __mad_hi(x, minval, maxval);
536 return __mad_hi(x, minval, maxval);
540 return __mad_hi(x, minval, maxval);
544 return __s_long_mad_hi(x, minval, maxval);
554 return __mad_hi(x, minval, maxval);
558 return __mad_hi(x, minval, maxval);
562 return __mad_hi(x, minval, maxval);
566 return __u_long_mad_hi(x, minval, maxval);
// Forward to __s_mad_sat / __s_long_mad_sat.
576 return __s_mad_sat(a, b, c);
580 return __s_mad_sat(a, b, c);
584 return __s_mad_sat(a, b, c);
588 return __s_long_mad_sat(a, b, c);
// Forward to __u_mad_sat / __u_long_mad_sat.
598 return __u_mad_sat(a, b, c);
602 return __u_mad_sat(a, b, c);
606 return __u_mad_sat(a, b, c);
610 return __u_long_mad_sat(a, b, c);
// Forward to std::max.
619 return std::max(x, y);
622 return std::max(x, y);
625 return std::max(x, y);
628 return std::max(x, y);
641 return std::max(x, y);
644 return std::max(x, y);
647 return std::max(x, y);
650 return std::max(x, y);
// Forward to std::min.
663 return std::min(x, y);
666 return std::min(x, y);
669 return std::min(x, y);
672 return std::min(x, y);
685 return std::min(x, y);
688 return std::min(x, y);
691 return std::min(x, y);
694 return std::min(x, y);
// Forward to __rotate.
707 return __rotate(x, y);
710 return __rotate(x, y);
713 return __rotate(x, y);
716 return __rotate(x, y);
719 return __rotate(x, y);
722 return __rotate(x, y);
725 return __rotate(x, y);
728 return __rotate(x, y);
// Forward to __u_sub_sat.
741 return __u_sub_sat(x, y);
744 return __u_sub_sat(x, y);
747 return __u_sub_sat(x, y);
750 return __u_sub_sat(x, y);
// Forward to __s_sub_sat.
759 return __s_sub_sat(x, y);
762 return __s_sub_sat(x, y);
765 return __s_sub_sat(x, y);
768 return __s_sub_sat(x, y);
// Forward to __upsample.
777 return __upsample(x, y);
780 return __upsample(x, y);
783 return __upsample(x, y);
790 return __upsample(x, y);
793 return __upsample(x, y);
796 return __upsample(x, y);
// Forward to __popcount.
804 return __popcount(x);
807 return __popcount(x);
810 return __popcount(x);
813 return __popcount(x);
821 return __popcount(x);
824 return __popcount(x);
828 return __popcount(x);
// Forward to __mad24.
838 return __mad24(x, y, z);
844 return __mad24(x, y, z);
// Forward to __mul24.
850 return __mul24(x, y);
856 return __mul24(x, y);