16 #include <type_traits>
19 namespace d = s::detail;
// Returns the absolute difference of two integers, i.e. x - y when x > y and
// y - x otherwise.
//
// The subtraction is performed on the unsigned counterpart of T. For signed T
// the distance between operands of opposite sign can exceed the signed range
// (e.g. min vs. max); subtracting in the signed type would overflow, which is
// undefined behaviour. Unsigned arithmetic wraps modulo 2^N instead, so the
// returned bit pattern is always the exact difference when read as unsigned.
//
// T must be an integral type that std::make_unsigned accepts (so not bool).
template <typename T> inline T __abs_diff(T x, T y) {
  static_assert(std::is_integral<T>::value,
                "Only integral types are supported");
  using UT = typename std::make_unsigned<T>::type;
  const UT ux = static_cast<UT>(x);
  const UT uy = static_cast<UT>(y);
  // The ordering test stays in T so that negative values compare correctly.
  const UT diff =
      (x > y) ? static_cast<UT>(ux - uy) : static_cast<UT>(uy - ux);
  return static_cast<T>(diff);
}
30 template <
typename T>
inline T __u_add_sat(T x, T y) {
31 return (x < (d::max_v<T>() - y) ? x + y : d::max_v<T>());
34 template <
typename T>
inline T __s_add_sat(T x, T y) {
36 return (x < (d::max_v<T>() - y) ? (x + y) : d::max_v<T>());
38 return (x > (d::min_v<T>() - y) ? (x + y) : d::min_v<T>());
// Halving add: returns (x + y) >> 1 without forming the full-width sum, so the
// intermediate cannot overflow T.
//
// Each operand is halved separately; the bit lost by both shifts is restored
// only when both operands had their lowest bit set (the result rounds down).
template <typename T> inline T __hadd(T x, T y) {
  // Declared locally so the shift count and the mask have type T.
  const T one = 1;
  return (x >> one) + (y >> one) + ((y & x) & one);
}
// Rounding halving add: returns (x + y + 1) >> 1 without forming the
// full-width sum, so the intermediate cannot overflow T.
//
// Each operand is halved separately; one is added back when either operand had
// its lowest bit set, which rounds the average up.
template <typename T> inline T __rhadd(T x, T y) {
  // Declared locally so the shift count and the mask have type T.
  const T one = 1;
  return (x >> one) + (y >> one) + ((y | x) & one);
}
// Restricts x to the interval described by minval and maxval: the value is
// first raised to at least minval, then lowered to at most maxval.
template <typename T> inline T __clamp(T x, T minval, T maxval) {
  // Lower bound first; the upper bound is applied last and therefore wins
  // if the two bounds are inconsistent.
  const T raised = (x < minval) ? minval : x;
  if (maxval < raised)
    return maxval;
  return raised;
}
// Recursive helper for __clz. Starting from mask m, moves the mask one bit to
// the right per step until it hits a set bit of x, and returns how many steps
// were taken (plus the initial n).
//
// x must contain a set bit at or below m, otherwise the recursion never ends.
template <typename T>
inline constexpr T __clz_impl(T x, T m, T n = 0) {
  return ((x & m) != 0)
             ? n
             : __clz_impl<T>(x, static_cast<T>(m >> 1), static_cast<T>(n + 1));
}
60 template <
typename T>
inline constexpr T __clz(T x) {
61 using UT =
typename std::make_unsigned<T>::type;
62 return (x == T(0)) ?
sizeof(T) * 8 : __clz_impl<UT>(x, d::msbMask<UT>(x));
// Recursive helper for __ctz. Starting from mask m, moves the mask one bit to
// the left per step until it hits a set bit of x, and returns how many steps
// were taken (plus the initial n).
//
// x must contain a set bit at or above m, otherwise the recursion never ends.
template <typename T>
inline constexpr T __ctz_impl(T x, T m, T n = 0) {
  return ((x & m) != 0)
             ? n
             : __ctz_impl<T>(x, static_cast<T>(m << 1), static_cast<T>(n + 1));
}
69 template <
typename T>
inline constexpr T __ctz(T x) {
70 using UT =
typename std::make_unsigned<T>::type;
71 return (x == T(0)) ?
sizeof(T) * 8 : __ctz_impl<UT>(x, 1);
74 template <
typename T> T __mul_hi(T a, T b) {
75 using UPT =
typename d::make_larger<T>::type;
79 return (mul >> (
sizeof(T) * 8));
// Combines four partial products into a single value of type T:
//   a1b1 + (hadd(a1b0, a0b1 + (a0b0 >> half)) >> (half - 1))
// where half is half the bit width of T.
//
// NOTE(review): judging by the parameter names and by how __u_long_mul_hi uses
// the result, this is presumably the upper half of a double-width product built
// from half-width operand pieces - confirm against __get_half_products.
template <typename T>
inline T __get_high_half(T a0b0, T a0b1, T a1b0, T a1b1) {
  constexpr int half_bits = (sizeof(T) * 8) / 2;
  // Upper half of a0b0 is added into the a0b1 term.
  const T middle = a0b1 + (a0b0 >> half_bits);
  // __hadd already divides the sum of its arguments by two, which is why the
  // following shift is one bit shorter than half_bits.
  const T middle_avg = __hadd(a1b0, middle);
  return a1b1 + (middle_avg >> (half_bits - 1));
}
// Helper that receives two operands (a, b) by value and four output
// references (a0b0, a0b1, a1b0, a1b1).
// NOTE(review): only part of this definition is visible in this listing. The
// template header, any statements deriving further pieces of a and b, and the
// assignments to the four outputs are not shown - consult the full file before
// relying on this description.
94 inline void __get_half_products(T a, T b, T &a0b0, T &a0b1, T &a1b0, T &a1b1) {
// Half the number of bits in T (taking 8 bits per byte).
95 constexpr
s::cl_int halfsize = (
sizeof(T) * 8) / 2;
// Shifting left and then right by halfsize discards the upper half of a.
// NOTE(review): this keeps only the low half when the right shift is a logical
// one, i.e. presumably T is unsigned here - confirm against callers.
97 T a0 = (a << halfsize) >> halfsize;
// Same treatment for b.
99 T b0 = (b << halfsize) >> halfsize;
// Upper half of the product a * b for an unsigned type that has no wider type
// to multiply in: the operands are split into partial products by
// __get_half_products, and __get_high_half recombines them.
template <typename T> inline T __u_long_mul_hi(T a, T b) {
  T a0b0, a0b1, a1b0, a1b1;
  __get_half_products(a, b, a0b0, a0b1, a1b0, a1b1);
  // Hand the combined value back to the caller.
  return __get_high_half(a0b0, a0b1, a1b0, a1b1);
}
// Signed counterpart of __u_long_mul_hi: works on unsigned partial products
// and then accounts for the sign of the result.
// NOTE(review): several statements of this definition are not visible in this
// listing (the definitions of absA/absB, the body of the negative-result
// branch apart from the lines below, and the final return). The comments here
// describe only what is shown.
118 template <
typename T>
inline T __s_long_mul_hi(T a, T b) {
// Unsigned type of the same width as T, used for the partial products.
119 using UT =
typename std::make_unsigned<T>::type;
123 UT a0b0, a0b1, a1b0, a1b1;
// absA / absB are defined on lines not shown here; presumably the magnitudes
// of a and b - TODO confirm.
124 __get_half_products(absA, absB, a0b0, a0b1, a1b0, a1b1);
// Combine the partial products; the unsigned value is converted to T.
125 T result = __get_high_half(a0b0, a0b1, a1b0, a1b1);
// The product is treated as negative when exactly one operand is negative.
127 bool isResultNegative = (
a < 0) != (b < 0);
128 if (isResultNegative) {
// Half the number of bits in T (taking 8 bits per byte).
132 constexpr
int halfsize = (
sizeof(T) * 8) / 2;
// Low half of the unsigned product, rebuilt from the partial products with
// wrap-around arithmetic in UT. How it adjusts `result` is not visible here.
133 UT low = a0b0 + ((a0b1 + a1b0) << halfsize);
// Multiply-add on the high half: the value produced by __mul_hi(a, b) plus c.
template <typename T> inline T __mad_hi(T a, T b, T c) {
  const T high = __mul_hi(a, b);
  return high + c;
}
// Multiply-add on the high half for the unsigned "long" path: the value
// produced by __u_long_mul_hi(a, b) plus c.
template <typename T> inline T __u_long_mad_hi(T a, T b, T c) {
  const T high = __u_long_mul_hi(a, b);
  return high + c;
}
// Multiply-add on the high half for the signed "long" path: the value produced
// by __s_long_mul_hi(a, b) plus c.
template <typename T> inline T __s_long_mad_hi(T a, T b, T c) {
  const T high = __s_long_mul_hi(a, b);
  return high + c;
}
153 template <
typename T>
inline T __s_mad_sat(T a, T b, T c) {
154 using UPT =
typename d::make_larger<T>::type;
155 UPT mul = UPT(a) * UPT(b);
156 UPT res = mul + UPT(c);
157 const UPT
max = d::max_v<T>();
158 const UPT
min = d::min_v<T>();
159 res = std::min(std::max(res, min), max);
163 template <
typename T>
inline T __s_long_mad_sat(T a, T b, T c) {
164 bool neg_prod = (
a < 0) ^ (b < 0);
165 T mulhi = __s_long_mul_hi(a, b);
170 if (!neg_prod && mulhi != 0)
171 return d::max_v<T>();
172 if (neg_prod && mulhi != -1)
173 return d::min_v<T>();
174 return __s_add_sat(T(a * b), c);
177 template <
typename T>
inline T __u_mad_sat(T a, T b, T c) {
178 using UPT =
typename d::make_larger<T>::type;
179 UPT mul = UPT(a) * UPT(b);
180 const UPT
min = d::min_v<T>();
181 const UPT
max = d::max_v<T>();
182 mul = std::min(std::max(mul, min), max);
183 return __u_add_sat(T(mul), c);
186 template <
typename T>
inline T __u_long_mad_sat(T a, T b, T c) {
187 T mulhi = __u_long_mul_hi(a, b);
190 return d::max_v<T>();
191 return __u_add_sat(T(a * b), c);
// Rotates the bits of x to the left by n positions.
//
// All work is done on the unsigned counterpart of T so the shifts are logical.
// The count is reduced modulo the bit width of T, so counts of zero, of a full
// width or more, and negative counts are all accepted.
template <typename T> inline T __rotate(T x, T n) {
  using UT = typename std::make_unsigned<T>::type;
  constexpr UT size = sizeof(x) * 8;
  const UT xu = UT(x);
  // Bring the left-shift count into the range [0, size).
  const UT nu = UT(n) & (size - 1);
  // The complementary right-shift count is reduced the same way: for nu == 0
  // it would otherwise equal the full width, and shifting by the full width
  // of a type is undefined behaviour.
  const UT back = (size - nu) & (size - 1);
  return (xu << nu) | (xu >> back);
}
204 template <
typename T>
inline T __u_sub_sat(T x, T y) {
205 return (y < (x - d::min_v<T>())) ? (
x -
y) : d::min_v<T>();
208 template <
typename T>
inline T __s_sub_sat(T x, T y) {
209 using UT =
typename std::make_unsigned<T>::type;
210 T result = UT(x) - UT(y);
212 if (((x < 0) ^ (y < 0)) && ((x < 0) ^ (result < 0)))
213 result = result < 0 ? d::max_v<T>() : d::
min_v<T>();
217 template <
typename T1,
typename T2>
218 typename d::make_larger<T1>::type
inline __upsample(T1 hi, T2 lo) {
219 using UT =
typename d::make_larger<T1>::type;
220 return (UT(hi) << (
sizeof(T1) * 8)) | lo;
// Recursive helper for __popcount: counts the set bits of x, one bit per
// step, on top of the running total n.
//
// x is shifted right until it becomes zero, so callers pass a value for which
// that happens (e.g. an unsigned one).
template <typename T>
inline constexpr T __popcount_impl(T x, size_t n = 0) {
  return (x != T(0))
             ? __popcount_impl(x >> 1, n + ((x & T(1)) ? 1 : 0))
             : n;
}
227 template <
typename T>
inline constexpr T __popcount(T x) {
228 using UT =
typename d::make_unsigned<T>::type;
229 return __popcount_impl(UT(x));
// Multiply-add: returns x * y + z using ordinary arithmetic on T.
template <typename T> inline T __mad24(T x, T y, T z) {
  const auto product = x * y;
  return product + z;
}
// Multiply: returns x * y using ordinary arithmetic on T.
template <typename T> inline T __mul24(T x, T y) {
  const auto product = x * y;
  return product;
}
270 return __abs_diff(x, y);
274 return __abs_diff(x, y);
278 return __abs_diff(x, y);
282 return __abs_diff(x, y);
293 return __abs_diff(x, y);
297 return __abs_diff(x, y);
301 return __abs_diff(x, y);
305 return __abs_diff(x, y);
315 return __u_add_sat(x, y);
319 return __u_add_sat(x, y);
323 return __u_add_sat(x, y);
327 return __u_add_sat(x, y);
337 return __s_add_sat(x, y);
341 return __s_add_sat(x, y);
344 return __s_add_sat(x, y);
348 return __s_add_sat(x, y);
398 return __rhadd(x, y);
402 return __rhadd(x, y);
405 return __rhadd(x, y);
409 return __rhadd(x, y);
418 return __rhadd(x, y);
422 return __rhadd(x, y);
425 return __rhadd(x, y);
428 return __rhadd(x, y);
438 return __clamp(x, minval, maxval);
443 return __clamp(x, minval, maxval);
447 return __clamp(x, minval, maxval);
451 return __clamp(x, minval, maxval);
471 return __clamp(x, minval, maxval);
475 return __clamp(x, minval, maxval);
479 return __clamp(x, minval, maxval);
483 return __clamp(x, minval, maxval);
562 return __mul_hi(a, b);
565 return __mul_hi(a, b);
568 return __mul_hi(a, b);
572 return __s_long_mul_hi(x, y);
581 return __mul_hi(a, b);
584 return __mul_hi(a, b);
587 return __mul_hi(a, b);
591 return __u_long_mul_hi(x, y);
601 return __mad_hi(x, minval, maxval);
605 return __mad_hi(x, minval, maxval);
609 return __mad_hi(x, minval, maxval);
613 return __s_long_mad_hi(x, minval, maxval);
626 return __mad_hi(x, minval, maxval);
631 return __mad_hi(x, minval, maxval);
635 return __mad_hi(x, minval, maxval);
639 return __u_long_mad_hi(x, minval, maxval);
653 return __s_mad_sat(a, b, c);
657 return __s_mad_sat(a, b, c);
661 return __s_mad_sat(a, b, c);
665 return __s_long_mad_sat(a, b, c);
678 return __u_mad_sat(a, b, c);
682 return __u_mad_sat(a, b, c);
686 return __u_mad_sat(a, b, c);
690 return __u_long_mad_sat(a, b, c);
703 return std::max(x, y);
707 return std::max(x, y);
710 return std::max(x, y);
713 return std::max(x, y);
727 return std::max(x, y);
731 return std::max(x, y);
734 return std::max(x, y);
738 return std::max(x, y);
751 return std::min(x, y);
755 return std::min(x, y);
758 return std::min(x, y);
761 return std::min(x, y);
775 return std::min(x, y);
779 return std::min(x, y);
782 return std::min(x, y);
786 return std::min(x, y);
800 return __rotate(x, y);
804 return __rotate(x, y);
807 return __rotate(x, y);
811 return __rotate(x, y);
814 return __rotate(x, y);
818 return __rotate(x, y);
821 return __rotate(x, y);
824 return __rotate(x, y);
838 return __u_sub_sat(x, y);
842 return __u_sub_sat(x, y);
846 return __u_sub_sat(x, y);
850 return __u_sub_sat(x, y);
860 return __s_sub_sat(x, y);
864 return __s_sub_sat(x, y);
867 return __s_sub_sat(x, y);
871 return __s_sub_sat(x, y);
881 return __upsample(x, y);
885 return __upsample(x, y);
889 return __upsample(x, y);
897 return __upsample(x, y);
901 return __upsample(x, y);
905 return __upsample(x, y);
913 return __popcount(x);
916 return __popcount(x);
919 return __popcount(x);
922 return __popcount(x);
930 return __popcount(x);
933 return __popcount(x);
936 return __popcount(x);
939 return __popcount(x);
949 return __mad24(x, y, z);
956 return __mad24(x, y, z);
962 return __mul24(x, y);
968 return __mul24(x, y);