13 #ifndef NO_WARN_X86_INTRINSICS
32 "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
38 #if defined(__powerpc64__) && \
39 (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
/* Packs four selector values into a single integer: (w) is shifted left by 6
   bits, (x) by 4, (y) by 2, and (z) is left unshifted; the pieces are OR-ed
   together.  The arguments are not masked here, so each is presumably
   expected to fit in two bits (0..3) -- confirm against callers. */
42 #define _MM_SHUFFLE(w, x, y, z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
49 #if defined(__STRICT_ANSI__) && \
50 (defined(__cplusplus) || \
51 (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L))
62 #include <mm_malloc.h>
/* __m128_u: a vector-of-float type declared with the __may_alias__ attribute
   and an alignment of 1 byte. */
70 typedef vector
float __m128_u
__attribute__((__may_alias__, __aligned__(1)));
/* __v4sf: plain vector-of-float type used as the working type in the casts
   throughout this file. */
73 typedef vector
float __v4sf;
76 extern __inline __m128
77 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
/* Yields a __m128 whose four float elements are all 0.0f. */
84 extern __inline __m128
85 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
87 return __extension__(__m128){0.0f, 0.0f, 0.0f, 0.0f};
91 extern __inline __m128
92 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
94 return ((__m128)
vec_ld(0, (__v4sf *)
__P));
98 extern __inline __m128
99 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
101 return (vec_vsx_ld(0,
__P));
/* Element reversal via a byte permute.  The selector lists byte indices
   0x1C-0x1F, 0x18-0x1B, 0x14-0x17, 0x10-0x13, i.e. the four 4-byte groups in
   descending order.  Both vec_perm data operands are the same __tmp, so
   __result is __tmp with its four 32-bit elements in reverse order.  How
   __tmp is obtained is not part of this excerpt. */
105 extern __inline __m128
106 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
110 static const __vector
unsigned char __permute_vector = {
111 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B,
112 0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13};
115 __result = (__m128)
vec_perm(__tmp, __tmp, __permute_vector);
120 extern __inline __m128
121 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
123 return __extension__(__m128)(__v4sf){__F, __F, __F, __F};
126 extern __inline __m128
127 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
/* Builds a __m128 from four scalar floats.  The parameters are declared in
   the order (__Z, __Y, __X, __W) but stored reversed: __W becomes element 0,
   __X element 1, __Y element 2 and __Z element 3. */
133 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__,
135 _mm_set_ps(
const float __Z,
const float __Y,
const float __X,
const float __W) {
136 return __extension__(__m128)(__v4sf){__W, __X,
__Y, __Z};
/* Builds a __m128 whose elements 0..3 are __Z, __Y, __X, __W in that order
   (no reversal).  The signature line is not part of this excerpt. */
140 extern __inline __m128
141 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
143 return __extension__(__m128)(__v4sf){__Z,
__Y, __X, __W};
148 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
155 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
157 *(__m128_u *)
__P = __A;
/* Produces __tmp as __A with its four 32-bit elements in reverse order: the
   permute selector names the 4-byte groups 0x1C-0x1F, 0x18-0x1B, 0x14-0x17,
   0x10-0x13 (descending), and both vec_perm data operands are __A.  What is
   done with __tmp afterwards is not visible in this excerpt. */
162 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
165 static const __vector
unsigned char __permute_vector = {
166 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B,
167 0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13};
169 __tmp = (__m128)
vec_perm(__A, __A, __permute_vector);
176 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
183 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
189 extern __inline __m128
190 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
192 return __extension__(__m128)(__v4sf){__F, 0.0f, 0.0f, 0.0f};
/* Merges two vectors under a constant mask whose first 32-bit element is all
   ones and whose other three are zero.  With vec_sel(__A, __B, __mask) the
   masked bits are taken from __B, so the result has element 0 from __B and
   elements 1..3 from __A. */
196 extern __inline __m128
197 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
199 static const __vector
unsigned int __mask = {0xffffffff, 0, 0, 0};
201 return (
vec_sel((__v4sf)__A, (__v4sf)__B, __mask));
205 extern __inline __m128
206 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
213 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
215 *
__P = ((__v4sf)__A)[0];
222 extern __inline __m128
223 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
227 static const __vector
unsigned int __mask = {0xffffffff, 0, 0, 0};
239 __A[0] = __A[0] + __B[0];
244 extern __inline __m128
245 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
249 static const __vector
unsigned int __mask = {0xffffffff, 0, 0, 0};
261 __A[0] = __A[0] - __B[0];
266 extern __inline __m128
267 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
271 static const __vector
unsigned int __mask = {0xffffffff, 0, 0, 0};
283 __A[0] = __A[0] * __B[0];
288 extern __inline __m128
289 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
293 static const __vector
unsigned int __mask = {0xffffffff, 0, 0, 0};
305 __A[0] = __A[0] / __B[0];
310 extern __inline __m128
311 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
314 static const __vector
unsigned int __mask = {0xffffffff, 0, 0, 0};
/* Element-wise sum of __A and __B over all four float elements, written with
   the vector '+' operator on __v4sf. */
327 extern __inline __m128
328 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
330 return (__m128)((__v4sf)__A + (__v4sf)__B);
/* Element-wise difference __A - __B over all four float elements. */
333 extern __inline __m128
334 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
336 return (__m128)((__v4sf)__A - (__v4sf)__B);
/* Element-wise product of __A and __B over all four float elements. */
339 extern __inline __m128
340 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
342 return (__m128)((__v4sf)__A * (__v4sf)__B);
/* Element-wise quotient __A / __B over all four float elements. */
345 extern __inline __m128
346 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
348 return (__m128)((__v4sf)__A / (__v4sf)__B);
351 extern __inline __m128
352 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
354 return (vec_sqrt((__v4sf)__A));
357 extern __inline __m128
358 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
360 return (
vec_re((__v4sf)__A));
363 extern __inline __m128
364 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
369 extern __inline __m128
370 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
373 static const __vector
unsigned int __mask = {0xffffffff, 0, 0, 0};
385 extern __inline __m128
386 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
389 static const __vector
unsigned int __mask = {0xffffffff, 0, 0, 0};
401 extern __inline __m128
402 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
405 static const __vector
unsigned int __mask = {0xffffffff, 0, 0, 0};
418 extern __inline __m128
419 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
422 static const __vector
unsigned int __mask = {0xffffffff, 0, 0, 0};
435 extern __inline __m128
436 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
438 __vector __bool
int __m =
vec_cmpgt((__v4sf)__B, (__v4sf)__A);
442 extern __inline __m128
443 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
445 __vector __bool
int __m =
vec_cmpgt((__v4sf)__A, (__v4sf)__B);
450 extern __inline __m128
451 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
453 return ((__m128)
vec_and((__v4sf)__A, (__v4sf)__B));
/* Bitwise and-with-complement.  Note the operand order: __B is passed first
   and __A second to vec_andc, so it is __A that gets complemented -- the
   result is (~__A) & __B. */
457 extern __inline __m128
458 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
460 return ((__m128)
vec_andc((__v4sf)__B, (__v4sf)__A));
463 extern __inline __m128
464 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
466 return ((__m128)
vec_or((__v4sf)__A, (__v4sf)__B));
469 extern __inline __m128
470 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
472 return ((__m128)
vec_xor((__v4sf)__A, (__v4sf)__B));
478 extern __inline __m128
479 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
481 return ((__m128)
vec_cmpeq((__v4sf)__A, (__v4sf)__B));
484 extern __inline __m128
485 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
487 return ((__m128)
vec_cmplt((__v4sf)__A, (__v4sf)__B));
490 extern __inline __m128
491 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
493 return ((__m128)
vec_cmple((__v4sf)__A, (__v4sf)__B));
496 extern __inline __m128
497 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
499 return ((__m128)
vec_cmpgt((__v4sf)__A, (__v4sf)__B));
502 extern __inline __m128
503 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
505 return ((__m128)
vec_cmpge((__v4sf)__A, (__v4sf)__B));
/* Per-element "not equal" mask built in two steps: vec_cmpeq produces the
   equality mask, then vec_nor of that mask with itself yields its bitwise
   complement. */
508 extern __inline __m128
509 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
511 __v4sf __temp = (__v4sf)
vec_cmpeq((__v4sf)__A, (__v4sf)__B);
512 return ((__m128)
vec_nor(__temp, __temp));
515 extern __inline __m128
516 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
518 return ((__m128)
vec_cmpge((__v4sf)__A, (__v4sf)__B));
521 extern __inline __m128
522 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
524 return ((__m128)
vec_cmpgt((__v4sf)__A, (__v4sf)__B));
527 extern __inline __m128
528 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
530 return ((__m128)
vec_cmple((__v4sf)__A, (__v4sf)__B));
533 extern __inline __m128
534 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
536 return ((__m128)
vec_cmplt((__v4sf)__A, (__v4sf)__B));
539 extern __inline __m128
540 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
542 __vector
unsigned int __a,
__b;
543 __vector
unsigned int __c, __d;
544 static const __vector
unsigned int __float_exp_mask = {
545 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
554 extern __inline __m128
555 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
557 __vector
unsigned int __a,
__b;
558 __vector
unsigned int __c, __d;
559 static const __vector
unsigned int __float_exp_mask = {
560 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
572 extern __inline __m128
573 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
575 static const __vector
unsigned int __mask = {0xffffffff, 0, 0, 0};
586 return ((__m128)
vec_sel((__v4sf)__A,
__c, __mask));
589 extern __inline __m128
590 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
592 static const __vector
unsigned int __mask = {0xffffffff, 0, 0, 0};
603 return ((__m128)
vec_sel((__v4sf)__A,
__c, __mask));
606 extern __inline __m128
607 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
609 static const __vector
unsigned int __mask = {0xffffffff, 0, 0, 0};
620 return ((__m128)
vec_sel((__v4sf)__A,
__c, __mask));
623 extern __inline __m128
624 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
626 static const __vector
unsigned int __mask = {0xffffffff, 0, 0, 0};
637 return ((__m128)
vec_sel((__v4sf)__A,
__c, __mask));
640 extern __inline __m128
641 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
643 static const __vector
unsigned int __mask = {0xffffffff, 0, 0, 0};
654 return ((__m128)
vec_sel((__v4sf)__A,
__c, __mask));
657 extern __inline __m128
658 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
660 static const __vector
unsigned int __mask = {0xffffffff, 0, 0, 0};
672 return ((__m128)
vec_sel((__v4sf)__A,
__c, __mask));
675 extern __inline __m128
676 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
678 static const __vector
unsigned int __mask = {0xffffffff, 0, 0, 0};
689 return ((__m128)
vec_sel((__v4sf)__A,
__c, __mask));
692 extern __inline __m128
693 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
695 static const __vector
unsigned int __mask = {0xffffffff, 0, 0, 0};
706 return ((__m128)
vec_sel((__v4sf)__A,
__c, __mask));
709 extern __inline __m128
710 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
712 static const __vector
unsigned int __mask = {0xffffffff, 0, 0, 0};
723 return ((__m128)
vec_sel((__v4sf)__A,
__c, __mask));
726 extern __inline __m128
727 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
729 static const __vector
unsigned int __mask = {0xffffffff, 0, 0, 0};
740 return ((__m128)
vec_sel((__v4sf)__A,
__c, __mask));
743 extern __inline __m128
744 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
746 __vector
unsigned int __a,
__b;
747 __vector
unsigned int __c, __d;
748 static const __vector
unsigned int __float_exp_mask = {
749 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
750 static const __vector
unsigned int __mask = {0xffffffff, 0, 0, 0};
759 return ((__m128)
vec_sel((__v4sf)__A, (__v4sf)
__c, __mask));
762 extern __inline __m128
763 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
765 __vector
unsigned int __a,
__b;
766 __vector
unsigned int __c, __d;
767 static const __vector
unsigned int __float_exp_mask = {
768 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
769 static const __vector
unsigned int __mask = {0xffffffff, 0, 0, 0};
778 return ((__m128)
vec_sel((__v4sf)__A, (__v4sf)
__c, __mask));
784 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
786 return (__A[0] == __B[0]);
790 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
792 return (__A[0] < __B[0]);
796 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
798 return (__A[0] <= __B[0]);
802 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
804 return (__A[0] > __B[0]);
808 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
810 return (__A[0] >= __B[0]);
814 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
816 return (__A[0] != __B[0]);
828 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
830 return (__A[0] == __B[0]);
834 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
836 return (__A[0] < __B[0]);
840 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
842 return (__A[0] <= __B[0]);
846 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
848 return (__A[0] > __B[0]);
852 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
854 return (__A[0] >= __B[0]);
858 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
860 return (__A[0] != __B[0]);
863 extern __inline
float
864 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
866 return ((__v4sf)__A)[0];
872 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
878 #ifdef __LITTLE_ENDIAN__
879 "xxsldwi %x0,%x0,%x0,3;\n"
881 "xscvspdp %x2,%x0;\n"
884 :
"+wa"(__A),
"=r"(__res),
"=f"(__dtmp)
887 __res = __builtin_rint(__A[0]);
893 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
/* Converts element 0 of __A to a long long.
   When both _ARCH_PWR8 and __powerpc64__ are defined, an inline-asm sequence
   is used; the instructions visible here are xxsldwi (emitted only under
   __LITTLE_ENDIAN__) and xscvspdp, with __A as a read/write VSX operand,
   __res as a general-register output and __dtmp as a floating-point-register
   output.  The __builtin_llrint(__A[0]) statement is presumably the fallback
   for other targets -- the #else/#endif lines and the rest of the asm are not
   part of this excerpt. */
902 extern __inline
long long
903 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
904 _mm_cvtss_si64(__m128 __A) {
906 #if defined(_ARCH_PWR8) && defined(__powerpc64__)
909 #ifdef __LITTLE_ENDIAN__
910 "xxsldwi %x0,%x0,%x0,3;\n"
912 "xscvspdp %x2,%x0;\n"
915 :
"+wa"(__A),
"=r"(__res),
"=f"(__dtmp)
918 __res = __builtin_llrint(__A[0]);
/* Alternate spelling of _mm_cvtss_si64: forwards __A to it and returns the
   result unchanged. */
924 extern __inline
long long
925 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
926 _mm_cvtss_si64x(__m128 __A) {
927 return _mm_cvtss_si64((__v4sf)__A);
944 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
947 __builtin_prefetch(
__P);
/* Converts floats from __A to integers and returns them packed in a __m64.
   Steps visible here:
     1. view __A as two 64-bit elements and splat element 0 across the vector
        (so the same pair of floats fills both halves);
     2. round to integral values with vec_rint;
     3. convert to signed integers with vec_cts, scale factor 0;
     4. return 64-bit element 0 of the converted vector. */
952 extern __inline __m64
953 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
956 __v4sf __temp, __rounded;
957 __vector
unsigned long long __result;
960 __temp = (__v4sf)
vec_splat((__vector
long long)__A, 0);
961 __rounded = vec_rint(__temp);
962 __result = (__vector
unsigned long long)
vec_cts(__rounded, 0);
964 return (__m64)((__vector
long long)__result)[0];
967 extern __inline __m64
968 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
975 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
978 float __temp = __A[0];
984 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
990 extern __inline
long long
991 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
992 _mm_cvttss_si64(__m128 __A) {
994 float __temp = __A[0];
1000 extern __inline
long long
1001 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1002 _mm_cvttss_si64x(__m128 __A) {
1004 float __temp = __A[0];
1011 extern __inline __m64
1012 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1015 __vector
unsigned long long __result;
1018 __temp = (__v4sf)
vec_splat((__vector
long long)__A, 0);
1019 __result = (__vector
unsigned long long)
vec_cts(__temp, 0);
1021 return (__m64)((__vector
long long)__result)[0];
1024 extern __inline __m64
1025 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1031 extern __inline __m128
1032 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1040 extern __inline __m128
1041 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1048 extern __inline __m128
1049 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1050 _mm_cvtsi64_ss(__m128 __A,
long long __B) {
/* Alternate spelling of _mm_cvtsi64_ss: forwards (__A, __B) to it and returns
   the result unchanged. */
1058 extern __inline __m128
1059 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1060 _mm_cvtsi64x_ss(__m128 __A,
long long __B) {
1061 return _mm_cvtsi64_ss(__A, __B);
1066 extern __inline __m128
1067 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1069 __vector
signed int __vm1;
1070 __vector
float __vf1;
1072 __vm1 = (__vector
signed int)(__vector
unsigned long long){__B, __B};
1075 return ((__m128)(__vector
unsigned long long){
1076 ((__vector
unsigned long long)__vf1)[0],
1077 ((__vector
unsigned long long)__A)[1]});
1080 extern __inline __m128
1081 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1087 extern __inline __m128
1088 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1090 __vector
signed short __vs8;
1091 __vector
signed int __vi4;
1092 __vector
float __vf1;
1094 __vs8 = (__vector
signed short)(__vector
unsigned long long){__A, __A};
1098 return (__m128)__vf1;
1102 extern __inline __m128
1103 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1105 const __vector
unsigned short __zero = {0, 0, 0, 0, 0, 0, 0, 0};
1106 __vector
unsigned short __vs8;
1107 __vector
unsigned int __vi4;
1108 __vector
float __vf1;
1110 __vs8 = (__vector
unsigned short)(__vector
unsigned long long){__A, __A};
1112 #ifdef __LITTLE_ENDIAN__
1119 return (__m128)__vf1;
1123 extern __inline __m128
1124 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1126 __vector
signed char __vc16;
1127 __vector
signed short __vs8;
1128 __vector
signed int __vi4;
1129 __vector
float __vf1;
1131 __vc16 = (__vector
signed char)(__vector
unsigned long long){__A, __A};
1136 return (__m128)__vf1;
1140 extern __inline __m128
1141 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1144 const __vector
unsigned char __zero = {0, 0, 0, 0, 0, 0, 0, 0};
1145 __vector
unsigned char __vc16;
1146 __vector
unsigned short __vs8;
1147 __vector
unsigned int __vi4;
1148 __vector
float __vf1;
1150 __vc16 = (__vector
unsigned char)(__vector
unsigned long long){__A, __A};
1151 #ifdef __LITTLE_ENDIAN__
1152 __vs8 = (__vector
unsigned short)
vec_mergel(__vc16, __zero);
1154 (__vector
unsigned int)
vec_mergeh(__vs8, (__vector
unsigned short)__zero);
1156 __vs8 = (__vector
unsigned short)
vec_mergel(__zero, __vc16);
1158 (__vector
unsigned int)
vec_mergeh((__vector
unsigned short)__zero, __vs8);
1162 return (__m128)__vf1;
1166 extern __inline __m128
1167 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1169 __vector
signed int __vi4;
1170 __vector
float __vf4;
1172 __vi4 = (__vector
signed int)(__vector
unsigned long long){__A, __B};
1174 return (__m128)__vf4;
1178 extern __inline __m64
1179 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1182 __vector
signed int __temp;
1183 __vector
unsigned long long __result;
1185 __rounded = vec_rint(__A);
1186 __temp =
vec_cts(__rounded, 0);
1187 __result = (__vector
unsigned long long)
vec_pack(__temp, __temp);
1189 return (__m64)((__vector
long long)__result)[0];
1193 extern __inline __m64
1194 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1197 __vector
signed int __tmp_i;
1198 static const __vector
signed int __zero = {0, 0, 0, 0};
1199 __vector
signed short __tmp_s;
1200 __vector
signed char __res_v;
1202 __rounded = vec_rint(__A);
1203 __tmp_i =
vec_cts(__rounded, 0);
1204 __tmp_s =
vec_pack(__tmp_i, __zero);
1205 __res_v =
vec_pack(__tmp_s, __tmp_s);
1206 return (__m64)((__vector
long long)__res_v)[0];
/* Shuffle driven by an 8-bit immediate (__mask) made of four 2-bit selectors.
   Each selector is turned into a 4-byte permute-control word via the
   __permute_selectors table (byte order of the table entries depends on
   endianness), the four words are assembled into __t, and a single vec_perm
   over (__A, __B) produces the result. */
1210 extern __inline __m128
1211 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
/* Split the immediate into its four 2-bit fields: bits 1:0, 3:2, 5:4, 7:6. */
1214 unsigned long __element_selector_10 = __mask & 0x03;
1215 unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
1216 unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
1217 unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
1218 static const unsigned int __permute_selectors[4] = {
1219 #ifdef __LITTLE_ENDIAN__
1220 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
1222 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
1225 __vector
unsigned int __t;
/* Result elements 0 and 1 use the table entries as-is (byte indices 0x00-0x0F,
   i.e. the first vec_perm operand, __A).  Elements 2 and 3 add 0x10 to every
   byte index so they address the second operand, __B. */
1227 __t[0] = __permute_selectors[__element_selector_10];
1228 __t[1] = __permute_selectors[__element_selector_32];
1229 __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
1230 __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
1231 return vec_perm((__v4sf)__A, (__v4sf)__B, (__vector
unsigned char)__t);
1235 extern __inline __m128
1236 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1238 return (__m128)
vec_vmrglw((__v4sf)__A, (__v4sf)__B);
1242 extern __inline __m128
1243 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1245 return (__m128)
vec_vmrghw((__v4sf)__A, (__v4sf)__B);
1250 extern __inline __m128
1251 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1253 __vector
unsigned long long __a = (__vector
unsigned long long)__A;
1261 extern __inline
void
1262 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1264 __vector
unsigned long long __a = (__vector
unsigned long long)__A;
1270 extern __inline __m128
1271 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1273 return (__m128)
vec_mergel((__vector
unsigned long long)__B,
1274 (__vector
unsigned long long)__A);
1278 extern __inline __m128
1279 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1281 return (__m128)
vec_mergeh((__vector
unsigned long long)__A,
1282 (__vector
unsigned long long)__B);
1287 extern __inline __m128
1288 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1290 __vector
unsigned long long __a = (__vector
unsigned long long)__A;
1298 extern __inline
void
1299 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1301 __vector
unsigned long long __a = (__vector
unsigned long long)__A;
/* Two alternative implementations are visible in this excerpt; the
   preprocessor condition that chooses between them is not shown.
     - One returns vec_extractm applied to __A viewed as four unsigned ints.
     - The other gathers bits with vec_vbpermq using __perm_mask.  The mask's
       meaningful word holds the bit indices 0x00, 0x20, 0x40, 0x60 (one per
       32-bit element, 32 bits apart); every other byte is 0x80.  Which word
       carries the indices depends on endianness.
   NOTE(review): 0x80 presumably makes vbpermq contribute a zero bit for that
   position -- confirm against the ISA description. */
1311 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1314 return vec_extractm((__vector
unsigned int)__A);
1316 __vector
unsigned long long __result;
1317 static const __vector
unsigned int __perm_mask = {
1318 #ifdef __LITTLE_ENDIAN__
1319 0x00204060, 0x80808080, 0x80808080, 0x80808080
1321 0x80808080, 0x80808080, 0x80808080, 0x00204060
1325 __result = ((__vector
unsigned long long)vec_vbpermq(
1326 (__vector
unsigned char)__A, (__vector
unsigned char)__perm_mask));
1328 #ifdef __LITTLE_ENDIAN__
1338 extern __inline __m128
1339 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1344 extern __inline __m128
1345 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
/* Extracts one 16-bit field from the 64-bit value __A.  Only the low two bits
   of __N are used as the field index; when __BIG_ENDIAN__ is defined the
   index is mirrored (3 - index) before use.  The field is shifted down to bit
   0 and masked to 16 bits. */
1352 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1354 unsigned int __shiftr = __N & 3;
1355 #ifdef __BIG_ENDIAN__
1356 __shiftr = 3 - __shiftr;
1359 return ((__A >> (__shiftr * 16)) & 0xffff);
1363 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1364 _m_pextrw(__m64
const __A,
int const __N) {
/* Replaces one 16-bit field of __A with the low 16 bits of __D.  The field
   index is (__N & 3); its bit offset is index * 16. */
1370 extern __inline __m64
1371 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1373 const int __shiftl = (__N & 3) * 16;
1374 const __m64 __shiftD = (
const __m64)
__D << __shiftl;
1375 const __m64 __mask = 0xffffUL << __shiftl;
/* Clear the target field in __A, then OR in the shifted value restricted to
   that same field. */
1376 __m64 __result = (__A & (~__mask)) | (__shiftD & __mask);
1381 extern __inline __m64
1382 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1383 _m_pinsrw(__m64
const __A,
int const __D,
int const __N) {
1388 extern __inline __m64
1389 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1393 __vector
signed short __a,
__b, __r;
1394 __vector __bool
short __c;
1400 return (__m64)((__vector
long long)__r)[0];
1402 __m64_union __m1, __m2, __res;
1407 __res.as_short[0] = (__m1.as_short[0] > __m2.as_short[0]) ? __m1.as_short[0]
1409 __res.as_short[1] = (__m1.as_short[1] > __m2.as_short[1]) ? __m1.as_short[1]
1411 __res.as_short[2] = (__m1.as_short[2] > __m2.as_short[2]) ? __m1.as_short[2]
1413 __res.as_short[3] = (__m1.as_short[3] > __m2.as_short[3]) ? __m1.as_short[3]
1416 return (__m64)__res.as_m64;
1420 extern __inline __m64
1421 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1427 extern __inline __m64
1428 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1431 __vector
unsigned char __a,
__b, __r;
1432 __vector __bool
char __c;
1438 return (__m64)((__vector
long long)__r)[0];
1440 __m64_union __m1, __m2, __res;
1446 for (__i = 0; __i < 8; __i++)
1447 __res.as_char[__i] =
1448 ((
unsigned char)__m1.as_char[__i] > (
unsigned char)__m2.as_char[__i])
1450 : __m2.as_char[__i];
1452 return (__m64)__res.as_m64;
1456 extern __inline __m64
1457 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1463 extern __inline __m64
1464 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1467 __vector
signed short __a,
__b, __r;
1468 __vector __bool
short __c;
1474 return (__m64)((__vector
long long)__r)[0];
1476 __m64_union __m1, __m2, __res;
1481 __res.as_short[0] = (__m1.as_short[0] < __m2.as_short[0]) ? __m1.as_short[0]
1483 __res.as_short[1] = (__m1.as_short[1] < __m2.as_short[1]) ? __m1.as_short[1]
1485 __res.as_short[2] = (__m1.as_short[2] < __m2.as_short[2]) ? __m1.as_short[2]
1487 __res.as_short[3] = (__m1.as_short[3] < __m2.as_short[3]) ? __m1.as_short[3]
1490 return (__m64)__res.as_m64;
1494 extern __inline __m64
1495 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1501 extern __inline __m64
1502 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1505 __vector
unsigned char __a,
__b, __r;
1506 __vector __bool
char __c;
1512 return (__m64)((__vector
long long)__r)[0];
1514 __m64_union __m1, __m2, __res;
1520 for (__i = 0; __i < 8; __i++)
1521 __res.as_char[__i] =
1522 ((
unsigned char)__m1.as_char[__i] < (
unsigned char)__m2.as_char[__i])
1524 : __m2.as_char[__i];
1526 return (__m64)__res.as_m64;
1530 extern __inline __m64
1531 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
/* Gathers one bit per byte of __A using __builtin_bpermd.
   With __powerpc64__ defined, a single bpermd is issued with a list of eight
   bit indices (0x00, 0x08, ..., 0x38 -- one per byte, 8 apart); the list's
   byte order is endian-specific.
   NOTE(review): in bpermd's bit numbering these indices presumably name the
   most-significant bit of each byte -- confirm against the ISA description. */
1538 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1540 #ifdef __powerpc64__
1541 unsigned long long __p =
1542 #ifdef __LITTLE_ENDIAN__
1543 0x0008101820283038UL;
1545 0x3830282018100800UL;
1547 return __builtin_bpermd(
__p, __A);
/* Alternative path (the #else line is not part of this excerpt): two bpermd
   calls, one on __A and one on __A >> 32, each masked to 4 bits; which half
   feeds __r1 versus __r2 depends on endianness.  The halves are combined as
   (__r2 << 4) | __r1. */
1549 #ifdef __LITTLE_ENDIAN__
1550 unsigned int __mask = 0x20283038UL;
1551 unsigned int __r1 = __builtin_bpermd(__mask, __A) & 0xf;
1552 unsigned int __r2 = __builtin_bpermd(__mask, __A >> 32) & 0xf;
1554 unsigned int __mask = 0x38302820UL;
1555 unsigned int __r1 = __builtin_bpermd(__mask, __A >> 32) & 0xf;
1556 unsigned int __r2 = __builtin_bpermd(__mask, __A) & 0xf;
1558 return (__r2 << 4) | __r1;
1563 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1570 extern __inline __m64
1571 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1573 __vector
unsigned short __a,
__b;
1574 __vector
unsigned short __c;
1575 __vector
unsigned int __w0, __w1;
1576 __vector
unsigned char __xform1 = {
1577 #ifdef __LITTLE_ENDIAN__
1578 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1579 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1581 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
1582 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
1589 __w0 = vec_vmuleuh(
__a,
__b);
1590 __w1 = vec_vmulouh(
__a,
__b);
1591 __c = (__vector
unsigned short)
vec_perm(__w0, __w1, __xform1);
1593 return (__m64)((__vector
long long)
__c)[0];
1596 extern __inline __m64
1597 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1604 extern __inline __m64
1605 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1607 unsigned long __element_selector_10 = __N & 0x03;
1608 unsigned long __element_selector_32 = (__N >> 2) & 0x03;
1609 unsigned long __element_selector_54 = (__N >> 4) & 0x03;
1610 unsigned long __element_selector_76 = (__N >> 6) & 0x03;
1611 static const unsigned short __permute_selectors[4] = {
1612 #ifdef __LITTLE_ENDIAN__
1613 0x0908, 0x0B0A, 0x0D0C, 0x0F0E
1615 0x0607, 0x0405, 0x0203, 0x0001
1619 __vector
unsigned long long __a,
__p, __r;
1621 #ifdef __LITTLE_ENDIAN__
1622 __t.as_short[0] = __permute_selectors[__element_selector_10];
1623 __t.as_short[1] = __permute_selectors[__element_selector_32];
1624 __t.as_short[2] = __permute_selectors[__element_selector_54];
1625 __t.as_short[3] = __permute_selectors[__element_selector_76];
1627 __t.as_short[3] = __permute_selectors[__element_selector_10];
1628 __t.as_short[2] = __permute_selectors[__element_selector_32];
1629 __t.as_short[1] = __permute_selectors[__element_selector_54];
1630 __t.as_short[0] = __permute_selectors[__element_selector_76];
1635 return (__m64)((__vector
long long)__r)[0];
1638 extern __inline __m64
1639 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
/* Read-modify-write merge through the pointer __P (accessed as __m64 *).
   The visible merge step keeps the bits of __tmp where __mask is clear and
   takes the bits of __A where __mask is set.  How __mask is derived from
   __hibit (0x80 in every byte), and the load/store of __tmp, are not part of
   this excerpt. */
1647 extern __inline
void
1648 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1650 __m64 __hibit = 0x8080808080808080UL;
1651 __m64 __mask, __tmp;
1652 __m64 *
__p = (__m64 *)
__P;
1656 __tmp = (__tmp & (~__mask)) | (__A & __mask);
1660 extern __inline
void
1661 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1667 extern __inline __m64
1668 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1675 return (__m64)((__vector
long long)
__c)[0];
1678 extern __inline __m64
1679 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1685 extern __inline __m64
1686 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1693 return (__m64)((__vector
long long)
__c)[0];
1696 extern __inline __m64
1697 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
/* Sum of absolute differences over the unsigned bytes of __A and __B.
   The 16-bit total is written to as_short[0] of a zero-initialised
   __m64_union, which is returned. */
1705 extern __inline __m64
1706 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1708 __vector
unsigned char __a,
__b;
1709 __vector
unsigned char __vmin, __vmax, __vabsdiff;
1710 __vector
signed int __vsum;
1711 const __vector
unsigned int __zero = {0, 0, 0, 0};
1712 __m64_union __result = {0};
/* Place each 64-bit input in element 1 of a two-element vector whose element
   0 is zero, so the unused half contributes nothing to the sums below. */
1714 __a = (__vector
unsigned char)(__vector
unsigned long long){0UL, __A};
1715 __b = (__vector
unsigned char)(__vector
unsigned long long){0UL, __B};
/* |a - b| per byte computed as max - min (the assignments to __vmin/__vmax
   are not part of this excerpt). */
1718 __vabsdiff =
vec_sub(__vmax, __vmin);
/* Reduce: vec_sum4s adds groups of four bytes into words, then vec_sums
   folds the words into a single total, read from element 3. */
1720 __vsum = (__vector
signed int)
vec_sum4s(__vabsdiff, __zero);
1722 __vsum = vec_sums(__vsum, (__vector
signed int)__zero);
1725 __result.as_short[0] = __vsum[3];
1726 return __result.as_m64;
1729 extern __inline __m64
1730 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1736 extern __inline
void
1737 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1740 __asm__(
" dcbtstt 0,%0" : :
"b"(
__P) :
"memory");
1745 extern __inline
void
1746 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1749 __asm__(
" dcbtstt 0,%0" : :
"b"(
__P) :
"memory");
/* Issues a thread fence with release ordering (__ATOMIC_RELEASE).  No other
   statement of the body is visible in this excerpt. */
1755 extern __inline
void
1756 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1759 __atomic_thread_fence(__ATOMIC_RELEASE);
1767 extern __inline
void
1768 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1783 unsigned long __PPR;
1785 __asm__
volatile(
" mfppr %0;"
1797 __atomic_thread_fence(__ATOMIC_SEQ_CST);
/* In-place rearrangement of four vectors row0..row3.
   The macro first copies the four arguments into locals (__r0..__r3), forms
   __t0..__t3 by interleaving pairs of rows with vec_vmrghw / vec_vmrglw, and
   then writes each argument back as a vec_mergeh / vec_mergel of two of those
   intermediates viewed as 64-bit elements.
   Each argument is both read and assigned, so it must be a modifiable lvalue,
   and its expression is evaluated twice (once for the read, once for the
   write). */
1802 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
1804 __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
1805 __v4sf __t0 = vec_vmrghw(__r0, __r1); \
1806 __v4sf __t1 = vec_vmrghw(__r2, __r3); \
1807 __v4sf __t2 = vec_vmrglw(__r0, __r1); \
1808 __v4sf __t3 = vec_vmrglw(__r2, __r3); \
1809 (row0) = (__v4sf)vec_mergeh((__vector long long)__t0, \
1810 (__vector long long)__t1); \
1811 (row1) = (__v4sf)vec_mergel((__vector long long)__t0, \
1812 (__vector long long)__t1); \
1813 (row2) = (__v4sf)vec_mergeh((__vector long long)__t2, \
1814 (__vector long long)__t3); \
1815 (row3) = (__v4sf)vec_mergel((__vector long long)__t2, \
1816 (__vector long long)__t3); \
1823 #include_next <xmmintrin.h>
static __inline__ vector bool char __ATTRS_o_ai vec_cmpeq(vector signed char __a, vector signed char __b)
static __inline__ vector int __ATTRS_o_ai vec_vmrghw(vector int __a, vector int __b)
#define vec_ctf(__a, __b)
static __inline__ vector int __ATTRS_o_ai vec_vupkhsh(vector short __a)
static __inline__ vector float vector float vector float __c
static __inline__ vector float vector float __b
static __inline__ vector signed char __ATTRS_o_ai vec_ld(long __a, const vector signed char *__b)
static __inline__ vector signed char __ATTRS_o_ai vec_splats(signed char __a)
static __inline__ vector short __ATTRS_o_ai vec_vupkhsb(vector signed char __a)
static __inline__ vector signed char __ATTRS_o_ai vec_andc(vector signed char __a, vector signed char __b)
static __inline__ void __ATTRS_o_ai vec_st(vector signed char __a, long __b, vector signed char *__c)
static __inline__ vector int __ATTRS_o_ai vec_sum4s(vector signed char __a, vector int __b)
static __inline__ vector signed char __ATTRS_o_ai vec_and(vector signed char __a, vector signed char __b)
static __inline__ vector signed char __ATTRS_o_ai vec_avg(vector signed char __a, vector signed char __b)
static __inline__ vector signed char __ATTRS_o_ai vec_mergel(vector signed char __a, vector signed char __b)
static __inline__ vector int __ATTRS_o_ai vec_vmrglw(vector int __a, vector int __b)
static __inline__ vector signed char __ATTRS_o_ai vec_perm(vector signed char __a, vector signed char __b, vector unsigned char __c)
static __inline__ vector signed char __ATTRS_o_ai vec_sel(vector signed char __a, vector signed char __b, vector unsigned char __c)
static __inline__ vector signed char __ATTRS_o_ai vec_mergeh(vector signed char __a, vector signed char __b)
static __inline__ vector int __ATTRS_o_ai vec_vupklsh(vector short __a)
static __inline__ vector bool char __ATTRS_o_ai vec_cmplt(vector signed char __a, vector signed char __b)
static __inline__ vector signed char __ATTRS_o_ai vec_max(vector signed char __a, vector signed char __b)
static __inline__ vector signed char __ATTRS_o_ai vec_nor(vector signed char __a, vector signed char __b)
static __inline__ vector bool char __ATTRS_o_ai vec_cmpge(vector signed char __a, vector signed char __b)
static __inline__ vector signed char __ATTRS_o_ai vec_pack(vector signed short __a, vector signed short __b)
static __inline__ vector float __ATTRS_o_ai vec_re(vector float __a)
static __inline__ vector signed char __ATTRS_o_ai vec_min(vector signed char __a, vector signed char __b)
static __inline__ vector signed char __ATTRS_o_ai vec_splat(vector signed char __a, unsigned const int __b)
static __inline__ vector signed char __ATTRS_o_ai vec_or(vector signed char __a, vector signed char __b)
static __inline__ vector signed char __ATTRS_o_ai vec_abs(vector signed char __a)
static __inline__ vector unsigned char __ATTRS_o_ai vec_xor(vector unsigned char __a, vector unsigned char __b)
static __inline__ vector float __ATTRS_o_ai vec_rsqrte(vector float __a)
static __inline__ vector bool char __ATTRS_o_ai vec_cmpgt(vector signed char __a, vector signed char __b)
static __inline__ vector bool char __ATTRS_o_ai vec_cmple(vector signed char __a, vector signed char __b)
static __inline__ vector signed char __ATTRS_o_ai vec_sub(vector signed char __a, vector signed char __b)
static __inline__ uint32_t volatile uint32_t * __p
static __inline__ void int __a
void _mm_pause(void)
Indicates that a spin loop is being executed for the purposes of optimizing power consumption during ...
static __inline__ void short __D
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
Compares the 8-bit integer elements of two 64-bit integer vectors of [8 x i8] to determine if the ele...
__inline unsigned int unsigned int unsigned int * __P
__inline unsigned int unsigned int __Y
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttss_si32(__m128 __a)
Converts the lower (first) element of a vector of [4 x float] into a signed truncated (rounded toward...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rcp_ss(__m128 __a)
Calculates the approximate reciprocal of the value stored in the low-order bits of a 128-bit vector o...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmplt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ss(__m128 __a)
Calculates the square root of the value stored in the low-order bits of a 128-bit vector of [4 x floa...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_div_ps(__m128 __a, __m128 __b)
Divides two 128-bit vectors of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnge_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpeq_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] for equa...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtps_pi32(__m128 __a)
Converts two low-order float values in a 128-bit vector of [4 x float] into a 64-bit vector of [2 x i...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set_ss(float __w)
Constructs a 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_and_ps(__m128 __a, __m128 __b)
Performs a bitwise AND of two 128-bit vectors of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmplt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvt_ss2si(__m128 __a)
Converts a float value contained in the lower 32 bits of a vector of [4 x float] into a 32-bit intege...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpeq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for equality.
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_add_ps(__m128 __a, __m128 __b)
Adds two 128-bit vectors of [4 x float], and returns the results of the addition.
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an aligned memory location.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mulhi_pu16(__m64 __a, __m64 __b)
Multiplies packed 16-bit unsigned integer values and writes the high-order 16 bits of each 32-bit pro...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpneq_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] for ineq...
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvt_ps2pi(__m128 __a)
Converts two low-order float values in a 128-bit vector of [4 x float] into a 64-bit vector of [2 x i...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_andnot_ps(__m128 __a, __m128 __b)
Performs a bitwise AND of two 128-bit vectors of [4 x float], using the one's complement of the value...
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_ps(float *__p, __m128 __a)
Stores float values from a 128-bit vector of [4 x float] to an aligned memory location in reverse ord...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_undefined_ps(void)
Create a 128-bit vector of [4 x float] with undefined values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttps_pi32(__m128 __a)
Converts the lower (first) two elements of a 128-bit vector of [4 x float] into two signed truncated ...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnle_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmple_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvt_si2ss(__m128 __a, int __b)
Converts a 32-bit signed integer value into a floating point value and writes it to the lower 32 bits...
static __inline__ void __DEFAULT_FN_ATTRS_MMX _mm_stream_pi(void *__p, __m64 __a)
Stores a 64-bit integer in the specified aligned memory location.
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpi16_ps(__m64 __a)
Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rsqrt_ps(__m128 __a)
Calculates the approximate reciprocals of the square roots of the values stored in a 128-bit vector o...
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtps_pi8(__m128 __a)
Converts each single-precision floating-point element of a 128-bit floating-point vector of [4 x floa...
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pi(__m64 *__p, __m128 __a)
Stores the lower 64 bits of a 128-bit vector of [4 x float] to a memory location.
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpge_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
#define _mm_shuffle_ps(a, b, mask)
Selects 4 float values from the 128-bit operands of [4 x float], as specified by the immediate value ...
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for equality.
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_ps(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] into four contiguous elements in an align...
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpu16_ps(__m64 __a)
Converts a 64-bit vector of 16-bit unsigned integer values into a 128-bit vector of [4 x float].
void _mm_sfence(void)
Forces strong memory ordering (serialization) between store instructions preceding this instruction a...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set_ps1(float __w)
Constructs a 128-bit floating-point vector of [4 x float], with each of the four single-precision flo...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mul_ps(__m128 __a, __m128 __b)
Multiplies two 128-bit vectors of [4 x float] and returns the results of the multiplication.
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_max_ps(__m128 __a, __m128 __b)
Compares two 128-bit vectors of [4 x float] and returns the greater of each pair of values.
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rsqrt_ss(__m128 __a)
Calculates the approximate reciprocal of the square root of the value stored in the low-order bits of...
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_avg_pu16(__m64 __a, __m64 __b)
Computes the rounded averages of the packed unsigned 16-bit integer values and writes the averages to...
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadl_pi(__m128 __a, const __m64 *__p)
Loads two packed float values from the address __p into the low-order bits of a 128-bit vector of [4 ...
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] to an unaligned memory location.
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sub_ss(__m128 __a, __m128 __b)
Subtracts the 32-bit float value in the low-order bits of the second operand from the corresponding v...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sub_ps(__m128 __a, __m128 __b)
Subtracts each of the values of the second operand from the first operand, both of which are 128-bit ...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load1_ps(const float *__p)
Loads a 32-bit float value and duplicates it to all four vector elements of a 128-bit vector of [4 x ...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_movelh_ps(__m128 __a, __m128 __b)
Constructs a 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_min_ps(__m128 __a, __m128 __b)
Compares two 128-bit vectors of [4 x float] and returns the lesser of each pair of values.
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_ps(void *__p, __m128 __a)
Moves packed float values from a 128-bit vector of [4 x float] to a 128-bit aligned memory location.
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtss_si32(__m128 __a)
Converts a float value contained in the lower 32 bits of a vector of [4 x float] into a 32-bit intege...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpgt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_setr_ps(float __z, float __y, float __x, float __w)
Constructs a 128-bit floating-point vector of [4 x float], initialized in reverse order with the spec...
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_unpackhi_ps(__m128 __a, __m128 __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x float] and interleaves the...
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ss(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] to a memory location.
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpngt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadh_pi(__m128 __a, const __m64 *__p)
Loads two packed float values from the address __p into the high-order bits of a 128-bit vector of [4...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_xor_ps(__m128 __a, __m128 __b)
Performs a bitwise exclusive OR of two 128-bit vectors of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rcp_ps(__m128 __a)
Calculates the approximate reciprocals of the values stored in a 128-bit vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_move_ss(__m128 __a, __m128 __b)
Constructs a 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set1_ps(float __w)
Constructs a 128-bit floating-point vector of [4 x float], with each of the four single-precision flo...
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] into an aligned memory location.
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_or_ps(__m128 __a, __m128 __b)
Performs a bitwise OR of two 128-bit vectors of [4 x float].
#define _mm_extract_pi16(a, n)
Extracts 16-bit element from a 64-bit vector of [4 x i16] and returns it, as specified by the immedia...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ps(__m128 __a)
Calculates the square roots of the values stored in a 128-bit vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpneq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for inequality.
#define _mm_prefetch(a, sel)
Loads one cache line of data from the specified address to a location closer to the processor.
static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtss_f32(__m128 __a)
Extracts a float value contained in the lower 32 bits of a vector of [4 x float].
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_max_pu8(__m64 __a, __m64 __b)
Compares each of the corresponding packed 8-bit unsigned integer values of the 64-bit integer vectors...
int __v4si __attribute__((__vector_size__(16)))
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mul_ss(__m128 __a, __m128 __b)
Multiplies two 32-bit float values in the low-order bits of the operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_min_pi16(__m64 __a, __m64 __b)
Compares each of the corresponding packed 16-bit integer values of the 64-bit integer vectors,...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsi32_ss(__m128 __a, int __b)
Converts a 32-bit signed integer value into a floating point value and writes it to the lower 32 bits...
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtt_ps2pi(__m128 __a)
Converts the lower (first) two elements of a 128-bit vector of [4 x float] into two signed truncated ...
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtt_ss2si(__m128 __a)
Converts the lower (first) element of a vector of [4 x float] into a signed truncated (rounded toward...
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_ps(__m128 __a)
Extracts the sign bits from each single-precision floating-point element of a 128-bit floating-point ...
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
Converts the two 32-bit signed integer values from each 64-bit vector operand of [2 x i32] into a 128...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_movehl_ps(__m128 __a, __m128 __b)
Constructs a 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadr_ps(const float *__p)
Loads four packed float values, in reverse order, from an aligned memory location to 32-bit elements ...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpord_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnlt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pi(__m64 *__p, __m128 __a)
Stores the upper 64 bits of a 128-bit vector of [4 x float] to a memory location.
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpngt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnge_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
#define _mm_insert_pi16(a, d, n)
Copies data from the 64-bit vector of [4 x i16] to the destination, and inserts the lower 16-bits of ...
#define _mm_shuffle_pi16(a, n)
Shuffles the 4 16-bit integers from a 64-bit integer vector to the destination, as specified by the i...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpord_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpgt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvt_pi2ps(__m128 __a, __m64 __b)
Converts two elements of a 64-bit vector of [2 x i32] into two floating point values and writes them ...
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_add_ss(__m128 __a, __m128 __b)
Adds the 32-bit float values in the low-order bits of the operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set_ps(float __z, float __y, float __x, float __w)
Constructs a 128-bit floating-point vector of [4 x float] initialized with the specified single-preci...
static __inline__ int __DEFAULT_FN_ATTRS_MMX _mm_movemask_pi8(__m64 __a)
Takes the most significant bit from each 8-bit element in a 64-bit integer vector to create an 8-bit ...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnlt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ps1(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] into four contiguous elements in an align...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_min_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands and returns the lesser value ...
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpu8_ps(__m64 __a)
Converts the lower four unsigned 8-bit integer values from a 64-bit vector of [8 x u8] into a 128-bit...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmple_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_avg_pu8(__m64 __a, __m64 __b)
Computes the rounded averages of the packed unsigned 8-bit integer values and writes the averages to ...
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpi8_ps(__m64 __a)
Converts the lower four 8-bit values from a 64-bit vector of [8 x i8] into a 128-bit vector of [4 x f...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_unpacklo_ps(__m128 __a, __m128 __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x float] and interleaves them...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_max_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands and returns the greater value...
static __inline__ void __DEFAULT_FN_ATTRS_MMX _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
Conditionally copies the values from each 8-bit element in the first 64-bit integer vector operand to...
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_max_pi16(__m64 __a, __m64 __b)
Compares each of the corresponding packed 16-bit integer values of the 64-bit integer vectors,...
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtps_pi16(__m128 __a)
Converts each single-precision floating-point element of a 128-bit floating-point vector of [4 x floa...
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_min_pu8(__m64 __a, __m64 __b)
Compares each of the corresponding packed 8-bit unsigned integer values of the 64-bit integer vectors...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnle_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sad_pu8(__m64 __a, __m64 __b)
Subtracts the corresponding 8-bit unsigned integer values of the two 64-bit vector operands and compu...
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_ps(__m128 __a, __m64 __b)
Converts two elements of a 64-bit vector of [2 x i32] into two floating point values and writes them ...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpunord_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpge_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpunord_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadu_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an unaligned memory location.
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load_ss(const float *__p)
Constructs a 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_div_ss(__m128 __a, __m128 __b)
Divides the value in the low-order 32 bits of the first operand by the corresponding value in the sec...