1 /*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 /* Implemented from the specification included in the Intel C++ Compiler
11  User Guide and Reference, version 9.0. */
12 
13 #ifndef NO_WARN_X86_INTRINSICS
14 /* This header file is to help port code that uses Intel intrinsics
15  explicitly from x86_64 to powerpc64/powerpc64le.
16 
17  Since the X86 SSE2 intrinsics mainly handle the __m128i and __m128d types,
18  the PowerPC VMX/VSX ISA is a good match for vector double SIMD operations.
19  However, scalar double operations in vector (XMM) registers require
20  the POWER8 VSX ISA (2.07) level. There are differences in the data
21  format and placement of double scalars in the vector register, which
22  require extra steps to match SSE2 scalar double semantics on POWER.
23 
24  It should be noted that there are significant differences between X86_64's
25  MXCSR and PowerISA's FPSCR/VSCR registers. It's recommended to use
26  the portable <fenv.h> interface instead of accessing the MXCSR directly.
27 
28  Most SSE2 scalar double intrinsic operations can be performed more
29  efficiently as C language double scalar operations or optimized to
30  use vector SIMD operations. We recommend this for new applications.
31 */
32 #error \
33  "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
34 #endif
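
/* For example, a port might build with this warning disabled and POWER8
   VSX enabled. This invocation is illustrative only; target triple and
   CPU level depend on the platform being ported to:

     clang -target powerpc64le-linux-gnu -mcpu=power8 \
           -DNO_WARN_X86_INTRINSICS -c foo.c
*/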
35 
36 #ifndef EMMINTRIN_H_
37 #define EMMINTRIN_H_
38 
39 #if defined(__powerpc64__) && \
40  (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
41 
42 #include <altivec.h>
43 
44 /* We need definitions from the SSE header files. */
45 #include <xmmintrin.h>
46 
47 /* SSE2 */
48 typedef __vector double __v2df;
49 typedef __vector float __v4f;
50 typedef __vector long long __v2di;
51 typedef __vector unsigned long long __v2du;
52 typedef __vector int __v4si;
53 typedef __vector unsigned int __v4su;
54 typedef __vector short __v8hi;
55 typedef __vector unsigned short __v8hu;
56 typedef __vector signed char __v16qi;
57 typedef __vector unsigned char __v16qu;
58 
59 /* The Intel API is flexible enough that we must allow aliasing with other
60  vector types, and their scalar components. */
61 typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
62 typedef double __m128d __attribute__((__vector_size__(16), __may_alias__));
63 
64 /* Unaligned version of the same types. */
65 typedef long long __m128i_u
66  __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
67 typedef double __m128d_u
68  __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
69 
70 /* Define a two-element permute mask. */
71 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
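
/* Usage sketch (illustrative): _MM_SHUFFLE2 packs two element selectors
   into the 2-bit immediate expected by _mm_shuffle_pd below. Bit 0 selects
   the low result element from the first operand and bit 1 selects the high
   result element from the second:

     __m128d __r = _mm_shuffle_pd(__a, __b, _MM_SHUFFLE2(0, 1));
     // __r[0] == __a[1], __r[1] == __b[0]
*/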
72 
73 /* Create a vector with element 0 as F and the rest zero. */
74 extern __inline __m128d
75  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
76  _mm_set_sd(double __F) {
77  return __extension__(__m128d){__F, 0.0};
78 }
79 
80 /* Create a vector with both elements equal to F. */
81 extern __inline __m128d
82  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
83  _mm_set1_pd(double __F) {
84  return __extension__(__m128d){__F, __F};
85 }
86 
87 extern __inline __m128d
88  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
89  _mm_set_pd1(double __F) {
90  return _mm_set1_pd(__F);
91 }
92 
93 /* Create a vector with the lower value X and upper value W. */
94 extern __inline __m128d
95  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
96  _mm_set_pd(double __W, double __X) {
97  return __extension__(__m128d){__X, __W};
98 }
99 
100 /* Create a vector with the lower value W and upper value X. */
101 extern __inline __m128d
102  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
103  _mm_setr_pd(double __W, double __X) {
104  return __extension__(__m128d){__W, __X};
105 }
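
/* Note the argument-order difference between the two constructors
   (a small sketch):

     __m128d __v1 = _mm_set_pd(2.0, 1.0);  // element [0] = 1.0, [1] = 2.0
     __m128d __v2 = _mm_setr_pd(1.0, 2.0); // same layout: [0] = 1.0, [1] = 2.0
*/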
106 
107 /* Create an undefined vector. */
108 extern __inline __m128d
109  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
110  _mm_undefined_pd(void) {
111  __m128d __Y = __Y;
112  return __Y;
113 }
114 
115 /* Create a vector of zeros. */
116 extern __inline __m128d
117  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
118  _mm_setzero_pd(void) {
119  return (__m128d)vec_splats(0);
120 }
121 
122 /* Sets the low DPFP value of A from the low value of B. */
123 extern __inline __m128d
124  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
125  _mm_move_sd(__m128d __A, __m128d __B) {
126  __v2df __result = (__v2df)__A;
127  __result[0] = ((__v2df)__B)[0];
128  return (__m128d)__result;
129 }
130 
131 /* Load two DPFP values from P. The address must be 16-byte aligned. */
132 extern __inline __m128d
133  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
134  _mm_load_pd(double const *__P) {
135  return ((__m128d)vec_ld(0, (__v16qu *)__P));
136 }
137 
138 /* Load two DPFP values from P. The address need not be 16-byte aligned. */
139 extern __inline __m128d
140  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
141  _mm_loadu_pd(double const *__P) {
142  return (vec_vsx_ld(0, __P));
143 }
144 
145 /* Create a vector with both elements equal to *P. */
146 extern __inline __m128d
147  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
148  _mm_load1_pd(double const *__P) {
149  return (vec_splats(*__P));
150 }
151 
152 /* Create a vector with element 0 as *P and the rest zero. */
153 extern __inline __m128d
154  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
155  _mm_load_sd(double const *__P) {
156  return _mm_set_sd(*__P);
157 }
158 
159 extern __inline __m128d
160  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
161  _mm_load_pd1(double const *__P) {
162  return _mm_load1_pd(__P);
163 }
164 
165 /* Load two DPFP values in reverse order. The address must be aligned. */
166 extern __inline __m128d
167  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
168  _mm_loadr_pd(double const *__P) {
169  __v2df __tmp = _mm_load_pd(__P);
170  return (__m128d)vec_xxpermdi(__tmp, __tmp, 2);
171 }
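
/* Load-family sketch (illustrative): _mm_load_pd requires a 16-byte
   aligned pointer, while _mm_loadu_pd tolerates any alignment:

     double __buf[2] __attribute__((aligned(16))) = {1.0, 2.0};
     __m128d __x = _mm_load_pd(__buf);  // aligned load: {1.0, 2.0}
     __m128d __y = _mm_loadr_pd(__buf); // reversed:     {2.0, 1.0}
*/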
172 
173 /* Store two DPFP values. The address must be 16-byte aligned. */
174 extern __inline void
175  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
176  _mm_store_pd(double *__P, __m128d __A) {
177  vec_st((__v16qu)__A, 0, (__v16qu *)__P);
178 }
179 
180 /* Store two DPFP values. The address need not be 16-byte aligned. */
181 extern __inline void
182  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
183  _mm_storeu_pd(double *__P, __m128d __A) {
184  *(__m128d_u *)__P = __A;
185 }
186 
187 /* Stores the lower DPFP value. */
188 extern __inline void
189  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
190  _mm_store_sd(double *__P, __m128d __A) {
191  *__P = ((__v2df)__A)[0];
192 }
193 
194 extern __inline double
195  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
196  _mm_cvtsd_f64(__m128d __A) {
197  return ((__v2df)__A)[0];
198 }
199 
200 extern __inline void
201  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
202  _mm_storel_pd(double *__P, __m128d __A) {
203  _mm_store_sd(__P, __A);
204 }
205 
206 /* Stores the upper DPFP value. */
207 extern __inline void
208  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
209  _mm_storeh_pd(double *__P, __m128d __A) {
210  *__P = ((__v2df)__A)[1];
211 }
212 /* Store the lower DPFP value across two words.
213  The address must be 16-byte aligned. */
214 extern __inline void
215  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
216  _mm_store1_pd(double *__P, __m128d __A) {
217  _mm_store_pd(__P, vec_splat(__A, 0));
218 }
219 
220 extern __inline void
221  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
222  _mm_store_pd1(double *__P, __m128d __A) {
223  _mm_store1_pd(__P, __A);
224 }
225 
226 /* Store two DPFP values in reverse order. The address must be aligned. */
227 extern __inline void
228  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
229  _mm_storer_pd(double *__P, __m128d __A) {
230  _mm_store_pd(__P, vec_xxpermdi(__A, __A, 2));
231 }
232 
233 /* Intel intrinsic. */
234 extern __inline long long
235  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
236  _mm_cvtsi128_si64(__m128i __A) {
237  return ((__v2di)__A)[0];
238 }
239 
240 /* Microsoft intrinsic. */
241 extern __inline long long
242  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
243  _mm_cvtsi128_si64x(__m128i __A) {
244  return ((__v2di)__A)[0];
245 }
246 
247 extern __inline __m128d
248  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
249  _mm_add_pd(__m128d __A, __m128d __B) {
250  return (__m128d)((__v2df)__A + (__v2df)__B);
251 }
252 
253 /* Add the lower double-precision (64-bit) floating-point element in
254  a and b, store the result in the lower element of dst, and copy
255  the upper element from a to the upper element of dst. */
256 extern __inline __m128d
257  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
258  _mm_add_sd(__m128d __A, __m128d __B) {
259  __A[0] = __A[0] + __B[0];
260  return (__A);
261 }
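
/* Scalar-versus-packed sketch: the _sd forms only touch element [0]
   and pass element [1] of the first operand through unchanged:

     __m128d __a = _mm_set_pd(10.0, 1.0); // {1.0, 10.0}
     __m128d __b = _mm_set_pd(20.0, 2.0); // {2.0, 20.0}
     __m128d __s = _mm_add_sd(__a, __b);  // {3.0, 10.0}
     __m128d __p = _mm_add_pd(__a, __b);  // {3.0, 30.0}
*/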
262 
263 extern __inline __m128d
264  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
265  _mm_sub_pd(__m128d __A, __m128d __B) {
266  return (__m128d)((__v2df)__A - (__v2df)__B);
267 }
268 
269 extern __inline __m128d
270  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
271  _mm_sub_sd(__m128d __A, __m128d __B) {
272  __A[0] = __A[0] - __B[0];
273  return (__A);
274 }
275 
276 extern __inline __m128d
277  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
278  _mm_mul_pd(__m128d __A, __m128d __B) {
279  return (__m128d)((__v2df)__A * (__v2df)__B);
280 }
281 
282 extern __inline __m128d
283  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
284  _mm_mul_sd(__m128d __A, __m128d __B) {
285  __A[0] = __A[0] * __B[0];
286  return (__A);
287 }
288 
289 extern __inline __m128d
290  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
291  _mm_div_pd(__m128d __A, __m128d __B) {
292  return (__m128d)((__v2df)__A / (__v2df)__B);
293 }
294 
295 extern __inline __m128d
296  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
297  _mm_div_sd(__m128d __A, __m128d __B) {
298  __A[0] = __A[0] / __B[0];
299  return (__A);
300 }
301 
302 extern __inline __m128d
303  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
304  _mm_sqrt_pd(__m128d __A) {
305  return (vec_sqrt(__A));
306 }
307 
308 /* Return pair {sqrt (B[0]), A[1]}. */
309 extern __inline __m128d
310  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
311  _mm_sqrt_sd(__m128d __A, __m128d __B) {
312  __v2df __c;
313  __c = vec_sqrt((__v2df)_mm_set1_pd(__B[0]));
314  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
315 }
316 
317 extern __inline __m128d
318  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
319  _mm_min_pd(__m128d __A, __m128d __B) {
320  return (vec_min(__A, __B));
321 }
322 
323 extern __inline __m128d
324  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
325  _mm_min_sd(__m128d __A, __m128d __B) {
326  __v2df __a, __b, __c;
327  __a = vec_splats(__A[0]);
328  __b = vec_splats(__B[0]);
329  __c = vec_min(__a, __b);
330  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
331 }
332 
333 extern __inline __m128d
334  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
335  _mm_max_pd(__m128d __A, __m128d __B) {
336  return (vec_max(__A, __B));
337 }
338 
339 extern __inline __m128d
340  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
341  _mm_max_sd(__m128d __A, __m128d __B) {
342  __v2df __a, __b, __c;
343  __a = vec_splats(__A[0]);
344  __b = vec_splats(__B[0]);
345  __c = vec_max(__a, __b);
346  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
347 }
348 
349 extern __inline __m128d
350  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
351  _mm_cmpeq_pd(__m128d __A, __m128d __B) {
352  return ((__m128d)vec_cmpeq((__v2df)__A, (__v2df)__B));
353 }
354 
355 extern __inline __m128d
356  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
357  _mm_cmplt_pd(__m128d __A, __m128d __B) {
358  return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
359 }
360 
361 extern __inline __m128d
362  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
363  _mm_cmple_pd(__m128d __A, __m128d __B) {
364  return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
365 }
366 
367 extern __inline __m128d
368  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
369  _mm_cmpgt_pd(__m128d __A, __m128d __B) {
370  return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
371 }
372 
373 extern __inline __m128d
374  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
375  _mm_cmpge_pd(__m128d __A, __m128d __B) {
376  return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
377 }
378 
379 extern __inline __m128d
380  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
381  _mm_cmpneq_pd(__m128d __A, __m128d __B) {
382  __v2df __temp = (__v2df)vec_cmpeq((__v2df)__A, (__v2df)__B);
383  return ((__m128d)vec_nor(__temp, __temp));
384 }
385 
386 extern __inline __m128d
387  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
388  _mm_cmpnlt_pd(__m128d __A, __m128d __B) {
389  return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
390 }
391 
392 extern __inline __m128d
393  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
394  _mm_cmpnle_pd(__m128d __A, __m128d __B) {
395  return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
396 }
397 
398 extern __inline __m128d
399  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
400  _mm_cmpngt_pd(__m128d __A, __m128d __B) {
401  return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
402 }
403 
404 extern __inline __m128d
405  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
406  _mm_cmpnge_pd(__m128d __A, __m128d __B) {
407  return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
408 }
409 
410 extern __inline __m128d
411  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
412  _mm_cmpord_pd(__m128d __A, __m128d __B) {
413  __v2du __c, __d;
414  /* Comparing a value against itself returns false (0's) if it is a NaN. */
415  __c = (__v2du)vec_cmpeq(__A, __A);
416  __d = (__v2du)vec_cmpeq(__B, __B);
417  /* A != NAN and B != NAN. */
418  return ((__m128d)vec_and(__c, __d));
419 }
420 
421 extern __inline __m128d
422  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
423  _mm_cmpunord_pd(__m128d __A, __m128d __B) {
424 #if _ARCH_PWR8
425  __v2du __c, __d;
426  /* Comparing a value against itself returns false (0's) if it is a NaN. */
427  __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
428  __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
429  /* A == NAN OR B == NAN converts to:
430  NOT(A != NAN) OR NOT(B != NAN). */
431  __c = vec_nor(__c, __c);
432  return ((__m128d)vec_orc(__c, __d));
433 #else
434  __v2du __c, __d;
435  /* Comparing a value against itself returns false (0's) if it is a NaN. */
436  __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
437  __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
438  /* Invert so that true ('1's) marks a NaN. */
439  __c = vec_nor(__c, __c);
440  __d = vec_nor(__d, __d);
441  return ((__m128d)vec_or(__c, __d));
442 #endif
443 }
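
/* The packed compares return element-wise masks of all '1's (true) or
   all '0's (false), so they are typically consumed by bitwise selects.
   A sketch, assuming _mm_and_pd as defined later in this header:

     __m128d __mask = _mm_cmpord_pd(__x, __x);  // all-ones where __x is not NaN
     __m128d __clean = _mm_and_pd(__mask, __x); // NaN lanes become +0.0
*/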
444 
445 extern __inline __m128d
446  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
447  _mm_cmpeq_sd(__m128d __A, __m128d __B) {
448  __v2df __a, __b, __c;
449  /* PowerISA VSX does not allow partial (for just the lower double)
450  results. So to ensure we don't generate spurious exceptions
451  (from the upper double values) we splat the lower double
452  before we do the operation. */
453  __a = vec_splats(__A[0]);
454  __b = vec_splats(__B[0]);
455  __c = (__v2df)vec_cmpeq(__a, __b);
456  /* Then we merge the lower double result with the original upper
457  double from __A. */
458  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
459 }
460 
461 extern __inline __m128d
462  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
463  _mm_cmplt_sd(__m128d __A, __m128d __B) {
464  __v2df __a, __b, __c;
465  __a = vec_splats(__A[0]);
466  __b = vec_splats(__B[0]);
467  __c = (__v2df)vec_cmplt(__a, __b);
468  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
469 }
470 
471 extern __inline __m128d
472  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
473  _mm_cmple_sd(__m128d __A, __m128d __B) {
474  __v2df __a, __b, __c;
475  __a = vec_splats(__A[0]);
476  __b = vec_splats(__B[0]);
477  __c = (__v2df)vec_cmple(__a, __b);
478  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
479 }
480 
481 extern __inline __m128d
482  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
483  _mm_cmpgt_sd(__m128d __A, __m128d __B) {
484  __v2df __a, __b, __c;
485  __a = vec_splats(__A[0]);
486  __b = vec_splats(__B[0]);
487  __c = (__v2df)vec_cmpgt(__a, __b);
488  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
489 }
490 
491 extern __inline __m128d
492  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
493  _mm_cmpge_sd(__m128d __A, __m128d __B) {
494  __v2df __a, __b, __c;
495  __a = vec_splats(__A[0]);
496  __b = vec_splats(__B[0]);
497  __c = (__v2df)vec_cmpge(__a, __b);
498  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
499 }
500 
501 extern __inline __m128d
502  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
503  _mm_cmpneq_sd(__m128d __A, __m128d __B) {
504  __v2df __a, __b, __c;
505  __a = vec_splats(__A[0]);
506  __b = vec_splats(__B[0]);
507  __c = (__v2df)vec_cmpeq(__a, __b);
508  __c = vec_nor(__c, __c);
509  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
510 }
511 
512 extern __inline __m128d
513  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
514  _mm_cmpnlt_sd(__m128d __A, __m128d __B) {
515  __v2df __a, __b, __c;
516  __a = vec_splats(__A[0]);
517  __b = vec_splats(__B[0]);
518  /* Not less than is just greater than or equal. */
519  __c = (__v2df)vec_cmpge(__a, __b);
520  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
521 }
522 
523 extern __inline __m128d
524  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
525  _mm_cmpnle_sd(__m128d __A, __m128d __B) {
526  __v2df __a, __b, __c;
527  __a = vec_splats(__A[0]);
528  __b = vec_splats(__B[0]);
529  /* Not less than or equal is just greater than. */
530  __c = (__v2df)vec_cmpgt(__a, __b);
531  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
532 }
533 
534 extern __inline __m128d
535  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
536  _mm_cmpngt_sd(__m128d __A, __m128d __B) {
537  __v2df __a, __b, __c;
538  __a = vec_splats(__A[0]);
539  __b = vec_splats(__B[0]);
540  /* Not greater than is just less than or equal. */
541  __c = (__v2df)vec_cmple(__a, __b);
542  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
543 }
544 
545 extern __inline __m128d
546  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
547  _mm_cmpnge_sd(__m128d __A, __m128d __B) {
548  __v2df __a, __b, __c;
549  __a = vec_splats(__A[0]);
550  __b = vec_splats(__B[0]);
551  /* Not greater than or equal is just less than. */
552  __c = (__v2df)vec_cmplt(__a, __b);
553  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
554 }
555 
556 extern __inline __m128d
557  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
558  _mm_cmpord_sd(__m128d __A, __m128d __B) {
559  __v2df __r;
560  __r = (__v2df)_mm_cmpord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
561  return (__m128d)_mm_setr_pd(__r[0], ((__v2df)__A)[1]);
562 }
563 
564 extern __inline __m128d
565  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
566  _mm_cmpunord_sd(__m128d __A, __m128d __B) {
567  __v2df __r;
568  __r = _mm_cmpunord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
569  return (__m128d)_mm_setr_pd(__r[0], __A[1]);
570 }
571 
572 /* FIXME
573  The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
574  exactly the same because GCC for PowerPC only generates unordered
575  compares (scalar and vector).
576  Technically _mm_comieq_sd et al. should be using the ordered
577  compare and signal for QNaNs. The _mm_ucomieq_sd et al. should
578  be OK. */
579 extern __inline int
580  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
581  _mm_comieq_sd(__m128d __A, __m128d __B) {
582  return (__A[0] == __B[0]);
583 }
584 
585 extern __inline int
586  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
587  _mm_comilt_sd(__m128d __A, __m128d __B) {
588  return (__A[0] < __B[0]);
589 }
590 
591 extern __inline int
592  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
593  _mm_comile_sd(__m128d __A, __m128d __B) {
594  return (__A[0] <= __B[0]);
595 }
596 
597 extern __inline int
598  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
599  _mm_comigt_sd(__m128d __A, __m128d __B) {
600  return (__A[0] > __B[0]);
601 }
602 
603 extern __inline int
604  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
605  _mm_comige_sd(__m128d __A, __m128d __B) {
606  return (__A[0] >= __B[0]);
607 }
608 
609 extern __inline int
610  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
611  _mm_comineq_sd(__m128d __A, __m128d __B) {
612  return (__A[0] != __B[0]);
613 }
614 
615 extern __inline int
616  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
617  _mm_ucomieq_sd(__m128d __A, __m128d __B) {
618  return (__A[0] == __B[0]);
619 }
620 
621 extern __inline int
622  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
623  _mm_ucomilt_sd(__m128d __A, __m128d __B) {
624  return (__A[0] < __B[0]);
625 }
626 
627 extern __inline int
628  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
629  _mm_ucomile_sd(__m128d __A, __m128d __B) {
630  return (__A[0] <= __B[0]);
631 }
632 
633 extern __inline int
634  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
635  _mm_ucomigt_sd(__m128d __A, __m128d __B) {
636  return (__A[0] > __B[0]);
637 }
638 
639 extern __inline int
640  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
641  _mm_ucomige_sd(__m128d __A, __m128d __B) {
642  return (__A[0] >= __B[0]);
643 }
644 
645 extern __inline int
646  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
647  _mm_ucomineq_sd(__m128d __A, __m128d __B) {
648  return (__A[0] != __B[0]);
649 }
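
/* NaN-behavior sketch: with only unordered compares generated, both the
   comi and ucomi forms here return 0 for any comparison involving a NaN,
   except the != forms, which return 1:

     __m128d __n = _mm_set_sd(__builtin_nan(""));
     int __eq = _mm_comieq_sd(__n, __n);   // 0: NaN compares unequal to itself
     int __ne = _mm_ucomineq_sd(__n, __n); // 1
*/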
650 
651 /* Create a vector of Qi, where i is the element number. */
652 extern __inline __m128i
653  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
654  _mm_set_epi64x(long long __q1, long long __q0) {
655  return __extension__(__m128i)(__v2di){__q0, __q1};
656 }
657 
658 extern __inline __m128i
659  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
660  _mm_set_epi64(__m64 __q1, __m64 __q0) {
661  return _mm_set_epi64x((long long)__q1, (long long)__q0);
662 }
663 
664 extern __inline __m128i
665  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
666  _mm_set_epi32(int __q3, int __q2, int __q1, int __q0) {
667  return __extension__(__m128i)(__v4si){__q0, __q1, __q2, __q3};
668 }
669 
670 extern __inline __m128i
671  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
672  _mm_set_epi16(short __q7, short __q6, short __q5, short __q4, short __q3,
673  short __q2, short __q1, short __q0) {
674  return __extension__(__m128i)(__v8hi){__q0, __q1, __q2, __q3,
675  __q4, __q5, __q6, __q7};
676 }
677 
678 extern __inline __m128i
679  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
680  _mm_set_epi8(char __q15, char __q14, char __q13, char __q12, char __q11,
681  char __q10, char __q09, char __q08, char __q07, char __q06,
682  char __q05, char __q04, char __q03, char __q02, char __q01,
683  char __q00) {
684  return __extension__(__m128i)(__v16qi){
685  __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
686  __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15};
687 }
688 
689 /* Set all of the elements of the vector to A. */
690 extern __inline __m128i
691  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
692  _mm_set1_epi64x(long long __A) {
693  return _mm_set_epi64x(__A, __A);
694 }
695 
696 extern __inline __m128i
697  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
698  _mm_set1_epi64(__m64 __A) {
699  return _mm_set_epi64(__A, __A);
700 }
701 
702 extern __inline __m128i
703  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
704  _mm_set1_epi32(int __A) {
705  return _mm_set_epi32(__A, __A, __A, __A);
706 }
707 
708 extern __inline __m128i
709  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
710  _mm_set1_epi16(short __A) {
711  return _mm_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A);
712 }
713 
714 extern __inline __m128i
715  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
716  _mm_set1_epi8(char __A) {
717  return _mm_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A,
718  __A, __A, __A, __A, __A);
719 }
720 
721 /* Create a vector of Qi, where i is the element number.
722  The parameter order is reversed from the _mm_set_epi* functions. */
723 extern __inline __m128i
724  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
725  _mm_setr_epi64(__m64 __q0, __m64 __q1) {
726  return _mm_set_epi64(__q1, __q0);
727 }
728 
729 extern __inline __m128i
730  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
731  _mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) {
732  return _mm_set_epi32(__q3, __q2, __q1, __q0);
733 }
734 
735 extern __inline __m128i
736  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
737  _mm_setr_epi16(short __q0, short __q1, short __q2, short __q3, short __q4,
738  short __q5, short __q6, short __q7) {
739  return _mm_set_epi16(__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
740 }
741 
742 extern __inline __m128i
743  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
744  _mm_setr_epi8(char __q00, char __q01, char __q02, char __q03, char __q04,
745  char __q05, char __q06, char __q07, char __q08, char __q09,
746  char __q10, char __q11, char __q12, char __q13, char __q14,
747  char __q15) {
748  return _mm_set_epi8(__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
749  __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
750 }
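
/* Ordering sketch: _mm_set_epi32 takes arguments from the highest element
   down, _mm_setr_epi32 from the lowest up, so these two are equivalent:

     __m128i __v1 = _mm_set_epi32(3, 2, 1, 0);
     __m128i __v2 = _mm_setr_epi32(0, 1, 2, 3);
     // both have element [i] == i
*/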
751 
752 /* Load 128 bits of integer data. The address must be 16-byte aligned. */
753 extern __inline __m128i
754  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
755  _mm_load_si128(__m128i const *__P) {
756  return *__P;
757 }
758 
759 extern __inline __m128i
760  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
761  _mm_loadu_si128(__m128i_u const *__P) {
762  return (__m128i)(vec_vsx_ld(0, (signed int const *)__P));
763 }
764 
765 extern __inline __m128i
766  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
767  _mm_loadl_epi64(__m128i_u const *__P) {
768  return _mm_set_epi64((__m64)0LL, *(__m64 *)__P);
769 }
770 
771 extern __inline void
772  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
773  _mm_store_si128(__m128i *__P, __m128i __B) {
774  vec_st((__v16qu)__B, 0, (__v16qu *)__P);
775 }
776 
777 extern __inline void
778  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
779  _mm_storeu_si128(__m128i_u *__P, __m128i __B) {
780  *__P = __B;
781 }
782 
783 extern __inline void
784  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
785  _mm_storel_epi64(__m128i_u *__P, __m128i __B) {
786  *(long long *)__P = ((__v2di)__B)[0];
787 }
788 
789 extern __inline __m64
790  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
791  _mm_movepi64_pi64(__m128i_u __B) {
792  return (__m64)((__v2di)__B)[0];
793 }
794 
795 extern __inline __m128i
796  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
797  _mm_movpi64_epi64(__m64 __A) {
798  return _mm_set_epi64((__m64)0LL, __A);
799 }
800 
801 extern __inline __m128i
802  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
803  _mm_move_epi64(__m128i __A) {
804  return _mm_set_epi64((__m64)0LL, (__m64)__A[0]);
805 }
806 
807 /* Create an undefined vector. */
808 extern __inline __m128i
809  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
810  _mm_undefined_si128(void) {
811  __m128i __Y = __Y;
812  return __Y;
813 }
814 
815 /* Create a vector of zeros. */
816 extern __inline __m128i
817  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
818  _mm_setzero_si128(void) {
819  return __extension__(__m128i)(__v4si){0, 0, 0, 0};
820 }
821 
822 #ifdef _ARCH_PWR8
823 extern __inline __m128d
824  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
825  _mm_cvtepi32_pd(__m128i __A) {
826  __v2di __val;
827  /* For LE we need Vector Unpack Low Signed Word, which
828  vec_unpackh generates here. */
829  __val = (__v2di)vec_unpackh((__v4si)__A);
830 
831  return (__m128d)vec_ctf(__val, 0);
832 }
833 #endif
834 
835 extern __inline __m128
836  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
837  _mm_cvtepi32_ps(__m128i __A) {
838  return ((__m128)vec_ctf((__v4si)__A, 0));
839 }
840 
841 extern __inline __m128i
842  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
843  _mm_cvtpd_epi32(__m128d __A) {
844  __v2df __rounded = vec_rint(__A);
845  __v4si __result, __temp;
846  const __v4si __vzero = {0, 0, 0, 0};
847 
848  /* VSX Vector truncate Double-Precision to integer and Convert to
849  Signed Integer Word format with Saturate. */
850  __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__rounded) :);
851 
852 #ifdef _ARCH_PWR8
853 #ifdef __LITTLE_ENDIAN__
854  __temp = vec_mergeo(__temp, __temp);
855 #else
856  __temp = vec_mergee(__temp, __temp);
857 #endif
858  __result = (__v4si)vec_vpkudum((__vector long long)__temp,
859  (__vector long long)__vzero);
860 #else
861  {
862  const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
863  0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
864  __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
865  }
866 #endif
867  return (__m128i)__result;
868 }
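
/* Rounding sketch: vec_rint rounds using the current rounding mode
   (round-to-nearest-even by default), matching the SSE2 semantics of
   _mm_cvtpd_epi32:

     __m128d __v = _mm_set_pd(2.5, 1.5);
     __m128i __i = _mm_cvtpd_epi32(__v); // low two ints: {2, 2}, ties to even
*/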
869 
870 extern __inline __m64
871  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
872  _mm_cvtpd_pi32(__m128d __A) {
873  __m128i __result = _mm_cvtpd_epi32(__A);
874 
875  return (__m64)__result[0];
876 }
877 
878 extern __inline __m128
879  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
880  _mm_cvtpd_ps(__m128d __A) {
881  __v4sf __result;
882  __v4si __temp;
883  const __v4si __vzero = {0, 0, 0, 0};
884 
885  __asm__("xvcvdpsp %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);
886 
887 #ifdef _ARCH_PWR8
888 #ifdef __LITTLE_ENDIAN__
889  __temp = vec_mergeo(__temp, __temp);
890 #else
891  __temp = vec_mergee(__temp, __temp);
892 #endif
893  __result = (__v4sf)vec_vpkudum((__vector long long)__temp,
894  (__vector long long)__vzero);
895 #else
896  {
897  const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
898  0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
899  __result = (__v4sf)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
900  }
901 #endif
902  return ((__m128)__result);
903 }
904 
905 extern __inline __m128i
906  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
907  _mm_cvttpd_epi32(__m128d __A) {
908  __v4si __result;
909  __v4si __temp;
910  const __v4si __vzero = {0, 0, 0, 0};
911 
912  /* VSX Vector truncate Double-Precision to integer and Convert to
913  Signed Integer Word format with Saturate. */
914  __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);
915 
916 #ifdef _ARCH_PWR8
917 #ifdef __LITTLE_ENDIAN__
918  __temp = vec_mergeo(__temp, __temp);
919 #else
920  __temp = vec_mergee(__temp, __temp);
921 #endif
922  __result = (__v4si)vec_vpkudum((__vector long long)__temp,
923  (__vector long long)__vzero);
924 #else
925  {
926  const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
927  0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
928  __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
929  }
930 #endif
931 
932  return ((__m128i)__result);
933 }
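
/* Truncation sketch: the 'tt' forms always round toward zero regardless
   of the current rounding mode:

     __m128d __v = _mm_set_pd(-1.9, 1.9);
     __m128i __i = _mm_cvttpd_epi32(__v); // low two ints: {1, -1}
*/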
934 
935 extern __inline __m64
936  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
937  _mm_cvttpd_pi32(__m128d __A) {
938  __m128i __result = _mm_cvttpd_epi32(__A);
939 
940  return (__m64)__result[0];
941 }
942 
943 extern __inline int
944  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
945  _mm_cvtsi128_si32(__m128i __A) {
946  return ((__v4si)__A)[0];
947 }
948 
949 #ifdef _ARCH_PWR8
950 extern __inline __m128d
951  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
952  _mm_cvtpi32_pd(__m64 __A) {
953  __v4si __temp;
954  __v2di __tmp2;
955  __v2df __result;
956 
957  __temp = (__v4si)vec_splats(__A);
958  __tmp2 = (__v2di)vec_unpackl(__temp);
959  __result = vec_ctf((__vector signed long long)__tmp2, 0);
960  return (__m128d)__result;
961 }
962 #endif
963 
964 extern __inline __m128i
965  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
966  _mm_cvtps_epi32(__m128 __A) {
967  __v4sf __rounded;
968  __v4si __result;
969 
970  __rounded = vec_rint((__v4sf)__A);
971  __result = vec_cts(__rounded, 0);
972  return (__m128i)__result;
973 }
974 
975 extern __inline __m128i
976  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
977  _mm_cvttps_epi32(__m128 __A) {
978  __v4si __result;
979 
980  __result = vec_cts((__v4sf)__A, 0);
981  return (__m128i)__result;
982 }
983 
984 extern __inline __m128d
985  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
986  _mm_cvtps_pd(__m128 __A) {
987  /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
988 #ifdef vec_doubleh
989  return (__m128d)vec_doubleh((__v4sf)__A);
990 #else
991  /* Otherwise the compiler is not current and so need to generate the
992  equivalent code. */
993  __v4sf __a = (__v4sf)__A;
994  __v4sf __temp;
995  __v2df __result;
996 #ifdef __LITTLE_ENDIAN__
997  /* The input float values are in elements {[0], [1]} but the convert
998  instruction needs them in elements {[1], [3]}, so we use two
999  shift left double vector word immediates to get the elements
1000  lined up. */
1001  __temp = __builtin_vsx_xxsldwi(__a, __a, 3);
1002  __temp = __builtin_vsx_xxsldwi(__a, __temp, 2);
1003 #else
1004  /* The input float values are in elements {[0], [1]} but the convert
1005  instruction needs them in elements {[0], [2]}, so we merge each
1006  word with itself (vec_vmrghw) to get the elements
1007  lined up. */
1008  __temp = vec_vmrghw(__a, __a);
1009 #endif
1010  __asm__(" xvcvspdp %x0,%x1" : "=wa"(__result) : "wa"(__temp) :);
1011  return (__m128d)__result;
1012 #endif
1013 }
1014 
1015 extern __inline int
1016  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1017  _mm_cvtsd_si32(__m128d __A) {
1018  __v2df __rounded = vec_rint((__v2df)__A);
1019  int __result = ((__v2df)__rounded)[0];
1020 
1021  return __result;
1022 }
1023 /* Intel intrinsic. */
1024 extern __inline long long
1025  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1026  _mm_cvtsd_si64(__m128d __A) {
1027  __v2df __rounded = vec_rint((__v2df)__A);
1028  long long __result = ((__v2df)__rounded)[0];
1029 
1030  return __result;
1031 }
1032 
1033 /* Microsoft intrinsic. */
1034 extern __inline long long
1035  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1036  _mm_cvtsd_si64x(__m128d __A) {
1037  return _mm_cvtsd_si64(__A);
1038 }
1039 
1040 extern __inline int
1041  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1042  _mm_cvttsd_si32(__m128d __A) {
1043  int __result = ((__v2df)__A)[0];
1044 
1045  return __result;
1046 }
1047 
1048 /* Intel intrinsic. */
1049 extern __inline long long
1050  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1051  _mm_cvttsd_si64(__m128d __A) {
1052  long long __result = ((__v2df)__A)[0];
1053 
1054  return __result;
1055 }
1056 
1057 /* Microsoft intrinsic. */
1058 extern __inline long long
1059  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1060  _mm_cvttsd_si64x(__m128d __A) {
1061  return _mm_cvttsd_si64(__A);
1062 }
1063 
1064 extern __inline __m128
1065  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1066  _mm_cvtsd_ss(__m128 __A, __m128d __B) {
1067  __v4sf __result = (__v4sf)__A;
1068 
1069 #ifdef __LITTLE_ENDIAN__
1070  __v4sf __temp_s;
1071  /* Splat double element [0] to both elements for conversion. */
1072  __v2df __temp_b = vec_splat((__v2df)__B, 0);
1073 
1074  /* Pre-rotate __A left 3 (logically right 1) elements. */
1075  __result = __builtin_vsx_xxsldwi(__result, __result, 3);
1076  /* Convert double to single float scalar in a vector. */
1077  __asm__("xscvdpsp %x0,%x1" : "=wa"(__temp_s) : "wa"(__temp_b) :);
1078  /* Shift the resulting scalar into vector element [0]. */
1079  __result = __builtin_vsx_xxsldwi(__result, __temp_s, 1);
1080 #else
1081  __result[0] = ((__v2df)__B)[0];
1082 #endif
1083  return (__m128)__result;
1084 }
1085 
1086 extern __inline __m128d
1087  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1088  _mm_cvtsi32_sd(__m128d __A, int __B) {
1089  __v2df __result = (__v2df)__A;
1090  double __db = __B;
1091  __result[0] = __db;
1092  return (__m128d)__result;
1093 }
1094 
1095 /* Intel intrinsic. */
1096 extern __inline __m128d
1097  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1098  _mm_cvtsi64_sd(__m128d __A, long long __B) {
1099  __v2df __result = (__v2df)__A;
1100  double __db = __B;
1101  __result[0] = __db;
1102  return (__m128d)__result;
1103 }
1104 
1105 /* Microsoft intrinsic. */
1106 extern __inline __m128d
1107  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1108  _mm_cvtsi64x_sd(__m128d __A, long long __B) {
1109  return _mm_cvtsi64_sd(__A, __B);
1110 }
1111 
1112 extern __inline __m128d
1113  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1114  _mm_cvtss_sd(__m128d __A, __m128 __B) {
1115 #ifdef __LITTLE_ENDIAN__
1116  /* Use splat to move element [0] into position for the convert. */
1117  __v4sf __temp = vec_splat((__v4sf)__B, 0);
1118  __v2df __res;
1119  /* Convert single float scalar to double in a vector. */
1120  __asm__("xscvspdp %x0,%x1" : "=wa"(__res) : "wa"(__temp) :);
1121  return (__m128d)vec_mergel(__res, (__v2df)__A);
1122 #else
1123  __v2df __res = (__v2df)__A;
1124  __res[0] = ((__v4sf)__B)[0];
1125  return (__m128d)__res;
1126 #endif
1127 }
1128 
1129 extern __inline __m128d
1130  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1131  _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) {
1132  __vector double __result;
1133  const int __litmsk = __mask & 0x3;
1134 
1135  if (__litmsk == 0)
1136  __result = vec_mergeh(__A, __B);
1137 #if __GNUC__ < 6
1138  else if (__litmsk == 1)
1139  __result = vec_xxpermdi(__B, __A, 2);
1140  else if (__litmsk == 2)
1141  __result = vec_xxpermdi(__B, __A, 1);
1142 #else
1143  else if (__litmsk == 1)
1144  __result = vec_xxpermdi(__A, __B, 2);
1145  else if (__litmsk == 2)
1146  __result = vec_xxpermdi(__A, __B, 1);
1147 #endif
1148  else
1149  __result = vec_mergel(__A, __B);
1150 
1151  return __result;
1152 }
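
/* Mask-decoding sketch for the code above: bit 0 of the mask selects the
   low result element from __A and bit 1 selects the high result element
   from __B, so the four mask values map onto merge/permute operations:

     __m128d __a = _mm_set_pd(2.0, 1.0);        // {1.0, 2.0}
     __m128d __b = _mm_set_pd(4.0, 3.0);        // {3.0, 4.0}
     __m128d __r = _mm_shuffle_pd(__a, __b, 1); // {__a[1], __b[0]} = {2.0, 3.0}
*/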
1153 
1154 extern __inline __m128d
1155  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1156  _mm_unpackhi_pd(__m128d __A, __m128d __B) {
1157  return (__m128d)vec_mergel((__v2df)__A, (__v2df)__B);
1158 }
1159 
1160 extern __inline __m128d
1161  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1162  _mm_unpacklo_pd(__m128d __A, __m128d __B) {
1163  return (__m128d)vec_mergeh((__v2df)__A, (__v2df)__B);
1164 }
1165 
1166 extern __inline __m128d
1167  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1168  _mm_loadh_pd(__m128d __A, double const *__B) {
1169  __v2df __result = (__v2df)__A;
1170  __result[1] = *__B;
1171  return (__m128d)__result;
1172 }
1173 
1174 extern __inline __m128d
1175  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1176  _mm_loadl_pd(__m128d __A, double const *__B) {
1177  __v2df __result = (__v2df)__A;
1178  __result[0] = *__B;
1179  return (__m128d)__result;
1180 }
1181 
1182 #ifdef _ARCH_PWR8
1183 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1184 
1185 /* Creates a 2-bit mask from the most significant bits of the DPFP values. */
1186 extern __inline int
1187  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1188  _mm_movemask_pd(__m128d __A) {
1189 #ifdef _ARCH_PWR10
1190  return vec_extractm((__v2du)__A);
1191 #else
1192  __vector unsigned long long __result;
1193  static const __vector unsigned int __perm_mask = {
1194 #ifdef __LITTLE_ENDIAN__
1195  0x80800040, 0x80808080, 0x80808080, 0x80808080
1196 #else
1197  0x80808080, 0x80808080, 0x80808080, 0x80804000
1198 #endif
1199  };
1200 
1201  __result = ((__vector unsigned long long)vec_vbpermq(
1202  (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
1203 
1204 #ifdef __LITTLE_ENDIAN__
1205  return __result[1];
1206 #else
1207  return __result[0];
1208 #endif
1209 #endif /* !_ARCH_PWR10 */
1210 }
1211 #endif /* _ARCH_PWR8 */
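
/* Sign-bit sketch: _mm_movemask_pd packs the sign bits of the two doubles
   into bits 0 (low element) and 1 (high element) of the result:

     __m128d __v = _mm_set_pd(-2.0, 1.0); // {1.0, -2.0}
     int __m = _mm_movemask_pd(__v);      // 0x2: only the high element is negative
*/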
1212 
1213 extern __inline __m128i
1214  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1215  _mm_packs_epi16(__m128i __A, __m128i __B) {
1216  return (__m128i)vec_packs((__v8hi)__A, (__v8hi)__B);
1217 }
1218 
1219 extern __inline __m128i
1220  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1221  _mm_packs_epi32(__m128i __A, __m128i __B) {
1222  return (__m128i)vec_packs((__v4si)__A, (__v4si)__B);
1223 }
1224 
1225 extern __inline __m128i
1226  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1227  _mm_packus_epi16(__m128i __A, __m128i __B) {
1228  return (__m128i)vec_packsu((__v8hi)__A, (__v8hi)__B);
1229 }
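
/* Saturation sketch: the pack operations clamp each source element to the
   narrower destination range instead of truncating:

     __m128i __h = _mm_set1_epi16(300);
     __m128i __u = _mm_packus_epi16(__h, __h); // every unsigned byte == 255
     __m128i __s = _mm_packs_epi16(__h, __h);  // every signed byte == 127
*/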
1230 
1231 extern __inline __m128i
1232  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1233  _mm_unpackhi_epi8(__m128i __A, __m128i __B) {
1234  return (__m128i)vec_mergel((__v16qu)__A, (__v16qu)__B);
1235 }
1236 
1237 extern __inline __m128i
1238  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1239  _mm_unpackhi_epi16(__m128i __A, __m128i __B) {
1240  return (__m128i)vec_mergel((__v8hu)__A, (__v8hu)__B);
1241 }
1242 
1243 extern __inline __m128i
1244  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1245  _mm_unpackhi_epi32(__m128i __A, __m128i __B) {
1246  return (__m128i)vec_mergel((__v4su)__A, (__v4su)__B);
1247 }
1248 
1249 extern __inline __m128i
1250  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1251  _mm_unpackhi_epi64(__m128i __A, __m128i __B) {
1252  return (__m128i)vec_mergel((__vector long long)__A, (__vector long long)__B);
1253 }
1254 
1255 extern __inline __m128i
1256  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1257  _mm_unpacklo_epi8(__m128i __A, __m128i __B) {
1258  return (__m128i)vec_mergeh((__v16qu)__A, (__v16qu)__B);
1259 }
1260 
1261 extern __inline __m128i
1262  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1263  _mm_unpacklo_epi16(__m128i __A, __m128i __B) {
1264  return (__m128i)vec_mergeh((__v8hi)__A, (__v8hi)__B);
1265 }
1266 
1267 extern __inline __m128i
1268  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1269  _mm_unpacklo_epi32(__m128i __A, __m128i __B) {
1270  return (__m128i)vec_mergeh((__v4si)__A, (__v4si)__B);
1271 }
1272 
1273 extern __inline __m128i
1274  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1275  _mm_unpacklo_epi64(__m128i __A, __m128i __B) {
1276  return (__m128i)vec_mergeh((__vector long long)__A, (__vector long long)__B);
1277 }
1278 
1279 extern __inline __m128i
1280  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1281  _mm_add_epi8(__m128i __A, __m128i __B) {
1282  return (__m128i)((__v16qu)__A + (__v16qu)__B);
1283 }
1284 
1285 extern __inline __m128i
1286  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1287  _mm_add_epi16(__m128i __A, __m128i __B) {
1288  return (__m128i)((__v8hu)__A + (__v8hu)__B);
1289 }
1290 
1291 extern __inline __m128i
1292  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1293  _mm_add_epi32(__m128i __A, __m128i __B) {
1294  return (__m128i)((__v4su)__A + (__v4su)__B);
1295 }
1296 
1297 extern __inline __m128i
1298  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1299  _mm_add_epi64(__m128i __A, __m128i __B) {
1300  return (__m128i)((__v2du)__A + (__v2du)__B);
1301 }
1302 
1303 extern __inline __m128i
1304  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1305  _mm_adds_epi8(__m128i __A, __m128i __B) {
1306  return (__m128i)vec_adds((__v16qi)__A, (__v16qi)__B);
1307 }
1308 
1309 extern __inline __m128i
1310  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1311  _mm_adds_epi16(__m128i __A, __m128i __B) {
1312  return (__m128i)vec_adds((__v8hi)__A, (__v8hi)__B);
1313 }
1314 
1315 extern __inline __m128i
1316  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1317  _mm_adds_epu8(__m128i __A, __m128i __B) {
1318  return (__m128i)vec_adds((__v16qu)__A, (__v16qu)__B);
1319 }
1320 
1321 extern __inline __m128i
1322  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1323  _mm_adds_epu16(__m128i __A, __m128i __B) {
1324  return (__m128i)vec_adds((__v8hu)__A, (__v8hu)__B);
1325 }
1326 
1327 extern __inline __m128i
1328  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1329  _mm_sub_epi8(__m128i __A, __m128i __B) {
1330  return (__m128i)((__v16qu)__A - (__v16qu)__B);
1331 }
1332 
1333 extern __inline __m128i
1334  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1335  _mm_sub_epi16(__m128i __A, __m128i __B) {
1336  return (__m128i)((__v8hu)__A - (__v8hu)__B);
1337 }
1338 
1339 extern __inline __m128i
1340  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1341  _mm_sub_epi32(__m128i __A, __m128i __B) {
1342  return (__m128i)((__v4su)__A - (__v4su)__B);
1343 }
1344 
1345 extern __inline __m128i
1346  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1347  _mm_sub_epi64(__m128i __A, __m128i __B) {
1348  return (__m128i)((__v2du)__A - (__v2du)__B);
1349 }
1350 
1351 extern __inline __m128i
1352  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1353  _mm_subs_epi8(__m128i __A, __m128i __B) {
1354  return (__m128i)vec_subs((__v16qi)__A, (__v16qi)__B);
1355 }
1356 
1357 extern __inline __m128i
1358  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1359  _mm_subs_epi16(__m128i __A, __m128i __B) {
1360  return (__m128i)vec_subs((__v8hi)__A, (__v8hi)__B);
1361 }
1362 
1363 extern __inline __m128i
1364  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1365  _mm_subs_epu8(__m128i __A, __m128i __B) {
1366  return (__m128i)vec_subs((__v16qu)__A, (__v16qu)__B);
1367 }
1368 
1369 extern __inline __m128i
1370  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1371  _mm_subs_epu16(__m128i __A, __m128i __B) {
1372  return (__m128i)vec_subs((__v8hu)__A, (__v8hu)__B);
1373 }
1374 
1375 extern __inline __m128i
1376  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1377  _mm_madd_epi16(__m128i __A, __m128i __B) {
1378  __vector signed int __zero = {0, 0, 0, 0};
1379 
1380  return (__m128i)vec_vmsumshm((__v8hi)__A, (__v8hi)__B, __zero);
1381 }
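
/* Multiply-add sketch: each pair of adjacent 16-bit products is summed
   into one 32-bit result element:

     __m128i __a = _mm_set1_epi16(2);
     __m128i __b = _mm_set1_epi16(3);
     __m128i __r = _mm_madd_epi16(__a, __b); // each int == 2*3 + 2*3 == 12
*/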
1382 
1383 extern __inline __m128i
1384  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1385  _mm_mulhi_epi16(__m128i __A, __m128i __B) {
1386  __vector signed int __w0, __w1;
1387 
1388  __vector unsigned char __xform1 = {
1389 #ifdef __LITTLE_ENDIAN__
1390  0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1391  0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1392 #else
1393  0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
1394  0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1395 #endif
1396  };
1397 
1398  __w0 = vec_vmulesh((__v8hi)__A, (__v8hi)__B);
1399  __w1 = vec_vmulosh((__v8hi)__A, (__v8hi)__B);
1400  return (__m128i)vec_perm(__w0, __w1, __xform1);
1401 }
1402 
1403 extern __inline __m128i
1404  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1405  _mm_mullo_epi16(__m128i __A, __m128i __B) {
1406  return (__m128i)((__v8hi)__A * (__v8hi)__B);
1407 }
1408 
1409 extern __inline __m64
1410  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1411  _mm_mul_su32(__m64 __A, __m64 __B) {
1412  unsigned int __a = __A;
1413  unsigned int __b = __B;
1414 
1415  return ((__m64)__a * (__m64)__b);
1416 }
1417 
1418 #ifdef _ARCH_PWR8
1419 extern __inline __m128i
1420  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1421  _mm_mul_epu32(__m128i __A, __m128i __B) {
1422 #if __GNUC__ < 8
1423  __v2du __result;
1424 
1425 #ifdef __LITTLE_ENDIAN__
1426  /* VMX Vector Multiply Odd Unsigned Word. */
1427  __asm__("vmulouw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
1428 #else
1429  /* VMX Vector Multiply Even Unsigned Word. */
1430  __asm__("vmuleuw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
1431 #endif
1432  return (__m128i)__result;
1433 #else
1434  return (__m128i)vec_mule((__v4su)__A, (__v4su)__B);
1435 #endif
1436 }
1437 #endif
1438 
1439 extern __inline __m128i
1440  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1441  _mm_slli_epi16(__m128i __A, int __B) {
1442  __v8hu __lshift;
1443  __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
1444 
1445  if (__B >= 0 && __B < 16) {
1446  if (__builtin_constant_p(__B))
1447  __lshift = (__v8hu)vec_splat_s16(__B);
1448  else
1449  __lshift = vec_splats((unsigned short)__B);
1450 
1451  __result = vec_sl((__v8hi)__A, __lshift);
1452  }
1453 
1454  return (__m128i)__result;
1455 }
1456 
1457 extern __inline __m128i
1458  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1459  _mm_slli_epi32(__m128i __A, int __B) {
1460  __v4su __lshift;
1461  __v4si __result = {0, 0, 0, 0};
1462 
1463  if (__B >= 0 && __B < 32) {
1464  if (__builtin_constant_p(__B) && __B < 16)
1465  __lshift = (__v4su)vec_splat_s32(__B);
1466  else
1467  __lshift = vec_splats((unsigned int)__B);
1468 
1469  __result = vec_sl((__v4si)__A, __lshift);
1470  }
1471 
1472  return (__m128i)__result;
1473 }
1474 
1475 #ifdef _ARCH_PWR8
1476 extern __inline __m128i
1477  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1478  _mm_slli_epi64(__m128i __A, int __B) {
1479  __v2du __lshift;
1480  __v2di __result = {0, 0};
1481 
1482  if (__B >= 0 && __B < 64) {
1483  if (__builtin_constant_p(__B) && __B < 16)
1484  __lshift = (__v2du)vec_splat_s32(__B);
1485  else
1486  __lshift = (__v2du)vec_splats((unsigned int)__B);
1487 
1488  __result = vec_sl((__v2di)__A, __lshift);
1489  }
1490 
1491  return (__m128i)__result;
1492 }
1493 #endif
1494 
1495 extern __inline __m128i
1496  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1497  _mm_srai_epi16(__m128i __A, int __B) {
1498  __v8hu __rshift = {15, 15, 15, 15, 15, 15, 15, 15};
1499  __v8hi __result;
1500 
1501  if (__B < 16) {
1502  if (__builtin_constant_p(__B))
1503  __rshift = (__v8hu)vec_splat_s16(__B);
1504  else
1505  __rshift = vec_splats((unsigned short)__B);
1506  }
1507  __result = vec_sra((__v8hi)__A, __rshift);
1508 
1509  return (__m128i)__result;
1510 }
1511 
1512 extern __inline __m128i
1513  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1514  _mm_srai_epi32(__m128i __A, int __B) {
1515  __v4su __rshift = {31, 31, 31, 31};
1516  __v4si __result;
1517 
1518  if (__B < 32) {
1519  if (__builtin_constant_p(__B)) {
1520  if (__B < 16)
1521  __rshift = (__v4su)vec_splat_s32(__B);
1522  else
1523  __rshift = (__v4su)vec_splats((unsigned int)__B);
1524  } else
1525  __rshift = vec_splats((unsigned int)__B);
1526  }
1527  __result = vec_sra((__v4si)__A, __rshift);
1528 
1529  return (__m128i)__result;
1530 }
1531 
1532 extern __inline __m128i
1533  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1534  _mm_bslli_si128(__m128i __A, const int __N) {
1535  __v16qu __result;
1536  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1537 
1538  if (__N < 16)
1539  __result = vec_sld((__v16qu)__A, __zeros, __N);
1540  else
1541  __result = __zeros;
1542 
1543  return (__m128i)__result;
1544 }
1545 
1546 extern __inline __m128i
1547  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1548  _mm_bsrli_si128(__m128i __A, const int __N) {
1549  __v16qu __result;
1550  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1551 
1552  if (__N < 16)
1553 #ifdef __LITTLE_ENDIAN__
1554  if (__builtin_constant_p(__N))
1555  /* Would like to use Vector Shift Left Double by Octet
1556  Immediate here to use the immediate form and avoid
1557  load of __N * 8 value into a separate VR. */
1558  __result = vec_sld(__zeros, (__v16qu)__A, (16 - __N));
1559  else
1560 #endif
1561  {
1562  __v16qu __shift = vec_splats((unsigned char)(__N * 8));
1563 #ifdef __LITTLE_ENDIAN__
1564  __result = vec_sro((__v16qu)__A, __shift);
1565 #else
1566  __result = vec_slo((__v16qu)__A, __shift);
1567 #endif
1568  }
1569  else
1570  __result = __zeros;
1571 
1572  return (__m128i)__result;
1573 }
1574 
1575 extern __inline __m128i
1576  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1577  _mm_srli_si128(__m128i __A, const int __N) {
1578  return _mm_bsrli_si128(__A, __N);
1579 }
1580 
1581 extern __inline __m128i
1582  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1583  _mm_slli_si128(__m128i __A, const int _imm5) {
1584  __v16qu __result;
1585  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1586 
1587  if (_imm5 < 16)
1588 #ifdef __LITTLE_ENDIAN__
1589  __result = vec_sld((__v16qu)__A, __zeros, _imm5);
1590 #else
1591  __result = vec_sld(__zeros, (__v16qu)__A, (16 - _imm5));
1592 #endif
1593  else
1594  __result = __zeros;
1595 
1596  return (__m128i)__result;
1597 }
1598 
1599 extern __inline __m128i
1600  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1602  _mm_srli_epi16(__m128i __A, int __B) {
1603  __v8hu __rshift;
1604  __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
1605 
1606  if (__B < 16) {
1607  if (__builtin_constant_p(__B))
1608  __rshift = (__v8hu)vec_splat_s16(__B);
1609  else
1610  __rshift = vec_splats((unsigned short)__B);
1611 
1612  __result = vec_sr((__v8hi)__A, __rshift);
1613  }
1614 
1615  return (__m128i)__result;
1616 }
1617 
1618 extern __inline __m128i
1619  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1620  _mm_srli_epi32(__m128i __A, int __B) {
1621  __v4su __rshift;
1622  __v4si __result = {0, 0, 0, 0};
1623 
1624  if (__B < 32) {
1625  if (__builtin_constant_p(__B)) {
1626  if (__B < 16)
1627  __rshift = (__v4su)vec_splat_s32(__B);
1628  else
1629  __rshift = (__v4su)vec_splats((unsigned int)__B);
1630  } else
1631  __rshift = vec_splats((unsigned int)__B);
1632 
1633  __result = vec_sr((__v4si)__A, __rshift);
1634  }
1635 
1636  return (__m128i)__result;
1637 }
1638 
1639 #ifdef _ARCH_PWR8
1640 extern __inline __m128i
1641  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1642  _mm_srli_epi64(__m128i __A, int __B) {
1643  __v2du __rshift;
1644  __v2di __result = {0, 0};
1645 
1646  if (__B < 64) {
1647  if (__builtin_constant_p(__B)) {
1648  if (__B < 16)
1649  __rshift = (__v2du)vec_splat_s32(__B);
1650  else
1651  __rshift = (__v2du)vec_splats((unsigned long long)__B);
1652  } else
1653  __rshift = (__v2du)vec_splats((unsigned int)__B);
1654 
1655  __result = vec_sr((__v2di)__A, __rshift);
1656  }
1657 
1658  return (__m128i)__result;
1659 }
1660 #endif
1661 
1662 extern __inline __m128i
1663  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1664  _mm_sll_epi16(__m128i __A, __m128i __B) {
1665  __v8hu __lshift;
1666  __vector __bool short __shmask;
1667  const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
1668  __v8hu __result;
1669 
1670 #ifdef __LITTLE_ENDIAN__
1671  __lshift = vec_splat((__v8hu)__B, 0);
1672 #else
1673  __lshift = vec_splat((__v8hu)__B, 3);
1674 #endif
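  /* Shift counts above 15 must yield zero.  __shmask is all-ones per
     element for valid counts and all-zeros otherwise, so the vec_sel
     below picks the shifted value for valid counts and reuses
     __shmask itself as the zero source for invalid ones.  The other
     variable-shift helpers below use the same trick. */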
1675  __shmask = vec_cmple(__lshift, __shmax);
1676  __result = vec_sl((__v8hu)__A, __lshift);
1677  __result = vec_sel((__v8hu)__shmask, __result, __shmask);
1678 
1679  return (__m128i)__result;
1680 }
1681 
1682 extern __inline __m128i
1683  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1684  _mm_sll_epi32(__m128i __A, __m128i __B) {
1685  __v4su __lshift;
1686  __vector __bool int __shmask;
1687  const __v4su __shmax = {32, 32, 32, 32};
1688  __v4su __result;
1689 #ifdef __LITTLE_ENDIAN__
1690  __lshift = vec_splat((__v4su)__B, 0);
1691 #else
1692  __lshift = vec_splat((__v4su)__B, 1);
1693 #endif
1694  __shmask = vec_cmplt(__lshift, __shmax);
1695  __result = vec_sl((__v4su)__A, __lshift);
1696  __result = vec_sel((__v4su)__shmask, __result, __shmask);
1697 
1698  return (__m128i)__result;
1699 }
1700 
1701 #ifdef _ARCH_PWR8
1702 extern __inline __m128i
1703  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1704  _mm_sll_epi64(__m128i __A, __m128i __B) {
1705  __v2du __lshift;
1706  __vector __bool long long __shmask;
1707  const __v2du __shmax = {64, 64};
1708  __v2du __result;
1709 
1710  __lshift = vec_splat((__v2du)__B, 0);
1711  __shmask = vec_cmplt(__lshift, __shmax);
1712  __result = vec_sl((__v2du)__A, __lshift);
1713  __result = vec_sel((__v2du)__shmask, __result, __shmask);
1714 
1715  return (__m128i)__result;
1716 }
1717 #endif
1718 
1719 extern __inline __m128i
1720  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1721  _mm_sra_epi16(__m128i __A, __m128i __B) {
1722  const __v8hu __rshmax = {15, 15, 15, 15, 15, 15, 15, 15};
1723  __v8hu __rshift;
1724  __v8hi __result;
1725 
1726 #ifdef __LITTLE_ENDIAN__
1727  __rshift = vec_splat((__v8hu)__B, 0);
1728 #else
1729  __rshift = vec_splat((__v8hu)__B, 3);
1730 #endif
1731  __rshift = vec_min(__rshift, __rshmax);
1732  __result = vec_sra((__v8hi)__A, __rshift);
1733 
1734  return (__m128i)__result;
1735 }
1736 
1737 extern __inline __m128i
1738  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1739  _mm_sra_epi32(__m128i __A, __m128i __B) {
1740  const __v4su __rshmax = {31, 31, 31, 31};
1741  __v4su __rshift;
1742  __v4si __result;
1743 
1744 #ifdef __LITTLE_ENDIAN__
1745  __rshift = vec_splat((__v4su)__B, 0);
1746 #else
1747  __rshift = vec_splat((__v4su)__B, 1);
1748 #endif
1749  __rshift = vec_min(__rshift, __rshmax);
1750  __result = vec_sra((__v4si)__A, __rshift);
1751 
1752  return (__m128i)__result;
1753 }
1754 
1755 extern __inline __m128i
1756  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1757  _mm_srl_epi16(__m128i __A, __m128i __B) {
1758  __v8hu __rshift;
1759  __vector __bool short __shmask;
1760  const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
1761  __v8hu __result;
1762 
1763 #ifdef __LITTLE_ENDIAN__
1764  __rshift = vec_splat((__v8hu)__B, 0);
1765 #else
1766  __rshift = vec_splat((__v8hu)__B, 3);
1767 #endif
1768  __shmask = vec_cmple(__rshift, __shmax);
1769  __result = vec_sr((__v8hu)__A, __rshift);
1770  __result = vec_sel((__v8hu)__shmask, __result, __shmask);
1771 
1772  return (__m128i)__result;
1773 }
1774 
1775 extern __inline __m128i
1776  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1777  _mm_srl_epi32(__m128i __A, __m128i __B) {
1778  __v4su __rshift;
1779  __vector __bool int __shmask;
1780  const __v4su __shmax = {32, 32, 32, 32};
1781  __v4su __result;
1782 
1783 #ifdef __LITTLE_ENDIAN__
1784  __rshift = vec_splat((__v4su)__B, 0);
1785 #else
1786  __rshift = vec_splat((__v4su)__B, 1);
1787 #endif
1788  __shmask = vec_cmplt(__rshift, __shmax);
1789  __result = vec_sr((__v4su)__A, __rshift);
1790  __result = vec_sel((__v4su)__shmask, __result, __shmask);
1791 
1792  return (__m128i)__result;
1793 }
1794 
1795 #ifdef _ARCH_PWR8
1796 extern __inline __m128i
1797  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1798  _mm_srl_epi64(__m128i __A, __m128i __B) {
1799  __v2du __rshift;
1800  __vector __bool long long __shmask;
1801  const __v2du __shmax = {64, 64};
1802  __v2du __result;
1803 
1804  __rshift = vec_splat((__v2du)__B, 0);
1805  __shmask = vec_cmplt(__rshift, __shmax);
1806  __result = vec_sr((__v2du)__A, __rshift);
1807  __result = vec_sel((__v2du)__shmask, __result, __shmask);
1808 
1809  return (__m128i)__result;
1810 }
1811 #endif
1812 
1813 extern __inline __m128d
1814  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1815  _mm_and_pd(__m128d __A, __m128d __B) {
1816  return (vec_and((__v2df)__A, (__v2df)__B));
1817 }
1818 
1819 extern __inline __m128d
1820  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1821  _mm_andnot_pd(__m128d __A, __m128d __B) {
1822  return (vec_andc((__v2df)__B, (__v2df)__A));
1823 }
1824 
1825 extern __inline __m128d
1826  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1827  _mm_or_pd(__m128d __A, __m128d __B) {
1828  return (vec_or((__v2df)__A, (__v2df)__B));
1829 }
1830 
1831 extern __inline __m128d
1832  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1833  _mm_xor_pd(__m128d __A, __m128d __B) {
1834  return (vec_xor((__v2df)__A, (__v2df)__B));
1835 }
1836 
1837 extern __inline __m128i
1838  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1839  _mm_and_si128(__m128i __A, __m128i __B) {
1840  return (__m128i)vec_and((__v2di)__A, (__v2di)__B);
1841 }
1842 
1843 extern __inline __m128i
1844  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1845  _mm_andnot_si128(__m128i __A, __m128i __B) {
1846  return (__m128i)vec_andc((__v2di)__B, (__v2di)__A);
1847 }
1848 
1849 extern __inline __m128i
1850  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1851  _mm_or_si128(__m128i __A, __m128i __B) {
1852  return (__m128i)vec_or((__v2di)__A, (__v2di)__B);
1853 }
1854 
1855 extern __inline __m128i
1856  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1857  _mm_xor_si128(__m128i __A, __m128i __B) {
1858  return (__m128i)vec_xor((__v2di)__A, (__v2di)__B);
1859 }
1860 
1861 extern __inline __m128i
1862  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1863  _mm_cmpeq_epi8(__m128i __A, __m128i __B) {
1864  return (__m128i)vec_cmpeq((__v16qi)__A, (__v16qi)__B);
1865 }
1866 
1867 extern __inline __m128i
1868  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1869  _mm_cmpeq_epi16(__m128i __A, __m128i __B) {
1870  return (__m128i)vec_cmpeq((__v8hi)__A, (__v8hi)__B);
1871 }
1872 
1873 extern __inline __m128i
1874  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1875  _mm_cmpeq_epi32(__m128i __A, __m128i __B) {
1876  return (__m128i)vec_cmpeq((__v4si)__A, (__v4si)__B);
1877 }
1878 
1879 extern __inline __m128i
1880  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1881  _mm_cmplt_epi8(__m128i __A, __m128i __B) {
1882  return (__m128i)vec_cmplt((__v16qi)__A, (__v16qi)__B);
1883 }
1884 
1885 extern __inline __m128i
1886  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1887  _mm_cmplt_epi16(__m128i __A, __m128i __B) {
1888  return (__m128i)vec_cmplt((__v8hi)__A, (__v8hi)__B);
1889 }
1890 
1891 extern __inline __m128i
1892  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1893  _mm_cmplt_epi32(__m128i __A, __m128i __B) {
1894  return (__m128i)vec_cmplt((__v4si)__A, (__v4si)__B);
1895 }
1896 
1897 extern __inline __m128i
1898  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1899  _mm_cmpgt_epi8(__m128i __A, __m128i __B) {
1900  return (__m128i)vec_cmpgt((__v16qi)__A, (__v16qi)__B);
1901 }
1902 
1903 extern __inline __m128i
1904  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1905  _mm_cmpgt_epi16(__m128i __A, __m128i __B) {
1906  return (__m128i)vec_cmpgt((__v8hi)__A, (__v8hi)__B);
1907 }
1908 
1909 extern __inline __m128i
1910  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1911  _mm_cmpgt_epi32(__m128i __A, __m128i __B) {
1912  return (__m128i)vec_cmpgt((__v4si)__A, (__v4si)__B);
1913 }
1914 
1915 extern __inline int
1916  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1917  _mm_extract_epi16(__m128i const __A, int const __N) {
1918  return (unsigned short)((__v8hi)__A)[__N & 7];
1919 }
1920 
1921 extern __inline __m128i
1922  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1923  _mm_insert_epi16(__m128i const __A, int const __D, int const __N) {
1924  __v8hi __result = (__v8hi)__A;
1925 
1926  __result[(__N & 7)] = __D;
1927 
1928  return (__m128i)__result;
1929 }
1930 
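/* Illustrative usage (an addition to this listing, not part of the
   original header): replace one 16-bit lane and read it back.

     __m128i __v = _mm_set1_epi16(7);
     __m128i __w = _mm_insert_epi16(__v, 42, 3);
     int __e = _mm_extract_epi16(__w, 3);   // __e == 42
*/
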
1931 extern __inline __m128i
1932  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1933  _mm_max_epi16(__m128i __A, __m128i __B) {
1934  return (__m128i)vec_max((__v8hi)__A, (__v8hi)__B);
1935 }
1936 
1937 extern __inline __m128i
1938  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1939  _mm_max_epu8(__m128i __A, __m128i __B) {
1940  return (__m128i)vec_max((__v16qu)__A, (__v16qu)__B);
1941 }
1942 
1943 extern __inline __m128i
1944  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1945  _mm_min_epi16(__m128i __A, __m128i __B) {
1946  return (__m128i)vec_min((__v8hi)__A, (__v8hi)__B);
1947 }
1948 
1949 extern __inline __m128i
1950  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1951  _mm_min_epu8(__m128i __A, __m128i __B) {
1952  return (__m128i)vec_min((__v16qu)__A, (__v16qu)__B);
1953 }
1954 
1955 #ifdef _ARCH_PWR8
1956 /* Intrinsic functions that require PowerISA 2.07 or later. */
1957 
1958 /* Return a mask created from the most significant bit of each 8-bit
1959  element in A. */
1960 extern __inline int
1961  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1962  _mm_movemask_epi8(__m128i __A) {
1963 #ifdef _ARCH_PWR10
1964  return vec_extractm((__v16qu)__A);
1965 #else
1966  __vector unsigned long long __result;
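  /* Indices of each byte's most significant bit within the 128-bit
     register, ordered so vec_vbpermq gathers the sixteen sign bits
     into SSE movemask bit order. */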
1967  static const __vector unsigned char __perm_mask = {
1968  0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
1969  0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};
1970 
1971  __result = ((__vector unsigned long long)vec_vbpermq(
1972  (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
1973 
1974 #ifdef __LITTLE_ENDIAN__
1975  return __result[1];
1976 #else
1977  return __result[0];
1978 #endif
1979 #endif /* !_ARCH_PWR10 */
1980 }
1981 #endif /* _ARCH_PWR8 */
1982 
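/* Illustrative usage (an addition to this listing, not part of the
   original header; requires _ARCH_PWR8): gather the byte sign bits.

     __m128i __m = _mm_set1_epi8((char)0x80);
     int __bits = _mm_movemask_epi8(__m);   // __bits == 0xFFFF
*/
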
1983 extern __inline __m128i
1984  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1985  _mm_mulhi_epu16(__m128i __A, __m128i __B) {
1986  __v4su __w0, __w1;
1987  __v16qu __xform1 = {
1988 #ifdef __LITTLE_ENDIAN__
1989  0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1990  0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1991 #else
1992  0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
1993  0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1994 #endif
1995  };
1996 
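  /* Multiply even and odd 16-bit lanes into full 32-bit products,
     then permute the high halves of those products back into SSE
     lane order. */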
1997  __w0 = vec_vmuleuh((__v8hu)__A, (__v8hu)__B);
1998  __w1 = vec_vmulouh((__v8hu)__A, (__v8hu)__B);
1999  return (__m128i)vec_perm(__w0, __w1, __xform1);
2000 }
2001 
2002 extern __inline __m128i
2003  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2004  _mm_shufflehi_epi16(__m128i __A, const int __mask) {
2005  unsigned long __element_selector_98 = __mask & 0x03;
2006  unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
2007  unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
2008  unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
2009  static const unsigned short __permute_selectors[4] = {
2010 #ifdef __LITTLE_ENDIAN__
2011  0x0908, 0x0B0A, 0x0D0C, 0x0F0E
2012 #else
2013  0x0809, 0x0A0B, 0x0C0D, 0x0E0F
2014 #endif
2015  };
2016  __v2du __pmask =
2017 #ifdef __LITTLE_ENDIAN__
2018  {0x1716151413121110UL, 0UL};
2019 #else
2020  {0x1011121314151617UL, 0UL};
2021 #endif
2022  __m64_union __t;
2023  __v2du __a, __r;
2024 
2025  __t.as_short[0] = __permute_selectors[__element_selector_98];
2026  __t.as_short[1] = __permute_selectors[__element_selector_BA];
2027  __t.as_short[2] = __permute_selectors[__element_selector_DC];
2028  __t.as_short[3] = __permute_selectors[__element_selector_FE];
2029  __pmask[1] = __t.as_m64;
2030  __a = (__v2du)__A;
2031  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
2032  return (__m128i)__r;
2033 }
2034 
2035 extern __inline __m128i
2036  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2037  _mm_shufflelo_epi16(__m128i __A, const int __mask) {
2038  unsigned long __element_selector_10 = __mask & 0x03;
2039  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
2040  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
2041  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
2042  static const unsigned short __permute_selectors[4] = {
2043 #ifdef __LITTLE_ENDIAN__
2044  0x0100, 0x0302, 0x0504, 0x0706
2045 #else
2046  0x0001, 0x0203, 0x0405, 0x0607
2047 #endif
2048  };
2049  __v2du __pmask =
2050 #ifdef __LITTLE_ENDIAN__
2051  {0UL, 0x1f1e1d1c1b1a1918UL};
2052 #else
2053  {0UL, 0x18191a1b1c1d1e1fUL};
2054 #endif
2055  __m64_union __t;
2056  __v2du __a, __r;
2057  __t.as_short[0] = __permute_selectors[__element_selector_10];
2058  __t.as_short[1] = __permute_selectors[__element_selector_32];
2059  __t.as_short[2] = __permute_selectors[__element_selector_54];
2060  __t.as_short[3] = __permute_selectors[__element_selector_76];
2061  __pmask[0] = __t.as_m64;
2062  __a = (__v2du)__A;
2063  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
2064  return (__m128i)__r;
2065 }
2066 
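/* Illustrative usage (an addition to this listing, not part of the
   original header): swap the two lowest 16-bit lanes.

     __m128i __v = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
     __m128i __r = _mm_shufflelo_epi16(__v, _MM_SHUFFLE(3, 2, 0, 1));
     // __r == _mm_set_epi16(7, 6, 5, 4, 3, 2, 0, 1)
*/
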
2067 extern __inline __m128i
2068  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2069  _mm_shuffle_epi32(__m128i __A, const int __mask) {
2070  unsigned long __element_selector_10 = __mask & 0x03;
2071  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
2072  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
2073  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
2074  static const unsigned int __permute_selectors[4] = {
2075 #ifdef __LITTLE_ENDIAN__
2076  0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
2077 #else
2078  0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
2079 #endif
2080  };
2081  __v4su __t;
2082 
2083  __t[0] = __permute_selectors[__element_selector_10];
2084  __t[1] = __permute_selectors[__element_selector_32];
2085  __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
2086  __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
2087  return (__m128i)vec_perm((__v4si)__A, (__v4si)__A,
2088  (__vector unsigned char)__t);
2089 }
2090 
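/* Illustrative usage (an addition to this listing, not part of the
   original header): reverse the four 32-bit lanes.

     __m128i __v = _mm_set_epi32(3, 2, 1, 0);
     __m128i __r = _mm_shuffle_epi32(__v, _MM_SHUFFLE(0, 1, 2, 3));
     // __r == _mm_set_epi32(0, 1, 2, 3)
*/
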
2091 extern __inline void
2092  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2093  _mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) {
2094  __v2du __hibit = {0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
2095  __v16qu __mask, __tmp;
2096  __m128i_u *__p = (__m128i_u *)__C;
2097 
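  /* Bytes of __B with the most significant bit set (unsigned value
     > 0x7f) select the corresponding byte of __A; all other bytes
     keep the existing memory contents.  This is emulated as an
     unaligned load, a byte select, and a store. */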
2098  __tmp = (__v16qu)_mm_loadu_si128(__p);
2099  __mask = (__v16qu)vec_cmpgt((__v16qu)__B, (__v16qu)__hibit);
2100  __tmp = vec_sel(__tmp, (__v16qu)__A, __mask);
2101  _mm_storeu_si128(__p, (__m128i)__tmp);
2102 }
2103 
2104 extern __inline __m128i
2105  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2106  _mm_avg_epu8(__m128i __A, __m128i __B) {
2107  return (__m128i)vec_avg((__v16qu)__A, (__v16qu)__B);
2108 }
2109 
2110 extern __inline __m128i
2111  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2112  _mm_avg_epu16(__m128i __A, __m128i __B) {
2113  return (__m128i)vec_avg((__v8hu)__A, (__v8hu)__B);
2114 }
2115 
2116 extern __inline __m128i
2117  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2118  _mm_sad_epu8(__m128i __A, __m128i __B) {
2119  __v16qu __a, __b;
2120  __v16qu __vabsdiff;
2121  __v4si __vsum;
2122  const __v4su __zero = {0, 0, 0, 0};
2123  __v4si __result;
2124 
2125  __a = (__v16qu)__A;
2126  __b = (__v16qu)__B;
2127 #ifndef _ARCH_PWR9
2128  __v16qu __vmin = vec_min(__a, __b);
2129  __v16qu __vmax = vec_max(__a, __b);
2130  __vabsdiff = vec_sub(__vmax, __vmin);
2131 #else
2132  __vabsdiff = vec_absd(__a, __b);
2133 #endif
2134  /* Sum four groups of bytes into integers. */
2135  __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
2136 #ifdef __LITTLE_ENDIAN__
2137  /* Sum across four integers with two integer results. */
2138  __asm__("vsum2sws %0,%1,%2" : "=v"(__result) : "v"(__vsum), "v"(__zero));
2139  /* Note: vec_sum2s could be used here, but on little-endian it adds
2140  vector shifts that are not needed for this use case: it would
2141  shift the 32-bit integer results (currently at [0] and [2]) into
2142  [1] and [3], which would then have to be shifted back, since the
2143  desired results are two 64-bit integers ([1]|[0] and [3]|[2]).
2144  Thus, no shift is performed. */
2145 #else
2146  /* Sum across four integers with two integer results. */
2147  __result = vec_sum2s(__vsum, (__vector signed int)__zero);
2148  /* Rotate the sums into the correct position. */
2149  __result = vec_sld(__result, __result, 6);
2150 #endif
2151  return (__m128i)__result;
2152 }
2153 
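/* Illustrative usage (an addition to this listing, not part of the
   original header): sums of absolute byte differences.

     __m128i __x = _mm_set1_epi8(10);
     __m128i __y = _mm_set1_epi8(3);
     __m128i __s = _mm_sad_epu8(__x, __y);
     // each 64-bit lane of __s holds 8 * |10 - 3| == 56
*/
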
2154 extern __inline void
2155  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2156  _mm_stream_si32(int *__A, int __B) {
2157  /* Use the data cache block touch for store transient. */
2158  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2159  *__A = __B;
2160 }
2161 
2162 extern __inline void
2163  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2164  _mm_stream_si64(long long int *__A, long long int __B) {
2165  /* Use the data cache block touch for store transient. */
2166  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2167  *__A = __B;
2168 }
2169 
2170 extern __inline void
2171  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2172  _mm_stream_si128(__m128i *__A, __m128i __B) {
2173  /* Use the data cache block touch for store transient. */
2174  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2175  *__A = __B;
2176 }
2177 
2178 extern __inline void
2179  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2180  _mm_stream_pd(double *__A, __m128d __B) {
2181  /* Use the data cache block touch for store transient. */
2182  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2183  *(__m128d *)__A = __B;
2184 }
2185 
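/* Note (an addition to this listing): PowerISA has no true
   non-temporal store, so the _mm_stream_* helpers above hint
   transience with dcbtstt and then perform an ordinary store. */
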
2186 extern __inline void
2187  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2188  _mm_clflush(void const *__A) {
2189  /* Use the data cache block flush. */
2190  __asm__("dcbf 0,%0" : : "b"(__A) : "memory");
2191 }
2192 
2193 extern __inline void
2194  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2195  _mm_lfence(void) {
2196  /* Use lightweight sync for load-to-load ordering. */
2197  __atomic_thread_fence(__ATOMIC_RELEASE);
2198 }
2199 
2200 extern __inline void
2201  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2202  _mm_mfence(void) {
2203  /* Use heavyweight sync for any-to-any ordering. */
2204  __atomic_thread_fence(__ATOMIC_SEQ_CST);
2205 }
2206 
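/* Note (an addition to this listing): on powerpc64 the two fences
   above typically lower to lwsync (from __ATOMIC_RELEASE) and hwsync
   (from __ATOMIC_SEQ_CST), respectively. */
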
2207 extern __inline __m128i
2208  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2209  _mm_cvtsi32_si128(int __A) {
2210  return _mm_set_epi32(0, 0, 0, __A);
2211 }
2212 
2213 extern __inline __m128i
2214  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2215  _mm_cvtsi64_si128(long long __A) {
2216  return __extension__(__m128i)(__v2di){__A, 0LL};
2217 }
2218 
2219 /* Microsoft intrinsic. */
2220 extern __inline __m128i
2221  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2222  _mm_cvtsi64x_si128(long long __A) {
2223  return __extension__(__m128i)(__v2di){__A, 0LL};
2224 }
2225 
2226 /* Casts between various SP, DP, INT vector types. Note that these perform
2227  no conversion of values; they just change the type. */
2228 extern __inline __m128
2229  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2230  _mm_castpd_ps(__m128d __A) {
2231  return (__m128)__A;
2232 }
2233 
2234 extern __inline __m128i
2235  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2236  _mm_castpd_si128(__m128d __A) {
2237  return (__m128i)__A;
2238 }
2239 
2240 extern __inline __m128d
2241  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2242  _mm_castps_pd(__m128 __A) {
2243  return (__m128d)__A;
2244 }
2245 
2246 extern __inline __m128i
2247  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2248  _mm_castps_si128(__m128 __A) {
2249  return (__m128i)__A;
2250 }
2251 
2252 extern __inline __m128
2253  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2254  _mm_castsi128_ps(__m128i __A) {
2255  return (__m128)__A;
2256 }
2257 
2258 extern __inline __m128d
2259  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2260  _mm_castsi128_pd(__m128i __A) {
2261  return (__m128d)__A;
2262 }
2263 
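/* Illustrative usage (an addition to this listing, not part of the
   original header): casts are pure bit reinterpretation.

     __m128i __bits = _mm_set_epi32(0, 0, 0x3ff00000, 0);
     __m128d __d = _mm_castsi128_pd(__bits);   // low lane == 1.0
     __m128i __back = _mm_castpd_si128(__d);   // same bits as __bits
*/
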
2264 #else
2265 #include_next <emmintrin.h>
2266 #endif /* defined(__powerpc64__) && \
2267  * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
2268 
2269 #endif /* EMMINTRIN_H_ */
__device__ int
static __inline__ vector unsigned char __ATTRS_o_ai vec_sr(vector unsigned char __a, vector unsigned char __b)
Definition: altivec.h:10393
static __inline__ vector bool char __ATTRS_o_ai vec_cmpeq(vector signed char __a, vector signed char __b)
Definition: altivec.h:1708
static __inline__ vector signed char __ATTRS_o_ai vec_sra(vector signed char __a, vector unsigned char __b)
Definition: altivec.h:10527
static __inline__ vector int __ATTRS_o_ai vec_vmrghw(vector int __a, vector int __b)
Definition: altivec.h:5326
static __inline__ vector signed char __ATTRS_o_ai vec_sro(vector signed char __a, vector signed char __b)
Definition: altivec.h:10979
#define vec_ctf(__a, __b)
Definition: altivec.h:3244
static __inline__ vector float vector float vector float __c
Definition: altivec.h:4800
static __inline__ vector short __ATTRS_o_ai vec_mule(vector signed char __a, vector signed char __b)
Definition: altivec.h:6263
static __inline__ vector float vector float __b
Definition: altivec.h:578
static __inline__ vector signed char __ATTRS_o_ai vec_ld(long __a, const vector signed char *__b)
Definition: altivec.h:4061
static __inline__ vector signed char __ATTRS_o_ai vec_splats(signed char __a)
Definition: altivec.h:14737
static __inline__ vector signed char __ATTRS_o_ai vec_andc(vector signed char __a, vector signed char __b)
Definition: altivec.h:1235
static __inline__ void __ATTRS_o_ai vec_st(vector signed char __a, long __b, vector signed char *__c)
Definition: altivec.h:11184
static __inline__ vector signed int __ATTRS_o_ai vec_sld(vector signed int, vector signed int, unsigned const int __c)
Definition: altivec.h:9149
static __inline__ vector short __ATTRS_o_ai vec_unpackl(vector signed char __a)
Definition: altivec.h:12781
static __inline__ vector int __ATTRS_o_ai vec_sum4s(vector signed char __a, vector int __b)
Definition: altivec.h:12487
static __inline__ vector signed char __ATTRS_o_ai vec_and(vector signed char __a, vector signed char __b)
Definition: altivec.h:882
static __inline__ vector signed char __ATTRS_o_ai vec_avg(vector signed char __a, vector signed char __b)
Definition: altivec.h:1586
static __inline__ vector signed char __ATTRS_o_ai vec_mergel(vector signed char __a, vector signed char __b)
Definition: altivec.h:5361
static __inline__ vector signed char __ATTRS_o_ai vec_subs(vector signed char __a, vector signed char __b)
Definition: altivec.h:12149
static __inline__ vector int __ATTRS_o_ai vec_splat_s32(signed char __a)
Definition: altivec.h:10353
static __inline__ vector signed char __ATTRS_o_ai vec_adds(vector signed char __a, vector signed char __b)
Definition: altivec.h:626
static __inline__ vector signed char __ATTRS_o_ai vec_perm(vector signed char __a, vector signed char __b, vector unsigned char __c)
Definition: altivec.h:7962
static __inline__ vector signed char __ATTRS_o_ai vec_sel(vector signed char __a, vector signed char __b, vector unsigned char __c)
Definition: altivec.h:8588
static __inline__ vector signed char __ATTRS_o_ai vec_mergeh(vector signed char __a, vector signed char __b)
Definition: altivec.h:5091
static __inline__ vector bool char __ATTRS_o_ai vec_cmplt(vector signed char __a, vector signed char __b)
Definition: altivec.h:2435
static __inline__ vector signed char __ATTRS_o_ai vec_max(vector signed char __a, vector signed char __b)
Definition: altivec.h:4838
static __inline__ vector signed char __ATTRS_o_ai vec_slo(vector signed char __a, vector signed char __b)
Definition: altivec.h:9884
static __inline__ vector signed char __ATTRS_o_ai vec_nor(vector signed char __a, vector signed char __b)
Definition: altivec.h:6729
static __inline__ vector bool char __ATTRS_o_ai vec_cmpge(vector signed char __a, vector signed char __b)
Definition: altivec.h:2243
static __inline__ vector unsigned char __ATTRS_o_ai vec_packsu(vector short __a, vector short __b)
Definition: altivec.h:7844
static __inline__ vector signed char __ATTRS_o_ai vec_min(vector signed char __a, vector signed char __b)
Definition: altivec.h:5742
#define vec_cts
Definition: altivec.h:3319
static __inline__ vector signed char __ATTRS_o_ai vec_splat(vector signed char __a, unsigned const int __b)
Definition: altivec.h:10090
static __inline__ vector signed char __ATTRS_o_ai vec_or(vector signed char __a, vector signed char __b)
Definition: altivec.h:6865
static __inline__ vector short __ATTRS_o_ai vec_unpackh(vector signed char __a)
Definition: altivec.h:12642
static __inline__ vector unsigned char __ATTRS_o_ai vec_sl(vector unsigned char __a, vector unsigned char __b)
Definition: altivec.h:8882
static __inline__ vector short __ATTRS_o_ai vec_splat_s16(signed char __a)
Definition: altivec.h:10337
static __inline__ vector unsigned char __ATTRS_o_ai vec_xor(vector unsigned char __a, vector unsigned char __b)
Definition: altivec.h:13207
static __inline__ vector bool char __ATTRS_o_ai vec_cmpgt(vector signed char __a, vector signed char __b)
Definition: altivec.h:2131
static __inline__ vector bool char __ATTRS_o_ai vec_cmple(vector signed char __a, vector signed char __b)
Definition: altivec.h:2369
static __inline__ vector signed char __ATTRS_o_ai vec_packs(vector short __a, vector short __b)
Definition: altivec.h:7715
static __inline__ vector signed char __ATTRS_o_ai vec_sub(vector signed char __a, vector signed char __b)
Definition: altivec.h:11869
static __inline__ uint32_t volatile uint32_t * __p
Definition: arm_acle.h:80
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1489
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0, __m64 __q1)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 64-bit integral ...
Definition: emmintrin.h:3742
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1044
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-3) values from each of the two 128-bit vectors of [8 x i16] and interl...
Definition: emmintrin.h:4531
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a)
Moves the 64-bit operand to a 128-bit integer vector, zeroing the upper bits.
Definition: emmintrin.h:4606
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:1953
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
Initializes the 16-bit values in a 128-bit vector of [8 x i16] with the specified 16-bit integer valu...
Definition: emmintrin.h:3585
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1020
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1805
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a, __m128i __b)
Converts, with saturation, 16-bit signed integers from both 128-bit integer vector operands into 8-bi...
Definition: emmintrin.h:4188
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the smaller value f...
Definition: emmintrin.h:2359
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:585
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a, __m128d __b)
Adds lower double-precision values in both operands and returns the sum in the lower 64 bits of the r...
Definition: emmintrin.h:74
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a, __m128d __b)
Performs an element-by-element division of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:212
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a, __m128d __b)
Subtracts two 128-bit vectors of [2 x double].
Definition: emmintrin.h:132
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit integer vector.
Definition: emmintrin.h:4740
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a, __m128d __b)
Performs a bitwise OR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:398
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p, __m128i __a)
Stores a 128-bit integer vector to a 128-bit aligned memory location.
Definition: emmintrin.h:4037
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
Definition: emmintrin.h:4263
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a, __m128i __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2811
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors.
Definition: emmintrin.h:2662
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:820
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1186
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition: emmintrin.h:1609
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a, __m128i __b)
Subtracts the corresponding elements of two [2 x i64] vectors.
Definition: emmintrin.h:2559
static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a)
Moves the least significant 64 bits of a vector of [2 x i64] to a 64-bit signed integer value.
Definition: emmintrin.h:3410
static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
Moves bytes selected by the mask from the first operand to the specified unaligned memory location.
Definition: emmintrin.h:3978
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1162
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
Initializes the 32-bit values in a 128-bit vector of [4 x i32] with the specified 32-bit integer valu...
Definition: emmintrin.h:3545
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1210
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [16 x i8] vectors,...
Definition: emmintrin.h:2154
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an aligned memory location.
Definition: emmintrin.h:1553
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a)
Converts the lower two integer elements of a 128-bit vector of [4 x i32] into two double-precision fl...
Definition: emmintrin.h:1315
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3075
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a, int __count)
Right-shifts each of 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3002
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1789
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3215
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double] initialized with the specified double-prec...
Definition: emmintrin.h:1823
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a, __m128i __b)
Subtracts the corresponding 16-bit integer values in the operands.
Definition: emmintrin.h:2507
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:742
#define _mm_slli_si128(a, imm)
Left-shifts the 128-bit integer vector operand by the specified number of bytes.
Definition: emmintrin.h:2736
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a, __m128d __b)
Divides the lower double-precision value of the first operand by the lower double-precision value of ...
Definition: emmintrin.h:193
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p, __m128i __a)
Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to a memory location.
Definition: emmintrin.h:3997
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3235
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a, __m128i __b)
Performs a bitwise OR of two 128-bit integer vectors.
Definition: emmintrin.h:2697
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:519
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns a vecto...
Definition: emmintrin.h:298
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp)
Loads a 64-bit double-precision value to the low element of a 128-bit integer vector and clears the u...
Definition: emmintrin.h:1684
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a, __m128d __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them ...
Definition: emmintrin.h:4641
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:767
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors, using the one's complement of the values conta...
Definition: emmintrin.h:2681
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3133
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a, __m128i __count)
Right-shifts each of 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3020
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding 16-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3094
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the lower 16 bits of ea...
Definition: emmintrin.h:2416
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1138
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a, int __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2866
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a, __m128d __b)
Performs a bitwise XOR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:415
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the greater value f...
Definition: emmintrin.h:2321
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition: emmintrin.h:2258
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:1933
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:4800
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a, __m128i __count)
Right-shifts each of 16-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:2984
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-7) values from two 128-bit vectors of [16 x i8] and interleaves them i...
Definition: emmintrin.h:4502
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:793
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3155
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:973
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a, __m128d __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them i...
Definition: emmintrin.h:4661
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 8-bit signed integer values in the input and returns the di...
Definition: emmintrin.h:2581
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:717
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:669
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a, __m128i __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them into...
Definition: emmintrin.h:4575
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a, int __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2904
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a)
Extracts the sign bits of the double-precision values in the 128-bit vector of [2 x double],...
Definition: emmintrin.h:4679
#define _mm_shuffle_pd(a, b, i)
Constructs a 128-bit floating-point vector of [2 x double] from two 128-bit vector parameters of [2 x...
Definition: emmintrin.h:4710
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding 32-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3113
static __inline__ void int __a
Definition: emmintrin.h:4057
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a, __m128d __b)
Multiplies lower double-precision values in both operands and returns the product in the lower 64 bit...
Definition: emmintrin.h:153
void _mm_mfence(void)
Forces strong memory ordering (serialization) between load and store instructions preceding this inst...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 16-bit unsigned integer values in the input and returns the...
Definition: emmintrin.h:2645
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a)
Moves the lower 64 bits of a 128-bit integer vector to a 128-bit integer vector, zeroing the upper bi...
Definition: emmintrin.h:4622
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:477
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a, __m128i __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them int...
Definition: emmintrin.h:4468
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a)
Converts the low-order element of a [2 x double] vector into a 32-bit signed truncated (rounded towar...
Definition: emmintrin.h:1470
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x float].
Definition: emmintrin.h:3313
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a)
Returns a vector of [2 x i64] where the lower element is the input operand and the upper element is z...
Definition: emmintrin.h:3379
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a, __m128i __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x i32] and interleaves them i...
Definition: emmintrin.h:4554
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3175
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a, __m128 __b)
Converts the lower single-precision floating-point element of a 128-bit vector of [4 x float],...
Definition: emmintrin.h:1426
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a)
Calculates the square root of the each of two values stored in a 128-bit vector of [2 x double].
Definition: emmintrin.h:253
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a, __m128d __b)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1876
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a, __m128i __b)
Unpacks the high-order (index 4-7) values from two 128-bit vectors of [8 x i16] and interleaves them ...
Definition: emmintrin.h:4424
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1508
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1337
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void)
Generates a 128-bit vector of [4 x i32] with unspecified content.
Definition: emmintrin.h:3477
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit integer vector.
Definition: emmintrin.h:4770
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1234
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two 128-bit signed [8 x i16] vectors, producing eight interm...
Definition: emmintrin.h:2283
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition: emmintrin.h:2220
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a)
Returns a vector of [4 x i32] where the lowest element is the input operand and the remaining element...
Definition: emmintrin.h:3364
#define _mm_load_pd1(dp)
Definition: emmintrin.h:1577
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a, int __count)
Right-shifts each of 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3038
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a, int __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2757
#define _mm_insert_epi16(a, b, imm)
Constructs a 128-bit integer vector by first making a copy of the 128-bit integer vector parameter,...
Definition: emmintrin.h:4247
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:606
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a, __m128d __b)
Converts the lower double-precision floating-point element of a 128-bit vector of [2 x double],...
Definition: emmintrin.h:1379
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a)
Converts the low-order element of a 128-bit vector of [2 x double] into a 32-bit signed integer value...
Definition: emmintrin.h:1356
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a, __m128d __b)
Subtracts the lower double-precision value of the second operand from the lower double-precision valu...
Definition: emmintrin.h:114
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the low-order bits of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1735
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [16 x i8] vectors,...
Definition: emmintrin.h:2198
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [8 x i16] vectors,...
Definition: emmintrin.h:2176
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 16-bit signed integer values in the input and returns the d...
Definition: emmintrin.h:2603
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a, __m128i __b)
Subtracts the corresponding 8-bit integer values in the operands.
Definition: emmintrin.h:2490
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] with unspecified content.
Definition: emmintrin.h:1755
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a, __m128i __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x i32] and interleaves them ...
Definition: emmintrin.h:4447
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1276
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
Definition: emmintrin.h:3843
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the upper 16 bits of ea...
Definition: emmintrin.h:2378
#define _mm_extract_epi16(a, imm)
Extracts 16 bits from a 128-bit integer vector of [8 x i16], using the immediate-value parameter as a...
Definition: emmintrin.h:4219
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1092
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:564
#define _mm_shufflelo_epi16(a, imm)
Constructs a 128-bit integer vector by shuffling four lower 16-bit elements of a 128-bit integer vect...
Definition: emmintrin.h:4331
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [2 x i64], saving the lower 64 bits of each...
Definition: emmintrin.h:2132
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two unsigned [8 x i16] vectors, saving the upper 16 bits of ...
Definition: emmintrin.h:2397
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double], using the one's complement of the valu...
Definition: emmintrin.h:381
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:996
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the greater value fro...
Definition: emmintrin.h:2302
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit floating-point vector of [4 x fl...
Definition: emmintrin.h:4725
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a, __m128i __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2847
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, __m128i __b)
Converts, with saturation, 16-bit signed integers from both 128-bit integer vector operands into 8-bi...
Definition: emmintrin.h:4142
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i)
Initializes all values in a 128-bit vector of [4 x i32] with the specified 32-bit value.
Definition: emmintrin.h:3689
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a, __m128i __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2885
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp)
Loads two double-precision values, in reverse order, from an aligned memory location into a 128-bit v...
Definition: emmintrin.h:1593
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1068
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:648
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit floating-point vector of [2 x dou...
Definition: emmintrin.h:4755
#define _mm_bsrli_si128(a, imm)
Definition: emmintrin.h:2949
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp, __m128d __a)
Stores the upper 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:2009
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a, int __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2793
void _mm_lfence(void)
Forces strong memory ordering (serialization) between load instructions preceding this instruction an...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a, __m128i __b)
Computes the absolute differences of corresponding 8-bit integer values in two 128-bit vectors.
Definition: emmintrin.h:2473
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:947
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:847
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:2028
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp, __m128d __a)
Moves packed double-precision values from a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1914
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [4 x i32], saving the lower 32 bits of each...
Definition: emmintrin.h:2094
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition: emmintrin.h:3440
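Paired with _mm_storeu_si128 (listed below), the unaligned load gives a 16-byte memcpy-style round trip; a sketch with illustrative names:
#include <emmintrin.h>
static void copy16(const void *src, void *dst) {
  __m128i v = _mm_loadu_si128((const __m128i_u *)src); /* no alignment required */
  _mm_storeu_si128((__m128i_u *)dst, v);
}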
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
Definition: emmintrin.h:3723
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, __m128i __b)
Converts, with saturation, 32-bit signed integers from both 128-bit integer vector operands into 16-bit signed integers and packs the results into the destination.
Definition: emmintrin.h:4165
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a, __m128i __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2775
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [16 x i8], saving the lower 8 bits of each sum in the corresponding element of a 128-bit result vector of [16 x i8].
Definition: emmintrin.h:2052
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point vectors of [2 x double] to determine if the value in the first parameter is not less than or equal to the corresponding value in the second parameter.
Definition: emmintrin.h:922
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to determine if the values in the first operand are greater than those in the second operand.
Definition: emmintrin.h:498
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)
Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer vector.
Definition: emmintrin.h:3425
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point vectors of [2 x double] for equality.
Definition: emmintrin.h:692
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p, __m128d __a)
Stores a 128-bit floating-point vector of [2 x double] to a 128-bit aligned memory location.
Definition: emmintrin.h:4019
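A sketch of a non-temporal store; the destination must be 16-byte aligned, and the function name is illustrative only:
#include <emmintrin.h>
static void stream_two(double *dst /* 16-byte aligned */, __m128d v) {
  _mm_stream_pd(dst, v);  /* store that is intended to bypass the cache hierarchy */
}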
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if the values in the first operand are less than those in the second operand.
Definition: emmintrin.h:3195
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [4 x float].
Definition: emmintrin.h:4785
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition: emmintrin.h:1857
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:361
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1, long long __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer values.
Definition: emmintrin.h:3497
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a, int __b)
Converts a 32-bit signed integer value, in the second parameter, into a double-precision floating-point value, and stores the result in the lower 64 bits of the destination. The upper 64 bits of the destination are copied from the first parameter.
Definition: emmintrin.h:1401
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a, __m128i __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the corresponding elements of two [2 x i64] vectors, and returns the 64-bit products in the corresponding elements of a [2 x i64] vector.
Definition: emmintrin.h:2452
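Only the low 32 bits of each 64-bit element participate, so the full 64-bit product always fits; an illustrative sketch (name and constant are hypothetical):
#include <emmintrin.h>
static __m128i widening_mul_demo(void) {
  __m128i a = _mm_set1_epi32(0x10000); /* 2^16 in all four 32-bit lanes */
  return _mm_mul_epu32(a, a);          /* each 64-bit lane holds 2^32 */
}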
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point vectors of [2 x double] to determine if the value in the first parameter is not less than the corresponding value in the second parameter.
Definition: emmintrin.h:897
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a, __m128i __count)
Right-shifts each of the 64-bit values in the 128-bit integer vector operand by the specified number of bits. High-order bits are cleared.
Definition: emmintrin.h:3056
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point vectors of [2 x double] to determine if the value in the first parameter is unequal to the corresponding value in the second parameter.
Definition: emmintrin.h:872
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a)
Converts a vector of [4 x float] into four signed truncated (rounded toward zero) 32-bit integers, returned in a vector of [4 x i32].
Definition: emmintrin.h:3350
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 16-bit integral values.
Definition: emmintrin.h:3796
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the high-order bits of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1709
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
Initializes the 8-bit values in a 128-bit vector of [16 x i8] with the specified 8-bit integer values.
Definition: emmintrin.h:3634
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q)
Initializes both values in a 128-bit vector of [2 x i64] with the specified 64-bit value.
Definition: emmintrin.h:3672
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, __m128d __b)
Calculates the square root of the lower double-precision value of the second operand and returns it in the lower 64 bits of the result. The upper 64 bits of the result are copied from the upper double-precision value of the first operand.
Definition: emmintrin.h:236
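The merge semantics are easy to get backwards; a sketch using the setters defined earlier in the header (values and name are illustrative):
#include <emmintrin.h>
static __m128d sqrt_merge_demo(void) {
  __m128d a = _mm_set1_pd(7.0); /* upper lane of the result comes from here */
  __m128d b = _mm_set_sd(9.0);  /* {9.0, 0.0}; only the lower lane is used */
  return _mm_sqrt_sd(a, b);     /* {3.0, 7.0} */
}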
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double], initialized in reverse order with the specified double-precision floating-point values.
Definition: emmintrin.h:1843
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1, __m64 __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer values.
Definition: emmintrin.h:3518
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the lesser of the pair of values in the lower 64 bits of the result. The upper 64 bits of the result are copied from the upper 64 bits of the first source operand.
Definition: emmintrin.h:277
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a, int __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2829
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a, __m128d __b)
Adds two 128-bit vectors of [2 x double].
Definition: emmintrin.h:92
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1893
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp)
Loads a double-precision floating-point value from a specified memory location and duplicates it to both vector elements of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1569
#define _mm_shufflehi_epi16(a, imm)
Constructs a 128-bit integer vector by shuffling four upper 16-bit elements of a 128-bit integer vector of [8 x i16], using the immediate-value parameter as a specifier.
Definition: emmintrin.h:4364
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns a vector containing the greater of each pair of values.
Definition: emmintrin.h:344
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the smaller value from each comparison in the corresponding element of a 128-bit result vector of [8 x i16].
Definition: emmintrin.h:2340
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a)
Returns the lower 64 bits of a 128-bit integer vector as a 64-bit integer.
Definition: emmintrin.h:4591
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p, __m128i __b)
Stores a 128-bit integer vector to a memory location aligned on a 128-bit boundary.
Definition: emmintrin.h:3874
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a, __m128d __b)
Multiplies two 128-bit vectors of [2 x double].
Definition: emmintrin.h:171
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] for equality.
Definition: emmintrin.h:435
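Packed compares return an all-ones or all-zeros bit pattern per lane, which composes with the bitwise operations listed here; an illustrative select sketch (names are hypothetical):
#include <emmintrin.h>
static __m128d keep_equal_lanes(__m128d a, __m128d b) {
  __m128d mask = _mm_cmpeq_pd(a, b); /* all-ones where a[i] == b[i] */
  return _mm_and_pd(a, mask);        /* unequal lanes collapse to +0.0 */
}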
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1773
static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a)
Converts the two signed 32-bit integer elements of a 64-bit vector of [2 x i32] into two double-precision floating-point values, returned in a 128-bit vector of [2 x double].
Definition: emmintrin.h:1523
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the greater of the pair of values in the lower 64 bits of the result. The upper 64 bits of the result are copied from the upper 64 bits of the first source operand.
Definition: emmintrin.h:323
static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a)
Returns the low-order element of a 128-bit vector of [2 x double] as a double-precision floating-point value.
Definition: emmintrin.h:1538
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadl_epi64(__m128i_u const *__p)
Returns a vector of [2 x i64] where the lower element is taken from the lower element of the operand, and the upper element is zero.
Definition: emmintrin.h:3460
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w)
Initializes all values in a 128-bit vector of [8 x i16] with the specified 16-bit value.
Definition: emmintrin.h:3706
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q)
Initializes both values in a 128-bit integer vector with the specified 64-bit integer value.
Definition: emmintrin.h:3655
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a)
Moves the least significant 32 bits of a vector of [4 x i32] to a 32-bit signed integer value.
Definition: emmintrin.h:3394
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a, __m128i __b)
Subtracts the corresponding 32-bit integer values in the operands.
Definition: emmintrin.h:2524
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving each result in the corresponding element of a 128-bit result vector of [16 x i8].
Definition: emmintrin.h:2239
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a, __m64 __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the two 64-bit integer vectors and returns the 64-bit product.
Definition: emmintrin.h:2434
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp, __m128d __a)
Stores two double-precision values, in reverse order, from a 128-bit vector of [2 x double] to a 16-byte aligned memory location.
Definition: emmintrin.h:1992
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a, __m128i __b)
Unpacks the high-order (index 8-15) values from two 128-bit vectors of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
Definition: emmintrin.h:4396
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a, int __count)
Right-shifts each of the 16-bit values in the 128-bit integer vector operand by the specified number of bits. High-order bits are cleared.
Definition: emmintrin.h:2966
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition: emmintrin.h:1970
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point vectors of [2 x double] to determine if the value in the first parameter is unequal to the corresponding value in the second parameter.
Definition: emmintrin.h:1258
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [8 x i16], saving the lower 16 bits of each sum in the corresponding element of a 128-bit result vector of [8 x i16].
Definition: emmintrin.h:2073
#define _mm_bslli_si128(a, imm)
Definition: emmintrin.h:2740
#define _mm_srli_si128(a, imm)
Right-shifts the 128-bit integer vector operand by the specified number of bytes.
Definition: emmintrin.h:2945
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 8-bit unsigned integer values in the input and returns the differences in the corresponding bytes in the destination.
Definition: emmintrin.h:2624
#define _mm_shuffle_epi32(a, imm)
Constructs a 128-bit integer vector by shuffling four 32-bit elements of a 128-bit integer vector parameter, using the immediate-value parameter as a specifier.
Definition: emmintrin.h:4298
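The immediate is usually built with the _MM_SHUFFLE macro from <xmmintrin.h>; a sketch that reverses the four lanes (the function name is illustrative):
#include <emmintrin.h>
static __m128i reverse_epi32(__m128i v) {
  /* result lane i takes the source lane selected by the i-th 2-bit field */
  return _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3));
}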
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a)
Converts the lower two single-precision floating-point elements of a 128-bit vector of [4 x float] into two double-precision floating-point values, returned in a 128-bit vector of [2 x double].
Definition: emmintrin.h:1294
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition: emmintrin.h:3858
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point vectors of [2 x double] to determine if the value in the first parameter is unequal to the corresponding value in the second parameter.
Definition: emmintrin.h:1116
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to determine if the values in the first operand are ordered with respect to those in the second operand.
Definition: emmintrin.h:541
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to determine if the values in the first operand are less than those in the second operand.
Definition: emmintrin.h:456
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition: emmintrin.h:3889
typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)))
Definition: emmintrin.h:19
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, __m128i __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2923
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 32-bit integral values.
Definition: emmintrin.h:3764
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into two signed truncated (rounded toward zero) 32-bit integers, returned in the lower 64 bits of a 128-bit vector of [4 x i32].
Definition: emmintrin.h:1450
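Truncation rounds toward zero regardless of the current rounding mode; a small sketch (values and name are illustrative):
#include <emmintrin.h>
static __m128i trunc_demo(void) {
  __m128d v = _mm_setr_pd(1.9, -1.9);
  return _mm_cvttpd_epi32(v); /* {1, -1, 0, 0}; the upper two lanes are zeroed */
}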
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32].
Definition: emmintrin.h:3331
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a, __m128i __b)
Performs a bitwise exclusive OR of two 128-bit integer vectors.
Definition: emmintrin.h:2714
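XOR of a register with itself is the usual zeroing idiom, equivalent to _mm_setzero_si128 listed above; a one-line sketch (name illustrative):
#include <emmintrin.h>
static __m128i zero_like(__m128i v) {
  return _mm_xor_si128(v, v); /* all 128 bits cleared */
}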
void _mm_clflush(void const *__p)
The cache line containing __p is flushed and invalidated from all caches in the coherency domain.
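A hedged sketch of flushing one line and then fencing loads; whether a fence is needed at all depends on the surrounding protocol, and the helper name is illustrative:
#include <emmintrin.h>
static void flush_line(const void *p) {
  _mm_clflush(p); /* evict the line containing p from the coherency domain */
  _mm_lfence();   /* later loads do not begin until earlier loads complete */
}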
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to determine if the values in the first operand are not less than or equal to those in the second operand.
Definition: emmintrin.h:627