1 /*===---- xmmintrin.h - Implementation of SSE intrinsics on PowerPC --------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 /* Implemented from the specification included in the Intel C++ Compiler
11  User Guide and Reference, version 9.0. */
12 
13 #ifndef NO_WARN_X86_INTRINSICS
14 /* This header file is to help porting code using Intel intrinsics
15  explicitly from x86_64 to powerpc64/powerpc64le.
16 
17  Since the X86 SSE intrinsics mainly handle the __m128 type, the PowerPC
18  VMX/VSX ISA is a good match for vector float SIMD operations.
19  However, scalar float operations in vector (XMM) registers require
20  the POWER8 VSX ISA (2.07) level. There are differences in the data
21  format and placement of float scalars in the vector register, which
22  require extra steps to match SSE scalar float semantics on POWER.
23 
24  Note that there are significant differences between the X86_64 MXCSR
25  and the PowerISA FPSCR/VSCR registers. It is recommended to use the
26  portable <fenv.h> interface instead of accessing the MXCSR directly.
27 
28  Most SSE scalar float intrinsic operations can be performed more
29  efficiently as C language float scalar operations or optimized to
30  use vector SIMD operations. We recommend this for new applications. */
31 #error \
32  "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
33 #endif
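/* For example, rather than reading or writing the MXCSR directly, ported
   code can control rounding through the portable <fenv.h> interface.  A
   minimal sketch (illustrative only, not part of this header; fesetround()
   selects the rounding mode used by the scalar conversions below):

     #include <fenv.h>

     int __saved_mode = fegetround ();
     fesetround (FE_TOWARDZERO);   // equivalent of MXCSR.RC = truncate
     // ... conversions that should truncate ...
     fesetround (__saved_mode);    // restore the previous rounding mode

   Once the caveats above have been reviewed, compile the ported code with
   -DNO_WARN_X86_INTRINSICS to silence the #error. */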
34 
35 #ifndef XMMINTRIN_H_
36 #define XMMINTRIN_H_
37 
38 #if defined(__powerpc64__) && \
39  (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
40 
41 /* Define four value permute mask */
42 #define _MM_SHUFFLE(w, x, y, z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
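/* Worked examples (the values follow directly from the macro definition
   above; each argument is a 2-bit element selector used by intrinsics such
   as _mm_shuffle_ps below):

     _MM_SHUFFLE (3, 2, 1, 0) == 0xE4   // identity selector
     _MM_SHUFFLE (0, 1, 2, 3) == 0x1B   // reverse the four elements
*/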
43 
44 #include <altivec.h>
45 
46 /* Avoid collisions between altivec.h and strict adherence to C++ and
47  C11 standards. This should eventually be done inside altivec.h itself,
48  but only after testing a full distro build. */
49 #if defined(__STRICT_ANSI__) && \
50  (defined(__cplusplus) || \
51  (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L))
52 #undef vector
53 #undef pixel
54 #undef bool
55 #endif
56 
57 /* We need type definitions from the MMX header file. */
58 #include <mmintrin.h>
59 
60 /* Get _mm_malloc () and _mm_free (). */
61 #if __STDC_HOSTED__
62 #include <mm_malloc.h>
63 #endif
64 
65 /* The Intel API is flexible enough that we must allow aliasing with other
66  vector types, and their scalar components. */
67 typedef vector float __m128 __attribute__((__may_alias__));
68 
69 /* Unaligned version of the same type. */
70 typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1)));
71 
72 /* Internal data types for implementing the intrinsics. */
73 typedef vector float __v4sf;
74 
75 /* Create an undefined vector. */
76 extern __inline __m128
77  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
78  _mm_undefined_ps(void) {
79  __m128 __Y = __Y;
80  return __Y;
81 }
82 
83 /* Create a vector of zeros. */
84 extern __inline __m128
85  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
86  _mm_setzero_ps(void) {
87  return __extension__(__m128){0.0f, 0.0f, 0.0f, 0.0f};
88 }
89 
90 /* Load four SPFP values from P. The address must be 16-byte aligned. */
91 extern __inline __m128
92  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
93  _mm_load_ps(float const *__P) {
94  return ((__m128)vec_ld(0, (__v4sf *)__P));
95 }
96 
97 /* Load four SPFP values from P. The address need not be 16-byte aligned. */
98 extern __inline __m128
99  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
100  _mm_loadu_ps(float const *__P) {
101  return (vec_vsx_ld(0, __P));
102 }
103 
104 /* Load four SPFP values in reverse order. The address must be aligned. */
105 extern __inline __m128
106  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
107  _mm_loadr_ps(float const *__P) {
108  __v4sf __tmp;
109  __m128 __result;
110  static const __vector unsigned char __permute_vector = {
111  0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B,
112  0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13};
113 
114  __tmp = vec_ld(0, (__v4sf *)__P);
115  __result = (__m128)vec_perm(__tmp, __tmp, __permute_vector);
116  return __result;
117 }
118 
119 /* Create a vector with all four elements equal to F. */
120 extern __inline __m128
121  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
122  _mm_set1_ps(float __F) {
123  return __extension__(__m128)(__v4sf){__F, __F, __F, __F};
124 }
125 
126 extern __inline __m128
127  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
128  _mm_set_ps1(float __F) {
129  return _mm_set1_ps(__F);
130 }
131 
132 /* Create the vector [Z Y X W]. */
133 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
134  __artificial__))
135 _mm_set_ps(const float __Z, const float __Y, const float __X, const float __W) {
136  return __extension__(__m128)(__v4sf){__W, __X, __Y, __Z};
137 }
138 
139 /* Create the vector [W X Y Z]. */
140 extern __inline __m128
141  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
142  _mm_setr_ps(float __Z, float __Y, float __X, float __W) {
143  return __extension__(__m128)(__v4sf){__Z, __Y, __X, __W};
144 }
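/* Element-order sketch (assumed example values): _mm_set_ps lists its
   arguments from the highest element down, while _mm_setr_ps lists them in
   element (memory) order, so the two calls below build the same vector:

     __m128 __v1 = _mm_set_ps (3.0f, 2.0f, 1.0f, 0.0f);
     __m128 __v2 = _mm_setr_ps (0.0f, 1.0f, 2.0f, 3.0f);
     // __v1[0] == __v2[0] == 0.0f ... __v1[3] == __v2[3] == 3.0f
*/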
145 
146 /* Store four SPFP values. The address must be 16-byte aligned. */
147 extern __inline void
148  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
149  _mm_store_ps(float *__P, __m128 __A) {
150  vec_st((__v4sf)__A, 0, (__v4sf *)__P);
151 }
152 
153 /* Store four SPFP values. The address need not be 16-byte aligned. */
154 extern __inline void
155  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
156  _mm_storeu_ps(float *__P, __m128 __A) {
157  *(__m128_u *)__P = __A;
158 }
159 
160 /* Store four SPFP values in reverse order. The address must be aligned. */
161 extern __inline void
162  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
163  _mm_storer_ps(float *__P, __m128 __A) {
164  __v4sf __tmp;
165  static const __vector unsigned char __permute_vector = {
166  0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B,
167  0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13};
168 
169  __tmp = (__m128)vec_perm(__A, __A, __permute_vector);
170 
171  _mm_store_ps(__P, __tmp);
172 }
173 
174 /* Store the lower SPFP value across four words. */
175 extern __inline void
176  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
177  _mm_store1_ps(float *__P, __m128 __A) {
178  __v4sf __va = vec_splat((__v4sf)__A, 0);
179  _mm_store_ps(__P, __va);
180 }
181 
182 extern __inline void
183  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
184  _mm_store_ps1(float *__P, __m128 __A) {
185  _mm_store1_ps(__P, __A);
186 }
187 
188 /* Create a vector with element 0 as F and the rest zero. */
189 extern __inline __m128
190  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
191  _mm_set_ss(float __F) {
192  return __extension__(__m128)(__v4sf){__F, 0.0f, 0.0f, 0.0f};
193 }
194 
195 /* Sets the low SPFP value of A from the low value of B. */
196 extern __inline __m128
197  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
198  _mm_move_ss(__m128 __A, __m128 __B) {
199  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
200 
201  return (vec_sel((__v4sf)__A, (__v4sf)__B, __mask));
202 }
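/* Usage sketch (assumed example values): only element 0 is taken from __B,
   the upper three elements are passed through from __A:

     __m128 __a = _mm_setr_ps (1.0f, 2.0f, 3.0f, 4.0f);
     __m128 __b = _mm_setr_ps (5.0f, 6.0f, 7.0f, 8.0f);
     __m128 __r = _mm_move_ss (__a, __b);   // {5.0f, 2.0f, 3.0f, 4.0f}
*/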
203 
204 /* Create a vector with element 0 as *P and the rest zero. */
205 extern __inline __m128
206  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
207  _mm_load_ss(float const *__P) {
208  return _mm_set_ss(*__P);
209 }
210 
211 /* Stores the lower SPFP value. */
212 extern __inline void
213  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
214  _mm_store_ss(float *__P, __m128 __A) {
215  *__P = ((__v4sf)__A)[0];
216 }
217 
218 /* Perform the respective operation on the lower SPFP (single-precision
219  floating-point) values of A and B; the upper three SPFP values are
220  passed through from A. */
221 
222 extern __inline __m128
223  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
224  _mm_add_ss(__m128 __A, __m128 __B) {
225 #ifdef _ARCH_PWR7
226  __m128 __a, __b, __c;
227  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
228  /* PowerISA VSX does not allow partial (for just the lower float)
229  results. So to ensure we don't generate spurious exceptions
230  (from the upper float values) we splat the lower float
231  before we do the operation. */
232  __a = vec_splat(__A, 0);
233  __b = vec_splat(__B, 0);
234  __c = __a + __b;
235  /* Then we merge the lower float result with the original upper
236  float elements from __A. */
237  return (vec_sel(__A, __c, __mask));
238 #else
239  __A[0] = __A[0] + __B[0];
240  return (__A);
241 #endif
242 }
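/* Worked example of the scalar (_ss) semantics (assumed example values):
   only element 0 is computed, the remaining elements pass through from __A.
   The splat above keeps the unused upper lanes from raising spurious
   floating-point exceptions on POWER:

     __m128 __a = _mm_setr_ps (1.0f, 2.0f, 3.0f, 4.0f);
     __m128 __b = _mm_setr_ps (10.0f, 20.0f, 30.0f, 40.0f);
     __m128 __r = _mm_add_ss (__a, __b);    // {11.0f, 2.0f, 3.0f, 4.0f}
*/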
243 
244 extern __inline __m128
245  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
246  _mm_sub_ss(__m128 __A, __m128 __B) {
247 #ifdef _ARCH_PWR7
248  __m128 __a, __b, __c;
249  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
250  /* PowerISA VSX does not allow partial (for just the lower float)
251  results. So to ensure we don't generate spurious exceptions
252  (from the upper float values) we splat the lower float
253  before we do the operation. */
254  __a = vec_splat(__A, 0);
255  __b = vec_splat(__B, 0);
256  __c = __a - __b;
257  /* Then we merge the lower float result with the original upper
258  float elements from __A. */
259  return (vec_sel(__A, __c, __mask));
260 #else
261  __A[0] = __A[0] - __B[0];
262  return (__A);
263 #endif
264 }
265 
266 extern __inline __m128
267  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
268  _mm_mul_ss(__m128 __A, __m128 __B) {
269 #ifdef _ARCH_PWR7
270  __m128 __a, __b, __c;
271  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
272  /* PowerISA VSX does not allow partial (for just the lower float)
273  results. So to ensure we don't generate spurious exceptions
274  (from the upper float values) we splat the lower float
275  before we do the operation. */
276  __a = vec_splat(__A, 0);
277  __b = vec_splat(__B, 0);
278  __c = __a * __b;
279  /* Then we merge the lower float result with the original upper
280  float elements from __A. */
281  return (vec_sel(__A, __c, __mask));
282 #else
283  __A[0] = __A[0] * __B[0];
284  return (__A);
285 #endif
286 }
287 
288 extern __inline __m128
289  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
290  _mm_div_ss(__m128 __A, __m128 __B) {
291 #ifdef _ARCH_PWR7
292  __m128 __a, __b, __c;
293  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
294  /* PowerISA VSX does not allow partial (for just the lower float)
295  results. So to ensure we don't generate spurious exceptions
296  (from the upper float values) we splat the lower float
297  before we do the operation. */
298  __a = vec_splat(__A, 0);
299  __b = vec_splat(__B, 0);
300  __c = __a / __b;
301  /* Then we merge the lower float result with the original upper
302  float elements from __A. */
303  return (vec_sel(__A, __c, __mask));
304 #else
305  __A[0] = __A[0] / __B[0];
306  return (__A);
307 #endif
308 }
309 
310 extern __inline __m128
311  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
312  _mm_sqrt_ss(__m128 __A) {
313  __m128 __a, __c;
314  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
315  /* PowerISA VSX does not allow partial (for just the lower float)
316  * results. So to ensure we don't generate spurious exceptions
317  * (from the upper float values) we splat the lower float
318  * before we do the operation. */
319  __a = vec_splat(__A, 0);
320  __c = vec_sqrt(__a);
321  /* Then we merge the lower float result with the original upper
322  * float elements from __A. */
323  return (vec_sel(__A, __c, __mask));
324 }
325 
326 /* Perform the respective operation on the four SPFP values in A and B. */
327 extern __inline __m128
328  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
329  _mm_add_ps(__m128 __A, __m128 __B) {
330  return (__m128)((__v4sf)__A + (__v4sf)__B);
331 }
332 
333 extern __inline __m128
334  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
335  _mm_sub_ps(__m128 __A, __m128 __B) {
336  return (__m128)((__v4sf)__A - (__v4sf)__B);
337 }
338 
339 extern __inline __m128
340  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
341  _mm_mul_ps(__m128 __A, __m128 __B) {
342  return (__m128)((__v4sf)__A * (__v4sf)__B);
343 }
344 
345 extern __inline __m128
346  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
347  _mm_div_ps(__m128 __A, __m128 __B) {
348  return (__m128)((__v4sf)__A / (__v4sf)__B);
349 }
350 
351 extern __inline __m128
352  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
353  _mm_sqrt_ps(__m128 __A) {
354  return (vec_sqrt((__v4sf)__A));
355 }
356 
357 extern __inline __m128
358  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
359  _mm_rcp_ps(__m128 __A) {
360  return (vec_re((__v4sf)__A));
361 }
362 
363 extern __inline __m128
364  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
365  _mm_rsqrt_ps(__m128 __A) {
366  return (vec_rsqrte(__A));
367 }
368 
369 extern __inline __m128
370  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
371  _mm_rcp_ss(__m128 __A) {
372  __m128 __a, __c;
373  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
374  /* PowerISA VSX does not allow partial (for just the lower float)
375  * results. So to ensure we don't generate spurious exceptions
376  * (from the upper float values) we splat the lower float
377  * before we do the operation. */
378  __a = vec_splat(__A, 0);
379  __c = _mm_rcp_ps(__a);
380  /* Then we merge the lower float result with the original upper
381  * float elements from __A. */
382  return (vec_sel(__A, __c, __mask));
383 }
384 
385 extern __inline __m128
386  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
387  _mm_rsqrt_ss(__m128 __A) {
388  __m128 __a, __c;
389  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
390  /* PowerISA VSX does not allow partial (for just the lower float)
391  * results. So to ensure we don't generate spurious exceptions
392  * (from the upper float values) we splat the lower float
393  * before we do the operation. */
394  __a = vec_splat(__A, 0);
395  __c = vec_rsqrte(__a);
396  /* Then we merge the lower float result with the original upper
397  * float elements from __A. */
398  return (vec_sel(__A, __c, __mask));
399 }
400 
401 extern __inline __m128
402  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
403  _mm_min_ss(__m128 __A, __m128 __B) {
404  __v4sf __a, __b, __c;
405  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
406  /* PowerISA VSX does not allow partial (for just the lower float)
407  * results. So to ensure we don't generate spurious exceptions
408  * (from the upper float values) we splat the lower float
409  * before we do the operation. */
410  __a = vec_splat((__v4sf)__A, 0);
411  __b = vec_splat((__v4sf)__B, 0);
412  __c = vec_min(__a, __b);
413  /* Then we merge the lower float result with the original upper
414  * float elements from __A. */
415  return (vec_sel((__v4sf)__A, __c, __mask));
416 }
417 
418 extern __inline __m128
419  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
420  _mm_max_ss(__m128 __A, __m128 __B) {
421  __v4sf __a, __b, __c;
422  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
423  /* PowerISA VSX does not allow partial (for just the lower float)
424  * results. So to ensure we don't generate spurious exceptions
425  * (from the upper float values) we splat the lower float
426  * before we do the operation. */
427  __a = vec_splat(__A, 0);
428  __b = vec_splat(__B, 0);
429  __c = vec_max(__a, __b);
430  /* Then we merge the lower float result with the original upper
431  * float elements from __A. */
432  return (vec_sel((__v4sf)__A, __c, __mask));
433 }
434 
435 extern __inline __m128
436  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
437  _mm_min_ps(__m128 __A, __m128 __B) {
438  __vector __bool int __m = vec_cmpgt((__v4sf)__B, (__v4sf)__A);
439  return vec_sel(__B, __A, __m);
440 }
441 
442 extern __inline __m128
443  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
444  _mm_max_ps(__m128 __A, __m128 __B) {
445  __vector __bool int __m = vec_cmpgt((__v4sf)__A, (__v4sf)__B);
446  return vec_sel(__B, __A, __m);
447 }
448 
449 /* Perform logical bit-wise operations on 128-bit values. */
450 extern __inline __m128
451  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
452  _mm_and_ps(__m128 __A, __m128 __B) {
453  return ((__m128)vec_and((__v4sf)__A, (__v4sf)__B));
454  // return __builtin_ia32_andps (__A, __B);
455 }
456 
457 extern __inline __m128
458  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
459  _mm_andnot_ps(__m128 __A, __m128 __B) {
460  return ((__m128)vec_andc((__v4sf)__B, (__v4sf)__A));
461 }
462 
463 extern __inline __m128
464  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
465  _mm_or_ps(__m128 __A, __m128 __B) {
466  return ((__m128)vec_or((__v4sf)__A, (__v4sf)__B));
467 }
468 
469 extern __inline __m128
470  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
471  _mm_xor_ps(__m128 __A, __m128 __B) {
472  return ((__m128)vec_xor((__v4sf)__A, (__v4sf)__B));
473 }
474 
475 /* Perform a comparison on the four SPFP values of A and B. For each
476  element, if the comparison is true, place a mask of all ones in the
477  result, otherwise a mask of zeros. */
478 extern __inline __m128
479  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
480  _mm_cmpeq_ps(__m128 __A, __m128 __B) {
481  return ((__m128)vec_cmpeq((__v4sf)__A, (__v4sf)__B));
482 }
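/* The all-ones/all-zeros lane masks produced by these comparisons combine
   with the bit-wise operations above for branch-free selection.  A usage
   sketch, given two __m128 values __a and __b (illustrative only, ignoring
   NaN handling):

     __m128 __m = _mm_cmpgt_ps (__a, __b);              // lanes where __a > __b
     __m128 __r = _mm_or_ps (_mm_and_ps (__m, __a),     // take __a where true
                             _mm_andnot_ps (__m, __b)); // otherwise take __b
     // __r now holds the element-wise maximum of __a and __b
*/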
483 
484 extern __inline __m128
485  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
486  _mm_cmplt_ps(__m128 __A, __m128 __B) {
487  return ((__m128)vec_cmplt((__v4sf)__A, (__v4sf)__B));
488 }
489 
490 extern __inline __m128
491  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
492  _mm_cmple_ps(__m128 __A, __m128 __B) {
493  return ((__m128)vec_cmple((__v4sf)__A, (__v4sf)__B));
494 }
495 
496 extern __inline __m128
497  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
498  _mm_cmpgt_ps(__m128 __A, __m128 __B) {
499  return ((__m128)vec_cmpgt((__v4sf)__A, (__v4sf)__B));
500 }
501 
502 extern __inline __m128
503  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
504  _mm_cmpge_ps(__m128 __A, __m128 __B) {
505  return ((__m128)vec_cmpge((__v4sf)__A, (__v4sf)__B));
506 }
507 
508 extern __inline __m128
509  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
510  _mm_cmpneq_ps(__m128 __A, __m128 __B) {
511  __v4sf __temp = (__v4sf)vec_cmpeq((__v4sf)__A, (__v4sf)__B);
512  return ((__m128)vec_nor(__temp, __temp));
513 }
514 
515 extern __inline __m128
516  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
517  _mm_cmpnlt_ps(__m128 __A, __m128 __B) {
518  return ((__m128)vec_cmpge((__v4sf)__A, (__v4sf)__B));
519 }
520 
521 extern __inline __m128
522  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
523  _mm_cmpnle_ps(__m128 __A, __m128 __B) {
524  return ((__m128)vec_cmpgt((__v4sf)__A, (__v4sf)__B));
525 }
526 
527 extern __inline __m128
528  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
529  _mm_cmpngt_ps(__m128 __A, __m128 __B) {
530  return ((__m128)vec_cmple((__v4sf)__A, (__v4sf)__B));
531 }
532 
533 extern __inline __m128
534  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
535  _mm_cmpnge_ps(__m128 __A, __m128 __B) {
536  return ((__m128)vec_cmplt((__v4sf)__A, (__v4sf)__B));
537 }
538 
539 extern __inline __m128
540  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
541  _mm_cmpord_ps(__m128 __A, __m128 __B) {
542  __vector unsigned int __a, __b;
543  __vector unsigned int __c, __d;
544  static const __vector unsigned int __float_exp_mask = {
545  0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
546 
547  __a = (__vector unsigned int)vec_abs((__v4sf)__A);
548  __b = (__vector unsigned int)vec_abs((__v4sf)__B);
549  __c = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __a);
550  __d = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __b);
551  return ((__m128)vec_and(__c, __d));
552 }
553 
554 extern __inline __m128
555  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
556  _mm_cmpunord_ps(__m128 __A, __m128 __B) {
557  __vector unsigned int __a, __b;
558  __vector unsigned int __c, __d;
559  static const __vector unsigned int __float_exp_mask = {
560  0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
561 
562  __a = (__vector unsigned int)vec_abs((__v4sf)__A);
563  __b = (__vector unsigned int)vec_abs((__v4sf)__B);
564  __c = (__vector unsigned int)vec_cmpgt(__a, __float_exp_mask);
565  __d = (__vector unsigned int)vec_cmpgt(__b, __float_exp_mask);
566  return ((__m128)vec_or(__c, __d));
567 }
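/* How the ordered/unordered tests above work: a single-precision NaN has
   all exponent bits set and a non-zero mantissa, so the bit pattern of its
   absolute value compares greater than 0x7f800000 as an unsigned integer.
   _mm_cmpunord_ps therefore flags a lane when either input is a NaN, and
   _mm_cmpord_ps uses the complementary comparison, without raising any
   floating-point exceptions. */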
568 
569 /* Perform a comparison on the lower SPFP values of A and B. If the
570  comparison is true, place a mask of all ones in the result, otherwise a
571  mask of zeros. The upper three SPFP values are passed through from A. */
572 extern __inline __m128
573  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
574  _mm_cmpeq_ss(__m128 __A, __m128 __B) {
575  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
576  __v4sf __a, __b, __c;
577  /* PowerISA VMX does not allow partial (for just element 0)
578  * results. So to ensure we don't generate spurious exceptions
579  * (from the upper elements) we splat the lower float
580  * before we do the operation. */
581  __a = vec_splat((__v4sf)__A, 0);
582  __b = vec_splat((__v4sf)__B, 0);
583  __c = (__v4sf)vec_cmpeq(__a, __b);
584  /* Then we merge the lower float result with the original upper
585  * float elements from __A. */
586  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
587 }
588 
589 extern __inline __m128
590  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
591  _mm_cmplt_ss(__m128 __A, __m128 __B) {
592  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
593  __v4sf __a, __b, __c;
594  /* PowerISA VMX does not allow partial (for just element 0)
595  * results. So to ensure we don't generate spurious exceptions
596  * (from the upper elements) we splat the lower float
597  * before we do the operation. */
598  __a = vec_splat((__v4sf)__A, 0);
599  __b = vec_splat((__v4sf)__B, 0);
600  __c = (__v4sf)vec_cmplt(__a, __b);
601  /* Then we merge the lower float result with the original upper
602  * float elements from __A. */
603  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
604 }
605 
606 extern __inline __m128
607  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
608  _mm_cmple_ss(__m128 __A, __m128 __B) {
609  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
610  __v4sf __a, __b, __c;
611  /* PowerISA VMX does not allow partial (for just element 0)
612  * results. So to ensure we don't generate spurious exceptions
613  * (from the upper elements) we splat the lower float
614  * before we do the operation. */
615  __a = vec_splat((__v4sf)__A, 0);
616  __b = vec_splat((__v4sf)__B, 0);
617  __c = (__v4sf)vec_cmple(__a, __b);
618  /* Then we merge the lower float result with the original upper
619  * float elements from __A. */
620  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
621 }
622 
623 extern __inline __m128
624  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
625  _mm_cmpgt_ss(__m128 __A, __m128 __B) {
626  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
627  __v4sf __a, __b, __c;
628  /* PowerISA VMX does not allow partial (for just element 0)
629  * results. So to ensure we don't generate spurious exceptions
630  * (from the upper elements) we splat the lower float
631  * before we do the operation. */
632  __a = vec_splat((__v4sf)__A, 0);
633  __b = vec_splat((__v4sf)__B, 0);
634  __c = (__v4sf)vec_cmpgt(__a, __b);
635  /* Then we merge the lower float result with the original upper
636  * float elements from __A. */
637  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
638 }
639 
640 extern __inline __m128
641  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
642  _mm_cmpge_ss(__m128 __A, __m128 __B) {
643  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
644  __v4sf __a, __b, __c;
645  /* PowerISA VMX does not allow partial (for just element 0)
646  * results. So to ensure we don't generate spurious exceptions
647  * (from the upper elements) we splat the lower float
648  * before we do the operation. */
649  __a = vec_splat((__v4sf)__A, 0);
650  __b = vec_splat((__v4sf)__B, 0);
651  __c = (__v4sf)vec_cmpge(__a, __b);
652  /* Then we merge the lower float result with the original upper
653  * float elements from __A. */
654  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
655 }
656 
657 extern __inline __m128
658  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
659  _mm_cmpneq_ss(__m128 __A, __m128 __B) {
660  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
661  __v4sf __a, __b, __c;
662  /* PowerISA VMX does not allow partial (for just element 0)
663  * results. So to ensure we don't generate spurious exceptions
664  * (from the upper elements) we splat the lower float
665  * before we do the operation. */
666  __a = vec_splat((__v4sf)__A, 0);
667  __b = vec_splat((__v4sf)__B, 0);
668  __c = (__v4sf)vec_cmpeq(__a, __b);
669  __c = vec_nor(__c, __c);
670  /* Then we merge the lower float result with the original upper
671  * float elements from __A. */
672  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
673 }
674 
675 extern __inline __m128
676  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
677  _mm_cmpnlt_ss(__m128 __A, __m128 __B) {
678  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
679  __v4sf __a, __b, __c;
680  /* PowerISA VMX does not allow partial (for just element 0)
681  * results. So to ensure we don't generate spurious exceptions
682  * (from the upper elements) we splat the lower float
683  * before we do the operation. */
684  __a = vec_splat((__v4sf)__A, 0);
685  __b = vec_splat((__v4sf)__B, 0);
686  __c = (__v4sf)vec_cmpge(__a, __b);
687  /* Then we merge the lower float result with the original upper
688  * float elements from __A. */
689  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
690 }
691 
692 extern __inline __m128
693  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
694  _mm_cmpnle_ss(__m128 __A, __m128 __B) {
695  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
696  __v4sf __a, __b, __c;
697  /* PowerISA VMX does not allow partial (for just element 0)
698  * results. So to ensure we don't generate spurious exceptions
699  * (from the upper elements) we splat the lower float
700  * before we do the operation. */
701  __a = vec_splat((__v4sf)__A, 0);
702  __b = vec_splat((__v4sf)__B, 0);
703  __c = (__v4sf)vec_cmpgt(__a, __b);
704  /* Then we merge the lower float result with the original upper
705  * float elements from __A. */
706  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
707 }
708 
709 extern __inline __m128
710  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
711  _mm_cmpngt_ss(__m128 __A, __m128 __B) {
712  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
713  __v4sf __a, __b, __c;
714  /* PowerISA VMX does not allow partial (for just element 0)
715  * results. So to ensure we don't generate spurious exceptions
716  * (from the upper elements) we splat the lower float
717  * before we do the operation. */
718  __a = vec_splat((__v4sf)__A, 0);
719  __b = vec_splat((__v4sf)__B, 0);
720  __c = (__v4sf)vec_cmple(__a, __b);
721  /* Then we merge the lower float result with the original upper
722  * float elements from __A. */
723  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
724 }
725 
726 extern __inline __m128
727  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
728  _mm_cmpnge_ss(__m128 __A, __m128 __B) {
729  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
730  __v4sf __a, __b, __c;
731  /* PowerISA VMX does not allow partial (for just element 0)
732  * results. So to ensure we don't generate spurious exceptions
733  * (from the upper elements) we splat the lower float
734  * before we do the operation. */
735  __a = vec_splat((__v4sf)__A, 0);
736  __b = vec_splat((__v4sf)__B, 0);
737  __c = (__v4sf)vec_cmplt(__a, __b);
738  /* Then we merge the lower float result with the original upper
739  * float elements from __A. */
740  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
741 }
742 
743 extern __inline __m128
744  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
745  _mm_cmpord_ss(__m128 __A, __m128 __B) {
746  __vector unsigned int __a, __b;
747  __vector unsigned int __c, __d;
748  static const __vector unsigned int __float_exp_mask = {
749  0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
750  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
751 
752  __a = (__vector unsigned int)vec_abs((__v4sf)__A);
753  __b = (__vector unsigned int)vec_abs((__v4sf)__B);
754  __c = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __a);
755  __d = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __b);
756  __c = vec_and(__c, __d);
757  /* Then we merge the lower float result with the original upper
758  * float elements from __A. */
759  return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask));
760 }
761 
762 extern __inline __m128
763  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
764  _mm_cmpunord_ss(__m128 __A, __m128 __B) {
765  __vector unsigned int __a, __b;
766  __vector unsigned int __c, __d;
767  static const __vector unsigned int __float_exp_mask = {
768  0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
769  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
770 
771  __a = (__vector unsigned int)vec_abs((__v4sf)__A);
772  __b = (__vector unsigned int)vec_abs((__v4sf)__B);
773  __c = (__vector unsigned int)vec_cmpgt(__a, __float_exp_mask);
774  __d = (__vector unsigned int)vec_cmpgt(__b, __float_exp_mask);
775  __c = vec_or(__c, __d);
776  /* Then we merge the lower float result with the original upper
777  * float elements from __A. */
778  return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask));
779 }
780 
781 /* Compare the lower SPFP values of A and B and return 1 if true
782  and 0 if false. */
783 extern __inline int
784  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
785  _mm_comieq_ss(__m128 __A, __m128 __B) {
786  return (__A[0] == __B[0]);
787 }
788 
789 extern __inline int
790  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
791  _mm_comilt_ss(__m128 __A, __m128 __B) {
792  return (__A[0] < __B[0]);
793 }
794 
795 extern __inline int
796  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
797  _mm_comile_ss(__m128 __A, __m128 __B) {
798  return (__A[0] <= __B[0]);
799 }
800 
801 extern __inline int
802  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
803  _mm_comigt_ss(__m128 __A, __m128 __B) {
804  return (__A[0] > __B[0]);
805 }
806 
807 extern __inline int
808  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
809  _mm_comige_ss(__m128 __A, __m128 __B) {
810  return (__A[0] >= __B[0]);
811 }
812 
813 extern __inline int
814  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
815  _mm_comineq_ss(__m128 __A, __m128 __B) {
816  return (__A[0] != __B[0]);
817 }
818 
819 /* FIXME
820  * The _mm_ucomi??_ss implementations below are exactly the same as
821  * _mm_comi??_ss because GCC for PowerPC only generates unordered
822  * compares (scalar and vector).
823  * Technically _mm_comieq_ss et al. should use the ordered
824  * compare and signal on QNaNs.
825  * The _mm_ucomieq_ss et al. should be OK as is.
826  */
827 extern __inline int
828  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
829  _mm_ucomieq_ss(__m128 __A, __m128 __B) {
830  return (__A[0] == __B[0]);
831 }
832 
833 extern __inline int
834  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
835  _mm_ucomilt_ss(__m128 __A, __m128 __B) {
836  return (__A[0] < __B[0]);
837 }
838 
839 extern __inline int
840  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
841  _mm_ucomile_ss(__m128 __A, __m128 __B) {
842  return (__A[0] <= __B[0]);
843 }
844 
845 extern __inline int
846  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
847  _mm_ucomigt_ss(__m128 __A, __m128 __B) {
848  return (__A[0] > __B[0]);
849 }
850 
851 extern __inline int
852  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
853  _mm_ucomige_ss(__m128 __A, __m128 __B) {
854  return (__A[0] >= __B[0]);
855 }
856 
857 extern __inline int
858  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
859  _mm_ucomineq_ss(__m128 __A, __m128 __B) {
860  return (__A[0] != __B[0]);
861 }
862 
863 extern __inline float
864  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
865  _mm_cvtss_f32(__m128 __A) {
866  return ((__v4sf)__A)[0];
867 }
868 
869 /* Convert the lower SPFP value to a 32-bit integer according to the current
870  rounding mode. */
871 extern __inline int
872  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
873  _mm_cvtss_si32(__m128 __A) {
874  int __res;
875 #ifdef _ARCH_PWR8
876  double __dtmp;
877  __asm__(
878 #ifdef __LITTLE_ENDIAN__
879  "xxsldwi %x0,%x0,%x0,3;\n"
880 #endif
881  "xscvspdp %x2,%x0;\n"
882  "fctiw %2,%2;\n"
883  "mfvsrd %1,%x2;\n"
884  : "+wa"(__A), "=r"(__res), "=f"(__dtmp)
885  :);
886 #else
887  __res = __builtin_rint(__A[0]);
888 #endif
889  return __res;
890 }
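/* Rounding-mode sketch (assumed example values): the conversion honours the
   current rounding mode, which can be selected portably through <fenv.h>:

     fesetround (FE_TONEAREST);
     int __i = _mm_cvtss_si32 (_mm_set_ss (2.5f));   // 2 (round-to-nearest-even)
     fesetround (FE_UPWARD);
     __i = _mm_cvtss_si32 (_mm_set_ss (2.5f));       // 3
*/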
891 
892 extern __inline int
893  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
894  _mm_cvt_ss2si(__m128 __A) {
895  return _mm_cvtss_si32(__A);
896 }
897 
898 /* Convert the lower SPFP value to a 64-bit integer according to the
899  current rounding mode. */
900 
901 /* Intel intrinsic. */
902 extern __inline long long
903  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
904  _mm_cvtss_si64(__m128 __A) {
905  long long __res;
906 #if defined(_ARCH_PWR8) && defined(__powerpc64__)
907  double __dtmp;
908  __asm__(
909 #ifdef __LITTLE_ENDIAN__
910  "xxsldwi %x0,%x0,%x0,3;\n"
911 #endif
912  "xscvspdp %x2,%x0;\n"
913  "fctid %2,%2;\n"
914  "mfvsrd %1,%x2;\n"
915  : "+wa"(__A), "=r"(__res), "=f"(__dtmp)
916  :);
917 #else
918  __res = __builtin_llrint(__A[0]);
919 #endif
920  return __res;
921 }
922 
923 /* Microsoft intrinsic. */
924 extern __inline long long
925  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
926  _mm_cvtss_si64x(__m128 __A) {
927  return _mm_cvtss_si64((__v4sf)__A);
928 }
929 
930 /* Constants for use with _mm_prefetch. */
931 enum _mm_hint {
932  /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set. */
933  _MM_HINT_ET0 = 7,
934  _MM_HINT_ET1 = 6,
935  _MM_HINT_T0 = 3,
936  _MM_HINT_T1 = 2,
937  _MM_HINT_T2 = 1,
938  _MM_HINT_NTA = 0
939 };
940 
941 /* Loads one cache line from address P to a location "closer" to the
942  processor. The selector I specifies the type of prefetch operation. */
943 extern __inline void
944  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
945  _mm_prefetch(const void *__P, enum _mm_hint __I) {
946  /* Current PowerPC ignores the hint parameter. */
947  __builtin_prefetch(__P);
948 }
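/* Usage sketch (hypothetical __data, __n and __sum, illustrative only): the
   hint constants are accepted for source compatibility even though, as noted
   above, the hint itself is currently ignored on PowerPC:

     for (long __i = 0; __i < __n; __i++) {
       _mm_prefetch ((const char *)&__data[__i + 16], _MM_HINT_T0);
       __sum += __data[__i];
     }
*/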
949 
950 /* Convert the two lower SPFP values to 32-bit integers according to the
951  current rounding mode. Return the integers in packed form. */
952 extern __inline __m64
953  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
954  _mm_cvtps_pi32(__m128 __A) {
956  __v4sf __temp, __rounded;
957  __vector unsigned long long __result;
958 
959  /* Splat two lower SPFP values to both halves. */
960  __temp = (__v4sf)vec_splat((__vector long long)__A, 0);
961  __rounded = vec_rint(__temp);
962  __result = (__vector unsigned long long)vec_cts(__rounded, 0);
963 
964  return (__m64)((__vector long long)__result)[0];
965 }
966 
967 extern __inline __m64
968  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
969  _mm_cvt_ps2pi(__m128 __A) {
970  return _mm_cvtps_pi32(__A);
971 }
972 
973 /* Truncate the lower SPFP value to a 32-bit integer. */
974 extern __inline int
975  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
976  _mm_cvttss_si32(__m128 __A) {
977  /* Extract the lower float element. */
978  float __temp = __A[0];
979  /* truncate to 32-bit integer and return. */
980  return __temp;
981 }
982 
983 extern __inline int
984  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
985  _mm_cvtt_ss2si(__m128 __A) {
986  return _mm_cvttss_si32(__A);
987 }
988 
989 /* Intel intrinsic. */
990 extern __inline long long
991  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
992  _mm_cvttss_si64(__m128 __A) {
993  /* Extract the lower float element. */
994  float __temp = __A[0];
995  /* Truncate to a 64-bit integer and return. */
996  return __temp;
997 }
998 
999 /* Microsoft intrinsic. */
1000 extern __inline long long
1001  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1002  _mm_cvttss_si64x(__m128 __A) {
1003  /* Extract the lower float element. */
1004  float __temp = __A[0];
1005  /* Truncate to a 64-bit integer and return. */
1006  return __temp;
1007 }
1008 
1009 /* Truncate the two lower SPFP values to 32-bit integers. Return the
1010  integers in packed form. */
1011 extern __inline __m64
1012  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1013  _mm_cvttps_pi32(__m128 __A) {
1014  __v4sf __temp;
1015  __vector unsigned long long __result;
1016 
1017  /* Splat two lower SPFP values to both halves. */
1018  __temp = (__v4sf)vec_splat((__vector long long)__A, 0);
1019  __result = (__vector unsigned long long)vec_cts(__temp, 0);
1020 
1021  return (__m64)((__vector long long)__result)[0];
1022 }
1023 
1024 extern __inline __m64
1025  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1026  _mm_cvtt_ps2pi(__m128 __A) {
1027  return _mm_cvttps_pi32(__A);
1028 }
1029 
1030 /* Convert B to a SPFP value and insert it as element zero in A. */
1031 extern __inline __m128
1032  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1033  _mm_cvtsi32_ss(__m128 __A, int __B) {
1034  float __temp = __B;
1035  __A[0] = __temp;
1036 
1037  return __A;
1038 }
1039 
1040 extern __inline __m128
1041  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1042  _mm_cvt_si2ss(__m128 __A, int __B) {
1043  return _mm_cvtsi32_ss(__A, __B);
1044 }
1045 
1046 /* Convert B to a SPFP value and insert it as element zero in A. */
1047 /* Intel intrinsic. */
1048 extern __inline __m128
1049  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1050  _mm_cvtsi64_ss(__m128 __A, long long __B) {
1051  float __temp = __B;
1052  __A[0] = __temp;
1053 
1054  return __A;
1055 }
1056 
1057 /* Microsoft intrinsic. */
1058 extern __inline __m128
1059  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1060  _mm_cvtsi64x_ss(__m128 __A, long long __B) {
1061  return _mm_cvtsi64_ss(__A, __B);
1062 }
1063 
1064 /* Convert the two 32-bit values in B to SPFP form and insert them
1065  as the two lower elements in A. */
1066 extern __inline __m128
1067  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1068  _mm_cvtpi32_ps(__m128 __A, __m64 __B) {
1069  __vector signed int __vm1;
1070  __vector float __vf1;
1071 
1072  __vm1 = (__vector signed int)(__vector unsigned long long){__B, __B};
1073  __vf1 = (__vector float)vec_ctf(__vm1, 0);
1074 
1075  return ((__m128)(__vector unsigned long long){
1076  ((__vector unsigned long long)__vf1)[0],
1077  ((__vector unsigned long long)__A)[1]});
1078 }
1079 
1080 extern __inline __m128
1081  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1082  _mm_cvt_pi2ps(__m128 __A, __m64 __B) {
1083  return _mm_cvtpi32_ps(__A, __B);
1084 }
1085 
1086 /* Convert the four signed 16-bit values in A to SPFP form. */
1087 extern __inline __m128
1088  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1089  _mm_cvtpi16_ps(__m64 __A) {
1090  __vector signed short __vs8;
1091  __vector signed int __vi4;
1092  __vector float __vf1;
1093 
1094  __vs8 = (__vector signed short)(__vector unsigned long long){__A, __A};
1095  __vi4 = vec_vupklsh(__vs8);
1096  __vf1 = (__vector float)vec_ctf(__vi4, 0);
1097 
1098  return (__m128)__vf1;
1099 }
1100 
1101 /* Convert the four unsigned 16-bit values in A to SPFP form. */
1102 extern __inline __m128
1103  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1104  _mm_cvtpu16_ps(__m64 __A) {
1105  const __vector unsigned short __zero = {0, 0, 0, 0, 0, 0, 0, 0};
1106  __vector unsigned short __vs8;
1107  __vector unsigned int __vi4;
1108  __vector float __vf1;
1109 
1110  __vs8 = (__vector unsigned short)(__vector unsigned long long){__A, __A};
1111  __vi4 = (__vector unsigned int)vec_mergel
1112 #ifdef __LITTLE_ENDIAN__
1113  (__vs8, __zero);
1114 #else
1115  (__zero, __vs8);
1116 #endif
1117  __vf1 = (__vector float)vec_ctf(__vi4, 0);
1118 
1119  return (__m128)__vf1;
1120 }
1121 
1122 /* Convert the low four signed 8-bit values in A to SPFP form. */
1123 extern __inline __m128
1124  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1125  _mm_cvtpi8_ps(__m64 __A) {
1126  __vector signed char __vc16;
1127  __vector signed short __vs8;
1128  __vector signed int __vi4;
1129  __vector float __vf1;
1130 
1131  __vc16 = (__vector signed char)(__vector unsigned long long){__A, __A};
1132  __vs8 = vec_vupkhsb(__vc16);
1133  __vi4 = vec_vupkhsh(__vs8);
1134  __vf1 = (__vector float)vec_ctf(__vi4, 0);
1135 
1136  return (__m128)__vf1;
1137 }
1138 
1139 /* Convert the low four unsigned 8-bit values in A to SPFP form. */
1140 extern __inline __m128
1141  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1142 
1143  _mm_cvtpu8_ps(__m64 __A) {
1144  const __vector unsigned char __zero = {0, 0, 0, 0, 0, 0, 0, 0};
1145  __vector unsigned char __vc16;
1146  __vector unsigned short __vs8;
1147  __vector unsigned int __vi4;
1148  __vector float __vf1;
1149 
1150  __vc16 = (__vector unsigned char)(__vector unsigned long long){__A, __A};
1151 #ifdef __LITTLE_ENDIAN__
1152  __vs8 = (__vector unsigned short)vec_mergel(__vc16, __zero);
1153  __vi4 =
1154  (__vector unsigned int)vec_mergeh(__vs8, (__vector unsigned short)__zero);
1155 #else
1156  __vs8 = (__vector unsigned short)vec_mergel(__zero, __vc16);
1157  __vi4 =
1158  (__vector unsigned int)vec_mergeh((__vector unsigned short)__zero, __vs8);
1159 #endif
1160  __vf1 = (__vector float)vec_ctf(__vi4, 0);
1161 
1162  return (__m128)__vf1;
1163 }
1164 
1165 /* Convert the four signed 32-bit values in A and B to SPFP form. */
1166 extern __inline __m128
1167  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1168  _mm_cvtpi32x2_ps(__m64 __A, __m64 __B) {
1169  __vector signed int __vi4;
1170  __vector float __vf4;
1171 
1172  __vi4 = (__vector signed int)(__vector unsigned long long){__A, __B};
1173  __vf4 = (__vector float)vec_ctf(__vi4, 0);
1174  return (__m128)__vf4;
1175 }
1176 
1177 /* Convert the four SPFP values in A to four signed 16-bit integers. */
1178 extern __inline __m64
1179  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1180  _mm_cvtps_pi16(__m128 __A) {
1181  __v4sf __rounded;
1182  __vector signed int __temp;
1183  __vector unsigned long long __result;
1184 
1185  __rounded = vec_rint(__A);
1186  __temp = vec_cts(__rounded, 0);
1187  __result = (__vector unsigned long long)vec_pack(__temp, __temp);
1188 
1189  return (__m64)((__vector long long)__result)[0];
1190 }
1191 
1192 /* Convert the four SPFP values in A to four signed 8-bit integers. */
1193 extern __inline __m64
1194  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1195  _mm_cvtps_pi8(__m128 __A) {
1196  __v4sf __rounded;
1197  __vector signed int __tmp_i;
1198  static const __vector signed int __zero = {0, 0, 0, 0};
1199  __vector signed short __tmp_s;
1200  __vector signed char __res_v;
1201 
1202  __rounded = vec_rint(__A);
1203  __tmp_i = vec_cts(__rounded, 0);
1204  __tmp_s = vec_pack(__tmp_i, __zero);
1205  __res_v = vec_pack(__tmp_s, __tmp_s);
1206  return (__m64)((__vector long long)__res_v)[0];
1207 }
1208 
1209 /* Selects four specific SPFP values from A and B based on MASK. */
1210 extern __inline __m128
1211  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1212 
1213  _mm_shuffle_ps(__m128 __A, __m128 __B, int const __mask) {
1214  unsigned long __element_selector_10 = __mask & 0x03;
1215  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
1216  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
1217  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
1218  static const unsigned int __permute_selectors[4] = {
1219 #ifdef __LITTLE_ENDIAN__
1220  0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
1221 #else
1222  0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
1223 #endif
1224  };
1225  __vector unsigned int __t;
1226 
1227  __t[0] = __permute_selectors[__element_selector_10];
1228  __t[1] = __permute_selectors[__element_selector_32];
1229  __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
1230  __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
1231  return vec_perm((__v4sf)__A, (__v4sf)__B, (__vector unsigned char)__t);
1232 }
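/* Worked example (assumed example values): selector bits 1:0 and 3:2 pick
   elements from __A, bits 5:4 and 7:6 pick elements from __B:

     __m128 __a = _mm_setr_ps (1.0f, 2.0f, 3.0f, 4.0f);
     __m128 __b = _mm_setr_ps (5.0f, 6.0f, 7.0f, 8.0f);
     __m128 __r = _mm_shuffle_ps (__a, __b, _MM_SHUFFLE (3, 2, 1, 0));
     // __r == {1.0f, 2.0f, 7.0f, 8.0f}
*/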
1233 
1234 /* Selects and interleaves the upper two SPFP values from A and B. */
1235 extern __inline __m128
1236  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1237  _mm_unpackhi_ps(__m128 __A, __m128 __B) {
1238  return (__m128)vec_vmrglw((__v4sf)__A, (__v4sf)__B);
1239 }
1240 
1241 /* Selects and interleaves the lower two SPFP values from A and B. */
1242 extern __inline __m128
1243  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1244  _mm_unpacklo_ps(__m128 __A, __m128 __B) {
1245  return (__m128)vec_vmrghw((__v4sf)__A, (__v4sf)__B);
1246 }
1247 
1248 /* Sets the upper two SPFP values with 64-bits of data loaded from P;
1249  the lower two values are passed through from A. */
1250 extern __inline __m128
1251  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1252  _mm_loadh_pi(__m128 __A, __m64 const *__P) {
1253  __vector unsigned long long __a = (__vector unsigned long long)__A;
1254  __vector unsigned long long __p = vec_splats(*__P);
1255  __a[1] = __p[1];
1256 
1257  return (__m128)__a;
1258 }
1259 
1260 /* Stores the upper two SPFP values of A into P. */
1261 extern __inline void
1262  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1263  _mm_storeh_pi(__m64 *__P, __m128 __A) {
1264  __vector unsigned long long __a = (__vector unsigned long long)__A;
1265 
1266  *__P = __a[1];
1267 }
1268 
1269 /* Moves the upper two values of B into the lower two values of A. */
1270 extern __inline __m128
1271  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1272  _mm_movehl_ps(__m128 __A, __m128 __B) {
1273  return (__m128)vec_mergel((__vector unsigned long long)__B,
1274  (__vector unsigned long long)__A);
1275 }
1276 
1277 /* Moves the lower two values of B into the upper two values of A. */
1278 extern __inline __m128
1279  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1280  _mm_movelh_ps(__m128 __A, __m128 __B) {
1281  return (__m128)vec_mergeh((__vector unsigned long long)__A,
1282  (__vector unsigned long long)__B);
1283 }
1284 
1285 /* Sets the lower two SPFP values with 64-bits of data loaded from P;
1286  the upper two values are passed through from A. */
1287 extern __inline __m128
1288  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1289  _mm_loadl_pi(__m128 __A, __m64 const *__P) {
1290  __vector unsigned long long __a = (__vector unsigned long long)__A;
1291  __vector unsigned long long __p = vec_splats(*__P);
1292  __a[0] = __p[0];
1293 
1294  return (__m128)__a;
1295 }
1296 
1297 /* Stores the lower two SPFP values of A into P. */
1298 extern __inline void
1299  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1300  _mm_storel_pi(__m64 *__P, __m128 __A) {
1301  __vector unsigned long long __a = (__vector unsigned long long)__A;
1302 
1303  *__P = __a[0];
1304 }
1305 
1306 #ifdef _ARCH_PWR8
1307 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1308 
1309 /* Creates a 4-bit mask from the most significant bits of the SPFP values. */
1310 extern __inline int
1311  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1312  _mm_movemask_ps(__m128 __A) {
1313 #ifdef _ARCH_PWR10
1314  return vec_extractm((__vector unsigned int)__A);
1315 #else
1316  __vector unsigned long long __result;
1317  static const __vector unsigned int __perm_mask = {
1318 #ifdef __LITTLE_ENDIAN__
1319  0x00204060, 0x80808080, 0x80808080, 0x80808080
1320 #else
1321  0x80808080, 0x80808080, 0x80808080, 0x00204060
1322 #endif
1323  };
1324 
1325  __result = ((__vector unsigned long long)vec_vbpermq(
1326  (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
1327 
1328 #ifdef __LITTLE_ENDIAN__
1329  return __result[1];
1330 #else
1331  return __result[0];
1332 #endif
1333 #endif /* !_ARCH_PWR10 */
1334 }
1335 #endif /* _ARCH_PWR8 */
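/* Worked example for _mm_movemask_ps (assumed example values): bit i of the
   result is the sign bit of element i.  On the non-POWER10 path the
   vec_vbpermq permute control picks the most-significant (sign) bit of each
   32-bit word:

     __m128 __v = _mm_setr_ps (-1.0f, 2.0f, -3.0f, 4.0f);
     int __m = _mm_movemask_ps (__v);   // 0x5 (bits 0 and 2 set)
*/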
1336 
1337 /* Create a vector with all four elements equal to *P. */
1338 extern __inline __m128
1339  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1340  _mm_load1_ps(float const *__P) {
1341  return _mm_set1_ps(*__P);
1342 }
1343 
1344 extern __inline __m128
1345  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1346  _mm_load_ps1(float const *__P) {
1347  return _mm_load1_ps(__P);
1348 }
1349 
1350 /* Extracts one of the four words of A. The selector N must be immediate. */
1351 extern __inline int
1352  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1353  _mm_extract_pi16(__m64 const __A, int const __N) {
1354  unsigned int __shiftr = __N & 3;
1355 #ifdef __BIG_ENDIAN__
1356  __shiftr = 3 - __shiftr;
1357 #endif
1358 
1359  return ((__A >> (__shiftr * 16)) & 0xffff);
1360 }
1361 
1362 extern __inline int
1363  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1364  _m_pextrw(__m64 const __A, int const __N) {
1365  return _mm_extract_pi16(__A, __N);
1366 }
1367 
1368 /* Inserts word D into one of four words of A. The selector N must be
1369  immediate. */
1370 extern __inline __m64
1371  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1372  _mm_insert_pi16(__m64 const __A, int const __D, int const __N) {
1373  const int __shiftl = (__N & 3) * 16;
1374  const __m64 __shiftD = (const __m64)__D << __shiftl;
1375  const __m64 __mask = 0xffffUL << __shiftl;
1376  __m64 __result = (__A & (~__mask)) | (__shiftD & __mask);
1377 
1378  return __result;
1379 }
1380 
1381 extern __inline __m64
1382  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1383  _m_pinsrw(__m64 const __A, int const __D, int const __N) {
1384  return _mm_insert_pi16(__A, __D, __N);
1385 }
1386 
1387 /* Compute the element-wise maximum of signed 16-bit values. */
1388 extern __inline __m64
1389  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1390 
1391  _mm_max_pi16(__m64 __A, __m64 __B) {
1392 #if _ARCH_PWR8
1393  __vector signed short __a, __b, __r;
1394  __vector __bool short __c;
1395 
1396  __a = (__vector signed short)vec_splats(__A);
1397  __b = (__vector signed short)vec_splats(__B);
1398  __c = (__vector __bool short)vec_cmpgt(__a, __b);
1399  __r = vec_sel(__b, __a, __c);
1400  return (__m64)((__vector long long)__r)[0];
1401 #else
1402  __m64_union __m1, __m2, __res;
1403 
1404  __m1.as_m64 = __A;
1405  __m2.as_m64 = __B;
1406 
1407  __res.as_short[0] = (__m1.as_short[0] > __m2.as_short[0]) ? __m1.as_short[0]
1408  : __m2.as_short[0];
1409  __res.as_short[1] = (__m1.as_short[1] > __m2.as_short[1]) ? __m1.as_short[1]
1410  : __m2.as_short[1];
1411  __res.as_short[2] = (__m1.as_short[2] > __m2.as_short[2]) ? __m1.as_short[2]
1412  : __m2.as_short[2];
1413  __res.as_short[3] = (__m1.as_short[3] > __m2.as_short[3]) ? __m1.as_short[3]
1414  : __m2.as_short[3];
1415 
1416  return (__m64)__res.as_m64;
1417 #endif
1418 }
1419 
1420 extern __inline __m64
1421  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1422  _m_pmaxsw(__m64 __A, __m64 __B) {
1423  return _mm_max_pi16(__A, __B);
1424 }
1425 
1426 /* Compute the element-wise maximum of unsigned 8-bit values. */
1427 extern __inline __m64
1428  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1429  _mm_max_pu8(__m64 __A, __m64 __B) {
1430 #if _ARCH_PWR8
1431  __vector unsigned char __a, __b, __r;
1432  __vector __bool char __c;
1433 
1434  __a = (__vector unsigned char)vec_splats(__A);
1435  __b = (__vector unsigned char)vec_splats(__B);
1436  __c = (__vector __bool char)vec_cmpgt(__a, __b);
1437  __r = vec_sel(__b, __a, __c);
1438  return (__m64)((__vector long long)__r)[0];
1439 #else
1440  __m64_union __m1, __m2, __res;
1441  long __i;
1442 
1443  __m1.as_m64 = __A;
1444  __m2.as_m64 = __B;
1445 
1446  for (__i = 0; __i < 8; __i++)
1447  __res.as_char[__i] =
1448  ((unsigned char)__m1.as_char[__i] > (unsigned char)__m2.as_char[__i])
1449  ? __m1.as_char[__i]
1450  : __m2.as_char[__i];
1451 
1452  return (__m64)__res.as_m64;
1453 #endif
1454 }
1455 
1456 extern __inline __m64
1457  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1458  _m_pmaxub(__m64 __A, __m64 __B) {
1459  return _mm_max_pu8(__A, __B);
1460 }
1461 
1462 /* Compute the element-wise minimum of signed 16-bit values. */
1463 extern __inline __m64
1464  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1465  _mm_min_pi16(__m64 __A, __m64 __B) {
1466 #if _ARCH_PWR8
1467  __vector signed short __a, __b, __r;
1468  __vector __bool short __c;
1469 
1470  __a = (__vector signed short)vec_splats(__A);
1471  __b = (__vector signed short)vec_splats(__B);
1472  __c = (__vector __bool short)vec_cmplt(__a, __b);
1473  __r = vec_sel(__b, __a, __c);
1474  return (__m64)((__vector long long)__r)[0];
1475 #else
1476  __m64_union __m1, __m2, __res;
1477 
1478  __m1.as_m64 = __A;
1479  __m2.as_m64 = __B;
1480 
1481  __res.as_short[0] = (__m1.as_short[0] < __m2.as_short[0]) ? __m1.as_short[0]
1482  : __m2.as_short[0];
1483  __res.as_short[1] = (__m1.as_short[1] < __m2.as_short[1]) ? __m1.as_short[1]
1484  : __m2.as_short[1];
1485  __res.as_short[2] = (__m1.as_short[2] < __m2.as_short[2]) ? __m1.as_short[2]
1486  : __m2.as_short[2];
1487  __res.as_short[3] = (__m1.as_short[3] < __m2.as_short[3]) ? __m1.as_short[3]
1488  : __m2.as_short[3];
1489 
1490  return (__m64)__res.as_m64;
1491 #endif
1492 }
1493 
1494 extern __inline __m64
1495  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1496  _m_pminsw(__m64 __A, __m64 __B) {
1497  return _mm_min_pi16(__A, __B);
1498 }
1499 
1500 /* Compute the element-wise minimum of unsigned 8-bit values. */
1501 extern __inline __m64
1502  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1503  _mm_min_pu8(__m64 __A, __m64 __B) {
1504 #if _ARCH_PWR8
1505  __vector unsigned char __a, __b, __r;
1506  __vector __bool char __c;
1507 
1508  __a = (__vector unsigned char)vec_splats(__A);
1509  __b = (__vector unsigned char)vec_splats(__B);
1510  __c = (__vector __bool char)vec_cmplt(__a, __b);
1511  __r = vec_sel(__b, __a, __c);
1512  return (__m64)((__vector long long)__r)[0];
1513 #else
1514  __m64_union __m1, __m2, __res;
1515  long __i;
1516 
1517  __m1.as_m64 = __A;
1518  __m2.as_m64 = __B;
1519 
1520  for (__i = 0; __i < 8; __i++)
1521  __res.as_char[__i] =
1522  ((unsigned char)__m1.as_char[__i] < (unsigned char)__m2.as_char[__i])
1523  ? __m1.as_char[__i]
1524  : __m2.as_char[__i];
1525 
1526  return (__m64)__res.as_m64;
1527 #endif
1528 }
1529 
1530 extern __inline __m64
1531  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1532  _m_pminub(__m64 __A, __m64 __B) {
1533  return _mm_min_pu8(__A, __B);
1534 }
1535 
1536 /* Create an 8-bit mask of the signs of 8-bit values. */
1537 extern __inline int
1538  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1539  _mm_movemask_pi8(__m64 __A) {
1540 #ifdef __powerpc64__
1541  unsigned long long __p =
1542 #ifdef __LITTLE_ENDIAN__
1543  0x0008101820283038UL; // permute control for sign bits
1544 #else
1545  0x3830282018100800UL; // permute control for sign bits
1546 #endif
1547  return __builtin_bpermd(__p, __A);
1548 #else
1549 #ifdef __LITTLE_ENDIAN__
1550  unsigned int __mask = 0x20283038UL;
1551  unsigned int __r1 = __builtin_bpermd(__mask, __A) & 0xf;
1552  unsigned int __r2 = __builtin_bpermd(__mask, __A >> 32) & 0xf;
1553 #else
1554  unsigned int __mask = 0x38302820UL;
1555  unsigned int __r1 = __builtin_bpermd(__mask, __A >> 32) & 0xf;
1556  unsigned int __r2 = __builtin_bpermd(__mask, __A) & 0xf;
1557 #endif
1558  return (__r2 << 4) | __r1;
1559 #endif
1560 }
1561 
1562 extern __inline int
1563  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1564  _m_pmovmskb(__m64 __A) {
1565  return _mm_movemask_pi8(__A);
1566 }
1567 
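/* Illustrative usage sketch (an editorial example, not part of this header):
   _mm_movemask_pi8 packs the sign (high) bit of each of the eight bytes into
   bits 7..0 of the result. The sketch assumes _mm_set_pi8 from the companion
   mmintrin.h wrapper is available.

     __m64 __v = _mm_set_pi8(-1, 0, -1, 0, 0, 0, 0, -1);   // lane 7 first
     int __m = _mm_movemask_pi8(__v);   // 0xa1: bits 7, 5 and 0 are set
*/
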
1568 /* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
1569  in B and produce the high 16 bits of the 32-bit results. */
1570 extern __inline __m64
1571  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1572  _mm_mulhi_pu16(__m64 __A, __m64 __B) {
1573  __vector unsigned short __a, __b;
1574  __vector unsigned short __c;
1575  __vector unsigned int __w0, __w1;
1576  __vector unsigned char __xform1 = {
1577 #ifdef __LITTLE_ENDIAN__
1578  0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1579  0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1580 #else
1581  0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
1582  0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
1583 #endif
1584  };
1585 
1586  __a = (__vector unsigned short)vec_splats(__A);
1587  __b = (__vector unsigned short)vec_splats(__B);
1588 
1589  __w0 = vec_vmuleuh(__a, __b);
1590  __w1 = vec_vmulouh(__a, __b);
1591  __c = (__vector unsigned short)vec_perm(__w0, __w1, __xform1);
1592 
1593  return (__m64)((__vector long long)__c)[0];
1594 }
1595 
1596 extern __inline __m64
1597  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1598  _m_pmulhuw(__m64 __A, __m64 __B) {
1599  return _mm_mulhi_pu16(__A, __B);
1600 }
1601 
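/* Illustrative usage sketch (an editorial example, not part of this header):
   each 16-bit lane receives the upper half of the full 32-bit unsigned
   product; for instance 0xFFFF * 0xFFFF = 0xFFFE0001, so the stored lane is
   0xFFFE. The sketch assumes _mm_set1_pi16 from the companion mmintrin.h
   wrapper is available.

     __m64 __a = _mm_set1_pi16((short)0xFFFF);
     __m64 __b = _mm_set1_pi16((short)0xFFFF);
     __m64 __h = _mm_mulhi_pu16(__a, __b);   // every lane holds 0xFFFE
*/
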
1602 /* Return a combination of the four 16-bit values in A. The selector
1603  must be an immediate. */
1604 extern __inline __m64
1605  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1606  _mm_shuffle_pi16(__m64 __A, int const __N) {
1607  unsigned long __element_selector_10 = __N & 0x03;
1608  unsigned long __element_selector_32 = (__N >> 2) & 0x03;
1609  unsigned long __element_selector_54 = (__N >> 4) & 0x03;
1610  unsigned long __element_selector_76 = (__N >> 6) & 0x03;
1611  static const unsigned short __permute_selectors[4] = {
1612 #ifdef __LITTLE_ENDIAN__
1613  0x0908, 0x0B0A, 0x0D0C, 0x0F0E
1614 #else
1615  0x0607, 0x0405, 0x0203, 0x0001
1616 #endif
1617  };
1618  __m64_union __t;
1619  __vector unsigned long long __a, __p, __r;
1620 
1621 #ifdef __LITTLE_ENDIAN__
1622  __t.as_short[0] = __permute_selectors[__element_selector_10];
1623  __t.as_short[1] = __permute_selectors[__element_selector_32];
1624  __t.as_short[2] = __permute_selectors[__element_selector_54];
1625  __t.as_short[3] = __permute_selectors[__element_selector_76];
1626 #else
1627  __t.as_short[3] = __permute_selectors[__element_selector_10];
1628  __t.as_short[2] = __permute_selectors[__element_selector_32];
1629  __t.as_short[1] = __permute_selectors[__element_selector_54];
1630  __t.as_short[0] = __permute_selectors[__element_selector_76];
1631 #endif
1632  __p = vec_splats(__t.as_m64);
1633  __a = vec_splats(__A);
1634  __r = vec_perm(__a, __a, (__vector unsigned char)__p);
1635  return (__m64)((__vector long long)__r)[0];
1636 }
1637 
1638 extern __inline __m64
1639  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1640  _m_pshufw(__m64 __A, int const __N) {
1641  return _mm_shuffle_pi16(__A, __N);
1642 }
1643 
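/* Illustrative usage sketch (an editorial example, not part of this header):
   each 2-bit field of the selector picks which source lane lands in the
   corresponding destination lane, so _MM_SHUFFLE(0, 1, 2, 3) reverses the
   four halfwords. The sketch assumes _mm_set_pi16 from the companion
   mmintrin.h wrapper is available.

     __m64 __v = _mm_set_pi16(3, 2, 1, 0);   // lane values 0..3 = 0,1,2,3
     __m64 __r = _mm_shuffle_pi16(__v, _MM_SHUFFLE(0, 1, 2, 3));
     // __r lanes 0..3 are now 3, 2, 1, 0 (the input reversed)
*/
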
1644 /* Conditionally store byte elements of A into P. The high bit of each
1645  byte in the selector N determines whether the corresponding byte from
1646  A is stored. */
1647 extern __inline void
1648  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1649  _mm_maskmove_si64(__m64 __A, __m64 __N, char *__P) {
1650  __m64 __hibit = 0x8080808080808080UL;
1651  __m64 __mask, __tmp;
1652  __m64 *__p = (__m64 *)__P;
1653 
1654  __tmp = *__p;
1655  __mask = _mm_cmpeq_pi8((__N & __hibit), __hibit);
1656  __tmp = (__tmp & (~__mask)) | (__A & __mask);
1657  *__p = __tmp;
1658 }
1659 
1660 extern __inline void
1661  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1662  _m_maskmovq(__m64 __A, __m64 __N, char *__P) {
1663  _mm_maskmove_si64(__A, __N, __P);
1664 }
1665 
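/* Illustrative usage sketch (an editorial example, not part of this header):
   only bytes whose mask byte has its high bit set are written back to memory;
   the other bytes at __P are left untouched. The sketch assumes _mm_set_pi8
   and _mm_set1_pi8 from the companion mmintrin.h wrapper are available.

     char __buf[8] = {0, 0, 0, 0, 0, 0, 0, 0};
     __m64 __data = _mm_set1_pi8(0x55);
     __m64 __mask = _mm_set_pi8(0, 0, 0, 0, 0, 0, -1, -1);  // select lanes 0-1
     _mm_maskmove_si64(__data, __mask, __buf);
     // the two selected lanes now hold 0x55 (at __buf[0..1] on a
     // little-endian target); the remaining bytes stay 0
*/
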
1666 /* Compute the rounded averages of the unsigned 8-bit values in A and B. */
1667 extern __inline __m64
1668  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1669  _mm_avg_pu8(__m64 __A, __m64 __B) {
1670  __vector unsigned char __a, __b, __c;
1671 
1672  __a = (__vector unsigned char)vec_splats(__A);
1673  __b = (__vector unsigned char)vec_splats(__B);
1674  __c = vec_avg(__a, __b);
1675  return (__m64)((__vector long long)__c)[0];
1676 }
1677 
1678 extern __inline __m64
1679  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1680  _m_pavgb(__m64 __A, __m64 __B) {
1681  return _mm_avg_pu8(__A, __B);
1682 }
1683 
1684 /* Compute the rounded averages of the unsigned 16-bit values in A and B. */
1685 extern __inline __m64
1686  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1687  _mm_avg_pu16(__m64 __A, __m64 __B) {
1688  __vector unsigned short __a, __b, __c;
1689 
1690  __a = (__vector unsigned short)vec_splats(__A);
1691  __b = (__vector unsigned short)vec_splats(__B);
1692  __c = vec_avg(__a, __b);
1693  return (__m64)((__vector long long)__c)[0];
1694 }
1695 
1696 extern __inline __m64
1697  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1698  _m_pavgw(__m64 __A, __m64 __B) {
1699  return _mm_avg_pu16(__A, __B);
1700 }
1701 
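/* Illustrative usage sketch (an editorial example, not part of this header):
   the average rounds up, i.e. each lane computes (a + b + 1) >> 1, so
   avg(1, 2) yields 2 rather than 1. The sketch assumes _mm_set1_pi16 from the
   companion mmintrin.h wrapper is available.

     __m64 __a = _mm_set1_pi16(1);
     __m64 __b = _mm_set1_pi16(2);
     __m64 __c = _mm_avg_pu16(__a, __b);   // every 16-bit lane holds 2
*/
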
1702 /* Compute the sum of the absolute differences of the unsigned 8-bit
1703  values in A and B. Return the value in the lower 16-bit word; the
1704  upper words are cleared. */
1705 extern __inline __m64
1706  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1707  _mm_sad_pu8(__m64 __A, __m64 __B) {
1708  __vector unsigned char __a, __b;
1709  __vector unsigned char __vmin, __vmax, __vabsdiff;
1710  __vector signed int __vsum;
1711  const __vector unsigned int __zero = {0, 0, 0, 0};
1712  __m64_union __result = {0};
1713 
1714  __a = (__vector unsigned char)(__vector unsigned long long){0UL, __A};
1715  __b = (__vector unsigned char)(__vector unsigned long long){0UL, __B};
1716  __vmin = vec_min(__a, __b);
1717  __vmax = vec_max(__a, __b);
1718  __vabsdiff = vec_sub(__vmax, __vmin);
1719  /* Sum four groups of bytes into integers. */
1720  __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
1721  /* Sum across four integers with integer result. */
1722  __vsum = vec_sums(__vsum, (__vector signed int)__zero);
1723  /* The sum is in the rightmost 32 bits of the vector result.
1724  Transfer to a GPR and truncate to 16 bits. */
1725  __result.as_short[0] = __vsum[3];
1726  return __result.as_m64;
1727 }
1728 
1729 extern __inline __m64
1730  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1731  _m_psadbw(__m64 __A, __m64 __B) {
1732  return _mm_sad_pu8(__A, __B);
1733 }
1734 
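/* Illustrative usage sketch (an editorial example, not part of this header):
   the eight absolute byte differences are summed into the low 16-bit lane of
   the result and the remaining lanes are zero. The sketch assumes _mm_set_pi8
   and _mm_cvtm64_si64 from the companion mmintrin.h wrapper are available.

     __m64 __a = _mm_set_pi8(8, 7, 6, 5, 4, 3, 2, 1);   // lane 7 first
     __m64 __b = _mm_set_pi8(1, 2, 3, 4, 5, 6, 7, 8);
     __m64 __s = _mm_sad_pu8(__a, __b);
     // |8-1|+|7-2|+|6-3|+|5-4|+|4-5|+|3-6|+|2-7|+|1-8| = 32
     long long __sum = _mm_cvtm64_si64(__s);   // low 16 bits hold 32
*/
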
1735 /* Stores the data in A to the address P without polluting the caches. */
1736 extern __inline void
1737  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1738  _mm_stream_pi(__m64 *__P, __m64 __A) {
1739  /* Use the data cache block touch for store transient. */
1740  __asm__(" dcbtstt 0,%0" : : "b"(__P) : "memory");
1741  *__P = __A;
1742 }
1743 
1744 /* Likewise. The address must be 16-byte aligned. */
1745 extern __inline void
1746  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1747  _mm_stream_ps(float *__P, __m128 __A) {
1748  /* Use the data cache block touch for store transient. */
1749  __asm__(" dcbtstt 0,%0" : : "b"(__P) : "memory");
1750  _mm_store_ps(__P, __A);
1751 }
1752 
1753 /* Guarantees that every preceding store is globally visible before
1754  any subsequent store. */
1755 extern __inline void
1756  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1757  _mm_sfence(void) {
1758  /* Generate a lightweight sync. */
1759  __atomic_thread_fence(__ATOMIC_RELEASE);
1760 }
1761 
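/* Illustrative usage sketch (an editorial example, not part of this header):
   a producer stores data and then a ready flag; the fence keeps the data
   store visible before the flag store. The __shared_data and __shared_ready
   variables are hypothetical.

     extern float __shared_data[4];
     extern volatile int __shared_ready;

     _mm_storeu_ps(__shared_data, _mm_set1_ps(1.0f));
     _mm_sfence();        // release: the data is visible before the flag
     __shared_ready = 1;
*/
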
1762 /* The execution of the next instruction is delayed by an
1763  implementation-specific amount of time. The instruction does not modify
1764  the architectural state. This is after the pop_options pragma because
1765  it does not require SSE support in the processor--the encoding is a
1766  nop on processors that do not support it. */
1767 extern __inline void
1768  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1769  _mm_pause(void) {
1770  /* There is no exact match with this construct, but the following is
1771  close to the desired effect. */
1772 #if _ARCH_PWR8
1773  /* On POWER8 and later processors we can depend on Program Priority
1774  (PRI) and the associated "very low" PRI setting. Since we don't know
1775  what PRI this thread is running at we: 1) save the current PRI
1776  from the PPR SPR into a local GPR, 2) set the PRI to "very low"
1777  via the special or 31,31,31 encoding, and 3) issue an "isync" to
1778  ensure the PRI change takes effect before we execute any more
1779  instructions.
1780  Now we can execute a lwsync (release barrier) while we execute
1781  this thread at "very low" PRI. Finally we restore the original
1782  PRI and continue execution. */
1783  unsigned long __PPR;
1784 
1785  __asm__ volatile(" mfppr %0;"
1786  " or 31,31,31;"
1787  " isync;"
1788  " lwsync;"
1789  " isync;"
1790  " mtppr %0;"
1791  : "=r"(__PPR)
1792  :
1793  : "memory");
1794 #else
1795  /* For older processors, where we may not even have Program Priority
1796  controls, we can only depend on Heavy Weight Sync. */
1797  __atomic_thread_fence(__ATOMIC_SEQ_CST);
1798 #endif
1799 }
1800 
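/* Illustrative usage sketch (an editorial example, not part of this header):
   _mm_pause is typically placed in the body of a spin-wait loop to reduce the
   priority/power cost of the busy wait. The __lock_ready flag is
   hypothetical.

     extern volatile int __lock_ready;

     while (!__lock_ready)
       _mm_pause();   // back off while polling the flag
*/
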
1801 /* Transpose the 4x4 matrix composed of row[0-3]. */
1802 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
1803  do { \
1804  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
1805  __v4sf __t0 = vec_vmrghw(__r0, __r1); \
1806  __v4sf __t1 = vec_vmrghw(__r2, __r3); \
1807  __v4sf __t2 = vec_vmrglw(__r0, __r1); \
1808  __v4sf __t3 = vec_vmrglw(__r2, __r3); \
1809  (row0) = (__v4sf)vec_mergeh((__vector long long)__t0, \
1810  (__vector long long)__t1); \
1811  (row1) = (__v4sf)vec_mergel((__vector long long)__t0, \
1812  (__vector long long)__t1); \
1813  (row2) = (__v4sf)vec_mergeh((__vector long long)__t2, \
1814  (__vector long long)__t3); \
1815  (row3) = (__v4sf)vec_mergel((__vector long long)__t2, \
1816  (__vector long long)__t3); \
1817  } while (0)
1818 
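/* Illustrative usage sketch (an editorial example, not part of this header):
   the macro transposes four row vectors in place, so element j of row i
   becomes element i of row j. It uses only _mm_set_ps from this header.

     __m128 __row0 = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);     // {0,1,2,3}
     __m128 __row1 = _mm_set_ps(7.0f, 6.0f, 5.0f, 4.0f);     // {4,5,6,7}
     __m128 __row2 = _mm_set_ps(11.0f, 10.0f, 9.0f, 8.0f);   // {8,9,10,11}
     __m128 __row3 = _mm_set_ps(15.0f, 14.0f, 13.0f, 12.0f); // {12,13,14,15}
     _MM_TRANSPOSE4_PS(__row0, __row1, __row2, __row3);
     // __row0 is now {0,4,8,12}, __row1 {1,5,9,13}, and so on
*/
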
1819 /* For backward source compatibility. */
1820 //# include <emmintrin.h>
1821 
1822 #else
1823 #include_next <xmmintrin.h>
1824 #endif /* defined(__powerpc64__) && \
1825  * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
1826 
1827 #endif /* XMMINTRIN_H_ */