/*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0. */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is intended to help with porting code that uses Intel
   intrinsics explicitly from x86_64 to powerpc64/powerpc64le.

   Since the PowerPC target doesn't support a native 64-bit vector type, we
   typedef __m64 to a 64-bit unsigned long long in the MMX intrinsics, which
   works well for _si64 and some _pi32 operations.

   For _pi16 and _pi8 operations, it is better to transfer __m64 into a
   128-bit PowerPC vector first. Power8 introduced direct register move
   instructions, which help make such implementations more efficient.

   It is the user's responsibility to determine whether the results of such
   a port are acceptable or whether further changes are needed. Please note
   that much code using Intel intrinsics CAN BE REWRITTEN in more portable
   and efficient standard C or GNU C extensions, using 64-bit scalar
   operations or 128-bit SSE/Altivec operations, which is the recommended
   approach. */
#error \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
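
/* A minimal sketch (not part of the upstream header) of the portable
   rewrite recommended above: with GNU C vector extensions the compiler
   generates the vector code directly, with no x86 compatibility shim.
   The type name __v8qi_demo is a hypothetical illustration.

     typedef signed char __v8qi_demo __attribute__((__vector_size__(8)));

     static inline __v8qi_demo add_bytes(__v8qi_demo __a, __v8qi_demo __b) {
       return __a + __b;  // per-lane 8-bit add on any GCC/Clang target
     }
*/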

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#if defined(__powerpc64__) &&                                                  \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components. */
typedef __attribute__((__aligned__(8))) unsigned long long __m64;

typedef __attribute__((__aligned__(8))) union {
  __m64 as_m64;
  char as_char[8];
  signed char as_signed_char[8];
  short as_short[4];
  int as_int[2];
  long long as_long_long;
  float as_float[2];
  double as_double;
} __m64_union;
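
/* Usage sketch (illustrative, not part of the upstream header): the union
   gives lane-wise access to an __m64 scalar without aliasing violations.

     __m64_union __u;
     __u.as_m64 = 0x0001000200030004ULL;
     short __lo = __u.as_short[0];  // 4 on little-endian: the least
                                    // significant 16-bit lane
*/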

/* Empty the multimedia state. */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_empty(void) {
  /* nothing to do on PowerPC. */
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_empty(void) {
  /* nothing to do on PowerPC. */
}

/* Convert I to a __m64 object. The integer is zero-extended to 64 bits. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_si64(int __i) {
  return (__m64)(unsigned int)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_from_int(int __i) {
  return _mm_cvtsi32_si64(__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer. */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si32(__m64 __i) {
  return ((int)__i);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_to_int(__m64 __i) {
  return _mm_cvtsi64_si32(__i);
}
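
/* Example (illustrative): the conversion zero-extends, so a negative int
   round-trips through the low 32 bits.

     __m64 __v = _mm_cvtsi32_si64(-1);  // __v == 0x00000000FFFFFFFFULL
     int __i = _mm_cvtsi64_si32(__v);   // __i == -1
*/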

/* Convert I to a __m64 object. */

/* Intel intrinsic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_from_int64(long long __i) {
  return (__m64)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_m64(long long __i) {
  return (__m64)__i;
}

/* Microsoft intrinsic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_si64(long long __i) {
  return (__m64)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi64x(long long __i) {
  return (__m64)__i;
}

/* Convert the __m64 object to a 64-bit integer. */

/* Intel intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_to_int64(__m64 __i) {
  return (long long)__i;
}

extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtm64_si64(__m64 __i) {
  return (long long)__i;
}

/* Microsoft intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si64x(__m64 __i) {
  return (long long)__i;
}

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __vm1;
  __vector signed char __vresult;

  __vm1 = (__vector signed short)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  __vresult = vec_packs(__vm1, __vm1);
  return (__m64)((__vector long long)__vresult)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packsswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi16(__m1, __m2);
}
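
/* Worked example (illustrative): lanes outside the signed 8-bit range
   saturate to -128 or 127.

     __m64 __a = _mm_set_pi16(300, -300, 5, -5);
     __m64 __r = _mm_packs_pi16(__a, __a);
     // Per 16-bit lane: 300 -> 127, -300 -> -128, 5 -> 5, -5 -> -5.
*/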

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi32(__m64 __m1, __m64 __m2) {
  __vector signed int __vm1;
  __vector signed short __vresult;

  __vm1 = (__vector signed int)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  __vresult = vec_packs(__vm1, __vm1);
  return (__m64)((__vector long long)__vresult)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packssdw(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi32(__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned char __r;
  __vector signed short __vm1 = (__vector signed short)(__vector long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  const __vector signed short __zero = {0};
  __vector __bool short __select = vec_cmplt(__vm1, __zero);
  __r =
      vec_packs((__vector unsigned short)__vm1, (__vector unsigned short)__vm1);
  __vector __bool char __packsel = vec_pack(__select, __select);
  __r = vec_sel(__r, (const __vector unsigned char)__zero, __packsel);
  return (__m64)((__vector long long)__r)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packuswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pu16(__m1, __m2);
}
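
/* Worked example (illustrative): with unsigned saturation, negative lanes
   clamp to 0 and lanes above 255 clamp to 255.

     __m64 __a = _mm_set_pi16(-1, 300, 128, 7);
     __m64 __r = _mm_packs_pu16(__a, __a);
     // Per 16-bit lane: -1 -> 0, 300 -> 255, 128 -> 128, 7 -> 7.
*/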
#endif /* end ARCH_PWR8 */

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_mergel(__a, __b);
  return (__m64)((__vector long long)__c)[1];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[4];
  __res.as_char[1] = __mu2.as_char[4];
  __res.as_char[2] = __mu1.as_char[5];
  __res.as_char[3] = __mu2.as_char[5];
  __res.as_char[4] = __mu1.as_char[6];
  __res.as_char[5] = __mu2.as_char[6];
  __res.as_char[6] = __mu1.as_char[7];
  __res.as_char[7] = __mu2.as_char[7];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhbw(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi8(__m1, __m2);
}
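
/* Example (illustrative), with bytes listed from the least significant end:

     __m64 __m1 = _mm_set_pi8(17, 16, 15, 14, 13, 12, 11, 10);
     __m64 __m2 = _mm_set_pi8(27, 26, 25, 24, 23, 22, 21, 20);
     __m64 __r = _mm_unpackhi_pi8(__m1, __m2);
     // __r holds 14, 24, 15, 25, 16, 26, 17, 27.
*/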

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[2];
  __res.as_short[1] = __mu2.as_short[2];
  __res.as_short[2] = __mu1.as_short[3];
  __res.as_short[3] = __mu2.as_short[3];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhwd(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi16(__m1, __m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[1];
  __res.as_int[1] = __mu2.as_int[1];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhdq(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi32(__m1, __m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_mergel(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0];
  __res.as_char[1] = __mu2.as_char[0];
  __res.as_char[2] = __mu1.as_char[1];
  __res.as_char[3] = __mu2.as_char[1];
  __res.as_char[4] = __mu1.as_char[2];
  __res.as_char[5] = __mu2.as_char[2];
  __res.as_char[6] = __mu1.as_char[3];
  __res.as_char[7] = __mu2.as_char[3];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklbw(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi8(__m1, __m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0];
  __res.as_short[1] = __mu2.as_short[0];
  __res.as_short[2] = __mu1.as_short[1];
  __res.as_short[3] = __mu2.as_short[1];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklwd(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi16(__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0];
  __res.as_int[1] = __mu2.as_int[0];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckldq(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi32(__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_add(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] + __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] + __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] + __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] + __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] + __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] + __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] + __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] + __mu2.as_char[7];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddb(__m64 __m1, __m64 __m2) {
  return _mm_add_pi8(__m1, __m2);
}
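
/* Example (illustrative): the plain add wraps modulo 256 per lane; compare
   the saturating _mm_adds_pi8 further below.

     __m64 __a = _mm_set1_pi8(100);
     __m64 __r = _mm_add_pi8(__a, __a);
     // Each signed 8-bit lane wraps: 100 + 100 = 200, i.e. -56 (mod 256).
*/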

/* Add the 16-bit values in M1 to the 16-bit values in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_add(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] + __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] + __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] + __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] + __mu2.as_short[3];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddw(__m64 __m1, __m64 __m2) {
  return _mm_add_pi16(__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = vec_add(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] + __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] + __mu2.as_int[1];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddd(__m64 __m1, __m64 __m2) {
  return _mm_add_pi32(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_sub(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] - __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] - __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] - __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] - __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] - __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] - __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] - __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] - __mu2.as_char[7];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubb(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_sub(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] - __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] - __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] - __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] - __mu2.as_short[3];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubw(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi16(__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = vec_sub(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] - __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] - __mu2.as_int[1];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubd(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi32(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_si64(__m64 __m1, __m64 __m2) {
  return (__m1 + __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_si64(__m64 __m1, __m64 __m2) {
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_si64(__m64 __m, __m64 __count) {
  return (__m << __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllq(__m64 __m, __m64 __count) {
  return _mm_sll_si64(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_si64(__m64 __m, const int __count) {
  return (__m << __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllqi(__m64 __m, const int __count) {
  return _mm_slli_si64(__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_si64(__m64 __m, __m64 __count) {
  return (__m >> __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlq(__m64 __m, __m64 __count) {
  return _mm_srl_si64(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_si64(__m64 __m, const int __count) {
  return (__m >> __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlqi(__m64 __m, const int __count) {
  return _mm_srli_si64(__m, __count);
}

/* Bit-wise AND the 64-bit values in M1 and M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_si64(__m64 __m1, __m64 __m2) {
  return (__m1 & __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pand(__m64 __m1, __m64 __m2) {
  return _mm_and_si64(__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_si64(__m64 __m1, __m64 __m2) {
  return (~__m1 & __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pandn(__m64 __m1, __m64 __m2) {
  return _mm_andnot_si64(__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_si64(__m64 __m1, __m64 __m2) {
  return (__m1 | __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_por(__m64 __m1, __m64 __m2) {
  return _mm_or_si64(__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_si64(__m64 __m1, __m64 __m2) {
  return (__m1 ^ __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pxor(__m64 __m1, __m64 __m2) {
  return _mm_xor_si64(__m1, __m2);
}

/* Creates a 64-bit zero. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_si64(void) {
  return (__m64)0;
}

/* Compare eight 8-bit values. The result of the comparison is 0xFF if the
   test is true and zero if false. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
  __m64 __res;
  __asm__("cmpb %0,%1,%2;\n" : "=r"(__res) : "r"(__m1), "r"(__m2) :);
  return (__res);
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] == __mu2.as_char[0]) ? -1 : 0;
  __res.as_char[1] = (__mu1.as_char[1] == __mu2.as_char[1]) ? -1 : 0;
  __res.as_char[2] = (__mu1.as_char[2] == __mu2.as_char[2]) ? -1 : 0;
  __res.as_char[3] = (__mu1.as_char[3] == __mu2.as_char[3]) ? -1 : 0;
  __res.as_char[4] = (__mu1.as_char[4] == __mu2.as_char[4]) ? -1 : 0;
  __res.as_char[5] = (__mu1.as_char[5] == __mu2.as_char[5]) ? -1 : 0;
  __res.as_char[6] = (__mu1.as_char[6] == __mu2.as_char[6]) ? -1 : 0;
  __res.as_char[7] = (__mu1.as_char[7] == __mu2.as_char[7]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqb(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi8(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = (__vector signed char)vec_cmpgt(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] > __mu2.as_char[0]) ? -1 : 0;
  __res.as_char[1] = (__mu1.as_char[1] > __mu2.as_char[1]) ? -1 : 0;
  __res.as_char[2] = (__mu1.as_char[2] > __mu2.as_char[2]) ? -1 : 0;
  __res.as_char[3] = (__mu1.as_char[3] > __mu2.as_char[3]) ? -1 : 0;
  __res.as_char[4] = (__mu1.as_char[4] > __mu2.as_char[4]) ? -1 : 0;
  __res.as_char[5] = (__mu1.as_char[5] > __mu2.as_char[5]) ? -1 : 0;
  __res.as_char[6] = (__mu1.as_char[6] > __mu2.as_char[6]) ? -1 : 0;
  __res.as_char[7] = (__mu1.as_char[7] > __mu2.as_char[7]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtb(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi8(__m1, __m2);
}

/* Compare four 16-bit values. The result of the comparison is 0xFFFF if
   the test is true and zero if false. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = (__vector signed short)vec_cmpeq(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] == __mu2.as_short[0]) ? -1 : 0;
  __res.as_short[1] = (__mu1.as_short[1] == __mu2.as_short[1]) ? -1 : 0;
  __res.as_short[2] = (__mu1.as_short[2] == __mu2.as_short[2]) ? -1 : 0;
  __res.as_short[3] = (__mu1.as_short[3] == __mu2.as_short[3]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqw(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi16(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = (__vector signed short)vec_cmpgt(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] > __mu2.as_short[0]) ? -1 : 0;
  __res.as_short[1] = (__mu1.as_short[1] > __mu2.as_short[1]) ? -1 : 0;
  __res.as_short[2] = (__mu1.as_short[2] > __mu2.as_short[2]) ? -1 : 0;
  __res.as_short[3] = (__mu1.as_short[3] > __mu2.as_short[3]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtw(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi16(__m1, __m2);
}

/* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = (__vector signed int)vec_cmpeq(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] == __mu2.as_int[0]) ? -1 : 0;
  __res.as_int[1] = (__mu1.as_int[1] == __mu2.as_int[1]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqd(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi32(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = (__vector signed int)vec_cmpgt(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] > __mu2.as_int[0]) ? -1 : 0;
  __res.as_int[1] = (__mu1.as_int[1] > __mu2.as_int[1]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtd(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi32(__m1, __m2);
}

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi8(__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi16(__m1, __m2);
}
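
/* Example (illustrative): contrast with the wrapping _mm_add_pi8 above.

     __m64 __a = _mm_set1_pi8(100);
     __m64 __r = _mm_adds_pi8(__a, __a);
     // Each signed 8-bit lane saturates: 100 + 100 -> 127.
*/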
/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu8(__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats(__m1);
  __b = (__vector unsigned short)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu16(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi16(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats(__m1);
  __b = (__vector unsigned short)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu16(__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_madd_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b;
  __vector signed int __c;
  __vector signed int __zero = {0, 0, 0, 0};

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_vmsumshm(__a, __b, __zero);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmaddwd(__m64 __m1, __m64 __m2) {
  return _mm_madd_pi16(__m1, __m2);
}
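
/* Worked example (illustrative): products are formed at 32-bit precision,
   then adjacent pairs are summed.

     __m64 __a = _mm_set_pi16(4, 3, 2, 1);
     __m64 __b = _mm_set_pi16(8, 7, 6, 5);
     __m64 __r = _mm_madd_pi16(__a, __b);
     // Low 32-bit lane:  1*5 + 2*6 = 17
     // High 32-bit lane: 3*7 + 4*8 = 53
*/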
/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b;
  __vector signed short __c;
  __vector signed int __w0, __w1;
  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
      0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
  };

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);

  __w0 = vec_vmulesh(__a, __b);
  __w1 = vec_vmulosh(__a, __b);
  __c = (__vector signed short)vec_perm(__w0, __w1, __xform1);

  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmulhw(__m64 __m1, __m64 __m2) {
  return _mm_mulhi_pi16(__m1, __m2);
}
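
/* Worked example (illustrative): the upper halves of the full 32-bit
   products.

     __m64 __a = _mm_set1_pi16(0x4000);       // 16384 in each lane
     __m64 __r = _mm_mulhi_pi16(__a, __a);
     // 16384 * 16384 = 0x10000000; high 16 bits = 0x1000 in every lane.
*/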

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = __a * __b;
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmullw(__m64 __m1, __m64 __m2) {
  return _mm_mullo_pi16(__m1, __m2);
}

/* Shift four 16-bit values in M left by COUNT. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_pi16(__m64 __m, __m64 __count) {
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15) {
    __r = (__vector signed short)vec_splats(__m);
    __c = (__vector unsigned short)vec_splats((unsigned short)__count);
    __r = vec_sl(__r, (__vector unsigned short)__c);
    return (__m64)((__vector long long)__r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllw(__m64 __m, __m64 __count) {
  return _mm_sll_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sll_pi16. */
  return _mm_sll_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllwi(__m64 __m, int __count) {
  return _mm_slli_pi16(__m, __count);
}
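
/* Example (illustrative): per-lane left shift; counts greater than 15
   yield zero, matching the x86 MMX behavior.

     __m64 __a = _mm_set1_pi16(0x0101);
     __m64 __r = _mm_slli_pi16(__a, 8);   // each lane becomes 0x0100
     __m64 __z = _mm_slli_pi16(__a, 16);  // all lanes zero
*/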

/* Shift two 32-bit values in M left by COUNT. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_pi32(__m64 __m, __m64 __count) {
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = __res.as_int[0] << __count;
  __res.as_int[1] = __res.as_int[1] << __count;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslld(__m64 __m, __m64 __count) {
  return _mm_sll_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sll_pi32. */
  return _mm_sll_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslldi(__m64 __m, int __count) {
  return _mm_slli_pi32(__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_pi16(__m64 __m, __m64 __count) {
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15) {
    __r = (__vector signed short)vec_splats(__m);
    __c = (__vector unsigned short)vec_splats((unsigned short)__count);
    __r = vec_sra(__r, (__vector unsigned short)__c);
    return (__m64)((__vector long long)__r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psraw(__m64 __m, __m64 __count) {
  return _mm_sra_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sra_pi16. */
  return _mm_sra_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrawi(__m64 __m, int __count) {
  return _mm_srai_pi16(__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_pi32(__m64 __m, __m64 __count) {
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = __res.as_int[0] >> __count;
  __res.as_int[1] = __res.as_int[1] >> __count;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrad(__m64 __m, __m64 __count) {
  return _mm_sra_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sra_pi32. */
  return _mm_sra_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psradi(__m64 __m, int __count) {
  return _mm_srai_pi32(__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_pi16(__m64 __m, __m64 __count) {
  __vector unsigned short __r;
  __vector unsigned short __c;

  if (__count <= 15) {
    __r = (__vector unsigned short)vec_splats(__m);
    __c = (__vector unsigned short)vec_splats((unsigned short)__count);
    __r = vec_sr(__r, (__vector unsigned short)__c);
    return (__m64)((__vector long long)__r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlw(__m64 __m, __m64 __count) {
  return _mm_srl_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_srl_pi16. */
  return _mm_srl_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlwi(__m64 __m, int __count) {
  return _mm_srli_pi16(__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_pi32(__m64 __m, __m64 __count) {
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = (unsigned int)__res.as_int[0] >> __count;
  __res.as_int[1] = (unsigned int)__res.as_int[1] >> __count;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrld(__m64 __m, __m64 __count) {
  return _mm_srl_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_srl_pi32. */
  return _mm_srl_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrldi(__m64 __m, int __count) {
  return _mm_srli_pi32(__m, __count);
}
#endif /* _ARCH_PWR8 */

/* Creates a vector of two 32-bit values; I0 is least significant. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi32(int __i1, int __i0) {
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
  __m64_union __res;

  __res.as_short[0] = __w0;
  __res.as_short[1] = __w1;
  __res.as_short[2] = __w2;
  __res.as_short[3] = __w3;
  return (__res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
                char __b2, char __b1, char __b0) {
  __m64_union __res;

  __res.as_char[0] = __b0;
  __res.as_char[1] = __b1;
  __res.as_char[2] = __b2;
  __res.as_char[3] = __b3;
  __res.as_char[4] = __b4;
  __res.as_char[5] = __b5;
  __res.as_char[6] = __b6;
  __res.as_char[7] = __b7;
  return (__res.as_m64);
}

/* Similar, but with the arguments in reverse order. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi32(int __i0, int __i1) {
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
  return _mm_set_pi16(__w3, __w2, __w1, __w0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
                 char __b5, char __b6, char __b7) {
  return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}
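
/* Example (illustrative): _mm_set_* takes the most significant element
   first, _mm_setr_* the least significant first; these two calls build
   the same vector.

     __m64 __x = _mm_set_pi16(4, 3, 2, 1);
     __m64 __y = _mm_setr_pi16(1, 2, 3, 4);
     // __x == __y; lane 0 (least significant) holds 1.
*/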

/* Creates a vector of two 32-bit values, both elements containing I. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi32(int __i) {
  __m64_union __res;

  __res.as_int[0] = __i;
  __res.as_int[1] = __i;
  return (__res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi16(short __w) {
#if _ARCH_PWR9
  /* Use a reserved identifier here; the original `w` could collide with a
     user macro when this header is included. */
  __vector signed short __res;

  __res = (__vector signed short)vec_splats(__w);
  return (__m64)((__vector long long)__res)[0];
#else
  __m64_union __res;

  __res.as_short[0] = __w;
  __res.as_short[1] = __w;
  __res.as_short[2] = __w;
  __res.as_short[3] = __w;
  return (__res.as_m64);
#endif
}

/* Creates a vector of eight 8-bit values, all elements containing B. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi8(signed char __b) {
#if _ARCH_PWR8
  __vector signed char __res;

  __res = (__vector signed char)vec_splats(__b);
  return (__m64)((__vector long long)__res)[0];
#else
  __m64_union __res;

  __res.as_char[0] = __b;
  __res.as_char[1] = __b;
  __res.as_char[2] = __b;
  __res.as_char[3] = __b;
  __res.as_char[4] = __b;
  __res.as_char[5] = __b;
  __res.as_char[6] = __b;
  __res.as_char[7] = __b;
  return (__res.as_m64);
#endif
}

#else
#include_next <mmintrin.h>
#endif /* defined(__powerpc64__) &&                                            \
          (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* _MMINTRIN_H_INCLUDED */
__device__ int
static __inline__ vector unsigned char __ATTRS_o_ai vec_sr(vector unsigned char __a, vector unsigned char __b)
Definition: altivec.h:10393
static __inline__ vector bool char __ATTRS_o_ai vec_cmpeq(vector signed char __a, vector signed char __b)
Definition: altivec.h:1708
static __inline__ vector signed char __ATTRS_o_ai vec_sra(vector signed char __a, vector unsigned char __b)
Definition: altivec.h:10527
static __inline__ vector float vector float vector float __c
Definition: altivec.h:4800
static __inline__ vector float vector float __b
Definition: altivec.h:578
static __inline__ vector signed char __ATTRS_o_ai vec_splats(signed char __a)
Definition: altivec.h:14737
static __inline__ vector signed char __ATTRS_o_ai vec_mergel(vector signed char __a, vector signed char __b)
Definition: altivec.h:5361
static __inline__ vector signed char __ATTRS_o_ai vec_subs(vector signed char __a, vector signed char __b)
Definition: altivec.h:12149
static __inline__ vector signed char __ATTRS_o_ai vec_adds(vector signed char __a, vector signed char __b)
Definition: altivec.h:626
static __inline__ vector signed char __ATTRS_o_ai vec_perm(vector signed char __a, vector signed char __b, vector unsigned char __c)
Definition: altivec.h:7962
static __inline__ vector signed char __ATTRS_o_ai vec_sel(vector signed char __a, vector signed char __b, vector unsigned char __c)
Definition: altivec.h:8588
static __inline__ vector bool char __ATTRS_o_ai vec_cmplt(vector signed char __a, vector signed char __b)
Definition: altivec.h:2435
static __inline__ vector signed char __ATTRS_o_ai vec_pack(vector signed short __a, vector signed short __b)
Definition: altivec.h:7389
static __inline__ vector unsigned char __ATTRS_o_ai vec_sl(vector unsigned char __a, vector unsigned char __b)
Definition: altivec.h:8882
static __inline__ vector signed char __ATTRS_o_ai vec_add(vector signed char __a, vector signed char __b)
Definition: altivec.h:200
static __inline__ vector bool char __ATTRS_o_ai vec_cmpgt(vector signed char __a, vector signed char __b)
Definition: altivec.h:2131
static __inline__ vector signed char __ATTRS_o_ai vec_packs(vector short __a, vector short __b)
Definition: altivec.h:7715
static __inline__ vector signed char __ATTRS_o_ai vec_sub(vector signed char __a, vector signed char __b)
Definition: altivec.h:11869
static __inline__ void int __a
Definition: emmintrin.h:4057
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a, __m64 __b)
Subtracts signed or unsigned 64-bit integer values and writes the difference to the corresponding bit...
Definition: emmintrin.h:2542
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a, __m64 __b)
Adds two signed or unsigned 64-bit integer values, returning the lower 64 bits of the sum.
Definition: emmintrin.h:2111
#define _m_empty
Definition: mmintrin.h:1497
#define _m_pcmpeqd
Definition: mmintrin.h:1550
#define _m_pand
Definition: mmintrin.h:1544
#define _m_pslld
Definition: mmintrin.h:1530
#define _m_pcmpgtd
Definition: mmintrin.h:1553
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
Unpacks the upper 32 bits from two 64-bit integer vectors of [4 x i16] and interleaves them into a 64...
Definition: mmintrin.h:228
#define _m_pcmpgtb
Definition: mmintrin.h:1551
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_set1_pi16(short __w)
Constructs a 64-bit integer vector of [4 x i16], with each of the 16-bit integer vector elements set ...
Definition: mmintrin.h:1395
#define _m_psrlwi
Definition: mmintrin.h:1539
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_pi32(__m64 __m1, __m64 __m2)
Adds each 32-bit integer element of the first 64-bit integer vector of [2 x i32] to the corresponding...
Definition: mmintrin.h:383
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_srl_pi16(__m64 __m, __m64 __count)
Right-shifts each 16-bit integer element of the first parameter, which is a 64-bit integer vector of ...
Definition: mmintrin.h:952
#define _m_psllq
Definition: mmintrin.h:1532
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_srai_pi32(__m64 __m, int __count)
Right-shifts each 32-bit integer element of a 64-bit integer vector of [2 x i32] by the number of bit...
Definition: mmintrin.h:929
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
Constructs a 64-bit integer vector, initialized in reverse order with the specified 16-bit integer va...
Definition: mmintrin.h:1457
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8] and interleaves them into a 64-...
Definition: mmintrin.h:205
#define _m_packuswb
Definition: mmintrin.h:1504
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7)
Constructs a 64-bit integer vector, initialized in reverse order with the specified 8-bit integer val...
Definition: mmintrin.h:1488
#define _m_psllwi
Definition: mmintrin.h:1529
#define _m_packsswb
Definition: mmintrin.h:1502
#define _m_to_int64
Definition: mmintrin.h:1501
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_madd_pi16(__m64 __m1, __m64 __m2)
Multiplies each 16-bit signed integer element of the first 64-bit integer vector of [4 x i16] by the ...
Definition: mmintrin.h:663
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_srl_si64(__m64 __m, __m64 __count)
Right-shifts the first 64-bit integer parameter by the number of bits specified by the second 64-bit ...
Definition: mmintrin.h:1039
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_subs_pu16(__m64 __m1, __m64 __m2)
Subtracts each 16-bit unsigned integer element of the second 64-bit integer vector of [4 x i16] from ...
Definition: mmintrin.h:636
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_adds_pi8(__m64 __m1, __m64 __m2)
Adds, with saturation, each 8-bit signed integer element of the first 64-bit integer vector of [8 x i...
Definition: mmintrin.h:407
#define _m_paddb
Definition: mmintrin.h:1511
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8] and interleaves them into a 64-...
Definition: mmintrin.h:276
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_subs_pi16(__m64 __m1, __m64 __m2)
Subtracts, with saturation, each 16-bit signed integer element of the second 64-bit integer vector of...
Definition: mmintrin.h:588
#define _m_paddusw
Definition: mmintrin.h:1517
long long __m64 __attribute__((__vector_size__(8), __aligned__(8)))
Definition: mmintrin.h:17
#define _m_psubusb
Definition: mmintrin.h:1523
#define _m_to_int
Definition: mmintrin.h:1500
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
Compares the 16-bit integer elements of two 64-bit integer vectors of [4 x i16] to determine if the e...
Definition: mmintrin.h:1245
#define _m_punpckhdq
Definition: mmintrin.h:1507
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sll_si64(__m64 __m, __m64 __count)
Left-shifts the first 64-bit integer parameter by the number of bits specified by the second 64-bit i...
Definition: mmintrin.h:815
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_pi8(__m64 __m1, __m64 __m2)
Subtracts each 8-bit integer element of the second 64-bit integer vector of [8 x i8] from the corresp...
Definition: mmintrin.h:498
static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtm64_si64(__m64 __m)
Casts a 64-bit integer vector into a 64-bit signed integer value.
Definition: mmintrin.h:103
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_subs_pi8(__m64 __m1, __m64 __m2)
Subtracts, with saturation, each 8-bit signed integer element of the second 64-bit integer vector of ...
Definition: mmintrin.h:564
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_adds_pi16(__m64 __m1, __m64 __m2)
Adds, with saturation, each 16-bit signed integer element of the first 64-bit integer vector of [4 x ...
Definition: mmintrin.h:431
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
Compares the 32-bit integer elements of two 64-bit integer vectors of [2 x i32] to determine if the e...
Definition: mmintrin.h:1267
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
Compares the 8-bit integer elements of two 64-bit integer vectors of [8 x i8] to determine if the ele...
Definition: mmintrin.h:1223
#define _m_pcmpeqw
Definition: mmintrin.h:1549
#define _m_psllw
Definition: mmintrin.h:1528
#define _m_por
Definition: mmintrin.h:1546
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_packs_pu16(__m64 __m1, __m64 __m2)
Converts, with saturation, 16-bit signed integers from both 64-bit integer vector parameters of [4 x ...
Definition: mmintrin.h:178
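Packing with unsigned saturation clamps each signed 16-bit input to [0, 255]. A hedged sketch with hypothetical inputs, assuming the Intel result layout (first operand supplies the low four result bytes):

  #include <mmintrin.h>

  __m64 pack_demo(void) {
    __m64 lo = _mm_setr_pi16(-5, 0, 300, 128);
    __m64 hi = _mm_setzero_si64();
    /* low 4 result bytes: 0, 0, 255, 128; high 4 bytes: 0 */
    return _mm_packs_pu16(lo, hi);
  }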
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtsi32_si64(int __i)
Constructs a 64-bit integer vector, setting the lower 32 bits to the value of the 32-bit integer para...
Definition: mmintrin.h:54
#define _m_punpckhwd
Definition: mmintrin.h:1506
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_set1_pi32(int __i)
Constructs a 64-bit integer vector of [2 x i32], with each of the 32-bit integer vector elements set ...
Definition: mmintrin.h:1376
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_slli_pi16(__m64 __m, int __count)
Left-shifts each 16-bit signed integer element of a 64-bit integer vector of [4 x i16] by the number ...
Definition: mmintrin.h:750
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_set1_pi8(char __b)
Constructs a 64-bit integer vector of [8 x i8], with each of the 8-bit integer vector elements set to...
Definition: mmintrin.h:1413
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_packs_pi32(__m64 __m1, __m64 __m2)
Converts, with saturation, 32-bit signed integers from both 64-bit integer vector parameters of [2 x ...
Definition: mmintrin.h:153
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_pi16(__m64 __m1, __m64 __m2)
Adds each 16-bit integer element of the first 64-bit integer vector of [4 x i16] to the corresponding...
Definition: mmintrin.h:362
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_subs_pu8(__m64 __m1, __m64 __m2)
Subtracts each 8-bit unsigned integer element of the second 64-bit integer vector of [8 x i8] from th...
Definition: mmintrin.h:612
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_adds_pu8(__m64 __m1, __m64 __m2)
Adds, with saturation, each 8-bit unsigned integer element of the first 64-bit integer vector of [8 x...
Definition: mmintrin.h:454
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_or_si64(__m64 __m1, __m64 __m2)
Performs a bitwise OR of two 64-bit integer vectors.
Definition: mmintrin.h:1117
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_slli_si64(__m64 __m, int __count)
Left-shifts the first parameter, which is a 64-bit integer, by the number of bits specified by the se...
Definition: mmintrin.h:835
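The _si64 shifts move the entire 64-bit value rather than individual lanes, so bits cross lane boundaries. A small sketch:

  #include <mmintrin.h>

  __m64 shift_demo(void) {
    __m64 one = _mm_cvtsi64_m64(1LL);
    return _mm_slli_si64(one, 8); /* 0x1 becomes 0x100 */
  }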
#define _m_psrlqi
Definition: mmintrin.h:1543
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sra_pi32(__m64 __m, __m64 __count)
Right-shifts each 32-bit integer element of the first parameter, which is a 64-bit integer vector of ...
Definition: mmintrin.h:906
#define _m_punpckhbw
Definition: mmintrin.h:1505
#define _m_paddsb
Definition: mmintrin.h:1514
#define _m_psllqi
Definition: mmintrin.h:1533
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_srl_pi32(__m64 __m, __m64 __count)
Right-shifts each 32-bit integer element of the first parameter, which is a 64-bit integer vector of ...
Definition: mmintrin.h:997
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
Unpacks the lower 32 bits from two 64-bit integer vectors of [4 x i16] and interleaves them into a 64...
Definition: mmintrin.h:299
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
Compares the 16-bit integer elements of two 64-bit integer vectors of [4 x i16] to determine if the e...
Definition: mmintrin.h:1179
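The comparisons return all-ones (0xFFFF) in lanes where the predicate holds and zero elsewhere, which composes with the bitwise operations into a branchless per-lane select. A sketch (select_eq and key are illustrative):

  #include <mmintrin.h>

  /* per-lane: (a == key) ? a : b */
  __m64 select_eq(__m64 a, __m64 b, short key) {
    __m64 mask = _mm_cmpeq_pi16(a, _mm_set1_pi16(key));
    return _mm_or_si64(_mm_and_si64(mask, a), _mm_andnot_si64(mask, b));
  }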
#define _m_pslldi
Definition: mmintrin.h:1531
#define _m_pmullw
Definition: mmintrin.h:1527
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_srli_pi16(__m64 __m, int __count)
Right-shifts each 16-bit integer element of a 64-bit integer vector of [4 x i16] by the number of bit...
Definition: mmintrin.h:974
#define _m_psubsb
Definition: mmintrin.h:1521
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_srli_si64(__m64 __m, int __count)
Right-shifts the first parameter, which is a 64-bit integer, by the number of bits specified by the s...
Definition: mmintrin.h:1060
#define _m_pcmpgtw
Definition: mmintrin.h:1552
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
Constructs a 64-bit integer vector initialized with the specified 16-bit integer values.
Definition: mmintrin.h:1324
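Note the argument order: _mm_set_pi16 takes its arguments high lane first (the last argument lands in the low 16 bits), while _mm_setr_pi16 takes them low lane first, so the two calls below build the same vector:

  #include <mmintrin.h>

  __m64 same_vector(void) {
    __m64 a = _mm_set_pi16(3, 2, 1, 0);
    __m64 b = _mm_setr_pi16(0, 1, 2, 3);
    (void)b; /* a and b hold identical bit patterns */
    return a;
  }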
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
Compares the 32-bit integer elements of two 64-bit integer vectors of [2 x i32] to determine if the e...
Definition: mmintrin.h:1201
#define _m_pcmpeqb
Definition: mmintrin.h:1548
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sll_pi32(__m64 __m, __m64 __count)
Left-shifts each 32-bit signed integer element of the first parameter, which is a 64-bit integer vect...
Definition: mmintrin.h:773
#define _m_psrldi
Definition: mmintrin.h:1541
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_setr_pi32(int __i0, int __i1)
Constructs a 64-bit integer vector, initialized in reverse order with the specified 32-bit integer va...
Definition: mmintrin.h:1434
#define _m_from_int
Definition: mmintrin.h:1498
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_pi8(__m64 __m1, __m64 __m2)
Adds each 8-bit integer element of the first 64-bit integer vector of [8 x i8] to the corresponding 8...
Definition: mmintrin.h:341
#define _m_paddd
Definition: mmintrin.h:1513
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_srai_pi16(__m64 __m, int __count)
Right-shifts each 16-bit integer element of a 64-bit integer vector of [4 x i16] by the number of bit...
Definition: mmintrin.h:882
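The srai/sra forms shift in copies of the sign bit, while the srli/srl forms shift in zeros, so the two only agree on non-negative lanes. A sketch with a hypothetical negative input:

  #include <mmintrin.h>

  void shift_compare(void) {
    __m64 v = _mm_set1_pi16(-16);      /* 0xFFF0 per lane */
    __m64 arith = _mm_srai_pi16(v, 2); /* -4 per lane (0xFFFC) */
    __m64 logic = _mm_srli_pi16(v, 2); /* 16380 per lane (0x3FFC) */
    (void)arith; (void)logic;
  }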
#define _m_psubw
Definition: mmintrin.h:1519
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_set_pi32(int __i1, int __i0)
Constructs a 64-bit integer vector initialized with the specified 32-bit integer values.
Definition: mmintrin.h:1301
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_adds_pu16(__m64 __m1, __m64 __m2)
Adds, with saturation, each 16-bit unsigned integer element of the first 64-bit integer vector of [4 ...
Definition: mmintrin.h:477
#define _m_psrawi
Definition: mmintrin.h:1535
#define _m_psubb
Definition: mmintrin.h:1518
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_xor_si64(__m64 __m1, __m64 __m2)
Performs a bitwise exclusive OR of two 64-bit integer vectors.
Definition: mmintrin.h:1135
#define _m_from_int64
Definition: mmintrin.h:1499
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtsi64_m64(long long __i)
Casts a 64-bit signed integer value into a 64-bit integer vector.
Definition: mmintrin.h:87
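_mm_cvtsi64_m64 and _mm_cvtm64_si64 are bit-pattern casts in opposite directions, so they round-trip a 64-bit scalar losslessly:

  #include <mmintrin.h>

  long long round_trip(long long x) {
    __m64 v = _mm_cvtsi64_m64(x);
    return _mm_cvtm64_si64(v); /* returns x unchanged */
  }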
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
Unpacks the upper 32 bits from two 64-bit integer vectors of [2 x i32] and interleaves them into a 64...
Definition: mmintrin.h:249
#define _m_psubsw
Definition: mmintrin.h:1522
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mullo_pi16(__m64 __m1, __m64 __m2)
Multiplies each 16-bit signed integer element of the first 64-bit integer vector of [4 x i16] by the ...
Definition: mmintrin.h:705
#define _m_punpcklwd
Definition: mmintrin.h:1509
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_slli_pi32(__m64 __m, int __count)
Left-shifts each 32-bit signed integer element of a 64-bit integer vector of [2 x i32] by the number ...
Definition: mmintrin.h:795
#define _m_pxor
Definition: mmintrin.h:1547
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
Unpacks the lower 32 bits from two 64-bit integer vectors of [2 x i32] and interleaves them into a 64...
Definition: mmintrin.h:320
#define _m_packssdw
Definition: mmintrin.h:1503
#define _m_pmulhw
Definition: mmintrin.h:1526
#define _m_psrld
Definition: mmintrin.h:1540
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_setzero_si64(void)
Constructs a 64-bit integer vector initialized to zero.
Definition: mmintrin.h:1280
#define _m_paddw
Definition: mmintrin.h:1512
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sll_pi16(__m64 __m, __m64 __count)
Left-shifts each 16-bit signed integer element of the first parameter, which is a 64-bit integer vect...
Definition: mmintrin.h:728
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sra_pi16(__m64 __m, __m64 __count)
Right-shifts each 16-bit integer element of the first parameter, which is a 64-bit integer vector of ...
Definition: mmintrin.h:859
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_pi32(__m64 __m1, __m64 __m2)
Subtracts each 32-bit integer element of the second 64-bit integer vector of [2 x i32] from the corre...
Definition: mmintrin.h:540
#define _m_psraw
Definition: mmintrin.h:1534
#define _m_psubd
Definition: mmintrin.h:1520
#define _m_paddsw
Definition: mmintrin.h:1515
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi64_si32(__m64 __m)
Returns the lower 32 bits of a 64-bit integer vector as a 32-bit signed integer.
Definition: mmintrin.h:71
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
Compares the 8-bit integer elements of two 64-bit integer vectors of [8 x i8] to determine if the ele...
Definition: mmintrin.h:1157
#define _m_psrlq
Definition: mmintrin.h:1542
#define _m_psubusw
Definition: mmintrin.h:1524
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
Constructs a 64-bit integer vector initialized with the specified 8-bit integer values.
Definition: mmintrin.h:1355
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_packs_pi16(__m64 __m1, __m64 __m2)
Converts, with saturation, 16-bit signed integers from both 64-bit integer vector parameters of [4 x ...
Definition: mmintrin.h:128
#define _m_pandn
Definition: mmintrin.h:1545
#define _m_psradi
Definition: mmintrin.h:1537
#define _m_paddusb
Definition: mmintrin.h:1516
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_srli_pi32(__m64 __m, int __count)
Right-shifts each 32-bit integer element of a 64-bit integer vector of [2 x i32] by the number of bit...
Definition: mmintrin.h:1019
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
Multiplies each 16-bit signed integer element of the first 64-bit integer vector of [4 x i16] by the ...
Definition: mmintrin.h:684
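Since _mm_mullo_pi16 keeps the low 16 bits of each product and _mm_mulhi_pi16 keeps the high 16 bits, interleaving the two yields the full 32-bit signed products. A sketch of that standard pairing (full_products and the out-parameters are illustrative):

  #include <mmintrin.h>

  void full_products(__m64 a, __m64 b, __m64 *lo2, __m64 *hi2) {
    __m64 lo = _mm_mullo_pi16(a, b);
    __m64 hi = _mm_mulhi_pi16(a, b);
    *lo2 = _mm_unpacklo_pi16(lo, hi); /* products of lanes 0 and 1 as [2 x i32] */
    *hi2 = _mm_unpackhi_pi16(lo, hi); /* products of lanes 2 and 3 as [2 x i32] */
  }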
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_andnot_si64(__m64 __m1, __m64 __m2)
Performs a bitwise NOT of the first 64-bit integer vector, and then performs a bitwise AND of the int...
Definition: mmintrin.h:1099
#define _m_pmaddwd
Definition: mmintrin.h:1525
#define _m_psrad
Definition: mmintrin.h:1536
#define _m_punpcklbw
Definition: mmintrin.h:1508
#define _m_punpckldq
Definition: mmintrin.h:1510
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_pi16(__m64 __m1, __m64 __m2)
Subtracts each 16-bit integer element of the second 64-bit integer vector of [4 x i16] from the corre...
Definition: mmintrin.h:519
#define _m_psrlw
Definition: mmintrin.h:1538
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_and_si64(__m64 __m1, __m64 __m2)
Performs a bitwise AND of two 64-bit integer vectors.
Definition: mmintrin.h:1078