/*===---- smmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.

   NOTE: This is NOT a complete implementation of the SSE4 intrinsics!  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64/powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#error                                                                         \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef SMMINTRIN_H_
#define SMMINTRIN_H_

#if defined(__powerpc64__) &&                                                  \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>
#include <tmmintrin.h>

/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_ZERO 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_NEG_INF 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)

#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_pd(__m128d __A, int __rounding) {
  __v2df __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode.  */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_ppc_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v2df)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_ppc_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v2df)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v2df)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v2df)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
    __r = vec_rint((__v2df)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions.  */
#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128d)__r;
}
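
/* A minimal usage sketch for _mm_round_pd (the _mm_set_pd helper is assumed
   to come from the companion emmintrin.h wrapper pulled in via tmmintrin.h):

     __m128d __v = _mm_set_pd(2.5, -1.5);
     __m128d __n = _mm_round_pd(__v, _MM_FROUND_TO_NEAREST_INT);
     // Ties round to even: {-2.0, 2.0}.
     __m128d __t = _mm_round_pd(__v, _MM_FROUND_TO_ZERO);
     // Truncated toward zero: {-1.0, 2.0}.
 */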

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_sd(__m128d __A, __m128d __B, int __rounding) {
  __B = _mm_round_pd(__B, __rounding);
  __v2df __r = {((__v2df)__B)[0], ((__v2df)__A)[1]};
  return (__m128d)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ps(__m128 __A, int __rounding) {
  __v4sf __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode.  */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_ppc_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v4sf)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_ppc_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v4sf)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v4sf)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v4sf)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
    __r = vec_rint((__v4sf)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions.  */
#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ss(__m128 __A, __m128 __B, int __rounding) {
  __B = _mm_round_ps(__B, __rounding);
  __v4sf __r = (__v4sf)__A;
  __r[0] = ((__v4sf)__B)[0];
  return (__m128)__r;
}

#define _mm_ceil_pd(V) _mm_round_pd((V), _MM_FROUND_CEIL)
#define _mm_ceil_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
#define _mm_floor_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_FLOOR)

#define _mm_ceil_ps(V) _mm_round_ps((V), _MM_FROUND_CEIL)
#define _mm_ceil_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_ps(V) _mm_round_ps((V), _MM_FROUND_FLOOR)
#define _mm_floor_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_FLOOR)
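
/* A minimal sketch of the ceil/floor convenience macros (the _mm_set_ps
   helper is assumed to come from the companion xmmintrin.h wrapper):

     __m128 __v = _mm_set_ps(1.25f, -1.25f, 3.5f, -3.5f);
     __m128 __f = _mm_floor_ps(__v); // {-4.0f, 3.0f, -2.0f, 1.0f}
     __m128 __c = _mm_ceil_ps(__v);  // {-3.0f, 4.0f, -1.0f, 2.0f}
 */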

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
  __v16qi __result = (__v16qi)__A;

  __result[__N & 0xf] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
  __v4si __result = (__v4si)__A;

  __result[__N & 3] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
  __v2di __result = (__v2di)__A;

  __result[__N & 1] = __D;

  return (__m128i)__result;
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi8(__m128i __X, const int __N) {
  return (unsigned char)((__v16qi)__X)[__N & 15];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi32(__m128i __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi64(__m128i __X, const int __N) {
  return ((__v2di)__X)[__N & 1];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_ps(__m128 __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}
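
/* A minimal insert/extract sketch (the _mm_set_epi32 helper is assumed to
   come from the companion emmintrin.h wrapper).  Note that _mm_extract_ps
   returns the raw bit pattern of the selected float element as an int:

     __m128i __x = _mm_set_epi32(3, 2, 1, 0);
     __m128i __y = _mm_insert_epi32(__x, 42, 2); // element 2 becomes 42
     int __e0 = _mm_extract_epi32(__y, 0);       // 0
     int __e2 = _mm_extract_epi32(__y, 2);       // 42
 */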

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
  __v16qu __charmask = vec_splats((unsigned char)__imm8);
  __charmask = vec_gb(__charmask);
  __v8hu __shortmask = (__v8hu)vec_unpackh((__v16qi)__charmask);
#ifdef __BIG_ENDIAN__
  __shortmask = vec_reve(__shortmask);
#endif
  return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
#ifdef _ARCH_PWR10
  return (__m128i)vec_blendv((__v16qi)__A, (__v16qi)__B, (__v16qu)__mask);
#else
  const __v16qu __seven = vec_splats((unsigned char)0x07);
  __v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
  return (__m128i)vec_sel((__v16qi)__A, (__v16qi)__B, __lmask);
#endif
}
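
/* A minimal sketch: only the high bit of each mask byte selects, so a mask
   byte of 0x80 takes __B and 0x00 keeps __A (the _mm_set1_epi8 helper is
   assumed to come from the companion emmintrin.h wrapper):

     __m128i __a = _mm_set1_epi8(1);
     __m128i __b = _mm_set1_epi8(2);
     __m128i __m = _mm_set1_epi8((char)0x80);
     __m128i __r = _mm_blendv_epi8(__a, __b, __m); // every byte is 2
 */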

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_ps(__m128 __A, __m128 __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
  };
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128)__r;
}
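
/* A minimal sketch: each of the low four bits of __imm8 selects the
   corresponding 32-bit element from __B (bit set) or __A (bit clear); the
   16-entry table above precomputes one permute control vector per mask
   value.  The _mm_set_ps helper is assumed to come from the companion
   xmmintrin.h wrapper:

     __m128 __a = _mm_set_ps(7.0f, 6.0f, 5.0f, 4.0f);
     __m128 __b = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
     __m128 __r = _mm_blend_ps(__a, __b, 0x5); // {0.0f, 5.0f, 2.0f, 7.0f}
 */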

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_ps(__m128 __A, __m128 __B, __m128 __mask) {
#ifdef _ARCH_PWR10
  return (__m128)vec_blendv((__v4sf)__A, (__v4sf)__B, (__v4su)__mask);
#else
  const __v4si __zero = {0};
  const __vector __bool int __boolmask = vec_cmplt((__v4si)__mask, __zero);
  return (__m128)vec_sel((__v4su)__A, (__v4su)__B, (__v4su)__boolmask);
#endif
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_pd(__m128d __A, __m128d __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}};
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128d)__r;
}

#ifdef _ARCH_PWR8
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_pd(__m128d __A, __m128d __B, __m128d __mask) {
#ifdef _ARCH_PWR10
  return (__m128d)vec_blendv((__v2df)__A, (__v2df)__B, (__v2du)__mask);
#else
  const __v2di __zero = {0};
  const __vector __bool long long __boolmask =
      vec_cmplt((__v2di)__mask, __zero);
  return (__m128d)vec_sel((__v2du)__A, (__v2du)__B, (__v2du)__boolmask);
#endif
}
#endif

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testz_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  const __v16qu __zero = {0};
  return vec_all_eq(vec_and((__v16qu)__A, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  const __v16qu __zero = {0};
  const __v16qu __notA = vec_nor((__v16qu)__A, (__v16qu)__A);
  return vec_all_eq(vec_and((__v16qu)__notA, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testnzc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  return _mm_testz_si128(__A, __B) == 0 && _mm_testc_si128(__A, __B) == 0;
}

#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))

#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))

#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
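
/* A minimal sketch of the test intrinsics above (the _mm_set_epi32 helper
   is assumed to come from the companion emmintrin.h wrapper):

     __m128i __m = _mm_set_epi32(0, 0, 0, 3);
     _mm_testz_si128(__m, _mm_set_epi32(0, 0, 0, 4));   // 1: no common bits
     _mm_testc_si128(__m, _mm_set_epi32(0, 0, 0, 1));   // 1: 1 lies within 3
     _mm_testnzc_si128(__m, _mm_set_epi32(0, 0, 0, 5)); // 1: partial overlap
 */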

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpeq((__v2di)__X, (__v2di)__Y);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi8(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu16(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi8(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu16(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_mul((__v4su)__X, (__v4su)__Y);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_mule((__v4si)__X, (__v4si)__Y);
}
#endif
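
/* A minimal sketch: vec_mule multiplies the even-numbered 32-bit elements
   (0 and 2), which matches the Intel semantics of widening those lanes to
   64-bit products.  The _mm_set_epi32 helper is assumed to come from the
   companion emmintrin.h wrapper:

     __m128i __x = _mm_set_epi32(9, 100000, 9, -100000);
     __m128i __y = _mm_set_epi32(9, 100000, 9, 100000);
     __m128i __p = _mm_mul_epi32(__x, __y);
     // 64-bit elements: {-10000000000, 10000000000}
 */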

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi16(__m128i __A) {
  return (__m128i)vec_unpackh((__v16qi)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi32(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v16qi)__A);
  return (__m128i)vec_unpackh((__v8hi)__A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi64(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v16qi)__A);
  __A = (__m128i)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi32(__m128i __A) {
  return (__m128i)vec_unpackh((__v8hi)__A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi64(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi32_epi64(__m128i __A) {
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi16(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
#else /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi32(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
#else /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi64(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
  __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
  __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu16_epi32(__m128i __A) {
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
#else /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu16_epi64(__m128i __A) {
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
  __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu32_epi64(__m128i __A) {
  const __v4su __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v4su)__A, __zero);
#else /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}
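
/* A minimal sketch contrasting the sign-extending and zero-extending
   conversions (the _mm_set1_epi8 helper is assumed to come from the
   companion emmintrin.h wrapper):

     __m128i __x = _mm_set1_epi8((char)0xff);
     __m128i __s = _mm_cvtepi8_epi16(__x); // each 16-bit element: -1
     __m128i __z = _mm_cvtepu8_epi16(__x); // each 16-bit element: 255
 */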

/* Return horizontal packed word minimum and its index in bits [15:0]
   and bits [18:16] respectively.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_minpos_epu16(__m128i __A) {
  union __u {
    __m128i __m;
    __v8hu __uh;
  };
  union __u __u = {.__m = __A}, __r = {.__m = {0}};
  unsigned short __ridx = 0;
  unsigned short __rmin = __u.__uh[__ridx];
  unsigned long __i;
  for (__i = 1; __i < 8; __i++) {
    if (__u.__uh[__i] < __rmin) {
      __rmin = __u.__uh[__i];
      __ridx = __i;
    }
  }
  __r.__uh[0] = __rmin;
  __r.__uh[1] = __ridx;
  return __r.__m;
}
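
/* A minimal sketch (the _mm_set_epi16 and _mm_extract_epi16 helpers are
   assumed to come from the companion emmintrin.h wrapper):

     __m128i __v = _mm_set_epi16(7, 6, 5, 4, 3, 9, 1, 8);
     __m128i __r = _mm_minpos_epu16(__v);
     // _mm_extract_epi16(__r, 0) == 1  (the minimum value)
     // _mm_extract_epi16(__r, 1) == 1  (its element index)
 */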

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packus_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_packsu((__v4si)__X, (__v4si)__Y);
}
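
/* A minimal sketch of the unsigned saturation (the _mm_set_epi32 helper is
   assumed to come from the companion emmintrin.h wrapper):

     __m128i __x = _mm_set_epi32(70000, 65535, 1, -5);
     __m128i __r = _mm_packus_epi32(__x, __x);
     // 16-bit elements 0..3: {0, 1, 65535, 65535}; negative values clamp
     // to 0 and values above 65535 clamp to 65535.
 */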

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpgt((__v2di)__X, (__v2di)__Y);
}
#endif

#else
#include_next <smmintrin.h>
#endif /* defined(__powerpc64__) &&                                            \
        *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* SMMINTRIN_H_ */