1 /*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 /* Implemented from the specification included in the Intel C++ Compiler
11  User Guide and Reference, version 9.0. */
12 
13 #ifndef NO_WARN_X86_INTRINSICS
14 /* This header file is to help port code that uses Intel intrinsics
15  explicitly from x86_64 to powerpc64/powerpc64le.
16 
17  Since the X86 SSE2 intrinsics mainly handle the __m128i and __m128d types,
18  the PowerPC VMX/VSX ISA is a good match for vector double SIMD operations.
19  However, scalar double operations in vector (XMM) registers require
20  the POWER8 VSX ISA (2.07) level. There are differences in the data
21  format and placement of double scalars in the vector register, which
22  require extra steps to match SSE2 scalar double semantics on POWER.
23 
24  It should be noted that there are significant differences between X86_64's
25  MXCSR and PowerISA's FPSCR/VSCR registers. It's recommended to use
26  the portable <fenv.h> interface instead of accessing the MXCSR directly.
27 
28  Most SSE2 scalar double intrinsic operations can be performed more
29  efficiently as C language double scalar operations or optimized to
30  use vector SIMD operations. We recommend this for new applications.
31 */
32 #error \
33  "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
34 #endif
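
/* For example, a port might build with this warning disabled and POWER8
   VSX enabled. This invocation is illustrative only; target triple and
   CPU level depend on the platform being ported to:

     clang -target powerpc64le-linux-gnu -mcpu=power8 \
           -DNO_WARN_X86_INTRINSICS -c foo.c
*/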
35 
36 #ifndef EMMINTRIN_H_
37 #define EMMINTRIN_H_
38 
39 #if defined(__powerpc64__) && \
40  (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
41 
42 #include <altivec.h>
43 
44 /* We need definitions from the SSE header files. */
45 #include <xmmintrin.h>
46 
47 /* SSE2 */
48 typedef __vector double __v2df;
49 typedef __vector float __v4f;
50 typedef __vector long long __v2di;
51 typedef __vector unsigned long long __v2du;
52 typedef __vector int __v4si;
53 typedef __vector unsigned int __v4su;
54 typedef __vector short __v8hi;
55 typedef __vector unsigned short __v8hu;
56 typedef __vector signed char __v16qi;
57 typedef __vector unsigned char __v16qu;
58 
59 /* The Intel API is flexible enough that we must allow aliasing with other
60  vector types, and their scalar components. */
61 typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
62 typedef double __m128d __attribute__((__vector_size__(16), __may_alias__));
63 
64 /* Unaligned version of the same types. */
65 typedef long long __m128i_u
66  __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
67 typedef double __m128d_u
68  __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
69 
70 /* Define a two-element permute mask. */
71 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
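
/* Usage sketch (illustrative): _MM_SHUFFLE2 packs two element selectors
   into the 2-bit immediate expected by _mm_shuffle_pd below. Bit 0 selects
   the low result element from the first operand and bit 1 selects the high
   result element from the second:

     __m128d __r = _mm_shuffle_pd(__a, __b, _MM_SHUFFLE2(0, 1));
     // __r[0] == __a[1], __r[1] == __b[0]
*/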
72 
73 /* Create a vector with element 0 as F and the rest zero. */
74 extern __inline __m128d
75  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
76  _mm_set_sd(double __F) {
77  return __extension__(__m128d){__F, 0.0};
78 }
79 
80 /* Create a vector with both elements equal to F. */
81 extern __inline __m128d
82  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
83  _mm_set1_pd(double __F) {
84  return __extension__(__m128d){__F, __F};
85 }
86 
87 extern __inline __m128d
88  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
89  _mm_set_pd1(double __F) {
90  return _mm_set1_pd(__F);
91 }
92 
93 /* Create a vector with the lower value X and upper value W. */
94 extern __inline __m128d
95  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
96  _mm_set_pd(double __W, double __X) {
97  return __extension__(__m128d){__X, __W};
98 }
99 
100 /* Create a vector with the lower value W and upper value X. */
101 extern __inline __m128d
102  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
103  _mm_setr_pd(double __W, double __X) {
104  return __extension__(__m128d){__W, __X};
105 }
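
/* Note the argument-order difference between the two constructors
   (a small sketch):

     __m128d __v1 = _mm_set_pd(2.0, 1.0);  // element [0] = 1.0, [1] = 2.0
     __m128d __v2 = _mm_setr_pd(1.0, 2.0); // same layout: [0] = 1.0, [1] = 2.0
*/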
106 
107 /* Create an undefined vector. */
108 extern __inline __m128d
109  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
110  _mm_undefined_pd(void) {
111  __m128d __Y = __Y;
112  return __Y;
113 }
114 
115 /* Create a vector of zeros. */
116 extern __inline __m128d
117  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
118  _mm_setzero_pd(void) {
119  return (__m128d)vec_splats(0);
120 }
121 
122 /* Sets the low DPFP value of A from the low value of B. */
123 extern __inline __m128d
124  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
125  _mm_move_sd(__m128d __A, __m128d __B) {
126  __v2df __result = (__v2df)__A;
127  __result[0] = ((__v2df)__B)[0];
128  return (__m128d)__result;
129 }
130 
131 /* Load two DPFP values from P. The address must be 16-byte aligned. */
132 extern __inline __m128d
133  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
134  _mm_load_pd(double const *__P) {
135  return ((__m128d)vec_ld(0, (__v16qu *)__P));
136 }
137 
138 /* Load two DPFP values from P. The address need not be 16-byte aligned. */
139 extern __inline __m128d
140  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
141  _mm_loadu_pd(double const *__P) {
142  return (vec_vsx_ld(0, __P));
143 }
144 
145 /* Create a vector with both elements equal to *P. */
146 extern __inline __m128d
147  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
148  _mm_load1_pd(double const *__P) {
149  return (vec_splats(*__P));
150 }
151 
152 /* Create a vector with element 0 as *P and the rest zero. */
153 extern __inline __m128d
154  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
155  _mm_load_sd(double const *__P) {
156  return _mm_set_sd(*__P);
157 }
158 
159 extern __inline __m128d
160  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
161  _mm_load_pd1(double const *__P) {
162  return _mm_load1_pd(__P);
163 }
164 
165 /* Load two DPFP values in reverse order. The address must be aligned. */
166 extern __inline __m128d
167  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
168  _mm_loadr_pd(double const *__P) {
169  __v2df __tmp = _mm_load_pd(__P);
170  return (__m128d)vec_xxpermdi(__tmp, __tmp, 2);
171 }
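
/* Load-family sketch (illustrative): _mm_load_pd requires a 16-byte
   aligned pointer, while _mm_loadu_pd tolerates any alignment:

     double __buf[2] __attribute__((aligned(16))) = {1.0, 2.0};
     __m128d __x = _mm_load_pd(__buf);  // aligned load: {1.0, 2.0}
     __m128d __y = _mm_loadr_pd(__buf); // reversed:     {2.0, 1.0}
*/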
172 
173 /* Store two DPFP values. The address must be 16-byte aligned. */
174 extern __inline void
175  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
176  _mm_store_pd(double *__P, __m128d __A) {
177  vec_st((__v16qu)__A, 0, (__v16qu *)__P);
178 }
179 
180 /* Store two DPFP values. The address need not be 16-byte aligned. */
181 extern __inline void
182  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
183  _mm_storeu_pd(double *__P, __m128d __A) {
184  *(__m128d_u *)__P = __A;
185 }
186 
187 /* Stores the lower DPFP value. */
188 extern __inline void
189  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
190  _mm_store_sd(double *__P, __m128d __A) {
191  *__P = ((__v2df)__A)[0];
192 }
193 
194 extern __inline double
195  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
196  _mm_cvtsd_f64(__m128d __A) {
197  return ((__v2df)__A)[0];
198 }
199 
200 extern __inline void
201  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
202  _mm_storel_pd(double *__P, __m128d __A) {
203  _mm_store_sd(__P, __A);
204 }
205 
206 /* Stores the upper DPFP value. */
207 extern __inline void
208  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
209  _mm_storeh_pd(double *__P, __m128d __A) {
210  *__P = ((__v2df)__A)[1];
211 }
212 /* Store the lower DPFP value across two words.
213  The address must be 16-byte aligned. */
214 extern __inline void
215  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
216  _mm_store1_pd(double *__P, __m128d __A) {
217  _mm_store_pd(__P, vec_splat(__A, 0));
218 }
219 
220 extern __inline void
221  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
222  _mm_store_pd1(double *__P, __m128d __A) {
223  _mm_store1_pd(__P, __A);
224 }
225 
226 /* Store two DPFP values in reverse order. The address must be aligned. */
227 extern __inline void
228  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
229  _mm_storer_pd(double *__P, __m128d __A) {
230  _mm_store_pd(__P, vec_xxpermdi(__A, __A, 2));
231 }
232 
233 /* Intel intrinsic. */
234 extern __inline long long
235  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
236  _mm_cvtsi128_si64(__m128i __A) {
237  return ((__v2di)__A)[0];
238 }
239 
240 /* Microsoft intrinsic. */
241 extern __inline long long
242  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
243  _mm_cvtsi128_si64x(__m128i __A) {
244  return ((__v2di)__A)[0];
245 }
246 
247 extern __inline __m128d
248  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
249  _mm_add_pd(__m128d __A, __m128d __B) {
250  return (__m128d)((__v2df)__A + (__v2df)__B);
251 }
252 
253 /* Add the lower double-precision (64-bit) floating-point element in
254  a and b, store the result in the lower element of dst, and copy
255  the upper element from a to the upper element of dst. */
256 extern __inline __m128d
257  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
258  _mm_add_sd(__m128d __A, __m128d __B) {
259  __A[0] = __A[0] + __B[0];
260  return (__A);
261 }
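
/* Scalar-versus-packed sketch: the _sd forms only touch element [0]
   and pass element [1] of the first operand through unchanged:

     __m128d __a = _mm_set_pd(10.0, 1.0); // {1.0, 10.0}
     __m128d __b = _mm_set_pd(20.0, 2.0); // {2.0, 20.0}
     __m128d __s = _mm_add_sd(__a, __b);  // {3.0, 10.0}
     __m128d __p = _mm_add_pd(__a, __b);  // {3.0, 30.0}
*/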
262 
263 extern __inline __m128d
264  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
265  _mm_sub_pd(__m128d __A, __m128d __B) {
266  return (__m128d)((__v2df)__A - (__v2df)__B);
267 }
268 
269 extern __inline __m128d
270  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
271  _mm_sub_sd(__m128d __A, __m128d __B) {
272  __A[0] = __A[0] - __B[0];
273  return (__A);
274 }
275 
276 extern __inline __m128d
277  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
278  _mm_mul_pd(__m128d __A, __m128d __B) {
279  return (__m128d)((__v2df)__A * (__v2df)__B);
280 }
281 
282 extern __inline __m128d
283  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
284  _mm_mul_sd(__m128d __A, __m128d __B) {
285  __A[0] = __A[0] * __B[0];
286  return (__A);
287 }
288 
289 extern __inline __m128d
290  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
291  _mm_div_pd(__m128d __A, __m128d __B) {
292  return (__m128d)((__v2df)__A / (__v2df)__B);
293 }
294 
295 extern __inline __m128d
296  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
297  _mm_div_sd(__m128d __A, __m128d __B) {
298  __A[0] = __A[0] / __B[0];
299  return (__A);
300 }
301 
302 extern __inline __m128d
303  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
304  _mm_sqrt_pd(__m128d __A) {
305  return (vec_sqrt(__A));
306 }
307 
308 /* Return pair {sqrt (B[0]), A[1]}. */
309 extern __inline __m128d
310  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
311  _mm_sqrt_sd(__m128d __A, __m128d __B) {
312  __v2df __c;
313  __c = vec_sqrt((__v2df)_mm_set1_pd(__B[0]));
314  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
315 }
316 
317 extern __inline __m128d
318  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
319  _mm_min_pd(__m128d __A, __m128d __B) {
320  return (vec_min(__A, __B));
321 }
322 
323 extern __inline __m128d
324  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
325  _mm_min_sd(__m128d __A, __m128d __B) {
326  __v2df __a, __b, __c;
327  __a = vec_splats(__A[0]);
328  __b = vec_splats(__B[0]);
329  __c = vec_min(__a, __b);
330  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
331 }
332 
333 extern __inline __m128d
334  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
335  _mm_max_pd(__m128d __A, __m128d __B) {
336  return (vec_max(__A, __B));
337 }
338 
339 extern __inline __m128d
340  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
341  _mm_max_sd(__m128d __A, __m128d __B) {
342  __v2df __a, __b, __c;
343  __a = vec_splats(__A[0]);
344  __b = vec_splats(__B[0]);
345  __c = vec_max(__a, __b);
346  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
347 }
348 
349 extern __inline __m128d
350  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
351  _mm_cmpeq_pd(__m128d __A, __m128d __B) {
352  return ((__m128d)vec_cmpeq((__v2df)__A, (__v2df)__B));
353 }
354 
355 extern __inline __m128d
356  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
357  _mm_cmplt_pd(__m128d __A, __m128d __B) {
358  return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
359 }
360 
361 extern __inline __m128d
362  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
363  _mm_cmple_pd(__m128d __A, __m128d __B) {
364  return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
365 }
366 
367 extern __inline __m128d
368  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
369  _mm_cmpgt_pd(__m128d __A, __m128d __B) {
370  return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
371 }
372 
373 extern __inline __m128d
374  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
375  _mm_cmpge_pd(__m128d __A, __m128d __B) {
376  return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
377 }
378 
379 extern __inline __m128d
380  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
381  _mm_cmpneq_pd(__m128d __A, __m128d __B) {
382  __v2df __temp = (__v2df)vec_cmpeq((__v2df)__A, (__v2df)__B);
383  return ((__m128d)vec_nor(__temp, __temp));
384 }
385 
386 extern __inline __m128d
387  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
388  _mm_cmpnlt_pd(__m128d __A, __m128d __B) {
389  return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
390 }
391 
392 extern __inline __m128d
393  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
394  _mm_cmpnle_pd(__m128d __A, __m128d __B) {
395  return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
396 }
397 
398 extern __inline __m128d
399  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
400  _mm_cmpngt_pd(__m128d __A, __m128d __B) {
401  return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
402 }
403 
404 extern __inline __m128d
405  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
406  _mm_cmpnge_pd(__m128d __A, __m128d __B) {
407  return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
408 }
409 
410 extern __inline __m128d
411  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
412  _mm_cmpord_pd(__m128d __A, __m128d __B) {
413  __v2du __c, __d;
414  /* Comparing a value against itself returns false (0's) if it is a NaN. */
415  __c = (__v2du)vec_cmpeq(__A, __A);
416  __d = (__v2du)vec_cmpeq(__B, __B);
417  /* A != NAN and B != NAN. */
418  return ((__m128d)vec_and(__c, __d));
419 }
420 
421 extern __inline __m128d
422  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
423  _mm_cmpunord_pd(__m128d __A, __m128d __B) {
424 #if _ARCH_PWR8
425  __v2du __c, __d;
426  /* Comparing a value against itself returns false (0's) if it is a NaN. */
427  __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
428  __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
429  /* A == NAN OR B == NAN converts to:
430  NOT(A != NAN) OR NOT(B != NAN). */
431  __c = vec_nor(__c, __c);
432  return ((__m128d)vec_orc(__c, __d));
433 #else
434  __v2du __c, __d;
435  /* Comparing a value against itself returns false (0's) if it is a NaN. */
436  __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
437  __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
438  /* Invert so that true ('1's) marks a NaN. */
439  __c = vec_nor(__c, __c);
440  __d = vec_nor(__d, __d);
441  return ((__m128d)vec_or(__c, __d));
442 #endif
443 }
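
/* The packed compares return element-wise masks of all '1's (true) or
   all '0's (false), so they are typically consumed by bitwise selects.
   A sketch, assuming _mm_and_pd as defined later in this header:

     __m128d __mask = _mm_cmpord_pd(__x, __x);  // all-ones where __x is not NaN
     __m128d __clean = _mm_and_pd(__mask, __x); // NaN lanes become +0.0
*/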
444 
445 extern __inline __m128d
446  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
447  _mm_cmpeq_sd(__m128d __A, __m128d __B) {
448  __v2df __a, __b, __c;
449  /* PowerISA VSX does not allow partial (for just the lower double)
450  results. So to ensure we don't generate spurious exceptions
451  (from the upper double values) we splat the lower double
452  before we do the operation. */
453  __a = vec_splats(__A[0]);
454  __b = vec_splats(__B[0]);
455  __c = (__v2df)vec_cmpeq(__a, __b);
456  /* Then we merge the lower double result with the original upper
457  double from __A. */
458  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
459 }
460 
461 extern __inline __m128d
462  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
463  _mm_cmplt_sd(__m128d __A, __m128d __B) {
464  __v2df __a, __b, __c;
465  __a = vec_splats(__A[0]);
466  __b = vec_splats(__B[0]);
467  __c = (__v2df)vec_cmplt(__a, __b);
468  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
469 }
470 
471 extern __inline __m128d
472  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
473  _mm_cmple_sd(__m128d __A, __m128d __B) {
474  __v2df __a, __b, __c;
475  __a = vec_splats(__A[0]);
476  __b = vec_splats(__B[0]);
477  __c = (__v2df)vec_cmple(__a, __b);
478  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
479 }
480 
481 extern __inline __m128d
482  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
483  _mm_cmpgt_sd(__m128d __A, __m128d __B) {
484  __v2df __a, __b, __c;
485  __a = vec_splats(__A[0]);
486  __b = vec_splats(__B[0]);
487  __c = (__v2df)vec_cmpgt(__a, __b);
488  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
489 }
490 
491 extern __inline __m128d
492  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
493  _mm_cmpge_sd(__m128d __A, __m128d __B) {
494  __v2df __a, __b, __c;
495  __a = vec_splats(__A[0]);
496  __b = vec_splats(__B[0]);
497  __c = (__v2df)vec_cmpge(__a, __b);
498  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
499 }
500 
501 extern __inline __m128d
502  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
503  _mm_cmpneq_sd(__m128d __A, __m128d __B) {
504  __v2df __a, __b, __c;
505  __a = vec_splats(__A[0]);
506  __b = vec_splats(__B[0]);
507  __c = (__v2df)vec_cmpeq(__a, __b);
508  __c = vec_nor(__c, __c);
509  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
510 }
511 
512 extern __inline __m128d
513  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
514  _mm_cmpnlt_sd(__m128d __A, __m128d __B) {
515  __v2df __a, __b, __c;
516  __a = vec_splats(__A[0]);
517  __b = vec_splats(__B[0]);
518  /* Not less than is just greater than or equal. */
519  __c = (__v2df)vec_cmpge(__a, __b);
520  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
521 }
522 
523 extern __inline __m128d
524  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
525  _mm_cmpnle_sd(__m128d __A, __m128d __B) {
526  __v2df __a, __b, __c;
527  __a = vec_splats(__A[0]);
528  __b = vec_splats(__B[0]);
529  /* Not less than or equal is just greater than. */
530  __c = (__v2df)vec_cmpgt(__a, __b);
531  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
532 }
533 
534 extern __inline __m128d
535  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
536  _mm_cmpngt_sd(__m128d __A, __m128d __B) {
537  __v2df __a, __b, __c;
538  __a = vec_splats(__A[0]);
539  __b = vec_splats(__B[0]);
540  /* Not greater than is just less than or equal. */
541  __c = (__v2df)vec_cmple(__a, __b);
542  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
543 }
544 
545 extern __inline __m128d
546  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
547  _mm_cmpnge_sd(__m128d __A, __m128d __B) {
548  __v2df __a, __b, __c;
549  __a = vec_splats(__A[0]);
550  __b = vec_splats(__B[0]);
551  /* Not greater than or equal is just less than. */
552  __c = (__v2df)vec_cmplt(__a, __b);
553  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
554 }
555 
556 extern __inline __m128d
557  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
558  _mm_cmpord_sd(__m128d __A, __m128d __B) {
559  __v2df __r;
560  __r = (__v2df)_mm_cmpord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
561  return (__m128d)_mm_setr_pd(__r[0], ((__v2df)__A)[1]);
562 }
563 
564 extern __inline __m128d
565  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
566  _mm_cmpunord_sd(__m128d __A, __m128d __B) {
567  __v2df __r;
568  __r = _mm_cmpunord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
569  return (__m128d)_mm_setr_pd(__r[0], __A[1]);
570 }
571 
572 /* FIXME
573  The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
574  exactly the same because GCC for PowerPC only generates unordered
575  compares (scalar and vector).
576  Technically _mm_comieq_sd et al. should be using the ordered
577  compare and signal for QNaNs. The _mm_ucomieq_sd et al. should
578  be OK. */
579 extern __inline int
580  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
581  _mm_comieq_sd(__m128d __A, __m128d __B) {
582  return (__A[0] == __B[0]);
583 }
584 
585 extern __inline int
586  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
587  _mm_comilt_sd(__m128d __A, __m128d __B) {
588  return (__A[0] < __B[0]);
589 }
590 
591 extern __inline int
592  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
593  _mm_comile_sd(__m128d __A, __m128d __B) {
594  return (__A[0] <= __B[0]);
595 }
596 
597 extern __inline int
598  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
599  _mm_comigt_sd(__m128d __A, __m128d __B) {
600  return (__A[0] > __B[0]);
601 }
602 
603 extern __inline int
604  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
605  _mm_comige_sd(__m128d __A, __m128d __B) {
606  return (__A[0] >= __B[0]);
607 }
608 
609 extern __inline int
610  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
611  _mm_comineq_sd(__m128d __A, __m128d __B) {
612  return (__A[0] != __B[0]);
613 }
614 
615 extern __inline int
616  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
617  _mm_ucomieq_sd(__m128d __A, __m128d __B) {
618  return (__A[0] == __B[0]);
619 }
620 
621 extern __inline int
622  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
623  _mm_ucomilt_sd(__m128d __A, __m128d __B) {
624  return (__A[0] < __B[0]);
625 }
626 
627 extern __inline int
628  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
629  _mm_ucomile_sd(__m128d __A, __m128d __B) {
630  return (__A[0] <= __B[0]);
631 }
632 
633 extern __inline int
634  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
635  _mm_ucomigt_sd(__m128d __A, __m128d __B) {
636  return (__A[0] > __B[0]);
637 }
638 
639 extern __inline int
640  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
641  _mm_ucomige_sd(__m128d __A, __m128d __B) {
642  return (__A[0] >= __B[0]);
643 }
644 
645 extern __inline int
646  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
647  _mm_ucomineq_sd(__m128d __A, __m128d __B) {
648  return (__A[0] != __B[0]);
649 }
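
/* NaN-behavior sketch: with only unordered compares generated, both the
   comi and ucomi forms here return 0 for any comparison involving a NaN,
   except the != forms, which return 1:

     __m128d __n = _mm_set_sd(__builtin_nan(""));
     int __eq = _mm_comieq_sd(__n, __n);   // 0: NaN compares unequal to itself
     int __ne = _mm_ucomineq_sd(__n, __n); // 1
*/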
650 
651 /* Create a vector of Qi, where i is the element number. */
652 extern __inline __m128i
653  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
654  _mm_set_epi64x(long long __q1, long long __q0) {
655  return __extension__(__m128i)(__v2di){__q0, __q1};
656 }
657 
658 extern __inline __m128i
659  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
660  _mm_set_epi64(__m64 __q1, __m64 __q0) {
661  return _mm_set_epi64x((long long)__q1, (long long)__q0);
662 }
663 
664 extern __inline __m128i
665  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
666  _mm_set_epi32(int __q3, int __q2, int __q1, int __q0) {
667  return __extension__(__m128i)(__v4si){__q0, __q1, __q2, __q3};
668 }
669 
670 extern __inline __m128i
671  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
672  _mm_set_epi16(short __q7, short __q6, short __q5, short __q4, short __q3,
673  short __q2, short __q1, short __q0) {
674  return __extension__(__m128i)(__v8hi){__q0, __q1, __q2, __q3,
675  __q4, __q5, __q6, __q7};
676 }
677 
678 extern __inline __m128i
679  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
680  _mm_set_epi8(char __q15, char __q14, char __q13, char __q12, char __q11,
681  char __q10, char __q09, char __q08, char __q07, char __q06,
682  char __q05, char __q04, char __q03, char __q02, char __q01,
683  char __q00) {
684  return __extension__(__m128i)(__v16qi){
685  __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
686  __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15};
687 }
688 
689 /* Set all of the elements of the vector to A. */
690 extern __inline __m128i
691  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
692  _mm_set1_epi64x(long long __A) {
693  return _mm_set_epi64x(__A, __A);
694 }
695 
696 extern __inline __m128i
697  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
698  _mm_set1_epi64(__m64 __A) {
699  return _mm_set_epi64(__A, __A);
700 }
701 
702 extern __inline __m128i
703  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
704  _mm_set1_epi32(int __A) {
705  return _mm_set_epi32(__A, __A, __A, __A);
706 }
707 
708 extern __inline __m128i
709  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
710  _mm_set1_epi16(short __A) {
711  return _mm_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A);
712 }
713 
714 extern __inline __m128i
715  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
716  _mm_set1_epi8(char __A) {
717  return _mm_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A,
718  __A, __A, __A, __A, __A);
719 }
720 
721 /* Create a vector of Qi, where i is the element number.
722  The parameter order is reversed from the _mm_set_epi* functions. */
723 extern __inline __m128i
724  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
725  _mm_setr_epi64(__m64 __q0, __m64 __q1) {
726  return _mm_set_epi64(__q1, __q0);
727 }
728 
729 extern __inline __m128i
730  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
731  _mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) {
732  return _mm_set_epi32(__q3, __q2, __q1, __q0);
733 }
734 
735 extern __inline __m128i
736  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
737  _mm_setr_epi16(short __q0, short __q1, short __q2, short __q3, short __q4,
738  short __q5, short __q6, short __q7) {
739  return _mm_set_epi16(__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
740 }
741 
742 extern __inline __m128i
743  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
744  _mm_setr_epi8(char __q00, char __q01, char __q02, char __q03, char __q04,
745  char __q05, char __q06, char __q07, char __q08, char __q09,
746  char __q10, char __q11, char __q12, char __q13, char __q14,
747  char __q15) {
748  return _mm_set_epi8(__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
749  __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
750 }
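
/* Ordering sketch: _mm_set_epi32 takes arguments from the highest element
   down, _mm_setr_epi32 from the lowest up, so these two are equivalent:

     __m128i __v1 = _mm_set_epi32(3, 2, 1, 0);
     __m128i __v2 = _mm_setr_epi32(0, 1, 2, 3);
     // both have element [i] == i
*/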
751 
752 /* Load 128 bits of integer data. The address must be 16-byte aligned. */
753 extern __inline __m128i
754  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
755  _mm_load_si128(__m128i const *__P) {
756  return *__P;
757 }
758 
759 extern __inline __m128i
760  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
761  _mm_loadu_si128(__m128i_u const *__P) {
762  return (__m128i)(vec_vsx_ld(0, (signed int const *)__P));
763 }
764 
765 extern __inline __m128i
766  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
767  _mm_loadl_epi64(__m128i_u const *__P) {
768  return _mm_set_epi64((__m64)0LL, *(__m64 *)__P);
769 }
770 
771 extern __inline void
772  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
773  _mm_store_si128(__m128i *__P, __m128i __B) {
774  vec_st((__v16qu)__B, 0, (__v16qu *)__P);
775 }
776 
777 extern __inline void
778  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
779  _mm_storeu_si128(__m128i_u *__P, __m128i __B) {
780  *__P = __B;
781 }
782 
783 extern __inline void
784  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
785  _mm_storel_epi64(__m128i_u *__P, __m128i __B) {
786  *(long long *)__P = ((__v2di)__B)[0];
787 }
788 
789 extern __inline __m64
790  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
791  _mm_movepi64_pi64(__m128i_u __B) {
792  return (__m64)((__v2di)__B)[0];
793 }
794 
795 extern __inline __m128i
796  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
797  _mm_movpi64_epi64(__m64 __A) {
798  return _mm_set_epi64((__m64)0LL, __A);
799 }
800 
801 extern __inline __m128i
802  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
803  _mm_move_epi64(__m128i __A) {
804  return _mm_set_epi64((__m64)0LL, (__m64)__A[0]);
805 }
806 
807 /* Create an undefined vector. */
808 extern __inline __m128i
809  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
810  _mm_undefined_si128(void) {
811  __m128i __Y = __Y;
812  return __Y;
813 }
814 
815 /* Create a vector of zeros. */
816 extern __inline __m128i
817  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
818  _mm_setzero_si128(void) {
819  return __extension__(__m128i)(__v4si){0, 0, 0, 0};
820 }
821 
822 #ifdef _ARCH_PWR8
823 extern __inline __m128d
824  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
825  _mm_cvtepi32_pd(__m128i __A) {
826  __v2di __val;
827  /* For LE we need Vector Unpack Low Signed Word, which
828  vec_unpackh generates here. */
829  __val = (__v2di)vec_unpackh((__v4si)__A);
830 
831  return (__m128d)vec_ctf(__val, 0);
832 }
833 #endif
834 
835 extern __inline __m128
836  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
837  _mm_cvtepi32_ps(__m128i __A) {
838  return ((__m128)vec_ctf((__v4si)__A, 0));
839 }
840 
841 extern __inline __m128i
842  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
843  _mm_cvtpd_epi32(__m128d __A) {
844  __v2df __rounded = vec_rint(__A);
845  __v4si __result, __temp;
846  const __v4si __vzero = {0, 0, 0, 0};
847 
848  /* VSX Vector truncate Double-Precision to integer and Convert to
849  Signed Integer Word format with Saturate. */
850  __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__rounded) :);
851 
852 #ifdef _ARCH_PWR8
853 #ifdef __LITTLE_ENDIAN__
854  __temp = vec_mergeo(__temp, __temp);
855 #else
856  __temp = vec_mergee(__temp, __temp);
857 #endif
858  __result = (__v4si)vec_vpkudum((__vector long long)__temp,
859  (__vector long long)__vzero);
860 #else
861  {
862  const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
863  0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
864  __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
865  }
866 #endif
867  return (__m128i)__result;
868 }
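
/* Rounding sketch: vec_rint rounds using the current rounding mode
   (round-to-nearest-even by default), matching the SSE2 semantics of
   _mm_cvtpd_epi32:

     __m128d __v = _mm_set_pd(2.5, 1.5);
     __m128i __i = _mm_cvtpd_epi32(__v); // low two ints: {2, 2}, ties to even
*/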
869 
870 extern __inline __m64
871  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
872  _mm_cvtpd_pi32(__m128d __A) {
873  __m128i __result = _mm_cvtpd_epi32(__A);
874 
875  return (__m64)__result[0];
876 }
877 
878 extern __inline __m128
879  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
880  _mm_cvtpd_ps(__m128d __A) {
881  __v4sf __result;
882  __v4si __temp;
883  const __v4si __vzero = {0, 0, 0, 0};
884 
885  __asm__("xvcvdpsp %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);
886 
887 #ifdef _ARCH_PWR8
888 #ifdef __LITTLE_ENDIAN__
889  __temp = vec_mergeo(__temp, __temp);
890 #else
891  __temp = vec_mergee(__temp, __temp);
892 #endif
893  __result = (__v4sf)vec_vpkudum((__vector long long)__temp,
894  (__vector long long)__vzero);
895 #else
896  {
897  const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
898  0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
899  __result = (__v4sf)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
900  }
901 #endif
902  return ((__m128)__result);
903 }
904 
905 extern __inline __m128i
906  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
907  _mm_cvttpd_epi32(__m128d __A) {
908  __v4si __result;
909  __v4si __temp;
910  const __v4si __vzero = {0, 0, 0, 0};
911 
912  /* VSX Vector truncate Double-Precision to integer and Convert to
913  Signed Integer Word format with Saturate. */
914  __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);
915 
916 #ifdef _ARCH_PWR8
917 #ifdef __LITTLE_ENDIAN__
918  __temp = vec_mergeo(__temp, __temp);
919 #else
920  __temp = vec_mergee(__temp, __temp);
921 #endif
922  __result = (__v4si)vec_vpkudum((__vector long long)__temp,
923  (__vector long long)__vzero);
924 #else
925  {
926  const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
927  0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
928  __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
929  }
930 #endif
931 
932  return ((__m128i)__result);
933 }
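
/* Truncation sketch: the 'tt' forms always round toward zero regardless
   of the current rounding mode:

     __m128d __v = _mm_set_pd(-1.9, 1.9);
     __m128i __i = _mm_cvttpd_epi32(__v); // low two ints: {1, -1}
*/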
934 
935 extern __inline __m64
936  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
937  _mm_cvttpd_pi32(__m128d __A) {
938  __m128i __result = _mm_cvttpd_epi32(__A);
939 
940  return (__m64)__result[0];
941 }
942 
943 extern __inline int
944  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
945  _mm_cvtsi128_si32(__m128i __A) {
946  return ((__v4si)__A)[0];
947 }
948 
949 #ifdef _ARCH_PWR8
950 extern __inline __m128d
951  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
952  _mm_cvtpi32_pd(__m64 __A) {
953  __v4si __temp;
954  __v2di __tmp2;
955  __v2df __result;
956 
957  __temp = (__v4si)vec_splats(__A);
958  __tmp2 = (__v2di)vec_unpackl(__temp);
959  __result = vec_ctf((__vector signed long long)__tmp2, 0);
960  return (__m128d)__result;
961 }
962 #endif
963 
964 extern __inline __m128i
965  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
966  _mm_cvtps_epi32(__m128 __A) {
967  __v4sf __rounded;
968  __v4si __result;
969 
970  __rounded = vec_rint((__v4sf)__A);
971  __result = vec_cts(__rounded, 0);
972  return (__m128i)__result;
973 }
974 
975 extern __inline __m128i
976  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
977  _mm_cvttps_epi32(__m128 __A) {
978  __v4si __result;
979 
980  __result = vec_cts((__v4sf)__A, 0);
981  return (__m128i)__result;
982 }
983 
984 extern __inline __m128d
985  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
986  _mm_cvtps_pd(__m128 __A) {
987  /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
988 #ifdef vec_doubleh
989  return (__m128d)vec_doubleh((__v4sf)__A);
990 #else
991  /* Otherwise the compiler is not current and so need to generate the
992  equivalent code. */
993  __v4sf __a = (__v4sf)__A;
994  __v4sf __temp;
995  __v2df __result;
996 #ifdef __LITTLE_ENDIAN__
997  /* The input float values are in elements {[0], [1]} but the convert
998  instruction needs them in elements {[1], [3]}, so we use two
999  shift left double vector word immediates to get the elements
1000  lined up. */
1001  __temp = __builtin_vsx_xxsldwi(__a, __a, 3);
1002  __temp = __builtin_vsx_xxsldwi(__a, __temp, 2);
1003 #else
1004  /* The input float values are in elements {[0], [1]} but the convert
1005  instruction needs them in elements {[0], [2]}, so we merge each
1006  word with itself (vec_vmrghw) to get the elements
1007  lined up. */
1008  __temp = vec_vmrghw(__a, __a);
1009 #endif
1010  __asm__(" xvcvspdp %x0,%x1" : "=wa"(__result) : "wa"(__temp) :);
1011  return (__m128d)__result;
1012 #endif
1013 }
1014 
1015 extern __inline int
1016  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1017  _mm_cvtsd_si32(__m128d __A) {
1018  __v2df __rounded = vec_rint((__v2df)__A);
1019  int __result = ((__v2df)__rounded)[0];
1020 
1021  return __result;
1022 }
1023 /* Intel intrinsic. */
1024 extern __inline long long
1025  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1026  _mm_cvtsd_si64(__m128d __A) {
1027  __v2df __rounded = vec_rint((__v2df)__A);
1028  long long __result = ((__v2df)__rounded)[0];
1029 
1030  return __result;
1031 }
1032 
1033 /* Microsoft intrinsic. */
1034 extern __inline long long
1035  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1036  _mm_cvtsd_si64x(__m128d __A) {
1037  return _mm_cvtsd_si64(__A);
1038 }
1039 
1040 extern __inline int
1041  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1042  _mm_cvttsd_si32(__m128d __A) {
1043  int __result = ((__v2df)__A)[0];
1044 
1045  return __result;
1046 }
1047 
1048 /* Intel intrinsic. */
1049 extern __inline long long
1050  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1051  _mm_cvttsd_si64(__m128d __A) {
1052  long long __result = ((__v2df)__A)[0];
1053 
1054  return __result;
1055 }
1056 
1057 /* Microsoft intrinsic. */
1058 extern __inline long long
1059  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1060  _mm_cvttsd_si64x(__m128d __A) {
1061  return _mm_cvttsd_si64(__A);
1062 }
1063 
1064 extern __inline __m128
1065  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1066  _mm_cvtsd_ss(__m128 __A, __m128d __B) {
1067  __v4sf __result = (__v4sf)__A;
1068 
1069 #ifdef __LITTLE_ENDIAN__
1070  __v4sf __temp_s;
1071  /* Splat double element [0] to both elements for conversion. */
1072  __v2df __temp_b = vec_splat((__v2df)__B, 0);
1073 
1074  /* Pre-rotate __A left 3 (logically right 1) elements. */
1075  __result = __builtin_vsx_xxsldwi(__result, __result, 3);
1076  /* Convert double to single float scalar in a vector. */
1077  __asm__("xscvdpsp %x0,%x1" : "=wa"(__temp_s) : "wa"(__temp_b) :);
1078  /* Shift the resulting scalar into vector element [0]. */
1079  __result = __builtin_vsx_xxsldwi(__result, __temp_s, 1);
1080 #else
1081  __result[0] = ((__v2df)__B)[0];
1082 #endif
1083  return (__m128)__result;
1084 }
1085 
1086 extern __inline __m128d
1087  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1088  _mm_cvtsi32_sd(__m128d __A, int __B) {
1089  __v2df __result = (__v2df)__A;
1090  double __db = __B;
1091  __result[0] = __db;
1092  return (__m128d)__result;
1093 }
1094 
1095 /* Intel intrinsic. */
1096 extern __inline __m128d
1097  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1098  _mm_cvtsi64_sd(__m128d __A, long long __B) {
1099  __v2df __result = (__v2df)__A;
1100  double __db = __B;
1101  __result[0] = __db;
1102  return (__m128d)__result;
1103 }
1104 
1105 /* Microsoft intrinsic. */
1106 extern __inline __m128d
1107  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1108  _mm_cvtsi64x_sd(__m128d __A, long long __B) {
1109  return _mm_cvtsi64_sd(__A, __B);
1110 }
1111 
1112 extern __inline __m128d
1113  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1114  _mm_cvtss_sd(__m128d __A, __m128 __B) {
1115 #ifdef __LITTLE_ENDIAN__
1116  /* Use splat to move element [0] into position for the convert. */
1117  __v4sf __temp = vec_splat((__v4sf)__B, 0);
1118  __v2df __res;
1119  /* Convert single float scalar to double in a vector. */
1120  __asm__("xscvspdp %x0,%x1" : "=wa"(__res) : "wa"(__temp) :);
1121  return (__m128d)vec_mergel(__res, (__v2df)__A);
1122 #else
1123  __v2df __res = (__v2df)__A;
1124  __res[0] = ((__v4sf)__B)[0];
1125  return (__m128d)__res;
1126 #endif
1127 }
1128 
1129 extern __inline __m128d
1130  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1131  _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) {
1132  __vector double __result;
1133  const int __litmsk = __mask & 0x3;
1134 
1135  if (__litmsk == 0)
1136  __result = vec_mergeh(__A, __B);
1137 #if __GNUC__ < 6
1138  else if (__litmsk == 1)
1139  __result = vec_xxpermdi(__B, __A, 2);
1140  else if (__litmsk == 2)
1141  __result = vec_xxpermdi(__B, __A, 1);
1142 #else
1143  else if (__litmsk == 1)
1144  __result = vec_xxpermdi(__A, __B, 2);
1145  else if (__litmsk == 2)
1146  __result = vec_xxpermdi(__A, __B, 1);
1147 #endif
1148  else
1149  __result = vec_mergel(__A, __B);
1150 
1151  return __result;
1152 }
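
/* Mask-decoding sketch for the code above: bit 0 of the mask selects the
   low result element from __A and bit 1 selects the high result element
   from __B, so the four mask values map onto merge/permute operations:

     __m128d __a = _mm_set_pd(2.0, 1.0);        // {1.0, 2.0}
     __m128d __b = _mm_set_pd(4.0, 3.0);        // {3.0, 4.0}
     __m128d __r = _mm_shuffle_pd(__a, __b, 1); // {__a[1], __b[0]} = {2.0, 3.0}
*/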
1153 
1154 extern __inline __m128d
1155  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1156  _mm_unpackhi_pd(__m128d __A, __m128d __B) {
1157  return (__m128d)vec_mergel((__v2df)__A, (__v2df)__B);
1158 }
1159 
1160 extern __inline __m128d
1161  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1162  _mm_unpacklo_pd(__m128d __A, __m128d __B) {
1163  return (__m128d)vec_mergeh((__v2df)__A, (__v2df)__B);
1164 }
1165 
1166 extern __inline __m128d
1167  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1168  _mm_loadh_pd(__m128d __A, double const *__B) {
1169  __v2df __result = (__v2df)__A;
1170  __result[1] = *__B;
1171  return (__m128d)__result;
1172 }
1173 
1174 extern __inline __m128d
1175  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1176  _mm_loadl_pd(__m128d __A, double const *__B) {
1177  __v2df __result = (__v2df)__A;
1178  __result[0] = *__B;
1179  return (__m128d)__result;
1180 }
1181 
1182 #ifdef _ARCH_PWR8
1183 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1184 
1185 /* Creates a 2-bit mask from the most significant bits of the DPFP values. */
1186 extern __inline int
1187  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1188  _mm_movemask_pd(__m128d __A) {
1189 #ifdef _ARCH_PWR10
1190  return vec_extractm((__v2du)__A);
1191 #else
1192  __vector unsigned long long __result;
1193  static const __vector unsigned int __perm_mask = {
1194 #ifdef __LITTLE_ENDIAN__
1195  0x80800040, 0x80808080, 0x80808080, 0x80808080
1196 #else
1197  0x80808080, 0x80808080, 0x80808080, 0x80804000
1198 #endif
1199  };
1200 
1201  __result = ((__vector unsigned long long)vec_vbpermq(
1202  (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
1203 
1204 #ifdef __LITTLE_ENDIAN__
1205  return __result[1];
1206 #else
1207  return __result[0];
1208 #endif
1209 #endif /* !_ARCH_PWR10 */
1210 }
1211 #endif /* _ARCH_PWR8 */
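
/* Sign-bit sketch: _mm_movemask_pd packs the sign bits of the two doubles
   into bits 0 (low element) and 1 (high element) of the result:

     __m128d __v = _mm_set_pd(-2.0, 1.0); // {1.0, -2.0}
     int __m = _mm_movemask_pd(__v);      // 0x2: only the high element is negative
*/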
1212 
1213 extern __inline __m128i
1214  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1215  _mm_packs_epi16(__m128i __A, __m128i __B) {
1216  return (__m128i)vec_packs((__v8hi)__A, (__v8hi)__B);
1217 }
1218 
1219 extern __inline __m128i
1220  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1221  _mm_packs_epi32(__m128i __A, __m128i __B) {
1222  return (__m128i)vec_packs((__v4si)__A, (__v4si)__B);
1223 }
1224 
1225 extern __inline __m128i
1226  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1227  _mm_packus_epi16(__m128i __A, __m128i __B) {
1228  return (__m128i)vec_packsu((__v8hi)__A, (__v8hi)__B);
1229 }
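
/* Saturation sketch: the pack operations clamp each source element to the
   narrower destination range instead of truncating:

     __m128i __h = _mm_set1_epi16(300);
     __m128i __u = _mm_packus_epi16(__h, __h); // every unsigned byte == 255
     __m128i __s = _mm_packs_epi16(__h, __h);  // every signed byte == 127
*/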
1230 
1231 extern __inline __m128i
1232  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1233  _mm_unpackhi_epi8(__m128i __A, __m128i __B) {
1234  return (__m128i)vec_mergel((__v16qu)__A, (__v16qu)__B);
1235 }
1236 
1237 extern __inline __m128i
1238  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1239  _mm_unpackhi_epi16(__m128i __A, __m128i __B) {
1240  return (__m128i)vec_mergel((__v8hu)__A, (__v8hu)__B);
1241 }
1242 
1243 extern __inline __m128i
1244  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1245  _mm_unpackhi_epi32(__m128i __A, __m128i __B) {
1246  return (__m128i)vec_mergel((__v4su)__A, (__v4su)__B);
1247 }
1248 
1249 extern __inline __m128i
1250  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1251  _mm_unpackhi_epi64(__m128i __A, __m128i __B) {
1252  return (__m128i)vec_mergel((__vector long long)__A, (__vector long long)__B);
1253 }
1254 
1255 extern __inline __m128i
1256  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1257  _mm_unpacklo_epi8(__m128i __A, __m128i __B) {
1258  return (__m128i)vec_mergeh((__v16qu)__A, (__v16qu)__B);
1259 }
1260 
1261 extern __inline __m128i
1262  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1263  _mm_unpacklo_epi16(__m128i __A, __m128i __B) {
1264  return (__m128i)vec_mergeh((__v8hi)__A, (__v8hi)__B);
1265 }
1266 
1267 extern __inline __m128i
1268  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1269  _mm_unpacklo_epi32(__m128i __A, __m128i __B) {
1270  return (__m128i)vec_mergeh((__v4si)__A, (__v4si)__B);
1271 }
1272 
1273 extern __inline __m128i
1274  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1275  _mm_unpacklo_epi64(__m128i __A, __m128i __B) {
1276  return (__m128i)vec_mergeh((__vector long long)__A, (__vector long long)__B);
1277 }
1278 
1279 extern __inline __m128i
1280  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1281  _mm_add_epi8(__m128i __A, __m128i __B) {
1282  return (__m128i)((__v16qu)__A + (__v16qu)__B);
1283 }
1284 
1285 extern __inline __m128i
1286  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1287  _mm_add_epi16(__m128i __A, __m128i __B) {
1288  return (__m128i)((__v8hu)__A + (__v8hu)__B);
1289 }
1290 
1291 extern __inline __m128i
1292  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1293  _mm_add_epi32(__m128i __A, __m128i __B) {
1294  return (__m128i)((__v4su)__A + (__v4su)__B);
1295 }
1296 
1297 extern __inline __m128i
1298  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1299  _mm_add_epi64(__m128i __A, __m128i __B) {
1300  return (__m128i)((__v2du)__A + (__v2du)__B);
1301 }
1302 
1303 extern __inline __m128i
1304  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1305  _mm_adds_epi8(__m128i __A, __m128i __B) {
1306  return (__m128i)vec_adds((__v16qi)__A, (__v16qi)__B);
1307 }
1308 
1309 extern __inline __m128i
1310  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1311  _mm_adds_epi16(__m128i __A, __m128i __B) {
1312  return (__m128i)vec_adds((__v8hi)__A, (__v8hi)__B);
1313 }
1314 
1315 extern __inline __m128i
1316  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1317  _mm_adds_epu8(__m128i __A, __m128i __B) {
1318  return (__m128i)vec_adds((__v16qu)__A, (__v16qu)__B);
1319 }
1320 
1321 extern __inline __m128i
1322  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1323  _mm_adds_epu16(__m128i __A, __m128i __B) {
1324  return (__m128i)vec_adds((__v8hu)__A, (__v8hu)__B);
1325 }
1326 
1327 extern __inline __m128i
1328  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1329  _mm_sub_epi8(__m128i __A, __m128i __B) {
1330  return (__m128i)((__v16qu)__A - (__v16qu)__B);
1331 }
1332 
1333 extern __inline __m128i
1334  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1335  _mm_sub_epi16(__m128i __A, __m128i __B) {
1336  return (__m128i)((__v8hu)__A - (__v8hu)__B);
1337 }
1338 
1339 extern __inline __m128i
1340  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1341  _mm_sub_epi32(__m128i __A, __m128i __B) {
1342  return (__m128i)((__v4su)__A - (__v4su)__B);
1343 }
1344 
1345 extern __inline __m128i
1346  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1347  _mm_sub_epi64(__m128i __A, __m128i __B) {
1348  return (__m128i)((__v2du)__A - (__v2du)__B);
1349 }
1350 
1351 extern __inline __m128i
1352  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1353  _mm_subs_epi8(__m128i __A, __m128i __B) {
1354  return (__m128i)vec_subs((__v16qi)__A, (__v16qi)__B);
1355 }
1356 
1357 extern __inline __m128i
1358  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1359  _mm_subs_epi16(__m128i __A, __m128i __B) {
1360  return (__m128i)vec_subs((__v8hi)__A, (__v8hi)__B);
1361 }
1362 
1363 extern __inline __m128i
1364  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1365  _mm_subs_epu8(__m128i __A, __m128i __B) {
1366  return (__m128i)vec_subs((__v16qu)__A, (__v16qu)__B);
1367 }
1368 
1369 extern __inline __m128i
1370  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1371  _mm_subs_epu16(__m128i __A, __m128i __B) {
1372  return (__m128i)vec_subs((__v8hu)__A, (__v8hu)__B);
1373 }
1374 
1375 extern __inline __m128i
1376  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1377  _mm_madd_epi16(__m128i __A, __m128i __B) {
1378  __vector signed int __zero = {0, 0, 0, 0};
1379 
1380  return (__m128i)vec_vmsumshm((__v8hi)__A, (__v8hi)__B, __zero);
1381 }
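
/* Multiply-add sketch: each pair of adjacent 16-bit products is summed
   into one 32-bit result element:

     __m128i __a = _mm_set1_epi16(2);
     __m128i __b = _mm_set1_epi16(3);
     __m128i __r = _mm_madd_epi16(__a, __b); // each int == 2*3 + 2*3 == 12
*/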
1382 
1383 extern __inline __m128i
1384  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1385  _mm_mulhi_epi16(__m128i __A, __m128i __B) {
1386  __vector signed int __w0, __w1;
1387 
1388  __vector unsigned char __xform1 = {
1389 #ifdef __LITTLE_ENDIAN__
1390  0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1391  0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1392 #else
1393  0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
1394  0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1395 #endif
1396  };
1397 
1398  __w0 = vec_vmulesh((__v8hi)__A, (__v8hi)__B);
1399  __w1 = vec_vmulosh((__v8hi)__A, (__v8hi)__B);
1400  return (__m128i)vec_perm(__w0, __w1, __xform1);
1401 }
1402 
1403 extern __inline __m128i
1404  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1405  _mm_mullo_epi16(__m128i __A, __m128i __B) {
1406  return (__m128i)((__v8hi)__A * (__v8hi)__B);
1407 }
1408 
1409 extern __inline __m64
1410  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1411  _mm_mul_su32(__m64 __A, __m64 __B) {
1412  unsigned int __a = __A;
1413  unsigned int __b = __B;
1414 
1415  return ((__m64)__a * (__m64)__b);
1416 }
1417 
1418 #ifdef _ARCH_PWR8
1419 extern __inline __m128i
1420  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1421  _mm_mul_epu32(__m128i __A, __m128i __B) {
1422 #if __GNUC__ < 8
1423  __v2du __result;
1424 
1425 #ifdef __LITTLE_ENDIAN__
1426  /* VMX Vector Multiply Odd Unsigned Word. */
1427  __asm__("vmulouw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
1428 #else
1429  /* VMX Vector Multiply Even Unsigned Word. */
1430  __asm__("vmuleuw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
1431 #endif
1432  return (__m128i)__result;
1433 #else
1434  return (__m128i)vec_mule((__v4su)__A, (__v4su)__B);
1435 #endif
1436 }
1437 #endif
1438 
1439 extern __inline __m128i
1440  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1441  _mm_slli_epi16(__m128i __A, int __B) {
1442  __v8hu __lshift;
1443  __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
1444 
1445  if (__B >= 0 && __B < 16) {
1446  if (__builtin_constant_p(__B))
1447  __lshift = (__v8hu)vec_splat_s16(__B);
1448  else
1449  __lshift = vec_splats((unsigned short)__B);
1450 
1451  __result = vec_sl((__v8hi)__A, __lshift);
1452  }
1453 
1454  return (__m128i)__result;
1455 }
1456 
1457 extern __inline __m128i
1458  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1459  _mm_slli_epi32(__m128i __A, int __B) {
1460  __v4su __lshift;
1461  __v4si __result = {0, 0, 0, 0};
1462 
1463  if (__B >= 0 && __B < 32) {
1464  if (__builtin_constant_p(__B) && __B < 16)
1465  __lshift = (__v4su)vec_splat_s32(__B);
1466  else
1467  __lshift = vec_splats((unsigned int)__B);
1468 
1469  __result = vec_sl((__v4si)__A, __lshift);
1470  }
1471 
1472  return (__m128i)__result;
1473 }
1474 
1475 #ifdef _ARCH_PWR8
1476 extern __inline __m128i
1477  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1478  _mm_slli_epi64(__m128i __A, int __B) {
1479  __v2du __lshift;
1480  __v2di __result = {0, 0};
1481 
1482  if (__B >= 0 && __B < 64) {
1483  if (__builtin_constant_p(__B) && __B < 16)
1484  __lshift = (__v2du)vec_splat_s32(__B);
1485  else
1486  __lshift = (__v2du)vec_splats((unsigned int)__B);
1487 
1488  __result = vec_sl((__v2di)__A, __lshift);
1489  }
1490 
1491  return (__m128i)__result;
1492 }
1493 #endif
1494 
1495 extern __inline __m128i
1496  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1497  _mm_srai_epi16(__m128i __A, int __B) {
1498  __v8hu __rshift = {15, 15, 15, 15, 15, 15, 15, 15};
1499  __v8hi __result;
1500 
1501  if (__B < 16) {
1502  if (__builtin_constant_p(__B))
1503  __rshift = (__v8hu)vec_splat_s16(__B);
1504  else
1505  __rshift = vec_splats((unsigned short)__B);
1506  }
1507  __result = vec_sra((__v8hi)__A, __rshift);
1508 
1509  return (__m128i)__result;
1510 }
1511 
1512 extern __inline __m128i
1513  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1514  _mm_srai_epi32(__m128i __A, int __B) {
1515  __v4su __rshift = {31, 31, 31, 31};
1516  __v4si __result;
1517 
1518  if (__B < 32) {
1519  if (__builtin_constant_p(__B)) {
1520  if (__B < 16)
1521  __rshift = (__v4su)vec_splat_s32(__B);
1522  else
1523  __rshift = (__v4su)vec_splats((unsigned int)__B);
1524  } else
1525  __rshift = vec_splats((unsigned int)__B);
1526  }
1527  __result = vec_sra((__v4si)__A, __rshift);
1528 
1529  return (__m128i)__result;
1530 }
1531 
1532 extern __inline __m128i
1533  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1534  _mm_bslli_si128(__m128i __A, const int __N) {
1535  __v16qu __result;
1536  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1537 
1538  if (__N < 16)
1539  __result = vec_sld((__v16qu)__A, __zeros, __N);
1540  else
1541  __result = __zeros;
1542 
1543  return (__m128i)__result;
1544 }
1545 
1546 extern __inline __m128i
1547  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1548  _mm_bsrli_si128(__m128i __A, const int __N) {
1549  __v16qu __result;
1550  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1551 
1552  if (__N < 16)
1553 #ifdef __LITTLE_ENDIAN__
1554  if (__builtin_constant_p(__N))
1555  /* Would like to use Vector Shift Left Double by Octet
1556  Immediate here to use the immediate form and avoid
1557  load of __N * 8 value into a separate VR. */
1558  __result = vec_sld(__zeros, (__v16qu)__A, (16 - __N));
1559  else
1560 #endif
1561  {
1562  __v16qu __shift = vec_splats((unsigned char)(__N * 8));
1563 #ifdef __LITTLE_ENDIAN__
1564  __result = vec_sro((__v16qu)__A, __shift);
1565 #else
1566  __result = vec_slo((__v16qu)__A, __shift);
1567 #endif
1568  }
1569  else
1570  __result = __zeros;
1571 
1572  return (__m128i)__result;
1573 }
1574 
1575 extern __inline __m128i
1576  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1577  _mm_srli_si128(__m128i __A, const int __N) {
1578  return _mm_bsrli_si128(__A, __N);
1579 }
1580 
1581 extern __inline __m128i
1582  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1583  _mm_slli_si128(__m128i __A, const int _imm5) {
1584  __v16qu __result;
1585  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1586 
1587  if (_imm5 < 16)
1588 #ifdef __LITTLE_ENDIAN__
1589  __result = vec_sld((__v16qu)__A, __zeros, _imm5);
1590 #else
1591  __result = vec_sld(__zeros, (__v16qu)__A, (16 - _imm5));
1592 #endif
1593  else
1594  __result = __zeros;
1595 
1596  return (__m128i)__result;
1597 }
1598 
1599 extern __inline __m128i
1600  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1602  _mm_srli_epi16(__m128i __A, int __B) {
1603  __v8hu __rshift;
1604  __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
1605 
1606  if (__B < 16) {
1607  if (__builtin_constant_p(__B))
1608  __rshift = (__v8hu)vec_splat_s16(__B);
1609  else
1610  __rshift = vec_splats((unsigned short)__B);
1611 
1612  __result = vec_sr((__v8hi)__A, __rshift);
1613  }
1614 
1615  return (__m128i)__result;
1616 }
1617 
1618 extern __inline __m128i
1619  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1620  _mm_srli_epi32(__m128i __A, int __B) {
1621  __v4su __rshift;
1622  __v4si __result = {0, 0, 0, 0};
1623 
1624  if (__B < 32) {
1625  if (__builtin_constant_p(__B)) {
1626  if (__B < 16)
1627  __rshift = (__v4su)vec_splat_s32(__B);
1628  else
1629  __rshift = (__v4su)vec_splats((unsigned int)__B);
1630  } else
1631  __rshift = vec_splats((unsigned int)__B);
1632 
1633  __result = vec_sr((__v4si)__A, __rshift);
1634  }
1635 
1636  return (__m128i)__result;
1637 }
1638 
1639 #ifdef _ARCH_PWR8
1640 extern __inline __m128i
1641  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1642  _mm_srli_epi64(__m128i __A, int __B) {
1643  __v2du __rshift;
1644  __v2di __result = {0, 0};
1645 
1646  if (__B < 64) {
1647  if (__builtin_constant_p(__B)) {
1648  if (__B < 16)
1649  __rshift = (__v2du)vec_splat_s32(__B);
1650  else
1651  __rshift = (__v2du)vec_splats((unsigned long long)__B);
1652  } else
1653  __rshift = (__v2du)vec_splats((unsigned int)__B);
1654 
1655  __result = vec_sr((__v2di)__A, __rshift);
1656  }
1657 
1658  return (__m128i)__result;
1659 }
1660 #endif
1661 
1662 extern __inline __m128i
1663  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1664  _mm_sll_epi16(__m128i __A, __m128i __B) {
1665  __v8hu __lshift;
1666  __vector __bool short __shmask;
1667  const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
1668  __v8hu __result;
1669 
1670 #ifdef __LITTLE_ENDIAN__
1671  __lshift = vec_splat((__v8hu)__B, 0);
1672 #else
1673  __lshift = vec_splat((__v8hu)__B, 3);
1674 #endif
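  /* Shift counts above 15 must yield zero.  __shmask is all-ones per
     element for valid counts and all-zeros otherwise, so the vec_sel
     below picks the shifted value for valid counts and reuses
     __shmask itself as the zero source for invalid ones.  The other
     variable-shift helpers below use the same trick. */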
1675  __shmask = vec_cmple(__lshift, __shmax);
1676  __result = vec_sl((__v8hu)__A, __lshift);
1677  __result = vec_sel((__v8hu)__shmask, __result, __shmask);
1678 
1679  return (__m128i)__result;
1680 }
1681 
1682 extern __inline __m128i
1683  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1684  _mm_sll_epi32(__m128i __A, __m128i __B) {
1685  __v4su __lshift;
1686  __vector __bool int __shmask;
1687  const __v4su __shmax = {32, 32, 32, 32};
1688  __v4su __result;
1689 #ifdef __LITTLE_ENDIAN__
1690  __lshift = vec_splat((__v4su)__B, 0);
1691 #else
1692  __lshift = vec_splat((__v4su)__B, 1);
1693 #endif
1694  __shmask = vec_cmplt(__lshift, __shmax);
1695  __result = vec_sl((__v4su)__A, __lshift);
1696  __result = vec_sel((__v4su)__shmask, __result, __shmask);
1697 
1698  return (__m128i)__result;
1699 }
1700 
1701 #ifdef _ARCH_PWR8
1702 extern __inline __m128i
1703  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1704  _mm_sll_epi64(__m128i __A, __m128i __B) {
1705  __v2du __lshift;
1706  __vector __bool long long __shmask;
1707  const __v2du __shmax = {64, 64};
1708  __v2du __result;
1709 
1710  __lshift = vec_splat((__v2du)__B, 0);
1711  __shmask = vec_cmplt(__lshift, __shmax);
1712  __result = vec_sl((__v2du)__A, __lshift);
1713  __result = vec_sel((__v2du)__shmask, __result, __shmask);
1714 
1715  return (__m128i)__result;
1716 }
1717 #endif
1718 
1719 extern __inline __m128i
1720  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1721  _mm_sra_epi16(__m128i __A, __m128i __B) {
1722  const __v8hu __rshmax = {15, 15, 15, 15, 15, 15, 15, 15};
1723  __v8hu __rshift;
1724  __v8hi __result;
1725 
1726 #ifdef __LITTLE_ENDIAN__
1727  __rshift = vec_splat((__v8hu)__B, 0);
1728 #else
1729  __rshift = vec_splat((__v8hu)__B, 3);
1730 #endif
1731  __rshift = vec_min(__rshift, __rshmax);
1732  __result = vec_sra((__v8hi)__A, __rshift);
1733 
1734  return (__m128i)__result;
1735 }
1736 
1737 extern __inline __m128i
1738  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1739  _mm_sra_epi32(__m128i __A, __m128i __B) {
1740  const __v4su __rshmax = {31, 31, 31, 31};
1741  __v4su __rshift;
1742  __v4si __result;
1743 
1744 #ifdef __LITTLE_ENDIAN__
1745  __rshift = vec_splat((__v4su)__B, 0);
1746 #else
1747  __rshift = vec_splat((__v4su)__B, 1);
1748 #endif
1749  __rshift = vec_min(__rshift, __rshmax);
1750  __result = vec_sra((__v4si)__A, __rshift);
1751 
1752  return (__m128i)__result;
1753 }
1754 
1755 extern __inline __m128i
1756  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1757  _mm_srl_epi16(__m128i __A, __m128i __B) {
1758  __v8hu __rshift;
1759  __vector __bool short __shmask;
1760  const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
1761  __v8hu __result;
1762 
1763 #ifdef __LITTLE_ENDIAN__
1764  __rshift = vec_splat((__v8hu)__B, 0);
1765 #else
1766  __rshift = vec_splat((__v8hu)__B, 3);
1767 #endif
1768  __shmask = vec_cmple(__rshift, __shmax);
1769  __result = vec_sr((__v8hu)__A, __rshift);
1770  __result = vec_sel((__v8hu)__shmask, __result, __shmask);
1771 
1772  return (__m128i)__result;
1773 }
1774 
1775 extern __inline __m128i
1776  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1777  _mm_srl_epi32(__m128i __A, __m128i __B) {
1778  __v4su __rshift;
1779  __vector __bool int __shmask;
1780  const __v4su __shmax = {32, 32, 32, 32};
1781  __v4su __result;
1782 
1783 #ifdef __LITTLE_ENDIAN__
1784  __rshift = vec_splat((__v4su)__B, 0);
1785 #else
1786  __rshift = vec_splat((__v4su)__B, 1);
1787 #endif
1788  __shmask = vec_cmplt(__rshift, __shmax);
1789  __result = vec_sr((__v4su)__A, __rshift);
1790  __result = vec_sel((__v4su)__shmask, __result, __shmask);
1791 
1792  return (__m128i)__result;
1793 }
1794 
1795 #ifdef _ARCH_PWR8
1796 extern __inline __m128i
1797  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1798  _mm_srl_epi64(__m128i __A, __m128i __B) {
1799  __v2du __rshift;
1800  __vector __bool long long __shmask;
1801  const __v2du __shmax = {64, 64};
1802  __v2du __result;
1803 
1804  __rshift = vec_splat((__v2du)__B, 0);
1805  __shmask = vec_cmplt(__rshift, __shmax);
1806  __result = vec_sr((__v2du)__A, __rshift);
1807  __result = vec_sel((__v2du)__shmask, __result, __shmask);
1808 
1809  return (__m128i)__result;
1810 }
1811 #endif
1812 
1813 extern __inline __m128d
1814  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1815  _mm_and_pd(__m128d __A, __m128d __B) {
1816  return (vec_and((__v2df)__A, (__v2df)__B));
1817 }
1818 
1819 extern __inline __m128d
1820  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1821  _mm_andnot_pd(__m128d __A, __m128d __B) {
1822  return (vec_andc((__v2df)__B, (__v2df)__A));
1823 }
1824 
1825 extern __inline __m128d
1826  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1827  _mm_or_pd(__m128d __A, __m128d __B) {
1828  return (vec_or((__v2df)__A, (__v2df)__B));
1829 }
1830 
1831 extern __inline __m128d
1832  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1833  _mm_xor_pd(__m128d __A, __m128d __B) {
1834  return (vec_xor((__v2df)__A, (__v2df)__B));
1835 }
1836 
1837 extern __inline __m128i
1838  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1839  _mm_and_si128(__m128i __A, __m128i __B) {
1840  return (__m128i)vec_and((__v2di)__A, (__v2di)__B);
1841 }
1842 
1843 extern __inline __m128i
1844  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1845  _mm_andnot_si128(__m128i __A, __m128i __B) {
1846  return (__m128i)vec_andc((__v2di)__B, (__v2di)__A);
1847 }
1848 
1849 extern __inline __m128i
1850  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1851  _mm_or_si128(__m128i __A, __m128i __B) {
1852  return (__m128i)vec_or((__v2di)__A, (__v2di)__B);
1853 }
1854 
1855 extern __inline __m128i
1856  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1857  _mm_xor_si128(__m128i __A, __m128i __B) {
1858  return (__m128i)vec_xor((__v2di)__A, (__v2di)__B);
1859 }
1860 
1861 extern __inline __m128i
1862  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1863  _mm_cmpeq_epi8(__m128i __A, __m128i __B) {
1864  return (__m128i)vec_cmpeq((__v16qi)__A, (__v16qi)__B);
1865 }
1866 
1867 extern __inline __m128i
1868  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1869  _mm_cmpeq_epi16(__m128i __A, __m128i __B) {
1870  return (__m128i)vec_cmpeq((__v8hi)__A, (__v8hi)__B);
1871 }
1872 
1873 extern __inline __m128i
1874  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1875  _mm_cmpeq_epi32(__m128i __A, __m128i __B) {
1876  return (__m128i)vec_cmpeq((__v4si)__A, (__v4si)__B);
1877 }
1878 
1879 extern __inline __m128i
1880  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1881  _mm_cmplt_epi8(__m128i __A, __m128i __B) {
1882  return (__m128i)vec_cmplt((__v16qi)__A, (__v16qi)__B);
1883 }
1884 
1885 extern __inline __m128i
1886  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1887  _mm_cmplt_epi16(__m128i __A, __m128i __B) {
1888  return (__m128i)vec_cmplt((__v8hi)__A, (__v8hi)__B);
1889 }
1890 
1891 extern __inline __m128i
1892  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1893  _mm_cmplt_epi32(__m128i __A, __m128i __B) {
1894  return (__m128i)vec_cmplt((__v4si)__A, (__v4si)__B);
1895 }
1896 
1897 extern __inline __m128i
1898  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1899  _mm_cmpgt_epi8(__m128i __A, __m128i __B) {
1900  return (__m128i)vec_cmpgt((__v16qi)__A, (__v16qi)__B);
1901 }
1902 
1903 extern __inline __m128i
1904  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1905  _mm_cmpgt_epi16(__m128i __A, __m128i __B) {
1906  return (__m128i)vec_cmpgt((__v8hi)__A, (__v8hi)__B);
1907 }
1908 
1909 extern __inline __m128i
1910  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1911  _mm_cmpgt_epi32(__m128i __A, __m128i __B) {
1912  return (__m128i)vec_cmpgt((__v4si)__A, (__v4si)__B);
1913 }
1914 
1915 extern __inline int
1916  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1917  _mm_extract_epi16(__m128i const __A, int const __N) {
1918  return (unsigned short)((__v8hi)__A)[__N & 7];
1919 }
1920 
1921 extern __inline __m128i
1922  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1923  _mm_insert_epi16(__m128i const __A, int const __D, int const __N) {
1924  __v8hi __result = (__v8hi)__A;
1925 
1926  __result[(__N & 7)] = __D;
1927 
1928  return (__m128i)__result;
1929 }
1930 
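/* Illustrative usage (an addition to this listing, not part of the
   original header): replace one 16-bit lane and read it back.

     __m128i __v = _mm_set1_epi16(7);
     __m128i __w = _mm_insert_epi16(__v, 42, 3);
     int __e = _mm_extract_epi16(__w, 3);   // __e == 42
*/
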
1931 extern __inline __m128i
1932  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1933  _mm_max_epi16(__m128i __A, __m128i __B) {
1934  return (__m128i)vec_max((__v8hi)__A, (__v8hi)__B);
1935 }
1936 
1937 extern __inline __m128i
1938  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1939  _mm_max_epu8(__m128i __A, __m128i __B) {
1940  return (__m128i)vec_max((__v16qu)__A, (__v16qu)__B);
1941 }
1942 
1943 extern __inline __m128i
1944  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1945  _mm_min_epi16(__m128i __A, __m128i __B) {
1946  return (__m128i)vec_min((__v8hi)__A, (__v8hi)__B);
1947 }
1948 
1949 extern __inline __m128i
1950  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1951  _mm_min_epu8(__m128i __A, __m128i __B) {
1952  return (__m128i)vec_min((__v16qu)__A, (__v16qu)__B);
1953 }
1954 
1955 #ifdef _ARCH_PWR8
1956 /* Intrinsic functions that require PowerISA 2.07 or later. */
1957 
1958 /* Return a mask created from the most significant bit of each 8-bit
1959  element in A. */
1960 extern __inline int
1961  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1962  _mm_movemask_epi8(__m128i __A) {
1963 #ifdef _ARCH_PWR10
1964  return vec_extractm((__v16qu)__A);
1965 #else
1966  __vector unsigned long long __result;
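  /* Indices of each byte's most significant bit within the 128-bit
     register, ordered so vec_vbpermq gathers the sixteen sign bits
     into SSE movemask bit order. */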
1967  static const __vector unsigned char __perm_mask = {
1968  0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
1969  0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};
1970 
1971  __result = ((__vector unsigned long long)vec_vbpermq(
1972  (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
1973 
1974 #ifdef __LITTLE_ENDIAN__
1975  return __result[1];
1976 #else
1977  return __result[0];
1978 #endif
1979 #endif /* !_ARCH_PWR10 */
1980 }
1981 #endif /* _ARCH_PWR8 */
1982 
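/* Illustrative usage (an addition to this listing, not part of the
   original header; requires _ARCH_PWR8): gather the byte sign bits.

     __m128i __m = _mm_set1_epi8((char)0x80);
     int __bits = _mm_movemask_epi8(__m);   // __bits == 0xFFFF
*/
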
1983 extern __inline __m128i
1984  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1985  _mm_mulhi_epu16(__m128i __A, __m128i __B) {
1986  __v4su __w0, __w1;
1987  __v16qu __xform1 = {
1988 #ifdef __LITTLE_ENDIAN__
1989  0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1990  0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1991 #else
1992  0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
1993  0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1994 #endif
1995  };
1996 
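  /* Multiply even and odd 16-bit lanes into full 32-bit products,
     then permute the high halves of those products back into SSE
     lane order. */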
1997  __w0 = vec_vmuleuh((__v8hu)__A, (__v8hu)__B);
1998  __w1 = vec_vmulouh((__v8hu)__A, (__v8hu)__B);
1999  return (__m128i)vec_perm(__w0, __w1, __xform1);
2000 }
2001 
2002 extern __inline __m128i
2003  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2004  _mm_shufflehi_epi16(__m128i __A, const int __mask) {
2005  unsigned long __element_selector_98 = __mask & 0x03;
2006  unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
2007  unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
2008  unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
2009  static const unsigned short __permute_selectors[4] = {
2010 #ifdef __LITTLE_ENDIAN__
2011  0x0908, 0x0B0A, 0x0D0C, 0x0F0E
2012 #else
2013  0x0809, 0x0A0B, 0x0C0D, 0x0E0F
2014 #endif
2015  };
2016  __v2du __pmask =
2017 #ifdef __LITTLE_ENDIAN__
2018  {0x1716151413121110UL, 0UL};
2019 #else
2020  {0x1011121314151617UL, 0UL};
2021 #endif
2022  __m64_union __t;
2023  __v2du __a, __r;
2024 
2025  __t.as_short[0] = __permute_selectors[__element_selector_98];
2026  __t.as_short[1] = __permute_selectors[__element_selector_BA];
2027  __t.as_short[2] = __permute_selectors[__element_selector_DC];
2028  __t.as_short[3] = __permute_selectors[__element_selector_FE];
2029  __pmask[1] = __t.as_m64;
2030  __a = (__v2du)__A;
2031  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
2032  return (__m128i)__r;
2033 }
2034 
2035 extern __inline __m128i
2036  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2037  _mm_shufflelo_epi16(__m128i __A, const int __mask) {
2038  unsigned long __element_selector_10 = __mask & 0x03;
2039  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
2040  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
2041  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
2042  static const unsigned short __permute_selectors[4] = {
2043 #ifdef __LITTLE_ENDIAN__
2044  0x0100, 0x0302, 0x0504, 0x0706
2045 #else
2046  0x0001, 0x0203, 0x0405, 0x0607
2047 #endif
2048  };
2049  __v2du __pmask =
2050 #ifdef __LITTLE_ENDIAN__
2051  {0UL, 0x1f1e1d1c1b1a1918UL};
2052 #else
2053  {0UL, 0x18191a1b1c1d1e1fUL};
2054 #endif
2055  __m64_union __t;
2056  __v2du __a, __r;
2057  __t.as_short[0] = __permute_selectors[__element_selector_10];
2058  __t.as_short[1] = __permute_selectors[__element_selector_32];
2059  __t.as_short[2] = __permute_selectors[__element_selector_54];
2060  __t.as_short[3] = __permute_selectors[__element_selector_76];
2061  __pmask[0] = __t.as_m64;
2062  __a = (__v2du)__A;
2063  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
2064  return (__m128i)__r;
2065 }
2066 
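/* Illustrative usage (an addition to this listing, not part of the
   original header): swap the two lowest 16-bit lanes.

     __m128i __v = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
     __m128i __r = _mm_shufflelo_epi16(__v, _MM_SHUFFLE(3, 2, 0, 1));
     // __r == _mm_set_epi16(7, 6, 5, 4, 3, 2, 0, 1)
*/
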
2067 extern __inline __m128i
2068  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2069  _mm_shuffle_epi32(__m128i __A, const int __mask) {
2070  unsigned long __element_selector_10 = __mask & 0x03;
2071  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
2072  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
2073  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
2074  static const unsigned int __permute_selectors[4] = {
2075 #ifdef __LITTLE_ENDIAN__
2076  0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
2077 #else
2078  0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
2079 #endif
2080  };
2081  __v4su __t;
2082 
2083  __t[0] = __permute_selectors[__element_selector_10];
2084  __t[1] = __permute_selectors[__element_selector_32];
2085  __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
2086  __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
2087  return (__m128i)vec_perm((__v4si)__A, (__v4si)__A,
2088  (__vector unsigned char)__t);
2089 }
2090 
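/* Illustrative usage (an addition to this listing, not part of the
   original header): reverse the four 32-bit lanes.

     __m128i __v = _mm_set_epi32(3, 2, 1, 0);
     __m128i __r = _mm_shuffle_epi32(__v, _MM_SHUFFLE(0, 1, 2, 3));
     // __r == _mm_set_epi32(0, 1, 2, 3)
*/
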
2091 extern __inline void
2092  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2093  _mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) {
2094  __v2du __hibit = {0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
2095  __v16qu __mask, __tmp;
2096  __m128i_u *__p = (__m128i_u *)__C;
2097 
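  /* Bytes of __B with the most significant bit set (unsigned value
     > 0x7f) select the corresponding byte of __A; all other bytes
     keep the existing memory contents.  This is emulated as an
     unaligned load, a byte select, and a store. */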
2098  __tmp = (__v16qu)_mm_loadu_si128(__p);
2099  __mask = (__v16qu)vec_cmpgt((__v16qu)__B, (__v16qu)__hibit);
2100  __tmp = vec_sel(__tmp, (__v16qu)__A, __mask);
2101  _mm_storeu_si128(__p, (__m128i)__tmp);
2102 }
2103 
2104 extern __inline __m128i
2105  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2106  _mm_avg_epu8(__m128i __A, __m128i __B) {
2107  return (__m128i)vec_avg((__v16qu)__A, (__v16qu)__B);
2108 }
2109 
2110 extern __inline __m128i
2111  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2112  _mm_avg_epu16(__m128i __A, __m128i __B) {
2113  return (__m128i)vec_avg((__v8hu)__A, (__v8hu)__B);
2114 }
2115 
2116 extern __inline __m128i
2117  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2118  _mm_sad_epu8(__m128i __A, __m128i __B) {
2119  __v16qu __a, __b;
2120  __v16qu __vabsdiff;
2121  __v4si __vsum;
2122  const __v4su __zero = {0, 0, 0, 0};
2123  __v4si __result;
2124 
2125  __a = (__v16qu)__A;
2126  __b = (__v16qu)__B;
2127 #ifndef _ARCH_PWR9
2128  __v16qu __vmin = vec_min(__a, __b);
2129  __v16qu __vmax = vec_max(__a, __b);
2130  __vabsdiff = vec_sub(__vmax, __vmin);
2131 #else
2132  __vabsdiff = vec_absd(__a, __b);
2133 #endif
2134  /* Sum four groups of bytes into integers. */
2135  __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
2136 #ifdef __LITTLE_ENDIAN__
2137  /* Sum across four integers with two integer results. */
2138  __asm__("vsum2sws %0,%1,%2" : "=v"(__result) : "v"(__vsum), "v"(__zero));
2139  /* Note: vec_sum2s could be used here, but on little-endian it adds
2140  vector shifts that are not needed for this use case: it would
2141  shift the 32-bit integer results (currently at [0] and [2]) into
2142  [1] and [3], which would then have to be shifted back, since the
2143  desired results are two 64-bit integers ([1]|[0] and [3]|[2]).
2144  Thus, no shift is performed. */
2145 #else
2146  /* Sum across four integers with two integer results. */
2147  __result = vec_sum2s(__vsum, (__vector signed int)__zero);
2148  /* Rotate the sums into the correct position. */
2149  __result = vec_sld(__result, __result, 6);
2150 #endif
2151  return (__m128i)__result;
2152 }
2153 
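/* Illustrative usage (an addition to this listing, not part of the
   original header): sums of absolute byte differences.

     __m128i __x = _mm_set1_epi8(10);
     __m128i __y = _mm_set1_epi8(3);
     __m128i __s = _mm_sad_epu8(__x, __y);
     // each 64-bit lane of __s holds 8 * |10 - 3| == 56
*/
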
2154 extern __inline void
2155  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2156  _mm_stream_si32(int *__A, int __B) {
2157  /* Use the data cache block touch for store transient. */
2158  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2159  *__A = __B;
2160 }
2161 
2162 extern __inline void
2163  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2164  _mm_stream_si64(long long int *__A, long long int __B) {
2165  /* Use the data cache block touch for store transient. */
2166  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2167  *__A = __B;
2168 }
2169 
2170 extern __inline void
2171  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2172  _mm_stream_si128(__m128i *__A, __m128i __B) {
2173  /* Use the data cache block touch for store transient. */
2174  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2175  *__A = __B;
2176 }
2177 
2178 extern __inline void
2179  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2180  _mm_stream_pd(double *__A, __m128d __B) {
2181  /* Use the data cache block touch for store transient. */
2182  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2183  *(__m128d *)__A = __B;
2184 }
2185 
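/* Note (an addition to this listing): PowerISA has no true
   non-temporal store, so the _mm_stream_* helpers above hint
   transience with dcbtstt and then perform an ordinary store. */
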
2186 extern __inline void
2187  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2188  _mm_clflush(void const *__A) {
2189  /* Use the data cache block flush. */
2190  __asm__("dcbf 0,%0" : : "b"(__A) : "memory");
2191 }
2192 
2193 extern __inline void
2194  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2195  _mm_lfence(void) {
2196  /* Use lightweight sync for load-to-load ordering. */
2197  __atomic_thread_fence(__ATOMIC_RELEASE);
2198 }
2199 
2200 extern __inline void
2201  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2202  _mm_mfence(void) {
2203  /* Use heavyweight sync for any-to-any ordering. */
2204  __atomic_thread_fence(__ATOMIC_SEQ_CST);
2205 }
2206 
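/* Note (an addition to this listing): on powerpc64 the two fences
   above typically lower to lwsync (from __ATOMIC_RELEASE) and hwsync
   (from __ATOMIC_SEQ_CST), respectively. */
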
2207 extern __inline __m128i
2208  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2209  _mm_cvtsi32_si128(int __A) {
2210  return _mm_set_epi32(0, 0, 0, __A);
2211 }
2212 
2213 extern __inline __m128i
2214  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2215  _mm_cvtsi64_si128(long long __A) {
2216  return __extension__(__m128i)(__v2di){__A, 0LL};
2217 }
2218 
2219 /* Microsoft intrinsic. */
2220 extern __inline __m128i
2221  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2222  _mm_cvtsi64x_si128(long long __A) {
2223  return __extension__(__m128i)(__v2di){__A, 0LL};
2224 }
2225 
2226 /* Casts between various SP, DP, INT vector types. Note that these perform
2227  no conversion of values; they just change the type. */
2228 extern __inline __m128
2229  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2230  _mm_castpd_ps(__m128d __A) {
2231  return (__m128)__A;
2232 }
2233 
2234 extern __inline __m128i
2235  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2236  _mm_castpd_si128(__m128d __A) {
2237  return (__m128i)__A;
2238 }
2239 
2240 extern __inline __m128d
2241  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2242  _mm_castps_pd(__m128 __A) {
2243  return (__m128d)__A;
2244 }
2245 
2246 extern __inline __m128i
2247  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2248  _mm_castps_si128(__m128 __A) {
2249  return (__m128i)__A;
2250 }
2251 
2252 extern __inline __m128
2253  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2254  _mm_castsi128_ps(__m128i __A) {
2255  return (__m128)__A;
2256 }
2257 
2258 extern __inline __m128d
2259  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2260  _mm_castsi128_pd(__m128i __A) {
2261  return (__m128d)__A;
2262 }
2263 
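/* Illustrative usage (an addition to this listing, not part of the
   original header): casts are pure bit reinterpretation.

     __m128i __bits = _mm_set_epi32(0, 0, 0x3ff00000, 0);
     __m128d __d = _mm_castsi128_pd(__bits);   // low lane == 1.0
     __m128i __back = _mm_castpd_si128(__d);   // same bits as __bits
*/
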
2264 #else
2265 #include_next <emmintrin.h>
2266 #endif /* defined(__powerpc64__) && \
2267  * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
2268 
2269 #endif /* EMMINTRIN_H_ */
__device__ int
static __inline__ vector unsigned char __ATTRS_o_ai vec_sr(vector unsigned char __a, vector unsigned char __b)
Definition: altivec.h:10393
static __inline__ vector bool char __ATTRS_o_ai vec_cmpeq(vector signed char __a, vector signed char __b)
Definition: altivec.h:1708
static __inline__ vector signed char __ATTRS_o_ai vec_sra(vector signed char __a, vector unsigned char __b)
Definition: altivec.h:10527
static __inline__ vector int __ATTRS_o_ai vec_vmrghw(vector int __a, vector int __b)
Definition: altivec.h:5326
static __inline__ vector signed char __ATTRS_o_ai vec_sro(vector signed char __a, vector signed char __b)
Definition: altivec.h:10979
#define vec_ctf(__a, __b)
Definition: altivec.h:3244
static __inline__ vector float vector float vector float __c
Definition: altivec.h:4800
static __inline__ vector short __ATTRS_o_ai vec_mule(vector signed char __a, vector signed char __b)
Definition: altivec.h:6263
static __inline__ vector float vector float __b
Definition: altivec.h:578
static __inline__ vector signed char __ATTRS_o_ai vec_ld(long __a, const vector signed char *__b)
Definition: altivec.h:4061
static __inline__ vector signed char __ATTRS_o_ai vec_splats(signed char __a)
Definition: altivec.h:14737
static __inline__ vector signed char __ATTRS_o_ai vec_andc(vector signed char __a, vector signed char __b)
Definition: altivec.h:1235
static __inline__ void __ATTRS_o_ai vec_st(vector signed char __a, long __b, vector signed char *__c)
Definition: altivec.h:11184
static __inline__ vector signed int __ATTRS_o_ai vec_sld(vector signed int, vector signed int, unsigned const int __c)
Definition: altivec.h:9149
static __inline__ vector short __ATTRS_o_ai vec_unpackl(vector signed char __a)
Definition: altivec.h:12781
static __inline__ vector int __ATTRS_o_ai vec_sum4s(vector signed char __a, vector int __b)
Definition: altivec.h:12487
static __inline__ vector signed char __ATTRS_o_ai vec_and(vector signed char __a, vector signed char __b)
Definition: altivec.h:882
static __inline__ vector signed char __ATTRS_o_ai vec_avg(vector signed char __a, vector signed char __b)
Definition: altivec.h:1586
static __inline__ vector signed char __ATTRS_o_ai vec_mergel(vector signed char __a, vector signed char __b)
Definition: altivec.h:5361
static __inline__ vector signed char __ATTRS_o_ai vec_subs(vector signed char __a, vector signed char __b)
Definition: altivec.h:12149
static __inline__ vector int __ATTRS_o_ai vec_splat_s32(signed char __a)
Definition: altivec.h:10353
static __inline__ vector signed char __ATTRS_o_ai vec_adds(vector signed char __a, vector signed char __b)
Definition: altivec.h:626
static __inline__ vector signed char __ATTRS_o_ai vec_perm(vector signed char __a, vector signed char __b, vector unsigned char __c)
Definition: altivec.h:7962
static __inline__ vector signed char __ATTRS_o_ai vec_sel(vector signed char __a, vector signed char __b, vector unsigned char __c)
Definition: altivec.h:8588
static __inline__ vector signed char __ATTRS_o_ai vec_mergeh(vector signed char __a, vector signed char __b)
Definition: altivec.h:5091
static __inline__ vector bool char __ATTRS_o_ai vec_cmplt(vector signed char __a, vector signed char __b)
Definition: altivec.h:2435
static __inline__ vector signed char __ATTRS_o_ai vec_max(vector signed char __a, vector signed char __b)
Definition: altivec.h:4838
static __inline__ vector signed char __ATTRS_o_ai vec_slo(vector signed char __a, vector signed char __b)
Definition: altivec.h:9884
static __inline__ vector signed char __ATTRS_o_ai vec_nor(vector signed char __a, vector signed char __b)
Definition: altivec.h:6729
static __inline__ vector bool char __ATTRS_o_ai vec_cmpge(vector signed char __a, vector signed char __b)
Definition: altivec.h:2243
static __inline__ vector unsigned char __ATTRS_o_ai vec_packsu(vector short __a, vector short __b)
Definition: altivec.h:7844
static __inline__ vector signed char __ATTRS_o_ai vec_min(vector signed char __a, vector signed char __b)
Definition: altivec.h:5742
#define vec_cts
Definition: altivec.h:3319
static __inline__ vector signed char __ATTRS_o_ai vec_splat(vector signed char __a, unsigned const int __b)
Definition: altivec.h:10090
static __inline__ vector signed char __ATTRS_o_ai vec_or(vector signed char __a, vector signed char __b)
Definition: altivec.h:6865
static __inline__ vector short __ATTRS_o_ai vec_unpackh(vector signed char __a)
Definition: altivec.h:12642
static __inline__ vector unsigned char __ATTRS_o_ai vec_sl(vector unsigned char __a, vector unsigned char __b)
Definition: altivec.h:8882
static __inline__ vector short __ATTRS_o_ai vec_splat_s16(signed char __a)
Definition: altivec.h:10337
static __inline__ vector unsigned char __ATTRS_o_ai vec_xor(vector unsigned char __a, vector unsigned char __b)
Definition: altivec.h:13207
static __inline__ vector bool char __ATTRS_o_ai vec_cmpgt(vector signed char __a, vector signed char __b)
Definition: altivec.h:2131
static __inline__ vector bool char __ATTRS_o_ai vec_cmple(vector signed char __a, vector signed char __b)
Definition: altivec.h:2369
static __inline__ vector signed char __ATTRS_o_ai vec_packs(vector short __a, vector short __b)
Definition: altivec.h:7715
static __inline__ vector signed char __ATTRS_o_ai vec_sub(vector signed char __a, vector signed char __b)
Definition: altivec.h:11869
static __inline__ uint32_t volatile uint32_t * __p
Definition: arm_acle.h:80
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1489
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0, __m64 __q1)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 64-bit integral ...
Definition: emmintrin.h:3742
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1044
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-3) values from each of the two 128-bit vectors of [8 x i16] and interl...
Definition: emmintrin.h:4531
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a)
Moves the 64-bit operand to a 128-bit integer vector, zeroing the upper bits.
Definition: emmintrin.h:4606
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:1953
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
Initializes the 16-bit values in a 128-bit vector of [8 x i16] with the specified 16-bit integer valu...
Definition: emmintrin.h:3585
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1020
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1805
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a, __m128i __b)
Converts, with saturation, 16-bit signed integers from both 128-bit integer vector operands into 8-bi...
Definition: emmintrin.h:4188
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the smaller value f...
Definition: emmintrin.h:2359
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:585
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a, __m128d __b)
Adds lower double-precision values in both operands and returns the sum in the lower 64 bits of the r...
Definition: emmintrin.h:74
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a, __m128d __b)
Performs an element-by-element division of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:212
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a, __m128d __b)
Subtracts two 128-bit vectors of [2 x double].
Definition: emmintrin.h:132
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit integer vector.
Definition: emmintrin.h:4740
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a, __m128d __b)
Performs a bitwise OR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:398
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p, __m128i __a)
Stores a 128-bit integer vector to a 128-bit aligned memory location.
Definition: emmintrin.h:4037
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
Definition: emmintrin.h:4263
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a, __m128i __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2811
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors.
Definition: emmintrin.h:2662
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:820
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1186
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition: emmintrin.h:1609
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a, __m128i __b)
Subtracts the corresponding elements of two [2 x i64] vectors.
Definition: emmintrin.h:2559
static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a)
Moves the least significant 64 bits of a vector of [2 x i64] to a 64-bit signed integer value.
Definition: emmintrin.h:3410
static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
Moves bytes selected by the mask from the first operand to the specified unaligned memory location.
Definition: emmintrin.h:3978
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1162
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
Initializes the 32-bit values in a 128-bit vector of [4 x i32] with the specified 32-bit integer valu...
Definition: emmintrin.h:3545
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1210
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [16 x i8] vectors,...
Definition: emmintrin.h:2154
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an aligned memory location.
Definition: emmintrin.h:1553
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a)
Converts the lower two integer elements of a 128-bit vector of [4 x i32] into two double-precision fl...
Definition: emmintrin.h:1315
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3075
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a, int __count)
Right-shifts each of 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3002
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1789
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3215
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double] initialized with the specified double-prec...
Definition: emmintrin.h:1823
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a, __m128i __b)
Subtracts the corresponding 16-bit integer values in the operands.
Definition: emmintrin.h:2507
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:742
#define _mm_slli_si128(a, imm)
Left-shifts the 128-bit integer vector operand by the specified number of bytes.
Definition: emmintrin.h:2736
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a, __m128d __b)
Divides the lower double-precision value of the first operand by the lower double-precision value of ...
Definition: emmintrin.h:193
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p, __m128i __a)
Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to a memory location.
Definition: emmintrin.h:3997
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3235
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a, __m128i __b)
Performs a bitwise OR of two 128-bit integer vectors.
Definition: emmintrin.h:2697
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:519
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns a vecto...
Definition: emmintrin.h:298
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp)
Loads a 64-bit double-precision value to the low element of a 128-bit integer vector and clears the u...
Definition: emmintrin.h:1684
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a, __m128d __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them ...
Definition: emmintrin.h:4641
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:767
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors, using the one's complement of the values conta...
Definition: emmintrin.h:2681
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3133
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a, __m128i __count)
Right-shifts each of 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3020
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding 16-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3094
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the lower 16 bits of ea...
Definition: emmintrin.h:2416
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1138
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a, int __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2866
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a, __m128d __b)
Performs a bitwise XOR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:415
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the greater value f...
Definition: emmintrin.h:2321
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition: emmintrin.h:2258
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:1933
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:4800
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a, __m128i __count)
Right-shifts each of 16-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:2984
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-7) values from two 128-bit vectors of [16 x i8] and interleaves them i...
Definition: emmintrin.h:4502
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:793
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3155
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:973
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a, __m128d __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them i...
Definition: emmintrin.h:4661
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 8-bit signed integer values in the input and returns the di...
Definition: emmintrin.h:2581
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:717
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:669
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a, __m128i __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them into...
Definition: emmintrin.h:4575
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a, int __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2904
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a)
Extracts the sign bits of the double-precision values in the 128-bit vector of [2 x double],...
Definition: emmintrin.h:4679
#define _mm_shuffle_pd(a, b, i)
Constructs a 128-bit floating-point vector of [2 x double] from two 128-bit vector parameters of [2 x...
Definition: emmintrin.h:4710
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding 32-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3113
static __inline__ void int __a
Definition: emmintrin.h:4057
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a, __m128d __b)
Multiplies lower double-precision values in both operands and returns the product in the lower 64 bit...
Definition: emmintrin.h:153
void _mm_mfence(void)
Forces strong memory ordering (serialization) between load and store instructions preceding this inst...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 16-bit unsigned integer values in the input and returns the...
Definition: emmintrin.h:2645
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a)
Moves the lower 64 bits of a 128-bit integer vector to a 128-bit integer vector, zeroing the upper bi...
Definition: emmintrin.h:4622
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:477
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a, __m128i __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them int...
Definition: emmintrin.h:4468
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a)
Converts the low-order element of a [2 x double] vector into a 32-bit signed truncated (rounded towar...
Definition: emmintrin.h:1470
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x float].
Definition: emmintrin.h:3313
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a)
Returns a vector of [2 x i64] where the lower element is the input operand and the upper element is z...
Definition: emmintrin.h:3379
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a, __m128i __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x i32] and interleaves them i...
Definition: emmintrin.h:4554
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3175
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a, __m128 __b)
Converts the lower single-precision floating-point element of a 128-bit vector of [4 x float],...
Definition: emmintrin.h:1426
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a)
Calculates the square root of the each of two values stored in a 128-bit vector of [2 x double].
Definition: emmintrin.h:253
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a, __m128d __b)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1876
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a, __m128i __b)
Unpacks the high-order (index 4-7) values from two 128-bit vectors of [8 x i16] and interleaves them ...
Definition: emmintrin.h:4424
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1508
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1337
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void)
Generates a 128-bit vector of [4 x i32] with unspecified content.
Definition: emmintrin.h:3477
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit integer vector.
Definition: emmintrin.h:4770
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1234
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two 128-bit signed [8 x i16] vectors, producing eight interm...
Definition: emmintrin.h:2283
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition: emmintrin.h:2220
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a)
Returns a vector of [4 x i32] where the lowest element is the input operand and the remaining element...
Definition: emmintrin.h:3364
#define _mm_load_pd1(dp)
Definition: emmintrin.h:1577
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a, int __count)
Right-shifts each of 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3038
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a, int __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2757
#define _mm_insert_epi16(a, b, imm)
Constructs a 128-bit integer vector by first making a copy of the 128-bit integer vector parameter,...
Definition: emmintrin.h:4247
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:606
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a, __m128d __b)
Converts the lower double-precision floating-point element of a 128-bit vector of [2 x double],...
Definition: emmintrin.h:1379
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a)
Converts the low-order element of a 128-bit vector of [2 x double] into a 32-bit signed integer value...
Definition: emmintrin.h:1356
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a, __m128d __b)
Subtracts the lower double-precision value of the second operand from the lower double-precision valu...
Definition: emmintrin.h:114
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the low-order bits of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1735
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [16 x i8] vectors,...
Definition: emmintrin.h:2198
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [8 x i16] vectors,...
Definition: emmintrin.h:2176
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 16-bit signed integer values in the input and returns the d...
Definition: emmintrin.h:2603
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a, __m128i __b)
Subtracts the corresponding 8-bit integer values in the operands.
Definition: emmintrin.h:2490
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] with unspecified content.
Definition: emmintrin.h:1755
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a, __m128i __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x i32] and interleaves them ...
Definition: emmintrin.h:4447
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1276
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
Definition: emmintrin.h:3843
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the upper 16 bits of ea...
Definition: emmintrin.h:2378
#define _mm_extract_epi16(a, imm)
Extracts 16 bits from a 128-bit integer vector of [8 x i16], using the immediate-value parameter as a...
Definition: emmintrin.h:4219
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1092
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:564
#define _mm_shufflelo_epi16(a, imm)
Constructs a 128-bit integer vector by shuffling four lower 16-bit elements of a 128-bit integer vect...
Definition: emmintrin.h:4331
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [2 x i64], saving the lower 64 bits of each...
Definition: emmintrin.h:2132
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two unsigned [8 x i16] vectors, saving the upper 16 bits of ...
Definition: emmintrin.h:2397
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double], using the one's complement of the valu...
Definition: emmintrin.h:381
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:996
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the greater value fro...
Definition: emmintrin.h:2302
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit floating-point vector of [4 x fl...
Definition: emmintrin.h:4725
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a, __m128i __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2847
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, __m128i __b)
Converts, with saturation, 16-bit signed integers from both 128-bit integer vector operands into 8-bi...
Definition: emmintrin.h:4142
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i)
Initializes all values in a 128-bit vector of [4 x i32] with the specified 32-bit value.
Definition: emmintrin.h:3689
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a, __m128i __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2885
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp)
Loads two double-precision values, in reverse order, from an aligned memory location into a 128-bit v...
Definition: emmintrin.h:1593
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1068
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:648
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit floating-point vector of [2 x dou...
Definition: emmintrin.h:4755
#define _mm_bsrli_si128(a, imm)
Definition: emmintrin.h:2949
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp, __m128d __a)
Stores the upper 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:2009
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a, int __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2793
void _mm_lfence(void)
Forces strong memory ordering (serialization) between load instructions preceding this instruction an...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a, __m128i __b)
Computes the absolute differences of corresponding 8-bit integer values in two 128-bit vectors.
Definition: emmintrin.h:2473
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:947
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:847
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:2028
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp, __m128d __a)
Moves packed double-precision values from a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1914
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [4 x i32], saving the lower 32 bits of each...
Definition: emmintrin.h:2094
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition: emmintrin.h:3440
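Paired with _mm_storeu_si128 (listed below), the unaligned load gives a 16-byte memcpy-style round trip; a sketch with illustrative names:
#include <emmintrin.h>
static void copy16(const void *src, void *dst) {
  __m128i v = _mm_loadu_si128((const __m128i_u *)src); /* no alignment required */
  _mm_storeu_si128((__m128i_u *)dst, v);
}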
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
Definition: emmintrin.h:3723
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, __m128i __b)
Converts, with saturation, 32-bit signed integers from both 128-bit integer vector operands into 16-bit signed integers and packs the results into the destination.
Definition: emmintrin.h:4165
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a, __m128i __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2775
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [16 x i8], saving the lower 8 bits of each sum in the corresponding element of a 128-bit result vector of [16 x i8].
Definition: emmintrin.h:2052
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point vectors of [2 x double] to determine if the value in the first parameter is not less than or equal to the corresponding value in the second parameter.
Definition: emmintrin.h:922
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to determine if the values in the first operand are greater than those in the second operand.
Definition: emmintrin.h:498
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)
Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer vector.
Definition: emmintrin.h:3425
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point vectors of [2 x double] for equality.
Definition: emmintrin.h:692
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p, __m128d __a)
Stores a 128-bit floating-point vector of [2 x double] to a 128-bit aligned memory location.
Definition: emmintrin.h:4019
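A sketch of a non-temporal store; the destination must be 16-byte aligned, and the function name is illustrative only:
#include <emmintrin.h>
static void stream_two(double *dst /* 16-byte aligned */, __m128d v) {
  _mm_stream_pd(dst, v);  /* store that is intended to bypass the cache hierarchy */
}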
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if the values in the first operand are less than those in the second operand.
Definition: emmintrin.h:3195
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [4 x float].
Definition: emmintrin.h:4785
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition: emmintrin.h:1857
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:361
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1, long long __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer values.
Definition: emmintrin.h:3497
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a, int __b)
Converts a 32-bit signed integer value, in the second parameter, into a double-precision floating-point value, and stores the result in the lower 64 bits of the destination. The upper 64 bits of the destination are copied from the first parameter.
Definition: emmintrin.h:1401
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a, __m128i __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the corresponding elements of two [2 x i64] vectors, and returns the 64-bit products in the corresponding elements of a [2 x i64] vector.
Definition: emmintrin.h:2452
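Only the low 32 bits of each 64-bit element participate, so the full 64-bit product always fits; an illustrative sketch (name and constant are hypothetical):
#include <emmintrin.h>
static __m128i widening_mul_demo(void) {
  __m128i a = _mm_set1_epi32(0x10000); /* 2^16 in all four 32-bit lanes */
  return _mm_mul_epu32(a, a);          /* each 64-bit lane holds 2^32 */
}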
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point vectors of [2 x double] to determine if the value in the first parameter is not less than the corresponding value in the second parameter.
Definition: emmintrin.h:897
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a, __m128i __count)
Right-shifts each of the 64-bit values in the 128-bit integer vector operand by the specified number of bits. High-order bits are cleared.
Definition: emmintrin.h:3056
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point vectors of [2 x double] to determine if the value in the first parameter is unequal to the corresponding value in the second parameter.
Definition: emmintrin.h:872
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a)
Converts a vector of [4 x float] into four signed truncated (rounded toward zero) 32-bit integers, returned in a vector of [4 x i32].
Definition: emmintrin.h:3350
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 16-bit integral values.
Definition: emmintrin.h:3796
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the high-order bits of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1709
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
Initializes the 8-bit values in a 128-bit vector of [16 x i8] with the specified 8-bit integer values.
Definition: emmintrin.h:3634
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q)
Initializes both values in a 128-bit vector of [2 x i64] with the specified 64-bit value.
Definition: emmintrin.h:3672
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, __m128d __b)
Calculates the square root of the lower double-precision value of the second operand and returns it in the lower 64 bits of the result. The upper 64 bits of the result are copied from the upper double-precision value of the first operand.
Definition: emmintrin.h:236
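The merge semantics are easy to get backwards; a sketch using the setters defined earlier in the header (values and name are illustrative):
#include <emmintrin.h>
static __m128d sqrt_merge_demo(void) {
  __m128d a = _mm_set1_pd(7.0); /* upper lane of the result comes from here */
  __m128d b = _mm_set_sd(9.0);  /* {9.0, 0.0}; only the lower lane is used */
  return _mm_sqrt_sd(a, b);     /* {3.0, 7.0} */
}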
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double], initialized in reverse order with the specified double-precision floating-point values.
Definition: emmintrin.h:1843
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1, __m64 __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer values.
Definition: emmintrin.h:3518
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the lesser of the pair of values in the lower 64 bits of the result. The upper 64 bits of the result are copied from the upper 64 bits of the first source operand.
Definition: emmintrin.h:277
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a, int __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2829
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a, __m128d __b)
Adds two 128-bit vectors of [2 x double].
Definition: emmintrin.h:92
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1893
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp)
Loads a double-precision floating-point value from a specified memory location and duplicates it to both vector elements of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1569
#define _mm_shufflehi_epi16(a, imm)
Constructs a 128-bit integer vector by shuffling four upper 16-bit elements of a 128-bit integer vector of [8 x i16], using the immediate-value parameter as a specifier.
Definition: emmintrin.h:4364
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns a vector containing the greater of each pair of values.
Definition: emmintrin.h:344
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the smaller value from each comparison in the corresponding element of a 128-bit result vector of [8 x i16].
Definition: emmintrin.h:2340
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a)
Returns the lower 64 bits of a 128-bit integer vector as a 64-bit integer.
Definition: emmintrin.h:4591
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p, __m128i __b)
Stores a 128-bit integer vector to a memory location aligned on a 128-bit boundary.
Definition: emmintrin.h:3874
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a, __m128d __b)
Multiplies two 128-bit vectors of [2 x double].
Definition: emmintrin.h:171
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] for equality.
Definition: emmintrin.h:435
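Packed compares return an all-ones or all-zeros bit pattern per lane, which composes with the bitwise operations listed here; an illustrative select sketch (names are hypothetical):
#include <emmintrin.h>
static __m128d keep_equal_lanes(__m128d a, __m128d b) {
  __m128d mask = _mm_cmpeq_pd(a, b); /* all-ones where a[i] == b[i] */
  return _mm_and_pd(a, mask);        /* unequal lanes collapse to +0.0 */
}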
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1773
static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a)
Converts the two signed 32-bit integer elements of a 64-bit vector of [2 x i32] into two double-precision floating-point values, returned in a 128-bit vector of [2 x double].
Definition: emmintrin.h:1523
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the greater of the pair of values in the lower 64 bits of the result. The upper 64 bits of the result are copied from the upper 64 bits of the first source operand.
Definition: emmintrin.h:323
static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a)
Returns the low-order element of a 128-bit vector of [2 x double] as a double-precision floating-point value.
Definition: emmintrin.h:1538
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadl_epi64(__m128i_u const *__p)
Returns a vector of [2 x i64] where the lower element is taken from the lower element of the operand, and the upper element is zero.
Definition: emmintrin.h:3460
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w)
Initializes all values in a 128-bit vector of [8 x i16] with the specified 16-bit value.
Definition: emmintrin.h:3706
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q)
Initializes both values in a 128-bit integer vector with the specified 64-bit integer value.
Definition: emmintrin.h:3655
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a)
Moves the least significant 32 bits of a vector of [4 x i32] to a 32-bit signed integer value.
Definition: emmintrin.h:3394
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a, __m128i __b)
Subtracts the corresponding 32-bit integer values in the operands.
Definition: emmintrin.h:2524
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving each result in the corresponding element of a 128-bit result vector of [16 x i8].
Definition: emmintrin.h:2239
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a, __m64 __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the two 64-bit integer vectors and returns the 64-bit product.
Definition: emmintrin.h:2434
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp, __m128d __a)
Stores two double-precision values, in reverse order, from a 128-bit vector of [2 x double] to a 16-byte aligned memory location.
Definition: emmintrin.h:1992
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a, __m128i __b)
Unpacks the high-order (index 8-15) values from two 128-bit vectors of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
Definition: emmintrin.h:4396
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a, int __count)
Right-shifts each of the 16-bit values in the 128-bit integer vector operand by the specified number of bits. High-order bits are cleared.
Definition: emmintrin.h:2966
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition: emmintrin.h:1970
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point vectors of [2 x double] to determine if the value in the first parameter is unequal to the corresponding value in the second parameter.
Definition: emmintrin.h:1258
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [8 x i16], saving the lower 16 bits of each sum in the corresponding element of a 128-bit result vector of [8 x i16].
Definition: emmintrin.h:2073
#define _mm_bslli_si128(a, imm)
Definition: emmintrin.h:2740
#define _mm_srli_si128(a, imm)
Right-shifts the 128-bit integer vector operand by the specified number of bytes.
Definition: emmintrin.h:2945
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 8-bit unsigned integer values in the input and returns the differences in the corresponding bytes in the destination.
Definition: emmintrin.h:2624
#define _mm_shuffle_epi32(a, imm)
Constructs a 128-bit integer vector by shuffling four 32-bit elements of a 128-bit integer vector parameter, using the immediate-value parameter as a specifier.
Definition: emmintrin.h:4298
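The immediate is usually built with the _MM_SHUFFLE macro from <xmmintrin.h>; a sketch that reverses the four lanes (the function name is illustrative):
#include <emmintrin.h>
static __m128i reverse_epi32(__m128i v) {
  /* result lane i takes the source lane selected by the i-th 2-bit field */
  return _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3));
}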
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a)
Converts the lower two single-precision floating-point elements of a 128-bit vector of [4 x float] into two double-precision floating-point values, returned in a 128-bit vector of [2 x double].
Definition: emmintrin.h:1294
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition: emmintrin.h:3858
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point vectors of [2 x double] to determine if the value in the first parameter is unequal to the corresponding value in the second parameter.
Definition: emmintrin.h:1116
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to determine if the values in the first operand are ordered with respect to those in the second operand.
Definition: emmintrin.h:541
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to determine if the values in the first operand are less than those in the second operand.
Definition: emmintrin.h:456
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition: emmintrin.h:3889
typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)))
Definition: emmintrin.h:19
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, __m128i __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2923
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 32-bit integral values.
Definition: emmintrin.h:3764
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into two signed truncated (rounded toward zero) 32-bit integers, returned in the lower 64 bits of a 128-bit vector of [4 x i32].
Definition: emmintrin.h:1450
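Truncation rounds toward zero regardless of the current rounding mode; a small sketch (values and name are illustrative):
#include <emmintrin.h>
static __m128i trunc_demo(void) {
  __m128d v = _mm_setr_pd(1.9, -1.9);
  return _mm_cvttpd_epi32(v); /* {1, -1, 0, 0}; the upper two lanes are zeroed */
}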
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32].
Definition: emmintrin.h:3331
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a, __m128i __b)
Performs a bitwise exclusive OR of two 128-bit integer vectors.
Definition: emmintrin.h:2714
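XOR of a register with itself is the usual zeroing idiom, equivalent to _mm_setzero_si128 listed above; a one-line sketch (name illustrative):
#include <emmintrin.h>
static __m128i zero_like(__m128i v) {
  return _mm_xor_si128(v, v); /* all 128 bits cleared */
}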
void _mm_clflush(void const *__p)
The cache line containing __p is flushed and invalidated from all caches in the coherency domain.
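A hedged sketch of flushing one line and then fencing loads; whether a fence is needed at all depends on the surrounding protocol, and the helper name is illustrative:
#include <emmintrin.h>
static void flush_line(const void *p) {
  _mm_clflush(p); /* evict the line containing p from the coherency domain */
  _mm_lfence();   /* later loads do not begin until earlier loads complete */
}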
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to determine if the values in the first operand are not less than or equal to those in the second operand.
Definition: emmintrin.h:627