1 /*===---- xmmintrin.h - Implementation of SSE intrinsics on PowerPC --------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 /* Implemented from the specification included in the Intel C++ Compiler
11  User Guide and Reference, version 9.0. */
12 
13 #ifndef NO_WARN_X86_INTRINSICS
14 /* This header file is to help porting code using Intel intrinsics
15  explicitly from x86_64 to powerpc64/powerpc64le.
16 
17  Since the X86 SSE intrinsics mainly handle the __m128 type, the PowerPC
18  VMX/VSX ISA is a good match for vector float SIMD operations.
19  However, scalar float operations in vector (XMM) registers require
20  the POWER8 VSX ISA (2.07) level. There are differences in the data
21  format and placement of float scalars in the vector register, which
22  require extra steps to match SSE scalar float semantics on POWER.
23 
24  Note that there are significant differences between the X86_64 MXCSR
25  and the PowerISA FPSCR/VSCR registers. It is recommended to use the
26  portable <fenv.h> interface instead of accessing the MXCSR directly.
27 
28  Most SSE scalar float intrinsic operations can be performed more
29  efficiently as C language float scalar operations or optimized to
30  use vector SIMD operations. We recommend this for new applications. */
31 #error \
32  "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
33 #endif
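/* For example, rather than reading or writing the MXCSR directly, ported
   code can control rounding through the portable <fenv.h> interface.  A
   minimal sketch (illustrative only, not part of this header; fesetround()
   selects the rounding mode used by the scalar conversions below):

     #include <fenv.h>

     int __saved_mode = fegetround ();
     fesetround (FE_TOWARDZERO);   // equivalent of MXCSR.RC = truncate
     // ... conversions that should truncate ...
     fesetround (__saved_mode);    // restore the previous rounding mode

   Once the caveats above have been reviewed, compile the ported code with
   -DNO_WARN_X86_INTRINSICS to silence the #error. */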
34 
35 #ifndef XMMINTRIN_H_
36 #define XMMINTRIN_H_
37 
38 #if defined(__powerpc64__) && \
39  (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
40 
41 /* Define four value permute mask */
42 #define _MM_SHUFFLE(w, x, y, z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
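/* Worked examples (the values follow directly from the macro definition
   above; each argument is a 2-bit element selector used by intrinsics such
   as _mm_shuffle_ps below):

     _MM_SHUFFLE (3, 2, 1, 0) == 0xE4   // identity selector
     _MM_SHUFFLE (0, 1, 2, 3) == 0x1B   // reverse the four elements
*/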
43 
44 #include <altivec.h>
45 
46 /* Avoid collisions between altivec.h and strict adherence to C++ and
47  C11 standards. This should eventually be done inside altivec.h itself,
48  but only after testing a full distro build. */
49 #if defined(__STRICT_ANSI__) && \
50  (defined(__cplusplus) || \
51  (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L))
52 #undef vector
53 #undef pixel
54 #undef bool
55 #endif
56 
57 /* We need type definitions from the MMX header file. */
58 #include <mmintrin.h>
59 
60 /* Get _mm_malloc () and _mm_free (). */
61 #if __STDC_HOSTED__
62 #include <mm_malloc.h>
63 #endif
64 
65 /* The Intel API is flexible enough that we must allow aliasing with other
66  vector types, and their scalar components. */
67 typedef vector float __m128 __attribute__((__may_alias__));
68 
69 /* Unaligned version of the same type. */
70 typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1)));
71 
72 /* Internal data types for implementing the intrinsics. */
73 typedef vector float __v4sf;
74 
75 /* Create an undefined vector. */
76 extern __inline __m128
77  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
78  _mm_undefined_ps(void) {
79  __m128 __Y = __Y;
80  return __Y;
81 }
82 
83 /* Create a vector of zeros. */
84 extern __inline __m128
85  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
86  _mm_setzero_ps(void) {
87  return __extension__(__m128){0.0f, 0.0f, 0.0f, 0.0f};
88 }
89 
90 /* Load four SPFP values from P. The address must be 16-byte aligned. */
91 extern __inline __m128
92  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
93  _mm_load_ps(float const *__P) {
94  return ((__m128)vec_ld(0, (__v4sf *)__P));
95 }
96 
97 /* Load four SPFP values from P. The address need not be 16-byte aligned. */
98 extern __inline __m128
99  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
100  _mm_loadu_ps(float const *__P) {
101  return (vec_vsx_ld(0, __P));
102 }
103 
104 /* Load four SPFP values in reverse order. The address must be aligned. */
105 extern __inline __m128
106  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
107  _mm_loadr_ps(float const *__P) {
108  __v4sf __tmp;
109  __m128 __result;
110  static const __vector unsigned char __permute_vector = {
111  0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B,
112  0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13};
113 
114  __tmp = vec_ld(0, (__v4sf *)__P);
115  __result = (__m128)vec_perm(__tmp, __tmp, __permute_vector);
116  return __result;
117 }
118 
119 /* Create a vector with all four elements equal to F. */
120 extern __inline __m128
121  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
122  _mm_set1_ps(float __F) {
123  return __extension__(__m128)(__v4sf){__F, __F, __F, __F};
124 }
125 
126 extern __inline __m128
127  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
128  _mm_set_ps1(float __F) {
129  return _mm_set1_ps(__F);
130 }
131 
132 /* Create the vector [Z Y X W]. */
133 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
134  __artificial__))
135 _mm_set_ps(const float __Z, const float __Y, const float __X, const float __W) {
136  return __extension__(__m128)(__v4sf){__W, __X, __Y, __Z};
137 }
138 
139 /* Create the vector [W X Y Z]. */
140 extern __inline __m128
141  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
142  _mm_setr_ps(float __Z, float __Y, float __X, float __W) {
143  return __extension__(__m128)(__v4sf){__Z, __Y, __X, __W};
144 }
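/* Element-order sketch (assumed example values): _mm_set_ps lists its
   arguments from the highest element down, while _mm_setr_ps lists them in
   element (memory) order, so the two calls below build the same vector:

     __m128 __v1 = _mm_set_ps (3.0f, 2.0f, 1.0f, 0.0f);
     __m128 __v2 = _mm_setr_ps (0.0f, 1.0f, 2.0f, 3.0f);
     // __v1[0] == __v2[0] == 0.0f ... __v1[3] == __v2[3] == 3.0f
*/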
145 
146 /* Store four SPFP values. The address must be 16-byte aligned. */
147 extern __inline void
148  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
149  _mm_store_ps(float *__P, __m128 __A) {
150  vec_st((__v4sf)__A, 0, (__v4sf *)__P);
151 }
152 
153 /* Store four SPFP values. The address need not be 16-byte aligned. */
154 extern __inline void
155  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
156  _mm_storeu_ps(float *__P, __m128 __A) {
157  *(__m128_u *)__P = __A;
158 }
159 
160 /* Store four SPFP values in reverse order. The address must be aligned. */
161 extern __inline void
162  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
163  _mm_storer_ps(float *__P, __m128 __A) {
164  __v4sf __tmp;
165  static const __vector unsigned char __permute_vector = {
166  0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B,
167  0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13};
168 
169  __tmp = (__m128)vec_perm(__A, __A, __permute_vector);
170 
171  _mm_store_ps(__P, __tmp);
172 }
173 
174 /* Store the lower SPFP value across four words. */
175 extern __inline void
176  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
177  _mm_store1_ps(float *__P, __m128 __A) {
178  __v4sf __va = vec_splat((__v4sf)__A, 0);
179  _mm_store_ps(__P, __va);
180 }
181 
182 extern __inline void
183  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
184  _mm_store_ps1(float *__P, __m128 __A) {
185  _mm_store1_ps(__P, __A);
186 }
187 
188 /* Create a vector with element 0 as F and the rest zero. */
189 extern __inline __m128
190  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
191  _mm_set_ss(float __F) {
192  return __extension__(__m128)(__v4sf){__F, 0.0f, 0.0f, 0.0f};
193 }
194 
195 /* Sets the low SPFP value of A from the low value of B. */
196 extern __inline __m128
197  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
198  _mm_move_ss(__m128 __A, __m128 __B) {
199  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
200 
201  return (vec_sel((__v4sf)__A, (__v4sf)__B, __mask));
202 }
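/* Usage sketch (assumed example values): only element 0 is taken from __B,
   the upper three elements are passed through from __A:

     __m128 __a = _mm_setr_ps (1.0f, 2.0f, 3.0f, 4.0f);
     __m128 __b = _mm_setr_ps (5.0f, 6.0f, 7.0f, 8.0f);
     __m128 __r = _mm_move_ss (__a, __b);   // {5.0f, 2.0f, 3.0f, 4.0f}
*/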
203 
204 /* Create a vector with element 0 as *P and the rest zero. */
205 extern __inline __m128
206  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
207  _mm_load_ss(float const *__P) {
208  return _mm_set_ss(*__P);
209 }
210 
211 /* Stores the lower SPFP value. */
212 extern __inline void
213  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
214  _mm_store_ss(float *__P, __m128 __A) {
215  *__P = ((__v4sf)__A)[0];
216 }
217 
218 /* Perform the respective operation on the lower SPFP (single-precision
219  floating-point) values of A and B; the upper three SPFP values are
220  passed through from A. */
221 
222 extern __inline __m128
223  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
224  _mm_add_ss(__m128 __A, __m128 __B) {
225 #ifdef _ARCH_PWR7
226  __m128 __a, __b, __c;
227  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
228  /* PowerISA VSX does not allow partial (for just the lower float)
229  results. So to ensure we don't generate spurious exceptions
230  (from the upper float values) we splat the lower float
231  before we do the operation. */
232  __a = vec_splat(__A, 0);
233  __b = vec_splat(__B, 0);
234  __c = __a + __b;
235  /* Then we merge the lower float result with the original upper
236  float elements from __A. */
237  return (vec_sel(__A, __c, __mask));
238 #else
239  __A[0] = __A[0] + __B[0];
240  return (__A);
241 #endif
242 }
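/* Worked example of the scalar (_ss) semantics (assumed example values):
   only element 0 is computed, the remaining elements pass through from __A.
   The splat above keeps the unused upper lanes from raising spurious
   floating-point exceptions on POWER:

     __m128 __a = _mm_setr_ps (1.0f, 2.0f, 3.0f, 4.0f);
     __m128 __b = _mm_setr_ps (10.0f, 20.0f, 30.0f, 40.0f);
     __m128 __r = _mm_add_ss (__a, __b);    // {11.0f, 2.0f, 3.0f, 4.0f}
*/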
243 
244 extern __inline __m128
245  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
246  _mm_sub_ss(__m128 __A, __m128 __B) {
247 #ifdef _ARCH_PWR7
248  __m128 __a, __b, __c;
249  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
250  /* PowerISA VSX does not allow partial (for just the lower float)
251  results. So to ensure we don't generate spurious exceptions
252  (from the upper float values) we splat the lower float
253  before we do the operation. */
254  __a = vec_splat(__A, 0);
255  __b = vec_splat(__B, 0);
256  __c = __a - __b;
257  /* Then we merge the lower float result with the original upper
258  float elements from __A. */
259  return (vec_sel(__A, __c, __mask));
260 #else
261  __A[0] = __A[0] - __B[0];
262  return (__A);
263 #endif
264 }
265 
266 extern __inline __m128
267  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
268  _mm_mul_ss(__m128 __A, __m128 __B) {
269 #ifdef _ARCH_PWR7
270  __m128 __a, __b, __c;
271  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
272  /* PowerISA VSX does not allow partial (for just the lower float)
273  results. So to ensure we don't generate spurious exceptions
274  (from the upper float values) we splat the lower float
275  before we do the operation. */
276  __a = vec_splat(__A, 0);
277  __b = vec_splat(__B, 0);
278  __c = __a * __b;
279  /* Then we merge the lower float result with the original upper
280  float elements from __A. */
281  return (vec_sel(__A, __c, __mask));
282 #else
283  __A[0] = __A[0] * __B[0];
284  return (__A);
285 #endif
286 }
287 
288 extern __inline __m128
289  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
290  _mm_div_ss(__m128 __A, __m128 __B) {
291 #ifdef _ARCH_PWR7
292  __m128 __a, __b, __c;
293  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
294  /* PowerISA VSX does not allow partial (for just the lower float)
295  results. So to ensure we don't generate spurious exceptions
296  (from the upper float values) we splat the lower float
297  before we do the operation. */
298  __a = vec_splat(__A, 0);
299  __b = vec_splat(__B, 0);
300  __c = __a / __b;
301  /* Then we merge the lower float result with the original upper
302  float elements from __A. */
303  return (vec_sel(__A, __c, __mask));
304 #else
305  __A[0] = __A[0] / __B[0];
306  return (__A);
307 #endif
308 }
309 
310 extern __inline __m128
311  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
312  _mm_sqrt_ss(__m128 __A) {
313  __m128 __a, __c;
314  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
315  /* PowerISA VSX does not allow partial (for just the lower float)
316  * results. So to ensure we don't generate spurious exceptions
317  * (from the upper float values) we splat the lower float
318  * before we do the operation. */
319  __a = vec_splat(__A, 0);
320  __c = vec_sqrt(__a);
321  /* Then we merge the lower float result with the original upper
322  * float elements from __A. */
323  return (vec_sel(__A, __c, __mask));
324 }
325 
326 /* Perform the respective operation on the four SPFP values in A and B. */
327 extern __inline __m128
328  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
329  _mm_add_ps(__m128 __A, __m128 __B) {
330  return (__m128)((__v4sf)__A + (__v4sf)__B);
331 }
332 
333 extern __inline __m128
334  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
335  _mm_sub_ps(__m128 __A, __m128 __B) {
336  return (__m128)((__v4sf)__A - (__v4sf)__B);
337 }
338 
339 extern __inline __m128
340  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
341  _mm_mul_ps(__m128 __A, __m128 __B) {
342  return (__m128)((__v4sf)__A * (__v4sf)__B);
343 }
344 
345 extern __inline __m128
346  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
347  _mm_div_ps(__m128 __A, __m128 __B) {
348  return (__m128)((__v4sf)__A / (__v4sf)__B);
349 }
350 
351 extern __inline __m128
352  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
353  _mm_sqrt_ps(__m128 __A) {
354  return (vec_sqrt((__v4sf)__A));
355 }
356 
357 extern __inline __m128
358  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
359  _mm_rcp_ps(__m128 __A) {
360  return (vec_re((__v4sf)__A));
361 }
362 
363 extern __inline __m128
364  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
365  _mm_rsqrt_ps(__m128 __A) {
366  return (vec_rsqrte(__A));
367 }
368 
369 extern __inline __m128
370  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
371  _mm_rcp_ss(__m128 __A) {
372  __m128 __a, __c;
373  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
374  /* PowerISA VSX does not allow partial (for just the lower float)
375  * results. So to ensure we don't generate spurious exceptions
376  * (from the upper float values) we splat the lower float
377  * before we do the operation. */
378  __a = vec_splat(__A, 0);
379  __c = _mm_rcp_ps(__a);
380  /* Then we merge the lower float result with the original upper
381  * float elements from __A. */
382  return (vec_sel(__A, __c, __mask));
383 }
384 
385 extern __inline __m128
386  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
387  _mm_rsqrt_ss(__m128 __A) {
388  __m128 __a, __c;
389  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
390  /* PowerISA VSX does not allow partial (for just the lower float)
391  * results. So to ensure we don't generate spurious exceptions
392  * (from the upper float values) we splat the lower float
393  * before we do the operation. */
394  __a = vec_splat(__A, 0);
395  __c = vec_rsqrte(__a);
396  /* Then we merge the lower float result with the original upper
397  * float elements from __A. */
398  return (vec_sel(__A, __c, __mask));
399 }
400 
401 extern __inline __m128
402  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
403  _mm_min_ss(__m128 __A, __m128 __B) {
404  __v4sf __a, __b, __c;
405  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
406  /* PowerISA VSX does not allow partial (for just the lower float)
407  * results. So to ensure we don't generate spurious exceptions
408  * (from the upper float values) we splat the lower float
409  * before we do the operation. */
410  __a = vec_splat((__v4sf)__A, 0);
411  __b = vec_splat((__v4sf)__B, 0);
412  __c = vec_min(__a, __b);
413  /* Then we merge the lower float result with the original upper
414  * float elements from __A. */
415  return (vec_sel((__v4sf)__A, __c, __mask));
416 }
417 
418 extern __inline __m128
419  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
420  _mm_max_ss(__m128 __A, __m128 __B) {
421  __v4sf __a, __b, __c;
422  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
423  /* PowerISA VSX does not allow partial (for just the lower float)
424  * results. So to ensure we don't generate spurious exceptions
425  * (from the upper float values) we splat the lower float
426  * before we do the operation. */
427  __a = vec_splat(__A, 0);
428  __b = vec_splat(__B, 0);
429  __c = vec_max(__a, __b);
430  /* Then we merge the lower float result with the original upper
431  * float elements from __A. */
432  return (vec_sel((__v4sf)__A, __c, __mask));
433 }
434 
435 extern __inline __m128
436  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
437  _mm_min_ps(__m128 __A, __m128 __B) {
438  __vector __bool int __m = vec_cmpgt((__v4sf)__B, (__v4sf)__A);
439  return vec_sel(__B, __A, __m);
440 }
441 
442 extern __inline __m128
443  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
444  _mm_max_ps(__m128 __A, __m128 __B) {
445  __vector __bool int __m = vec_cmpgt((__v4sf)__A, (__v4sf)__B);
446  return vec_sel(__B, __A, __m);
447 }
448 
449 /* Perform logical bit-wise operations on 128-bit values. */
450 extern __inline __m128
451  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
452  _mm_and_ps(__m128 __A, __m128 __B) {
453  return ((__m128)vec_and((__v4sf)__A, (__v4sf)__B));
454  // return __builtin_ia32_andps (__A, __B);
455 }
456 
457 extern __inline __m128
458  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
459  _mm_andnot_ps(__m128 __A, __m128 __B) {
460  return ((__m128)vec_andc((__v4sf)__B, (__v4sf)__A));
461 }
462 
463 extern __inline __m128
464  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
465  _mm_or_ps(__m128 __A, __m128 __B) {
466  return ((__m128)vec_or((__v4sf)__A, (__v4sf)__B));
467 }
468 
469 extern __inline __m128
470  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
471  _mm_xor_ps(__m128 __A, __m128 __B) {
472  return ((__m128)vec_xor((__v4sf)__A, (__v4sf)__B));
473 }
474 
475 /* Perform a comparison on the four SPFP values of A and B. For each
476  element, if the comparison is true, place a mask of all ones in the
477  result, otherwise a mask of zeros. */
478 extern __inline __m128
479  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
480  _mm_cmpeq_ps(__m128 __A, __m128 __B) {
481  return ((__m128)vec_cmpeq((__v4sf)__A, (__v4sf)__B));
482 }
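/* The all-ones/all-zeros lane masks produced by these comparisons combine
   with the bit-wise operations above for branch-free selection.  A usage
   sketch, given two __m128 values __a and __b (illustrative only, ignoring
   NaN handling):

     __m128 __m = _mm_cmpgt_ps (__a, __b);              // lanes where __a > __b
     __m128 __r = _mm_or_ps (_mm_and_ps (__m, __a),     // take __a where true
                             _mm_andnot_ps (__m, __b)); // otherwise take __b
     // __r now holds the element-wise maximum of __a and __b
*/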
483 
484 extern __inline __m128
485  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
486  _mm_cmplt_ps(__m128 __A, __m128 __B) {
487  return ((__m128)vec_cmplt((__v4sf)__A, (__v4sf)__B));
488 }
489 
490 extern __inline __m128
491  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
492  _mm_cmple_ps(__m128 __A, __m128 __B) {
493  return ((__m128)vec_cmple((__v4sf)__A, (__v4sf)__B));
494 }
495 
496 extern __inline __m128
497  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
498  _mm_cmpgt_ps(__m128 __A, __m128 __B) {
499  return ((__m128)vec_cmpgt((__v4sf)__A, (__v4sf)__B));
500 }
501 
502 extern __inline __m128
503  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
504  _mm_cmpge_ps(__m128 __A, __m128 __B) {
505  return ((__m128)vec_cmpge((__v4sf)__A, (__v4sf)__B));
506 }
507 
508 extern __inline __m128
509  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
510  _mm_cmpneq_ps(__m128 __A, __m128 __B) {
511  __v4sf __temp = (__v4sf)vec_cmpeq((__v4sf)__A, (__v4sf)__B);
512  return ((__m128)vec_nor(__temp, __temp));
513 }
514 
515 extern __inline __m128
516  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
517  _mm_cmpnlt_ps(__m128 __A, __m128 __B) {
518  return ((__m128)vec_cmpge((__v4sf)__A, (__v4sf)__B));
519 }
520 
521 extern __inline __m128
522  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
523  _mm_cmpnle_ps(__m128 __A, __m128 __B) {
524  return ((__m128)vec_cmpgt((__v4sf)__A, (__v4sf)__B));
525 }
526 
527 extern __inline __m128
528  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
529  _mm_cmpngt_ps(__m128 __A, __m128 __B) {
530  return ((__m128)vec_cmple((__v4sf)__A, (__v4sf)__B));
531 }
532 
533 extern __inline __m128
534  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
535  _mm_cmpnge_ps(__m128 __A, __m128 __B) {
536  return ((__m128)vec_cmplt((__v4sf)__A, (__v4sf)__B));
537 }
538 
539 extern __inline __m128
540  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
541  _mm_cmpord_ps(__m128 __A, __m128 __B) {
542  __vector unsigned int __a, __b;
543  __vector unsigned int __c, __d;
544  static const __vector unsigned int __float_exp_mask = {
545  0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
546 
547  __a = (__vector unsigned int)vec_abs((__v4sf)__A);
548  __b = (__vector unsigned int)vec_abs((__v4sf)__B);
549  __c = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __a);
550  __d = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __b);
551  return ((__m128)vec_and(__c, __d));
552 }
553 
554 extern __inline __m128
555  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
556  _mm_cmpunord_ps(__m128 __A, __m128 __B) {
557  __vector unsigned int __a, __b;
558  __vector unsigned int __c, __d;
559  static const __vector unsigned int __float_exp_mask = {
560  0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
561 
562  __a = (__vector unsigned int)vec_abs((__v4sf)__A);
563  __b = (__vector unsigned int)vec_abs((__v4sf)__B);
564  __c = (__vector unsigned int)vec_cmpgt(__a, __float_exp_mask);
565  __d = (__vector unsigned int)vec_cmpgt(__b, __float_exp_mask);
566  return ((__m128)vec_or(__c, __d));
567 }
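/* How the ordered/unordered tests above work: a single-precision NaN has
   all exponent bits set and a non-zero mantissa, so the bit pattern of its
   absolute value compares greater than 0x7f800000 as an unsigned integer.
   _mm_cmpunord_ps therefore flags a lane when either input is a NaN, and
   _mm_cmpord_ps uses the complementary comparison, without raising any
   floating-point exceptions. */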
568 
569 /* Perform a comparison on the lower SPFP values of A and B. If the
570  comparison is true, place a mask of all ones in the result, otherwise a
571  mask of zeros. The upper three SPFP values are passed through from A. */
572 extern __inline __m128
573  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
574  _mm_cmpeq_ss(__m128 __A, __m128 __B) {
575  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
576  __v4sf __a, __b, __c;
577  /* PowerISA VMX does not allow partial (for just element 0)
578  * results. So to ensure we don't generate spurious exceptions
579  * (from the upper elements) we splat the lower float
580  * before we do the operation. */
581  __a = vec_splat((__v4sf)__A, 0);
582  __b = vec_splat((__v4sf)__B, 0);
583  __c = (__v4sf)vec_cmpeq(__a, __b);
584  /* Then we merge the lower float result with the original upper
585  * float elements from __A. */
586  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
587 }
588 
589 extern __inline __m128
590  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
591  _mm_cmplt_ss(__m128 __A, __m128 __B) {
592  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
593  __v4sf __a, __b, __c;
594  /* PowerISA VMX does not allow partial (for just element 0)
595  * results. So to ensure we don't generate spurious exceptions
596  * (from the upper elements) we splat the lower float
597  * before we do the operation. */
598  __a = vec_splat((__v4sf)__A, 0);
599  __b = vec_splat((__v4sf)__B, 0);
600  __c = (__v4sf)vec_cmplt(__a, __b);
601  /* Then we merge the lower float result with the original upper
602  * float elements from __A. */
603  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
604 }
605 
606 extern __inline __m128
607  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
608  _mm_cmple_ss(__m128 __A, __m128 __B) {
609  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
610  __v4sf __a, __b, __c;
611  /* PowerISA VMX does not allow partial (for just element 0)
612  * results. So to ensure we don't generate spurious exceptions
613  * (from the upper elements) we splat the lower float
614  * before we do the operation. */
615  __a = vec_splat((__v4sf)__A, 0);
616  __b = vec_splat((__v4sf)__B, 0);
617  __c = (__v4sf)vec_cmple(__a, __b);
618  /* Then we merge the lower float result with the original upper
619  * float elements from __A. */
620  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
621 }
622 
623 extern __inline __m128
624  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
625  _mm_cmpgt_ss(__m128 __A, __m128 __B) {
626  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
627  __v4sf __a, __b, __c;
628  /* PowerISA VMX does not allow partial (for just element 0)
629  * results. So to ensure we don't generate spurious exceptions
630  * (from the upper elements) we splat the lower float
631  * before we do the operation. */
632  __a = vec_splat((__v4sf)__A, 0);
633  __b = vec_splat((__v4sf)__B, 0);
634  __c = (__v4sf)vec_cmpgt(__a, __b);
635  /* Then we merge the lower float result with the original upper
636  * float elements from __A. */
637  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
638 }
639 
640 extern __inline __m128
641  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
642  _mm_cmpge_ss(__m128 __A, __m128 __B) {
643  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
644  __v4sf __a, __b, __c;
645  /* PowerISA VMX does not allow partial (for just element 0)
646  * results. So to ensure we don't generate spurious exceptions
647  * (from the upper elements) we splat the lower float
648  * before we do the operation. */
649  __a = vec_splat((__v4sf)__A, 0);
650  __b = vec_splat((__v4sf)__B, 0);
651  __c = (__v4sf)vec_cmpge(__a, __b);
652  /* Then we merge the lower float result with the original upper
653  * float elements from __A. */
654  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
655 }
656 
657 extern __inline __m128
658  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
659  _mm_cmpneq_ss(__m128 __A, __m128 __B) {
660  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
661  __v4sf __a, __b, __c;
662  /* PowerISA VMX does not allow partial (for just element 0)
663  * results. So to ensure we don't generate spurious exceptions
664  * (from the upper elements) we splat the lower float
665  * before we do the operation. */
666  __a = vec_splat((__v4sf)__A, 0);
667  __b = vec_splat((__v4sf)__B, 0);
668  __c = (__v4sf)vec_cmpeq(__a, __b);
669  __c = vec_nor(__c, __c);
670  /* Then we merge the lower float result with the original upper
671  * float elements from __A. */
672  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
673 }
674 
675 extern __inline __m128
676  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
677  _mm_cmpnlt_ss(__m128 __A, __m128 __B) {
678  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
679  __v4sf __a, __b, __c;
680  /* PowerISA VMX does not allow partial (for just element 0)
681  * results. So to ensure we don't generate spurious exceptions
682  * (from the upper elements) we splat the lower float
683  * before we do the operation. */
684  __a = vec_splat((__v4sf)__A, 0);
685  __b = vec_splat((__v4sf)__B, 0);
686  __c = (__v4sf)vec_cmpge(__a, __b);
687  /* Then we merge the lower float result with the original upper
688  * float elements from __A. */
689  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
690 }
691 
692 extern __inline __m128
693  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
694  _mm_cmpnle_ss(__m128 __A, __m128 __B) {
695  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
696  __v4sf __a, __b, __c;
697  /* PowerISA VMX does not allow partial (for just element 0)
698  * results. So to ensure we don't generate spurious exceptions
699  * (from the upper elements) we splat the lower float
700  * before we do the operation. */
701  __a = vec_splat((__v4sf)__A, 0);
702  __b = vec_splat((__v4sf)__B, 0);
703  __c = (__v4sf)vec_cmpgt(__a, __b);
704  /* Then we merge the lower float result with the original upper
705  * float elements from __A. */
706  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
707 }
708 
709 extern __inline __m128
710  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
711  _mm_cmpngt_ss(__m128 __A, __m128 __B) {
712  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
713  __v4sf __a, __b, __c;
714  /* PowerISA VMX does not allow partial (for just element 0)
715  * results. So to ensure we don't generate spurious exceptions
716  * (from the upper elements) we splat the lower float
717  * before we do the operation. */
718  __a = vec_splat((__v4sf)__A, 0);
719  __b = vec_splat((__v4sf)__B, 0);
720  __c = (__v4sf)vec_cmple(__a, __b);
721  /* Then we merge the lower float result with the original upper
722  * float elements from __A. */
723  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
724 }
725 
726 extern __inline __m128
727  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
728  _mm_cmpnge_ss(__m128 __A, __m128 __B) {
729  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
730  __v4sf __a, __b, __c;
731  /* PowerISA VMX does not allow partial (for just element 0)
732  * results. So to ensure we don't generate spurious exceptions
733  * (from the upper elements) we splat the lower float
734  * before we do the operation. */
735  __a = vec_splat((__v4sf)__A, 0);
736  __b = vec_splat((__v4sf)__B, 0);
737  __c = (__v4sf)vec_cmplt(__a, __b);
738  /* Then we merge the lower float result with the original upper
739  * float elements from __A. */
740  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
741 }
742 
743 extern __inline __m128
744  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
745  _mm_cmpord_ss(__m128 __A, __m128 __B) {
746  __vector unsigned int __a, __b;
747  __vector unsigned int __c, __d;
748  static const __vector unsigned int __float_exp_mask = {
749  0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
750  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
751 
752  __a = (__vector unsigned int)vec_abs((__v4sf)__A);
753  __b = (__vector unsigned int)vec_abs((__v4sf)__B);
754  __c = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __a);
755  __d = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __b);
756  __c = vec_and(__c, __d);
757  /* Then we merge the lower float result with the original upper
758  * float elements from __A. */
759  return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask));
760 }
761 
762 extern __inline __m128
763  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
764  _mm_cmpunord_ss(__m128 __A, __m128 __B) {
765  __vector unsigned int __a, __b;
766  __vector unsigned int __c, __d;
767  static const __vector unsigned int __float_exp_mask = {
768  0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
769  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
770 
771  __a = (__vector unsigned int)vec_abs((__v4sf)__A);
772  __b = (__vector unsigned int)vec_abs((__v4sf)__B);
773  __c = (__vector unsigned int)vec_cmpgt(__a, __float_exp_mask);
774  __d = (__vector unsigned int)vec_cmpgt(__b, __float_exp_mask);
775  __c = vec_or(__c, __d);
776  /* Then we merge the lower float result with the original upper
777  * float elements from __A. */
778  return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask));
779 }
780 
781 /* Compare the lower SPFP values of A and B and return 1 if true
782  and 0 if false. */
783 extern __inline int
784  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
785  _mm_comieq_ss(__m128 __A, __m128 __B) {
786  return (__A[0] == __B[0]);
787 }
788 
789 extern __inline int
790  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
791  _mm_comilt_ss(__m128 __A, __m128 __B) {
792  return (__A[0] < __B[0]);
793 }
794 
795 extern __inline int
796  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
797  _mm_comile_ss(__m128 __A, __m128 __B) {
798  return (__A[0] <= __B[0]);
799 }
800 
801 extern __inline int
802  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
803  _mm_comigt_ss(__m128 __A, __m128 __B) {
804  return (__A[0] > __B[0]);
805 }
806 
807 extern __inline int
808  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
809  _mm_comige_ss(__m128 __A, __m128 __B) {
810  return (__A[0] >= __B[0]);
811 }
812 
813 extern __inline int
814  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
815  _mm_comineq_ss(__m128 __A, __m128 __B) {
816  return (__A[0] != __B[0]);
817 }
818 
819 /* FIXME
820  * The _mm_ucomi??_ss implementations below are exactly the same as
821  * _mm_comi??_ss because GCC for PowerPC only generates unordered
822  * compares (scalar and vector).
823  * Technically _mm_comieq_ss et al. should use the ordered
824  * compare and signal on QNaNs.
825  * The _mm_ucomieq_ss et al. should be OK as is.
826  */
827 extern __inline int
828  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
829  _mm_ucomieq_ss(__m128 __A, __m128 __B) {
830  return (__A[0] == __B[0]);
831 }
832 
833 extern __inline int
834  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
835  _mm_ucomilt_ss(__m128 __A, __m128 __B) {
836  return (__A[0] < __B[0]);
837 }
838 
839 extern __inline int
840  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
841  _mm_ucomile_ss(__m128 __A, __m128 __B) {
842  return (__A[0] <= __B[0]);
843 }
844 
845 extern __inline int
846  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
847  _mm_ucomigt_ss(__m128 __A, __m128 __B) {
848  return (__A[0] > __B[0]);
849 }
850 
851 extern __inline int
852  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
853  _mm_ucomige_ss(__m128 __A, __m128 __B) {
854  return (__A[0] >= __B[0]);
855 }
856 
857 extern __inline int
858  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
859  _mm_ucomineq_ss(__m128 __A, __m128 __B) {
860  return (__A[0] != __B[0]);
861 }
862 
863 extern __inline float
864  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
865  _mm_cvtss_f32(__m128 __A) {
866  return ((__v4sf)__A)[0];
867 }
868 
869 /* Convert the lower SPFP value to a 32-bit integer according to the current
870  rounding mode. */
871 extern __inline int
872  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
873  _mm_cvtss_si32(__m128 __A) {
874  int __res;
875 #ifdef _ARCH_PWR8
876  double __dtmp;
877  __asm__(
878 #ifdef __LITTLE_ENDIAN__
879  "xxsldwi %x0,%x0,%x0,3;\n"
880 #endif
881  "xscvspdp %x2,%x0;\n"
882  "fctiw %2,%2;\n"
883  "mfvsrd %1,%x2;\n"
884  : "+wa"(__A), "=r"(__res), "=f"(__dtmp)
885  :);
886 #else
887  __res = __builtin_rint(__A[0]);
888 #endif
889  return __res;
890 }
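/* Rounding-mode sketch (assumed example values): the conversion honours the
   current rounding mode, which can be selected portably through <fenv.h>:

     fesetround (FE_TONEAREST);
     int __i = _mm_cvtss_si32 (_mm_set_ss (2.5f));   // 2 (round-to-nearest-even)
     fesetround (FE_UPWARD);
     __i = _mm_cvtss_si32 (_mm_set_ss (2.5f));       // 3
*/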
891 
892 extern __inline int
893  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
894  _mm_cvt_ss2si(__m128 __A) {
895  return _mm_cvtss_si32(__A);
896 }
897 
898 /* Convert the lower SPFP value to a 64-bit integer according to the
899  current rounding mode. */
900 
901 /* Intel intrinsic. */
902 extern __inline long long
903  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
904  _mm_cvtss_si64(__m128 __A) {
905  long long __res;
906 #if defined(_ARCH_PWR8) && defined(__powerpc64__)
907  double __dtmp;
908  __asm__(
909 #ifdef __LITTLE_ENDIAN__
910  "xxsldwi %x0,%x0,%x0,3;\n"
911 #endif
912  "xscvspdp %x2,%x0;\n"
913  "fctid %2,%2;\n"
914  "mfvsrd %1,%x2;\n"
915  : "+wa"(__A), "=r"(__res), "=f"(__dtmp)
916  :);
917 #else
918  __res = __builtin_llrint(__A[0]);
919 #endif
920  return __res;
921 }
922 
923 /* Microsoft intrinsic. */
924 extern __inline long long
925  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
926  _mm_cvtss_si64x(__m128 __A) {
927  return _mm_cvtss_si64((__v4sf)__A);
928 }
929 
930 /* Constants for use with _mm_prefetch. */
931 enum _mm_hint {
932  /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set. */
933  _MM_HINT_ET0 = 7,
934  _MM_HINT_ET1 = 6,
935  _MM_HINT_T0 = 3,
936  _MM_HINT_T1 = 2,
937  _MM_HINT_T2 = 1,
938  _MM_HINT_NTA = 0
939 };
940 
941 /* Loads one cache line from address P to a location "closer" to the
942  processor. The selector I specifies the type of prefetch operation. */
943 extern __inline void
944  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
945  _mm_prefetch(const void *__P, enum _mm_hint __I) {
946  /* Current PowerPC ignores the hint parameter. */
947  __builtin_prefetch(__P);
948 }
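/* Usage sketch (hypothetical __data, __n and __sum, illustrative only): the
   hint constants are accepted for source compatibility even though, as noted
   above, the hint itself is currently ignored on PowerPC:

     for (long __i = 0; __i < __n; __i++) {
       _mm_prefetch ((const char *)&__data[__i + 16], _MM_HINT_T0);
       __sum += __data[__i];
     }
*/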
949 
950 /* Convert the two lower SPFP values to 32-bit integers according to the
951  current rounding mode. Return the integers in packed form. */
952 extern __inline __m64
953  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
954  _mm_cvtps_pi32(__m128 __A) {
956  __v4sf __temp, __rounded;
957  __vector unsigned long long __result;
958 
959  /* Splat two lower SPFP values to both halves. */
960  __temp = (__v4sf)vec_splat((__vector long long)__A, 0);
961  __rounded = vec_rint(__temp);
962  __result = (__vector unsigned long long)vec_cts(__rounded, 0);
963 
964  return (__m64)((__vector long long)__result)[0];
965 }
966 
967 extern __inline __m64
968  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
969  _mm_cvt_ps2pi(__m128 __A) {
970  return _mm_cvtps_pi32(__A);
971 }
972 
973 /* Truncate the lower SPFP value to a 32-bit integer. */
974 extern __inline int
975  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
976  _mm_cvttss_si32(__m128 __A) {
977  /* Extract the lower float element. */
978  float __temp = __A[0];
979  /* truncate to 32-bit integer and return. */
980  return __temp;
981 }
982 
983 extern __inline int
984  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
985  _mm_cvtt_ss2si(__m128 __A) {
986  return _mm_cvttss_si32(__A);
987 }
988 
989 /* Intel intrinsic. */
990 extern __inline long long
991  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
992  _mm_cvttss_si64(__m128 __A) {
993  /* Extract the lower float element. */
994  float __temp = __A[0];
995  /* Truncate to a 64-bit integer and return. */
996  return __temp;
997 }
998 
999 /* Microsoft intrinsic. */
1000 extern __inline long long
1001  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1002  _mm_cvttss_si64x(__m128 __A) {
1003  /* Extract the lower float element. */
1004  float __temp = __A[0];
1005  /* Truncate to a 64-bit integer and return. */
1006  return __temp;
1007 }
1008 
1009 /* Truncate the two lower SPFP values to 32-bit integers. Return the
1010  integers in packed form. */
1011 extern __inline __m64
1012  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1013  _mm_cvttps_pi32(__m128 __A) {
1014  __v4sf __temp;
1015  __vector unsigned long long __result;
1016 
1017  /* Splat two lower SPFP values to both halves. */
1018  __temp = (__v4sf)vec_splat((__vector long long)__A, 0);
1019  __result = (__vector unsigned long long)vec_cts(__temp, 0);
1020 
1021  return (__m64)((__vector long long)__result)[0];
1022 }
1023 
1024 extern __inline __m64
1025  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1026  _mm_cvtt_ps2pi(__m128 __A) {
1027  return _mm_cvttps_pi32(__A);
1028 }
1029 
1030 /* Convert B to a SPFP value and insert it as element zero in A. */
1031 extern __inline __m128
1032  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1033  _mm_cvtsi32_ss(__m128 __A, int __B) {
1034  float __temp = __B;
1035  __A[0] = __temp;
1036 
1037  return __A;
1038 }
1039 
1040 extern __inline __m128
1041  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1042  _mm_cvt_si2ss(__m128 __A, int __B) {
1043  return _mm_cvtsi32_ss(__A, __B);
1044 }
1045 
1046 /* Convert B to a SPFP value and insert it as element zero in A. */
1047 /* Intel intrinsic. */
1048 extern __inline __m128
1049  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1050  _mm_cvtsi64_ss(__m128 __A, long long __B) {
1051  float __temp = __B;
1052  __A[0] = __temp;
1053 
1054  return __A;
1055 }
1056 
1057 /* Microsoft intrinsic. */
1058 extern __inline __m128
1059  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1060  _mm_cvtsi64x_ss(__m128 __A, long long __B) {
1061  return _mm_cvtsi64_ss(__A, __B);
1062 }
1063 
1064 /* Convert the two 32-bit values in B to SPFP form and insert them
1065  as the two lower elements in A. */
1066 extern __inline __m128
1067  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1068  _mm_cvtpi32_ps(__m128 __A, __m64 __B) {
1069  __vector signed int __vm1;
1070  __vector float __vf1;
1071 
1072  __vm1 = (__vector signed int)(__vector unsigned long long){__B, __B};
1073  __vf1 = (__vector float)vec_ctf(__vm1, 0);
1074 
1075  return ((__m128)(__vector unsigned long long){
1076  ((__vector unsigned long long)__vf1)[0],
1077  ((__vector unsigned long long)__A)[1]});
1078 }
1079 
1080 extern __inline __m128
1081  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1082  _mm_cvt_pi2ps(__m128 __A, __m64 __B) {
1083  return _mm_cvtpi32_ps(__A, __B);
1084 }
1085 
1086 /* Convert the four signed 16-bit values in A to SPFP form. */
1087 extern __inline __m128
1088  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1089  _mm_cvtpi16_ps(__m64 __A) {
1090  __vector signed short __vs8;
1091  __vector signed int __vi4;
1092  __vector float __vf1;
1093 
1094  __vs8 = (__vector signed short)(__vector unsigned long long){__A, __A};
1095  __vi4 = vec_vupklsh(__vs8);
1096  __vf1 = (__vector float)vec_ctf(__vi4, 0);
1097 
1098  return (__m128)__vf1;
1099 }
1100 
1101 /* Convert the four unsigned 16-bit values in A to SPFP form. */
1102 extern __inline __m128
1103  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1104  _mm_cvtpu16_ps(__m64 __A) {
1105  const __vector unsigned short __zero = {0, 0, 0, 0, 0, 0, 0, 0};
1106  __vector unsigned short __vs8;
1107  __vector unsigned int __vi4;
1108  __vector float __vf1;
1109 
1110  __vs8 = (__vector unsigned short)(__vector unsigned long long){__A, __A};
1111  __vi4 = (__vector unsigned int)vec_mergel
1112 #ifdef __LITTLE_ENDIAN__
1113  (__vs8, __zero);
1114 #else
1115  (__zero, __vs8);
1116 #endif
1117  __vf1 = (__vector float)vec_ctf(__vi4, 0);
1118 
1119  return (__m128)__vf1;
1120 }
1121 
1122 /* Convert the low four signed 8-bit values in A to SPFP form. */
1123 extern __inline __m128
1124  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1125  _mm_cvtpi8_ps(__m64 __A) {
1126  __vector signed char __vc16;
1127  __vector signed short __vs8;
1128  __vector signed int __vi4;
1129  __vector float __vf1;
1130 
1131  __vc16 = (__vector signed char)(__vector unsigned long long){__A, __A};
1132  __vs8 = vec_vupkhsb(__vc16);
1133  __vi4 = vec_vupkhsh(__vs8);
1134  __vf1 = (__vector float)vec_ctf(__vi4, 0);
1135 
1136  return (__m128)__vf1;
1137 }
1138 
1139 /* Convert the low four unsigned 8-bit values in A to SPFP form. */
1140 extern __inline __m128
1141  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1142 
1143  _mm_cvtpu8_ps(__m64 __A) {
1144  const __vector unsigned char __zero = {0, 0, 0, 0, 0, 0, 0, 0};
1145  __vector unsigned char __vc16;
1146  __vector unsigned short __vs8;
1147  __vector unsigned int __vi4;
1148  __vector float __vf1;
1149 
1150  __vc16 = (__vector unsigned char)(__vector unsigned long long){__A, __A};
1151 #ifdef __LITTLE_ENDIAN__
1152  __vs8 = (__vector unsigned short)vec_mergel(__vc16, __zero);
1153  __vi4 =
1154  (__vector unsigned int)vec_mergeh(__vs8, (__vector unsigned short)__zero);
1155 #else
1156  __vs8 = (__vector unsigned short)vec_mergel(__zero, __vc16);
1157  __vi4 =
1158  (__vector unsigned int)vec_mergeh((__vector unsigned short)__zero, __vs8);
1159 #endif
1160  __vf1 = (__vector float)vec_ctf(__vi4, 0);
1161 
1162  return (__m128)__vf1;
1163 }
1164 
1165 /* Convert the four signed 32-bit values in A and B to SPFP form. */
1166 extern __inline __m128
1167  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1168  _mm_cvtpi32x2_ps(__m64 __A, __m64 __B) {
1169  __vector signed int __vi4;
1170  __vector float __vf4;
1171 
1172  __vi4 = (__vector signed int)(__vector unsigned long long){__A, __B};
1173  __vf4 = (__vector float)vec_ctf(__vi4, 0);
1174  return (__m128)__vf4;
1175 }
1176 
1177 /* Convert the four SPFP values in A to four signed 16-bit integers. */
1178 extern __inline __m64
1179  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1180  _mm_cvtps_pi16(__m128 __A) {
1181  __v4sf __rounded;
1182  __vector signed int __temp;
1183  __vector unsigned long long __result;
1184 
1185  __rounded = vec_rint(__A);
1186  __temp = vec_cts(__rounded, 0);
1187  __result = (__vector unsigned long long)vec_pack(__temp, __temp);
1188 
1189  return (__m64)((__vector long long)__result)[0];
1190 }
1191 
1192 /* Convert the four SPFP values in A to four signed 8-bit integers. */
1193 extern __inline __m64
1194  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1195  _mm_cvtps_pi8(__m128 __A) {
1196  __v4sf __rounded;
1197  __vector signed int __tmp_i;
1198  static const __vector signed int __zero = {0, 0, 0, 0};
1199  __vector signed short __tmp_s;
1200  __vector signed char __res_v;
1201 
1202  __rounded = vec_rint(__A);
1203  __tmp_i = vec_cts(__rounded, 0);
1204  __tmp_s = vec_pack(__tmp_i, __zero);
1205  __res_v = vec_pack(__tmp_s, __tmp_s);
1206  return (__m64)((__vector long long)__res_v)[0];
1207 }
1208 
1209 /* Selects four specific SPFP values from A and B based on MASK. */
1210 extern __inline __m128
1211  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1212 
1213  _mm_shuffle_ps(__m128 __A, __m128 __B, int const __mask) {
1214  unsigned long __element_selector_10 = __mask & 0x03;
1215  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
1216  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
1217  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
1218  static const unsigned int __permute_selectors[4] = {
1219 #ifdef __LITTLE_ENDIAN__
1220  0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
1221 #else
1222  0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
1223 #endif
1224  };
1225  __vector unsigned int __t;
1226 
1227  __t[0] = __permute_selectors[__element_selector_10];
1228  __t[1] = __permute_selectors[__element_selector_32];
1229  __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
1230  __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
1231  return vec_perm((__v4sf)__A, (__v4sf)__B, (__vector unsigned char)__t);
1232 }
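/* Worked example (assumed example values): selector bits 1:0 and 3:2 pick
   elements from __A, bits 5:4 and 7:6 pick elements from __B:

     __m128 __a = _mm_setr_ps (1.0f, 2.0f, 3.0f, 4.0f);
     __m128 __b = _mm_setr_ps (5.0f, 6.0f, 7.0f, 8.0f);
     __m128 __r = _mm_shuffle_ps (__a, __b, _MM_SHUFFLE (3, 2, 1, 0));
     // __r == {1.0f, 2.0f, 7.0f, 8.0f}
*/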
1233 
1234 /* Selects and interleaves the upper two SPFP values from A and B. */
1235 extern __inline __m128
1236  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1237  _mm_unpackhi_ps(__m128 __A, __m128 __B) {
1238  return (__m128)vec_vmrglw((__v4sf)__A, (__v4sf)__B);
1239 }
1240 
1241 /* Selects and interleaves the lower two SPFP values from A and B. */
1242 extern __inline __m128
1243  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1244  _mm_unpacklo_ps(__m128 __A, __m128 __B) {
1245  return (__m128)vec_vmrghw((__v4sf)__A, (__v4sf)__B);
1246 }
1247 
1248 /* Sets the upper two SPFP values with 64-bits of data loaded from P;
1249  the lower two values are passed through from A. */
1250 extern __inline __m128
1251  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1252  _mm_loadh_pi(__m128 __A, __m64 const *__P) {
1253  __vector unsigned long long __a = (__vector unsigned long long)__A;
1254  __vector unsigned long long __p = vec_splats(*__P);
1255  __a[1] = __p[1];
1256 
1257  return (__m128)__a;
1258 }
1259 
1260 /* Stores the upper two SPFP values of A into P. */
1261 extern __inline void
1262  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1263  _mm_storeh_pi(__m64 *__P, __m128 __A) {
1264  __vector unsigned long long __a = (__vector unsigned long long)__A;
1265 
1266  *__P = __a[1];
1267 }
1268 
1269 /* Moves the upper two values of B into the lower two values of A. */
1270 extern __inline __m128
1271  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1272  _mm_movehl_ps(__m128 __A, __m128 __B) {
1273  return (__m128)vec_mergel((__vector unsigned long long)__B,
1274  (__vector unsigned long long)__A);
1275 }
1276 
1277 /* Moves the lower two values of B into the upper two values of A. */
1278 extern __inline __m128
1279  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1280  _mm_movelh_ps(__m128 __A, __m128 __B) {
1281  return (__m128)vec_mergeh((__vector unsigned long long)__A,
1282  (__vector unsigned long long)__B);
1283 }
1284 
1285 /* Sets the lower two SPFP values with 64-bits of data loaded from P;
1286  the upper two values are passed through from A. */
1287 extern __inline __m128
1288  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1289  _mm_loadl_pi(__m128 __A, __m64 const *__P) {
1290  __vector unsigned long long __a = (__vector unsigned long long)__A;
1291  __vector unsigned long long __p = vec_splats(*__P);
1292  __a[0] = __p[0];
1293 
1294  return (__m128)__a;
1295 }
1296 
1297 /* Stores the lower two SPFP values of A into P. */
1298 extern __inline void
1299  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1300  _mm_storel_pi(__m64 *__P, __m128 __A) {
1301  __vector unsigned long long __a = (__vector unsigned long long)__A;
1302 
1303  *__P = __a[0];
1304 }
1305 
1306 #ifdef _ARCH_PWR8
1307 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1308 
1309 /* Creates a 4-bit mask from the most significant bits of the SPFP values. */
1310 extern __inline int
1311  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1312  _mm_movemask_ps(__m128 __A) {
1313 #ifdef _ARCH_PWR10
1314  return vec_extractm((__vector unsigned int)__A);
1315 #else
1316  __vector unsigned long long __result;
1317  static const __vector unsigned int __perm_mask = {
1318 #ifdef __LITTLE_ENDIAN__
1319  0x00204060, 0x80808080, 0x80808080, 0x80808080
1320 #else
1321  0x80808080, 0x80808080, 0x80808080, 0x00204060
1322 #endif
1323  };
1324 
1325  __result = ((__vector unsigned long long)vec_vbpermq(
1326  (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
1327 
1328 #ifdef __LITTLE_ENDIAN__
1329  return __result[1];
1330 #else
1331  return __result[0];
1332 #endif
1333 #endif /* !_ARCH_PWR10 */
1334 }
1335 #endif /* _ARCH_PWR8 */
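/* Worked example for _mm_movemask_ps (assumed example values): bit i of the
   result is the sign bit of element i.  On the non-POWER10 path the
   vec_vbpermq permute control picks the most-significant (sign) bit of each
   32-bit word:

     __m128 __v = _mm_setr_ps (-1.0f, 2.0f, -3.0f, 4.0f);
     int __m = _mm_movemask_ps (__v);   // 0x5 (bits 0 and 2 set)
*/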
1336 
1337 /* Create a vector with all four elements equal to *P. */
1338 extern __inline __m128
1339  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1340  _mm_load1_ps(float const *__P) {
1341  return _mm_set1_ps(*__P);
1342 }
1343 
1344 extern __inline __m128
1345  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1346  _mm_load_ps1(float const *__P) {
1347  return _mm_load1_ps(__P);
1348 }
1349 
1350 /* Extracts one of the four words of A. The selector N must be immediate. */
1351 extern __inline int
1352  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1353  _mm_extract_pi16(__m64 const __A, int const __N) {
1354  unsigned int __shiftr = __N & 3;
1355 #ifdef __BIG_ENDIAN__
1356  __shiftr = 3 - __shiftr;
1357 #endif
1358 
1359  return ((__A >> (__shiftr * 16)) & 0xffff);
1360 }
1361 
1362 extern __inline int
1363  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1364  _m_pextrw(__m64 const __A, int const __N) {
1365  return _mm_extract_pi16(__A, __N);
1366 }
1367 
1368 /* Inserts word D into one of four words of A. The selector N must be
1369  immediate. */
1370 extern __inline __m64
1371  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1372  _mm_insert_pi16(__m64 const __A, int const __D, int const __N) {
1373  const int __shiftl = (__N & 3) * 16;
1374  const __m64 __shiftD = (const __m64)__D << __shiftl;
1375  const __m64 __mask = 0xffffUL << __shiftl;
1376  __m64 __result = (__A & (~__mask)) | (__shiftD & __mask);
1377 
1378  return __result;
1379 }
1380 
1381 extern __inline __m64
1382  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1383  _m_pinsrw(__m64 const __A, int const __D, int const __N) {
1384  return _mm_insert_pi16(__A, __D, __N);
1385 }
1386 
1387 /* Compute the element-wise maximum of signed 16-bit values. */
1388 extern __inline __m64
1389  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1390 
1391  _mm_max_pi16(__m64 __A, __m64 __B) {
1392 #if _ARCH_PWR8
1393  __vector signed short __a, __b, __r;
1394  __vector __bool short __c;
1395 
1396  __a = (__vector signed short)vec_splats(__A);
1397  __b = (__vector signed short)vec_splats(__B);
1398  __c = (__vector __bool short)vec_cmpgt(__a, __b);
1399  __r = vec_sel(__b, __a, __c);
1400  return (__m64)((__vector long long)__r)[0];
1401 #else
1402  __m64_union __m1, __m2, __res;
1403 
1404  __m1.as_m64 = __A;
1405  __m2.as_m64 = __B;
1406 
1407  __res.as_short[0] = (__m1.as_short[0] > __m2.as_short[0]) ? __m1.as_short[0]
1408  : __m2.as_short[0];
1409  __res.as_short[1] = (__m1.as_short[1] > __m2.as_short[1]) ? __m1.as_short[1]
1410  : __m2.as_short[1];
1411  __res.as_short[2] = (__m1.as_short[2] > __m2.as_short[2]) ? __m1.as_short[2]
1412  : __m2.as_short[2];
1413  __res.as_short[3] = (__m1.as_short[3] > __m2.as_short[3]) ? __m1.as_short[3]
1414  : __m2.as_short[3];
1415 
1416  return (__m64)__res.as_m64;
1417 #endif
1418 }
1419 
1420 extern __inline __m64
1421  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1422  _m_pmaxsw(__m64 __A, __m64 __B) {
1423  return _mm_max_pi16(__A, __B);
1424 }
1425 
1426 /* Compute the element-wise maximum of unsigned 8-bit values. */
1427 extern __inline __m64
1428  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1429  _mm_max_pu8(__m64 __A, __m64 __B) {
1430 #if _ARCH_PWR8
1431  __vector unsigned char __a, __b, __r;
1432  __vector __bool char __c;
1433 
1434  __a = (__vector unsigned char)vec_splats(__A);
1435  __b = (__vector unsigned char)vec_splats(__B);
1436  __c = (__vector __bool char)vec_cmpgt(__a, __b);
1437  __r = vec_sel(__b, __a, __c);
1438  return (__m64)((__vector long long)__r)[0];
1439 #else
1440  __m64_union __m1, __m2, __res;
1441  long __i;
1442 
1443  __m1.as_m64 = __A;
1444  __m2.as_m64 = __B;
1445 
1446  for (__i = 0; __i < 8; __i++)
1447  __res.as_char[__i] =
1448  ((unsigned char)__m1.as_char[__i] > (unsigned char)__m2.as_char[__i])
1449  ? __m1.as_char[__i]
1450  : __m2.as_char[__i];
1451 
1452  return (__m64)__res.as_m64;
1453 #endif
1454 }
1455 
1456 extern __inline __m64
1457  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1458  _m_pmaxub(__m64 __A, __m64 __B) {
1459  return _mm_max_pu8(__A, __B);
1460 }
1461 
1462 /* Compute the element-wise minimum of signed 16-bit values. */
1463 extern __inline __m64
1464  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1465  _mm_min_pi16(__m64 __A, __m64 __B) {
1466 #if _ARCH_PWR8
1467  __vector signed short __a, __b, __r;
1468  __vector __bool short __c;
1469 
1470  __a = (__vector signed short)vec_splats(__A);
1471  __b = (__vector signed short)vec_splats(__B);
1472  __c = (__vector __bool short)vec_cmplt(__a, __b);
1473  __r = vec_sel(__b, __a, __c);
1474  return (__m64)((__vector long long)__r)[0];
1475 #else
1476  __m64_union __m1, __m2, __res;
1477 
1478  __m1.as_m64 = __A;
1479  __m2.as_m64 = __B;
1480 
1481  __res.as_short[0] = (__m1.as_short[0] < __m2.as_short[0]) ? __m1.as_short[0]
1482  : __m2.as_short[0];
1483  __res.as_short[1] = (__m1.as_short[1] < __m2.as_short[1]) ? __m1.as_short[1]
1484  : __m2.as_short[1];
1485  __res.as_short[2] = (__m1.as_short[2] < __m2.as_short[2]) ? __m1.as_short[2]
1486  : __m2.as_short[2];
1487  __res.as_short[3] = (__m1.as_short[3] < __m2.as_short[3]) ? __m1.as_short[3]
1488  : __m2.as_short[3];
1489 
1490  return (__m64)__res.as_m64;
1491 #endif
1492 }
1493 
1494 extern __inline __m64
1495  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1496  _m_pminsw(__m64 __A, __m64 __B) {
1497  return _mm_min_pi16(__A, __B);
1498 }
1499 
1500 /* Compute the element-wise minimum of unsigned 8-bit values. */
1501 extern __inline __m64
1502  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1503  _mm_min_pu8(__m64 __A, __m64 __B) {
1504 #if _ARCH_PWR8
1505  __vector unsigned char __a, __b, __r;
1506  __vector __bool char __c;
1507 
1508  __a = (__vector unsigned char)vec_splats(__A);
1509  __b = (__vector unsigned char)vec_splats(__B);
1510  __c = (__vector __bool char)vec_cmplt(__a, __b);
1511  __r = vec_sel(__b, __a, __c);
1512  return (__m64)((__vector long long)__r)[0];
1513 #else
1514  __m64_union __m1, __m2, __res;
1515  long __i;
1516 
1517  __m1.as_m64 = __A;
1518  __m2.as_m64 = __B;
1519 
1520  for (__i = 0; __i < 8; __i++)
1521  __res.as_char[__i] =
1522  ((unsigned char)__m1.as_char[__i] < (unsigned char)__m2.as_char[__i])
1523  ? __m1.as_char[__i]
1524  : __m2.as_char[__i];
1525 
1526  return (__m64)__res.as_m64;
1527 #endif
1528 }
1529 
1530 extern __inline __m64
1531  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1532  _m_pminub(__m64 __A, __m64 __B) {
1533  return _mm_min_pu8(__A, __B);
1534 }
1535 
1536 /* Create an 8-bit mask of the signs of 8-bit values. */
1537 extern __inline int
1538  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1539  _mm_movemask_pi8(__m64 __A) {
1540 #ifdef __powerpc64__
1541  unsigned long long __p =
1542 #ifdef __LITTLE_ENDIAN__
1543  0x0008101820283038UL; // permute control for sign bits
1544 #else
1545  0x3830282018100800UL; // permute control for sign bits
1546 #endif
1547  return __builtin_bpermd(__p, __A);
1548 #else
1549 #ifdef __LITTLE_ENDIAN__
1550  unsigned int __mask = 0x20283038UL;
1551  unsigned int __r1 = __builtin_bpermd(__mask, __A) & 0xf;
1552  unsigned int __r2 = __builtin_bpermd(__mask, __A >> 32) & 0xf;
1553 #else
1554  unsigned int __mask = 0x38302820UL;
1555  unsigned int __r1 = __builtin_bpermd(__mask, __A >> 32) & 0xf;
1556  unsigned int __r2 = __builtin_bpermd(__mask, __A) & 0xf;
1557 #endif
1558  return (__r2 << 4) | __r1;
1559 #endif
1560 }
1561 
1562 extern __inline int
1563  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1564  _m_pmovmskb(__m64 __A) {
1565  return _mm_movemask_pi8(__A);
1566 }
1567 
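/* Illustrative usage sketch (an editorial example, not part of this header):
   _mm_movemask_pi8 packs the sign (high) bit of each of the eight bytes into
   bits 7..0 of the result. The sketch assumes _mm_set_pi8 from the companion
   mmintrin.h wrapper is available.

     __m64 __v = _mm_set_pi8(-1, 0, -1, 0, 0, 0, 0, -1);   // lane 7 first
     int __m = _mm_movemask_pi8(__v);   // 0xa1: bits 7, 5 and 0 are set
*/
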
1568 /* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
1569  in B and produce the high 16 bits of the 32-bit results. */
1570 extern __inline __m64
1571  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1572  _mm_mulhi_pu16(__m64 __A, __m64 __B) {
1573  __vector unsigned short __a, __b;
1574  __vector unsigned short __c;
1575  __vector unsigned int __w0, __w1;
1576  __vector unsigned char __xform1 = {
1577 #ifdef __LITTLE_ENDIAN__
1578  0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1579  0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1580 #else
1581  0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
1582  0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
1583 #endif
1584  };
1585 
1586  __a = (__vector unsigned short)vec_splats(__A);
1587  __b = (__vector unsigned short)vec_splats(__B);
1588 
1589  __w0 = vec_vmuleuh(__a, __b);
1590  __w1 = vec_vmulouh(__a, __b);
1591  __c = (__vector unsigned short)vec_perm(__w0, __w1, __xform1);
1592 
1593  return (__m64)((__vector long long)__c)[0];
1594 }
1595 
1596 extern __inline __m64
1597  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1598  _m_pmulhuw(__m64 __A, __m64 __B) {
1599  return _mm_mulhi_pu16(__A, __B);
1600 }
1601 
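/* Illustrative usage sketch (an editorial example, not part of this header):
   each 16-bit lane receives the upper half of the full 32-bit unsigned
   product; for instance 0xFFFF * 0xFFFF = 0xFFFE0001, so the stored lane is
   0xFFFE. The sketch assumes _mm_set1_pi16 from the companion mmintrin.h
   wrapper is available.

     __m64 __a = _mm_set1_pi16((short)0xFFFF);
     __m64 __b = _mm_set1_pi16((short)0xFFFF);
     __m64 __h = _mm_mulhi_pu16(__a, __b);   // every lane holds 0xFFFE
*/
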
1602 /* Return a combination of the four 16-bit values in A. The selector
1603  must be an immediate. */
1604 extern __inline __m64
1605  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1606  _mm_shuffle_pi16(__m64 __A, int const __N) {
1607  unsigned long __element_selector_10 = __N & 0x03;
1608  unsigned long __element_selector_32 = (__N >> 2) & 0x03;
1609  unsigned long __element_selector_54 = (__N >> 4) & 0x03;
1610  unsigned long __element_selector_76 = (__N >> 6) & 0x03;
1611  static const unsigned short __permute_selectors[4] = {
1612 #ifdef __LITTLE_ENDIAN__
1613  0x0908, 0x0B0A, 0x0D0C, 0x0F0E
1614 #else
1615  0x0607, 0x0405, 0x0203, 0x0001
1616 #endif
1617  };
1618  __m64_union __t;
1619  __vector unsigned long long __a, __p, __r;
1620 
1621 #ifdef __LITTLE_ENDIAN__
1622  __t.as_short[0] = __permute_selectors[__element_selector_10];
1623  __t.as_short[1] = __permute_selectors[__element_selector_32];
1624  __t.as_short[2] = __permute_selectors[__element_selector_54];
1625  __t.as_short[3] = __permute_selectors[__element_selector_76];
1626 #else
1627  __t.as_short[3] = __permute_selectors[__element_selector_10];
1628  __t.as_short[2] = __permute_selectors[__element_selector_32];
1629  __t.as_short[1] = __permute_selectors[__element_selector_54];
1630  __t.as_short[0] = __permute_selectors[__element_selector_76];
1631 #endif
1632  __p = vec_splats(__t.as_m64);
1633  __a = vec_splats(__A);
1634  __r = vec_perm(__a, __a, (__vector unsigned char)__p);
1635  return (__m64)((__vector long long)__r)[0];
1636 }
1637 
1638 extern __inline __m64
1639  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1640  _m_pshufw(__m64 __A, int const __N) {
1641  return _mm_shuffle_pi16(__A, __N);
1642 }
1643 
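/* Illustrative usage sketch (an editorial example, not part of this header):
   each 2-bit field of the selector picks which source lane lands in the
   corresponding destination lane, so _MM_SHUFFLE(0, 1, 2, 3) reverses the
   four halfwords. The sketch assumes _mm_set_pi16 from the companion
   mmintrin.h wrapper is available.

     __m64 __v = _mm_set_pi16(3, 2, 1, 0);   // lane values 0..3 = 0,1,2,3
     __m64 __r = _mm_shuffle_pi16(__v, _MM_SHUFFLE(0, 1, 2, 3));
     // __r lanes 0..3 are now 3, 2, 1, 0 (the input reversed)
*/
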
1644 /* Conditionally store byte elements of A into P. The high bit of each
1645  byte in the selector N determines whether the corresponding byte from
1646  A is stored. */
1647 extern __inline void
1648  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1649  _mm_maskmove_si64(__m64 __A, __m64 __N, char *__P) {
1650  __m64 __hibit = 0x8080808080808080UL;
1651  __m64 __mask, __tmp;
1652  __m64 *__p = (__m64 *)__P;
1653 
1654  __tmp = *__p;
1655  __mask = _mm_cmpeq_pi8((__N & __hibit), __hibit);
1656  __tmp = (__tmp & (~__mask)) | (__A & __mask);
1657  *__p = __tmp;
1658 }
1659 
1660 extern __inline void
1661  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1662  _m_maskmovq(__m64 __A, __m64 __N, char *__P) {
1663  _mm_maskmove_si64(__A, __N, __P);
1664 }
1665 
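/* Illustrative usage sketch (an editorial example, not part of this header):
   only bytes whose mask byte has its high bit set are written back to memory;
   the other bytes at __P are left untouched. The sketch assumes _mm_set_pi8
   and _mm_set1_pi8 from the companion mmintrin.h wrapper are available.

     char __buf[8] = {0, 0, 0, 0, 0, 0, 0, 0};
     __m64 __data = _mm_set1_pi8(0x55);
     __m64 __mask = _mm_set_pi8(0, 0, 0, 0, 0, 0, -1, -1);  // select lanes 0-1
     _mm_maskmove_si64(__data, __mask, __buf);
     // the two selected lanes now hold 0x55 (at __buf[0..1] on a
     // little-endian target); the remaining bytes stay 0
*/
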
1666 /* Compute the rounded averages of the unsigned 8-bit values in A and B. */
1667 extern __inline __m64
1668  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1669  _mm_avg_pu8(__m64 __A, __m64 __B) {
1670  __vector unsigned char __a, __b, __c;
1671 
1672  __a = (__vector unsigned char)vec_splats(__A);
1673  __b = (__vector unsigned char)vec_splats(__B);
1674  __c = vec_avg(__a, __b);
1675  return (__m64)((__vector long long)__c)[0];
1676 }
1677 
1678 extern __inline __m64
1679  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1680  _m_pavgb(__m64 __A, __m64 __B) {
1681  return _mm_avg_pu8(__A, __B);
1682 }
1683 
1684 /* Compute the rounded averages of the unsigned 16-bit values in A and B. */
1685 extern __inline __m64
1686  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1687  _mm_avg_pu16(__m64 __A, __m64 __B) {
1688  __vector unsigned short __a, __b, __c;
1689 
1690  __a = (__vector unsigned short)vec_splats(__A);
1691  __b = (__vector unsigned short)vec_splats(__B);
1692  __c = vec_avg(__a, __b);
1693  return (__m64)((__vector long long)__c)[0];
1694 }
1695 
1696 extern __inline __m64
1697  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1698  _m_pavgw(__m64 __A, __m64 __B) {
1699  return _mm_avg_pu16(__A, __B);
1700 }
1701 
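/* Illustrative usage sketch (an editorial example, not part of this header):
   the average rounds up, i.e. each lane computes (a + b + 1) >> 1, so
   avg(1, 2) yields 2 rather than 1. The sketch assumes _mm_set1_pi16 from the
   companion mmintrin.h wrapper is available.

     __m64 __a = _mm_set1_pi16(1);
     __m64 __b = _mm_set1_pi16(2);
     __m64 __c = _mm_avg_pu16(__a, __b);   // every 16-bit lane holds 2
*/
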
1702 /* Compute the sum of the absolute differences of the unsigned 8-bit
1703  values in A and B. Return the value in the lower 16-bit word; the
1704  upper words are cleared. */
1705 extern __inline __m64
1706  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1707  _mm_sad_pu8(__m64 __A, __m64 __B) {
1708  __vector unsigned char __a, __b;
1709  __vector unsigned char __vmin, __vmax, __vabsdiff;
1710  __vector signed int __vsum;
1711  const __vector unsigned int __zero = {0, 0, 0, 0};
1712  __m64_union __result = {0};
1713 
1714  __a = (__vector unsigned char)(__vector unsigned long long){0UL, __A};
1715  __b = (__vector unsigned char)(__vector unsigned long long){0UL, __B};
1716  __vmin = vec_min(__a, __b);
1717  __vmax = vec_max(__a, __b);
1718  __vabsdiff = vec_sub(__vmax, __vmin);
1719  /* Sum four groups of bytes into integers. */
1720  __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
1721  /* Sum across four integers with integer result. */
1722  __vsum = vec_sums(__vsum, (__vector signed int)__zero);
1723  /* The sum is in the rightmost 32 bits of the vector result.
1724  Transfer to a GPR and truncate to 16 bits. */
1725  __result.as_short[0] = __vsum[3];
1726  return __result.as_m64;
1727 }
1728 
1729 extern __inline __m64
1730  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1731  _m_psadbw(__m64 __A, __m64 __B) {
1732  return _mm_sad_pu8(__A, __B);
1733 }
1734 
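/* Illustrative usage sketch (an editorial example, not part of this header):
   the eight absolute byte differences are summed into the low 16-bit lane of
   the result and the remaining lanes are zero. The sketch assumes _mm_set_pi8
   and _mm_cvtm64_si64 from the companion mmintrin.h wrapper are available.

     __m64 __a = _mm_set_pi8(8, 7, 6, 5, 4, 3, 2, 1);   // lane 7 first
     __m64 __b = _mm_set_pi8(1, 2, 3, 4, 5, 6, 7, 8);
     __m64 __s = _mm_sad_pu8(__a, __b);
     // |8-1|+|7-2|+|6-3|+|5-4|+|4-5|+|3-6|+|2-7|+|1-8| = 32
     long long __sum = _mm_cvtm64_si64(__s);   // low 16 bits hold 32
*/
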
1735 /* Stores the data in A to the address P without polluting the caches. */
1736 extern __inline void
1737  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1738  _mm_stream_pi(__m64 *__P, __m64 __A) {
1739  /* Use the data cache block touch for store transient. */
1740  __asm__(" dcbtstt 0,%0" : : "b"(__P) : "memory");
1741  *__P = __A;
1742 }
1743 
1744 /* Likewise. The address must be 16-byte aligned. */
1745 extern __inline void
1746  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1747  _mm_stream_ps(float *__P, __m128 __A) {
1748  /* Use the data cache block touch for store transient. */
1749  __asm__(" dcbtstt 0,%0" : : "b"(__P) : "memory");
1750  _mm_store_ps(__P, __A);
1751 }
1752 
1753 /* Guarantees that every preceding store is globally visible before
1754  any subsequent store. */
1755 extern __inline void
1756  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1757  _mm_sfence(void) {
1758  /* Generate a lightweight sync. */
1759  __atomic_thread_fence(__ATOMIC_RELEASE);
1760 }
1761 
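/* Illustrative usage sketch (an editorial example, not part of this header):
   a producer stores data and then a ready flag; the fence keeps the data
   store visible before the flag store. The __shared_data and __shared_ready
   variables are hypothetical.

     extern float __shared_data[4];
     extern volatile int __shared_ready;

     _mm_storeu_ps(__shared_data, _mm_set1_ps(1.0f));
     _mm_sfence();        // release: the data is visible before the flag
     __shared_ready = 1;
*/
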
1762 /* The execution of the next instruction is delayed by an
1763  implementation-specific amount of time. The instruction does not modify
1764  the architectural state. This is after the pop_options pragma because
1765  it does not require SSE support in the processor--the encoding is a
1766  nop on processors that do not support it. */
1767 extern __inline void
1768  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1769  _mm_pause(void) {
1770  /* There is no exact match with this construct, but the following is
1771  close to the desired effect. */
1772 #if _ARCH_PWR8
1773  /* On POWER8 and later processors we can depend on Program Priority
1774  (PRI) and the associated "very low" PRI setting. Since we don't know
1775  what PRI this thread is running at we: 1) save the current PRI
1776  from the PPR SPR into a local GPR, 2) set the PRI to "very low"
1777  via the special or 31,31,31 encoding, and 3) issue an "isync" to
1778  ensure the PRI change takes effect before we execute any more
1779  instructions.
1780  Now we can execute a lwsync (release barrier) while we execute
1781  this thread at "very low" PRI. Finally we restore the original
1782  PRI and continue execution. */
1783  unsigned long __PPR;
1784 
1785  __asm__ volatile(" mfppr %0;"
1786  " or 31,31,31;"
1787  " isync;"
1788  " lwsync;"
1789  " isync;"
1790  " mtppr %0;"
1791  : "=r"(__PPR)
1792  :
1793  : "memory");
1794 #else
1795  /* For older processors, where we may not even have Program Priority
1796  controls, we can only depend on Heavy Weight Sync. */
1797  __atomic_thread_fence(__ATOMIC_SEQ_CST);
1798 #endif
1799 }
1800 
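/* Illustrative usage sketch (an editorial example, not part of this header):
   _mm_pause is typically placed in the body of a spin-wait loop to reduce the
   priority/power cost of the busy wait. The __lock_ready flag is
   hypothetical.

     extern volatile int __lock_ready;

     while (!__lock_ready)
       _mm_pause();   // back off while polling the flag
*/
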
1801 /* Transpose the 4x4 matrix composed of row[0-3]. */
1802 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
1803  do { \
1804  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
1805  __v4sf __t0 = vec_vmrghw(__r0, __r1); \
1806  __v4sf __t1 = vec_vmrghw(__r2, __r3); \
1807  __v4sf __t2 = vec_vmrglw(__r0, __r1); \
1808  __v4sf __t3 = vec_vmrglw(__r2, __r3); \
1809  (row0) = (__v4sf)vec_mergeh((__vector long long)__t0, \
1810  (__vector long long)__t1); \
1811  (row1) = (__v4sf)vec_mergel((__vector long long)__t0, \
1812  (__vector long long)__t1); \
1813  (row2) = (__v4sf)vec_mergeh((__vector long long)__t2, \
1814  (__vector long long)__t3); \
1815  (row3) = (__v4sf)vec_mergel((__vector long long)__t2, \
1816  (__vector long long)__t3); \
1817  } while (0)
1818 
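/* Illustrative usage sketch (an editorial example, not part of this header):
   the macro transposes four row vectors in place, so element j of row i
   becomes element i of row j. It uses only _mm_set_ps from this header.

     __m128 __row0 = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);     // {0,1,2,3}
     __m128 __row1 = _mm_set_ps(7.0f, 6.0f, 5.0f, 4.0f);     // {4,5,6,7}
     __m128 __row2 = _mm_set_ps(11.0f, 10.0f, 9.0f, 8.0f);   // {8,9,10,11}
     __m128 __row3 = _mm_set_ps(15.0f, 14.0f, 13.0f, 12.0f); // {12,13,14,15}
     _MM_TRANSPOSE4_PS(__row0, __row1, __row2, __row3);
     // __row0 is now {0,4,8,12}, __row1 {1,5,9,13}, and so on
*/
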
1819 /* For backward source compatibility. */
1820 //# include <emmintrin.h>
1821 
1822 #else
1823 #include_next <xmmintrin.h>
1824 #endif /* defined(__powerpc64__) && \
1825  * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
1826 
1827 #endif /* XMMINTRIN_H_ */