clang  19.0.0git
pmmintrin.h
Go to the documentation of this file.
1 /*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 #ifndef __PMMINTRIN_H
11 #define __PMMINTRIN_H
12 
13 #if !defined(__i386__) && !defined(__x86_64__)
14 #error "This header is only meant to be used on x86 and x64 architecture"
15 #endif
16 
17 #include <emmintrin.h>
18 
/* Attribute set shared by every function defined in this header: always
 * inline, no debug info, compiled for the "sse3,no-evex512" target features,
 * and a minimum vector width of 128 bits. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("sse3,no-evex512"), __min_vector_width__(128)))
23 
24 /// Loads data from an unaligned memory location to elements in a 128-bit
25 /// vector.
26 ///
27 /// If the address of the data is not 16-byte aligned, the instruction may
28 /// read two adjacent aligned blocks of memory to retrieve the requested
29 /// data.
30 ///
31 /// \headerfile <x86intrin.h>
32 ///
33 /// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
34 ///
35 /// \param __p
36 /// A pointer to a 128-bit integer vector containing integer values.
37 /// \returns A 128-bit vector containing the moved values.
38 static __inline__ __m128i __DEFAULT_FN_ATTRS
39 _mm_lddqu_si128(__m128i_u const *__p)
40 {
41  return (__m128i)__builtin_ia32_lddqu((char const *)__p);
42 }
43 
44 /// Adds the even-indexed values and subtracts the odd-indexed values of
45 /// two 128-bit vectors of [4 x float].
46 ///
47 /// \headerfile <x86intrin.h>
48 ///
49 /// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
50 ///
51 /// \param __a
52 /// A 128-bit vector of [4 x float] containing the left source operand.
53 /// \param __b
54 /// A 128-bit vector of [4 x float] containing the right source operand.
55 /// \returns A 128-bit vector of [4 x float] containing the alternating sums and
56 /// differences of both operands.
57 static __inline__ __m128 __DEFAULT_FN_ATTRS
58 _mm_addsub_ps(__m128 __a, __m128 __b)
59 {
60  return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
61 }
62 
63 /// Horizontally adds the adjacent pairs of values contained in two
64 /// 128-bit vectors of [4 x float].
65 ///
66 /// \headerfile <x86intrin.h>
67 ///
68 /// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
69 ///
70 /// \param __a
71 /// A 128-bit vector of [4 x float] containing one of the source operands.
72 /// The horizontal sums of the values are stored in the lower bits of the
73 /// destination.
74 /// \param __b
75 /// A 128-bit vector of [4 x float] containing one of the source operands.
76 /// The horizontal sums of the values are stored in the upper bits of the
77 /// destination.
78 /// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
79 /// both operands.
80 static __inline__ __m128 __DEFAULT_FN_ATTRS
81 _mm_hadd_ps(__m128 __a, __m128 __b)
82 {
83  return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
84 }
85 
86 /// Horizontally subtracts the adjacent pairs of values contained in two
87 /// 128-bit vectors of [4 x float].
88 ///
89 /// \headerfile <x86intrin.h>
90 ///
91 /// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
92 ///
93 /// \param __a
94 /// A 128-bit vector of [4 x float] containing one of the source operands.
95 /// The horizontal differences between the values are stored in the lower
96 /// bits of the destination.
97 /// \param __b
98 /// A 128-bit vector of [4 x float] containing one of the source operands.
99 /// The horizontal differences between the values are stored in the upper
100 /// bits of the destination.
101 /// \returns A 128-bit vector of [4 x float] containing the horizontal
102 /// differences of both operands.
103 static __inline__ __m128 __DEFAULT_FN_ATTRS
104 _mm_hsub_ps(__m128 __a, __m128 __b)
105 {
106  return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
107 }
108 
109 /// Moves and duplicates odd-indexed values from a 128-bit vector
110 /// of [4 x float] to float values stored in a 128-bit vector of
111 /// [4 x float].
112 ///
113 /// \headerfile <x86intrin.h>
114 ///
115 /// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
116 ///
117 /// \param __a
118 /// A 128-bit vector of [4 x float]. \n
119 /// Bits [127:96] of the source are written to bits [127:96] and [95:64] of
120 /// the destination. \n
121 /// Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
122 /// destination.
123 /// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
124 /// values.
125 static __inline__ __m128 __DEFAULT_FN_ATTRS
127 {
128  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
129 }
130 
131 /// Duplicates even-indexed values from a 128-bit vector of
132 /// [4 x float] to float values stored in a 128-bit vector of [4 x float].
133 ///
134 /// \headerfile <x86intrin.h>
135 ///
136 /// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
137 ///
138 /// \param __a
139 /// A 128-bit vector of [4 x float] \n
140 /// Bits [95:64] of the source are written to bits [127:96] and [95:64] of
141 /// the destination. \n
142 /// Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
143 /// destination.
144 /// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
145 /// values.
146 static __inline__ __m128 __DEFAULT_FN_ATTRS
148 {
149  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
150 }
151 
152 /// Adds the even-indexed values and subtracts the odd-indexed values of
153 /// two 128-bit vectors of [2 x double].
154 ///
155 /// \headerfile <x86intrin.h>
156 ///
157 /// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
158 ///
159 /// \param __a
160 /// A 128-bit vector of [2 x double] containing the left source operand.
161 /// \param __b
162 /// A 128-bit vector of [2 x double] containing the right source operand.
163 /// \returns A 128-bit vector of [2 x double] containing the alternating sums
164 /// and differences of both operands.
165 static __inline__ __m128d __DEFAULT_FN_ATTRS
166 _mm_addsub_pd(__m128d __a, __m128d __b)
167 {
168  return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
169 }
170 
171 /// Horizontally adds the pairs of values contained in two 128-bit
172 /// vectors of [2 x double].
173 ///
174 /// \headerfile <x86intrin.h>
175 ///
176 /// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
177 ///
178 /// \param __a
179 /// A 128-bit vector of [2 x double] containing one of the source operands.
180 /// The horizontal sum of the values is stored in the lower bits of the
181 /// destination.
182 /// \param __b
183 /// A 128-bit vector of [2 x double] containing one of the source operands.
184 /// The horizontal sum of the values is stored in the upper bits of the
185 /// destination.
186 /// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
187 /// both operands.
188 static __inline__ __m128d __DEFAULT_FN_ATTRS
189 _mm_hadd_pd(__m128d __a, __m128d __b)
190 {
191  return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
192 }
193 
194 /// Horizontally subtracts the pairs of values contained in two 128-bit
195 /// vectors of [2 x double].
196 ///
197 /// \headerfile <x86intrin.h>
198 ///
199 /// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
200 ///
201 /// \param __a
202 /// A 128-bit vector of [2 x double] containing one of the source operands.
203 /// The horizontal difference of the values is stored in the lower bits of
204 /// the destination.
205 /// \param __b
206 /// A 128-bit vector of [2 x double] containing one of the source operands.
207 /// The horizontal difference of the values is stored in the upper bits of
208 /// the destination.
209 /// \returns A 128-bit vector of [2 x double] containing the horizontal
210 /// differences of both operands.
211 static __inline__ __m128d __DEFAULT_FN_ATTRS
212 _mm_hsub_pd(__m128d __a, __m128d __b)
213 {
214  return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
215 }
216 
/// Reads one double-precision value through a pointer and replicates it
///    into both elements of a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_loaddup_pd(double const *dp);
/// \endcode
///
/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
///
/// \param dp
///    Pointer to the double-precision value to be moved and duplicated.
/// \returns A 128-bit vector of [2 x double] holding the moved and duplicated
///    values.
#define _mm_loaddup_pd(dp) _mm_load1_pd(dp)
233 
234 /// Moves and duplicates the double-precision value in the lower bits of
235 /// a 128-bit vector of [2 x double] to double-precision values stored in a
236 /// 128-bit vector of [2 x double].
237 ///
238 /// \headerfile <x86intrin.h>
239 ///
240 /// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
241 ///
242 /// \param __a
243 /// A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
244 /// [127:64] and [63:0] of the destination.
245 /// \returns A 128-bit vector of [2 x double] containing the moved and
246 /// duplicated values.
247 static __inline__ __m128d __DEFAULT_FN_ATTRS
249 {
250  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
251 }
252 
253 /// Establishes a linear address memory range to be monitored and puts
254 /// the processor in the monitor event pending state. Data stored in the
255 /// monitored address range causes the processor to exit the pending state.
256 ///
257 /// The \c MONITOR instruction can be used in kernel mode, and in other modes
258 /// if MSR <c> C001_0015h[MonMwaitUserEn] </c> is set.
259 ///
260 /// \headerfile <x86intrin.h>
261 ///
262 /// This intrinsic corresponds to the \c MONITOR instruction.
263 ///
264 /// \param __p
265 /// The memory range to be monitored. The size of the range is determined by
266 /// CPUID function 0000_0005h.
267 /// \param __extensions
268 /// Optional extensions for the monitoring state.
269 /// \param __hints
270 /// Optional hints for the monitoring state.
271 static __inline__ void __DEFAULT_FN_ATTRS
272 _mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
273 {
274  __builtin_ia32_monitor(__p, __extensions, __hints);
275 }
276 
277 /// Used with the \c MONITOR instruction to wait while the processor is in
278 /// the monitor event pending state. Data stored in the monitored address
279 /// range, or an interrupt, causes the processor to exit the pending state.
280 ///
281 /// The \c MWAIT instruction can be used in kernel mode, and in other modes if
282 /// MSR <c> C001_0015h[MonMwaitUserEn] </c> is set.
283 ///
284 /// \headerfile <x86intrin.h>
285 ///
286 /// This intrinsic corresponds to the \c MWAIT instruction.
287 ///
288 /// \param __extensions
289 /// Optional extensions for the monitoring state, which can vary by
290 /// processor.
291 /// \param __hints
292 /// Optional hints for the monitoring state, which can vary by processor.
293 static __inline__ void __DEFAULT_FN_ATTRS
294 _mm_mwait(unsigned __extensions, unsigned __hints)
295 {
296  __builtin_ia32_mwait(__extensions, __hints);
297 }
298 
299 #undef __DEFAULT_FN_ATTRS
300 
301 #endif /* __PMMINTRIN_H */
static __inline__ vector float vector float __b
Definition: altivec.h:578
static __inline__ uint32_t volatile uint32_t * __p
Definition: arm_acle.h:80
static __inline__ void int __a
Definition: emmintrin.h:4057
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_hadd_pd(__m128d __a, __m128d __b)
Horizontally adds the pairs of values contained in two 128-bit vectors of [2 x double].
Definition: pmmintrin.h:189
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_movedup_pd(__m128d __a)
Moves and duplicates the double-precision value in the lower bits of a 128-bit vector of [2 x double]...
Definition: pmmintrin.h:248
#define __DEFAULT_FN_ATTRS
Definition: pmmintrin.h:20
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_hadd_ps(__m128 __a, __m128 __b)
Horizontally adds the adjacent pairs of values contained in two 128-bit vectors of [4 x float].
Definition: pmmintrin.h:81
static __inline__ void __DEFAULT_FN_ATTRS _mm_mwait(unsigned __extensions, unsigned __hints)
Used with the MONITOR instruction to wait while the processor is in the monitor event pending state.
Definition: pmmintrin.h:294
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_addsub_pd(__m128d __a, __m128d __b)
Adds the even-indexed values and subtracts the odd-indexed values of two 128-bit vectors of [2 x doub...
Definition: pmmintrin.h:166
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_hsub_pd(__m128d __a, __m128d __b)
Horizontally subtracts the pairs of values contained in two 128-bit vectors of [2 x double].
Definition: pmmintrin.h:212
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_movehdup_ps(__m128 __a)
Moves and duplicates odd-indexed values from a 128-bit vector of [4 x float] to float values stored i...
Definition: pmmintrin.h:126
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_moveldup_ps(__m128 __a)
Duplicates even-indexed values from a 128-bit vector of [4 x float] to float values stored in a 128-b...
Definition: pmmintrin.h:147
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_addsub_ps(__m128 __a, __m128 __b)
Adds the even-indexed values and subtracts the odd-indexed values of two 128-bit vectors of [4 x floa...
Definition: pmmintrin.h:58
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_hsub_ps(__m128 __a, __m128 __b)
Horizontally subtracts the adjacent pairs of values contained in two 128-bit vectors of [4 x float].
Definition: pmmintrin.h:104
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_lddqu_si128(__m128i_u const *__p)
Loads data from an unaligned memory location to elements in a 128-bit vector.
Definition: pmmintrin.h:39
static __inline__ void __DEFAULT_FN_ATTRS _mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
Establishes a linear address memory range to be monitored and puts the processor in the monitor event...
Definition: pmmintrin.h:272