clang 20.0.0git
avx10_2niintrin.h
1 /*===---- avx10_2niintrin.h - AVX10.2 new instruction intrinsics -----------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 #ifndef __IMMINTRIN_H
10 #error "Never use <avx10_2niintrin.h> directly; include <immintrin.h> instead."
11 #endif
12 
13 #ifdef __SSE2__
14 
15 #ifndef __AVX10_2NIINTRIN_H
16 #define __AVX10_2NIINTRIN_H
17 
18 #define __DEFAULT_FN_ATTRS128 \
19  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \
20  __min_vector_width__(128)))
21 #define __DEFAULT_FN_ATTRS256 \
22  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \
23  __min_vector_width__(256)))
24 
25 /* VNNI FP16 */
26 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_dpph_ps(__m128 __W,
27  __m128h __A,
28  __m128h __B) {
29  return (__m128)__builtin_ia32_vdpphps128((__v4sf)__W, (__v8hf)__A,
30  (__v8hf)__B);
31 }
32 
33 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_dpph_ps(__m128 __W,
34  __mmask8 __U,
35  __m128h __A,
36  __m128h __B) {
37  return (__m128)__builtin_ia32_selectps_128(
38  (__mmask8)__U, (__v4sf)_mm_dpph_ps(__W, __A, __B), (__v4sf)__W);
39 }
40 
41 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_dpph_ps(__mmask8 __U,
42  __m128 __W,
43  __m128h __A,
44  __m128h __B) {
45  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
46  (__v4sf)_mm_dpph_ps(__W, __A, __B),
47  (__v4sf)_mm_setzero_ps());
48 }
49 
50 static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_dpph_ps(__m256 __W,
51  __m256h __A,
52  __m256h __B) {
53  return (__m256)__builtin_ia32_vdpphps256((__v8sf)__W, (__v16hf)__A,
54  (__v16hf)__B);
55 }
56 
57 static __inline__ __m256 __DEFAULT_FN_ATTRS256
58 _mm256_mask_dpph_ps(__m256 __W, __mmask8 __U, __m256h __A, __m256h __B) {
59  return (__m256)__builtin_ia32_selectps_256(
60  (__mmask8)__U, (__v8sf)_mm256_dpph_ps(__W, __A, __B), (__v8sf)__W);
61 }
62 
63 static __inline__ __m256 __DEFAULT_FN_ATTRS256
64 _mm256_maskz_dpph_ps(__mmask8 __U, __m256 __W, __m256h __A, __m256h __B) {
65  return (__m256)__builtin_ia32_selectps_256(
66  (__mmask8)__U, (__v8sf)_mm256_dpph_ps(__W, __A, __B),
67  (__v8sf)_mm256_setzero_ps());
68 }
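/* Usage sketch (illustrative, not part of the upstream header): the FP16
 * dot-product intrinsics above accumulate pairs of half-precision products
 * into single-precision lanes; the variable names below are hypothetical.
 *
 *   __m256h a = ..., b = ...;             // 16 _Float16 elements each
 *   __m256  acc = _mm256_setzero_ps();
 *   acc = _mm256_dpph_ps(acc, a, b);      // acc[i] += a[2i]*b[2i] + a[2i+1]*b[2i+1]
 *   acc = _mm256_mask_dpph_ps(acc, 0x0F, a, b);  // lanes 4..7 keep their old value
 */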
69 
70 /* VMPSADBW */
71 #define _mm_mask_mpsadbw_epu8(W, U, A, B, imm) \
72  ((__m128i)__builtin_ia32_selectw_128( \
73  (__mmask8)(U), (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)), \
74  (__v8hi)(__m128i)(W)))
75 
76 #define _mm_maskz_mpsadbw_epu8(U, A, B, imm) \
77  ((__m128i)__builtin_ia32_selectw_128( \
78  (__mmask8)(U), (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)), \
79  (__v8hi)_mm_setzero_si128()))
80 
81 #define _mm256_mask_mpsadbw_epu8(W, U, A, B, imm) \
82  ((__m256i)__builtin_ia32_selectw_256( \
83  (__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)), \
84  (__v16hi)(__m256i)(W)))
85 
86 #define _mm256_maskz_mpsadbw_epu8(U, A, B, imm) \
87  ((__m256i)__builtin_ia32_selectw_256( \
88  (__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)), \
89  (__v16hi)_mm256_setzero_si256()))
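/* Usage sketch (illustrative): these macros wrap the existing
 * _mm_mpsadbw_epu8 / _mm256_mpsadbw_epu8 with a per-word mask select, so the
 * immediate keeps its usual MPSADBW block-offset meaning. Hypothetical names.
 *
 *   __m128i sad = _mm_maskz_mpsadbw_epu8(0x0F, a, b, 0);  // words 4..7 zeroed
 */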
90 
91 /* VNNI INT8 */
92 static __inline__ __m128i __DEFAULT_FN_ATTRS128
93 _mm_mask_dpbssd_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
94  return (__m128i)__builtin_ia32_selectd_128(
95  __U, (__v4si)_mm_dpbssd_epi32(__W, __A, __B), (__v4si)__W);
96 }
97 
98 static __inline__ __m128i __DEFAULT_FN_ATTRS128
99 _mm_maskz_dpbssd_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
100  return (__m128i)__builtin_ia32_selectd_128(
101  __U, (__v4si)_mm_dpbssd_epi32(__W, __A, __B),
102  (__v4si)_mm_setzero_si128());
103 }
104 
105 static __inline__ __m256i __DEFAULT_FN_ATTRS256
106 _mm256_mask_dpbssd_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
107  return (__m256i)__builtin_ia32_selectd_256(
108  __U, (__v8si)_mm256_dpbssd_epi32(__W, __A, __B), (__v8si)__W);
109 }
110 
111 static __inline__ __m256i __DEFAULT_FN_ATTRS256
112 _mm256_maskz_dpbssd_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
113  return (__m256i)__builtin_ia32_selectd_256(
114  __U, (__v8si)_mm256_dpbssd_epi32(__W, __A, __B),
115  (__v8si)_mm256_setzero_si256());
116 }
117 
118 static __inline__ __m128i __DEFAULT_FN_ATTRS128
119 _mm_mask_dpbssds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
120  return (__m128i)__builtin_ia32_selectd_128(
121  __U, (__v4si)_mm_dpbssds_epi32(__W, __A, __B), (__v4si)__W);
122 }
123 
124 static __inline__ __m128i __DEFAULT_FN_ATTRS128
125 _mm_maskz_dpbssds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
126  return (__m128i)__builtin_ia32_selectd_128(
127  __U, (__v4si)_mm_dpbssds_epi32(__W, __A, __B),
128  (__v4si)_mm_setzero_si128());
129 }
130 
131 static __inline__ __m256i __DEFAULT_FN_ATTRS256
132 _mm256_mask_dpbssds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
133  return (__m256i)__builtin_ia32_selectd_256(
134  __U, (__v8si)_mm256_dpbssds_epi32(__W, __A, __B), (__v8si)__W);
135 }
136 
137 static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbssds_epi32(
138  __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
139  return (__m256i)__builtin_ia32_selectd_256(
140  __U, (__v8si)_mm256_dpbssds_epi32(__W, __A, __B),
141  (__v8si)_mm256_setzero_si256());
142 }
143 
144 static __inline__ __m128i __DEFAULT_FN_ATTRS128
145 _mm_mask_dpbsud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
146  return (__m128i)__builtin_ia32_selectd_128(
147  __U, (__v4si)_mm_dpbsud_epi32(__W, __A, __B), (__v4si)__W);
148 }
149 
150 static __inline__ __m128i __DEFAULT_FN_ATTRS128
151 _mm_maskz_dpbsud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
152  return (__m128i)__builtin_ia32_selectd_128(
153  __U, (__v4si)_mm_dpbsud_epi32(__W, __A, __B),
154  (__v4si)_mm_setzero_si128());
155 }
156 
157 static __inline__ __m256i __DEFAULT_FN_ATTRS256
158 _mm256_mask_dpbsud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
159  return (__m256i)__builtin_ia32_selectd_256(
160  __U, (__v8si)_mm256_dpbsud_epi32(__W, __A, __B), (__v8si)__W);
161 }
162 
163 static __inline__ __m256i __DEFAULT_FN_ATTRS256
164 _mm256_maskz_dpbsud_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
165  return (__m256i)__builtin_ia32_selectd_256(
166  __U, (__v8si)_mm256_dpbsud_epi32(__W, __A, __B),
167  (__v8si)_mm256_setzero_si256());
168 }
169 
170 static __inline__ __m128i __DEFAULT_FN_ATTRS128
171 _mm_mask_dpbsuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
172  return (__m128i)__builtin_ia32_selectd_128(
173  __U, (__v4si)_mm_dpbsuds_epi32(__W, __A, __B), (__v4si)__W);
174 }
175 
176 static __inline__ __m128i __DEFAULT_FN_ATTRS128
177 _mm_maskz_dpbsuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
178  return (__m128i)__builtin_ia32_selectd_128(
179  __U, (__v4si)_mm_dpbsuds_epi32(__W, __A, __B),
180  (__v4si)_mm_setzero_si128());
181 }
182 
183 static __inline__ __m256i __DEFAULT_FN_ATTRS256
184 _mm256_mask_dpbsuds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
185  return (__m256i)__builtin_ia32_selectd_256(
186  __U, (__v8si)_mm256_dpbsuds_epi32(__W, __A, __B), (__v8si)__W);
187 }
188 
189 static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbsuds_epi32(
190  __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
191  return (__m256i)__builtin_ia32_selectd_256(
192  __U, (__v8si)_mm256_dpbsuds_epi32(__W, __A, __B),
193  (__v8si)_mm256_setzero_si256());
194 }
195 
196 static __inline__ __m128i __DEFAULT_FN_ATTRS128
197 _mm_mask_dpbuud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
198  return (__m128i)__builtin_ia32_selectd_128(
199  __U, (__v4si)_mm_dpbuud_epi32(__W, __A, __B), (__v4si)__W);
200 }
201 
202 static __inline__ __m128i __DEFAULT_FN_ATTRS128
203 _mm_maskz_dpbuud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
204  return (__m128i)__builtin_ia32_selectd_128(
205  __U, (__v4si)_mm_dpbuud_epi32(__W, __A, __B),
206  (__v4si)_mm_setzero_si128());
207 }
208 
209 static __inline__ __m256i __DEFAULT_FN_ATTRS256
210 _mm256_mask_dpbuud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
211  return (__m256i)__builtin_ia32_selectd_256(
212  __U, (__v8si)_mm256_dpbuud_epi32(__W, __A, __B), (__v8si)__W);
213 }
214 
215 static __inline__ __m256i __DEFAULT_FN_ATTRS256
216 _mm256_maskz_dpbuud_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
217  return (__m256i)__builtin_ia32_selectd_256(
218  __U, (__v8si)_mm256_dpbuud_epi32(__W, __A, __B),
219  (__v8si)_mm256_setzero_si256());
220 }
221 
222 static __inline__ __m128i __DEFAULT_FN_ATTRS128
223 _mm_mask_dpbuuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
224  return (__m128i)__builtin_ia32_selectd_128(
225  __U, (__v4si)_mm_dpbuuds_epi32(__W, __A, __B), (__v4si)__W);
226 }
227 
228 static __inline__ __m128i __DEFAULT_FN_ATTRS128
229 _mm_maskz_dpbuuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
230  return (__m128i)__builtin_ia32_selectd_128(
231  __U, (__v4si)_mm_dpbuuds_epi32(__W, __A, __B),
232  (__v4si)_mm_setzero_si128());
233 }
234 
235 static __inline__ __m256i __DEFAULT_FN_ATTRS256
236 _mm256_mask_dpbuuds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
237  return (__m256i)__builtin_ia32_selectd_256(
238  __U, (__v8si)_mm256_dpbuuds_epi32(__W, __A, __B), (__v8si)__W);
239 }
240 
241 static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbuuds_epi32(
242  __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
243  return (__m256i)__builtin_ia32_selectd_256(
244  __U, (__v8si)_mm256_dpbuuds_epi32(__W, __A, __B),
245  (__v8si)_mm256_setzero_si256());
246 }
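/* Usage sketch (illustrative): the masked byte dot-products above reuse the
 * AVX-VNNI-INT8 base intrinsics (signed/unsigned operand combinations, with
 * and without saturation) and select per 32-bit lane. Hypothetical names.
 *
 *   __m256i acc = _mm256_setzero_si256();
 *   acc = _mm256_mask_dpbssd_epi32(acc, 0xFF, a, b);    // signed x signed bytes
 *   acc = _mm256_maskz_dpbuuds_epi32(0x0F, acc, a, b);  // unsigned, saturating; lanes 4..7 zeroed
 */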
247 
248 /* VNNI INT16 */
249 static __inline__ __m128i __DEFAULT_FN_ATTRS128
250 _mm_mask_dpwsud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
251  return (__m128i)__builtin_ia32_selectd_128(
252  (__mmask8)__U, (__v4si)_mm_dpwsud_epi32(__A, __B, __C), (__v4si)__A);
253 }
254 
255 static __inline__ __m128i __DEFAULT_FN_ATTRS128
256 _mm_maskz_dpwsud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
257  return (__m128i)__builtin_ia32_selectd_128(
258  (__mmask8)__U, (__v4si)_mm_dpwsud_epi32(__A, __B, __C),
259  (__v4si)_mm_setzero_si128());
260 }
261 
262 static __inline__ __m256i __DEFAULT_FN_ATTRS256
263 _mm256_mask_dpwsud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
264  return (__m256i)__builtin_ia32_selectd_256(
265  (__mmask8)__U, (__v8si)_mm256_dpwsud_epi32(__A, __B, __C), (__v8si)__A);
266 }
267 
268 static __inline__ __m256i __DEFAULT_FN_ATTRS256
269 _mm256_maskz_dpwsud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
270  return (__m256i)__builtin_ia32_selectd_256(
271  (__mmask8)__U, (__v8si)_mm256_dpwsud_epi32(__A, __B, __C),
272  (__v8si)_mm256_setzero_si256());
273 }
274 
275 static __inline__ __m128i __DEFAULT_FN_ATTRS128
276 _mm_mask_dpwsuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
277  return (__m128i)__builtin_ia32_selectd_128(
278  (__mmask8)__U, (__v4si)_mm_dpwsuds_epi32(__A, __B, __C), (__v4si)__A);
279 }
280 
281 static __inline__ __m128i __DEFAULT_FN_ATTRS128
282 _mm_maskz_dpwsuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
283  return (__m128i)__builtin_ia32_selectd_128(
284  (__mmask8)__U, (__v4si)_mm_dpwsuds_epi32(__A, __B, __C),
285  (__v4si)_mm_setzero_si128());
286 }
287 
288 static __inline__ __m256i __DEFAULT_FN_ATTRS256
289 _mm256_mask_dpwsuds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
290  return (__m256i)__builtin_ia32_selectd_256(
291  (__mmask8)__U, (__v8si)_mm256_dpwsuds_epi32(__A, __B, __C), (__v8si)__A);
292 }
293 
294 static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwsuds_epi32(
295  __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
296  return (__m256i)__builtin_ia32_selectd_256(
297  (__mmask8)__U, (__v8si)_mm256_dpwsuds_epi32(__A, __B, __C),
298  (__v8si)_mm256_setzero_si256());
299 }
300 
301 static __inline__ __m128i __DEFAULT_FN_ATTRS128
302 _mm_mask_dpwusd_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
303  return (__m128i)__builtin_ia32_selectd_128(
304  (__mmask8)__U, (__v4si)_mm_dpwusd_epi32(__A, __B, __C), (__v4si)__A);
305 }
306 
307 static __inline__ __m128i __DEFAULT_FN_ATTRS128
308 _mm_maskz_dpwusd_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
309  return (__m128i)__builtin_ia32_selectd_128(
310  (__mmask8)__U, (__v4si)_mm_dpwusd_epi32(__A, __B, __C),
311  (__v4si)_mm_setzero_si128());
312 }
313 
314 static __inline__ __m256i __DEFAULT_FN_ATTRS256
315 _mm256_mask_dpwusd_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
316  return (__m256i)__builtin_ia32_selectd_256(
317  (__mmask8)__U, (__v8si)_mm256_dpwusd_epi32(__A, __B, __C), (__v8si)__A);
318 }
319 
320 static __inline__ __m256i __DEFAULT_FN_ATTRS256
321 _mm256_maskz_dpwusd_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
322  return (__m256i)__builtin_ia32_selectd_256(
323  (__mmask8)__U, (__v8si)_mm256_dpwusd_epi32(__A, __B, __C),
324  (__v8si)_mm256_setzero_si256());
325 }
326 
327 static __inline__ __m128i __DEFAULT_FN_ATTRS128
328 _mm_mask_dpwusds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
329  return (__m128i)__builtin_ia32_selectd_128(
330  (__mmask8)__U, (__v4si)_mm_dpwusds_epi32(__A, __B, __C), (__v4si)__A);
331 }
332 
333 static __inline__ __m128i __DEFAULT_FN_ATTRS128
334 _mm_maskz_dpwusds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
335  return (__m128i)__builtin_ia32_selectd_128(
336  (__mmask8)__U, (__v4si)_mm_dpwusds_epi32(__A, __B, __C),
337  (__v4si)_mm_setzero_si128());
338 }
339 
340 static __inline__ __m256i __DEFAULT_FN_ATTRS256
341 _mm256_mask_dpwusds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
342  return (__m256i)__builtin_ia32_selectd_256(
343  (__mmask8)__U, (__v8si)_mm256_dpwusds_epi32(__A, __B, __C), (__v8si)__A);
344 }
345 
346 static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwusds_epi32(
347  __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
348  return (__m256i)__builtin_ia32_selectd_256(
349  (__mmask8)__U, (__v8si)_mm256_dpwusds_epi32(__A, __B, __C),
350  (__v8si)_mm256_setzero_si256());
351 }
352 
353 static __inline__ __m128i __DEFAULT_FN_ATTRS128
354 _mm_mask_dpwuud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
355  return (__m128i)__builtin_ia32_selectd_128(
356  (__mmask8)__U, (__v4si)_mm_dpwuud_epi32(__A, __B, __C), (__v4si)__A);
357 }
358 
359 static __inline__ __m128i __DEFAULT_FN_ATTRS128
360 _mm_maskz_dpwuud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
361  return (__m128i)__builtin_ia32_selectd_128(
362  (__mmask8)__U, (__v4si)_mm_dpwuud_epi32(__A, __B, __C),
363  (__v4si)_mm_setzero_si128());
364 }
365 
366 static __inline__ __m256i __DEFAULT_FN_ATTRS256
367 _mm256_mask_dpwuud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
368  return (__m256i)__builtin_ia32_selectd_256(
369  (__mmask8)__U, (__v8si)_mm256_dpwuud_epi32(__A, __B, __C), (__v8si)__A);
370 }
371 
372 static __inline__ __m256i __DEFAULT_FN_ATTRS256
373 _mm256_maskz_dpwuud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
374  return (__m256i)__builtin_ia32_selectd_256(
375  (__mmask8)__U, (__v8si)_mm256_dpwuud_epi32(__A, __B, __C),
376  (__v8si)_mm256_setzero_si256());
377 }
378 
379 static __inline__ __m128i __DEFAULT_FN_ATTRS128
380 _mm_mask_dpwuuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
381  return (__m128i)__builtin_ia32_selectd_128(
382  (__mmask8)__U, (__v4si)_mm_dpwuuds_epi32(__A, __B, __C), (__v4si)__A);
383 }
384 
385 static __inline__ __m128i __DEFAULT_FN_ATTRS128
386 _mm_maskz_dpwuuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
387  return (__m128i)__builtin_ia32_selectd_128(
388  (__mmask8)__U, (__v4si)_mm_dpwuuds_epi32(__A, __B, __C),
389  (__v4si)_mm_setzero_si128());
390 }
391 
392 static __inline__ __m256i __DEFAULT_FN_ATTRS256
393 _mm256_mask_dpwuuds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
394  return (__m256i)__builtin_ia32_selectd_256(
395  (__mmask8)__U, (__v8si)_mm256_dpwuuds_epi32(__A, __B, __C), (__v8si)__A);
396 }
397 
398 static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwuuds_epi32(
399  __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
400  return (__m256i)__builtin_ia32_selectd_256(
401  (__mmask8)__U, (__v8si)_mm256_dpwuuds_epi32(__A, __B, __C),
402  (__v8si)_mm256_setzero_si256());
403 }
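/* Usage sketch (illustrative): the same masking pattern applies to the 16-bit
 * dot products; each 32-bit lane accumulates two word products. Hypothetical
 * names.
 *
 *   __m128i acc = _mm_setzero_si128();
 *   acc = _mm_mask_dpwsud_epi32(acc, 0x3, b, c);  // only lanes 0 and 1 accumulate
 */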
404 
405 /* YMM Rounding */
406 #define _mm256_add_round_pd(A, B, R) \
407  ((__m256d)__builtin_ia32_vaddpd256_round((__v4df)(__m256d)(A), \
408  (__v4df)(__m256d)(B), (int)(R)))
409 
410 #define _mm256_mask_add_round_pd(W, U, A, B, R) \
411  ((__m256d)__builtin_ia32_selectpd_256( \
412  (__mmask8)(U), (__v4df)_mm256_add_round_pd((A), (B), (R)), \
413  (__v4df)(__m256d)(W)))
414 
415 #define _mm256_maskz_add_round_pd(U, A, B, R) \
416  ((__m256d)__builtin_ia32_selectpd_256( \
417  (__mmask8)(U), (__v4df)_mm256_add_round_pd((A), (B), (R)), \
418  (__v4df)_mm256_setzero_pd()))
419 
420 #define _mm256_add_round_ph(A, B, R) \
421  ((__m256h)__builtin_ia32_vaddph256_round((__v16hf)(__m256h)(A), \
422  (__v16hf)(__m256h)(B), (int)(R)))
423 
424 #define _mm256_mask_add_round_ph(W, U, A, B, R) \
425  ((__m256h)__builtin_ia32_selectph_256( \
426  (__mmask16)(U), (__v16hf)_mm256_add_round_ph((A), (B), (R)), \
427  (__v16hf)(__m256h)(W)))
428 
429 #define _mm256_maskz_add_round_ph(U, A, B, R) \
430  ((__m256h)__builtin_ia32_selectph_256( \
431  (__mmask16)(U), (__v16hf)_mm256_add_round_ph((A), (B), (R)), \
432  (__v16hf)_mm256_setzero_ph()))
433 
434 #define _mm256_add_round_ps(A, B, R) \
435  ((__m256)__builtin_ia32_vaddps256_round((__v8sf)(__m256)(A), \
436  (__v8sf)(__m256)(B), (int)(R)))
437 
438 #define _mm256_mask_add_round_ps(W, U, A, B, R) \
439  ((__m256)__builtin_ia32_selectps_256( \
440  (__mmask8)(U), (__v8sf)_mm256_add_round_ps((A), (B), (R)), \
441  (__v8sf)(__m256)(W)))
442 
443 #define _mm256_maskz_add_round_ps(U, A, B, R) \
444  ((__m256)__builtin_ia32_selectps_256( \
445  (__mmask8)(U), (__v8sf)_mm256_add_round_ps((A), (B), (R)), \
446  (__v8sf)_mm256_setzero_ps()))
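/* Usage sketch (illustrative): the *_round_* macros take a compile-time
 * rounding-control argument, typically built from the _MM_FROUND_* macros in
 * <immintrin.h>. Hypothetical names.
 *
 *   __m256d sum = _mm256_add_round_pd(
 *       x, y, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 */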
447 
448 #define _mm256_cmp_round_pd_mask(A, B, P, R) \
449  ((__mmask8)__builtin_ia32_vcmppd256_round_mask( \
450  (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(P), (__mmask8)-1, \
451  (int)(R)))
452 
453 #define _mm256_mask_cmp_round_pd_mask(U, A, B, P, R) \
454  ((__mmask8)__builtin_ia32_vcmppd256_round_mask( \
455  (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(P), (__mmask8)(U), \
456  (int)(R)))
457 
458 #define _mm256_cmp_round_ph_mask(A, B, P, R) \
459  ((__mmask16)__builtin_ia32_vcmpph256_round_mask( \
460  (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(P), (__mmask16)-1, \
461  (int)(R)))
462 
463 #define _mm256_mask_cmp_round_ph_mask(U, A, B, P, R) \
464  ((__mmask16)__builtin_ia32_vcmpph256_round_mask( \
465  (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(P), (__mmask16)(U), \
466  (int)(R)))
467 
468 #define _mm256_cmp_round_ps_mask(A, B, P, R) \
469  ((__mmask8)__builtin_ia32_vcmpps256_round_mask( \
470  (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(P), (__mmask8)-1, \
471  (int)(R)))
472 
473 #define _mm256_mask_cmp_round_ps_mask(U, A, B, P, R) \
474  ((__mmask8)__builtin_ia32_vcmpps256_round_mask( \
475  (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(P), (__mmask8)(U), \
476  (int)(R)))
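/* Usage sketch (illustrative): the compare forms return a mask value and take
 * one of the _CMP_* predicates plus a suppress-all-exceptions control.
 * Hypothetical names.
 *
 *   __mmask8 lt = _mm256_cmp_round_pd_mask(x, y, _CMP_LT_OQ, _MM_FROUND_NO_EXC);
 */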
477 
478 #define _mm256_cvt_roundepi32_ph(A, R) \
479  ((__m128h)__builtin_ia32_vcvtdq2ph256_round_mask( \
480  (__v8si)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
481 
482 #define _mm256_mask_cvt_roundepi32_ph(W, U, A, R) \
483  ((__m128h)__builtin_ia32_vcvtdq2ph256_round_mask((__v8si)(A), (__v8hf)(W), \
484  (__mmask8)(U), (int)(R)))
485 
486 #define _mm256_maskz_cvt_roundepi32_ph(U, A, R) \
487  ((__m128h)__builtin_ia32_vcvtdq2ph256_round_mask( \
488  (__v8si)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
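/* Usage sketch (illustrative): the conversion macros follow the same
 * unmasked/mask/maskz pattern; here eight 32-bit integers narrow to eight
 * FP16 values in a __m128h. Hypothetical names.
 *
 *   __m128h h = _mm256_cvt_roundepi32_ph(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 */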
489 
490 #define _mm256_cvt_roundepi32_ps(A, R) \
491  ((__m256)__builtin_ia32_vcvtdq2ps256_round_mask((__v8si)(__m256i)(A), \
492  (__v8sf)_mm256_setzero_ps(), \
493  (__mmask8)-1, (int)(R)))
494 
495 #define _mm256_mask_cvt_roundepi32_ps(W, U, A, R) \
496  ((__m256)__builtin_ia32_vcvtdq2ps256_round_mask( \
497  (__v8si)(__m256i)(A), (__v8sf)(__m256)(W), (__mmask8)(U), (int)(R)))
498 
499 #define _mm256_maskz_cvt_roundepi32_ps(U, A, R) \
500  ((__m256)__builtin_ia32_vcvtdq2ps256_round_mask((__v8si)(__m256i)(A), \
501  (__v8sf)_mm256_setzero_ps(), \
502  (__mmask8)(U), (int)(R)))
503 
504 #define _mm256_cvt_roundpd_epi32(A, R) \
505  ((__m128i)__builtin_ia32_vcvtpd2dq256_round_mask( \
506  (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)-1, \
507  (int)(R)))
508 
509 #define _mm256_mask_cvt_roundpd_epi32(W, U, A, R) \
510  ((__m128i)__builtin_ia32_vcvtpd2dq256_round_mask( \
511  (__v4df)(__m256d)(A), (__v4si)(__m128i)(W), (__mmask8)(U), (int)(R)))
512 
513 #define _mm256_maskz_cvt_roundpd_epi32(U, A, R) \
514  ((__m128i)__builtin_ia32_vcvtpd2dq256_round_mask( \
515  (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)(U), \
516  (int)(R)))
517 
518 #define _mm256_cvt_roundpd_ph(A, R) \
519  ((__m128h)__builtin_ia32_vcvtpd2ph256_round_mask( \
520  (__v4df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
521 
522 #define _mm256_mask_cvt_roundpd_ph(W, U, A, R) \
523  ((__m128h)__builtin_ia32_vcvtpd2ph256_round_mask((__v4df)(A), (__v8hf)(W), \
524  (__mmask8)(U), (int)(R)))
525 
526 #define _mm256_maskz_cvt_roundpd_ph(U, A, R) \
527  ((__m128h)__builtin_ia32_vcvtpd2ph256_round_mask( \
528  (__v4df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
529 
530 #define _mm256_cvt_roundpd_ps(A, R) \
531  ((__m128)__builtin_ia32_vcvtpd2ps256_round_mask( \
532  (__v4df)(__m256d)(A), (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)))
533 
534 #define _mm256_mask_cvt_roundpd_ps(W, U, A, R) \
535  ((__m128)__builtin_ia32_vcvtpd2ps256_round_mask( \
536  (__v4df)(__m256d)(A), (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))
537 
538 #define _mm256_maskz_cvt_roundpd_ps(U, A, R) \
539  ((__m128)__builtin_ia32_vcvtpd2ps256_round_mask((__v4df)(__m256d)(A), \
540  (__v4sf)_mm_setzero_ps(), \
541  (__mmask8)(U), (int)(R)))
542 
543 #define _mm256_cvt_roundpd_epi64(A, R) \
544  ((__m256i)__builtin_ia32_vcvtpd2qq256_round_mask( \
545  (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1, \
546  (int)(R)))
547 
548 #define _mm256_mask_cvt_roundpd_epi64(W, U, A, R) \
549  ((__m256i)__builtin_ia32_vcvtpd2qq256_round_mask( \
550  (__v4df)(__m256d)(A), (__v4di)(__m256i)(W), (__mmask8)(U), (int)(R)))
551 
552 #define _mm256_maskz_cvt_roundpd_epi64(U, A, R) \
553  ((__m256i)__builtin_ia32_vcvtpd2qq256_round_mask( \
554  (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), \
555  (int)(R)))
556 
557 #define _mm256_cvt_roundpd_epu32(A, R) \
558  ((__m128i)__builtin_ia32_vcvtpd2udq256_round_mask( \
559  (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)-1, \
560  (int)(R)))
561 
562 #define _mm256_mask_cvt_roundpd_epu32(W, U, A, R) \
563  ((__m128i)__builtin_ia32_vcvtpd2udq256_round_mask( \
564  (__v4df)(__m256d)(A), (__v4su)(__m128i)(W), (__mmask8)(U), (int)(R)))
565 
566 #define _mm256_maskz_cvt_roundpd_epu32(U, A, R) \
567  ((__m128i)__builtin_ia32_vcvtpd2udq256_round_mask( \
568  (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)(U), \
569  (int)(R)))
570 
571 #define _mm256_cvt_roundpd_epu64(A, R) \
572  ((__m256i)__builtin_ia32_vcvtpd2uqq256_round_mask( \
573  (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1, \
574  (int)(R)))
575 
576 #define _mm256_mask_cvt_roundpd_epu64(W, U, A, R) \
577  ((__m256i)__builtin_ia32_vcvtpd2uqq256_round_mask( \
578  (__v4df)(__m256d)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R)))
579 
580 #define _mm256_maskz_cvt_roundpd_epu64(U, A, R) \
581  ((__m256i)__builtin_ia32_vcvtpd2uqq256_round_mask( \
582  (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), \
583  (int)(R)))
584 
585 #define _mm256_cvt_roundph_epi32(A, R) \
586  ((__m256i)__builtin_ia32_vcvtph2dq256_round_mask( \
587  (__v8hf)(A), (__v8si)_mm256_undefined_si256(), (__mmask8)(-1), \
588  (int)(R)))
589 
590 #define _mm256_mask_cvt_roundph_epi32(W, U, A, R) \
591  ((__m256i)__builtin_ia32_vcvtph2dq256_round_mask((__v8hf)(A), (__v8si)(W), \
592  (__mmask8)(U), (int)(R)))
593 
594 #define _mm256_maskz_cvt_roundph_epi32(U, A, R) \
595  ((__m256i)__builtin_ia32_vcvtph2dq256_round_mask( \
596  (__v8hf)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))
597 
598 #define _mm256_cvt_roundph_pd(A, R) \
599  ((__m256d)__builtin_ia32_vcvtph2pd256_round_mask( \
600  (__v8hf)(A), (__v4df)_mm256_undefined_pd(), (__mmask8)(-1), (int)(R)))
601 
602 #define _mm256_mask_cvt_roundph_pd(W, U, A, R) \
603  ((__m256d)__builtin_ia32_vcvtph2pd256_round_mask((__v8hf)(A), (__v4df)(W), \
604  (__mmask8)(U), (int)(R)))
605 
606 #define _mm256_maskz_cvt_roundph_pd(U, A, R) \
607  ((__m256d)__builtin_ia32_vcvtph2pd256_round_mask( \
608  (__v8hf)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U), (int)(R)))
609 
610 #define _mm256_cvtx_roundph_ps(A, R) \
611  ((__m256)__builtin_ia32_vcvtph2psx256_round_mask( \
612  (__v8hf)(A), (__v8sf)_mm256_undefined_ps(), (__mmask8)(-1), (int)(R)))
613 
614 #define _mm256_mask_cvtx_roundph_ps(W, U, A, R) \
615  ((__m256)__builtin_ia32_vcvtph2psx256_round_mask((__v8hf)(A), (__v8sf)(W), \
616  (__mmask8)(U), (int)(R)))
617 
618 #define _mm256_maskz_cvtx_roundph_ps(U, A, R) \
619  ((__m256)__builtin_ia32_vcvtph2psx256_round_mask( \
620  (__v8hf)(A), (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), (int)(R)))
621 
622 #define _mm256_cvt_roundph_epi64(A, R) \
623  ((__m256i)__builtin_ia32_vcvtph2qq256_round_mask( \
624  (__v8hf)(A), (__v4di)_mm256_undefined_si256(), (__mmask8)(-1), \
625  (int)(R)))
626 
627 #define _mm256_mask_cvt_roundph_epi64(W, U, A, R) \
628  ((__m256i)__builtin_ia32_vcvtph2qq256_round_mask((__v8hf)(A), (__v4di)(W), \
629  (__mmask8)(U), (int)(R)))
630 
631 #define _mm256_maskz_cvt_roundph_epi64(U, A, R) \
632  ((__m256i)__builtin_ia32_vcvtph2qq256_round_mask( \
633  (__v8hf)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))
634 
635 #define _mm256_cvt_roundph_epu32(A, R) \
636  ((__m256i)__builtin_ia32_vcvtph2udq256_round_mask( \
637  (__v8hf)(A), (__v8su)_mm256_undefined_si256(), (__mmask8)(-1), \
638  (int)(R)))
639 
640 #define _mm256_mask_cvt_roundph_epu32(W, U, A, R) \
641  ((__m256i)__builtin_ia32_vcvtph2udq256_round_mask((__v8hf)(A), (__v8su)(W), \
642  (__mmask8)(U), (int)(R)))
643 
644 #define _mm256_maskz_cvt_roundph_epu32(U, A, R) \
645  ((__m256i)__builtin_ia32_vcvtph2udq256_round_mask( \
646  (__v8hf)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))
647 
648 #define _mm256_cvt_roundph_epu64(A, R) \
649  ((__m256i)__builtin_ia32_vcvtph2uqq256_round_mask( \
650  (__v8hf)(A), (__v4du)_mm256_undefined_si256(), (__mmask8)(-1), \
651  (int)(R)))
652 
653 #define _mm256_mask_cvt_roundph_epu64(W, U, A, R) \
654  ((__m256i)__builtin_ia32_vcvtph2uqq256_round_mask((__v8hf)(A), (__v4du)(W), \
655  (__mmask8)(U), (int)(R)))
656 
657 #define _mm256_maskz_cvt_roundph_epu64(U, A, R) \
658  ((__m256i)__builtin_ia32_vcvtph2uqq256_round_mask( \
659  (__v8hf)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))
660 
661 #define _mm256_cvt_roundph_epu16(A, R) \
662  ((__m256i)__builtin_ia32_vcvtph2uw256_round_mask( \
663  (__v16hf)(A), (__v16hu)_mm256_undefined_si256(), (__mmask16)(-1), \
664  (int)(R)))
665 
666 #define _mm256_mask_cvt_roundph_epu16(W, U, A, R) \
667  ((__m256i)__builtin_ia32_vcvtph2uw256_round_mask((__v16hf)(A), (__v16hu)(W), \
668  (__mmask16)(U), (int)(R)))
669 
670 #define _mm256_maskz_cvt_roundph_epu16(U, A, R) \
671  ((__m256i)__builtin_ia32_vcvtph2uw256_round_mask( \
672  (__v16hf)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)(U), \
673  (int)(R)))
674 
675 #define _mm256_cvt_roundph_epi16(A, R) \
676  ((__m256i)__builtin_ia32_vcvtph2w256_round_mask( \
677  (__v16hf)(A), (__v16hi)_mm256_undefined_si256(), (__mmask16)(-1), \
678  (int)(R)))
679 
680 #define _mm256_mask_cvt_roundph_epi16(W, U, A, R) \
681  ((__m256i)__builtin_ia32_vcvtph2w256_round_mask((__v16hf)(A), (__v16hi)(W), \
682  (__mmask16)(U), (int)(R)))
683 
684 #define _mm256_maskz_cvt_roundph_epi16(U, A, R) \
685  ((__m256i)__builtin_ia32_vcvtph2w256_round_mask( \
686  (__v16hf)(A), (__v16hi)_mm256_setzero_si256(), (__mmask16)(U), \
687  (int)(R)))
688 
689 #define _mm256_cvt_roundps_epi32(A, R) \
690  ((__m256i)__builtin_ia32_vcvtps2dq256_round_mask( \
691  (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)-1, \
692  (int)(R)))
693 
694 #define _mm256_mask_cvt_roundps_epi32(W, U, A, R) \
695  ((__m256i)__builtin_ia32_vcvtps2dq256_round_mask( \
696  (__v8sf)(__m256)(A), (__v8si)(__m256i)(W), (__mmask8)(U), (int)(R)))
697 
698 #define _mm256_maskz_cvt_roundps_epi32(U, A, R) \
699  ((__m256i)__builtin_ia32_vcvtps2dq256_round_mask( \
700  (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), \
701  (int)(R)))
702 
703 #define _mm256_cvt_roundps_pd(A, R) \
704  ((__m256d)__builtin_ia32_vcvtps2pd256_round_mask( \
705  (__v4sf)(__m128)(A), (__v4df)_mm256_undefined_pd(), (__mmask8)-1, \
706  (int)(R)))
707 
708 #define _mm256_mask_cvt_roundps_pd(W, U, A, R) \
709  ((__m256d)__builtin_ia32_vcvtps2pd256_round_mask( \
710  (__v4sf)(__m128)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))
711 
712 #define _mm256_maskz_cvt_roundps_pd(U, A, R) \
713  ((__m256d)__builtin_ia32_vcvtps2pd256_round_mask( \
714  (__v4sf)(__m128)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U), \
715  (int)(R)))
716 
717 #define _mm256_cvt_roundps_ph(A, I) \
718  ((__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \
719  (__v8hi)_mm_undefined_si128(), \
720  (__mmask8)-1))
721 
722 /* FIXME: We may use these in the future.
723 #define _mm256_cvt_roundps_ph(A, I) \
724  ((__m128i)__builtin_ia32_vcvtps2ph256_round_mask( \
725  (__v8sf)(__m256)(A), (int)(I), (__v8hi)_mm_undefined_si128(), \
726  (__mmask8)-1))
727 #define _mm256_mask_cvt_roundps_ph(U, W, A, I) \
728  ((__m128i)__builtin_ia32_vcvtps2ph256_round_mask( \
729  (__v8sf)(__m256)(A), (int)(I), (__v8hi)(__m128i)(U), (__mmask8)(W)))
730 #define _mm256_maskz_cvt_roundps_ph(W, A, I) \
731  ((__m128i)__builtin_ia32_vcvtps2ph256_round_mask( \
732  (__v8sf)(__m256)(A), (int)(I), (__v8hi)_mm_setzero_si128(), \
733  (__mmask8)(W))) */
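/* Usage sketch (illustrative): unlike the other conversions here, the ps->ph
 * form takes the CVTPS2PH rounding immediate directly rather than a separate
 * round argument. Hypothetical names.
 *
 *   __m128i packed = _mm256_cvt_roundps_ph(v, _MM_FROUND_TO_NEAREST_INT);
 */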
734 
735 #define _mm256_cvtx_roundps_ph(A, R) \
736  ((__m128h)__builtin_ia32_vcvtps2phx256_round_mask( \
737  (__v8sf)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
738 
739 #define _mm256_mask_cvtx_roundps_ph(W, U, A, R) \
740  ((__m128h)__builtin_ia32_vcvtps2phx256_round_mask((__v8sf)(A), (__v8hf)(W), \
741  (__mmask8)(U), (int)(R)))
742 
743 #define _mm256_maskz_cvtx_roundps_ph(U, A, R) \
744  ((__m128h)__builtin_ia32_vcvtps2phx256_round_mask( \
745  (__v8sf)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
746 
747 #define _mm256_cvt_roundps_epi64(A, R) \
748  ((__m256i)__builtin_ia32_vcvtps2qq256_round_mask( \
749  (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1, \
750  (int)(R)))
751 
752 #define _mm256_mask_cvt_roundps_epi64(W, U, A, R) \
753  ((__m256i)__builtin_ia32_vcvtps2qq256_round_mask( \
754  (__v4sf)(__m128)(A), (__v4di)(__m256i)(W), (__mmask8)(U), (int)(R)))
755 
756 #define _mm256_maskz_cvt_roundps_epi64(U, A, R) \
757  ((__m256i)__builtin_ia32_vcvtps2qq256_round_mask( \
758  (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), \
759  (int)(R)))
760 
761 #define _mm256_cvt_roundps_epu32(A, R) \
762  ((__m256i)__builtin_ia32_vcvtps2udq256_round_mask( \
763  (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1, \
764  (int)(R)))
765 
766 #define _mm256_mask_cvt_roundps_epu32(W, U, A, R) \
767  ((__m256i)__builtin_ia32_vcvtps2udq256_round_mask( \
768  (__v8sf)(__m256)(A), (__v8su)(__m256i)(W), (__mmask8)(U), (int)(R)))
769 
770 #define _mm256_maskz_cvt_roundps_epu32(U, A, R) \
771  ((__m256i)__builtin_ia32_vcvtps2udq256_round_mask( \
772  (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U), \
773  (int)(R)))
774 
775 #define _mm256_cvt_roundps_epu64(A, R) \
776  ((__m256i)__builtin_ia32_vcvtps2uqq256_round_mask( \
777  (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1, \
778  (int)(R)))
779 
780 #define _mm256_mask_cvt_roundps_epu64(W, U, A, R) \
781  ((__m256i)__builtin_ia32_vcvtps2uqq256_round_mask( \
782  (__v4sf)(__m128)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R)))
783 
784 #define _mm256_maskz_cvt_roundps_epu64(U, A, R) \
785  ((__m256i)__builtin_ia32_vcvtps2uqq256_round_mask( \
786  (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), \
787  (int)(R)))
788 
789 #define _mm256_cvt_roundepi64_pd(A, R) \
790  ((__m256d)__builtin_ia32_vcvtqq2pd256_round_mask( \
791  (__v4di)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)-1, \
792  (int)(R)))
793 
794 #define _mm256_mask_cvt_roundepi64_pd(W, U, A, R) \
795  ((__m256d)__builtin_ia32_vcvtqq2pd256_round_mask( \
796  (__v4di)(__m256i)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))
797 
798 #define _mm256_maskz_cvt_roundepi64_pd(U, A, R) \
799  ((__m256d)__builtin_ia32_vcvtqq2pd256_round_mask( \
800  (__v4di)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U), \
801  (int)(R)))
802 
803 #define _mm256_cvt_roundepi64_ph(A, R) \
804  ((__m128h)__builtin_ia32_vcvtqq2ph256_round_mask( \
805  (__v4di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
806 
807 #define _mm256_mask_cvt_roundepi64_ph(W, U, A, R) \
808  ((__m128h)__builtin_ia32_vcvtqq2ph256_round_mask((__v4di)(A), (__v8hf)(W), \
809  (__mmask8)(U), (int)(R)))
810 
811 #define _mm256_maskz_cvt_roundepi64_ph(U, A, R) \
812  ((__m128h)__builtin_ia32_vcvtqq2ph256_round_mask( \
813  (__v4di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
814 
815 #define _mm256_cvt_roundepi64_ps(A, R) \
816  ((__m128)__builtin_ia32_vcvtqq2ps256_round_mask( \
817  (__v4di)(__m256i)(A), (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)))
818 
819 #define _mm256_mask_cvt_roundepi64_ps(W, U, A, R) \
820  ((__m128)__builtin_ia32_vcvtqq2ps256_round_mask( \
821  (__v4di)(__m256i)(A), (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))
822 
823 #define _mm256_maskz_cvt_roundepi64_ps(U, A, R) \
824  ((__m128)__builtin_ia32_vcvtqq2ps256_round_mask((__v4di)(__m256i)(A), \
825  (__v4sf)_mm_setzero_ps(), \
826  (__mmask8)(U), (int)(R)))
827 
828 #define _mm256_cvtt_roundpd_epi32(A, R) \
829  ((__m128i)__builtin_ia32_vcvttpd2dq256_round_mask( \
830  (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)-1, \
831  (int)(R)))
832 
833 #define _mm256_mask_cvtt_roundpd_epi32(W, U, A, R) \
834  ((__m128i)__builtin_ia32_vcvttpd2dq256_round_mask( \
835  (__v4df)(__m256d)(A), (__v4si)(__m128i)(W), (__mmask8)(U), (int)(R)))
836 
837 #define _mm256_maskz_cvtt_roundpd_epi32(U, A, R) \
838  ((__m128i)__builtin_ia32_vcvttpd2dq256_round_mask( \
839  (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)(U), \
840  (int)(R)))
841 
842 #define _mm256_cvtt_roundpd_epi64(A, R) \
843  ((__m256i)__builtin_ia32_vcvttpd2qq256_round_mask( \
844  (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1, \
845  (int)(R)))
846 
847 #define _mm256_mask_cvtt_roundpd_epi64(W, U, A, R) \
848  ((__m256i)__builtin_ia32_vcvttpd2qq256_round_mask( \
849  (__v4df)(__m256d)(A), (__v4di)(__m256i)(W), (__mmask8)(U), (int)(R)))
850 
851 #define _mm256_maskz_cvtt_roundpd_epi64(U, A, R) \
852  ((__m256i)__builtin_ia32_vcvttpd2qq256_round_mask( \
853  (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), \
854  (int)(R)))
855 
856 #define _mm256_cvtt_roundpd_epu32(A, R) \
857  ((__m128i)__builtin_ia32_vcvttpd2udq256_round_mask( \
858  (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)-1, \
859  (int)(R)))
860 
861 #define _mm256_mask_cvtt_roundpd_epu32(W, U, A, R) \
862  ((__m128i)__builtin_ia32_vcvttpd2udq256_round_mask( \
863  (__v4df)(__m256d)(A), (__v4su)(__m128i)(W), (__mmask8)(U), (int)(R)))
864 
865 #define _mm256_maskz_cvtt_roundpd_epu32(U, A, R) \
866  ((__m128i)__builtin_ia32_vcvttpd2udq256_round_mask( \
867  (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)(U), \
868  (int)(R)))
869 
870 #define _mm256_cvtt_roundpd_epu64(A, R) \
871  ((__m256i)__builtin_ia32_vcvttpd2uqq256_round_mask( \
872  (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1, \
873  (int)(R)))
874 
875 #define _mm256_mask_cvtt_roundpd_epu64(W, U, A, R) \
876  ((__m256i)__builtin_ia32_vcvttpd2uqq256_round_mask( \
877  (__v4df)(__m256d)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R)))
878 
879 #define _mm256_maskz_cvtt_roundpd_epu64(U, A, R) \
880  ((__m256i)__builtin_ia32_vcvttpd2uqq256_round_mask( \
881  (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), \
882  (int)(R)))
883 
884 #define _mm256_cvtt_roundph_epi32(A, R) \
885  ((__m256i)__builtin_ia32_vcvttph2dq256_round_mask( \
886  (__v8hf)(A), (__v8si)_mm256_undefined_si256(), (__mmask8)(-1), \
887  (int)(R)))
888 
889 #define _mm256_mask_cvtt_roundph_epi32(W, U, A, R) \
890  ((__m256i)__builtin_ia32_vcvttph2dq256_round_mask((__v8hf)(A), (__v8si)(W), \
891  (__mmask8)(U), (int)(R)))
892 
893 #define _mm256_maskz_cvtt_roundph_epi32(U, A, R) \
894  ((__m256i)__builtin_ia32_vcvttph2dq256_round_mask( \
895  (__v8hf)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))
896 
897 #define _mm256_cvtt_roundph_epi64(A, R) \
898  ((__m256i)__builtin_ia32_vcvttph2qq256_round_mask( \
899  (__v8hf)(A), (__v4di)_mm256_undefined_si256(), (__mmask8)(-1), \
900  (int)(R)))
901 
902 #define _mm256_mask_cvtt_roundph_epi64(W, U, A, R) \
903  ((__m256i)__builtin_ia32_vcvttph2qq256_round_mask((__v8hf)(A), (__v4di)(W), \
904  (__mmask8)(U), (int)(R)))
905 
906 #define _mm256_maskz_cvtt_roundph_epi64(U, A, R) \
907  ((__m256i)__builtin_ia32_vcvttph2qq256_round_mask( \
908  (__v8hf)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))
909 
910 #define _mm256_cvtt_roundph_epu32(A, R) \
911  ((__m256i)__builtin_ia32_vcvttph2udq256_round_mask( \
912  (__v8hf)(A), (__v8su)_mm256_undefined_si256(), (__mmask8)(-1), \
913  (int)(R)))
914 
915 #define _mm256_mask_cvtt_roundph_epu32(W, U, A, R) \
916  ((__m256i)__builtin_ia32_vcvttph2udq256_round_mask((__v8hf)(A), (__v8su)(W), \
917  (__mmask8)(U), (int)(R)))
918 
919 #define _mm256_maskz_cvtt_roundph_epu32(U, A, R) \
920  ((__m256i)__builtin_ia32_vcvttph2udq256_round_mask( \
921  (__v8hf)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))
922 
923 #define _mm256_cvtt_roundph_epu64(A, R) \
924  ((__m256i)__builtin_ia32_vcvttph2uqq256_round_mask( \
925  (__v8hf)(A), (__v4du)_mm256_undefined_si256(), (__mmask8)(-1), \
926  (int)(R)))
927 
928 #define _mm256_mask_cvtt_roundph_epu64(W, U, A, R) \
929  ((__m256i)__builtin_ia32_vcvttph2uqq256_round_mask((__v8hf)(A), (__v4du)(W), \
930  (__mmask8)(U), (int)(R)))
931 
932 #define _mm256_maskz_cvtt_roundph_epu64(U, A, R) \
933  ((__m256i)__builtin_ia32_vcvttph2uqq256_round_mask( \
934  (__v8hf)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))
935 
936 #define _mm256_cvtt_roundph_epu16(A, R) \
937  ((__m256i)__builtin_ia32_vcvttph2uw256_round_mask( \
938  (__v16hf)(A), (__v16hu)_mm256_undefined_si256(), (__mmask16)(-1), \
939  (int)(R)))
940 
941 #define _mm256_mask_cvtt_roundph_epu16(W, U, A, R) \
942  ((__m256i)__builtin_ia32_vcvttph2uw256_round_mask( \
943  (__v16hf)(A), (__v16hu)(W), (__mmask16)(U), (int)(R)))
944 
945 #define _mm256_maskz_cvtt_roundph_epu16(U, A, R) \
946  ((__m256i)__builtin_ia32_vcvttph2uw256_round_mask( \
947  (__v16hf)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)(U), \
948  (int)(R)))
949 
950 #define _mm256_cvtt_roundph_epi16(A, R) \
951  ((__m256i)__builtin_ia32_vcvttph2w256_round_mask( \
952  (__v16hf)(A), (__v16hi)_mm256_undefined_si256(), (__mmask16)(-1), \
953  (int)(R)))
954 
955 #define _mm256_mask_cvtt_roundph_epi16(W, U, A, R) \
956  ((__m256i)__builtin_ia32_vcvttph2w256_round_mask((__v16hf)(A), (__v16hi)(W), \
957  (__mmask16)(U), (int)(R)))
958 
959 #define _mm256_maskz_cvtt_roundph_epi16(U, A, R) \
960  ((__m256i)__builtin_ia32_vcvttph2w256_round_mask( \
961  (__v16hf)(A), (__v16hi)_mm256_setzero_si256(), (__mmask16)(U), \
962  (int)(R)))
963 
964 #define _mm256_cvtt_roundps_epi32(A, R) \
965  ((__m256i)__builtin_ia32_vcvttps2dq256_round_mask( \
966  (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)-1, \
967  (int)(R)))
968 
969 #define _mm256_mask_cvtt_roundps_epi32(W, U, A, R) \
970  ((__m256i)__builtin_ia32_vcvttps2dq256_round_mask( \
971  (__v8sf)(__m256)(A), (__v8si)(__m256i)(W), (__mmask8)(U), (int)(R)))
972 
973 #define _mm256_maskz_cvtt_roundps_epi32(U, A, R) \
974  ((__m256i)__builtin_ia32_vcvttps2dq256_round_mask( \
975  (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), \
976  (int)(R)))
977 
978 #define _mm256_cvtt_roundps_epi64(A, R) \
979  ((__m256i)__builtin_ia32_vcvttps2qq256_round_mask( \
980  (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1, \
981  (int)(R)))
982 
983 #define _mm256_mask_cvtt_roundps_epi64(W, U, A, R) \
984  ((__m256i)__builtin_ia32_vcvttps2qq256_round_mask( \
985  (__v4sf)(__m128)(A), (__v4di)(__m256i)(W), (__mmask8)(U), (int)(R)))
986 
987 #define _mm256_maskz_cvtt_roundps_epi64(U, A, R) \
988  ((__m256i)__builtin_ia32_vcvttps2qq256_round_mask( \
989  (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), \
990  (int)(R)))
991 
992 #define _mm256_cvtt_roundps_epu32(A, R) \
993  ((__m256i)__builtin_ia32_vcvttps2udq256_round_mask( \
994  (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1, \
995  (int)(R)))
996 
997 #define _mm256_mask_cvtt_roundps_epu32(W, U, A, R) \
998  ((__m256i)__builtin_ia32_vcvttps2udq256_round_mask( \
999  (__v8sf)(__m256)(A), (__v8su)(__m256i)(W), (__mmask8)(U), (int)(R)))
1000 
1001 #define _mm256_maskz_cvtt_roundps_epu32(U, A, R) \
1002  ((__m256i)__builtin_ia32_vcvttps2udq256_round_mask( \
1003  (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U), \
1004  (int)(R)))
1005 
1006 #define _mm256_cvtt_roundps_epu64(A, R) \
1007  ((__m256i)__builtin_ia32_vcvttps2uqq256_round_mask( \
1008  (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1, \
1009  (int)(R)))
1010 
1011 #define _mm256_mask_cvtt_roundps_epu64(W, U, A, R) \
1012  ((__m256i)__builtin_ia32_vcvttps2uqq256_round_mask( \
1013  (__v4sf)(__m128)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R)))
1014 
1015 #define _mm256_maskz_cvtt_roundps_epu64(U, A, R) \
1016  ((__m256i)__builtin_ia32_vcvttps2uqq256_round_mask( \
1017  (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), \
1018  (int)(R)))
1019 
1020 #define _mm256_cvt_roundepu32_ph(A, R) \
1021  ((__m128h)__builtin_ia32_vcvtudq2ph256_round_mask( \
1022  (__v8su)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
1023 
1024 #define _mm256_mask_cvt_roundepu32_ph(W, U, A, R) \
1025  ((__m128h)__builtin_ia32_vcvtudq2ph256_round_mask((__v8su)(A), (__v8hf)(W), \
1026  (__mmask8)(U), (int)(R)))
1027 
1028 #define _mm256_maskz_cvt_roundepu32_ph(U, A, R) \
1029  ((__m128h)__builtin_ia32_vcvtudq2ph256_round_mask( \
1030  (__v8su)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
1031 
1032 #define _mm256_cvt_roundepu32_ps(A, R) \
1033  ((__m256)__builtin_ia32_vcvtudq2ps256_round_mask( \
1034  (__v8su)(__m256i)(A), (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, \
1035  (int)(R)))
1036 
1037 #define _mm256_mask_cvt_roundepu32_ps(W, U, A, R) \
1038  ((__m256)__builtin_ia32_vcvtudq2ps256_round_mask( \
1039  (__v8su)(__m256i)(A), (__v8sf)(__m256)(W), (__mmask8)(U), (int)(R)))
1040 
1041 #define _mm256_maskz_cvt_roundepu32_ps(U, A, R) \
1042  ((__m256)__builtin_ia32_vcvtudq2ps256_round_mask( \
1043  (__v8su)(__m256i)(A), (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), \
1044  (int)(R)))
1045 
1046 #define _mm256_cvt_roundepu64_pd(A, R) \
1047  ((__m256d)__builtin_ia32_vcvtuqq2pd256_round_mask( \
1048  (__v4du)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)-1, \
1049  (int)(R)))
1050 
1051 #define _mm256_mask_cvt_roundepu64_pd(W, U, A, R) \
1052  ((__m256d)__builtin_ia32_vcvtuqq2pd256_round_mask( \
1053  (__v4du)(__m256i)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))
1054 
1055 #define _mm256_maskz_cvt_roundepu64_pd(U, A, R) \
1056  ((__m256d)__builtin_ia32_vcvtuqq2pd256_round_mask( \
1057  (__v4du)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U), \
1058  (int)(R)))
1059 
1060 #define _mm256_cvt_roundepu64_ph(A, R) \
1061  ((__m128h)__builtin_ia32_vcvtuqq2ph256_round_mask( \
1062  (__v4du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
1063 
1064 #define _mm256_mask_cvt_roundepu64_ph(W, U, A, R) \
1065  ((__m128h)__builtin_ia32_vcvtuqq2ph256_round_mask((__v4du)(A), (__v8hf)(W), \
1066  (__mmask8)(U), (int)(R)))
1067 
1068 #define _mm256_maskz_cvt_roundepu64_ph(U, A, R) \
1069  ((__m128h)__builtin_ia32_vcvtuqq2ph256_round_mask( \
1070  (__v4du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
1071 
1072 #define _mm256_cvt_roundepu64_ps(A, R) \
1073  ((__m128)__builtin_ia32_vcvtuqq2ps256_round_mask( \
1074  (__v4du)(__m256i)(A), (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)))
1075 
1076 #define _mm256_mask_cvt_roundepu64_ps(W, U, A, R) \
1077  ((__m128)__builtin_ia32_vcvtuqq2ps256_round_mask( \
1078  (__v4du)(__m256i)(A), (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))
1079 
1080 #define _mm256_maskz_cvt_roundepu64_ps(U, A, R) \
1081  ((__m128)__builtin_ia32_vcvtuqq2ps256_round_mask((__v4du)(__m256i)(A), \
1082  (__v4sf)_mm_setzero_ps(), \
1083  (__mmask8)(U), (int)(R)))
1084 
1085 #define _mm256_cvt_roundepu16_ph(A, R) \
1086  ((__m256h)__builtin_ia32_vcvtuw2ph256_round_mask( \
1087  (__v16hu)(A), (__v16hf)_mm256_undefined_ph(), (__mmask16)(-1), \
1088  (int)(R)))
1089 
1090 #define _mm256_mask_cvt_roundepu16_ph(W, U, A, R) \
1091  ((__m256h)__builtin_ia32_vcvtuw2ph256_round_mask((__v16hu)(A), (__v16hf)(W), \
1092  (__mmask16)(U), (int)(R)))
1093 
1094 #define _mm256_maskz_cvt_roundepu16_ph(U, A, R) \
1095  ((__m256h)__builtin_ia32_vcvtuw2ph256_round_mask( \
1096  (__v16hu)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1097 
1098 #define _mm256_cvt_roundepi16_ph(A, R) \
1099  ((__m256h)__builtin_ia32_vcvtw2ph256_round_mask( \
1100  (__v16hi)(A), (__v16hf)_mm256_undefined_ph(), (__mmask16)(-1), \
1101  (int)(R)))
1102 
1103 #define _mm256_mask_cvt_roundepi16_ph(W, U, A, R) \
1104  ((__m256h)__builtin_ia32_vcvtw2ph256_round_mask((__v16hi)(A), (__v16hf)(W), \
1105  (__mmask16)(U), (int)(R)))
1106 
1107 #define _mm256_maskz_cvt_roundepi16_ph(U, A, R) \
1108  ((__m256h)__builtin_ia32_vcvtw2ph256_round_mask( \
1109  (__v16hi)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1110 
1111 #define _mm256_div_round_pd(A, B, R) \
1112  ((__m256d)__builtin_ia32_vdivpd256_round((__v4df)(__m256d)(A), \
1113  (__v4df)(__m256d)(B), (int)(R)))
1114 
1115 #define _mm256_mask_div_round_pd(W, U, A, B, R) \
1116  ((__m256d)__builtin_ia32_selectpd_256( \
1117  (__mmask8)(U), (__v4df)_mm256_div_round_pd((A), (B), (R)), \
1118  (__v4df)(__m256d)(W)))
1119 
1120 #define _mm256_maskz_div_round_pd(U, A, B, R) \
1121  ((__m256d)__builtin_ia32_selectpd_256( \
1122  (__mmask8)(U), (__v4df)_mm256_div_round_pd((A), (B), (R)), \
1123  (__v4df)_mm256_setzero_pd()))
1124 
1125 #define _mm256_div_round_ph(A, B, R) \
1126  ((__m256h)__builtin_ia32_vdivph256_round((__v16hf)(__m256h)(A), \
1127  (__v16hf)(__m256h)(B), (int)(R)))
1128 
1129 #define _mm256_mask_div_round_ph(W, U, A, B, R) \
1130  ((__m256h)__builtin_ia32_selectph_256( \
1131  (__mmask16)(U), (__v16hf)_mm256_div_round_ph((A), (B), (R)), \
1132  (__v16hf)(__m256h)(W)))
1133 
1134 #define _mm256_maskz_div_round_ph(U, A, B, R) \
1135  ((__m256h)__builtin_ia32_selectph_256( \
1136  (__mmask16)(U), (__v16hf)_mm256_div_round_ph((A), (B), (R)), \
1137  (__v16hf)_mm256_setzero_ph()))
1138 
1139 #define _mm256_div_round_ps(A, B, R) \
1140  ((__m256)__builtin_ia32_vdivps256_round((__v8sf)(__m256)(A), \
1141  (__v8sf)(__m256)(B), (int)(R)))
1142 
1143 #define _mm256_mask_div_round_ps(W, U, A, B, R) \
1144  ((__m256)__builtin_ia32_selectps_256( \
1145  (__mmask8)(U), (__v8sf)_mm256_div_round_ps((A), (B), (R)), \
1146  (__v8sf)(__m256)(W)))
1147 
1148 #define _mm256_maskz_div_round_ps(U, A, B, R) \
1149  ((__m256)__builtin_ia32_selectps_256( \
1150  (__mmask8)(U), (__v8sf)_mm256_div_round_ps((A), (B), (R)), \
1151  (__v8sf)_mm256_setzero_ps()))
1152 
1153 #define _mm256_fcmadd_round_pch(A, B, C, R) \
1154  ((__m256h)__builtin_ia32_vfcmaddcph256_round_mask3( \
1155  (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
1156  (__mmask8)-1, (int)(R)))
1157 
1158 #define _mm256_mask_fcmadd_round_pch(A, U, B, C, R) \
1159  ((__m256h)__builtin_ia32_vfcmaddcph256_round_mask( \
1160  (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
1161  (__mmask8)(U), (int)(R)))
1162 
1163 #define _mm256_mask3_fcmadd_round_pch(A, B, C, U, R) \
1164  ((__m256h)__builtin_ia32_vfcmaddcph256_round_mask3( \
1165  (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
1166  (__mmask8)(U), (int)(R)))
1167 
1168 #define _mm256_maskz_fcmadd_round_pch(U, A, B, C, R) \
1169  ((__m256h)__builtin_ia32_vfcmaddcph256_round_maskz( \
1170  (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
1171  (__mmask8)(U), (int)(R)))
1172 
1173 #define _mm256_cmul_round_pch(A, B, R) \
1174  ((__m256h)__builtin_ia32_vfcmulcph256_round_mask( \
1175  (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), \
1176  (__v8sf)(__m256h)_mm256_undefined_ph(), (__mmask8)-1, (int)(R)))
1177 
1178 #define _mm256_mask_cmul_round_pch(W, U, A, B, R) \
1179  ((__m256h)__builtin_ia32_vfcmulcph256_round_mask( \
1180  (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(W), \
1181  (__mmask8)(U), (int)(R)))
1182 
1183 #define _mm256_maskz_cmul_round_pch(U, A, B, R) \
1184  ((__m256h)__builtin_ia32_vfcmulcph256_round_mask( \
1185  (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), \
1186  (__v8sf)(__m256h)_mm256_setzero_ph(), (__mmask8)(U), (int)(R)))
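/* Usage sketch (illustrative): the _pch forms treat a __m256h as eight
 * complex values stored as (real, imag) FP16 pairs; the conjugating variants
 * multiply by the complex conjugate of the second operand. Hypothetical
 * names.
 *
 *   __m256h prod = _mm256_cmul_round_pch(
 *       a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 */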
1187 
1188 #define _mm256_fixupimm_round_pd(A, B, C, imm, R) \
1189  ((__m256d)__builtin_ia32_vfixupimmpd256_round_mask( \
1190  (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4di)(__m256i)(C), \
1191  (int)(imm), (__mmask8)-1, (int)(R)))
1192 
1193 #define _mm256_mask_fixupimm_round_pd(A, U, B, C, imm, R) \
1194  ((__m256d)__builtin_ia32_vfixupimmpd256_round_mask( \
1195  (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4di)(__m256i)(C), \
1196  (int)(imm), (__mmask8)(U), (int)(R)))
1197 
1198 #define _mm256_maskz_fixupimm_round_pd(U, A, B, C, imm, R) \
1199  ((__m256d)__builtin_ia32_vfixupimmpd256_round_maskz( \
1200  (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4di)(__m256i)(C), \
1201  (int)(imm), (__mmask8)(U), (int)(R)))
1202 
1203 #define _mm256_fixupimm_round_ps(A, B, C, imm, R) \
1204  ((__m256)__builtin_ia32_vfixupimmps256_round_mask( \
1205  (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8si)(__m256i)(C), \
1206  (int)(imm), (__mmask8)-1, (int)(R)))
1207 
1208 #define _mm256_mask_fixupimm_round_ps(A, U, B, C, imm, R) \
1209  ((__m256)__builtin_ia32_vfixupimmps256_round_mask( \
1210  (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8si)(__m256i)(C), \
1211  (int)(imm), (__mmask8)(U), (int)(R)))
1212 
1213 #define _mm256_maskz_fixupimm_round_ps(U, A, B, C, imm, R) \
1214  ((__m256)__builtin_ia32_vfixupimmps256_round_maskz( \
1215  (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8si)(__m256i)(C), \
1216  (int)(imm), (__mmask8)(U), (int)(R)))
1217 
1218 #define _mm256_fmadd_round_pd(A, B, C, R) \
1219  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
1220  (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1221  (__mmask8)-1, (int)(R)))
1222 
1223 #define _mm256_mask_fmadd_round_pd(A, U, B, C, R) \
1224  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
1225  (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1226  (__mmask8)(U), (int)(R)))
1227 
1228 #define _mm256_mask3_fmadd_round_pd(A, B, C, U, R) \
1229  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask3( \
1230  (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1231  (__mmask8)(U), (int)(R)))
1232 
1233 #define _mm256_maskz_fmadd_round_pd(U, A, B, C, R) \
1234  ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \
1235  (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1236  (__mmask8)(U), (int)(R)))
1237 
1238 #define _mm256_fmsub_round_pd(A, B, C, R) \
1239  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
1240  (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
1241  (__mmask8)-1, (int)(R)))
1242 
1243 #define _mm256_mask_fmsub_round_pd(A, U, B, C, R) \
1244  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
1245  (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
1246  (__mmask8)(U), (int)(R)))
1247 
1248 #define _mm256_maskz_fmsub_round_pd(U, A, B, C, R) \
1249  ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \
1250  (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
1251  (__mmask8)(U), (int)(R)))
1252 
1253 #define _mm256_fnmadd_round_pd(A, B, C, R) \
1254  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
1255  -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1256  (__mmask8)-1, (int)(R)))
1257 
1258 #define _mm256_mask3_fnmadd_round_pd(A, B, C, U, R) \
1259  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask3( \
1260  -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1261  (__mmask8)(U), (int)(R)))
1262 
1263 #define _mm256_maskz_fnmadd_round_pd(U, A, B, C, R) \
1264  ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \
1265  -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1266  (__mmask8)(U), (int)(R)))
1267 
1268 #define _mm256_fnmsub_round_pd(A, B, C, R) \
1269  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
1270  -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
1271  (__mmask8)-1, (int)(R)))
1272 
1273 #define _mm256_maskz_fnmsub_round_pd(U, A, B, C, R) \
1274  ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \
1275  -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
1276  (__mmask8)(U), (int)(R)))
1277 
1278 #define _mm256_fmadd_round_ph(A, B, C, R) \
1279  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
1280  (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1281  (__mmask16)-1, (int)(R)))
1282 
1283 #define _mm256_mask_fmadd_round_ph(A, U, B, C, R) \
1284  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
1285  (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1286  (__mmask16)(U), (int)(R)))
1287 
1288 #define _mm256_mask3_fmadd_round_ph(A, B, C, U, R) \
1289  ((__m256h)__builtin_ia32_vfmaddph256_round_mask3( \
1290  (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1291  (__mmask16)(U), (int)(R)))
1292 
1293 #define _mm256_maskz_fmadd_round_ph(U, A, B, C, R) \
1294  ((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \
1295  (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1296  (__mmask16)(U), (int)(R)))
1297 
1298 #define _mm256_fmsub_round_ph(A, B, C, R) \
1299  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
1300  (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
1301  (__mmask16)-1, (int)(R)))
1302 
1303 #define _mm256_mask_fmsub_round_ph(A, U, B, C, R) \
1304  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
1305  (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
1306  (__mmask16)(U), (int)(R)))
1307 
1308 #define _mm256_maskz_fmsub_round_ph(U, A, B, C, R) \
1309  ((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \
1310  (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
1311  (__mmask16)(U), (int)(R)))
1312 
1313 #define _mm256_fnmadd_round_ph(A, B, C, R) \
1314  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
1315  (__v16hf)(__m256h)(A), -(__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1316  (__mmask16)-1, (int)(R)))
1317 
1318 #define _mm256_mask3_fnmadd_round_ph(A, B, C, U, R) \
1319  ((__m256h)__builtin_ia32_vfmaddph256_round_mask3( \
1320  -(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1321  (__mmask16)(U), (int)(R)))
1322 
1323 #define _mm256_maskz_fnmadd_round_ph(U, A, B, C, R) \
1324  ((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \
1325  -(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1326  (__mmask16)(U), (int)(R)))
1327 
1328 #define _mm256_fnmsub_round_ph(A, B, C, R) \
1329  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
1330  (__v16hf)(__m256h)(A), -(__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
1331  (__mmask16)-1, (int)(R)))
1332 
1333 #define _mm256_maskz_fnmsub_round_ph(U, A, B, C, R) \
1334  ((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \
1335  -(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
1336  (__mmask16)(U), (int)(R)))
1337 
1338 #define _mm256_fmadd_round_ps(A, B, C, R) \
1339  ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
1340  (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1341  (__mmask8)-1, (int)(R)))
1342 
1343 #define _mm256_mask_fmadd_round_ps(A, U, B, C, R) \
1344  ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
1345  (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1346  (__mmask8)(U), (int)(R)))
1347 
1348 #define _mm256_mask3_fmadd_round_ps(A, B, C, U, R) \
1349  ((__m256)__builtin_ia32_vfmaddps256_round_mask3( \
1350  (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1351  (__mmask8)(U), (int)(R)))
1352 
1353 #define _mm256_maskz_fmadd_round_ps(U, A, B, C, R) \
1354  ((__m256)__builtin_ia32_vfmaddps256_round_maskz( \
1355  (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1356  (__mmask8)(U), (int)(R)))
1357 
1358 #define _mm256_fmsub_round_ps(A, B, C, R) \
1359  ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
1360  (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
1361  (__mmask8)-1, (int)(R)))
1362 
1363 #define _mm256_mask_fmsub_round_ps(A, U, B, C, R) \
1364  ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
1365  (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
1366  (__mmask8)(U), (int)(R)))
1367 
1368 #define _mm256_maskz_fmsub_round_ps(U, A, B, C, R) \
1369  ((__m256)__builtin_ia32_vfmaddps256_round_maskz( \
1370  (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
1371  (__mmask8)(U), (int)(R)))
1372 
1373 #define _mm256_fnmadd_round_ps(A, B, C, R) \
1374  ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
1375  (__v8sf)(__m256)(A), -(__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1376  (__mmask8)-1, (int)(R)))
1377 
1378 #define _mm256_mask3_fnmadd_round_ps(A, B, C, U, R) \
1379  ((__m256)__builtin_ia32_vfmaddps256_round_mask3( \
1380  -(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1381  (__mmask8)(U), (int)(R)))
1382 
1383 #define _mm256_maskz_fnmadd_round_ps(U, A, B, C, R) \
1384  ((__m256)__builtin_ia32_vfmaddps256_round_maskz( \
1385  -(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1386  (__mmask8)(U), (int)(R)))
1387 
1388 #define _mm256_fnmsub_round_ps(A, B, C, R) \
1389  ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
1390  (__v8sf)(__m256)(A), -(__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
1391  (__mmask8)-1, (int)(R)))
1392 
1393 #define _mm256_maskz_fnmsub_round_ps(U, A, B, C, R) \
1394  ((__m256)__builtin_ia32_vfmaddps256_round_maskz( \
1395  -(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
1396  (__mmask8)(U), (int)(R)))
1397 
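/* Illustrative usage sketch: the *_round FMA forms take the rounding mode as
 * a compile-time constant instead of reading it from MXCSR. A minimal
 * example, assuming AVX10.2 256-bit support and <immintrin.h> (the helper
 * name is hypothetical):
 *
 *   static inline __m256 fma8_rne(__m256 a, __m256 b, __m256 c) {
 *     return _mm256_fmadd_round_ps(
 *         a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 *   }
 */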
1398 #define _mm256_fmadd_round_pch(A, B, C, R) \
1399  ((__m256h)__builtin_ia32_vfmaddcph256_round_mask3( \
1400  (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
1401  (__mmask8)-1, (int)(R)))
1402 
1403 #define _mm256_mask_fmadd_round_pch(A, U, B, C, R) \
1404  ((__m256h)__builtin_ia32_vfmaddcph256_round_mask( \
1405  (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
1406  (__mmask8)(U), (int)(R)))
1407 
1408 #define _mm256_mask3_fmadd_round_pch(A, B, C, U, R) \
1409  ((__m256h)__builtin_ia32_vfmaddcph256_round_mask3( \
1410  (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
1411  (__mmask8)(U), (int)(R)))
1412 
1413 #define _mm256_maskz_fmadd_round_pch(U, A, B, C, R) \
1414  ((__m256h)__builtin_ia32_vfmaddcph256_round_maskz( \
1415  (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
1416  (__mmask8)(U), (int)(R)))
1417 
1418 #define _mm256_fmaddsub_round_pd(A, B, C, R) \
1419  ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \
1420  (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1421  (__mmask8)-1, (int)(R)))
1422 
1423 #define _mm256_mask_fmaddsub_round_pd(A, U, B, C, R) \
1424  ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \
1425  (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1426  (__mmask8)(U), (int)(R)))
1427 
1428 #define _mm256_mask3_fmaddsub_round_pd(A, B, C, U, R) \
1429  ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask3( \
1430  (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1431  (__mmask8)(U), (int)(R)))
1432 
1433 #define _mm256_maskz_fmaddsub_round_pd(U, A, B, C, R) \
1434  ((__m256d)__builtin_ia32_vfmaddsubpd256_round_maskz( \
1435  (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1436  (__mmask8)(U), (int)(R)))
1437 
1438 #define _mm256_fmsubadd_round_pd(A, B, C, R) \
1439  ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \
1440  (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
1441  (__mmask8)-1, (int)(R)))
1442 
1443 #define _mm256_mask_fmsubadd_round_pd(A, U, B, C, R) \
1444  ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \
1445  (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
1446  (__mmask8)(U), (int)(R)))
1447 
1448 #define _mm256_maskz_fmsubadd_round_pd(U, A, B, C, R) \
1449  ((__m256d)__builtin_ia32_vfmaddsubpd256_round_maskz( \
1450  (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
1451  (__mmask8)(U), (int)(R)))
1452 
1453 #define _mm256_fmaddsub_round_ph(A, B, C, R) \
1454  ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \
1455  (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1456  (__mmask16)-1, (int)(R)))
1457 
1458 #define _mm256_mask_fmaddsub_round_ph(A, U, B, C, R) \
1459  ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \
1460  (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1461  (__mmask16)(U), (int)(R)))
1462 
1463 #define _mm256_mask3_fmaddsub_round_ph(A, B, C, U, R) \
1464  ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask3( \
1465  (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1466  (__mmask16)(U), (int)(R)))
1467 
1468 #define _mm256_maskz_fmaddsub_round_ph(U, A, B, C, R) \
1469  ((__m256h)__builtin_ia32_vfmaddsubph256_round_maskz( \
1470  (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1471  (__mmask16)(U), (int)(R)))
1472 
1473 #define _mm256_fmsubadd_round_ph(A, B, C, R) \
1474  ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \
1475  (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
1476  (__mmask16)-1, (int)(R)))
1477 
1478 #define _mm256_mask_fmsubadd_round_ph(A, U, B, C, R) \
1479  ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \
1480  (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
1481  (__mmask16)(U), (int)(R)))
1482 
1483 #define _mm256_maskz_fmsubadd_round_ph(U, A, B, C, R) \
1484  ((__m256h)__builtin_ia32_vfmaddsubph256_round_maskz( \
1485  (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
1486  (__mmask16)(U), (int)(R)))
1487 
1488 #define _mm256_fmaddsub_round_ps(A, B, C, R) \
1489  ((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \
1490  (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1491  (__mmask8)-1, (int)(R)))
1492 
1493 #define _mm256_mask_fmaddsub_round_ps(A, U, B, C, R) \
1494  ((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \
1495  (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1496  (__mmask8)(U), (int)(R)))
1497 
1498 #define _mm256_mask3_fmaddsub_round_ps(A, B, C, U, R) \
1499  ((__m256)__builtin_ia32_vfmaddsubps256_round_mask3( \
1500  (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1501  (__mmask8)(U), (int)(R)))
1502 
1503 #define _mm256_maskz_fmaddsub_round_ps(U, A, B, C, R) \
1504  ((__m256)__builtin_ia32_vfmaddsubps256_round_maskz( \
1505  (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1506  (__mmask8)(U), (int)(R)))
1507 
1508 #define _mm256_fmsubadd_round_ps(A, B, C, R) \
1509  ((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \
1510  (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
1511  (__mmask8)-1, (int)(R)))
1512 
1513 #define _mm256_mask_fmsubadd_round_ps(A, U, B, C, R) \
1514  ((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \
1515  (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
1516  (__mmask8)(U), (int)(R)))
1517 
1518 #define _mm256_maskz_fmsubadd_round_ps(U, A, B, C, R) \
1519  ((__m256)__builtin_ia32_vfmaddsubps256_round_maskz( \
1520  (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
1521  (__mmask8)(U), (int)(R)))
1522 #define _mm256_mask3_fmsub_round_pd(A, B, C, U, R) \
1523  ((__m256d)__builtin_ia32_vfmsubpd256_round_mask3( \
1524  (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1525  (__mmask8)(U), (int)(R)))
1526 
1527 #define _mm256_mask3_fmsubadd_round_pd(A, B, C, U, R) \
1528  ((__m256d)__builtin_ia32_vfmsubaddpd256_round_mask3( \
1529  (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1530  (__mmask8)(U), (int)(R)))
1531 
1532 #define _mm256_mask_fnmadd_round_pd(A, U, B, C, R) \
1533  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
1534  (__v4df)(__m256d)(A), -(__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1535  (__mmask8)(U), (int)(R)))
1536 
1537 #define _mm256_mask_fnmsub_round_pd(A, U, B, C, R) \
1538  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
1539  (__v4df)(__m256d)(A), -(__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
1540  (__mmask8)(U), (int)(R)))
1541 
1542 #define _mm256_mask3_fnmsub_round_pd(A, B, C, U, R) \
1543  ((__m256d)__builtin_ia32_vfmsubpd256_round_mask3( \
1544  -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1545  (__mmask8)(U), (int)(R)))
1546 
1547 #define _mm256_mask3_fmsub_round_ph(A, B, C, U, R) \
1548  ((__m256h)__builtin_ia32_vfmsubph256_round_mask3( \
1549  (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1550  (__mmask16)(U), (int)(R)))
1551 
1552 #define _mm256_mask3_fmsubadd_round_ph(A, B, C, U, R) \
1553  ((__m256h)__builtin_ia32_vfmsubaddph256_round_mask3( \
1554  (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1555  (__mmask16)(U), (int)(R)))
1556 
1557 #define _mm256_mask_fnmadd_round_ph(A, U, B, C, R) \
1558  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
1559  (__v16hf)(__m256h)(A), -(__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1560  (__mmask16)(U), (int)(R)))
1561 
1562 #define _mm256_mask_fnmsub_round_ph(A, U, B, C, R) \
1563  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
1564  (__v16hf)(__m256h)(A), -(__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
1565  (__mmask16)(U), (int)(R)))
1566 
1567 #define _mm256_mask3_fnmsub_round_ph(A, B, C, U, R) \
1568  ((__m256h)__builtin_ia32_vfmsubph256_round_mask3( \
1569  -(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1570  (__mmask16)(U), (int)(R)))
1571 
1572 #define _mm256_mask3_fmsub_round_ps(A, B, C, U, R) \
1573  ((__m256)__builtin_ia32_vfmsubps256_round_mask3( \
1574  (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1575  (__mmask8)(U), (int)(R)))
1576 
1577 #define _mm256_mask3_fmsubadd_round_ps(A, B, C, U, R) \
1578  ((__m256)__builtin_ia32_vfmsubaddps256_round_mask3( \
1579  (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1580  (__mmask8)(U), (int)(R)))
1581 
1582 #define _mm256_mask_fnmadd_round_ps(A, U, B, C, R) \
1583  ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
1584  (__v8sf)(__m256)(A), -(__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1585  (__mmask8)(U), (int)(R)))
1586 
1587 #define _mm256_mask_fnmsub_round_ps(A, U, B, C, R) \
1588  ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
1589  (__v8sf)(__m256)(A), -(__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
1590  (__mmask8)(U), (int)(R)))
1591 
1592 #define _mm256_mask3_fnmsub_round_ps(A, B, C, U, R) \
1593  ((__m256)__builtin_ia32_vfmsubps256_round_mask3( \
1594  -(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1595  (__mmask8)(U), (int)(R)))
1596 
1597 #define _mm256_mul_round_pch(A, B, R) \
1598  ((__m256h)__builtin_ia32_vfmulcph256_round_mask( \
1599  (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), \
1600  (__v8sf)(__m256h)_mm256_undefined_ph(), (__mmask8)-1, (int)(R)))
1601 
1602 #define _mm256_mask_mul_round_pch(W, U, A, B, R) \
1603  ((__m256h)__builtin_ia32_vfmulcph256_round_mask( \
1604  (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(W), \
1605  (__mmask8)(U), (int)(R)))
1606 
1607 #define _mm256_maskz_mul_round_pch(U, A, B, R) \
1608  ((__m256h)__builtin_ia32_vfmulcph256_round_mask( \
1609  (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), \
1610  (__v8sf)(__m256h)_mm256_setzero_ph(), (__mmask8)(U), (int)(R)))
1611 
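/* Illustrative usage sketch: _mm256_mul_round_pch multiplies packed complex
 * FP16 values, i.e. each __m256h is viewed as 8 interleaved (real, imag)
 * _Float16 pairs, which is why the builtins cast through __v8sf. Assuming
 * AVX10.2 256-bit support:
 *
 *   __m256h prod = _mm256_mul_round_pch(
 *       a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 */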
1612 #define _mm256_getexp_round_pd(A, R) \
1613  ((__m256d)__builtin_ia32_vgetexppd256_round_mask( \
1614  (__v4df)(__m256d)(A), (__v4df)_mm256_undefined_pd(), (__mmask8)-1, \
1615  (int)(R)))
1616 
1617 #define _mm256_mask_getexp_round_pd(W, U, A, R) \
1618  ((__m256d)__builtin_ia32_vgetexppd256_round_mask( \
1619  (__v4df)(__m256d)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))
1620 
1621 #define _mm256_maskz_getexp_round_pd(U, A, R) \
1622  ((__m256d)__builtin_ia32_vgetexppd256_round_mask( \
1623  (__v4df)(__m256d)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U), \
1624  (int)(R)))
1625 
1626 #define _mm256_getexp_round_ph(A, R) \
1627  ((__m256h)__builtin_ia32_vgetexpph256_round_mask( \
1628  (__v16hf)(__m256h)(A), (__v16hf)_mm256_undefined_ph(), (__mmask16)-1, \
1629  (int)(R)))
1630 
1631 #define _mm256_mask_getexp_round_ph(W, U, A, R) \
1632  ((__m256h)__builtin_ia32_vgetexpph256_round_mask( \
1633  (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(W), (__mmask16)(U), (int)(R)))
1634 
1635 #define _mm256_maskz_getexp_round_ph(U, A, R) \
1636  ((__m256h)__builtin_ia32_vgetexpph256_round_mask( \
1637  (__v16hf)(__m256h)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), \
1638  (int)(R)))
1639 
1640 #define _mm256_getexp_round_ps(A, R) \
1641  ((__m256)__builtin_ia32_vgetexpps256_round_mask( \
1642  (__v8sf)(__m256)(A), (__v8sf)_mm256_undefined_ps(), (__mmask8)-1, \
1643  (int)(R)))
1644 
1645 #define _mm256_mask_getexp_round_ps(W, U, A, R) \
1646  ((__m256)__builtin_ia32_vgetexpps256_round_mask( \
1647  (__v8sf)(__m256)(A), (__v8sf)(__m256)(W), (__mmask8)(U), (int)(R)))
1648 
1649 #define _mm256_maskz_getexp_round_ps(U, A, R) \
1650  ((__m256)__builtin_ia32_vgetexpps256_round_mask((__v8sf)(__m256)(A), \
1651  (__v8sf)_mm256_setzero_ps(), \
1652  (__mmask8)(U), (int)(R)))
1653 
1654 #define _mm256_getmant_round_pd(A, B, C, R) \
1655  ((__m256d)__builtin_ia32_vgetmantpd256_round_mask( \
1656  (__v4df)(__m256d)(A), (int)(((C) << 2) | (B)), \
1657  (__v4df)_mm256_undefined_pd(), (__mmask8)-1, (int)(R)))
1658 
1659 #define _mm256_mask_getmant_round_pd(W, U, A, B, C, R) \
1660  ((__m256d)__builtin_ia32_vgetmantpd256_round_mask( \
1661  (__v4df)(__m256d)(A), (int)(((C) << 2) | (B)), (__v4df)(__m256d)(W), \
1662  (__mmask8)(U), (int)(R)))
1663 
1664 #define _mm256_maskz_getmant_round_pd(U, A, B, C, R) \
1665  ((__m256d)__builtin_ia32_vgetmantpd256_round_mask( \
1666  (__v4df)(__m256d)(A), (int)(((C) << 2) | (B)), \
1667  (__v4df)_mm256_setzero_pd(), (__mmask8)(U), (int)(R)))
1668 
1669 #define _mm256_getmant_round_ph(A, B, C, R) \
1670  ((__m256h)__builtin_ia32_vgetmantph256_round_mask( \
1671  (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \
1672  (__v16hf)_mm256_undefined_ph(), (__mmask16)-1, (int)(R)))
1673 
1674 #define _mm256_mask_getmant_round_ph(W, U, A, B, C, R) \
1675  ((__m256h)__builtin_ia32_vgetmantph256_round_mask( \
1676  (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), (__v16hf)(__m256h)(W), \
1677  (__mmask16)(U), (int)(R)))
1678 
1679 #define _mm256_maskz_getmant_round_ph(U, A, B, C, R) \
1680  ((__m256h)__builtin_ia32_vgetmantph256_round_mask( \
1681  (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \
1682  (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1683 
1684 #define _mm256_getmant_round_ps(A, B, C, R) \
1685  ((__m256)__builtin_ia32_vgetmantps256_round_mask( \
1686  (__v8sf)(__m256)(A), (int)(((C) << 2) | (B)), \
1687  (__v8sf)_mm256_undefined_ps(), (__mmask8)-1, (int)(R)))
1688 
1689 #define _mm256_mask_getmant_round_ps(W, U, A, B, C, R) \
1690  ((__m256)__builtin_ia32_vgetmantps256_round_mask( \
1691  (__v8sf)(__m256)(A), (int)(((C) << 2) | (B)), (__v8sf)(__m256)(W), \
1692  (__mmask8)(U), (int)(R)))
1693 
1694 #define _mm256_maskz_getmant_round_ps(U, A, B, C, R) \
1695  ((__m256)__builtin_ia32_vgetmantps256_round_mask( \
1696  (__v8sf)(__m256)(A), (int)(((C) << 2) | (B)), \
1697  (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), (int)(R)))
1698 
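/* Illustrative usage sketch: VGETEXP/VGETMANT do not round, so the final
 * argument only controls exception suppression (SAE). A minimal example that
 * splits x into a mantissa in [1, 2) and its floor(log2) exponent, assuming
 * AVX10.2 256-bit support:
 *
 *   __m256d mant = _mm256_getmant_round_pd(
 *       x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src, _MM_FROUND_NO_EXC);
 *   __m256d expo = _mm256_getexp_round_pd(x, _MM_FROUND_NO_EXC);
 */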
1699 #define _mm256_max_round_pd(A, B, R) \
1700  ((__m256d)__builtin_ia32_vmaxpd256_round((__v4df)(__m256d)(A), \
1701  (__v4df)(__m256d)(B), (int)(R)))
1702 
1703 #define _mm256_mask_max_round_pd(W, U, A, B, R) \
1704  ((__m256d)__builtin_ia32_selectpd_256( \
1705  (__mmask8)(U), (__v4df)_mm256_max_round_pd((A), (B), (R)), \
1706  (__v4df)(__m256d)(W)))
1707 
1708 #define _mm256_maskz_max_round_pd(U, A, B, R) \
1709  ((__m256d)__builtin_ia32_selectpd_256( \
1710  (__mmask8)(U), (__v4df)_mm256_max_round_pd((A), (B), (R)), \
1711  (__v4df)_mm256_setzero_pd()))
1712 
1713 #define _mm256_max_round_ph(A, B, R) \
1714  ((__m256h)__builtin_ia32_vmaxph256_round((__v16hf)(__m256h)(A), \
1715  (__v16hf)(__m256h)(B), (int)(R)))
1716 
1717 #define _mm256_mask_max_round_ph(W, U, A, B, R) \
1718  ((__m256h)__builtin_ia32_selectph_256( \
1719  (__mmask16)(U), (__v16hf)_mm256_max_round_ph((A), (B), (R)), \
1720  (__v16hf)(__m256h)(W)))
1721 
1722 #define _mm256_maskz_max_round_ph(U, A, B, R) \
1723  ((__m256h)__builtin_ia32_selectph_256( \
1724  (__mmask16)(U), (__v16hf)_mm256_max_round_ph((A), (B), (R)), \
1725  (__v16hf)_mm256_setzero_ph()))
1726 
1727 #define _mm256_max_round_ps(A, B, R) \
1728  ((__m256)__builtin_ia32_vmaxps256_round((__v8sf)(__m256)(A), \
1729  (__v8sf)(__m256)(B), (int)(R)))
1730 
1731 #define _mm256_mask_max_round_ps(W, U, A, B, R) \
1732  ((__m256)__builtin_ia32_selectps_256( \
1733  (__mmask8)(U), (__v8sf)_mm256_max_round_ps((A), (B), (R)), \
1734  (__v8sf)(__m256)(W)))
1735 
1736 #define _mm256_maskz_max_round_ps(U, A, B, R) \
1737  ((__m256)__builtin_ia32_selectps_256( \
1738  (__mmask8)(U), (__v8sf)_mm256_max_round_ps((A), (B), (R)), \
1739  (__v8sf)_mm256_setzero_ps()))
1740 
1741 #define _mm256_min_round_pd(A, B, R) \
1742  ((__m256d)__builtin_ia32_vminpd256_round((__v4df)(__m256d)(A), \
1743  (__v4df)(__m256d)(B), (int)(R)))
1744 
1745 #define _mm256_mask_min_round_pd(W, U, A, B, R) \
1746  ((__m256d)__builtin_ia32_selectpd_256( \
1747  (__mmask8)(U), (__v4df)_mm256_min_round_pd((A), (B), (R)), \
1748  (__v4df)(__m256d)(W)))
1749 
1750 #define _mm256_maskz_min_round_pd(U, A, B, R) \
1751  ((__m256d)__builtin_ia32_selectpd_256( \
1752  (__mmask8)(U), (__v4df)_mm256_min_round_pd((A), (B), (R)), \
1753  (__v4df)_mm256_setzero_pd()))
1754 
1755 #define _mm256_min_round_ph(A, B, R) \
1756  ((__m256h)__builtin_ia32_vminph256_round((__v16hf)(__m256h)(A), \
1757  (__v16hf)(__m256h)(B), (int)(R)))
1758 
1759 #define _mm256_mask_min_round_ph(W, U, A, B, R) \
1760  ((__m256h)__builtin_ia32_selectph_256( \
1761  (__mmask16)(U), (__v16hf)_mm256_min_round_ph((A), (B), (R)), \
1762  (__v16hf)(__m256h)(W)))
1763 
1764 #define _mm256_maskz_min_round_ph(U, A, B, R) \
1765  ((__m256h)__builtin_ia32_selectph_256( \
1766  (__mmask16)(U), (__v16hf)_mm256_min_round_ph((A), (B), (R)), \
1767  (__v16hf)_mm256_setzero_ph()))
1768 
1769 #define _mm256_min_round_ps(A, B, R) \
1770  ((__m256)__builtin_ia32_vminps256_round((__v8sf)(__m256)(A), \
1771  (__v8sf)(__m256)(B), (int)(R)))
1772 
1773 #define _mm256_mask_min_round_ps(W, U, A, B, R) \
1774  ((__m256)__builtin_ia32_selectps_256( \
1775  (__mmask8)(U), (__v8sf)_mm256_min_round_ps((A), (B), (R)), \
1776  (__v8sf)(__m256)(W)))
1777 
1778 #define _mm256_maskz_min_round_ps(U, A, B, R) \
1779  ((__m256)__builtin_ia32_selectps_256( \
1780  (__mmask8)(U), (__v8sf)_mm256_min_round_ps((A), (B), (R)), \
1781  (__v8sf)_mm256_setzero_ps()))
1782 
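/* Illustrative usage sketch: VMAX/VMIN accept only exception suppression
 * (SAE); the masked forms above are composed from the unmasked macro plus a
 * lane select. For example, keeping unselected lanes from a pass-through
 * vector w (mask value chosen arbitrarily), assuming AVX10.2 256-bit
 * support:
 *
 *   __m256 m = _mm256_mask_max_round_ps(w, 0x0F, a, b, _MM_FROUND_NO_EXC);
 */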
1783 #define _mm256_mul_round_pd(A, B, R) \
1784  ((__m256d)__builtin_ia32_vmulpd256_round((__v4df)(__m256d)(A), \
1785  (__v4df)(__m256d)(B), (int)(R)))
1786 
1787 #define _mm256_mask_mul_round_pd(W, U, A, B, R) \
1788  ((__m256d)__builtin_ia32_selectpd_256( \
1789  (__mmask8)(U), (__v4df)_mm256_mul_round_pd((A), (B), (R)), \
1790  (__v4df)(__m256d)(W)))
1791 
1792 #define _mm256_maskz_mul_round_pd(U, A, B, R) \
1793  ((__m256d)__builtin_ia32_selectpd_256( \
1794  (__mmask8)(U), (__v4df)_mm256_mul_round_pd((A), (B), (R)), \
1795  (__v4df)_mm256_setzero_pd()))
1796 
1797 #define _mm256_mul_round_ph(A, B, R) \
1798  ((__m256h)__builtin_ia32_vmulph256_round((__v16hf)(__m256h)(A), \
1799  (__v16hf)(__m256h)(B), (int)(R)))
1800 
1801 #define _mm256_mask_mul_round_ph(W, U, A, B, R) \
1802  ((__m256h)__builtin_ia32_selectph_256( \
1803  (__mmask16)(U), (__v16hf)_mm256_mul_round_ph((A), (B), (R)), \
1804  (__v16hf)(__m256h)(W)))
1805 
1806 #define _mm256_maskz_mul_round_ph(U, A, B, R) \
1807  ((__m256h)__builtin_ia32_selectph_256( \
1808  (__mmask16)(U), (__v16hf)_mm256_mul_round_ph((A), (B), (R)), \
1809  (__v16hf)_mm256_setzero_ph()))
1810 
1811 #define _mm256_mul_round_ps(A, B, R) \
1812  ((__m256)__builtin_ia32_vmulps256_round((__v8sf)(__m256)(A), \
1813  (__v8sf)(__m256)(B), (int)(R)))
1814 
1815 #define _mm256_mask_mul_round_ps(W, U, A, B, R) \
1816  ((__m256)__builtin_ia32_selectps_256( \
1817  (__mmask8)(U), (__v8sf)_mm256_mul_round_ps((A), (B), (R)), \
1818  (__v8sf)(__m256)(W)))
1819 
1820 #define _mm256_maskz_mul_round_ps(U, A, B, R) \
1821  ((__m256)__builtin_ia32_selectps_256( \
1822  (__mmask8)(U), (__v8sf)_mm256_mul_round_ps((A), (B), (R)), \
1823  (__v8sf)_mm256_setzero_ps()))
1824 
1825 #define _mm256_range_round_pd(A, B, C, R) \
1826  ((__m256d)__builtin_ia32_vrangepd256_round_mask( \
1827  (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
1828  (__v4df)_mm256_setzero_pd(), (__mmask8)-1, (int)(R)))
1829 
1830 #define _mm256_mask_range_round_pd(W, U, A, B, C, R) \
1831  ((__m256d)__builtin_ia32_vrangepd256_round_mask( \
1832  (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
1833  (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))
1834 
1835 #define _mm256_maskz_range_round_pd(U, A, B, C, R) \
1836  ((__m256d)__builtin_ia32_vrangepd256_round_mask( \
1837  (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
1838  (__v4df)_mm256_setzero_pd(), (__mmask8)(U), (int)(R)))
1839 
1840 #define _mm256_range_round_ps(A, B, C, R) \
1841  ((__m256)__builtin_ia32_vrangeps256_round_mask( \
1842  (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \
1843  (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, (int)(R)))
1844 
1845 #define _mm256_mask_range_round_ps(W, U, A, B, C, R) \
1846  ((__m256)__builtin_ia32_vrangeps256_round_mask( \
1847  (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)(__m256)(W), \
1848  (__mmask8)(U), (int)(R)))
1849 
1850 #define _mm256_maskz_range_round_ps(U, A, B, C, R) \
1851  ((__m256)__builtin_ia32_vrangeps256_round_mask( \
1852  (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \
1853  (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), (int)(R)))
1854 
1855 #define _mm256_reduce_round_pd(A, B, R) \
1856  ((__m256d)__builtin_ia32_vreducepd256_round_mask( \
1857  (__v4df)(__m256d)(A), (int)(B), (__v4df)_mm256_setzero_pd(), \
1858  (__mmask8)-1, (int)(R)))
1859 
1860 #define _mm256_mask_reduce_round_pd(W, U, A, B, R) \
1861  ((__m256d)__builtin_ia32_vreducepd256_round_mask( \
1862  (__v4df)(__m256d)(A), (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U), \
1863  (int)(R)))
1864 
1865 #define _mm256_maskz_reduce_round_pd(U, A, B, R) \
1866  ((__m256d)__builtin_ia32_vreducepd256_round_mask( \
1867  (__v4df)(__m256d)(A), (int)(B), (__v4df)_mm256_setzero_pd(), \
1868  (__mmask8)(U), (int)(R)))
1869 
1870 #define _mm256_mask_reduce_round_ph(W, U, A, imm, R) \
1871  ((__m256h)__builtin_ia32_vreduceph256_round_mask( \
1872  (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)(__m256h)(W), \
1873  (__mmask16)(U), (int)(R)))
1874 
1875 #define _mm256_maskz_reduce_round_ph(U, A, imm, R) \
1876  ((__m256h)__builtin_ia32_vreduceph256_round_mask( \
1877  (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(), \
1878  (__mmask16)(U), (int)(R)))
1879 
1880 #define _mm256_reduce_round_ph(A, imm, R) \
1881  ((__m256h)__builtin_ia32_vreduceph256_round_mask( \
1882  (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_undefined_ph(), \
1883  (__mmask16)-1, (int)(R)))
1884 
1885 #define _mm256_reduce_round_ps(A, B, R) \
1886  ((__m256)__builtin_ia32_vreduceps256_round_mask( \
1887  (__v8sf)(__m256)(A), (int)(B), (__v8sf)_mm256_setzero_ps(), \
1888  (__mmask8)-1, (int)(R)))
1889 
1890 #define _mm256_mask_reduce_round_ps(W, U, A, B, R) \
1891  ((__m256)__builtin_ia32_vreduceps256_round_mask( \
1892  (__v8sf)(__m256)(A), (int)(B), (__v8sf)(__m256)(W), (__mmask8)(U), \
1893  (int)(R)))
1894 
1895 #define _mm256_maskz_reduce_round_ps(U, A, B, R) \
1896  ((__m256)__builtin_ia32_vreduceps256_round_mask( \
1897  (__v8sf)(__m256)(A), (int)(B), (__v8sf)_mm256_setzero_ps(), \
1898  (__mmask8)(U), (int)(R)))
1899 
1900 #define _mm256_roundscale_round_pd(A, imm, R) \
1901  ((__m256d)__builtin_ia32_vrndscalepd256_round_mask( \
1902  (__v4df)(__m256d)(A), (int)(imm), (__v4df)_mm256_undefined_pd(), \
1903  (__mmask8)-1, (int)(R)))
1904 
1905 #define _mm256_mask_roundscale_round_pd(A, B, C, imm, R) \
1906  ((__m256d)__builtin_ia32_vrndscalepd256_round_mask( \
1907  (__v4df)(__m256d)(C), (int)(imm), (__v4df)(__m256d)(A), (__mmask8)(B), \
1908  (int)(R)))
1909 
1910 #define _mm256_maskz_roundscale_round_pd(A, B, imm, R) \
1911  ((__m256d)__builtin_ia32_vrndscalepd256_round_mask( \
1912  (__v4df)(__m256d)(B), (int)(imm), (__v4df)_mm256_setzero_pd(), \
1913  (__mmask8)(A), (int)(R)))
1914 
1915 #define _mm256_roundscale_round_ph(A, imm, R) \
1916  ((__m256h)__builtin_ia32_vrndscaleph256_round_mask( \
1917  (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_undefined_ph(), \
1918  (__mmask16)-1, (int)(R)))
1919 
1920 #define _mm256_mask_roundscale_round_ph(A, B, C, imm, R) \
1921  ((__m256h)__builtin_ia32_vrndscaleph256_round_mask( \
1922  (__v16hf)(__m256h)(C), (int)(imm), (__v16hf)(__m256h)(A), \
1923  (__mmask16)(B), (int)(R)))
1924 
1925 #define _mm256_maskz_roundscale_round_ph(A, B, imm, R) \
1926  ((__m256h)__builtin_ia32_vrndscaleph256_round_mask( \
1927  (__v16hf)(__m256h)(B), (int)(imm), (__v16hf)_mm256_setzero_ph(), \
1928  (__mmask16)(A), (int)(R)))
1929 
1930 #define _mm256_roundscale_round_ps(A, imm, R) \
1931  ((__m256)__builtin_ia32_vrndscaleps256_round_mask( \
1932  (__v8sf)(__m256)(A), (int)(imm), (__v8sf)_mm256_undefined_ps(), \
1933  (__mmask8)-1, (int)(R)))
1934 
1935 #define _mm256_mask_roundscale_round_ps(A, B, C, imm, R) \
1936  ((__m256)__builtin_ia32_vrndscaleps256_round_mask( \
1937  (__v8sf)(__m256)(C), (int)(imm), (__v8sf)(__m256)(A), (__mmask8)(B), \
1938  (int)(R)))
1939 
1940 #define _mm256_maskz_roundscale_round_ps(A, B, imm, R) \
1941  ((__m256)__builtin_ia32_vrndscaleps256_round_mask( \
1942  (__v8sf)(__m256)(B), (int)(imm), (__v8sf)_mm256_setzero_ps(), \
1943  (__mmask8)(A), (int)(R)))
1944 
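/* Illustrative usage sketch: the roundscale immediate packs the number of
 * fraction bits to keep (bits 7:4) with a rounding-mode override in the low
 * bits, so an immediate of 1 rounds toward negative infinity with zero
 * fraction bits, i.e. a per-lane floor(). Assuming AVX10.2 256-bit support:
 *
 *   __m256d f = _mm256_roundscale_round_pd(x, 1, _MM_FROUND_NO_EXC);
 */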
1945 #define _mm256_scalef_round_pd(A, B, R) \
1946  ((__m256d)__builtin_ia32_vscalefpd256_round_mask( \
1947  (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), \
1948  (__v4df)_mm256_undefined_pd(), (__mmask8)-1, (int)(R)))
1949 
1950 #define _mm256_mask_scalef_round_pd(W, U, A, B, R) \
1951  ((__m256d)__builtin_ia32_vscalefpd256_round_mask( \
1952  (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(W), \
1953  (__mmask8)(U), (int)(R)))
1954 
1955 #define _mm256_maskz_scalef_round_pd(U, A, B, R) \
1956  ((__m256d)__builtin_ia32_vscalefpd256_round_mask( \
1957  (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)_mm256_setzero_pd(), \
1958  (__mmask8)(U), (int)(R)))
1959 
1960 #define _mm256_scalef_round_ph(A, B, R) \
1961  ((__m256h)__builtin_ia32_vscalefph256_round_mask( \
1962  (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), \
1963  (__v16hf)_mm256_undefined_ph(), (__mmask16)-1, (int)(R)))
1964 
1965 #define _mm256_mask_scalef_round_ph(W, U, A, B, R) \
1966  ((__m256h)__builtin_ia32_vscalefph256_round_mask( \
1967  (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(W), \
1968  (__mmask16)(U), (int)(R)))
1969 
1970 #define _mm256_maskz_scalef_round_ph(U, A, B, R) \
1971  ((__m256h)__builtin_ia32_vscalefph256_round_mask( \
1972  (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), \
1973  (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1974 
1975 #define _mm256_scalef_round_ps(A, B, R) \
1976  ((__m256)__builtin_ia32_vscalefps256_round_mask( \
1977  (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)_mm256_undefined_ps(), \
1978  (__mmask8)-1, (int)(R)))
1979 
1980 #define _mm256_mask_scalef_round_ps(W, U, A, B, R) \
1981  ((__m256)__builtin_ia32_vscalefps256_round_mask( \
1982  (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(W), \
1983  (__mmask8)(U), (int)(R)))
1984 
1985 #define _mm256_maskz_scalef_round_ps(U, A, B, R) \
1986  ((__m256)__builtin_ia32_vscalefps256_round_mask( \
1987  (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)_mm256_setzero_ps(), \
1988  (__mmask8)(U), (int)(R)))
1989 
1990 #define _mm256_sqrt_round_pd(A, R) \
1991  ((__m256d)__builtin_ia32_vsqrtpd256_round((__v4df)(__m256d)(A), (int)(R)))
1992 
1993 #define _mm256_mask_sqrt_round_pd(W, U, A, R) \
1994  ((__m256d)__builtin_ia32_selectpd_256( \
1995  (__mmask8)(U), (__v4df)_mm256_sqrt_round_pd((A), (R)), \
1996  (__v4df)(__m256d)(W)))
1997 
1998 #define _mm256_maskz_sqrt_round_pd(U, A, R) \
1999  ((__m256d)__builtin_ia32_selectpd_256( \
2000  (__mmask8)(U), (__v4df)_mm256_sqrt_round_pd((A), (R)), \
2001  (__v4df)_mm256_setzero_pd()))
2002 
2003 #define _mm256_sqrt_round_ph(A, R) \
2004  ((__m256h)__builtin_ia32_vsqrtph256_round((__v16hf)(__m256h)(A), (int)(R)))
2005 
2006 #define _mm256_mask_sqrt_round_ph(W, U, A, R) \
2007  ((__m256h)__builtin_ia32_selectph_256( \
2008  (__mmask16)(U), (__v16hf)_mm256_sqrt_round_ph((A), (R)), \
2009  (__v16hf)(__m256h)(W)))
2010 
2011 #define _mm256_maskz_sqrt_round_ph(U, A, R) \
2012  ((__m256h)__builtin_ia32_selectph_256( \
2013  (__mmask16)(U), (__v16hf)_mm256_sqrt_round_ph((A), (R)), \
2014  (__v16hf)_mm256_setzero_ph()))
2015 
2016 #define _mm256_sqrt_round_ps(A, R) \
2017  ((__m256)__builtin_ia32_vsqrtps256_round((__v8sf)(__m256)(A), (int)(R)))
2018 
2019 #define _mm256_mask_sqrt_round_ps(W, U, A, R) \
2020  ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
2021  (__v8sf)_mm256_sqrt_round_ps((A), (R)), \
2022  (__v8sf)(__m256)(W)))
2023 
2024 #define _mm256_maskz_sqrt_round_ps(U, A, R) \
2025  ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
2026  (__v8sf)_mm256_sqrt_round_ps((A), (R)), \
2027  (__v8sf)_mm256_setzero_ps()))
2028 
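/* Illustrative usage sketch: the rounded square-root forms allow a directed
 * rounding mode per operation, e.g. an upper bound for interval arithmetic,
 * assuming AVX10.2 256-bit support:
 *
 *   __m256d hi = _mm256_sqrt_round_pd(
 *       x, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
 */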
2029 #define _mm256_sub_round_pd(A, B, R) \
2030  ((__m256d)__builtin_ia32_vsubpd256_round((__v4df)(__m256d)(A), \
2031  (__v4df)(__m256d)(B), (int)(R)))
2032 
2033 #define _mm256_mask_sub_round_pd(W, U, A, B, R) \
2034  ((__m256d)__builtin_ia32_selectpd_256( \
2035  (__mmask8)(U), (__v4df)_mm256_sub_round_pd((A), (B), (R)), \
2036  (__v4df)(__m256d)(W)))
2037 
2038 #define _mm256_maskz_sub_round_pd(U, A, B, R) \
2039  ((__m256d)__builtin_ia32_selectpd_256( \
2040  (__mmask8)(U), (__v4df)_mm256_sub_round_pd((A), (B), (R)), \
2041  (__v4df)_mm256_setzero_pd()))
2042 
2043 #define _mm256_sub_round_ph(A, B, R) \
2044  ((__m256h)__builtin_ia32_vsubph256_round((__v16hf)(__m256h)(A), \
2045  (__v16hf)(__m256h)(B), (int)(R)))
2046 
2047 #define _mm256_mask_sub_round_ph(W, U, A, B, R) \
2048  ((__m256h)__builtin_ia32_selectph_256( \
2049  (__mmask16)(U), (__v16hf)_mm256_sub_round_ph((A), (B), (R)), \
2050  (__v16hf)(__m256h)(W)))
2051 
2052 #define _mm256_maskz_sub_round_ph(U, A, B, R) \
2053  ((__m256h)__builtin_ia32_selectph_256( \
2054  (__mmask16)(U), (__v16hf)_mm256_sub_round_ph((A), (B), (R)), \
2055  (__v16hf)_mm256_setzero_ph()))
2056 
2057 #define _mm256_sub_round_ps(A, B, R) \
2058  ((__m256)__builtin_ia32_vsubps256_round((__v8sf)(__m256)(A), \
2059  (__v8sf)(__m256)(B), (int)(R)))
2060 
2061 #define _mm256_mask_sub_round_ps(W, U, A, B, R) \
2062  ((__m256)__builtin_ia32_selectps_256( \
2063  (__mmask8)(U), (__v8sf)_mm256_sub_round_ps((A), (B), (R)), \
2064  (__v8sf)(__m256)(W)))
2065 
2066 #define _mm256_maskz_sub_round_ps(U, A, B, R) \
2067  ((__m256)__builtin_ia32_selectps_256( \
2068  (__mmask8)(U), (__v8sf)_mm256_sub_round_ps((A), (B), (R)), \
2069  (__v8sf)_mm256_setzero_ps()))
2070 
2071 #undef __DEFAULT_FN_ATTRS256
2072 #undef __DEFAULT_FN_ATTRS128
2073 
2074 #endif /* __AVX10_2NIINTRIN_H */
2075 #endif /* __SSE2__ */