clang  19.0.0git
avx512fp16intrin.h
/*===----------- avx512fp16intrin.h - AVX512-FP16 intrinsics ---------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
#endif

#ifdef __SSE2__

#ifndef __AVX512FP16INTRIN_H
#define __AVX512FP16INTRIN_H

/* Define the 512-bit vector types used in this file. */
typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64)));
typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64)));
typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1)));

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS512                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512fp16,evex512"), __min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512fp16,no-evex512"),                          \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512fp16,no-evex512"),                          \
                 __min_vector_width__(128)))

static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) {
  return __a[0];
}

static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_setzero_ph(void) {
  return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_setzero_ph(void) {
  return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) {
  return (__m256h)__builtin_ia32_undef256();
}

static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_setzero_ph(void) {
  return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) {
  return (__m128h)__builtin_ia32_undef128();
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) {
  return (__m512h)__builtin_ia32_undef512();
}

static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_set1_ph(_Float16 __h) {
  return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h};
}

static __inline __m512h __DEFAULT_FN_ATTRS512
_mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
              _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
              _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
              _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16,
              _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20,
              _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24,
              _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28,
              _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) {
  return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26,
                            __h25, __h24, __h23, __h22, __h21, __h20, __h19,
                            __h18, __h17, __h16, __h15, __h14, __h13, __h12,
                            __h11, __h10, __h9,  __h8,  __h7,  __h6,  __h5,
                            __h4,  __h3,  __h2,  __h1};
}

#define _mm512_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \
                       h14, h15, h16, h17, h18, h19, h20, h21, h22, h23, h24,  \
                       h25, h26, h27, h28, h29, h30, h31, h32)                 \
  _mm512_set_ph((h32), (h31), (h30), (h29), (h28), (h27), (h26), (h25), (h24), \
                (h23), (h22), (h21), (h20), (h19), (h18), (h17), (h16), (h15), \
                (h14), (h13), (h12), (h11), (h10), (h9), (h8), (h7), (h6),     \
                (h5), (h4), (h3), (h2), (h1))
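
/* Usage sketch (editor's illustration, not part of the upstream header):
 * _mm512_set_ph takes elements from highest lane to lowest, and
 * _mm512_setr_ph reverses that, so setr reads in memory order.
 *
 *   __m512h v  = _mm512_set1_ph((_Float16)1.5); // broadcast to all 32 lanes
 *   _Float16 e = _mm512_cvtsh_h(v);             // lowest element: 1.5
 */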

static __inline __m512h __DEFAULT_FN_ATTRS512
_mm512_set1_pch(_Float16 _Complex h) {
  return (__m512h)_mm512_set1_ps(__builtin_bit_cast(float, h));
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castph_ps(__m128h __a) {
  return (__m128)__a;
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_castph_ps(__m256h __a) {
  return (__m256)__a;
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_castph_ps(__m512h __a) {
  return (__m512)__a;
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castph_pd(__m128h __a) {
  return (__m128d)__a;
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_castph_pd(__m256h __a) {
  return (__m256d)__a;
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_castph_pd(__m512h __a) {
  return (__m512d)__a;
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_castph_si128(__m128h __a) {
  return (__m128i)__a;
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_castph_si256(__m256h __a) {
  return (__m256i)__a;
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_castph_si512(__m512h __a) {
  return (__m512i)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castps_ph(__m128 __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castps_ph(__m256 __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castps_ph(__m512 __a) {
  return (__m512h)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castpd_ph(__m128d __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castpd_ph(__m256d __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castpd_ph(__m512d __a) {
  return (__m512h)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castsi128_ph(__m128i __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castsi256_ph(__m256i __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castsi512_ph(__m512i __a) {
  return (__m512h)__a;
}
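
/* Usage sketch (editor's illustration): the casts above only reinterpret
 * bits and generate no instructions, so a round trip returns the original
 * value.
 *
 *   __m512h v    = _mm512_set1_ph((_Float16)1.0);
 *   __m512i bits = _mm512_castph_si512(v);    // same 512 bits, integer view
 *   __m512h back = _mm512_castsi512_ph(bits); // bitwise identical to v
 */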

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_castph256_ph128(__m256h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_castph512_ph128(__m512h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_castph512_ph256(__m512h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                 11, 12, 13, 14, 15);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castph128_ph256(__m128h __a) {
  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
                                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                 14, 15);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castph128_ph512(__m128h __a) {
  __m256h __b = __builtin_nondeterministic_value(__b);
  return __builtin_shufflevector(
      __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
                              0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                              14, 15),
      __b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
      19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castph256_ph512(__m256h __a) {
  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
                                 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                                 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
                                 26, 27, 28, 29, 30, 31);
}

/// Constructs a 256-bit floating-point vector of [16 x half] from a
///    128-bit floating-point vector of [8 x half]. The lower 128 bits
///    contain the value of the source vector. The upper 128 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [8 x half].
/// \returns A 256-bit floating-point vector of [16 x half]. The lower 128 bits
///    contain the value of the parameter. The upper 128 bits are set to zero.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_zextph128_ph256(__m128h __a) {
  return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4,
                                 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}

/// Constructs a 512-bit floating-point vector of [32 x half] from a
///    128-bit floating-point vector of [8 x half]. The lower 128 bits
///    contain the value of the source vector. The upper 384 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [8 x half].
/// \returns A 512-bit floating-point vector of [32 x half]. The lower 128 bits
///    contain the value of the parameter. The upper 384 bits are set to zero.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_zextph128_ph512(__m128h __a) {
  return __builtin_shufflevector(
      __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
      13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
}

/// Constructs a 512-bit floating-point vector of [32 x half] from a
///    256-bit floating-point vector of [16 x half]. The lower 256 bits
///    contain the value of the source vector. The upper 256 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit vector of [16 x half].
/// \returns A 512-bit floating-point vector of [32 x half]. The lower 256 bits
///    contain the value of the parameter. The upper 256 bits are set to zero.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_zextph256_ph512(__m256h __a) {
  return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3,
                                 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                                 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
                                 28, 29, 30, 31);
}
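
/* Usage note (editor's illustration): _mm512_castph128_ph512 leaves the
 * upper 384 bits undefined, while _mm512_zextph128_ph512 zeroes them, so
 * prefer the zext form whenever the upper lanes may be read later.
 *
 *   __m128h lo    = _mm_setzero_ph();
 *   __m512h clean = _mm512_zextph128_ph512(lo); // upper 384 bits are zero
 *   __m512h fast  = _mm512_castph128_ph512(lo); // upper 384 bits undefined
 */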

#define _mm_comi_round_sh(A, B, P, R)                                          \
  __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, (int)(P), (int)(R))

#define _mm_comi_sh(A, B, pred)                                                \
  _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION)

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_US,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h A,
                                                            __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_UQ,
                                _MM_FROUND_CUR_DIRECTION);
}
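
/* Usage sketch (editor's illustration): the comi* forms use signaling
 * predicates (_CMP_*_OS / _CMP_NEQ_US) and the ucomi* forms use their quiet
 * counterparts; all of them return 0 or 1 rather than a mask.
 *
 *   __m128h z  = _mm_setzero_ph();
 *   int eq = _mm_comieq_sh(z, z);  // 1; signals invalid on any NaN input
 *   int lt = _mm_ucomilt_sh(z, z); // 0; quiet on QNaN inputs
 */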

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A + (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_add_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_add_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_add_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_add_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
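
/* Usage sketch (editor's illustration): in the mask form, lanes whose mask
 * bit is set receive the sum and the rest pass through __W; the round
 * macros take an explicit rounding/SAE mode. w, a, b are hypothetical
 * __m512h values.
 *
 *   __m512h r1 = _mm512_mask_add_ph(w, 0x0000FFFF, a, b); // low 16 lanes a+b
 *   __m512h r2 = _mm512_add_round_ph(
 *       a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 */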

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A - (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_sub_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_sub_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_sub_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_sub_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A * (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_mul_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_mul_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_mul_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_mul_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A / (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_div_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_div_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_div_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_div_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B,
                                          _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_min_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_min_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_min_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_min_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B,
                                          _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_max_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_max_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_max_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_max_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) {
  return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A);
}
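
/* Note (editor's illustration): an IEEE half stores its sign in bit 15, so
 * ANDing each 16-bit lane with 0x7FFF clears the sign; the 32-bit constant
 * 0x7FFF7FFF applies that mask to a pair of half lanes at a time.
 */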

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) {
  return (__m512h)_mm512_xor_ps((__m512)__A, _mm512_set1_ps(-0.0f));
}
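
/* Note (editor's illustration): each complex-half element occupies one
 * 32-bit lane, real part in the low half and imaginary part in the high
 * half; XOR with -0.0f flips only bit 31, negating the imaginary part to
 * form the conjugate.
 */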

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectps_512(
      (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U,
                                              (__v16sf)_mm512_conj_pch(__A),
                                              (__v16sf)_mm512_setzero_ps());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] += __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_add_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_add_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_add_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_add_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_add_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
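
/* Usage sketch (editor's illustration): scalar _sh operations touch only
 * element 0 and pass elements 1..7 through from the first source, so only
 * mask bit 0 matters in the masked forms. a, b are hypothetical __m128h
 * values.
 *
 *   __m128h s = _mm_maskz_add_sh(0x1, a, b);
 *   // s[0] = a[0] + b[0] (bit 0 set; 0.0 if clear), s[1..7] = a[1..7]
 */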

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] -= __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_sub_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_sub_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_sub_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_sub_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_sub_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] *= __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_mul_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_mul_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_mul_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_mul_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_mul_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] /= __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_div_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_div_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_div_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_divsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_div_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_divsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_div_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_divsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_min_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_minsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_min_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_minsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_min_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_minsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_max_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_max_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_max_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))

#define _mm512_cmp_round_ph_mask(A, B, P, R)                                   \
  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A),              \
                                           (__v32hf)(__m512h)(B), (int)(P),    \
                                           (__mmask32)-1, (int)(R)))

#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R)                           \
  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A),              \
                                           (__v32hf)(__m512h)(B), (int)(P),    \
                                           (__mmask32)(U), (int)(R)))

#define _mm512_cmp_ph_mask(A, B, P)                                            \
  _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_cmp_ph_mask(U, A, B, P)                                    \
  _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm_cmp_round_sh_mask(X, Y, P, R)                                      \
  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X),                   \
                                       (__v8hf)(__m128h)(Y), (int)(P),         \
                                       (__mmask8)-1, (int)(R)))

#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R)                              \
  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X),                   \
                                       (__v8hf)(__m128h)(Y), (int)(P),         \
                                       (__mmask8)(M), (int)(R)))

#define _mm_cmp_sh_mask(X, Y, P)                                               \
  ((__mmask8)__builtin_ia32_cmpsh_mask(                                        \
      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)-1,      \
      _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_cmp_sh_mask(M, X, Y, P)                                       \
  ((__mmask8)__builtin_ia32_cmpsh_mask(                                        \
      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)(M),     \
      _MM_FROUND_CUR_DIRECTION))
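
/* Usage sketch (editor's illustration): unlike comi/ucomi, the cmp forms
 * return a bitmask with one bit per lane, selected by a _CMP_* predicate.
 * a, b are hypothetical __m512h values.
 *
 *   __mmask32 m = _mm512_cmp_ph_mask(a, b, _CMP_LT_OS);
 *   // bit i of m is 1 iff a[i] < b[i] (ordered, signaling)
 */
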
// loads with vmovsh:
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) {
  struct __mm_load_sh_struct {
    _Float16 __u;
  } __attribute__((__packed__, __may_alias__));
  _Float16 __u = ((const struct __mm_load_sh_struct *)__dp)->__u;
  return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) {
  __m128h src = (__v8hf)__builtin_shufflevector(
      (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);

  return (__m128h)__builtin_ia32_loadsh128_mask((const __v8hf *)__A, src,
                                                __U & 1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_load_sh(__mmask8 __U, const void *__A) {
  return (__m128h)__builtin_ia32_loadsh128_mask(
      (const __v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_load_ph(void const *__p) {
  return *(const __m512h *)__p;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_load_ph(void const *__p) {
  return *(const __m256h *)__p;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_ph(void const *__p) {
  return *(const __m128h *)__p;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m512h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m256h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m128h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}
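
/* Usage note (editor's illustration): _mm512_load_ph dereferences a
 * __m512h pointer and therefore expects a 64-byte aligned address, while
 * _mm512_loadu_ph goes through a packed, may_alias struct and accepts any
 * alignment.
 *
 *   _Float16 buf[32];                 // hypothetical buffer
 *   __m512h v = _mm512_loadu_ph(buf); // safe for any alignment
 */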

// stores with vmovsh:
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sh(void *__dp,
                                                          __m128h __a) {
  struct __mm_store_sh_struct {
    _Float16 __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_sh_struct *)__dp)->__u = __a[0];
}

static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sh(void *__W,
                                                               __mmask8 __U,
                                                               __m128h __A) {
  __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1);
}

static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_ph(void *__P,
                                                             __m512h __A) {
  *(__m512h *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_ph(void *__P,
                                                             __m256h __A) {
  *(__m256h *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_ph(void *__P,
                                                          __m128h __A) {
  *(__m128h *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_ph(void *__P,
                                                              __m512h __A) {
  struct __storeu_ph {
    __m512h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_ph(void *__P,
                                                              __m256h __A) {
  struct __storeu_ph {
    __m256h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P,
                                                           __m128h __A) {
  struct __storeu_ph {
    __m128h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}

// moves with vmovsh:
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_move_sh(__m128h __a,
                                                            __m128h __b) {
  __a[0] = __b[0];
  return __a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_move_sh(__m128h __W,
                                                                 __mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_move_sh(__mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B),
                                     _mm_setzero_ph());
}

// vmovw:
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsi16_si128(short __a) {
  return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0};
}

static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) {
  __v8hi __b = (__v8hi)__a;
  return __b[0];
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W,
                                               (__mmask32)__U);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W,
                                                 (__mmask32)__U);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
}
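
/* Usage note (editor's illustration): vrcpph/vrsqrtph return fast
 * approximations (Intel documents a small bounded relative error, on the
 * order of 2^-11), not correctly rounded results; use _mm512_div_ph or
 * _mm512_sqrt_ph when full precision is required.
 *
 *   __m512h approx = _mm512_rcp_ph(x); // ~1/x per lane, x hypothetical
 */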

#define _mm512_getmant_ph(A, B, C)                                             \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,                           \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_getmant_ph(W, U, A, B, C)                                  \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W),   \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_getmant_ph(U, A, B, C)                                    \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_getmant_round_ph(A, B, C, R)                                    \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))

#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R)                         \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W),   \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_getmant_round_ph(U, A, B, C, R)                           \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
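
/* Note (editor's illustration): in the getmant macros the immediate packs
 * two fields as ((C) << 2) | (B): B selects the normalization interval
 * (_MM_MANT_NORM_*) and C the sign control (_MM_MANT_SIGN_*).
 *
 *   __m512h m = _mm512_getmant_ph(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
 *   // per lane: mantissa of x scaled into [1, 2), keeping x's sign
 *   // (x is a hypothetical __m512h value)
 */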

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_getexp_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_getexp_round_ph(A, R)                                           \
  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A),             \
                                            (__v32hf)_mm512_undefined_ph(),    \
                                            (__mmask32)-1, (int)(R)))

#define _mm512_mask_getexp_round_ph(W, U, A, R)                                \
  ((__m512h)__builtin_ia32_getexpph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))

#define _mm512_maskz_getexp_round_ph(U, A, R)                                  \
  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A),             \
                                            (__v32hf)_mm512_setzero_ph(),      \
                                            (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_scalef_ph(__m512h __A,
                                                                 __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
                                                  (__v32hf)__W, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_scalef_round_ph(A, B, R)                                        \
  ((__m512h)__builtin_ia32_scalefph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B),                            \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))

#define _mm512_mask_scalef_round_ph(W, U, A, B, R)                             \
  ((__m512h)__builtin_ia32_scalefph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_scalef_round_ph(U, A, B, R)                               \
  ((__m512h)__builtin_ia32_scalefph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B),                            \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
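
/* Note (editor's illustration): scalef computes A * 2^floor(B) per lane
 * (ldexp with the exponent taken from a vector), which is the usual way to
 * undo a getexp/getmant decomposition.
 *
 *   __m512h y = _mm512_scalef_ph(mant, exp); // mant, exp hypothetical
 */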

#define _mm512_roundscale_ph(A, B)                                             \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(A), (int)(B), (__v32hf)(__m512h)(A), (__mmask32)-1,   \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_ph(A, B, C, imm)                                \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A),                \
      (__mmask32)(B), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_roundscale_ph(A, B, imm)                                  \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(),         \
      (__mmask32)(A), _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R)                       \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm),  \
                                           (__v32hf)(__m512h)(A),              \
                                           (__mmask32)(B), (int)(R)))

#define _mm512_maskz_roundscale_round_ph(A, B, imm, R)                         \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm),  \
                                           (__v32hf)_mm512_setzero_ph(),       \
                                           (__mmask32)(A), (int)(R)))

#define _mm512_roundscale_round_ph(A, imm, R)                                  \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm),  \
                                           (__v32hf)_mm512_undefined_ph(),     \
                                           (__mmask32)-1, (int)(R)))

#define _mm512_reduce_ph(A, imm)                                               \
  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(),       \
      (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_reduce_ph(W, U, A, imm)                                    \
  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W),                \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_reduce_ph(U, A, imm)                                      \
  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(),         \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_reduce_round_ph(W, U, A, imm, R)                           \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)(__m512h)(W),             \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_maskz_reduce_round_ph(U, A, imm, R)                             \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)_mm512_setzero_ph(),      \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_reduce_round_ph(A, imm, R)                                      \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)_mm512_undefined_ph(),    \
                                            (__mmask32)-1, (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
                                            (__v8hf)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_sh(__m128h __A,
                                                             __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_sh(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B,
                                              (__v8hf)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

#define _mm_getmant_round_sh(A, B, C, D, R)                                    \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R)))

#define _mm_getmant_sh(A, B, C, D)                                             \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_sh(W, U, A, B, C, D)                                  \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R)                         \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_getmant_sh(U, A, B, C, D)                                    \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R)                           \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

#define _mm_getexp_round_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_sh(__m128h __A,
                                                              __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_getexp_round_sh(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_getexp_round_sh(U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))

#define _mm_scalef_round_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_sh(__m128h __A,
                                                              __m128h __B) {
  return (__m128h)__builtin_ia32_scalefsh_round_mask(
      (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                     (__v8hf)__W, (__mmask8)__U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_scalef_round_sh(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_scalef_round_sh(U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))

#define _mm_roundscale_round_sh(A, B, imm, R)                                  \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(imm), (int)(R)))

#define _mm_roundscale_sh(A, B, imm)                                           \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_sh(W, U, A, B, I)                                  \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R)                         \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(I), (int)(R)))

#define _mm_maskz_roundscale_sh(U, A, B, I)                                    \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_roundscale_round_sh(U, A, B, I, R)                           \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(I), (int)(R)))

#define _mm_reduce_sh(A, B, C)                                                 \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_reduce_sh(W, U, A, B, C)                                      \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_reduce_sh(U, A, B, C)                                        \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_reduce_round_sh(A, B, C, R)                                        \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(C), (int)(R)))

#define _mm_mask_reduce_round_sh(W, U, A, B, C, R)                             \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(C), (int)(R)))

#define _mm_maskz_reduce_round_sh(U, A, B, C, R)                               \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(C), (int)(R)))

#define _mm512_sqrt_round_ph(A, R)                                             \
  ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R)))

#define _mm512_mask_sqrt_round_ph(W, U, A, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)),                 \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_sqrt_round_ph(U, A, R)                                    \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)),                 \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A,
                                           _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)(__U),
      (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
      (__v32hf)(__m512h)(__W));
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)(__U),
      (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
      (__v32hf)_mm512_setzero_ph());
}

#define _mm_sqrt_round_sh(A, B, R)                                             \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask(                                  \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_sqrt_round_sh(W, U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask(                                  \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_sqrt_round_sh(U, A, B, R)                                    \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask(                                  \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_sh(__m128h __A,
                                                            __m128h __B) {
  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
      (__mmask8)-1, (int)(_MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sh(__m128h __W,
                                                                 __mmask32 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W),
      (__mmask8)(__U), (int)(_MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sh(__mmask32 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
      (__mmask8)(__U), (int)(_MM_FROUND_CUR_DIRECTION));
}

#define _mm512_mask_fpclass_ph_mask(U, A, imm)                                 \
  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A),          \
                                               (int)(imm), (__mmask32)(U)))

#define _mm512_fpclass_ph_mask(A, imm)                                         \
  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A),          \
                                               (int)(imm), (__mmask32)-1))

#define _mm_fpclass_sh_mask(A, imm)                                            \
  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm),   \
                                           (__mmask8)-1))

#define _mm_mask_fpclass_sh_mask(U, A, imm)                                    \
  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm),   \
                                           (__mmask8)(U)))

#define _mm512_cvt_roundpd_ph(A, R)                                            \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask(                                  \
      (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundpd_ph(W, U, A, R)                                 \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask((__v8df)(A), (__v8hf)(W),         \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundpd_ph(U, A, R)                                   \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask(                                  \
      (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ph(__m512d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
      (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m512d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
      (__v8df)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

1484 static __inline__ __m128h __DEFAULT_FN_ATTRS512
1485 _mm512_maskz_cvtpd_ph(__mmask8 __U, __m512d __A) {
1486  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1487  (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1488  _MM_FROUND_CUR_DIRECTION);
1489 }
1490 
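/* Usage sketch (editorial addition): the pd->ph conversions narrow eight
 * doubles to eight halves, so a full __m512d input yields a __m128h result.
 *
 *   __m512d d = _mm512_set1_pd(1.5);
 *   __m128h h = _mm512_cvtpd_ph(d);  // eight _Float16 values of 1.5
 */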
1491 #define _mm512_cvt_roundph_pd(A, R) \
1492  ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
1493  (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R)))
1494 
1495 #define _mm512_mask_cvt_roundph_pd(W, U, A, R) \
1496  ((__m512d)__builtin_ia32_vcvtph2pd512_mask((__v8hf)(A), (__v8df)(W), \
1497  (__mmask8)(U), (int)(R)))
1498 
1499 #define _mm512_maskz_cvt_roundph_pd(U, A, R) \
1500  ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
1501  (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))
1502 
1503 static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtph_pd(__m128h __A) {
1504  return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1505  (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1,
1506  _MM_FROUND_CUR_DIRECTION);
1507 }
1508 
1509 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1510 _mm512_mask_cvtph_pd(__m512d __W, __mmask8 __U, __m128h __A) {
1511  return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1512  (__v8hf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
1513 }
1514 
1515 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1516 _mm512_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
1517  return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1518  (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
1519  _MM_FROUND_CUR_DIRECTION);
1520 }
1521 
1522 #define _mm_cvt_roundsh_ss(A, B, R) \
1523  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
1524  (__v4sf)_mm_undefined_ps(), \
1525  (__mmask8)(-1), (int)(R)))
1526 
1527 #define _mm_mask_cvt_roundsh_ss(W, U, A, B, R) \
1528  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
1529  (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R)))
1530 
1531 #define _mm_maskz_cvt_roundsh_ss(U, A, B, R) \
1532  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
1533  (__v4sf)_mm_setzero_ps(), \
1534  (__mmask8)(U), (int)(R)))
1535 
1536 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtsh_ss(__m128 __A,
1537  __m128h __B) {
1538  return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1539  (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_undefined_ps(), (__mmask8)-1,
1540  _MM_FROUND_CUR_DIRECTION);
1541 }
1542 
1543 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_ss(__m128 __W,
1544  __mmask8 __U,
1545  __m128 __A,
1546  __m128h __B) {
1547  return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B,
1548  (__v4sf)__W, (__mmask8)__U,
1549  _MM_FROUND_CUR_DIRECTION);
1550 }
1551 
1552 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsh_ss(__mmask8 __U,
1553  __m128 __A,
1554  __m128h __B) {
1555  return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1556  (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
1557  _MM_FROUND_CUR_DIRECTION);
1558 }
1559 
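/* Usage sketch (editorial addition): _mm_cvtsh_ss widens the low FP16
 * element of __B to float; the upper three float lanes come from __A.
 *
 *   __m128h h = _mm_set_sh((_Float16)0.5);
 *   __m128  f = _mm_cvtsh_ss(_mm_setzero_ps(), h);  // f[0] == 0.5f, f[1..3] == 0.0f
 */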
1560 #define _mm_cvt_roundss_sh(A, B, R) \
1561  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
1562  (__v8hf)_mm_undefined_ph(), \
1563  (__mmask8)(-1), (int)(R)))
1564 
1565 #define _mm_mask_cvt_roundss_sh(W, U, A, B, R) \
1566  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
1567  (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
1568 
1569 #define _mm_maskz_cvt_roundss_sh(U, A, B, R) \
1570  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
1571  (__v8hf)_mm_setzero_ph(), \
1572  (__mmask8)(U), (int)(R)))
1573 
1574 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtss_sh(__m128h __A,
1575  __m128 __B) {
1576  return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1577  (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
1578  _MM_FROUND_CUR_DIRECTION);
1579 }
1580 
1581 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sh(__m128h __W,
1582  __mmask8 __U,
1583  __m128h __A,
1584  __m128 __B) {
1585  return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1586  (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U,
1587  _MM_FROUND_CUR_DIRECTION);
1588 }
1589 
1590 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvtss_sh(__mmask8 __U,
1591  __m128h __A,
1592  __m128 __B) {
1593  return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1594  (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1595  _MM_FROUND_CUR_DIRECTION);
1596 }
1597 
1598 #define _mm_cvt_roundsd_sh(A, B, R) \
1599  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
1600  (__v8hf)_mm_undefined_ph(), \
1601  (__mmask8)(-1), (int)(R)))
1602 
1603 #define _mm_mask_cvt_roundsd_sh(W, U, A, B, R) \
1604  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
1605  (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
1606 
1607 #define _mm_maskz_cvt_roundsd_sh(U, A, B, R) \
1608  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
1609  (__v8hf)_mm_setzero_ph(), \
1610  (__mmask8)(U), (int)(R)))
1611 
1612 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtsd_sh(__m128h __A,
1613  __m128d __B) {
1614  return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1615  (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
1616  _MM_FROUND_CUR_DIRECTION);
1617 }
1618 
1619 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_sh(__m128h __W,
1620  __mmask8 __U,
1621  __m128h __A,
1622  __m128d __B) {
1623  return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1624  (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U,
1625  _MM_FROUND_CUR_DIRECTION);
1626 }
1627 
1628 static __inline__ __m128h __DEFAULT_FN_ATTRS128
1629 _mm_maskz_cvtsd_sh(__mmask8 __U, __m128h __A, __m128d __B) {
1630  return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1631  (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1632  _MM_FROUND_CUR_DIRECTION);
1633 }
1634 
1635 #define _mm_cvt_roundsh_sd(A, B, R) \
1636  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
1637  (__v2df)_mm_undefined_pd(), \
1638  (__mmask8)(-1), (int)(R)))
1639 
1640 #define _mm_mask_cvt_roundsh_sd(W, U, A, B, R) \
1641  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
1642  (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R)))
1643 
1644 #define _mm_maskz_cvt_roundsh_sd(U, A, B, R) \
1645  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
1646  (__v2df)_mm_setzero_pd(), \
1647  (__mmask8)(U), (int)(R)))
1648 
1649 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtsh_sd(__m128d __A,
1650  __m128h __B) {
1651  return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1652  (__v2df)__A, (__v8hf)__B, (__v2df)_mm_undefined_pd(), (__mmask8)-1,
1653  _MM_FROUND_CUR_DIRECTION);
1654 }
1655 
1656 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_sd(__m128d __W,
1657  __mmask8 __U,
1658  __m128d __A,
1659  __m128h __B) {
1660  return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1661  (__v2df)__A, (__v8hf)__B, (__v2df)__W, (__mmask8)__U,
1662  _MM_FROUND_CUR_DIRECTION);
1663 }
1664 
1665 static __inline__ __m128d __DEFAULT_FN_ATTRS128
1666 _mm_maskz_cvtsh_sd(__mmask8 __U, __m128d __A, __m128h __B) {
1667  return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1668  (__v2df)__A, (__v8hf)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
1669  _MM_FROUND_CUR_DIRECTION);
1670 }
1671 
1672 #define _mm512_cvt_roundph_epi16(A, R) \
1673  ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
1674  (__v32hi)_mm512_undefined_epi32(), \
1675  (__mmask32)(-1), (int)(R)))
1676 
1677 #define _mm512_mask_cvt_roundph_epi16(W, U, A, R) \
1678  ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), (__v32hi)(W), \
1679  (__mmask32)(U), (int)(R)))
1680 
1681 #define _mm512_maskz_cvt_roundph_epi16(U, A, R) \
1682  ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
1683  (__v32hi)_mm512_setzero_epi32(), \
1684  (__mmask32)(U), (int)(R)))
1685 
1686 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1687 _mm512_cvtph_epi16(__m512h __A) {
1688  return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1689  (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
1690  _MM_FROUND_CUR_DIRECTION);
1691 }
1692 
1693 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1694 _mm512_mask_cvtph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
1695  return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1696  (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1697 }
1698 
1699 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1700 _mm512_maskz_cvtph_epi16(__mmask32 __U, __m512h __A) {
1701  return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1702  (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
1703  _MM_FROUND_CUR_DIRECTION);
1704 }
1705 
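/* Usage sketch (editorial addition): the _round variants take an explicit
 * rounding mode instead of MXCSR's current one.
 *
 *   __m512h v = _mm512_set1_ph((_Float16)2.5);
 *   __m512i i = _mm512_cvt_roundph_epi16(
 *       v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);  // round-to-even: 2
 */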
1706 #define _mm512_cvtt_roundph_epi16(A, R) \
1707  ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
1708  (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1), \
1709  (int)(R)))
1710 
1711 #define _mm512_mask_cvtt_roundph_epi16(W, U, A, R) \
1712  ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), (__v32hi)(W), \
1713  (__mmask32)(U), (int)(R)))
1714 
1715 #define _mm512_maskz_cvtt_roundph_epi16(U, A, R) \
1716  ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), \
1717  (__v32hi)_mm512_setzero_epi32(), \
1718  (__mmask32)(U), (int)(R)))
1719 
1720 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1721 _mm512_cvttph_epi16(__m512h __A) {
1722  return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1723  (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
1724  _MM_FROUND_CUR_DIRECTION);
1725 }
1726 
1727 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1728 _mm512_mask_cvttph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
1729  return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1730  (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1731 }
1732 
1733 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1734 _mm512_maskz_cvttph_epi16(__mmask32 __U, __m512h __A) {
1735  return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1736  (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
1737  _MM_FROUND_CUR_DIRECTION);
1738 }
1739 
1740 #define _mm512_cvt_roundepi16_ph(A, R) \
1741  ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), \
1742  (__v32hf)_mm512_undefined_ph(), \
1743  (__mmask32)(-1), (int)(R)))
1744 
1745 #define _mm512_mask_cvt_roundepi16_ph(W, U, A, R) \
1746  ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), (__v32hf)(W), \
1747  (__mmask32)(U), (int)(R)))
1748 
1749 #define _mm512_maskz_cvt_roundepi16_ph(U, A, R) \
1750  ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
1751  (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1752 
1753 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1754 _mm512_cvtepi16_ph(__m512i __A) {
1755  return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1756  (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
1757  _MM_FROUND_CUR_DIRECTION);
1758 }
1759 
1760 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1761 _mm512_mask_cvtepi16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
1762  return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1763  (__v32hi)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1764 }
1765 
1766 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1767 _mm512_maskz_cvtepi16_ph(__mmask32 __U, __m512i __A) {
1768  return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1769  (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1770  _MM_FROUND_CUR_DIRECTION);
1771 }
1772 
1773 #define _mm512_cvt_roundph_epu16(A, R) \
1774  ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
1775  (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
1776  (int)(R)))
1777 
1778 #define _mm512_mask_cvt_roundph_epu16(W, U, A, R) \
1779  ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
1780  (__mmask32)(U), (int)(R)))
1781 
1782 #define _mm512_maskz_cvt_roundph_epu16(U, A, R) \
1783  ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), \
1784  (__v32hu)_mm512_setzero_epi32(), \
1785  (__mmask32)(U), (int)(R)))
1786 
1787 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1788 _mm512_cvtph_epu16(__m512h __A) {
1789  return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1790  (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
1791  _MM_FROUND_CUR_DIRECTION);
1792 }
1793 
1794 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1795 _mm512_mask_cvtph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
1796  return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1797  (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1798 }
1799 
1800 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1801 _mm512_maskz_cvtph_epu16(__mmask32 __U, __m512h __A) {
1802  return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1803  (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
1804  _MM_FROUND_CUR_DIRECTION);
1805 }
1806 
1807 #define _mm512_cvtt_roundph_epu16(A, R) \
1808  ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
1809  (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
1810  (int)(R)))
1811 
1812 #define _mm512_mask_cvtt_roundph_epu16(W, U, A, R) \
1813  ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
1814  (__mmask32)(U), (int)(R)))
1815 
1816 #define _mm512_maskz_cvtt_roundph_epu16(U, A, R) \
1817  ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), \
1818  (__v32hu)_mm512_setzero_epi32(), \
1819  (__mmask32)(U), (int)(R)))
1820 
1821 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1822 _mm512_cvttph_epu16(__m512h __A) {
1823  return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1824  (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
1825  _MM_FROUND_CUR_DIRECTION);
1826 }
1827 
1828 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1829 _mm512_mask_cvttph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
1830  return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1831  (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1832 }
1833 
1834 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1835 _mm512_maskz_cvttph_epu16(__mmask32 __U, __m512h __A) {
1836  return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1837  (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
1838  _MM_FROUND_CUR_DIRECTION);
1839 }
1840 
1841 #define _mm512_cvt_roundepu16_ph(A, R) \
1842  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), \
1843  (__v32hf)_mm512_undefined_ph(), \
1844  (__mmask32)(-1), (int)(R)))
1845 
1846 #define _mm512_mask_cvt_roundepu16_ph(W, U, A, R) \
1847  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), (__v32hf)(W), \
1848  (__mmask32)(U), (int)(R)))
1849 
1850 #define _mm512_maskz_cvt_roundepu16_ph(U, A, R) \
1851  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
1852  (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1853 
1854 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1855 _mm512_cvtepu16_ph(__m512i __A) {
1856  return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1857  (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
1858  _MM_FROUND_CUR_DIRECTION);
1859 }
1860 
1861 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1862 _mm512_mask_cvtepu16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
1863  return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1864  (__v32hu)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1865 }
1866 
1867 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1868 _mm512_maskz_cvtepu16_ph(__mmask32 __U, __m512i __A) {
1869  return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1870  (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1871  _MM_FROUND_CUR_DIRECTION);
1872 }
1873 
1874 #define _mm512_cvt_roundph_epi32(A, R) \
1875  ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
1876  (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
1877  (int)(R)))
1878 
1879 #define _mm512_mask_cvt_roundph_epi32(W, U, A, R) \
1880  ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), (__v16si)(W), \
1881  (__mmask16)(U), (int)(R)))
1882 
1883 #define _mm512_maskz_cvt_roundph_epi32(U, A, R) \
1884  ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), \
1885  (__v16si)_mm512_setzero_epi32(), \
1886  (__mmask16)(U), (int)(R)))
1887 
1888 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1889 _mm512_cvtph_epi32(__m256h __A) {
1890  return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1891  (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
1892  _MM_FROUND_CUR_DIRECTION);
1893 }
1894 
1895 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1896 _mm512_mask_cvtph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
1897  return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1898  (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1899 }
1900 
1901 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1902 _mm512_maskz_cvtph_epi32(__mmask16 __U, __m256h __A) {
1903  return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1904  (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
1905  _MM_FROUND_CUR_DIRECTION);
1906 }
1907 
1908 #define _mm512_cvt_roundph_epu32(A, R) \
1909  ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
1910  (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
1911  (int)(R)))
1912 
1913 #define _mm512_mask_cvt_roundph_epu32(W, U, A, R) \
1914  ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), (__v16su)(W), \
1915  (__mmask16)(U), (int)(R)))
1916 
1917 #define _mm512_maskz_cvt_roundph_epu32(U, A, R) \
1918  ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), \
1919  (__v16su)_mm512_setzero_epi32(), \
1920  (__mmask16)(U), (int)(R)))
1921 
1922 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1923 _mm512_cvtph_epu32(__m256h __A) {
1924  return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1925  (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
1926  _MM_FROUND_CUR_DIRECTION);
1927 }
1928 
1929 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1930 _mm512_mask_cvtph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
1931  return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1932  (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1933 }
1934 
1935 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1936 _mm512_maskz_cvtph_epu32(__mmask16 __U, __m256h __A) {
1937  return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1938  (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
1939  _MM_FROUND_CUR_DIRECTION);
1940 }
1941 
1942 #define _mm512_cvt_roundepi32_ph(A, R) \
1943  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), \
1944  (__v16hf)_mm256_undefined_ph(), \
1945  (__mmask16)(-1), (int)(R)))
1946 
1947 #define _mm512_mask_cvt_roundepi32_ph(W, U, A, R) \
1948  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), (__v16hf)(W), \
1949  (__mmask16)(U), (int)(R)))
1950 
1951 #define _mm512_maskz_cvt_roundepi32_ph(U, A, R) \
1952  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
1953  (__v16si)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1954 
1955 static __inline__ __m256h __DEFAULT_FN_ATTRS512
1956 _mm512_cvtepi32_ph(__m512i __A) {
1957  return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1958  (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
1959  _MM_FROUND_CUR_DIRECTION);
1960 }
1961 
1962 static __inline__ __m256h __DEFAULT_FN_ATTRS512
1963 _mm512_mask_cvtepi32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
1964  return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1965  (__v16si)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1966 }
1967 
1968 static __inline__ __m256h __DEFAULT_FN_ATTRS512
1969 _mm512_maskz_cvtepi32_ph(__mmask16 __U, __m512i __A) {
1970  return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1971  (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
1972  _MM_FROUND_CUR_DIRECTION);
1973 }
1974 
1975 #define _mm512_cvt_roundepu32_ph(A, R) \
1976  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), \
1977  (__v16hf)_mm256_undefined_ph(), \
1978  (__mmask16)(-1), (int)(R)))
1979 
1980 #define _mm512_mask_cvt_roundepu32_ph(W, U, A, R) \
1981  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), (__v16hf)(W), \
1982  (__mmask16)(U), (int)(R)))
1983 
1984 #define _mm512_maskz_cvt_roundepu32_ph(U, A, R) \
1985  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
1986  (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1987 
1988 static __inline__ __m256h __DEFAULT_FN_ATTRS512
1989 _mm512_cvtepu32_ph(__m512i __A) {
1990  return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
1991  (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
1992  _MM_FROUND_CUR_DIRECTION);
1993 }
1994 
1995 static __inline__ __m256h __DEFAULT_FN_ATTRS512
1996 _mm512_mask_cvtepu32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
1997  return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
1998  (__v16su)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1999 }
2000 
2001 static __inline__ __m256h __DEFAULT_FN_ATTRS512
2002 _mm512_maskz_cvtepu32_ph(__mmask16 __U, __m512i __A) {
2003  return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
2004  (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
2005  _MM_FROUND_CUR_DIRECTION);
2006 }
2007 
2008 #define _mm512_cvtt_roundph_epi32(A, R) \
2009  ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
2010  (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
2011  (int)(R)))
2012 
2013 #define _mm512_mask_cvtt_roundph_epi32(W, U, A, R) \
2014  ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), (__v16si)(W), \
2015  (__mmask16)(U), (int)(R)))
2016 
2017 #define _mm512_maskz_cvtt_roundph_epi32(U, A, R) \
2018  ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), \
2019  (__v16si)_mm512_setzero_epi32(), \
2020  (__mmask16)(U), (int)(R)))
2021 
2022 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2023 _mm512_cvttph_epi32(__m256h __A) {
2024  return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2025  (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
2026  _MM_FROUND_CUR_DIRECTION);
2027 }
2028 
2029 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2030 _mm512_mask_cvttph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
2031  return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2032  (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2033 }
2034 
2035 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2036 _mm512_maskz_cvttph_epi32(__mmask16 __U, __m256h __A) {
2037  return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2038  (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
2039  _MM_FROUND_CUR_DIRECTION);
2040 }
2041 
2042 #define _mm512_cvtt_roundph_epu32(A, R) \
2043  ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
2044  (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
2045  (int)(R)))
2046 
2047 #define _mm512_mask_cvtt_roundph_epu32(W, U, A, R) \
2048  ((__m512i)__builtin_ia32_vcvttph2udq512_mask((__v16hf)(A), (__v16su)(W), \
2049  (__mmask16)(U), (int)(R)))
2050 
2051 #define _mm512_maskz_cvtt_roundph_epu32(U, A, R) \
2052  ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
2053  (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U), \
2054  (int)(R)))
2055 
2056 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2057 _mm512_cvttph_epu32(__m256h __A) {
2058  return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2059  (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
2060  _MM_FROUND_CUR_DIRECTION);
2061 }
2062 
2063 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2064 _mm512_mask_cvttph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
2065  return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2066  (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2067 }
2068 
2069 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2070 _mm512_maskz_cvttph_epu32(__mmask16 __U, __m256h __A) {
2071  return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2072  (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
2073  _MM_FROUND_CUR_DIRECTION);
2074 }
2075 
2076 #define _mm512_cvt_roundepi64_ph(A, R) \
2077  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
2078  (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
2079 
2080 #define _mm512_mask_cvt_roundepi64_ph(W, U, A, R) \
2081  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask((__v8di)(A), (__v8hf)(W), \
2082  (__mmask8)(U), (int)(R)))
2083 
2084 #define _mm512_maskz_cvt_roundepi64_ph(U, A, R) \
2085  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
2086  (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
2087 
2088 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2089 _mm512_cvtepi64_ph(__m512i __A) {
2090  return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2091  (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
2092  _MM_FROUND_CUR_DIRECTION);
2093 }
2094 
2095 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2096 _mm512_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
2097  return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2098  (__v8di)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2099 }
2100 
2101 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2102 _mm512_maskz_cvtepi64_ph(__mmask8 __U, __m512i __A) {
2103  return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2104  (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
2105  _MM_FROUND_CUR_DIRECTION);
2106 }
2107 
2108 #define _mm512_cvt_roundph_epi64(A, R) \
2109  ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), \
2110  (__v8di)_mm512_undefined_epi32(), \
2111  (__mmask8)(-1), (int)(R)))
2112 
2113 #define _mm512_mask_cvt_roundph_epi64(W, U, A, R) \
2114  ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), (__v8di)(W), \
2115  (__mmask8)(U), (int)(R)))
2116 
2117 #define _mm512_maskz_cvt_roundph_epi64(U, A, R) \
2118  ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
2119  (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2120 
2121 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2122 _mm512_cvtph_epi64(__m128h __A) {
2123  return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2124  (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
2125  _MM_FROUND_CUR_DIRECTION);
2126 }
2127 
2128 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2129 _mm512_mask_cvtph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
2130  return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2131  (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2132 }
2133 
2134 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2135 _mm512_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
2136  return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2137  (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
2138  _MM_FROUND_CUR_DIRECTION);
2139 }
2140 
2141 #define _mm512_cvt_roundepu64_ph(A, R) \
2142  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
2143  (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
2144 
2145 #define _mm512_mask_cvt_roundepu64_ph(W, U, A, R) \
2146  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask((__v8du)(A), (__v8hf)(W), \
2147  (__mmask8)(U), (int)(R)))
2148 
2149 #define _mm512_maskz_cvt_roundepu64_ph(U, A, R) \
2150  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
2151  (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
2152 
2153 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2154 _mm512_cvtepu64_ph(__m512i __A) {
2155  return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2156  (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
2157  _MM_FROUND_CUR_DIRECTION);
2158 }
2159 
2160 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2161 _mm512_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
2162  return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2163  (__v8du)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2164 }
2165 
2166 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2167 _mm512_maskz_cvtepu64_ph(__mmask8 __U, __m512i __A) {
2168  return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2169  (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
2170  _MM_FROUND_CUR_DIRECTION);
2171 }
2172 
2173 #define _mm512_cvt_roundph_epu64(A, R) \
2174  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
2175  (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
2176  (int)(R)))
2177 
2178 #define _mm512_mask_cvt_roundph_epu64(W, U, A, R) \
2179  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
2180  (__mmask8)(U), (int)(R)))
2181 
2182 #define _mm512_maskz_cvt_roundph_epu64(U, A, R) \
2183  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
2184  (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2185 
2186 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2187 _mm512_cvtph_epu64(__m128h __A) {
2188  return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2189  (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
2190  _MM_FROUND_CUR_DIRECTION);
2191 }
2192 
2193 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2194 _mm512_mask_cvtph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
2195  return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2196  (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2197 }
2198 
2199 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2200 _mm512_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
2201  return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2202  (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
2203  _MM_FROUND_CUR_DIRECTION);
2204 }
2205 
2206 #define _mm512_cvtt_roundph_epi64(A, R) \
2207  ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
2208  (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1), \
2209  (int)(R)))
2210 
2211 #define _mm512_mask_cvtt_roundph_epi64(W, U, A, R) \
2212  ((__m512i)__builtin_ia32_vcvttph2qq512_mask((__v8hf)(A), (__v8di)(W), \
2213  (__mmask8)(U), (int)(R)))
2214 
2215 #define _mm512_maskz_cvtt_roundph_epi64(U, A, R) \
2216  ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
2217  (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2218 
2219 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2220 _mm512_cvttph_epi64(__m128h __A) {
2221  return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2222  (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
2223  _MM_FROUND_CUR_DIRECTION);
2224 }
2225 
2226 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2227 _mm512_mask_cvttph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
2228  return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2229  (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2230 }
2231 
2232 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2233 _mm512_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
2234  return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2235  (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
2236  _MM_FROUND_CUR_DIRECTION);
2237 }
2238 
2239 #define _mm512_cvtt_roundph_epu64(A, R) \
2240  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
2241  (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
2242  (int)(R)))
2243 
2244 #define _mm512_mask_cvtt_roundph_epu64(W, U, A, R) \
2245  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
2246  (__mmask8)(U), (int)(R)))
2247 
2248 #define _mm512_maskz_cvtt_roundph_epu64(U, A, R) \
2249  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
2250  (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2251 
2252 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2253 _mm512_cvttph_epu64(__m128h __A) {
2254  return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2255  (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
2256  _MM_FROUND_CUR_DIRECTION);
2257 }
2258 
2259 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2260 _mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
2261  return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2262  (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2263 }
2264 
2265 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2266 _mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
2267  return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2268  (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
2269  _MM_FROUND_CUR_DIRECTION);
2270 }
2271 
2272 #define _mm_cvt_roundsh_i32(A, R) \
2273  ((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R)))
2274 
2275 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvtsh_i32(__m128h __A) {
2276  return (int)__builtin_ia32_vcvtsh2si32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2277 }
2278 
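/* Usage sketch (editorial addition): scalar FP16-to-int conversion with an
 * explicit rounding mode.
 *
 *   __m128h v = _mm_set_sh((_Float16)-1.7);
 *   int i = _mm_cvt_roundsh_i32(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);  // -1
 */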
2279 #define _mm_cvt_roundsh_u32(A, R) \
2280  ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R)))
2281 
2282 static __inline__ unsigned int __DEFAULT_FN_ATTRS128
2283 _mm_cvtsh_u32(__m128h __A) {
2284  return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A,
2285  _MM_FROUND_CUR_DIRECTION);
2286 }
2287 
2288 #ifdef __x86_64__
2289 #define _mm_cvt_roundsh_i64(A, R) \
2290  ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R)))
2291 
2292 static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvtsh_i64(__m128h __A) {
2293  return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A,
2294  _MM_FROUND_CUR_DIRECTION);
2295 }
2296 
2297 #define _mm_cvt_roundsh_u64(A, R) \
2298  ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R)))
2299 
2300 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
2301 _mm_cvtsh_u64(__m128h __A) {
2302  return (unsigned long long)__builtin_ia32_vcvtsh2usi64(
2303  (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2304 }
2305 #endif // __x86_64__
2306 
2307 #define _mm_cvt_roundu32_sh(A, B, R) \
2308  ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R)))
2309 
2310 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2311 _mm_cvtu32_sh(__m128h __A, unsigned int __B) {
2312  __A[0] = __B;
2313  return __A;
2314 }
2315 
2316 #ifdef __x86_64__
2317 #define _mm_cvt_roundu64_sh(A, B, R) \
2318  ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B), \
2319  (int)(R)))
2320 
2321 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2322 _mm_cvtu64_sh(__m128h __A, unsigned long long __B) {
2323  __A[0] = __B;
2324  return __A;
2325 }
2326 #endif
2327 
2328 #define _mm_cvt_roundi32_sh(A, B, R) \
2329  ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R)))
2330 
2331 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti32_sh(__m128h __A,
2332  int __B) {
2333  __A[0] = __B;
2334  return __A;
2335 }
2336 
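/* Usage sketch (editorial addition): the inline int->sh forms simply convert
 * __B with the current rounding mode and write it into lane 0 of __A.
 *
 *   __m128h a = _mm_setzero_ph();
 *   __m128h r = _mm_cvti32_sh(a, 42);  // r[0] == (_Float16)42; other lanes from a
 */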
2337 #ifdef __x86_64__
2338 #define _mm_cvt_roundi64_sh(A, B, R) \
2339  ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R)))
2340 
2341 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti64_sh(__m128h __A,
2342  long long __B) {
2343  __A[0] = __B;
2344  return __A;
2345 }
2346 #endif
2347 
2348 #define _mm_cvtt_roundsh_i32(A, R) \
2349  ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R)))
2350 
2351 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsh_i32(__m128h __A) {
2352  return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A,
2353  _MM_FROUND_CUR_DIRECTION);
2354 }
2355 
2356 #ifdef __x86_64__
2357 #define _mm_cvtt_roundsh_i64(A, R) \
2358  ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R)))
2359 
2360 static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsh_i64(__m128h __A) {
2361  return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A,
2362  _MM_FROUND_CUR_DIRECTION);
2363 }
2364 #endif
2365 
2366 #define _mm_cvtt_roundsh_u32(A, R) \
2367  ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R)))
2368 
2369 static __inline__ unsigned int __DEFAULT_FN_ATTRS128
2370 _mm_cvttsh_u32(__m128h __A) {
2371  return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A,
2372  _MM_FROUND_CUR_DIRECTION);
2373 }
2374 
2375 #ifdef __x86_64__
2376 #define _mm_cvtt_roundsh_u64(A, R) \
2377  ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R)))
2378 
2379 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
2380 _mm_cvttsh_u64(__m128h __A) {
2381  return (unsigned long long)__builtin_ia32_vcvttsh2usi64(
2382  (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2383 }
2384 #endif
2385 
2386 #define _mm512_cvtx_roundph_ps(A, R) \
2387  ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), \
2388  (__v16sf)_mm512_undefined_ps(), \
2389  (__mmask16)(-1), (int)(R)))
2390 
2391 #define _mm512_mask_cvtx_roundph_ps(W, U, A, R) \
2392  ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), (__v16sf)(W), \
2393  (__mmask16)(U), (int)(R)))
2394 
2395 #define _mm512_maskz_cvtx_roundph_ps(U, A, R) \
2396  ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
2397  (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))
2398 
2399 static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtxph_ps(__m256h __A) {
2400  return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2401  (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1,
2402  _MM_FROUND_CUR_DIRECTION);
2403 }
2404 
2405 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2406 _mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) {
2407  return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2408  (__v16hf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2409 }
2410 
2411 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2412 _mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) {
2413  return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2414  (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
2415  _MM_FROUND_CUR_DIRECTION);
2416 }
2417 
2418 #define _mm512_cvtx_roundps_ph(A, R) \
2419  ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), \
2420  (__v16hf)_mm256_undefined_ph(), \
2421  (__mmask16)(-1), (int)(R)))
2422 
2423 #define _mm512_mask_cvtx_roundps_ph(W, U, A, R) \
2424  ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), (__v16hf)(W), \
2425  (__mmask16)(U), (int)(R)))
2426 
2427 #define _mm512_maskz_cvtx_roundps_ph(U, A, R) \
2428  ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
2429  (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
2430 
2431 static __inline__ __m256h __DEFAULT_FN_ATTRS512 _mm512_cvtxps_ph(__m512 __A) {
2432  return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2433  (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
2434  _MM_FROUND_CUR_DIRECTION);
2435 }
2436 
2437 static __inline__ __m256h __DEFAULT_FN_ATTRS512
2438 _mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) {
2439  return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2440  (__v16sf)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2441 }
2442 
2443 static __inline__ __m256h __DEFAULT_FN_ATTRS512
2444 _mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) {
2445  return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2446  (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
2447  _MM_FROUND_CUR_DIRECTION);
2448 }
2449 
2450 #define _mm512_fmadd_round_ph(A, B, C, R) \
2451  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2452  (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2453  (__mmask32)-1, (int)(R)))
2454 
2455 #define _mm512_mask_fmadd_round_ph(A, U, B, C, R) \
2456  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2457  (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2458  (__mmask32)(U), (int)(R)))
2459 
2460 #define _mm512_mask3_fmadd_round_ph(A, B, C, U, R) \
2461  ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
2462  (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2463  (__mmask32)(U), (int)(R)))
2464 
2465 #define _mm512_maskz_fmadd_round_ph(U, A, B, C, R) \
2466  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2467  (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2468  (__mmask32)(U), (int)(R)))
2469 
2470 #define _mm512_fmsub_round_ph(A, B, C, R) \
2471  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2472  (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2473  (__mmask32)-1, (int)(R)))
2474 
2475 #define _mm512_mask_fmsub_round_ph(A, U, B, C, R) \
2476  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2477  (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2478  (__mmask32)(U), (int)(R)))
2479 
2480 #define _mm512_maskz_fmsub_round_ph(U, A, B, C, R) \
2481  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2482  (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2483  (__mmask32)(U), (int)(R)))
2484 
2485 #define _mm512_fnmadd_round_ph(A, B, C, R) \
2486  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2487  (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2488  (__mmask32)-1, (int)(R)))
2489 
2490 #define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R) \
2491  ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
2492  -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2493  (__mmask32)(U), (int)(R)))
2494 
2495 #define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R) \
2496  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2497  -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2498  (__mmask32)(U), (int)(R)))
2499 
2500 #define _mm512_fnmsub_round_ph(A, B, C, R) \
2501  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2502  (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2503  (__mmask32)-1, (int)(R)))
2504 
2505 #define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R) \
2506  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2507  -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2508  (__mmask32)(U), (int)(R)))
2509 
2510 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_ph(__m512h __A,
2511  __m512h __B,
2512  __m512h __C) {
2513  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2514  (__v32hf)__C, (__mmask32)-1,
2515  _MM_FROUND_CUR_DIRECTION);
2516 }
2517 
2518 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2519 _mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2520  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2521  (__v32hf)__C, (__mmask32)__U,
2522  _MM_FROUND_CUR_DIRECTION);
2523 }
2524 
2525 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2526 _mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2527  return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B,
2528  (__v32hf)__C, (__mmask32)__U,
2529  _MM_FROUND_CUR_DIRECTION);
2530 }
2531 
2532 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2533 _mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2534  return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B,
2535  (__v32hf)__C, (__mmask32)__U,
2536  _MM_FROUND_CUR_DIRECTION);
2537 }
2538 
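/* Usage sketch (editorial addition): packed FP16 fused multiply-add over all
 * 32 lanes.
 *
 *   __m512h a = _mm512_set1_ph((_Float16)2.0);
 *   __m512h b = _mm512_set1_ph((_Float16)3.0);
 *   __m512h c = _mm512_set1_ph((_Float16)1.0);
 *   __m512h r = _mm512_fmadd_ph(a, b, c);  // every lane: 2*3 + 1 == 7
 */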
2539 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmsub_ph(__m512h __A,
2540  __m512h __B,
2541  __m512h __C) {
2542  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2543  -(__v32hf)__C, (__mmask32)-1,
2544  _MM_FROUND_CUR_DIRECTION);
2545 }
2546 
2547 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2548 _mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2549  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2550  -(__v32hf)__C, (__mmask32)__U,
2551  _MM_FROUND_CUR_DIRECTION);
2552 }
2553 
2554 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2555 _mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2556  return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2557  (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2558  _MM_FROUND_CUR_DIRECTION);
2559 }
2560 
2561 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmadd_ph(__m512h __A,
2562  __m512h __B,
2563  __m512h __C) {
2564  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2565  (__v32hf)__C, (__mmask32)-1,
2566  _MM_FROUND_CUR_DIRECTION);
2567 }
2568 
2569 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2570 _mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2571  return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2572  (__v32hf)__C, (__mmask32)__U,
2573  _MM_FROUND_CUR_DIRECTION);
2574 }
2575 
2576 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2577 _mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2578  return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B,
2579  (__v32hf)__C, (__mmask32)__U,
2580  _MM_FROUND_CUR_DIRECTION);
2581 }
2582 
2583 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmsub_ph(__m512h __A,
2584  __m512h __B,
2585  __m512h __C) {
2586  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2587  -(__v32hf)__C, (__mmask32)-1,
2588  _MM_FROUND_CUR_DIRECTION);
2589 }
2590 
2591 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2592 _mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2593  return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2594  -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2595  _MM_FROUND_CUR_DIRECTION);
2596 }
2597 
2598 #define _mm512_fmaddsub_round_ph(A, B, C, R) \
2599  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2600  (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2601  (__mmask32)-1, (int)(R)))
2602 
2603 #define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R) \
2604  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2605  (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2606  (__mmask32)(U), (int)(R)))
2607 
2608 #define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R) \
2609  ((__m512h)__builtin_ia32_vfmaddsubph512_mask3( \
2610  (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2611  (__mmask32)(U), (int)(R)))
2612 
2613 #define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R) \
2614  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
2615  (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2616  (__mmask32)(U), (int)(R)))
2617 
2618 #define _mm512_fmsubadd_round_ph(A, B, C, R) \
2619  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2620  (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2621  (__mmask32)-1, (int)(R)))
2622 
2623 #define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R) \
2624  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2625  (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2626  (__mmask32)(U), (int)(R)))
2627 
2628 #define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R) \
2629  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
2630  (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2631  (__mmask32)(U), (int)(R)))
2632 
2633 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2634 _mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
2635  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2636  (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
2637  _MM_FROUND_CUR_DIRECTION);
2638 }
2639 
2640 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2641 _mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2642  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2643  (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2644  _MM_FROUND_CUR_DIRECTION);
2645 }
2646 
2647 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2648 _mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2649  return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
2650  (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2651  _MM_FROUND_CUR_DIRECTION);
2652 }
2653 
2654 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2655 _mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2656  return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2657  (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2658  _MM_FROUND_CUR_DIRECTION);
2659 }
2660 
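/* Semantics note (editorial addition): fmaddsub subtracts __C in the
 * even-indexed lanes and adds it in the odd-indexed lanes; fmsubadd is the
 * mirror image (add in even lanes, subtract in odd lanes).
 *
 *   __m512h r = _mm512_fmaddsub_ph(a, b, c);
 *   // r[2k] == a[2k]*b[2k] - c[2k]; r[2k+1] == a[2k+1]*b[2k+1] + c[2k+1]
 */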
2661 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2662 _mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
2663  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2664  (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
2665  _MM_FROUND_CUR_DIRECTION);
2666 }
2667 
2668 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2669 _mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2670  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2671  (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2672  _MM_FROUND_CUR_DIRECTION);
2673 }
2674 
2675 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2676 _mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2677  return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2678  (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2679  _MM_FROUND_CUR_DIRECTION);
2680 }
2681 
2682 #define _mm512_mask3_fmsub_round_ph(A, B, C, U, R) \
2683  ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
2684  (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2685  (__mmask32)(U), (int)(R)))
2686 
2687 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2688 _mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2689  return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B,
2690  (__v32hf)__C, (__mmask32)__U,
2691  _MM_FROUND_CUR_DIRECTION);
2692 }
2693 
2694 #define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R) \
2695  ((__m512h)__builtin_ia32_vfmsubaddph512_mask3( \
2696  (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2697  (__mmask32)(U), (int)(R)))
2698 
2699 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2700 _mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2701  return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
2702  (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2703  _MM_FROUND_CUR_DIRECTION);
2704 }
2705 
2706 #define _mm512_mask_fnmadd_round_ph(A, U, B, C, R) \
2707  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2708  (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2709  (__mmask32)(U), (int)(R)))
2710 
2711 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2712 _mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2713  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2714  (__v32hf)__C, (__mmask32)__U,
2715  _MM_FROUND_CUR_DIRECTION);
2716 }
2717 
2718 #define _mm512_mask_fnmsub_round_ph(A, U, B, C, R) \
2719  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2720  (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2721  (__mmask32)(U), (int)(R)))
2722 
2723 #define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R) \
2724  ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
2725  -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2726  (__mmask32)(U), (int)(R)))
2727 
2728 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2729 _mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2730  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2731  -(__v32hf)__C, (__mmask32)__U,
2732  _MM_FROUND_CUR_DIRECTION);
2733 }
2734 
2735 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2736 _mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2737  return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2738  (__v32hf)__C, (__mmask32)__U,
2739  _MM_FROUND_CUR_DIRECTION);
2740 }
2741 
2742 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sh(__m128h __W,
2743  __m128h __A,
2744  __m128h __B) {
2745  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2746  (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2747 }
2748 
2749 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_sh(__m128h __W,
2750  __mmask8 __U,
2751  __m128h __A,
2752  __m128h __B) {
2753  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2754  (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2755 }
2756 
2757 #define _mm_fmadd_round_sh(A, B, C, R) \
2758  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2759  (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2760  (__mmask8)-1, (int)(R)))
2761 
2762 #define _mm_mask_fmadd_round_sh(W, U, A, B, R) \
2763  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2764  (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
2765  (__mmask8)(U), (int)(R)))
2766 
2767 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2768 _mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2769  return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C,
2770  (__mmask8)__U,
2771  _MM_FROUND_CUR_DIRECTION);
2772 }
2773 
2774 #define _mm_maskz_fmadd_round_sh(U, A, B, C, R) \
2775  ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2776  (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2777  (__mmask8)(U), (int)(R)))
2778 
2779 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2780 _mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2781  return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2782  (__mmask8)__U,
2783  _MM_FROUND_CUR_DIRECTION);
2784 }
2785 
2786 #define _mm_mask3_fmadd_round_sh(W, X, Y, U, R) \
2787  ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
2788  (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2789  (__mmask8)(U), (int)(R)))
2790 
2791 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_sh(__m128h __W,
2792  __m128h __A,
2793  __m128h __B) {
2794  return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2795  -(__v8hf)__B, (__mmask8)-1,
2796  _MM_FROUND_CUR_DIRECTION);
2797 }
2798 
2799 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_sh(__m128h __W,
2800  __mmask8 __U,
2801  __m128h __A,
2802  __m128h __B) {
2803  return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2804  -(__v8hf)__B, (__mmask8)__U,
2805  _MM_FROUND_CUR_DIRECTION);
2806 }
2807 
2808 #define _mm_fmsub_round_sh(A, B, C, R) \
2809  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2810  (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2811  (__mmask8)-1, (int)(R)))
2812 
2813 #define _mm_mask_fmsub_round_sh(W, U, A, B, R) \
2814  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2815  (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
2816  (__mmask8)(U), (int)(R)))
2817 
2818 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2819 _mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2820  return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B,
2821  -(__v8hf)__C, (__mmask8)__U,
2822  _MM_FROUND_CUR_DIRECTION);
2823 }
2824 
2825 #define _mm_maskz_fmsub_round_sh(U, A, B, C, R) \
2826  ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2827  (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2828  (__mmask8)(U), (int)(R)))
2829 
2830 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2831 _mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2832  return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2833  (__mmask8)__U,
2834  _MM_FROUND_CUR_DIRECTION);
2835 }
2836 
2837 #define _mm_mask3_fmsub_round_sh(W, X, Y, U, R) \
2838  ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
2839  (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2840  (__mmask8)(U), (int)(R)))
2841 
2842 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_sh(__m128h __W,
2843  __m128h __A,
2844  __m128h __B) {
2845  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2846  (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2847 }
2848 
2849 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2850 _mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
2851  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2852  (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2853 }
2854 
2855 #define _mm_fnmadd_round_sh(A, B, C, R) \
2856  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2857  (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2858  (__mmask8)-1, (int)(R)))
2859 
2860 #define _mm_mask_fnmadd_round_sh(W, U, A, B, R) \
2861  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2862  (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
2863  (__mmask8)(U), (int)(R)))
2864 
2865 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2866 _mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2867  return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C,
2868  (__mmask8)__U,
2869  _MM_FROUND_CUR_DIRECTION);
2870 }
2871 
2872 #define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \
2873  ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2874  (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2875  (__mmask8)(U), (int)(R)))
2876 
2877 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2878 _mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2879  return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2880  (__mmask8)__U,
2881  _MM_FROUND_CUR_DIRECTION);
2882 }
2883 
2884 #define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R) \
2885  ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
2886  (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2887  (__mmask8)(U), (int)(R)))
2888 
2889 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_sh(__m128h __W,
2890  __m128h __A,
2891  __m128h __B) {
2892  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2893  (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2894 }
2895 
2896 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2897 _mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
2898  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2899  (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2900 }
2901 
2902 #define _mm_fnmsub_round_sh(A, B, C, R) \
2903  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2904  (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2905  (__mmask8)-1, (int)(R)))
2906 
2907 #define _mm_mask_fnmsub_round_sh(W, U, A, B, R) \
2908  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2909  (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
2910  (__mmask8)(U), (int)(R)))
2911 
2912 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2913 _mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2914  return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C,
2915  (__mmask8)__U,
2916  _MM_FROUND_CUR_DIRECTION);
2917 }
2918 
2919 #define _mm_maskz_fnmsub_round_sh(U, A, B, C, R) \
2920  ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2921  (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2922  (__mmask8)(U), (int)(R)))
2923 
2924 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2925 _mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2926  return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2927  (__mmask8)__U,
2928  _MM_FROUND_CUR_DIRECTION);
2929 }
2930 
2931 #define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R) \
2932  ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
2933  (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2934  (__mmask8)(U), (int)(R)))
2935 
2936 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A,
2937  __m128h __B,
2938  __m128h __C) {
2939  return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2940  (__v4sf)__C, (__mmask8)-1,
2941  _MM_FROUND_CUR_DIRECTION);
2942 }
2943 
2944 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2945 _mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
2946  return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask(
2947  (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
2948 }
2949 
2950 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2951 _mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2952  return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
2953  (__v4sf)__C, (__mmask8)__U,
2954  _MM_FROUND_CUR_DIRECTION);
2955 }
2956 
2957 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2958 _mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
2959  return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(
2960  (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
2961 }
2962 
2963 #define _mm_fcmadd_round_sch(A, B, C, R) \
2964  ((__m128h)__builtin_ia32_vfcmaddcsh_mask( \
2965  (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2966  (__mmask8)-1, (int)(R)))
2967 
2968 #define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \
2969  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask( \
2970  (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2971  (__mmask8)(U), (int)(R)))
2972 
2973 #define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \
2974  ((__m128h)__builtin_ia32_vfcmaddcsh_maskz( \
2975  (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2976  (__mmask8)(U), (int)(R)))
2977 
2978 #define _mm_mask3_fcmadd_round_sch(A, B, C, U, R) \
2979  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3( \
2980  (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2981  (__mmask8)(U), (int)(R)))
2982 
2983 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A,
2984  __m128h __B,
2985  __m128h __C) {
2986  return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2987  (__v4sf)__C, (__mmask8)-1,
2988  _MM_FROUND_CUR_DIRECTION);
2989 }
2990 
2991 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2992 _mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
2993  return (__m128h)__builtin_ia32_vfmaddcsh_round_mask(
2994  (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
2995 }
2996 
2997 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2998 _mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2999  return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
3000  (__v4sf)__C, (__mmask8)__U,
3001  _MM_FROUND_CUR_DIRECTION);
3002 }
3003 
3004 static __inline__ __m128h __DEFAULT_FN_ATTRS128
3005 _mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
3006  return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(
3007  (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
3008 }
3009 
3010 #define _mm_fmadd_round_sch(A, B, C, R) \
3011  ((__m128h)__builtin_ia32_vfmaddcsh_mask( \
3012  (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3013  (__mmask8)-1, (int)(R)))
3014 
3015 #define _mm_mask_fmadd_round_sch(A, U, B, C, R) \
3016  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask( \
3017  (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3018  (__mmask8)(U), (int)(R)))
3019 
3020 #define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \
3021  ((__m128h)__builtin_ia32_vfmaddcsh_maskz( \
3022  (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3023  (__mmask8)(U), (int)(R)))
3024 
3025 #define _mm_mask3_fmadd_round_sch(A, B, C, U, R) \
3026  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3( \
3027  (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3028  (__mmask8)(U), (int)(R)))
3029 
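/* Illustrative sketch (editorial addition, not part of the upstream header):
 * the *_sch intrinsics treat the low 32 bits of each operand as one complex
 * value, real part in the even _Float16 lane and imaginary part in the odd
 * lane. _mm_fmadd_sch multiplies two such values and adds a third;
 * _mm_fcmadd_sch conjugates one multiplicand first (see the Intel SDM on
 * VF[C]MADDCSH for the exact operand). The values below are hypothetical.
 *
 *   __m128h a = _mm_setr_ph(1.0f16, 2.0f16, 0, 0, 0, 0, 0, 0); // 1 + 2i
 *   __m128h b = _mm_setr_ph(3.0f16, 4.0f16, 0, 0, 0, 0, 0, 0); // 3 + 4i
 *   __m128h c = _mm_setr_ph(5.0f16, 6.0f16, 0, 0, 0, 0, 0, 0); // 5 + 6i
 *   __m128h r = _mm_fmadd_sch(a, b, c); // (1+2i)*(3+4i) + (5+6i) = 0 + 16i
 */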
3030 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A,
3031  __m128h __B) {
3032  return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3033  (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
3034  _MM_FROUND_CUR_DIRECTION);
3035 }
3036 
3037 static __inline__ __m128h __DEFAULT_FN_ATTRS128
3038 _mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
3039  return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3040  (__v4sf)__W, (__mmask8)__U,
3041  _MM_FROUND_CUR_DIRECTION);
3042 }
3043 
3044 static __inline__ __m128h __DEFAULT_FN_ATTRS128
3045 _mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
3046  return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3047  (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
3048  _MM_FROUND_CUR_DIRECTION);
3049 }
3050 
3051 #define _mm_fcmul_round_sch(A, B, R) \
3052  ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
3053  (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3054  (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
3055 
3056 #define _mm_mask_fcmul_round_sch(W, U, A, B, R) \
3057  ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
3058  (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \
3059  (__mmask8)(U), (int)(R)))
3060 
3061 #define _mm_maskz_fcmul_round_sch(U, A, B, R) \
3062  ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
3063  (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3064  (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
3065 
3066 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_sch(__m128h __A,
3067  __m128h __B) {
3068  return (__m128h)__builtin_ia32_vfmulcsh_mask(
3069  (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
3070  _MM_FROUND_CUR_DIRECTION);
3071 }
3072 
3073 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_sch(__m128h __W,
3074  __mmask8 __U,
3075  __m128h __A,
3076  __m128h __B) {
3077  return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3078  (__v4sf)__W, (__mmask8)__U,
3079  _MM_FROUND_CUR_DIRECTION);
3080 }
3081 
3082 static __inline__ __m128h __DEFAULT_FN_ATTRS128
3083 _mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
3084  return (__m128h)__builtin_ia32_vfmulcsh_mask(
3085  (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
3086  _MM_FROUND_CUR_DIRECTION);
3087 }
3088 
3089 #define _mm_fmul_round_sch(A, B, R) \
3090  ((__m128h)__builtin_ia32_vfmulcsh_mask( \
3091  (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3092  (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
3093 
3094 #define _mm_mask_fmul_round_sch(W, U, A, B, R) \
3095  ((__m128h)__builtin_ia32_vfmulcsh_mask( \
3096  (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \
3097  (__mmask8)(U), (int)(R)))
3098 
3099 #define _mm_maskz_fmul_round_sch(U, A, B, R) \
3100  ((__m128h)__builtin_ia32_vfmulcsh_mask( \
3101  (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3102  (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
3103 
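/* Illustrative sketch (editorial addition, not part of the upstream header):
 * _mm_fmul_sch and _mm_fcmul_sch multiply the low complex values of their
 * operands, the latter conjugating one multiplicand; the _mm_undefined_ph()
 * and _mm_setzero_ph() arguments above only supply masked-off lanes. The
 * values below are hypothetical.
 *
 *   __m128h a = _mm_setr_ph(1.0f16, 2.0f16, 0, 0, 0, 0, 0, 0); // 1 + 2i
 *   __m128h b = _mm_setr_ph(3.0f16, 4.0f16, 0, 0, 0, 0, 0, 0); // 3 + 4i
 *   __m128h r = _mm_fmul_sch(a, b);     // (1+2i)*(3+4i) = -5 + 10i
 */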
3104 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmul_pch(__m512h __A,
3105  __m512h __B) {
3106  return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3107  (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
3108  _MM_FROUND_CUR_DIRECTION);
3109 }
3110 
3111 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3112 _mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
3113  return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B,
3114  (__v16sf)__W, (__mmask16)__U,
3115  _MM_FROUND_CUR_DIRECTION);
3116 }
3117 
3118 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3119 _mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
3120  return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3121  (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
3122  _MM_FROUND_CUR_DIRECTION);
3123 }
3124 
3125 #define _mm512_fcmul_round_pch(A, B, R) \
3126  ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
3127  (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3128  (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))
3129 
3130 #define _mm512_mask_fcmul_round_pch(W, U, A, B, R) \
3131  ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
3132  (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \
3133  (__mmask16)(U), (int)(R)))
3134 
3135 #define _mm512_maskz_fcmul_round_pch(U, A, B, R) \
3136  ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
3137  (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3138  (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
3139 
3140 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmul_pch(__m512h __A,
3141  __m512h __B) {
3142  return (__m512h)__builtin_ia32_vfmulcph512_mask(
3143  (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
3144  _MM_FROUND_CUR_DIRECTION);
3145 }
3146 
3147 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3148 _mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
3149  return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B,
3150  (__v16sf)__W, (__mmask16)__U,
3151  _MM_FROUND_CUR_DIRECTION);
3152 }
3153 
3154 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3155 _mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
3156  return (__m512h)__builtin_ia32_vfmulcph512_mask(
3157  (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
3158  _MM_FROUND_CUR_DIRECTION);
3159 }
3160 
3161 #define _mm512_fmul_round_pch(A, B, R) \
3162  ((__m512h)__builtin_ia32_vfmulcph512_mask( \
3163  (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3164  (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))
3165 
3166 #define _mm512_mask_fmul_round_pch(W, U, A, B, R) \
3167  ((__m512h)__builtin_ia32_vfmulcph512_mask( \
3168  (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \
3169  (__mmask16)(U), (int)(R)))
3170 
3171 #define _mm512_maskz_fmul_round_pch(U, A, B, R) \
3172  ((__m512h)__builtin_ia32_vfmulcph512_mask( \
3173  (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3174  (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
3175 
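/* Illustrative sketch (editorial addition, not part of the upstream header):
 * for the packed complex *_pch operations a __m512h holds 16 complex values,
 * which is why their masks are __mmask16 (one bit per complex element)
 * rather than the __mmask32 used by the real-valued ph operations. The
 * values below are hypothetical.
 *
 *   __m512h a = _mm512_set1_ph(1.0f16); // every complex lane is 1 + 1i
 *   __m512h b = _mm512_set1_ph(2.0f16); // every complex lane is 2 + 2i
 *   __m512h r = _mm512_fmul_pch(a, b);  // (1+1i)*(2+2i) = 0 + 4i per lane
 */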
3176 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A,
3177  __m512h __B,
3178  __m512h __C) {
3179  return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3180  (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1,
3181  _MM_FROUND_CUR_DIRECTION);
3182 }
3183 
3184 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3185 _mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
3186  return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
3187  (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3188  _MM_FROUND_CUR_DIRECTION);
3189 }
3190 
3191 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3192 _mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
3193  return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3194  (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3195  _MM_FROUND_CUR_DIRECTION);
3196 }
3197 
3198 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3199 _mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3200  return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
3201  (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3202  _MM_FROUND_CUR_DIRECTION);
3203 }
3204 
3205 #define _mm512_fcmadd_round_pch(A, B, C, R) \
3206  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
3207  (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3208  (__mmask16)-1, (int)(R)))
3209 
3210 #define _mm512_mask_fcmadd_round_pch(A, U, B, C, R) \
3211  ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \
3212  (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3213  (__mmask16)(U), (int)(R)))
3214 
3215 #define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R) \
3216  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
3217  (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3218  (__mmask16)(U), (int)(R)))
3219 
3220 #define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R) \
3221  ((__m512h)__builtin_ia32_vfcmaddcph512_maskz( \
3222  (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3223  (__mmask16)(U), (int)(R)))
3224 
3225 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A,
3226  __m512h __B,
3227  __m512h __C) {
3228  return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B,
3229  (__v16sf)__C, (__mmask16)-1,
3230  _MM_FROUND_CUR_DIRECTION);
3231 }
3232 
3233 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3234 _mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
3235  return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
3236  (__v16sf)__C, (__mmask16)__U,
3237  _MM_FROUND_CUR_DIRECTION);
3238 }
3239 
3240 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3241 _mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
3242  return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
3243  (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3244  _MM_FROUND_CUR_DIRECTION);
3245 }
3246 
3247 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3248 _mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3249  return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
3250  (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3251  _MM_FROUND_CUR_DIRECTION);
3252 }
3253 
3254 #define _mm512_fmadd_round_pch(A, B, C, R) \
3255  ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
3256  (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3257  (__mmask16)-1, (int)(R)))
3258 
3259 #define _mm512_mask_fmadd_round_pch(A, U, B, C, R) \
3260  ((__m512h)__builtin_ia32_vfmaddcph512_mask( \
3261  (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3262  (__mmask16)(U), (int)(R)))
3263 
3264 #define _mm512_mask3_fmadd_round_pch(A, B, C, U, R) \
3265  ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
3266  (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3267  (__mmask16)(U), (int)(R)))
3268 
3269 #define _mm512_maskz_fmadd_round_pch(U, A, B, C, R) \
3270  ((__m512h)__builtin_ia32_vfmaddcph512_maskz( \
3271  (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3272  (__mmask16)(U), (int)(R)))
3273 
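/* Illustrative sketch (editorial addition, not part of the upstream header):
 * for the packed complex FMA above, the mask form keeps lanes of the first
 * operand where a mask bit is clear, the mask3 form keeps lanes of the
 * accumulator, and the maskz form zeroes them. The values below are
 * hypothetical.
 *
 *   __m512h a = _mm512_set1_ph(1.0f16), b = a, c = a; // placeholder data
 *   __mmask16 m = 0x00FF;               // select the low 8 complex lanes
 *   __m512h r = _mm512_maskz_fmadd_pch(m, a, b, c);
 *   // low 8 complex lanes: a*b + c; high 8 complex lanes: zeroed
 */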
3274 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3275 _mm512_reduce_add_ph(__m512h __W) {
3276  return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
3277 }
3278 
3279 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3280 _mm512_reduce_mul_ph(__m512h __W) {
3281  return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W);
3282 }
3283 
3284 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3285 _mm512_reduce_max_ph(__m512h __V) {
3286  return __builtin_ia32_reduce_fmax_ph512(__V);
3287 }
3288 
3289 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3290 _mm512_reduce_min_ph(__m512h __V) {
3291  return __builtin_ia32_reduce_fmin_ph512(__V);
3292 }
3293 
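/* Illustrative sketch (editorial addition, not part of the upstream header):
 * the reductions above fold all 32 _Float16 lanes into one scalar;
 * reduce_add starts from -0.0 (the additive identity that preserves signed
 * zero) and reduce_mul from 1.0. The values below are hypothetical.
 *
 *   __m512h v = _mm512_set1_ph(1.0f16);
 *   _Float16 s = _mm512_reduce_add_ph(v);  // 32 * 1.0 = 32.0
 *   _Float16 mx = _mm512_reduce_max_ph(v); // 1.0
 */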
3294 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3295 _mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) {
3296  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W,
3297  (__v32hf)__A);
3298 }
3299 
3300 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3301 _mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
3302  return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
3303  (__v32hi)__B);
3304 }
3305 
3306 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3307 _mm512_permutexvar_ph(__m512i __A, __m512h __B) {
3308  return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
3309 }
3310 
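/* Illustrative sketch (editorial addition, not part of the upstream header):
 * _mm512_mask_blend_ph selects per lane between two vectors (taking __W
 * where a mask bit is set), and _mm512_permutexvar_ph gathers lanes of its
 * second operand by the 16-bit indices in the first. The values below are
 * hypothetical.
 *
 *   __m512h v = _mm512_set1_ph(3.0f16);
 *   __m512i idx = _mm512_setzero_si512();           // every index is 0
 *   __m512h bc = _mm512_permutexvar_ph(idx, v);     // broadcast lane 0 of v
 *   __m512h bl = _mm512_mask_blend_ph(0x1u, v, bc); // lane 0 from bc
 */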
3311 // The intrinsics below are aliases for the f*mul_*ch intrinsics.
3312 #define _mm512_mul_pch(A, B) _mm512_fmul_pch(A, B)
3313 #define _mm512_mask_mul_pch(W, U, A, B) _mm512_mask_fmul_pch(W, U, A, B)
3314 #define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch(U, A, B)
3315 #define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch(A, B, R)
3316 #define _mm512_mask_mul_round_pch(W, U, A, B, R) \
3317  _mm512_mask_fmul_round_pch(W, U, A, B, R)
3318 #define _mm512_maskz_mul_round_pch(U, A, B, R) \
3319  _mm512_maskz_fmul_round_pch(U, A, B, R)
3320 
3321 #define _mm512_cmul_pch(A, B) _mm512_fcmul_pch(A, B)
3322 #define _mm512_mask_cmul_pch(W, U, A, B) _mm512_mask_fcmul_pch(W, U, A, B)
3323 #define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch(U, A, B)
3324 #define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch(A, B, R)
3325 #define _mm512_mask_cmul_round_pch(W, U, A, B, R) \
3326  _mm512_mask_fcmul_round_pch(W, U, A, B, R)
3327 #define _mm512_maskz_cmul_round_pch(U, A, B, R) \
3328  _mm512_maskz_fcmul_round_pch(U, A, B, R)
3329 
3330 #define _mm_mul_sch(A, B) _mm_fmul_sch(A, B)
3331 #define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch(W, U, A, B)
3332 #define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch(U, A, B)
3333 #define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch(A, B, R)
3334 #define _mm_mask_mul_round_sch(W, U, A, B, R) \
3335  _mm_mask_fmul_round_sch(W, U, A, B, R)
3336 #define _mm_maskz_mul_round_sch(U, A, B, R) _mm_maskz_fmul_round_sch(U, A, B, R)
3337 
3338 #define _mm_cmul_sch(A, B) _mm_fcmul_sch(A, B)
3339 #define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch(W, U, A, B)
3340 #define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch(U, A, B)
3341 #define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch(A, B, R)
3342 #define _mm_mask_cmul_round_sch(W, U, A, B, R) \
3343  _mm_mask_fcmul_round_sch(W, U, A, B, R)
3344 #define _mm_maskz_cmul_round_sch(U, A, B, R) \
3345  _mm_maskz_fcmul_round_sch(U, A, B, R)
3346 
3347 #undef __DEFAULT_FN_ATTRS128
3348 #undef __DEFAULT_FN_ATTRS256
3349 #undef __DEFAULT_FN_ATTRS512
3350 
3351 #endif
3352 #endif