clang  19.0.0git
avx512bf16intrin.h
Go to the documentation of this file.
1 /*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 #ifndef __IMMINTRIN_H
10 #error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
11 #endif
12 
13 #ifdef __SSE2__
14 
15 #ifndef __AVX512BF16INTRIN_H
16 #define __AVX512BF16INTRIN_H
17 
18 typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64)));
19 typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
20 typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
21 
22 #define __DEFAULT_FN_ATTRS512 \
23  __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16,evex512"), \
24  __min_vector_width__(512)))
25 #define __DEFAULT_FN_ATTRS \
26  __attribute__((__always_inline__, __nodebug__, \
27  __target__("avx512bf16,no-evex512")))
28 
29 /// Convert One BF16 Data to One Single Float Data.
30 ///
31 /// \headerfile <x86intrin.h>
32 ///
33 /// This intrinsic does not correspond to a specific instruction.
34 ///
35 /// \param __A
36 /// A bfloat data.
37 /// \returns A float data whose sign field and exponent field keep unchanged,
38 /// and fraction field is extended to 23 bits.
39 static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
40  return __builtin_ia32_cvtsbf162ss_32(__A);
41 }
42 
43 /// Convert Two Packed Single Data to One Packed BF16 Data.
44 ///
45 /// \headerfile <x86intrin.h>
46 ///
47 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
48 ///
49 /// \param __A
50 /// A 512-bit vector of [16 x float].
51 /// \param __B
52 /// A 512-bit vector of [16 x float].
53 /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
54 /// conversion of __B, and higher 256 bits come from conversion of __A.
55 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
56 _mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) {
57  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A,
58  (__v16sf) __B);
59 }
60 
61 /// Convert Two Packed Single Data to One Packed BF16 Data.
62 ///
63 /// \headerfile <x86intrin.h>
64 ///
65 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
66 ///
67 /// \param __A
68 /// A 512-bit vector of [16 x float].
69 /// \param __B
70 /// A 512-bit vector of [16 x float].
71 /// \param __W
72 /// A 512-bit vector of [32 x bfloat].
73 /// \param __U
74 /// A 32-bit mask value specifying what is chosen for each element.
75 /// A 1 means conversion of __A or __B. A 0 means element from __W.
76 /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
77 /// conversion of __B, and higher 256 bits come from conversion of __A.
78 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
79 _mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
80  return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
81  (__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
82  (__v32bf)__W);
83 }
84 
85 /// Convert Two Packed Single Data to One Packed BF16 Data.
86 ///
87 /// \headerfile <x86intrin.h>
88 ///
89 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
90 ///
91 /// \param __A
92 /// A 512-bit vector of [16 x float].
93 /// \param __B
94 /// A 512-bit vector of [16 x float].
95 /// \param __U
96 /// A 32-bit mask value specifying what is chosen for each element.
97 /// A 1 means conversion of __A or __B. A 0 means element is zero.
98 /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
99 /// conversion of __B, and higher 256 bits come from conversion of __A.
100 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
101 _mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
102  return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
103  (__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
104  (__v32bf)_mm512_setzero_si512());
105 }
106 
107 /// Convert Packed Single Data to Packed BF16 Data.
108 ///
109 /// \headerfile <x86intrin.h>
110 ///
111 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
112 ///
113 /// \param __A
114 /// A 512-bit vector of [16 x float].
115 /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
116 static __inline__ __m256bh __DEFAULT_FN_ATTRS512
117 _mm512_cvtneps_pbh(__m512 __A) {
118  return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
119  (__v16bf)_mm256_undefined_si256(),
120  (__mmask16)-1);
121 }
122 
123 /// Convert Packed Single Data to Packed BF16 Data.
124 ///
125 /// \headerfile <x86intrin.h>
126 ///
127 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
128 ///
129 /// \param __A
130 /// A 512-bit vector of [16 x float].
131 /// \param __W
132 /// A 256-bit vector of [16 x bfloat].
133 /// \param __U
134 /// A 16-bit mask value specifying what is chosen for each element.
135 /// A 1 means conversion of __A. A 0 means element from __W.
136 /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
137 static __inline__ __m256bh __DEFAULT_FN_ATTRS512
138 _mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
139  return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
140  (__v16bf)__W,
141  (__mmask16)__U);
142 }
143 
144 /// Convert Packed Single Data to Packed BF16 Data.
145 ///
146 /// \headerfile <x86intrin.h>
147 ///
148 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
149 ///
150 /// \param __A
151 /// A 512-bit vector of [16 x float].
152 /// \param __U
153 /// A 16-bit mask value specifying what is chosen for each element.
154 /// A 1 means conversion of __A. A 0 means element is zero.
155 /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
156 static __inline__ __m256bh __DEFAULT_FN_ATTRS512
157 _mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
158  return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
159  (__v16bf)_mm256_setzero_si256(),
160  (__mmask16)__U);
161 }
162 
163 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
164 ///
165 /// \headerfile <x86intrin.h>
166 ///
167 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
168 ///
169 /// \param __A
170 /// A 512-bit vector of [32 x bfloat].
171 /// \param __B
172 /// A 512-bit vector of [32 x bfloat].
173 /// \param __D
174 /// A 512-bit vector of [16 x float].
175 /// \returns A 512-bit vector of [16 x float] comes from Dot Product of
176 /// __A, __B and __D
177 static __inline__ __m512 __DEFAULT_FN_ATTRS512
178 _mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) {
179  return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D,
180  (__v32bf) __A,
181  (__v32bf) __B);
182 }
183 
184 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
185 ///
186 /// \headerfile <x86intrin.h>
187 ///
188 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
189 ///
190 /// \param __A
191 /// A 512-bit vector of [32 x bfloat].
192 /// \param __B
193 /// A 512-bit vector of [32 x bfloat].
194 /// \param __D
195 /// A 512-bit vector of [16 x float].
196 /// \param __U
197 /// A 16-bit mask value specifying what is chosen for each element.
198 /// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
199 /// \returns A 512-bit vector of [16 x float] comes from Dot Product of
200 /// __A, __B and __D
201 static __inline__ __m512 __DEFAULT_FN_ATTRS512
202 _mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) {
203  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
204  (__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
205  (__v16sf)__D);
206 }
207 
208 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
209 ///
210 /// \headerfile <x86intrin.h>
211 ///
212 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
213 ///
214 /// \param __A
215 /// A 512-bit vector of [32 x bfloat].
216 /// \param __B
217 /// A 512-bit vector of [32 x bfloat].
218 /// \param __D
219 /// A 512-bit vector of [16 x float].
220 /// \param __U
221 /// A 16-bit mask value specifying what is chosen for each element.
222 /// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
223 /// \returns A 512-bit vector of [16 x float] comes from Dot Product of
224 /// __A, __B and __D
225 static __inline__ __m512 __DEFAULT_FN_ATTRS512
226 _mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
227  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
228  (__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
229  (__v16sf)_mm512_setzero_si512());
230 }
231 
232 /// Convert Packed BF16 Data to Packed float Data.
233 ///
234 /// \headerfile <x86intrin.h>
235 ///
236 /// \param __A
237 /// A 256-bit vector of [16 x bfloat].
238 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
239 static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
240  return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
241  (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
242 }
243 
244 /// Convert Packed BF16 Data to Packed float Data using zeroing mask.
245 ///
246 /// \headerfile <x86intrin.h>
247 ///
248 /// \param __U
249 /// A 16-bit mask. Elements are zeroed out when the corresponding mask
250 /// bit is not set.
251 /// \param __A
252 /// A 256-bit vector of [16 x bfloat].
253 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
254 static __inline__ __m512 __DEFAULT_FN_ATTRS512
255 _mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
256  return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
257  (__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16));
258 }
259 
260 /// Convert Packed BF16 Data to Packed float Data using merging mask.
261 ///
262 /// \headerfile <x86intrin.h>
263 ///
264 /// \param __S
265 /// A 512-bit vector of [16 x float]. Elements are copied from __S when
266 /// the corresponding mask bit is not set.
267 /// \param __U
268 /// A 16-bit mask.
269 /// \param __A
270 /// A 256-bit vector of [16 x bfloat].
271 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
272 static __inline__ __m512 __DEFAULT_FN_ATTRS512
273 _mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
275  (__m512i)__S, (__mmask16)__U,
276  (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
277 }
278 
/* Drop the file-local attribute helper macros now that every function that
 * uses them has been defined. */
#undef __DEFAULT_FN_ATTRS512
#undef __DEFAULT_FN_ATTRS
281 
282 #endif
283 #endif
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
#define __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS512
unsigned int __mmask32
static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A)
static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_castsi512_ps(__m512i __A)
static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A, unsigned int __B)
unsigned short __mmask16
Definition: avx512fintrin.h:42
static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_slli_epi32(__m512i __A, unsigned int __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtepi16_epi32(__m256i __A)
static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_setzero_si512(void)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_undefined_si256(void)
Create a 256-bit integer vector with undefined values.
Definition: avxintrin.h:3666
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setzero_si256(void)
Constructs a 256-bit integer vector initialized to zero.
Definition: avxintrin.h:4353
static __inline__ void short __D
Definition: immintrin.h:468