clang  19.0.0git
avxvnniint8intrin.h
Go to the documentation of this file.
1 /*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 #ifndef __IMMINTRIN_H
10 #error \
11  "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
12 #endif
13 
14 #ifndef __AVXVNNIINT8INTRIN_H
15 #define __AVXVNNIINT8INTRIN_H
16 
17 /* Default attributes for the intrinsics in this file: always-inline, nodebug, target "avxvnniint8", and a minimum vector width of 256 or 128 bits. */
18 #define __DEFAULT_FN_ATTRS256 \
19  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
20  __min_vector_width__(256)))
21 #define __DEFAULT_FN_ATTRS128 \
22  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
23  __min_vector_width__(128)))
24 
25 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
26 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
27 /// signed 16-bit results. Sum these 4 results with the corresponding
28 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
29 ///
30 /// \headerfile <x86intrin.h>
31 ///
32 /// \code
33 /// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
34 /// \endcode
35 ///
36 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
37 /// \param __W A 128-bit vector of [4 x int] holding the 32-bit accumulators.
38 /// \param __A
39 /// A 128-bit vector of [16 x char].
40 /// \param __B
41 /// A 128-bit vector of [16 x char].
42 /// \returns
43 /// A 128-bit vector of [4 x int].
44 ///
45 /// \code{.operation}
46 /// FOR j := 0 to 3
47 /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
48 /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
49 /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
50 /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
51 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
52 /// ENDFOR
53 /// dst[MAX:128] := 0
54 /// \endcode
55 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
56  __m128i __A,
57  __m128i __B) {
58  return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A,
59  (__v4si)__B);
60 }
61 
62 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
63 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
64 /// signed 16-bit results. Sum these 4 results with the corresponding
65 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
66 ///
67 /// \headerfile <x86intrin.h>
68 ///
69 /// \code
70 /// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
71 /// \endcode
72 ///
73 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
74 /// \param __W A 256-bit vector of [8 x int] holding the 32-bit accumulators.
75 /// \param __A
76 /// A 256-bit vector of [32 x char].
77 /// \param __B
78 /// A 256-bit vector of [32 x char].
79 /// \returns
80 /// A 256-bit vector of [8 x int].
81 ///
82 /// \code{.operation}
83 /// FOR j := 0 to 7
84 /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
85 /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
86 /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
87 /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
88 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
89 /// ENDFOR
90 /// dst[MAX:256] := 0
91 /// \endcode
92 static __inline__ __m256i __DEFAULT_FN_ATTRS256
93 _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
94  return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A,
95  (__v8si)__B);
96 }
97 
98 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
99 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
100 /// signed 16-bit results. Sum these 4 results with the corresponding
101 /// 32-bit integer in \a __W with signed saturation, and store the packed
102 /// 32-bit results in \a dst.
103 ///
104 /// \headerfile <x86intrin.h>
105 ///
106 /// \code
107 /// _mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B);
108 /// \endcode
109 ///
110 /// This intrinsic corresponds to the \c VPDPBSSDS instruction.
111 /// \param __W A 128-bit vector of [4 x int] holding the 32-bit accumulators.
112 /// \param __A
113 /// A 128-bit vector of [16 x char].
114 /// \param __B
115 /// A 128-bit vector of [16 x char].
116 /// \returns
117 /// A 128-bit vector of [4 x int].
118 ///
119 /// \code{.operation}
120 /// FOR j := 0 to 3
121 /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
122 /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
123 /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
124 /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
125 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
126 /// ENDFOR
127 /// dst[MAX:128] := 0
128 /// \endcode
129 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
130  __m128i __A,
131  __m128i __B) {
132  return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A,
133  (__v4si)__B);
134 }
135 
136 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
137 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
138 /// signed 16-bit results. Sum these 4 results with the corresponding
139 /// 32-bit integer in \a __W with signed saturation, and store the packed
140 /// 32-bit results in \a dst.
141 ///
142 /// \headerfile <x86intrin.h>
143 ///
144 /// \code
145 /// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
146 /// \endcode
147 ///
148 /// This intrinsic corresponds to the \c VPDPBSSDS instruction.
149 /// \param __W A 256-bit vector of [8 x int] holding the 32-bit accumulators.
150 /// \param __A
151 /// A 256-bit vector of [32 x char].
152 /// \param __B
153 /// A 256-bit vector of [32 x char].
154 /// \returns
155 /// A 256-bit vector of [8 x int].
156 ///
157 /// \code{.operation}
158 /// FOR j := 0 to 7
159 /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
160 /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
161 /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
162 /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
163 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
164 /// ENDFOR
165 /// dst[MAX:256] := 0
166 /// \endcode
167 static __inline__ __m256i __DEFAULT_FN_ATTRS256
168 _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
169  return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A,
170  (__v8si)__B);
171 }
172 
173 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
174 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
175 /// signed 16-bit results. Sum these 4 results with the corresponding
176 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
177 ///
178 /// \headerfile <x86intrin.h>
179 ///
180 /// \code
181 /// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
182 /// \endcode
183 ///
184 /// This intrinsic corresponds to the \c VPDPBSUD instruction.
185 /// \param __W A 128-bit vector of [4 x int] holding the 32-bit accumulators.
186 /// \param __A
187 /// A 128-bit vector of [16 x char].
188 /// \param __B
189 /// A 128-bit vector of [16 x unsigned char].
190 /// \returns
191 /// A 128-bit vector of [4 x int].
192 ///
193 /// \code{.operation}
194 /// FOR j := 0 to 3
195 /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
196 /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
197 /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
198 /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
199 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
200 /// ENDFOR
201 /// dst[MAX:128] := 0
202 /// \endcode
203 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
204  __m128i __A,
205  __m128i __B) {
206  return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A,
207  (__v4si)__B);
208 }
209 
210 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
211 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
212 /// signed 16-bit results. Sum these 4 results with the corresponding
213 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
214 ///
215 /// \headerfile <x86intrin.h>
216 ///
217 /// \code
218 /// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
219 /// \endcode
220 ///
221 /// This intrinsic corresponds to the \c VPDPBSUD instruction.
222 /// \param __W A 256-bit vector of [8 x int] holding the 32-bit accumulators.
223 /// \param __A
224 /// A 256-bit vector of [32 x char].
225 /// \param __B
226 /// A 256-bit vector of [32 x unsigned char].
227 /// \returns
228 /// A 256-bit vector of [8 x int].
229 ///
230 /// \code{.operation}
231 /// FOR j := 0 to 7
232 /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
233 /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
234 /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
235 /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
236 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
237 /// ENDFOR
238 /// dst[MAX:256] := 0
239 /// \endcode
240 static __inline__ __m256i __DEFAULT_FN_ATTRS256
241 _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
242  return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A,
243  (__v8si)__B);
244 }
245 
246 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
247 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
248 /// signed 16-bit results. Sum these 4 results with the corresponding
249 /// 32-bit integer in \a __W with signed saturation, and store the packed
250 /// 32-bit results in \a dst.
251 ///
252 /// \headerfile <x86intrin.h>
253 ///
254 /// \code
255 /// _mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B);
256 /// \endcode
257 ///
258 /// This intrinsic corresponds to the \c VPDPBSUDS instruction.
259 /// \param __W A 128-bit vector of [4 x int] holding the 32-bit accumulators.
260 /// \param __A
261 /// A 128-bit vector of [16 x char].
262 /// \param __B
263 /// A 128-bit vector of [16 x unsigned char].
264 /// \returns
265 /// A 128-bit vector of [4 x int].
266 ///
267 /// \code{.operation}
268 /// FOR j := 0 to 3
269 /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
270 /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
271 /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
272 /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
273 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
274 /// ENDFOR
275 /// dst[MAX:128] := 0
276 /// \endcode
277 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
278  __m128i __A,
279  __m128i __B) {
280  return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A,
281  (__v4si)__B);
282 }
283 
284 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
285 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
286 /// signed 16-bit results. Sum these 4 results with the corresponding
287 /// 32-bit integer in \a __W with signed saturation, and store the packed
288 /// 32-bit results in \a dst.
289 ///
290 /// \headerfile <x86intrin.h>
291 ///
292 /// \code
293 /// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
294 /// \endcode
295 ///
296 /// This intrinsic corresponds to the \c VPDPBSUDS instruction.
297 /// \param __W A 256-bit vector of [8 x int] holding the 32-bit accumulators.
298 /// \param __A
299 /// A 256-bit vector of [32 x char].
300 /// \param __B
301 /// A 256-bit vector of [32 x unsigned char].
302 /// \returns
303 /// A 256-bit vector of [8 x int].
304 ///
305 /// \code{.operation}
306 /// FOR j := 0 to 7
307 /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
308 /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
309 /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
310 /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
311 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
312 /// ENDFOR
313 /// dst[MAX:256] := 0
314 /// \endcode
315 static __inline__ __m256i __DEFAULT_FN_ATTRS256
316 _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
317  return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A,
318  (__v8si)__B);
319 }
320 
321 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
322 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
323 /// unsigned 16-bit results. Sum these 4 results with the corresponding
324 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
325 ///
326 /// \headerfile <x86intrin.h>
327 ///
328 /// \code
329 /// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
330 /// \endcode
331 ///
332 /// This intrinsic corresponds to the \c VPDPBUUD instruction.
333 /// \param __W A 128-bit vector of [4 x int] holding the 32-bit accumulators.
334 /// \param __A
335 /// A 128-bit vector of [16 x unsigned char].
336 /// \param __B
337 /// A 128-bit vector of [16 x unsigned char].
338 /// \returns
339 /// A 128-bit vector of [4 x int].
340 ///
341 /// \code{.operation}
342 /// FOR j := 0 to 3
343 /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
344 /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
345 /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
346 /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
347 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
348 /// ENDFOR
349 /// dst[MAX:128] := 0
350 /// \endcode
351 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
352  __m128i __A,
353  __m128i __B) {
354  return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A,
355  (__v4si)__B);
356 }
357 
358 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
359 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
360 /// unsigned 16-bit results. Sum these 4 results with the corresponding
361 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
362 ///
363 /// \headerfile <x86intrin.h>
364 ///
365 /// \code
366 /// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
367 /// \endcode
368 ///
369 /// This intrinsic corresponds to the \c VPDPBUUD instruction.
370 /// \param __W A 256-bit vector of [8 x int] holding the 32-bit accumulators.
371 /// \param __A
372 /// A 256-bit vector of [32 x unsigned char].
373 /// \param __B
374 /// A 256-bit vector of [32 x unsigned char].
375 /// \returns
376 /// A 256-bit vector of [8 x int].
377 ///
378 /// \code{.operation}
379 /// FOR j := 0 to 7
380 /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
381 /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
382 /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
383 /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
384 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
385 /// ENDFOR
386 /// dst[MAX:256] := 0
387 /// \endcode
388 static __inline__ __m256i __DEFAULT_FN_ATTRS256
389 _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
390  return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A,
391  (__v8si)__B);
392 }
393 
394 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
395 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
396 /// unsigned 16-bit results. Sum these 4 results with the corresponding
397 /// 32-bit integer in \a __W with unsigned saturation, and store the packed
398 /// 32-bit results in \a dst.
399 ///
400 /// \headerfile <x86intrin.h>
401 ///
402 /// \code
403 /// _mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B);
404 /// \endcode
405 ///
406 /// This intrinsic corresponds to the \c VPDPBUUDS instruction.
407 /// \param __W A 128-bit vector of [4 x int] holding the 32-bit accumulators.
408 /// \param __A
409 /// A 128-bit vector of [16 x unsigned char].
410 /// \param __B
411 /// A 128-bit vector of [16 x unsigned char].
412 /// \returns
413 /// A 128-bit vector of [4 x int].
414 ///
415 /// \code{.operation}
416 /// FOR j := 0 to 3
417 /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
418 /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
419 /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
420 /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
421 /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
422 /// ENDFOR
423 /// dst[MAX:128] := 0
424 /// \endcode
425 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
426  __m128i __A,
427  __m128i __B) {
428  return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A,
429  (__v4si)__B);
430 }
431 
432 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
433 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
434 /// unsigned 16-bit results. Sum these 4 results with the corresponding
435 /// 32-bit integer in \a __W with unsigned saturation, and store the packed
436 /// 32-bit results in \a dst.
437 ///
438 /// \headerfile <x86intrin.h>
439 ///
440 /// \code
441 /// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
442 /// \endcode
443 ///
444 /// This intrinsic corresponds to the \c VPDPBUUDS instruction.
445 /// \param __W A 256-bit vector of [8 x int] holding the 32-bit accumulators.
446 /// \param __A
447 /// A 256-bit vector of [32 x unsigned char].
448 /// \param __B
449 /// A 256-bit vector of [32 x unsigned char].
450 /// \returns
451 /// A 256-bit vector of [8 x int].
452 ///
453 /// \code{.operation}
454 /// FOR j := 0 to 7
455 /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
456 /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
457 /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
458 /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
459 /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
460 /// ENDFOR
461 /// dst[MAX:256] := 0
462 /// \endcode
463 static __inline__ __m256i __DEFAULT_FN_ATTRS256
464 _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
465  return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A,
466  (__v8si)__B);
467 }
468 #undef __DEFAULT_FN_ATTRS128
469 #undef __DEFAULT_FN_ATTRS256
470 
471 #endif // __AVXVNNIINT8INTRIN_H
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B)
Multiply groups of 4 adjacent pairs of signed 8-bit integers in __A with corresponding signed 8-bit i...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B)
Multiply groups of 4 adjacent pairs of signed 8-bit integers in __A with corresponding unsigned 8-bit...
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
Multiply groups of 4 adjacent pairs of signed 8-bit integers in __A with corresponding unsigned 8-bit...
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B)
Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in __A with corresponding unsigned 8-b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B)
Multiply groups of 4 adjacent pairs of signed 8-bit integers in __A with corresponding signed 8-bit i...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B)
Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in __A with corresponding unsigned 8-b...
#define __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in __A with corresponding unsigned 8-b...
#define __DEFAULT_FN_ATTRS128
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
Multiply groups of 4 adjacent pairs of signed 8-bit integers in __A with corresponding unsigned 8-bit...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B)
Multiply groups of 4 adjacent pairs of signed 8-bit integers in __A with corresponding signed 8-bit i...
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B)
Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in __A with corresponding unsigned 8-b...
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B)
Multiply groups of 4 adjacent pairs of signed 8-bit integers in __A with corresponding unsigned 8-bit...
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B)
Multiply groups of 4 adjacent pairs of signed 8-bit integers in __A with corresponding signed 8-bit i...