clang  19.0.0git
avxvnniint16intrin.h
Go to the documentation of this file.
1 /*===----------- avxvnniint16intrin.h - AVXVNNIINT16 intrinsics-------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 #ifndef __IMMINTRIN_H
11 #error \
12  "Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead."
13 #endif // __IMMINTRIN_H
14 
15 #ifndef __AVXVNNIINT16INTRIN_H
16 #define __AVXVNNIINT16INTRIN_H
17 
18 /* Define the default attributes for the functions in this file.
 *
 * __DEFAULT_FN_ATTRS128 and __DEFAULT_FN_ATTRS256 expand to the same
 * __attribute__ list (__always_inline__, __nodebug__ and
 * __target__("avxvnniint16")) and differ only in the __min_vector_width__
 * they request: 128 and 256 respectively. Both macros are #undef'd again at
 * the end of this header. */
19 #define __DEFAULT_FN_ATTRS128 \
20  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \
21  __min_vector_width__(128)))
22 #define __DEFAULT_FN_ATTRS256 \
23  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \
24  __min_vector_width__(256)))
25 
26 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
27 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
28 /// signed 32-bit results. Sum these 2 results with the corresponding
29 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
30 ///
31 /// \headerfile <immintrin.h>
32 ///
33 /// \code
34 /// __m128i _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B)
35 /// \endcode
36 ///
37 /// This intrinsic corresponds to the \c VPDPWSUD instruction.
38 ///
39 /// \param __W
40 /// A 128-bit vector of [4 x int].
41 /// \param __A
42 /// A 128-bit vector of [8 x short].
43 /// \param __B
44 /// A 128-bit vector of [8 x unsigned short].
45 /// \returns
46 /// A 128-bit vector of [4 x int].
47 ///
48 /// \code{.operation}
49 /// FOR j := 0 to 3
50 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
51 /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
52 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
53 /// ENDFOR
54 /// dst[MAX:128] := 0
55 /// \endcode
56 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W,
57  __m128i __A,
58  __m128i __B) {
59  return (__m128i)__builtin_ia32_vpdpwsud128((__v4si)__W, (__v4si)__A,
60  (__v4si)__B);
61 }
62 
63 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
64 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
65 /// signed 32-bit results. Sum these 2 results with the corresponding
66 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
67 ///
68 /// \headerfile <immintrin.h>
69 ///
70 /// \code
71 /// __m256i _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B)
72 /// \endcode
73 ///
74 /// This intrinsic corresponds to the \c VPDPWSUD instruction.
75 ///
76 /// \param __W
77 /// A 256-bit vector of [8 x int].
78 /// \param __A
79 /// A 256-bit vector of [16 x short].
80 /// \param __B
81 /// A 256-bit vector of [16 x unsigned short].
82 /// \returns
83 /// A 256-bit vector of [8 x int].
84 ///
85 /// \code{.operation}
86 /// FOR j := 0 to 7
87 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
88 /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
89 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
90 /// ENDFOR
91 /// dst[MAX:256] := 0
92 /// \endcode
93 static __inline__ __m256i __DEFAULT_FN_ATTRS256
94 _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
95  return (__m256i)__builtin_ia32_vpdpwsud256((__v8si)__W, (__v8si)__A,
96  (__v8si)__B);
97 }
98 
99 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
100 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
101 /// signed 32-bit results. Sum these 2 results with the corresponding
102 /// 32-bit integer in \a __W with signed saturation, and store the packed
103 /// 32-bit results in \a dst.
104 ///
105 /// \headerfile <immintrin.h>
106 ///
107 /// \code
108 /// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
109 /// \endcode
110 ///
111 /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
112 ///
113 /// \param __W
114 /// A 128-bit vector of [4 x int].
115 /// \param __A
116 /// A 128-bit vector of [8 x short].
117 /// \param __B
118 /// A 128-bit vector of [8 x unsigned short].
119 /// \returns
120 /// A 128-bit vector of [4 x int].
121 ///
122 /// \code{.operation}
123 /// FOR j := 0 to 3
124 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
125 /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
126 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
127 /// ENDFOR
128 /// dst[MAX:128] := 0
129 /// \endcode
130 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W,
131  __m128i __A,
132  __m128i __B) {
133  return (__m128i)__builtin_ia32_vpdpwsuds128((__v4si)__W, (__v4si)__A,
134  (__v4si)__B);
135 }
136 
137 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
138 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
139 /// signed 32-bit results. Sum these 2 results with the corresponding
140 /// 32-bit integer in \a __W with signed saturation, and store the packed
141 /// 32-bit results in \a dst.
142 ///
143 /// \headerfile <immintrin.h>
144 ///
145 /// \code
146 /// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
147 /// \endcode
148 ///
149 /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
150 ///
151 /// \param __W
152 /// A 256-bit vector of [8 x int].
153 /// \param __A
154 /// A 256-bit vector of [16 x short].
155 /// \param __B
156 /// A 256-bit vector of [16 x unsigned short].
157 /// \returns
158 /// A 256-bit vector of [8 x int].
159 ///
160 /// \code{.operation}
161 /// FOR j := 0 to 7
162 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
163 /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
164 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
165 /// ENDFOR
166 /// dst[MAX:256] := 0
167 /// \endcode
168 static __inline__ __m256i __DEFAULT_FN_ATTRS256
169 _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
170  return (__m256i)__builtin_ia32_vpdpwsuds256((__v8si)__W, (__v8si)__A,
171  (__v8si)__B);
172 }
173 
174 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
175 /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
176 /// signed 32-bit results. Sum these 2 results with the corresponding
177 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
178 ///
179 /// \headerfile <immintrin.h>
180 ///
181 /// \code
182 /// __m128i _mm_dpwusd_epi32(__m128i __W, __m128i __A, __m128i __B)
183 /// \endcode
184 ///
185 /// This intrinsic corresponds to the \c VPDPWUSD instruction.
186 ///
187 /// \param __W
188 /// A 128-bit vector of [4 x int].
189 /// \param __A
190 /// A 128-bit vector of [8 x unsigned short].
191 /// \param __B
192 /// A 128-bit vector of [8 x short].
193 /// \returns
194 /// A 128-bit vector of [4 x int].
195 ///
196 /// \code{.operation}
197 /// FOR j := 0 to 3
198 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
199 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
200 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
201 /// ENDFOR
202 /// dst[MAX:128] := 0
203 /// \endcode
204 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W,
205  __m128i __A,
206  __m128i __B) {
207  return (__m128i)__builtin_ia32_vpdpwusd128((__v4si)__W, (__v4si)__A,
208  (__v4si)__B);
209 }
210 
211 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
212 /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
213 /// signed 32-bit results. Sum these 2 results with the corresponding
214 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
215 ///
216 /// \headerfile <immintrin.h>
217 ///
218 /// \code
219 /// __m256i _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B)
220 /// \endcode
221 ///
222 /// This intrinsic corresponds to the \c VPDPWUSD instruction.
223 ///
224 /// \param __W
225 /// A 256-bit vector of [8 x int].
226 /// \param __A
227 /// A 256-bit vector of [16 x unsigned short].
228 /// \param __B
229 /// A 256-bit vector of [16 x short].
230 /// \returns
231 /// A 256-bit vector of [8 x int].
232 ///
233 /// \code{.operation}
234 /// FOR j := 0 to 7
235 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
236 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
237 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
238 /// ENDFOR
239 /// dst[MAX:256] := 0
240 /// \endcode
241 static __inline__ __m256i __DEFAULT_FN_ATTRS256
242 _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) {
243  return (__m256i)__builtin_ia32_vpdpwusd256((__v8si)__W, (__v8si)__A,
244  (__v8si)__B);
245 }
246 
247 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
248 /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
249 /// signed 32-bit results. Sum these 2 results with the corresponding
250 /// 32-bit integer in \a __W with signed saturation, and store the packed
251 /// 32-bit results in \a dst.
252 ///
253 /// \headerfile <immintrin.h>
254 ///
255 /// \code
256 /// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B)
257 /// \endcode
258 ///
259 /// This intrinsic corresponds to the \c VPDPWUSDS instruction.
260 ///
261 /// \param __W
262 /// A 128-bit vector of [4 x int].
263 /// \param __A
264 /// A 128-bit vector of [8 x unsigned short].
265 /// \param __B
266 /// A 128-bit vector of [8 x short].
267 /// \returns
268 /// A 128-bit vector of [4 x int].
269 ///
270 /// \code{.operation}
271 /// FOR j := 0 to 3
272 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
273 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
274 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
275 /// ENDFOR
276 /// dst[MAX:128] := 0
277 /// \endcode
278 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W,
279  __m128i __A,
280  __m128i __B) {
281  return (__m128i)__builtin_ia32_vpdpwusds128((__v4si)__W, (__v4si)__A,
282  (__v4si)__B);
283 }
284 
285 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
286 /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
287 /// signed 32-bit results. Sum these 2 results with the corresponding
288 /// 32-bit integer in \a __W with signed saturation, and store the packed
289 /// 32-bit results in \a dst.
290 ///
291 /// \headerfile <immintrin.h>
292 ///
293 /// \code
294 /// __m256i _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B)
295 /// \endcode
296 ///
297 /// This intrinsic corresponds to the \c VPDPWUSDS instruction.
298 ///
299 /// \param __W
300 /// A 256-bit vector of [8 x int].
301 /// \param __A
302 /// A 256-bit vector of [16 x unsigned short].
303 /// \param __B
304 /// A 256-bit vector of [16 x short].
305 /// \returns
306 /// A 256-bit vector of [8 x int].
307 ///
308 /// \code{.operation}
309 /// FOR j := 0 to 7
310 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
311 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
312 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
313 /// ENDFOR
314 /// dst[MAX:256] := 0
315 /// \endcode
316 static __inline__ __m256i __DEFAULT_FN_ATTRS256
317 _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) {
318  return (__m256i)__builtin_ia32_vpdpwusds256((__v8si)__W, (__v8si)__A,
319  (__v8si)__B);
320 }
321 
322 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
323 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
324 /// unsigned 32-bit results. Sum these 2 results with the corresponding
325 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
326 ///
327 /// \headerfile <immintrin.h>
328 ///
329 /// \code
330 /// __m128i _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B)
331 /// \endcode
332 ///
333 /// This intrinsic corresponds to the \c VPDPWUUD instruction.
334 ///
335 /// \param __W
336 /// A 128-bit vector of [4 x unsigned int].
337 /// \param __A
338 /// A 128-bit vector of [8 x unsigned short].
339 /// \param __B
340 /// A 128-bit vector of [8 x unsigned short].
341 /// \returns
342 /// A 128-bit vector of [4 x unsigned int].
343 ///
344 /// \code{.operation}
345 /// FOR j := 0 to 3
346 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
347 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
348 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
349 /// ENDFOR
350 /// dst[MAX:128] := 0
351 /// \endcode
352 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W,
353  __m128i __A,
354  __m128i __B) {
355  return (__m128i)__builtin_ia32_vpdpwuud128((__v4si)__W, (__v4si)__A,
356  (__v4si)__B);
357 }
358 
359 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
360 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
361 /// unsigned 32-bit results. Sum these 2 results with the corresponding
362 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
363 ///
364 /// \headerfile <immintrin.h>
365 ///
366 /// \code
367 /// __m256i _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B)
368 /// \endcode
369 ///
370 /// This intrinsic corresponds to the \c VPDPWUUD instruction.
371 ///
372 /// \param __W
373 /// A 256-bit vector of [8 x unsigned int].
374 /// \param __A
375 /// A 256-bit vector of [16 x unsigned short].
376 /// \param __B
377 /// A 256-bit vector of [16 x unsigned short].
378 /// \returns
379 /// A 256-bit vector of [8 x unsigned int].
380 ///
381 /// \code{.operation}
382 /// FOR j := 0 to 7
383 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
384 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
385 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
386 /// ENDFOR
387 /// dst[MAX:256] := 0
388 /// \endcode
389 static __inline__ __m256i __DEFAULT_FN_ATTRS256
390 _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
391  return (__m256i)__builtin_ia32_vpdpwuud256((__v8si)__W, (__v8si)__A,
392  (__v8si)__B);
393 }
394 
395 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
396 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
397 /// unsigned 32-bit results. Sum these 2 results with the corresponding
398 /// 32-bit integer in \a __W with unsigned saturation, and store the packed
399 /// 32-bit results in \a dst.
400 ///
401 /// \headerfile <immintrin.h>
402 ///
403 /// \code
404 /// __m128i _mm_dpwuuds_epi32(__m128i __W, __m128i __A, __m128i __B)
405 /// \endcode
406 ///
407 /// This intrinsic corresponds to the \c VPDPWUUDS instruction.
408 ///
409 /// \param __W
410 /// A 128-bit vector of [4 x unsigned int].
411 /// \param __A
412 /// A 128-bit vector of [8 x unsigned short].
413 /// \param __B
414 /// A 128-bit vector of [8 x unsigned short].
415 /// \returns
416 /// A 128-bit vector of [4 x unsigned int].
417 ///
418 /// \code{.operation}
419 /// FOR j := 0 to 3
420 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
421 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
422 /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
423 /// ENDFOR
424 /// dst[MAX:128] := 0
425 /// \endcode
426 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W,
427  __m128i __A,
428  __m128i __B) {
429  return (__m128i)__builtin_ia32_vpdpwuuds128((__v4si)__W, (__v4si)__A,
430  (__v4si)__B);
431 }
432 
433 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
434 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
435 /// unsigned 32-bit results. Sum these 2 results with the corresponding
436 /// 32-bit integer in \a __W with unsigned saturation, and store the packed
437 /// 32-bit results in \a dst.
438 ///
439 /// \headerfile <immintrin.h>
440 ///
441 /// \code
442 /// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
443 /// \endcode
444 ///
445 /// This intrinsic corresponds to the \c VPDPWUUDS instruction.
446 ///
447 /// \param __W
448 /// A 256-bit vector of [8 x unsigned int].
449 /// \param __A
450 /// A 256-bit vector of [16 x unsigned short].
451 /// \param __B
452 /// A 256-bit vector of [16 x unsigned short].
453 /// \returns
454 /// A 256-bit vector of [8 x unsigned int].
455 ///
456 /// \code{.operation}
457 /// FOR j := 0 to 7
458 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
459 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
460 /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
461 /// ENDFOR
462 /// dst[MAX:256] := 0
463 /// \endcode
464 static __inline__ __m256i __DEFAULT_FN_ATTRS256
465 _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
466  return (__m256i)__builtin_ia32_vpdpwuuds256((__v8si)__W, (__v8si)__A,
467  (__v8si)__B);
468 }
469 
470 #undef __DEFAULT_FN_ATTRS128
471 #undef __DEFAULT_FN_ATTRS256
472 
473 #endif // __AVXVNNIINT16INTRIN_H
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B)
Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in __A with corresponding signed 16-b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
Multiply groups of 2 adjacent pairs of signed 16-bit integers in __A with corresponding unsigned 16-b...
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
Multiply groups of 2 adjacent pairs of signed 16-bit integers in __A with corresponding unsigned 16-b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B)
Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in __A with corresponding unsigned 16...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B)
Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in __A with corresponding signed 16-b...
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B)
Multiply groups of 2 adjacent pairs of signed 16-bit integers in __A with corresponding unsigned 16-b...
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W, __m128i __A, __m128i __B)
Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in __A with corresponding unsigned 16...
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B)
Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in __A with corresponding signed 16-b...
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W, __m128i __A, __m128i __B)
Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in __A with corresponding signed 16-b...
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B)
Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in __A with corresponding unsigned 16...
#define __DEFAULT_FN_ATTRS256
#define __DEFAULT_FN_ATTRS128
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B)
Multiply groups of 2 adjacent pairs of signed 16-bit integers in __A with corresponding unsigned 16-b...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in __A with corresponding unsigned 16...