/// Stops compilation with a message telling the user to include <immintrin.h>
/// rather than this header.
/// NOTE(review): the preprocessor condition that guards this directive is not
/// visible here -- confirm it only fires on direct inclusion.
11 #error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
/// Attribute set for functions built for the "amx-tile" target feature:
/// always inlined and marked __nodebug__.
19 #define __DEFAULT_FN_ATTRS_TILE \
20 __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
/// Attribute set for functions built for the "amx-int8" target feature:
/// always inlined and marked __nodebug__.
21 #define __DEFAULT_FN_ATTRS_INT8 \
22 __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
/// Attribute set for functions built for the "amx-bf16" target feature:
/// always inlined and marked __nodebug__.
23 #define __DEFAULT_FN_ATTRS_BF16 \
24 __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
/// Attribute set for functions built for the "amx-fp16" target feature:
/// always inlined and marked __nodebug__.
25 #define __DEFAULT_FN_ATTRS_FP16 \
26 __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
/// Hands the caller-supplied configuration pointer to
/// __builtin_ia32_tile_loadconfig.
///
/// \param __config
///    Read-only pointer to the configuration data. The layout of that data is
///    not defined in this block.
41 static __inline__
void __DEFAULT_FN_ATTRS_TILE
42 _tile_loadconfig(
const void *__config) {
43 __builtin_ia32_tile_loadconfig(__config);
/// Hands the caller-supplied buffer pointer to
/// __builtin_ia32_tile_storeconfig.
///
/// \param __config
///    Writable pointer passed straight to the builtin; presumably the buffer
///    that receives the current configuration -- confirm against the builtin's
///    documentation.
57 static __inline__
void __DEFAULT_FN_ATTRS_TILE
58 _tile_storeconfig(
void *__config) {
59 __builtin_ia32_tile_storeconfig(__config);
/// Invokes __builtin_ia32_tilerelease. Takes no arguments and returns nothing.
68 static __inline__
void __DEFAULT_FN_ATTRS_TILE _tile_release(
void) {
69 __builtin_ia32_tilerelease();
/// Expands to a call to __builtin_ia32_tileloadd64.
///
/// \param dst
///    Passed through unchanged as the first builtin argument.
/// \param base
///    Converted to `const void *` before being passed on.
/// \param stride
///    Converted to __SIZE_TYPE__ before being passed on.
86 #define _tile_loadd(dst, base, stride) \
87 __builtin_ia32_tileloadd64((dst), ((const void *)(base)), \
88 (__SIZE_TYPE__)(stride))
/// Expands to a call to __builtin_ia32_tileloaddt164. The argument handling is
/// the same as in _tile_loadd; only the builtin differs.
///
/// \param dst
///    Passed through unchanged as the first builtin argument.
/// \param base
///    Converted to `const void *` before being passed on.
/// \param stride
///    Converted to __SIZE_TYPE__ before being passed on.
106 #define _tile_stream_loadd(dst, base, stride) \
107 __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \
108 (__SIZE_TYPE__)(stride))
/// Expands to a call to __builtin_ia32_tilestored64.
///
/// \param dst
///    Passed through unchanged as the first builtin argument.
/// \param base
///    Converted to a non-const `void *` before being passed on.
/// \param stride
///    Converted to __SIZE_TYPE__ before being passed on.
124 #define _tile_stored(dst, base, stride) \
125 __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
/// Expands to a call to __builtin_ia32_tilezero with the given tile argument.
135 #define _tile_zero(tile) __builtin_ia32_tilezero((tile))
/// Expands to __builtin_ia32_tdpbssd(dst, src0, src1); each argument is
/// parenthesized and passed through in the same order.
153 #define _tile_dpbssd(dst, src0, src1) \
154 __builtin_ia32_tdpbssd((dst), (src0), (src1))
/// Expands to __builtin_ia32_tdpbsud(dst, src0, src1); each argument is
/// parenthesized and passed through in the same order.
172 #define _tile_dpbsud(dst, src0, src1) \
173 __builtin_ia32_tdpbsud((dst), (src0), (src1))
/// Expands to __builtin_ia32_tdpbusd(dst, src0, src1); each argument is
/// parenthesized and passed through in the same order.
191 #define _tile_dpbusd(dst, src0, src1) \
192 __builtin_ia32_tdpbusd((dst), (src0), (src1))
/// Expands to __builtin_ia32_tdpbuud(dst, src0, src1); each argument is
/// parenthesized and passed through in the same order.
210 #define _tile_dpbuud(dst, src0, src1) \
211 __builtin_ia32_tdpbuud((dst), (src0), (src1))
/// Expands to __builtin_ia32_tdpbf16ps(dst, src0, src1); each argument is
/// parenthesized and passed through in the same order.
228 #define _tile_dpbf16ps(dst, src0, src1) \
229 __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
/// Vector-of-int type with a total size of 1024 bytes and 64-byte alignment.
/// It is the value type returned and accepted by the *_internal helpers in
/// this header.
234 typedef int _tile1024i
__attribute__((__vector_size__(1024), __aligned__(64)));
/// Thin wrapper that returns the result of
/// __builtin_ia32_tileloadd64_internal(m, n, base, stride).
///
/// \param m
///    First builtin argument; __tile_loadd passes the destination's `row`.
/// \param n
///    Second builtin argument; __tile_loadd passes the destination's `col`.
/// \param base
///    Read-only source pointer, forwarded unchanged.
/// \param stride
///    Forwarded after a cast to __SIZE_TYPE__, which is already its type.
/// \returns The _tile1024i value produced by the builtin.
237 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
238 _tile_loadd_internal(
unsigned short m,
unsigned short n,
const void *base,
239 __SIZE_TYPE__ stride) {
240 return __builtin_ia32_tileloadd64_internal(m, n, base,
241 (__SIZE_TYPE__)(stride));
/// Thin wrapper that returns the result of
/// __builtin_ia32_tileloaddt164_internal(m, n, base, stride).
///
/// \param m
///    First builtin argument; __tile_stream_loadd passes the destination's
///    `row`.
/// \param n
///    Second builtin argument; __tile_stream_loadd passes the destination's
///    `col`.
/// \param base
///    Read-only source pointer, forwarded unchanged.
/// \param stride
///    Forwarded after a cast to __SIZE_TYPE__, which is already its type.
/// \returns The _tile1024i value produced by the builtin.
245 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
246 _tile_loaddt1_internal(
unsigned short m,
unsigned short n,
const void *base,
247 __SIZE_TYPE__ stride) {
248 return __builtin_ia32_tileloaddt164_internal(m, n, base,
249 (__SIZE_TYPE__)(stride));
/// Thin wrapper that returns the result of
/// __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2).
///
/// \param m, n, k
///    Forwarded unchanged; __tile_dpbssd supplies src0.row, src1.col and
///    src0.col respectively.
/// \param dst
///    Current destination tile value, passed by value.
/// \param src1, src2
///    Source tile values, passed by value.
/// \returns The _tile1024i value produced by the builtin.
253 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
254 _tile_dpbssd_internal(
unsigned short m,
unsigned short n,
unsigned short k,
255 _tile1024i dst, _tile1024i src1, _tile1024i src2) {
256 return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
/// Thin wrapper that returns the result of
/// __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2).
///
/// \param m, n, k
///    Forwarded unchanged; __tile_dpbsud supplies src0.row, src1.col and
///    src0.col respectively.
/// \param dst
///    Current destination tile value, passed by value.
/// \param src1, src2
///    Source tile values, passed by value.
/// \returns The _tile1024i value produced by the builtin.
260 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
261 _tile_dpbsud_internal(
unsigned short m,
unsigned short n,
unsigned short k,
262 _tile1024i dst, _tile1024i src1, _tile1024i src2) {
263 return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
/// Thin wrapper that returns the result of
/// __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2).
///
/// \param m, n, k
///    Forwarded unchanged; __tile_dpbusd supplies src0.row, src1.col and
///    src0.col respectively.
/// \param dst
///    Current destination tile value, passed by value.
/// \param src1, src2
///    Source tile values, passed by value.
/// \returns The _tile1024i value produced by the builtin.
267 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
268 _tile_dpbusd_internal(
unsigned short m,
unsigned short n,
unsigned short k,
269 _tile1024i dst, _tile1024i src1, _tile1024i src2) {
270 return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
/// Thin wrapper that returns the result of
/// __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2).
///
/// \param m, n, k
///    Forwarded unchanged; __tile_dpbuud supplies src0.row, src1.col and
///    src0.col respectively.
/// \param dst
///    Current destination tile value, passed by value.
/// \param src1, src2
///    Source tile values, passed by value.
/// \returns The _tile1024i value produced by the builtin.
274 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
275 _tile_dpbuud_internal(
unsigned short m,
unsigned short n,
unsigned short k,
276 _tile1024i dst, _tile1024i src1, _tile1024i src2) {
277 return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
/// Thin wrapper that calls
/// __builtin_ia32_tilestored64_internal(m, n, base, stride, tile).
///
/// The function returns void, so the builtin is invoked as a plain statement
/// rather than as the operand of `return` (ISO C does not allow a return
/// statement with an expression in a void function).
///
/// \param m
///    First builtin argument; __tile_stored passes the source's `row`.
/// \param n
///    Second builtin argument; __tile_stored passes the source's `col`.
/// \param base
///    Writable destination pointer, forwarded unchanged.
/// \param stride
///    Forwarded after a cast to __SIZE_TYPE__, which is already its type.
/// \param tile
///    Tile value to hand to the builtin, passed by value.
281 static __inline__
void __DEFAULT_FN_ATTRS_INT8
282 _tile_stored_internal(
unsigned short m,
unsigned short n,
void *base,
283 __SIZE_TYPE__ stride, _tile1024i tile) {
284 __builtin_ia32_tilestored64_internal(m, n, base,
285 (__SIZE_TYPE__)(stride), tile);
/// Thin wrapper that returns the result of
/// __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2).
/// Unlike the integer helpers, it is built with the amx-bf16 attribute set.
///
/// \param m, n, k
///    Forwarded unchanged; __tile_dpbf16ps supplies src0.row, src1.col and
///    src0.col respectively.
/// \param dst
///    Current destination tile value, passed by value.
/// \param src1, src2
///    Source tile values, passed by value.
/// \returns The _tile1024i value produced by the builtin.
289 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
290 _tile_dpbf16ps_internal(
unsigned short m,
unsigned short n,
unsigned short k,
291 _tile1024i dst, _tile1024i src1, _tile1024i src2) {
292 return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
/// Thin wrapper that returns the result of
/// __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2).
/// It is built with the amx-fp16 attribute set.
///
/// \param m, n, k
///    Forwarded unchanged; __tile_dpfp16ps supplies src0.row, src1.col and
///    src0.col respectively.
/// \param dst
///    Current destination tile value, passed by value.
/// \param src1, src2
///    Source tile values, passed by value.
/// \returns The _tile1024i value produced by the builtin.
296 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16
297 _tile_dpfp16ps_internal(
unsigned short m,
unsigned short n,
unsigned short k,
298 _tile1024i dst, _tile1024i src1, _tile1024i src2) {
299 return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
/// Tile descriptor used by the __tile_* wrappers in this header.
/// NOTE(review): only the first two members are visible here. The wrappers
/// also access a `tile` member and refer to the type as `__tile1024i` --
/// confirm the rest of the definition.
306 typedef struct __tile1024i_str {
/// Immutable after initialization; the wrappers pass it as the `m` argument.
307 const unsigned short row;
/// Immutable after initialization; the wrappers pass it as the `n` argument
/// (or as `k` when it comes from the first source operand).
308 const unsigned short col;
/// Overwrites dst->tile with the value returned by
/// _tile_loadd_internal(dst->row, dst->col, base, stride).
///
/// \param dst
///    Descriptor whose `row` and `col` are read and whose `tile` is replaced.
/// \param base
///    Read-only source pointer, forwarded unchanged.
/// \param stride
///    Forwarded unchanged.
///
/// NOTE(review): this function is tagged with the amx-tile attribute set,
/// while the always-inline helper it calls is tagged amx-int8 -- confirm the
/// mismatch is intended.
325 __DEFAULT_FN_ATTRS_TILE
326 static __inline__
void __tile_loadd(__tile1024i *dst,
const void *base,
327 __SIZE_TYPE__ stride) {
328 dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
/// Overwrites dst->tile with the value returned by
/// _tile_loaddt1_internal(dst->row, dst->col, base, stride).
///
/// \param dst
///    Descriptor whose `row` and `col` are read and whose `tile` is replaced.
/// \param base
///    Read-only source pointer, forwarded unchanged.
/// \param stride
///    Forwarded unchanged.
///
/// NOTE(review): this function is tagged with the amx-tile attribute set,
/// while the always-inline helper it calls is tagged amx-int8 -- confirm the
/// mismatch is intended.
346 __DEFAULT_FN_ATTRS_TILE
347 static __inline__
void __tile_stream_loadd(__tile1024i *dst,
const void *base,
348 __SIZE_TYPE__ stride) {
349 dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
/// Replaces dst->tile with the result of _tile_dpbssd_internal, called with
/// m = src0.row, n = src1.col, k = src0.col, the current dst->tile, and the
/// two source tiles.
/// NOTE(review): the body uses `src1`, but its declaration is not visible in
/// the parameter list shown here.
368 __DEFAULT_FN_ATTRS_INT8
369 static __inline__
void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
371 dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
372 src0.tile, src1.tile);
/// Replaces dst->tile with the result of _tile_dpbsud_internal, called with
/// m = src0.row, n = src1.col, k = src0.col, the current dst->tile, and the
/// two source tiles.
/// NOTE(review): the body uses `src1`, but its declaration is not visible in
/// the parameter list shown here.
391 __DEFAULT_FN_ATTRS_INT8
392 static __inline__
void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
394 dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
395 src0.tile, src1.tile);
/// Replaces dst->tile with the result of _tile_dpbusd_internal, called with
/// m = src0.row, n = src1.col, k = src0.col, the current dst->tile, and the
/// two source tiles.
/// NOTE(review): the body uses `src1`, but its declaration is not visible in
/// the parameter list shown here.
414 __DEFAULT_FN_ATTRS_INT8
415 static __inline__
void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
417 dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
418 src0.tile, src1.tile);
/// Replaces dst->tile with the result of _tile_dpbuud_internal, called with
/// m = src0.row, n = src1.col, k = src0.col, the current dst->tile, and the
/// two source tiles.
/// NOTE(review): the body uses `src1`, but its declaration is not visible in
/// the parameter list shown here.
437 __DEFAULT_FN_ATTRS_INT8
438 static __inline__
void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
440 dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
441 src0.tile, src1.tile);
/// Calls _tile_stored_internal(src.row, src.col, base, stride, src.tile).
///
/// \param base
///    Writable destination pointer, forwarded unchanged.
/// \param stride
///    Forwarded unchanged.
///
/// NOTE(review): the body uses `src`, but its declaration is not visible in
/// the parameter list shown here. This function is also tagged amx-tile while
/// the always-inline helper it calls is tagged amx-int8 -- confirm the
/// mismatch is intended.
455 __DEFAULT_FN_ATTRS_TILE
456 static __inline__
void __tile_stored(
void *base, __SIZE_TYPE__ stride,
458 _tile_stored_internal(src.row, src.col, base, stride, src.tile);
/// Overwrites dst->tile with the value returned by
/// __builtin_ia32_tilezero_internal(dst->row, dst->col).
///
/// \param dst
///    Descriptor whose `row` and `col` are read and whose `tile` is replaced.
469 __DEFAULT_FN_ATTRS_TILE
470 static __inline__
void __tile_zero(__tile1024i *dst) {
471 dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
/// Replaces dst->tile with the result of _tile_dpbf16ps_internal, called with
/// m = src0.row, n = src1.col, k = src0.col, the current dst->tile, and the
/// two source tiles.
/// NOTE(review): the body uses `src1`, but its declaration is not visible in
/// the parameter list shown here.
489 __DEFAULT_FN_ATTRS_BF16
490 static __inline__
void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
492 dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
493 src0.tile, src1.tile);
/// Replaces dst->tile with the result of _tile_dpfp16ps_internal, called with
/// m = src0.row, n = src1.col, k = src0.col, the current dst->tile, and the
/// two source tiles.
/// NOTE(review): the body uses `src1`, but its declaration is not visible in
/// the parameter list shown here.
511 __DEFAULT_FN_ATTRS_FP16
512 static __inline__
void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
514 dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
515 src0.tile, src1.tile);
/// Remove the four helper attribute macros defined at the top of this header
/// so they are no longer defined after this point.
518 #undef __DEFAULT_FN_ATTRS_TILE
519 #undef __DEFAULT_FN_ATTRS_INT8
520 #undef __DEFAULT_FN_ATTRS_BF16
521 #undef __DEFAULT_FN_ATTRS_FP16
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Zeroes the upper 128 bits (bits 255:128) of all YMM registers.