Namespaces
namespace	core

namespace	decision_tree_rule

namespace	detail

namespace	group

namespace	impl

namespace	kernel

namespace	subgroup

Classes
struct	arch_attr_t

struct	arch_attr_t< gpu_arch::Xe >

struct	decision_tree_optimizer

struct	dict_t

struct	dropout_fwd_t

struct	dummy_optimizer

struct	elem_t

struct	elem_t_t

struct	elem_v_t

struct	fallback_optimizer

struct	FastDivMod
	Fast division + modulus operation Host code pre-computes values to avoid expensive operations in kernel code. More...

struct	get_uint_type
	Get the unit representation based on Size. More...

struct	int4x2
	xetla 4bits data packed as 8bits data type. More...

struct	is_floating_point
	Used to check if the type is floating_point. More...

struct	is_host_callable

struct	is_host_callable< T, std::enable_if_t< T::host_callable==true > >

struct	is_integral
	Used to check if the type is floating_point. More...

struct	is_internal_type
	Used to check if the type is xetla internal data type. More...

struct	is_internal_type< int4x2 >
	Used to check if the type is xetla internal data type. More...

struct	load_store_attr_t

struct	load_store_attr_t< msg_type::block_2d, gpu_arch::Xe >

struct	mem_base_t

struct	mem_base_t< dtype_, mem_space::global >

struct	mem_base_t< dtype_, mem_space::local >

struct	mem_coord_t

struct	mem_coord_t< 2 >

struct	mem_desc_t

struct	mem_desc_t< dtype_, layout_, space_, alignment_, 2 >

struct	mem_shape_t

struct	mem_shape_t< 2 >

struct	mma_attr_t

struct	mma_attr_t< gpu_arch::Xe >

struct	native_type
	Set the native data type of T. More...

struct	native_type< int4x2 >
	Set uint8_t as the native data type of int4x2. More...

struct	param_adaptor

struct	param_adaptor< param_adaptor_tag::kernel, dict_t_ >

struct	param_adaptor< param_adaptor_tag::work_group_epilogue, dict_t_ >

struct	param_adaptor< param_adaptor_tag::work_group_gemm, dict_t_ >

struct	param_adaptor_base

struct	param_optimizer

struct	param_optimizer< param_optimizer_tag::kernel, dict_t_ >

struct	param_optimizer< param_optimizer_tag::work_group, dict_t_ >

struct	param_optimizer_base

struct	register_attr_t

struct	register_attr_t< grf_num_mode, gpu_arch::Xe >

struct	shape

struct	uint_type
	Get the unit representation of type T. More...

struct	work_group_t
	Define a workgroup scope for a specific problem shape. More...

struct	xetla_nbarrier_t
	xetla nbarrier definition API. More...

struct	xetla_rand_t

class	xetla_saturation_off_tag

class	xetla_saturation_on_tag

Concepts
concept	xetla_vector_ref
	Workaround for ESIMD vector(1D) ref type.

concept	xetla_matrix_ref
	Workaround for ESIMD matrix(2D) ref type.

Typedefs
using	bf16 = sycl::ext::oneapi::bfloat16
	xetla bf16 data type.

using	fp16 = sycl::half
	xetla fp16 data type.

using	tf32 = sycl::ext::intel::experimental::esimd::tfloat32
	xetla tf32 data type.

template<typename T >
using	native_type_t = typename native_type< T >::type
	Return the native data type of T.

template<typename T >
using	uint_type_t = typename uint_type< T >::type
	Return the uint representation of type T.

template<int Size>
using	get_uint_type_t = typename get_uint_type< Size >::type
	Return the uint representation based on Size.

template<typename Ty , uint32_t N>
using	xetla_vector = __ESIMD_NS::simd< native_type_t< Ty >, N >
	wrapper for xetla_vector.

using	xetla_tdescriptor = xetla_vector< uint32_t, 16 >
	Description of nd tensor descriptor for load and store.

template<uint32_t N>
using	xetla_mask = __ESIMD_NS::simd_mask< N >
	wrapper for xetla_mask.

template<uint32_t N>
using	xetla_mask_int = __ESIMD_NS::simd_mask< N >
	wrapper for xetla_mask_int.

template<typename T >
using	is_xetla_scalar = typename __ESIMD_DNS::is_esimd_scalar< T >

template<auto d_ = impl::meta_impl_base_default, typename T = decltype(d_)>
using	meta_value = impl::meta_value_impl< T, d_ >

template<auto d_ = impl::meta_impl_base_default, typename T = decltype(d_)>
using	meta_value_t = typename meta_value< d_, T >::type

template<typename T = impl::meta_impl_base::_default>
using	meta_type = impl::meta_type_impl< T >

template<typename T >
using	meta_type_t = typename T::type

using	default_param_t = dict_t<>::template update_dict_t< detail::param_dtype_bf16_bf16_bf16 >::template update_dict_t< detail::param_memlayout_rrr >::template update_dict_t< detail::param_memalignment_8_8_8 >::template update_dict_t< detail::param_memspace_ggg >::template update_dict_t< detail::param_performance_default >::template update_dict_t< detail::param_runtime_default >::template update_t< elem_t_t< tune_key::data_type_acc, float >, elem_v_t< tune_key::global_kslicing_ratio, 1UL, uint32_t >, elem_v_t< tune_key::local_kslicing_ratio, 1UL, uint32_t >, elem_t_t< tune_key::wg_tile_shape, shape< 256, 256 > >, elem_t_t< tune_key::sg_tile_shape, shape< 64, 32 > >, elem_v_t< tune_key::param_optimizer_type, tune_key_value::param_optimizer_dummy > >

Enumerations
enum class	gpu_arch : uint8_t { Xe = 0 }

enum class	grf_mode : uint8_t { normal = 0 , double_grf = 1 }

enum class	mem_layout : uint8_t { row_major = 0 , col_major = 1 }

enum class	mem_space : uint8_t { global = 0 , local = 1 }

enum class	msg_type : uint8_t { block_2d = 0 , block_1d = 1 , scatter = 2 , atomic_add = 3 , unaligned_2d = 4 }

enum class	cache_hint : uint8_t { none = 0 , uncached = 1 , cached = 2 , write_back = 3 , write_through = 4 , streaming = 5 , read_invalidate = 6 }
	L1 or L2 cache hint kinds. More...

enum class	data_size : uint8_t { default_size = 0 , u8 = 1 , u16 = 2 , u32 = 3 , u64 = 4 , u8u32 = 5 , u16u32 = 6 , u16u32h = 7 }
	Data size or format to read or store. More...

enum class	memory_kind : uint8_t { untyped_global = 0 , untyped_global_low_pri = 1 , typed_global = 2 , shared_local = 3 }
	The specific LSC shared function to fence with xetla_fence. More...

enum class	fence_op : uint8_t { none = 0 , evict = 1 , invalidate = 2 , discard = 3 , clean = 4 , flushl2 = 5 }
	The xetla_fence operation to apply to caches. More...

enum class	fence_scope : uint8_t { group = 0 , local = 1 , tile = 2 , gpu = 3 , gpus = 4 , system = 5 , sysacq = 6 }
	The scope that xetla_fence operation should apply to. More...

enum class	atomic_op : uint8_t { iinc = 0x0 , idec = 0x1 , iadd = 0x2 , isub = 0x3 , smin = 0x4 , smax = 0x5 , cmpxchg = 0x6 , fadd = 0x7 , fsub = 0x8 , fmin = 0x9 , fmax = 0xa , fcmpxchg = 0xb , umin = 0xc , umax = 0xd , bit_and = 0xe , bit_or = 0xf , bit_xor = 0x10 , load = 0x11 , store = 0x12 }
	Represents an atomic operation. More...

enum class	argument_type : uint8_t { U1 = 0 , S1 = 1 , U2 = 2 , S2 = 3 , U4 = 4 , S4 = 5 , U8 = 6 , S8 = 7 , BF16 = 8 , FP16 = 9 , TF32 = 12 , DF = 13 , NUM_ARG_TYPES = 14 }
	xetla dpas argument typ More...

enum class	reduce_op : uint8_t { sum = 0 , prod = 1 , min = 2 , max = 3 }
	xetla reduce op More...

enum class	reg_layout : uint8_t { linear = 0 , tiled = 1 , vnni_tiled = 2 , transpose_tiled = 3 , vnni_tiled_col_major = 4 }
	tile layout in register linear: linear layout with one tile tiled: 2d block stacked in raster order vnni_tiled: vnni pack with 2d block and 2d block stacked in raster order for dword and qword, there is no impact for word, two rows are interleaved, i.e. More...

enum class	store_op : uint8_t { normal = 0 , atomic_fadd = 1 , atomic_iadd = 2 , scattered_transpose = 3 , block_1d = 4 }

enum class	mma_engine : uint8_t { xmx = 0 , fpu = 1 }

enum class	memory_op : uint8_t { load = 0 , store = 1 }

enum class	tdesc_update_dir : uint8_t { x_dir = 0 , y_dir = 1 }

enum class	post_kind : uint8_t { none = 0 , relu = 1 , gelu = 2 , gelu_bwd_w = 3 , sigmoid = 4 , tanh = 5 }

enum class	pre_kind : uint8_t { none = 0 , bias_add = 1 , res_add = 2 }

enum class	offset_mode : uint8_t { const_offset = 0 , cyclic_offset = 1 , acyclic_offset = 2 }

enum class	nbarrier_role : uint8_t { nbarrier_role::producer_consumer = 0 , nbarrier_role::producer = 1 , nbarrier_role::consumer = 2 }

enum class	ln_fwd_fused_kind : uint8_t { none = 0 , bias_dropout_resAdd_ln = 1 , ln_dropout = 2 , bias_rng_dropout_resAdd_ln = 3 , ln_rng_dropout = 4 }

enum class	ln_bwd_fused_kind : uint8_t { none = 0 , bias_dropout_resAdd_ln = 1 , ln_dropout_gradAdd = 2 , ln_dropout = 3 }

enum class	reduction_fused_kind : uint8_t { none = 0 , bias_gelu_w_bwd = 1 , bias_dropout_bwd = 2 }

enum class	tune_key : uint8_t { data_type_a , memory_layout_a , memory_alignment_a , memory_space_a , data_type_b , memory_layout_b , memory_alignment_b , memory_space_b , data_type_c , memory_layout_c , memory_alignment_c , memory_space_c , data_type_acc , global_kslicing_ratio , local_kslicing_ratio , wg_tile_shape , wg_tile_k , sg_tile_shape , pre_processing , prefetch_distance , periodic_sync_interval , mma_engine , gpu_arch , epilogue_policy , dispatch_policy , group_swizzle_policy , param_optimizer_type , source_location }

enum class	tune_key_value : uint8_t { pre_processing_default , pre_processing_mata_neg_filter , dispatch_policy_default , dispatch_policy_kslicing , dispatch_policy_stream_k , param_optimizer_dummy , param_optimizer_decision_tree }

enum class	param_optimizer_tag : uint8_t { kernel , work_group }

enum class	param_adaptor_tag : uint8_t { kernel , work_group_gemm , work_group_epilogue }

Functions
template<uint8_t NbarCount>
__XETLA_API void	xetla_nbarrier_init ()
	Initialize the number of named barrier index for a kernel.

__XETLA_API void	named_barrier_signal (uint8_t barrier_id, uint8_t producer_consumer_mode, uint32_t num_producers, uint32_t num_consumers)
	Perform signal operation for the given named barrier id.

__XETLA_API void	named_barrier_wait (uint8_t barrier_id)
	Wait on a named barrier.

template<typename T0 , typename T1 , int SZ, typename U , class Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T0, SZ >	xetla_shl (xetla_vector< T1, SZ > src0, U src1, Sat sat={})
	Shift left operation (vector version)

template<typename T0 , typename T1 , typename T2 , class Sat = xetla_saturation_off_tag>
std::remove_const< T0 >::type	xetla_shl (T1 src0, T2 src1, Sat sat={})
	Shift left operation (scalar version)

template<typename T0 , typename T1 , int SZ, typename U , class Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T0, SZ >	xetla_shr (xetla_vector< T1, SZ > src0, U src1, Sat sat={})
	Shift right operation (vector version)

template<typename T0 , typename T1 , typename T2 , class Sat = xetla_saturation_off_tag>
__XETLA_API std::remove_const< T0 >::type	xetla_shr (T1 src0, T2 src1, Sat sat={})
	Shift right operation (scalar version)

template<typename T0 , typename T1 , int SZ>
__XETLA_API xetla_vector< T0, SZ >	xetla_rol (xetla_vector< T1, SZ > src0, xetla_vector< T1, SZ > src1)
	Rotate left operation with two vector inputs.

template<typename T0 , typename T1 , int SZ, typename U >
__XETLA_API std::enable_if_t< std::is_integral< T0 >::value &&std::is_integral< T1 >::value &&std::is_integral< U >::value &&is_xetla_scalar< U >::value, xetla_vector< T0, SZ > >	xetla_rol (xetla_vector< T1, SZ > src0, U src1)
	Rotate left operation with a vector and a scalar inputs.

template<typename T0 , typename T1 , typename T2 >
__XETLA_API std::enable_if_t< std::is_integral< T0 >::value &&std::is_integral< T1 >::value &&std::is_integral< T2 >::value, remove_const_t< T0 > >	xetla_rol (T1 src0, T2 src1)
	Rotate left operation with two scalar inputs.

template<typename T0 , typename T1 , int SZ>
__XETLA_API xetla_vector< T0, SZ >	xetla_ror (xetla_vector< T1, SZ > src0, xetla_vector< T1, SZ > src1)
	Rotate right operation with two vector inputs.

template<typename T0 , typename T1 , int SZ, typename U >
__XETLA_API std::enable_if_t< std::is_integral< T0 >::value &&std::is_integral< T1 >::value &&std::is_integral< U >::value &&is_xetla_scalar< U >::value, xetla_vector< T0, SZ > >	xetla_ror (xetla_vector< T1, SZ > src0, U src1)
	Rotate right operation with a vector and a scalar inputs.

template<typename T0 , typename T1 , typename T2 >
__XETLA_API std::enable_if_t< std::is_integral< T0 >::value &&std::is_integral< T1 >::value &&std::is_integral< T2 >::value, remove_const_t< T0 > >	xetla_ror (T1 src0, T2 src1)
	Rotate right operation with two scalar inputs.

template<typename T0 , typename T1 , int SZ, typename U , class Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T0, SZ >	xetla_lsr (xetla_vector< T1, SZ > src0, U src1, Sat sat={})
	Logical Shift Right (vector version)

template<typename T0 , typename T1 , typename T2 , class Sat = xetla_saturation_off_tag>
__XETLA_API std::remove_const< T0 >::type	xetla_lsr (T1 src0, T2 src1, Sat sat={})
	Logical Shift Right (scalar version)

template<typename T0 , typename T1 , int SZ, typename U , class Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T0, SZ >	xetla_asr (xetla_vector< T1, SZ > src0, U src1, Sat sat={})
	Arithmetical Shift Right (vector version)

template<typename T0 , typename T1 , typename T2 , class Sat = xetla_saturation_off_tag>
__XETLA_API std::remove_const< T0 >::type	xetla_asr (T1 src0, T2 src1, Sat sat={})
	Arithmetical Shift Right (scalar version)

template<int N>
__XETLA_API uint32_t	xetla_pack_mask (xetla_mask< N > src0)
	Pack a xetla_mask into a single unsigned 32-bit integer value.

template<int N>
__XETLA_API xetla_mask< N >	xetla_unpack_mask (uint32_t src0)
	Unpack an unsigned 32-bit integer value into a xetla_mask.

void	xetla_wait (uint16_t val)

template<typename T_dst , typename T_src , int N>
__XETLA_API std::enable_if_t<!(is_internal_type< T_dst >::value) &&!(is_internal_type< T_src >::value), xetla_vector< T_dst, N > >	xetla_cvt (xetla_vector< T_src, N > src)
	xetla explicit data conversion for standard data types(integer,float,half)

template<typename T_dst , typename T_src , int N>
__XETLA_API std::enable_if_t< std::is_same< T_dst, bf16 >::value &&std::is_same< T_src, float >::value, xetla_vector< T_dst, N > >	xetla_cvt (xetla_vector< T_src, N > src)
	xetla explicit data conversion, fp32->bf16.

template<typename T_dst , typename T_src , int N>
__XETLA_API std::enable_if_t< std::is_same< T_dst, float >::value &&std::is_same< T_src, bf16 >::value, xetla_vector< T_dst, N > >	xetla_cvt (xetla_vector< T_src, N > src)
	xetla explicit data conversion, bf16->fp32.

template<typename T_dst , typename T_src , int N>
__XETLA_API std::enable_if_t< std::is_same< T_dst, tf32 >::value &&std::is_same< T_src, float >::value, xetla_vector< T_dst, N > >	xetla_cvt (xetla_vector< T_src, N > src)
	xetla explicit data conversion, fp32->tf32.

template<typename T_dst , typename T_src , int N>
__XETLA_API std::enable_if_t< std::is_same< T_dst, float >::value &&std::is_same< T_src, tf32 >::value, xetla_vector< T_dst, N > >	xetla_cvt (xetla_vector< T_src, N > src)
	xetla explicit data conversion, tf32->fp32.

template<typename T_dst , typename T_src , int N>
__XETLA_API std::enable_if_t< std::is_same< T_dst, fp16 >::value &&std::is_same< T_src, int32_t >::value, xetla_vector< T_dst, N > >	xetla_cvt (xetla_vector< T_src, N > src, float scaling_value)
	xetpp explicit data conversion with scaling, int32->fp16.

template<typename T_dst , typename T_src , int N>
__XETLA_API std::enable_if_t< std::is_same< T_dst, int8_t >::value &&std::is_same< T_src, int32_t >::value, xetla_vector< T_dst, N > >	xetla_cvt (xetla_vector< T_src, N > src, float scaling_value)
	xetpp explicit data conversion with re-quantization, int32->int8.

template<typename T_dst , typename T_src , int N>
__XETLA_API std::enable_if_t< std::is_same< T_dst, int8_t >::value &&std::is_same< T_src, float >::value, xetla_vector< T_dst, N > >	xetla_cvt (xetla_vector< T_src, N > src, float scaling_value)
	xetpp explicit data conversion with scaling and quantization, float32->int8.

template<typename T_dst , typename T_src , int N>
__XETLA_API std::enable_if_t< std::is_same< T_dst, T_src >::value &&is_internal_type< T_src >::value, xetla_vector< T_dst, N > >	xetla_cvt (xetla_vector< T_src, N > src)
	xetla explicit data conversion, same type.

template<typename T0 , typename T1 , int SZ>
__XETLA_API xetla_vector< T0, SZ >	xetla_abs (xetla_vector< T1, SZ > src0)
	Get absolute value (vector version)

template<typename T0 , typename T1 >
std::enable_if_t<!std::is_same< remove_const_t< T0 >, remove_const_t< T1 > >::value, remove_const_t< T0 > > __XETLA_API	xetla_abs (T1 src0)
	Get absolute value (scalar version)

template<typename T1 , int SZ>
__XETLA_API xetla_vector< T1, SZ >	xetla_abs (xetla_vector< T1, SZ > src0)
	Get absolute value (vector version).

template<typename T1 >
__XETLA_API std::remove_const< T1 >::type	xetla_abs (T1 src0)
	Get absolute value (scalar version).

template<typename T , int SZ, typename Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T, SZ >	xetla_max (xetla_vector< T, SZ > src0, xetla_vector< T, SZ > src1, Sat sat={})
	Selects component-wise the maximum of the two vectors.

template<typename T , int SZ, typename Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T, SZ >	xetla_max (xetla_vector< T, SZ > src0, T src1, Sat sat={})
	Selects maximums for each element of the input vector and a scalar.

template<typename T , int SZ, typename Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T, SZ >	xetla_max (T src0, xetla_vector< T, SZ > src1, Sat sat={})
	Selects maximums for each element of the input scalar and a vector.

template<typename T , typename Sat = xetla_saturation_off_tag>
__XETLA_API T	xetla_max (T src0, T src1, Sat sat={})
	Selects maximum between two scalar values.

template<typename T , int SZ, typename Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T, SZ >	xetla_min (xetla_vector< T, SZ > src0, xetla_vector< T, SZ > src1, Sat sat={})
	Selects component-wise the minimum of the two vectors.

template<typename T , int SZ, typename Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T, SZ >	xetla_min (xetla_vector< T, SZ > src0, T src1, Sat sat={})
	Selects minimums for each element of the input vector and a scalar.

template<typename T , int SZ, typename Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T, SZ >	xetla_min (T src0, xetla_vector< T, SZ > src1, Sat sat={})
	Selects minimums for each element of the input scalar and a vector.

template<typename T , typename Sat = xetla_saturation_off_tag>
__XETLA_API T	xetla_min (T src0, T src1, Sat sat={})
	Selects minimum between two scalar values.

template<class T , int SZ, typename Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T, SZ >	xetla_exp (xetla_vector< T, SZ > src, Sat sat={})
	Calculate exponent value for each element of the input vector, the base is e.

template<class T , typename Sat = xetla_saturation_off_tag>
__XETLA_API T	xetla_exp (T src, Sat sat={})
	Calculate exponent value of a scalar, the base is e.

template<class T , int SZ, typename Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T, SZ >	xetla_exp2 (xetla_vector< T, SZ > src, Sat sat={})
	Calculate exponent value for each element of the input vector, the base is 2.

template<class T , typename Sat = xetla_saturation_off_tag>
__XETLA_API T	xetla_exp2 (T src, Sat sat={})
	Calculate exponent value of a scalar, the base is 2.

template<typename T , int SZ, typename Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T, SZ >	xetla_inv (xetla_vector< T, SZ > src, Sat sat={})
	Calculate the inversion, i.e.

template<typename T , typename Sat = xetla_saturation_off_tag>
__XETLA_API T	xetla_inv (T src, Sat sat={})
	Calculate the inversion, i.e.

template<typename T , int SZ, typename Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T, SZ >	xetla_sqrt (xetla_vector< T, SZ > src, Sat sat={})
	Calculate the square root, i.e.

template<typename T , typename Sat = xetla_saturation_off_tag>
__XETLA_API T	xetla_sqrt (T src, Sat sat={})
	Calculate the square root, i.e.

template<typename T , int SZ, typename Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T, SZ >	xetla_sqrt_ieee (xetla_vector< T, SZ > src, Sat sat={})
	Calculate the square root, i.e.

template<typename T , typename Sat = xetla_saturation_off_tag>
__XETLA_API T	xetla_sqrt_ieee (T src, Sat sat={})
	Calculate the square root, i.e.

template<typename T , int SZ, typename Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T, SZ >	xetla_rsqrt (xetla_vector< T, SZ > src, Sat sat={})
	Calculate the inversion of square root, i.e.

template<typename T , typename Sat = xetla_saturation_off_tag>
__XETLA_API T	xetla_rsqrt (T src, Sat sat={})
	Calculate the inversion of square root, i.e.

template<typename T , int SZ>
__XETLA_API xetla_vector< T, SZ >	xetla_tanh (xetla_vector< T, SZ > src)
	Calculate the tanh (vector version).

template<typename T >
__XETLA_API T	xetla_tanh (T src)
	Calculate the tanh (scalar version).

template<typename T , int SZ>
__XETLA_API xetla_vector< T, SZ >	xetla_add_c (xetla_vector< T, SZ > src0, xetla_vector< T, SZ > src1, xetla_vector_ref< T, SZ > __REF__ carry)
	Add two unsigned integer vectors, return the result and in-place update the carry.

template<typename T , int SZ>
__XETLA_API xetla_vector< T, SZ >	xetla_add_c (xetla_vector< T, SZ > src0, T src1, xetla_vector_ref< T, SZ > __REF__ carry)
	Add one unsigned integer vectors with a scalar, return the result and in-place update the carry.

template<typename T0 , typename T1 , typename T2 , int SZ>
__XETLA_API xetla_vector< T0, SZ >	xetla_imul (xetla_vector_ref< T0, SZ > __REF__ lo, xetla_vector< T1, SZ > src0, T2 src1)
	Multiply src0 with src1, return the hi part and in-place update the lo part.

template<typename T0 , typename T1 , int SZ, reduce_op BinaryOperation>
__XETLA_API T0	xetla_reduce (xetla_vector< T1, SZ > v)
	Performs reduction over elements of the input vector.

template<typename T , int SZ>
__XETLA_API xetla_vector< T, SZ >	xetla_rnde (xetla_vector< T, SZ > src0)
	Get rounded value.

template<typename T1 , typename T0 , int SZ, typename Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T1, SZ >	xetla_add (xetla_vector< T0, SZ > src0, xetla_vector< T0, SZ > src1, Sat sat={})
	Adds two vectors with saturation The source operands must be both of floating-point type.

template<typename T1 , typename T0 , int SZ>
__XETLA_API xetla_vector< T1, SZ >	xetla_sat (xetla_vector< T0, SZ > src)
	Saturation function.

template<argument_type src1_precision, argument_type src2_precision, int systolic_depth, int repeat_count, typename T , typename T1 , typename T2 , int N, int N1, int N2, typename Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T, N >	xetla_mma (xetla_vector< T, N > src0, xetla_vector< T1, N1 > src1, xetla_vector< T2, N2 > src2, Sat sat={})
	description of xetla mma perform matrix multiply add operation

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::cached, cache_hint L2H = cache_hint::cached, int N>
__XETLA_API void	xetla_prefetch_global (Ty *p, xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred=1)
	Stateless scattered prefetch.

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::cached, cache_hint L2H = cache_hint::cached>
__XETLA_API void	xetla_prefetch_global (Ty *p, uint64_t offset=0)
	Stateless block prefetch (transposed gather with 1 channel).

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, int N, typename Toffset = uint32_t>
__XETLA_API xetla_vector< Ty, N *NElts >	xetla_load_global (Ty *p, xetla_vector< Toffset, N > offsets, xetla_mask< N > pred=1)
	Stateless scattered load.

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API xetla_vector< Ty, NElts >	xetla_load_global (Ty *p, uint64_t offset=0)
	Stateless block load (transposed gather with 1 channel).

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, int N, typename Toffset = uint32_t>
__XETLA_API void	xetla_store_global (Ty p, xetla_vector< Toffset, N > offsets, xetla_vector< Ty, N NElts > vals, xetla_mask< N > pred=1)
	Stateless scattered store.

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API void	xetla_store_global (Ty *p, uint64_t offset, xetla_vector< Ty, NElts > vals)
	Stateless block store (transposed scatter with 1 channel).

template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API xetla_vector< T, N >	xetla_atomic_global (T *p, xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred)
	Stateless scattered atomic (0 src).

template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API xetla_vector< T, N >	xetla_atomic_global (T *p, xetla_vector< uint32_t, N > offsets, xetla_vector< T, N > src0, xetla_mask< N > pred)
	Stateless scattered atomic (1 src).

template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API xetla_vector< T, N >	xetla_atomic_global (T *p, xetla_vector< uint32_t, N > offsets, xetla_vector< T, N > src0, xetla_vector< T, N > src1, xetla_mask< N > pred)
	Stateless scattered atomic (2 src).

template<uint32_t SLMSize>
__XETLA_API void	xetla_local_init ()
	Declare per-work-group slm size.

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, int N>
__XETLA_API xetla_vector< Ty, N *NElts >	xetla_load_local (xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred=1)
	SLM scattered load.

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size>
__XETLA_API xetla_vector< Ty, NElts >	xetla_load_local (uint32_t offset)
	SLM block load.

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, int N>
__XETLA_API void	xetla_store_local (xetla_vector< uint32_t, N > offsets, xetla_vector< Ty, N *NElts > vals, xetla_mask< N > pred=1)
	SLM scattered store.

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size>
__XETLA_API void	xetla_store_local (uint32_t offset, xetla_vector< Ty, NElts > vals)
	SLM block store (transposed SLM scatter with 1 channel).

template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size>
__XETLA_API xetla_vector< T, N >	xetla_atomic_local (xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred)
	SLM scattered atomic (0 src).

template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size>
__XETLA_API xetla_vector< T, N >	xetla_atomic_local (xetla_vector< uint32_t, N > offsets, xetla_vector< T, N > src0, xetla_mask< N > pred)
	SLM scattered atomic (1 src).

template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size>
__XETLA_API xetla_vector< T, N >	xetla_atomic_local (xetla_vector< uint32_t, N > offsets, xetla_vector< T, N > src0, xetla_vector< T, N > src1, xetla_mask< N > pred)
	SLM scattered atomic (2 src).

template<memory_kind Kind = memory_kind::untyped_global, fence_op FenceOp = fence_op::none, fence_scope Scope = fence_scope::group, int N = 16>
__XETLA_API void	xetla_fence (xetla_mask< N > pred=1)
	Memory fence.

template<typename T1 , uint32_t n1, typename T2 , uint32_t n2, uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t numDst, uint8_t isEOT = 0, uint8_t isSendc = 0, int N = 16>
__XETLA_API void	xetla_raw_send (xetla_vector_ref< T1, n1 > __REF__ msgDst, xetla_vector< T2, n2 > msgSrc0, uint32_t exDesc, uint32_t msgDesc, xetla_mask< N > mask=1)
	Raw send with one source operand and one destination operand.

template<typename T1 , uint32_t n1, typename T2 , uint32_t n2, typename T3 , uint32_t n3, uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t numSrc1, uint8_t numDst, uint8_t isEOT = 0, uint8_t isSendc = 0, int N = 16>
__XETLA_API void	xetla_raw_send (xetla_vector_ref< T1, n1 > __REF__ msgDst, xetla_vector< T2, n2 > msgSrc0, xetla_vector< T3, n3 > msgSrc1, uint32_t exDesc, uint32_t msgDesc, xetla_mask< N > mask=1)
	Raw send with two source operands and one destination operand.

template<typename T1 , uint32_t n1, uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t isEOT = 0, uint8_t isSendc = 0, int N = 16>
__XETLA_API void	xetla_raw_send (xetla_vector< T1, n1 > msgSrc0, uint32_t exDesc, uint32_t msgDesc, xetla_mask< N > mask=1)
	Raw send with one source operand and no return.

template<typename T1 , uint32_t n1, typename T2 , uint32_t n2, uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t numSrc1, uint8_t isEOT = 0, uint8_t isSendc = 0, int N = 16>
__XETLA_API void	xetla_raw_send (xetla_vector< T1, n1 > msgSrc0, xetla_vector< T2, n2 > msgSrc1, uint32_t exDesc, uint32_t msgDesc, xetla_mask< N > mask=1)
	Raw send with two source operands and no return.

template<typename kernel_t >
void	slm_barrier_init ()
	Initial the local memory size and named barrier count with kernel_t.

template<uint32_t slm_size, uint32_t nbarrier_count>
void	slm_barrier_init ()
	Initial the local memory size and named barrier count.

int	clz (int x)
	Host side utility function to compute number of leading zeros in the binary representation.

int	find_log2 (int x)
	Host side utility function to compute log2 function.

__XETLA_API xetla_vector< uint32_t, 4 >	get_time_stamp ()
	Returns time stamp.

template<typename Ty , int N>
__XETLA_API xetla_vector< Ty, N >	xetla_vector_gen (int InitVal, int Step)
	xetla_vector generation.

template<uint32_t N>
__XETLA_API xetla_mask_int< N >	xetla_mask_int_gen (uint32_t mask_val)

template<typename dtype_acc , uint32_t N, uint32_t num_flag = 4, typename dtype_mask = uint8_t>
__XETLA_API xetla_vector< dtype_acc, N >	drop_out (xetla_vector< dtype_acc, N > in, xetla_vector< dtype_mask, N > mask, dtype_acc scale)

template<reduce_op reduce_kind, typename dtype , int size>
__XETLA_API std::enable_if_t< reduce_kind==reduce_op::sum, xetla_vector< dtype, size > >	reduce_helper (xetla_vector< dtype, size > a, xetla_vector< dtype, size > b)

template<reduce_op reduce_kind, typename dtype , int size>
__XETLA_API std::enable_if_t< reduce_kind==reduce_op::prod, xetla_vector< dtype, size > >	reduce_helper (xetla_vector< dtype, size > a, xetla_vector< dtype, size > b)

template<reduce_op reduce_kind, typename dtype , int size>
__XETLA_API std::enable_if_t< reduce_kind==reduce_op::max, xetla_vector< dtype, size > >	reduce_helper (xetla_vector< dtype, size > a, xetla_vector< dtype, size > b)

template<reduce_op reduce_kind, typename dtype , int size>
__XETLA_API std::enable_if_t< reduce_kind==reduce_op::min, xetla_vector< dtype, size > >	reduce_helper (xetla_vector< dtype, size > a, xetla_vector< dtype, size > b)

template<reduce_op reduce_kind, typename dtype , int N_x, int N_y>
__XETLA_API std::enable_if_t< N_y==1, xetla_vector< dtype, N_x > >	recur_row_reduce (xetla_vector< dtype, N_x > in)

template<reduce_op reduce_kind, typename dtype , int N_x, int N_y>
__XETLA_API std::enable_if_t<(N_y > 1), xetla_vector< dtype, N_x > >	recur_row_reduce (xetla_vector< dtype, N_x *N_y > in)

template<reduce_op reduce_kind, typename dtype , int N_x, int N_y>
__XETLA_API std::enable_if_t< N_x==1, xetla_vector< dtype, N_y > >	recur_col_reduce (xetla_vector< dtype, N_y > in)

template<reduce_op reduce_kind, typename dtype , int N_x, int N_y>
__XETLA_API std::enable_if_t<(N_x > 1), xetla_vector< dtype, N_y > >	recur_col_reduce (xetla_vector< dtype, N_x *N_y > in)

__XETLA_API uint32_t	get_2d_group_linear_id (sycl::nd_item< 3 > &item)
	get linear group id of the last two dimensions.

template<typename Ty , uint32_t block_width = 1, uint32_t block_height = 1, uint8_t array_len = 1>
__XETLA_API void	xetla_fill_tdesc (xetla_tdescriptor_ref tdesc, Ty *p, int tensor_width, int tensor_height, int tensor_pitch, int offset_x, int offset_y)
	Tensor descriptor construction(global memory version).

template<typename Ty >
__XETLA_API void	xetla_fill_tdesc (xetla_tdescriptor_ref tdesc, uint32_t base_address, int tensor_width, int tensor_height, int tensor_pitch, int offset_x, int offset_y)
	Tensor descriptor construction(local memory version).

template<typename Ty , uint32_t block_width = 1, uint32_t block_height = 1, uint8_t array_len = 1>
__XETLA_API xetla_tdescriptor	xetla_get_tdesc (Ty *p, int tensor_width, int tensor_height, int tensor_pitch, int offset_x, int offset_y)
	Generate a new tensor descriptor(global memory version).

template<typename Ty >
__XETLA_API xetla_tdescriptor	xetla_get_tdesc (uint32_t base_address, int tensor_width, int tensor_height, int tensor_pitch, int offset_x, int offset_y)
	Generate a new tensor descriptor(local memory version).

__XETLA_API void	xetla_update_tdesc_offsetx (xetla_tdescriptor_ref tdesc, int32_t doffset_x)
	Update the x coordinate in the given tensor descriptor.

__XETLA_API void	xetla_update_tdesc_offsety (xetla_tdescriptor_ref tdesc, int32_t doffset_y)
	Update the y coordinate in the given tensor descriptor.

template<typename Ty , uint32_t N, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, bool transpose = false, bool transform = false, gpu_arch arch_tag = gpu_arch::Xe>
__XETLA_API std::enable_if_t< arch_tag==gpu_arch::Xe, xetla_vector< Ty, N > >	xetla_tload_global (xetla_tdescriptor tdesc)
	Tensor load API.

template<typename Ty , uint32_t N, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, gpu_arch arch_tag = gpu_arch::Xe>
__XETLA_API std::enable_if_t< arch_tag==gpu_arch::Xe, void >	xetla_tstore_global (xetla_tdescriptor tdesc, xetla_vector< Ty, N > data)
	Tensor store API.

template<typename Ty , cache_hint L1H = cache_hint::cached, cache_hint L2H = cache_hint::cached, gpu_arch arch_tag = gpu_arch::Xe>
__XETLA_API std::enable_if_t< arch_tag==gpu_arch::Xe, void >	xetla_tprefetch_global (xetla_tdescriptor tdesc)
	Tensor prefetch API.

template<typename Ty , uint32_t N, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, atomic_op Op, gpu_arch arch_tag = gpu_arch::Xe, typename Toffset = uint32_t>
__XETLA_API std::enable_if_t< arch_tag==gpu_arch::Xe, void >	xetla_tatomic_store_global (uint64_t base_address, xetla_vector< Toffset, N > offset, xetla_vector< Ty, N > data, xetla_mask< N > pred=1)
	Tensor atomic store API.

Typedef Documentation

◆ default_param_t

using gpu::xetla::default_param_t = typedef dict_t<>::template update_dict_t< detail::param_dtype_bf16_bf16_bf16>::template update_dict_t<detail:: param_memlayout_rrr>::template update_dict_t<detail:: param_memalignment_8_8_8>::template update_dict_t<detail:: param_memspace_ggg>::template update_dict_t<detail:: param_performance_default>::template update_dict_t<detail:: param_runtime_default>:: template update_t<elem_t_t<tune_key::data_type_acc, float>, elem_v_t<tune_key::global_kslicing_ratio, 1UL, uint32_t>, elem_v_t<tune_key::local_kslicing_ratio, 1UL, uint32_t>, elem_t_t<tune_key::wg_tile_shape, shape<256, 256> >, elem_t_t<tune_key::sg_tile_shape, shape<64, 32> >, elem_v_t<tune_key::param_optimizer_type, tune_key_value::param_optimizer_dummy> >

◆ is_xetla_scalar

template<typename T >

using gpu::xetla::is_xetla_scalar = typedef typename __ESIMD_DNS::is_esimd_scalar<T>

◆ meta_type

template<typename T = impl::meta_impl_base::_default>

using gpu::xetla::meta_type = typedef impl::meta_type_impl<T>

◆ meta_type_t

template<typename T >

using gpu::xetla::meta_type_t = typedef typename T::type

◆ meta_value

template<auto d_ = impl::meta_impl_base_default, typename T = decltype(d_)>

using gpu::xetla::meta_value = typedef impl::meta_value_impl<T, d_>

◆ meta_value_t

template<auto d_ = impl::meta_impl_base_default, typename T = decltype(d_)>

using gpu::xetla::meta_value_t = typedef typename meta_value<d_, T>::type

Enumeration Type Documentation

◆ argument_type

enum class gpu::xetla::argument_type : uint8_t

strong

xetla dpas argument typ

Enumerator
U1
S1
U2
S2
U4
S4
U8
S8
BF16
FP16
TF32
DF
NUM_ARG_TYPES

◆ atomic_op

enum class gpu::xetla::atomic_op : uint8_t

strong

Represents an atomic operation.

Operations always return the old value(s) of the target memory location(s) as it was before the operation was applied.

Enumerator
iinc	Atomic increment of memory data and return the old value. see
idec	Atomic decrement of memory data and return the old value. see
iadd	Atomic signed int add of src1 from memory data and return the old value. see
isub	Atomic signed int subtract of src1 from memory data and return the old value. see
smin	Atomic store the signed int min of src1 and memory data and return the old value. see
smax	Atomic store the signed int max of src1 and memory data and return the old value. see
cmpxchg	Atomic bit-compare src1_X and memory data and replace if equal with src1_Y. Returns the old value. see
fadd	Atomic float add of src1 from memory data and return the old value. see
fsub	Atomic float subtract of src1 from memory data and return the old value. see
fmin	Atomic store the float min of src1 and memory data and return the old value. see
fmax	Atomic store the float max of src1 and memory data and return the old value. see
fcmpxchg	Atomic float compare src1_X and memory data and replace if equal with src1_Y. Returns the old value. see
umin	Atomic store the unsigned int min of src1 and memory data and return the old value. see
umax	Atomic store the unsigned int max of src1 and memory data and return the old value. see
bit_and	Atomic store the bitwise AND of src1 and memory data and return the old value. see
bit_or	Atomic store the bitwise OR of src1 and memory data and return the old value. see
bit_xor	Atomic store the bitwise XOR of src1 and memory data and return the old value. see
load	Atomic read of the memory data value, without modifying the data. see
store	Atomic store untyped data to memory. see

◆ cache_hint

enum class gpu::xetla::cache_hint : uint8_t

strong

L1 or L2 cache hint kinds.

Enumerator
none
uncached
cached
write_back
write_through
streaming
read_invalidate

◆ data_size

enum class gpu::xetla::data_size : uint8_t

strong

Data size or format to read or store.

Enumerator
default_size
u8
u16
u32
u64
u8u32
u16u32	load 8b, zero extend to 32b; store the opposite
u16u32h	load 16b, zero extend to 32b; store the opposite

◆ fence_op

enum class gpu::xetla::fence_op : uint8_t

strong

The xetla_fence operation to apply to caches.

Enumerator
none
evict	no operation
invalidate	dirty lines evicted and invalidated from L1
discard	invalidate all clean lines
clean	direct and clean lines are discarded w/o eviction
flushl2	dirty lines are written to memory, but retained in cache in clean state

◆ fence_scope

enum class gpu::xetla::fence_scope : uint8_t

strong

The scope that xetla_fence operation should apply to.

Enumerator
group
local	flush out to the threadgroup's scope
tile	flush out to the local scope
gpu	tile, flush out to several DSSs
gpus	entire GPU, flush out to the GPUs LLC
system	all GPUs in the system, flush out to memory shared by all GPUs
sysacq	the entire system memory space

◆ gpu_arch

enum class gpu::xetla::gpu_arch : uint8_t

strong

Enumerator
Xe

◆ grf_mode

enum class gpu::xetla::grf_mode : uint8_t

strong

Enumerator
normal
double_grf

◆ ln_bwd_fused_kind

enum class gpu::xetla::ln_bwd_fused_kind : uint8_t

strong

Enumerator
none
bias_dropout_resAdd_ln
ln_dropout_gradAdd
ln_dropout

◆ ln_fwd_fused_kind

enum class gpu::xetla::ln_fwd_fused_kind : uint8_t

strong

Enumerator
none
bias_dropout_resAdd_ln
ln_dropout
bias_rng_dropout_resAdd_ln
ln_rng_dropout

◆ mem_layout

enum class gpu::xetla::mem_layout : uint8_t

strong

Enumerator
row_major
col_major

◆ mem_space

enum class gpu::xetla::mem_space : uint8_t

strong

Enumerator
global
local

◆ memory_kind

enum class gpu::xetla::memory_kind : uint8_t

strong

The specific LSC shared function to fence with xetla_fence.

Enumerator
untyped_global
untyped_global_low_pri	untyped global memory
typed_global	low-priority untyped global memory
shared_local	typed global memory

◆ memory_op

enum class gpu::xetla::memory_op : uint8_t

strong

Enumerator
load
store

◆ mma_engine

enum class gpu::xetla::mma_engine : uint8_t

strong

Enumerator
xmx
fpu

◆ msg_type

enum class gpu::xetla::msg_type : uint8_t

strong

Enumerator
block_2d
block_1d
scatter
atomic_add
unaligned_2d

◆ offset_mode

enum class gpu::xetla::offset_mode : uint8_t

strong

Enumerator
const_offset
cyclic_offset
acyclic_offset

◆ param_adaptor_tag

enum class gpu::xetla::param_adaptor_tag : uint8_t

strong

Enumerator
kernel
work_group_gemm
work_group_epilogue

◆ param_optimizer_tag

enum class gpu::xetla::param_optimizer_tag : uint8_t

strong

Enumerator
kernel
work_group

◆ post_kind

enum class gpu::xetla::post_kind : uint8_t

strong

Enumerator
none
relu
gelu
gelu_bwd_w
sigmoid
tanh

◆ pre_kind

enum class gpu::xetla::pre_kind : uint8_t

strong

Enumerator
none
bias_add
res_add

◆ reduce_op

enum class gpu::xetla::reduce_op : uint8_t

strong

xetla reduce op

Enumerator
sum
prod
min
max

◆ reduction_fused_kind

enum class gpu::xetla::reduction_fused_kind : uint8_t

strong

Enumerator
none
bias_gelu_w_bwd
bias_dropout_bwd

◆ reg_layout

enum class gpu::xetla::reg_layout : uint8_t

strong

tile layout in register linear: linear layout with one tile tiled: 2d block stacked in raster order vnni_tiled: vnni pack with 2d block and 2d block stacked in raster order for dword and qword, there is no impact for word, two rows are interleaved, i.e.

a0 b0 c0 d0 ==> a0 a1 b0 b1 c0 c1 d0 d1 a1 b1 c1 d1 for byte, four rows are interleaved and formed one row, i.e. a0 b0 c0 d0 ==> a0 a1 a2 a3 b0 b1 b2 b3 c0 c1 c2 c3 d0 d1 d2 d3 a1 b1 c1 d1 a2 b2 c2 d2 a3 b3 c3 d3

Enumerator
linear
tiled
vnni_tiled
transpose_tiled
vnni_tiled_col_major	this is vnni tiled format, but for each block, they are stored in col major order

◆ store_op

enum class gpu::xetla::store_op : uint8_t

strong

Enumerator
normal
atomic_fadd
atomic_iadd
scattered_transpose
block_1d

◆ tdesc_update_dir

enum class gpu::xetla::tdesc_update_dir : uint8_t

strong

Enumerator
x_dir
y_dir

◆ tune_key

enum class gpu::xetla::tune_key : uint8_t

strong

Enumerator
data_type_a
memory_layout_a
memory_alignment_a
memory_space_a
data_type_b
memory_layout_b
memory_alignment_b
memory_space_b
data_type_c
memory_layout_c
memory_alignment_c
memory_space_c
data_type_acc
global_kslicing_ratio
local_kslicing_ratio
wg_tile_shape
wg_tile_k
sg_tile_shape
pre_processing
prefetch_distance
periodic_sync_interval
mma_engine
gpu_arch
epilogue_policy
dispatch_policy
group_swizzle_policy
param_optimizer_type
source_location

◆ tune_key_value

enum class gpu::xetla::tune_key_value : uint8_t

strong

Enumerator
pre_processing_default
pre_processing_mata_neg_filter
dispatch_policy_default
dispatch_policy_kslicing
dispatch_policy_stream_k
param_optimizer_dummy
param_optimizer_decision_tree

Function Documentation

◆ clz()

int gpu::xetla::clz ( int x )

inline

Host side utility function to compute number of leading zeros in the binary representation.

◆ find_log2()

int gpu::xetla::find_log2 ( int x )

inline

Host side utility function to compute log2 function.

◆ reduce_helper() [1/3]

template<reduce_op reduce_kind, typename dtype , int size>

__XETLA_API std::enable_if_t< reduce_kind==reduce_op::prod, xetla_vector< dtype, size > > gpu::xetla::reduce_helper	(	xetla_vector< dtype, size >	a,
		xetla_vector< dtype, size >	b
	)

◆ reduce_helper() [2/3]

template<reduce_op reduce_kind, typename dtype , int size>

__XETLA_API std::enable_if_t< reduce_kind==reduce_op::max, xetla_vector< dtype, size > > gpu::xetla::reduce_helper	(	xetla_vector< dtype, size >	a,
		xetla_vector< dtype, size >	b
	)

◆ reduce_helper() [3/3]

template<reduce_op reduce_kind, typename dtype , int size>

__XETLA_API std::enable_if_t< reduce_kind==reduce_op::min, xetla_vector< dtype, size > > gpu::xetla::reduce_helper	(	xetla_vector< dtype, size >	a,
		xetla_vector< dtype, size >	b
	)

◆ slm_barrier_init() [1/2]

template<typename kernel_t >

void gpu::xetla::slm_barrier_init ( )

Initial the local memory size and named barrier count with kernel_t.

Template Parameters

kernel_t Is XeTLA kernel level functor.

◆ slm_barrier_init() [2/2]

template<uint32_t slm_size, uint32_t nbarrier_count>

void gpu::xetla::slm_barrier_init ( )

Initial the local memory size and named barrier count.

Template Parameters

slm_size	Is the local memory size required.
nbarrier_count	Is the named barrier count required.