XeTLA v0.3.6
Intel® Xe Templates for Linear Algebra - API Definition Document
 
gpu::xetla::subgroup Namespace Reference

Namespaces

namespace  detail
 

Classes

struct  bias_add_op_t
 Is the bias_add op functor. More...
 
struct  bias_add_op_t< dtype_bias_, arch_tag, std::enable_if_t<(arch_tag==gpu_arch::Xe)> >
 Is the bias_add op functor, specialized for Xe architecture. More...
 
struct  chained_tile_op_arg_t
 
struct  chained_tile_op_arg_t< idx, curr_args_t, remain_args_t... >
 
struct  chained_tile_op_t
 
struct  check_load
 
struct  check_load< gpu_arch::Xe, dtype, mem_dtype >
 
struct  check_store
 
struct  check_store< gpu_arch::Xe, dtype, mem_dtype >
 
class  cooperative_load_helper_t
 Helper to do the cooperative workgroups load. More...
 
class  cooperative_load_helper_t< matAcc_t_, mem_layout::col_major, num_cooperative_wg, arch_tag_, std::enable_if_t< gpu_arch::Xe==arch_tag_ > >
 Workgroups to do the cooperative load. Specialized for col_major layout and Xe architecture. More...
 
class  cooperative_load_helper_t< matAcc_t_, mem_layout::row_major, num_cooperative_wg, arch_tag_, std::enable_if_t< gpu_arch::Xe==arch_tag_ > >
 Workgroups to do the cooperative load. Specialized for row_major layout and Xe architecture. More...
 
struct  dequant_op_t
 Is the dequantization op functor. More...
 
struct  dequant_op_t< tile_op_t_, arch_tag, std::enable_if_t<(arch_tag==gpu_arch::Xe)> >
 Is the dequantization op functor, specialized for Xe architecture. More...
 
struct  dropout_op_t
 Is the dropout op functor. More...
 
struct  dropout_op_t< dtype_mask_, arch_tag, std::enable_if_t<(arch_tag==gpu_arch::Xe)> >
 Is the dropout op functor, specialized for Xe architecture. More...
 
struct  elemwise_reduce_op_stream_k_t
 Is the element-wise reduce op functor, specialized for stream_k dispatch: load the partial sum from scratchspace, reduce in GRF, then store zero back to scratchspace. These steps are done with smaller tiles to minimize GRF pressure. More...
 
struct  elemwise_reduce_op_stream_k_t< reduce_kind_, dtype_in_, gpu_arch::Xe >
 Is the element-wise reduce op functor, specialized for Xe architecture. More...
 
struct  elemwise_reduce_op_t
 Is the element-wise reduce op functor. More...
 
struct  elemwise_reduce_op_t< reduce_kind_, dtype_in_, arch_tag, std::enable_if_t<(arch_tag==gpu_arch::Xe)> >
 Is the element-wise reduce op functor, specialized for Xe architecture. More...
 
struct  gelu_bwd_op_t
 Is the element-wise gelu backward op functor. More...
 
struct  gelu_bwd_op_t< dtype_in_, arch_tag, std::enable_if_t<(arch_tag==gpu_arch::Xe)> >
 Is the element-wise gelu backward op functor, specialized for Xe architecture. More...
 
struct  gelu_fwd_op_t
 Is the element-wise gelu inference forward op functor. More...
 
struct  gelu_fwd_w_op_t
 Is the element-wise gelu training forward op functor. More...
 
struct  gelu_fwd_w_op_t< dtype_out_, arch_tag, std::enable_if_t<(arch_tag==gpu_arch::Xe)> >
 Is the element-wise gelu training forward op functor, specialized for Xe architecture. More...
 
struct  get_load_block_size_auto
 
struct  get_load_block_size_auto< dtype, tile_size_x, tile_size_y, gpu_arch::Xe, mem_layout::row_major, reg_layout::tiled >
 
struct  get_store_block_size_auto
 
struct  get_store_block_size_auto< dtype, tile_size_x, tile_size_y, gpu_arch::Xe, mem_layout::row_major, reg_layout::tiled >
 
struct  global_atomic_oob_check_off_tag
 
struct  global_atomic_oob_check_on_tag
 
struct  is_floating_to_integer
 
struct  is_same_layout
 
struct  linear_op_t
 Is the linear_op functor. More...
 
struct  linear_op_t< dtype_in_, arch_tag, std::enable_if_t<(arch_tag==gpu_arch::Xe)> >
 Is the linear_op functor, specialized for Xe architecture. More...
 
struct  mem_payload_t
 Is to illustrate the memory information. More...
 
struct  mem_payload_t< mem_desc_t< dtype_, mem_layout::row_major, mem_space::global, alignment_ >, tile_desc_, msg_type::atomic_add, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >
 Is to describe the global memory surface for atomic store. For atomic store, necessary information is prepared for each SIMD channel. More...
 
struct  mem_payload_t< mem_desc_t< dtype_, mem_layout::row_major, mem_space::global, alignment_ >, tile_desc_, msg_type::block_1d, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >
 Is to describe the global memory surface for block-1d load/store. For a block-1d payload message, the base address and offset of the surface need to be set. More...
 
struct  mem_payload_t< mem_desc_t< dtype_, mem_layout::row_major, mem_space::local, alignment_ >, tile_desc_, msg_type::block_1d, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >
 Is to describe the shared local memory surface for block-1d load/store. More...
 
struct  mem_payload_t< mem_desc_t< dtype_, mem_layout::row_major, mem_space::local, alignment_ >, tile_desc_, msg_type::scatter, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >
 Is to describe the shared local memory surface for scatter load/store. More...
 
struct  mem_payload_t< mem_desc_t< dtype_, mem_layout::row_major, mem_space::local, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout::vnni_tiled_col_major >, msg_type::scatter, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >
 Is to describe the shared local memory surface for scattering store. More...
 
struct  mem_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_, msg_type::block_2d, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >
 Is to describe the global memory surface for block-2d load/store. For each block in one tile, a payload message is prepared here. More...
 
struct  mem_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_, msg_type::unaligned_2d, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >
 Is to describe the global memory surface for unaligned-2d load/store. For each block in one tile, a payload message is prepared here. More...
 
struct  msg_type_query
 
struct  none_op_t
 Is none op functor, for placeholder purpose. More...
 
struct  polynomial_op_t
 
struct  prefetch_payload_t
 Is to illustrate the memory information to prefetch data to cache. More...
 
struct  prefetch_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_t< tile_size_x_, 1, block_size_x_, 1, reg_layout_ >, num_coop_sg_, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >
 Is to describe the global memory surface to prefetch data to cache. Data in global memory will be prefetched into a 1d tile. More...
 
struct  prefetch_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout_ >, num_coop_sg_, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >
 Is to describe the global memory surface to prefetch data to cache. Data in global memory will be prefetched into a 2d tile. More...
 
struct  prefetch_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::local, alignment_ >, tile_desc_, num_coop_sg_, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >
 Is to describe the memory information for prefetching data to cache. Since the data is located in shared local memory, nothing will be done. More...
 
struct  quant_op_t
 Is the quantization op functor. More...
 
struct  quant_op_t< tile_op_t_, arch_tag, std::enable_if_t<(arch_tag==gpu_arch::Xe)> >
 Is the quantization op functor, specialized for Xe architecture. More...
 
struct  relu_op_t
 Is the element-wise relu op functor. More...
 
struct  rng_dropout_op_t
 Is the random number generator and dropout op functor. More...
 
struct  rng_dropout_op_t< dtype_mask_, arch_tag, std::enable_if_t<(arch_tag==gpu_arch::Xe)> >
 Is the random number generator and dropout op functor, specialized for Xe architecture. More...
 
struct  scalar_mul_op_t
 Is the scalar_multiply op functor. More...
 
struct  scalar_mul_op_t< dtype_in_, arch_tag, std::enable_if_t<(arch_tag==gpu_arch::Xe)> >
 Is the scalar_multiply op functor, specialized for Xe architecture. More...
 
struct  scale_v_offset_v_op_t
 Is MatAcc * vector scale + vector offset. More...
 
struct  scale_v_offset_v_op_t< scale_dtype_, offset_dtype_, arch_tag, std::enable_if_t<(arch_tag==gpu_arch::Xe)> >
 Is the scale_v_offset_v op functor, specialized for Xe architecture. More...
 
struct  scale_v_op_t
 Is MatAcc * vector scale. More...
 
struct  scale_v_op_t< scale_dtype_, arch_tag, std::enable_if_t<(arch_tag==gpu_arch::Xe)> >
 Is the scale_v op functor, specialized for Xe architecture. More...
 
struct  sigmoid_op_t
 Is the element-wise sigmoid op functor. More...
 
struct  tanh_op_t
 Is the element-wise tanh op functor. More...
 
struct  tile_desc_t
 Is to illustrate the tile information about a sub matrix. More...
 
struct  tile_div
 
struct  tile_minus
 
struct  tile_mma_t
 Is the xetla tile mma operation definition API. More...
 
struct  tile_mma_t< matAcc_dst_t_, matAcc_src_t_, matB_t_, matA_t_, mma_engine::fpu, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >
 Is the tile mma operation functor, specialized for Xe and fpu engine. More...
 
struct  tile_mma_t< matAcc_dst_t_, matAcc_src_t_, matB_t_, matA_t_, mma_engine::xmx, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >
 Is the tile mma operation functor, specialized for Xe and matrix engine. More...
 
struct  tile_op_arg_helper_t
 
struct  tile_t
 Is a struct contains some register file. More...
 

Functions

template<cache_hint L1 = cache_hint::cached, cache_hint L2 = cache_hint::cached, typename tile_t , typename payload_t >
__XETLA_API std::enable_if_t< detail::check_load_type< tile_t, payload_t >::is_global_2d_xe > tile_load (tile_t &tile, payload_t &payload)
 This function loads data from a 2D memory surface.
 
template<cache_hint L1 = cache_hint::cached, cache_hint L2 = cache_hint::cached, typename tile_t , typename payload_t >
__XETLA_API std::enable_if_t< detail::check_load_type< tile_t, payload_t >::is_global_block_1d_xe > tile_load (tile_t &tile, payload_t &payload)
 This function loads data from memory.
 
template<cache_hint L1 = cache_hint::cached, cache_hint L3 = cache_hint::cached, typename tile_t , typename payload_t , typename oob_check_tag = global_atomic_oob_check_on_tag>
__XETLA_API std::enable_if_t< detail::check_load_type< tile_t, payload_t >::is_global_unaligned_2d_xe > tile_load (tile_t &tile, payload_t &payload, oob_check_tag tag={})
 This function loads data from an unaligned-2D memory surface.
 
template<cache_hint L1 = cache_hint::cached, cache_hint L2 = cache_hint::cached, typename tile_t , typename payload_t >
__XETLA_API std::enable_if_t< detail::check_load_type< tile_t, payload_t >::is_local_scatter_xe > tile_load (tile_t &tile, payload_t &payload)
 Is the data load func from shared local memory to the register file; supports both the 1d and 2d memory surface scenarios.
 
template<cache_hint L1 = cache_hint::cached, cache_hint L2 = cache_hint::cached, typename tile_t , typename payload_t >
__XETLA_API std::enable_if_t< detail::check_load_type< tile_t, payload_t >::is_local_block_1d_xe > tile_load (tile_t &tile, payload_t &payload)
 Is the data load func from shared local memory to the register file; supports the 1d memory surface scenario.
 
template<typename T_dst , typename T_src >
__XETLA_API std::enable_if_t<(T_src::register_layout !=reg_layout::linear) &&(T_dst::register_layout !=reg_layout::linear) &&is_same_layout< T_dst, T_src >::value &&(!is_floating_to_integer< T_dst, T_src >::value)> elemwise_cvt (T_dst &dst, T_src &src)
 Is the element-wise data conversion; the src and dst tiles should have the same layout.
 
template<typename T_dst , typename T_src >
__XETLA_API std::enable_if_t<(T_src::register_layout !=reg_layout::linear) &&(T_dst::register_layout !=reg_layout::linear) &&is_same_layout< T_dst, T_src >::value &&is_floating_to_integer< T_dst, T_src >::value > elemwise_cvt (T_dst &dst, T_src &src)
 Is the element-wise data conversion from floating point to integral; the src and dst tiles should have the same layout.
 
template<typename T_dst , typename T_src >
__XETLA_API std::enable_if_t<(T_src::register_layout !=reg_layout::linear) &&(T_dst::register_layout !=reg_layout::linear) &&is_same_layout< T_dst, T_src >::value > elemwise_cvt (T_dst &dst, T_src &src, float scale)
 Element-wise data conversion with scaling; the src and dst tiles should have the same layout.
 
template<typename T >
__XETLA_API std::enable_if_t< T::register_layout==reg_layout::vnni_tiled > vnni_convert (T &mat_Acc)
 Converts tiled layout to vnni_tiled layout format.
 
template<typename T >
__XETLA_API std::enable_if_t< T::register_layout==reg_layout::tiled > vnni_reverse (T &mat_Acc)
 Converts vnni_tiled layout format to tiled layout.
 
template<typename T >
__XETLA_API std::enable_if_t< T::register_layout==reg_layout::transpose_tiled > vnni_reverse (T &mat_Acc)
 Converts vnni_tiled layout format to transpose_tiled layout.
 
template<typename T_dst , typename T_src >
__XETLA_API std::enable_if_t< is_same_layout< T_dst, T_src >::value > vnni_transform (T_dst &dst, T_src &src)
 Changes vnni layout.
 
template<typename T_dst , typename T_src >
__XETLA_API std::enable_if_t<(T_dst::register_layout==reg_layout::tiled) &&(T_src::register_layout==reg_layout::tiled) &&(T_src::tile_size_x==T_dst::tile_size_x) &&(T_src::tile_size_y==1)> row_broadcast (T_dst &dst, T_src &src)
 Broadcasts the 1d src tile to the entire 2d tile, as well as doing the data conversion.
 
template<typename T_dst , typename T_src >
__XETLA_API std::enable_if_t<(T_dst::register_layout==reg_layout::linear) &&(T_src::register_layout==reg_layout::tiled) &&(T_src::tile_size_x==T_dst::tile_size_x) &&(T_src::tile_size_y==T_dst::tile_size_y) &&(T_dst::tile_size_x==T_dst::block_size_x) &&(T_dst::tile_size_y==T_dst::block_size_y) &&(std::is_same< typename T_dst::dtype, typename T_src::dtype >::value)> layout_convert (T_dst &dst, T_src &src)
 Convert a 2d tile in a tiled register layout to a 2d tile in a linear register layout.
 
template<typename T_dst , typename T_src >
__XETLA_API std::enable_if_t<(T_dst::register_layout==reg_layout::tiled) &&(T_src::register_layout==reg_layout::linear) &&(T_dst::tile_size_x==T_src::tile_size_x) &&(T_dst::tile_size_y==T_src::tile_size_y) &&(T_src::tile_size_x==T_src::block_size_x) &&(T_src::tile_size_y==T_src::block_size_y) &&(std::is_same< typename T_dst::dtype, typename T_src::dtype >::value)> layout_convert (T_dst &dst, T_src &src)
 Convert a 2d tile in a linear register layout to a 2d tile in a tiled register layout.
 
template<cache_hint L1 = cache_hint::cached, cache_hint L2 = cache_hint::cached, typename payload_t >
__XETLA_API std::enable_if_t< detail::check_prefetch_type< payload_t >::is_global_2d_xe > tile_prefetch (payload_t &payload)
 Is the prefetch data func: data located in global memory is prefetched to cache, which has higher bandwidth.
 
template<cache_hint L1 = cache_hint::cached, cache_hint L2 = cache_hint::cached, typename payload_t >
__XETLA_API std::enable_if_t< detail::check_prefetch_type< payload_t >::is_global_block_1d_xe > tile_prefetch (payload_t &payload)
 Is the prefetch data func: data located in global memory is prefetched to cache, which has higher bandwidth.
 
template<cache_hint L1 = cache_hint::cached, cache_hint L2 = cache_hint::cached, typename payload_t >
__XETLA_API std::enable_if_t< detail::check_prefetch_type< payload_t >::is_local_xe > tile_prefetch (payload_t &payload)
 Is prefetch data func.
 
template<reduce_op reduce_kind, typename dtype_out , typename dtype_acc , int dim, typename mat_t >
__XETLA_API std::enable_if_t<(dim==1), xetla_vector< dtype_out, mat_t::tile_size_y > > tile_reduce (mat_t &src)
 
template<reduce_op reduce_kind, typename dtype_out , typename dtype_acc , int dim, typename mat_t >
__XETLA_API std::enable_if_t<(dim==0), xetla_vector< dtype_out, mat_t::tile_size_x > > tile_reduce (mat_t &src)
 
template<typename T_dst , typename T_src , bool accumulate = true, typename dtype_acc = float, uint32_t num_acc = 4>
 XETLA_MARKER ("This is only for reduce add, and will be deprecated in future. " "Please use tile_reduce instead.") __XETLA_API typename std
 Reduce 2d src tile to the 1d tile, and output to 1d dst.
 
template<cache_hint L1 = cache_hint::write_back, cache_hint L2 = cache_hint::write_back, typename tile_t , typename payload_t >
__XETLA_API std::enable_if_t< detail::check_store_type< tile_t, payload_t >::is_global_2d_xe > tile_store (tile_t &tile, payload_t &payload)
 Is the func storing data from register file to global memory.
 
template<cache_hint L1 = cache_hint::write_back, cache_hint L2 = cache_hint::write_back, typename tile_t , typename payload_t >
__XETLA_API std::enable_if_t< detail::check_store_type< tile_t, payload_t >::is_global_block_1d_xe > tile_store (tile_t &tile, payload_t &payload)
 Is the func storing data from register file to global memory.
 
template<cache_hint L1 = cache_hint::write_back, cache_hint L3 = cache_hint::write_back, typename tile_t , typename payload_t , typename oob_check_tag = global_atomic_oob_check_on_tag>
__XETLA_API std::enable_if_t< detail::check_store_type< tile_t, payload_t >::is_global_unaligned_2d_xe > tile_store (tile_t &tile, payload_t &payload, oob_check_tag tag={})
 Is the func storing data from register file to unaligned global memory surface.
 
template<cache_hint L1 = cache_hint::uncached, cache_hint L2 = cache_hint::write_back, typename tile_t , typename payload_t , typename oob_check_tag = global_atomic_oob_check_on_tag>
__XETLA_API std::enable_if_t< detail::check_store_type< tile_t, payload_t >::is_global_atomic_xe > tile_store (tile_t &tile, payload_t &payload, oob_check_tag tag={})
 Is the func storing data from the register file to global memory with atomic add into the same buffer; supports only the float32, float64, uint32_t, uint64_t and int types.
 
template<cache_hint L1 = cache_hint::write_back, cache_hint L2 = cache_hint::write_back, typename tile_t , typename payload_t >
__XETLA_API std::enable_if_t< detail::check_store_type< tile_t, payload_t >::is_local_scatter_xe > tile_store (tile_t &tile, payload_t &payload)
 Is the func storing data from the register file to shared local memory; supports the 2d memory surface scenario.
 
template<cache_hint L1 = cache_hint::write_back, cache_hint L2 = cache_hint::write_back, typename tile_t , typename payload_t >
__XETLA_API std::enable_if_t< detail::check_store_type< tile_t, payload_t >::is_local_scatter_vnni_col_xe > tile_store (tile_t &tile, payload_t &payload)
 Is the data store func from register file to local shared memory, where the data in register is vnni packed and col major.
 
template<cache_hint L1 = cache_hint::write_back, cache_hint L2 = cache_hint::write_back, typename tile_t , typename payload_t >
__XETLA_API std::enable_if_t< detail::check_store_type< tile_t, payload_t >::is_local_block_1d_xe &&tile_t::block_size_y !=1 > tile_store (tile_t &tile, payload_t &payload)
 Is the data store func from the register file to shared local memory; supports the 1d and 2d memory surface scenarios, and the dst memory layout is always assumed to be row major.
 
template<cache_hint L1 = cache_hint::write_back, cache_hint L2 = cache_hint::write_back, typename tile_t , typename payload_t >
__XETLA_API std::enable_if_t< detail::check_store_type< tile_t, payload_t >::is_local_block_1d_xe &&tile_t::tile_size_y==1 &&tile_t::block_size_y==1 > tile_store (tile_t &tile, payload_t &payload)
 Is the func storing data from register file to shared local memory, the data in registers will be stored to SLM in 1d mode, and we always assume dst memory layout is row major.
 
template<typename op , typename matAcc_t >
void tile_broadcast_op (matAcc_t &matAcc, xetla_vector< typename matAcc_t::dtype, matAcc_t::tile_size_y > data)
 
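The vnni_convert / vnni_reverse helpers listed above repack a tile so that, for sub-dword element types, elements from consecutive rows of the same column share one 32-bit dword (the layout consumed by the matrix engine). The following is a minimal host-side sketch of that permutation for 16-bit data (two rows per dword); it only illustrates the element ordering, not the XeTLA register implementation:

```cpp
#include <cstdint>
#include <vector>

// Repack a rows x cols tile of 16-bit values from row-major ("tiled")
// order into VNNI order: pairs of rows are interleaved column by column,
// so each 32-bit dword holds two elements of the same column.
// Assumes rows is even; illustrative sketch only, not XeTLA code.
std::vector<uint16_t> vnni_pack_2(const std::vector<uint16_t>& tiled,
                                  int rows, int cols) {
    std::vector<uint16_t> out(tiled.size());
    for (int y = 0; y < rows; y += 2)
        for (int x = 0; x < cols; ++x) {
            out[y * cols + 2 * x]     = tiled[y * cols + x];       // row y
            out[y * cols + 2 * x + 1] = tiled[(y + 1) * cols + x]; // row y+1
        }
    return out;
}

// vnni_reverse corresponds to the inverse permutation.
std::vector<uint16_t> vnni_unpack_2(const std::vector<uint16_t>& vnni,
                                    int rows, int cols) {
    std::vector<uint16_t> out(vnni.size());
    for (int y = 0; y < rows; y += 2)
        for (int x = 0; x < cols; ++x) {
            out[y * cols + x]       = vnni[y * cols + 2 * x];
            out[(y + 1) * cols + x] = vnni[y * cols + 2 * x + 1];
        }
    return out;
}
```

For 8-bit types the same idea applies with four rows per dword instead of two.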

Variables

template<typename tile_desc_ , mem_space memory_space>
constexpr msg_type msg_type_v = msg_type_query<tile_desc_, memory_space>::value
 

Function Documentation

◆ elemwise_cvt() [1/3]

template<typename T_dst , typename T_src >
__XETLA_API std::enable_if_t<(T_src::register_layout !=reg_layout::linear) &&(T_dst::register_layout !=reg_layout::linear) &&is_same_layout< T_dst, T_src >::value &&(!is_floating_to_integer< T_dst, T_src >::value)> gpu::xetla::subgroup::elemwise_cvt ( T_dst &  dst,
T_src &  src 
)

Is the element-wise data conversion; the src and dst tiles should have the same layout.

Template Parameters
T_dstIs the destination tile data type.
T_srcIs the source tile data type.
Parameters
dstIs the reference of the destination tile object.
srcIs the reference of the source tile object.
Returns
No return, in-place update in the destination tile.

◆ elemwise_cvt() [2/3]

template<typename T_dst , typename T_src >
__XETLA_API std::enable_if_t<(T_src::register_layout !=reg_layout::linear) &&(T_dst::register_layout !=reg_layout::linear) &&is_same_layout< T_dst, T_src >::value &&is_floating_to_integer< T_dst, T_src >::value > gpu::xetla::subgroup::elemwise_cvt ( T_dst &  dst,
T_src &  src 
)

Is the element-wise data conversion from floating point to integral; the src and dst tiles should have the same layout.

Template Parameters
T_dstIs the destination tile data type.
T_srcIs the source tile data type.
Parameters
dstIs the reference of the destination tile object.
srcIs the reference of the source tile object.
Returns
No return, in-place update in the destination tile.

◆ elemwise_cvt() [3/3]

template<typename T_dst , typename T_src >
__XETLA_API std::enable_if_t<(T_src::register_layout !=reg_layout::linear) &&(T_dst::register_layout !=reg_layout::linear) &&is_same_layout< T_dst, T_src >::value > gpu::xetla::subgroup::elemwise_cvt ( T_dst &  dst,
T_src &  src,
float  scale 
)

Element-wise data conversion with scaling; the src and dst tiles should have the same layout.

Template Parameters
T_dstis the destination tile data type.
T_srcis the source tile data type.
Parameters
dstis the reference of the destination tile object.
srcis the reference of the source tile object.
scaleis the scaling value to be applied before the assignment.
Returns
no return, in-place update in the destination tile.

◆ layout_convert() [1/2]

template<typename T_dst , typename T_src >
__XETLA_API std::enable_if_t<(T_dst::register_layout==reg_layout::linear) &&(T_src::register_layout==reg_layout::tiled) &&(T_src::tile_size_x==T_dst::tile_size_x) &&(T_src::tile_size_y==T_dst::tile_size_y) &&(T_dst::tile_size_x==T_dst::block_size_x) &&(T_dst::tile_size_y==T_dst::block_size_y) &&(std::is_same< typename T_dst::dtype, typename T_src::dtype >::value)> gpu::xetla::subgroup::layout_convert ( T_dst &  dst,
T_src &  src 
)

Convert a 2d tile in a tiled register layout to a 2d tile in a linear register layout.

Template Parameters
T_dstIs the destination tile data type.
T_srcIs the source tile data type.
Parameters
dstIs the reference of the destination tile object.
srcIs the reference of the source tile object.
Returns
No return, in-place update in the destination tile.

◆ layout_convert() [2/2]

template<typename T_dst , typename T_src >
__XETLA_API std::enable_if_t<(T_dst::register_layout==reg_layout::tiled) &&(T_src::register_layout==reg_layout::linear) &&(T_dst::tile_size_x==T_src::tile_size_x) &&(T_dst::tile_size_y==T_src::tile_size_y) &&(T_src::tile_size_x==T_src::block_size_x) &&(T_src::tile_size_y==T_src::block_size_y) &&(std::is_same< typename T_dst::dtype, typename T_src::dtype >::value)> gpu::xetla::subgroup::layout_convert ( T_dst &  dst,
T_src &  src 
)

Convert a 2d tile in a linear register layout to a 2d tile in a tiled register layout.

Template Parameters
T_dstIs the destination tile data type.
T_srcIs the source tile data type.
Parameters
dstIs the reference of the destination tile object.
srcIs the reference of the source tile object.
Returns
No return, in-place update in the destination tile.

◆ row_broadcast()

template<typename T_dst , typename T_src >
__XETLA_API std::enable_if_t<(T_dst::register_layout==reg_layout::tiled) &&(T_src::register_layout==reg_layout::tiled) &&(T_src::tile_size_x==T_dst::tile_size_x) &&(T_src::tile_size_y==1)> gpu::xetla::subgroup::row_broadcast ( T_dst &  dst,
T_src &  src 
)

Broadcasts the 1d src tile to the entire 2d tile, as well as doing the data conversion.

Template Parameters
T_dstIs the destination tile data type.
T_srcIs the source tile data type, interpreted as 1D data.
Parameters
dstIs the reference of the destination tile object.
srcIs the reference of the source tile object.
Returns
No return, in-place update in the destination tile.

◆ tile_broadcast_op()

template<typename op , typename matAcc_t >
void gpu::xetla::subgroup::tile_broadcast_op ( matAcc_t &  matAcc,
xetla_vector< typename matAcc_t::dtype, matAcc_t::tile_size_y >  data 
)
inline

◆ tile_load() [1/5]

template<cache_hint L1 = cache_hint::cached, cache_hint L2 = cache_hint::cached, typename tile_t , typename payload_t >
__XETLA_API std::enable_if_t< detail::check_load_type< tile_t, payload_t >::is_global_2d_xe > gpu::xetla::subgroup::tile_load ( tile_t &  tile,
payload_t &  payload 
)

This function loads data from a 2D memory surface.

Loads an array of rectangular regions (X,Y)..(X+W,Y+H) from memory into registers. Each block will be loaded serially by its corresponding payload.

Template Parameters
tile_tIs the tile_t struct that contains registers. These registers will be the destination of the load operation.
payload_tIs the mem_payload_t struct describing the memory information. Payload indicates the source of the load operation.
L1Is the cache hint for L1 cache.
L2Is the cache hint for L2 cache.
Parameters
tileIs the tile object with type tile_t, holds the return data of the loads.
payloadIs the payload object with type payload_t. Contains all the information for loads.
Returns
No return, update in place.
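The result of a 2-D block load can be mirrored with a simple host loop: copy the rectangle (X,Y)..(X+W,Y+H) out of a row-major surface into a dense W x H tile. XeTLA issues hardware 2-D block messages per block and per payload; this sketch only shows what ends up in the registers:

```cpp
#include <vector>

// Semantics sketch of a 2-D block load: copy the rectangle starting at
// (x, y), of size w x h, from a row-major surface of row pitch `pitch`
// into a dense tile. Not XeTLA code; host-side illustration only.
std::vector<float> load_2d_block(const std::vector<float>& surface,
                                 int pitch, int x, int y, int w, int h) {
    std::vector<float> tile(w * h);
    for (int r = 0; r < h; ++r)
        for (int c = 0; c < w; ++c)
            tile[r * w + c] = surface[(y + r) * pitch + (x + c)];
    return tile;
}
```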

◆ tile_load() [2/5]

template<cache_hint L1 = cache_hint::cached, cache_hint L2 = cache_hint::cached, typename tile_t , typename payload_t >
__XETLA_API std::enable_if_t< detail::check_load_type< tile_t, payload_t >::is_global_block_1d_xe > gpu::xetla::subgroup::tile_load ( tile_t &  tile,
payload_t &  payload 
)

This function loads data from memory.

For each enabled SIMT lane, a vector is read from memory into registers.

Template Parameters
tile_tIs the tile_t struct that contains registers. These registers will be the destination of the load operation.
payload_tIs the mem_payload_t struct describing the memory information. Payload indicates the source of load operation.
L1Is the cache hint for L1 cache.
L2Is the cache hint for L2 cache.
Parameters
tileIs the tile object with type tile_t, holds the return data of the loads.
payloadIs the payload object with type payload_t. Contains all the information for loads.
Returns
No return, update in place.

◆ tile_load() [3/5]

template<cache_hint L1 = cache_hint::cached, cache_hint L2 = cache_hint::cached, typename tile_t , typename payload_t >
__XETLA_API std::enable_if_t< detail::check_load_type< tile_t, payload_t >::is_local_scatter_xe > gpu::xetla::subgroup::tile_load ( tile_t &  tile,
payload_t &  payload 
)

Is the data load func from shared local memory to the register file; supports both the 1d and 2d memory surface scenarios.

And we always assume data in SLM is row major.

Template Parameters
tile_tIs the tile_t struct that contains registers. These registers will be the destination of the load operation.
payload_tIs the mem_payload_t struct describing the memory information. Payload indicates the source of load operation.
L1Is the cache hint for L1 cache.
L2Is the cache hint for L2 cache.
Parameters
tileIs the tile object with type tile_t, holds the return data of the loads.
payloadIs the payload object with type payload_t. Contains all the information for loads.
Returns
No return, update in place.

◆ tile_load() [4/5]

template<cache_hint L1 = cache_hint::cached, cache_hint L2 = cache_hint::cached, typename tile_t , typename payload_t >
__XETLA_API std::enable_if_t< detail::check_load_type< tile_t, payload_t >::is_local_block_1d_xe > gpu::xetla::subgroup::tile_load ( tile_t &  tile,
payload_t &  payload 
)

Is the data load func from shared local memory to the register file; supports the 1d memory surface scenario.

And the src memory layout is always row major.

Template Parameters
tile_tIs the tile_t struct that contains registers. These registers will be the destination of the load operation.
payload_tIs the mem_payload_t struct describing the memory information. Payload indicates the source of load operation.
L1Is the cache hint for L1 cache.
L2Is the cache hint for L2 cache.
Parameters
tileIs the tile object with type tile_t, holds the return data of the loads.
payloadIs the payload object with type payload_t. Contains all the information for loads.
Returns
No return, update in place.

◆ tile_load() [5/5]

template<cache_hint L1 = cache_hint::cached, cache_hint L3 = cache_hint::cached, typename tile_t , typename payload_t , typename oob_check_tag = global_atomic_oob_check_on_tag>
__XETLA_API std::enable_if_t< detail::check_load_type< tile_t, payload_t >::is_global_unaligned_2d_xe > gpu::xetla::subgroup::tile_load ( tile_t &  tile,
payload_t &  payload,
oob_check_tag  tag = {} 
)

This function loads data from an unaligned-2D memory surface.

Loads an array of rectangular regions (X,Y)..(X+W,Y+H) from memory into registers. Each block will be loaded serially by its corresponding payload.

Template Parameters
tile_tIs the tile_t struct that contains registers. These registers will be the destination of the load operation.
payload_tIs the mem_payload_t struct describing the memory information. Payload indicates the source of load operation.
L1Is the cache hint for L1 cache.
L3Is the cache hint for L3 cache.
Parameters
tileIs the tile object with type tile_t, holds the return data of the loads.
payloadIs the payload object with type payload_t. Contains all the information for loads.
Returns
No return, update in place.
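The oob_check_tag on this overload guards against accesses outside the surface. As an illustration of that idea, the sketch below zero-fills out-of-bounds elements; zero-fill is an assumed policy for this demo, the essential point being that out-of-bounds addresses are never dereferenced:

```cpp
#include <vector>

// Sketch of an out-of-bounds-checked 2-D load: elements of the requested
// rectangle that fall outside the width x height surface are left as
// zero instead of being read. Host-side illustration only.
std::vector<float> load_2d_oob(const std::vector<float>& surface,
                               int width, int height,
                               int x, int y, int w, int h) {
    std::vector<float> tile(w * h, 0.0f);
    for (int r = 0; r < h; ++r)
        for (int c = 0; c < w; ++c) {
            const int sx = x + c, sy = y + r;
            if (sx >= 0 && sx < width && sy >= 0 && sy < height)
                tile[r * w + c] = surface[sy * width + sx];
        }
    return tile;
}
```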

◆ tile_prefetch() [1/3]

template<cache_hint L1 = cache_hint::cached, cache_hint L2 = cache_hint::cached, typename payload_t >
__XETLA_API std::enable_if_t< detail::check_prefetch_type< payload_t >::is_global_2d_xe > gpu::xetla::subgroup::tile_prefetch ( payload_t &  payload)

Is the prefetch data func: data located in global memory is prefetched to cache, which has higher bandwidth.

e.g. In gemm, prefetch next iteration data for mma consumption. This func is specialized for the block 2d scenario.

Template Parameters
payload_tIs the mem_payload_t struct illustrating the memory info. Payload indicates the source of the prefetch operation.
L1Is cache hint for L1 cache.
L2Is cache hint for L2 cache.
Parameters
payloadIs the payload object with type payload_t. Contains all the information for prefetches.

◆ tile_prefetch() [2/3]

template<cache_hint L1 = cache_hint::cached, cache_hint L2 = cache_hint::cached, typename payload_t >
__XETLA_API std::enable_if_t< detail::check_prefetch_type< payload_t >::is_global_block_1d_xe > gpu::xetla::subgroup::tile_prefetch ( payload_t &  payload)

Is the prefetch data func: data located in global memory is prefetched to cache, which has higher bandwidth.

e.g. In gemm, prefetch next iteration data for mma consumption. This func is specialized for the block 1d scenario.

Template Parameters
payload_tIs the mem_payload_t struct illustrating the memory info. Payload indicates the source of the prefetch operation.
L1Is cache hint for L1 cache.
L2Is cache hint for L2 cache.
Parameters
payloadIs the payload object with type payload_t. Contains all the information for prefetches.

◆ tile_prefetch() [3/3]

template<cache_hint L1 = cache_hint::cached, cache_hint L2 = cache_hint::cached, typename payload_t >
__XETLA_API std::enable_if_t< detail::check_prefetch_type< payload_t >::is_local_xe > gpu::xetla::subgroup::tile_prefetch ( payload_t &  payload)

Is prefetch data func.

Shared local memory prefetch is not supported yet; this overload exists only to keep consistency with the global prefetch interface.

Template Parameters
payload_tIs the mem_payload_t struct illustrating memory info.
L1Is cache hint for L1 cache.
L2Is cache hint for L2 cache.
Parameters
payloadIs the payload object with type payload_t. Contains all the information for prefetches.

◆ tile_reduce() [1/2]

template<reduce_op reduce_kind, typename dtype_out , typename dtype_acc , int dim, typename mat_t >
__XETLA_API std::enable_if_t<(dim==1), xetla_vector< dtype_out, mat_t::tile_size_y > > gpu::xetla::subgroup::tile_reduce ( mat_t &  src)

The idea is: 1) allocate a temp buffer; 2) reduce the entire tile into the temp buffer; 3) reduce within the temp buffer.

◆ tile_reduce() [2/2]

template<reduce_op reduce_kind, typename dtype_out , typename dtype_acc , int dim, typename mat_t >
__XETLA_API std::enable_if_t<(dim==0), xetla_vector< dtype_out, mat_t::tile_size_x > > gpu::xetla::subgroup::tile_reduce ( mat_t &  src)

The idea is: 1) allocate a temp buffer; 2) reduce the entire tile into the temp buffer; 3) reduce within the temp buffer. This introduces additional instructions to initialize the temp buffer, but exposes more parallelism.
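The semantics of the two tile_reduce overloads can be sketched in plain C++. Assuming a row-major tile of tile_size_y rows by tile_size_x columns (names from the signatures above), dim == 1 reduces along x and yields one value per row (length tile_size_y), while dim == 0 reduces along y and yields one value per column (length tile_size_x). The helper names below are hypothetical, not XeTLA functions.

```cpp
#include <vector>

// dim == 1 analogue: one reduced value per row (vector of length `rows`).
std::vector<float> reduce_rows(const std::vector<float>& tile, int rows, int cols) {
    std::vector<float> out(rows, 0.0f);
    for (int y = 0; y < rows; ++y)
        for (int x = 0; x < cols; ++x)
            out[y] += tile[y * cols + x];
    return out;
}

// dim == 0 analogue: one reduced value per column (vector of length `cols`).
std::vector<float> reduce_cols(const std::vector<float>& tile, int rows, int cols) {
    std::vector<float> out(cols, 0.0f);
    for (int y = 0; y < rows; ++y)
        for (int x = 0; x < cols; ++x)
            out[x] += tile[y * cols + x];
    return out;
}
```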

◆ tile_store() [1/8]

template<cache_hint L1 = cache_hint::write_back, cache_hint L2 = cache_hint::write_back, typename tile_t , typename payload_t >
__XETLA_API std::enable_if_t< detail::check_store_type< tile_t, payload_t >::is_global_2d_xe > gpu::xetla::subgroup::tile_store ( tile_t tile,
payload_t &  payload 
)

Is the function storing data from the register file to global memory.

Stores a rectangular region (X,Y)..(X+W,Y+H) into memory from registers.

Template Parameters
tile_t Is the tile_t struct containing the registers; these registers will be the source of the store operation.
payload_t Is the mem_payload_t struct describing the memory info; the payload indicates the destination of the store operation.
L1 Is the cache hint for the L1 cache.
L2 Is the cache hint for the L2 cache.
Parameters
tile Is the tile object with type tile_t; contains the data to be stored.
payload Is the payload object with type payload_t. Contains all the information for stores.
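The rectangular-region semantics can be illustrated with a small runnable sketch: a W x H register tile is copied into the region (X,Y)..(X+W,Y+H) of a row-major surface. This mirrors the addressing described above, not the actual 2D block-store hardware path; `store_block_2d` is a hypothetical helper name.

```cpp
#include <vector>

// Copy a W x H row-major register tile into the rectangle whose top-left
// corner is (X, Y) of a row-major surface with row pitch `pitch`.
void store_block_2d(std::vector<int>& surface, int pitch,
                    const std::vector<int>& regs, int X, int Y, int W, int H) {
    for (int row = 0; row < H; ++row)
        for (int col = 0; col < W; ++col)
            surface[(Y + row) * pitch + (X + col)] = regs[row * W + col];
}
```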

◆ tile_store() [2/8]

template<cache_hint L1 = cache_hint::write_back, cache_hint L2 = cache_hint::write_back, typename tile_t , typename payload_t >
__XETLA_API std::enable_if_t< detail::check_store_type< tile_t, payload_t >::is_global_block_1d_xe > gpu::xetla::subgroup::tile_store ( tile_t tile,
payload_t &  payload 
)

Is the function storing data from the register file to global memory.

For each enabled SIMT lane, a vector is written into memory from registers.

Template Parameters
tile_t Is the tile_t struct containing the registers; these registers will be the source of the store operation.
payload_t Is the mem_payload_t struct describing the memory info; the payload indicates the destination of the store operation.
L1 Is the cache hint for the L1 cache.
L2 Is the cache hint for the L2 cache.
Parameters
tile Is the tile object with type tile_t; contains the data to be stored.
payload Is the payload object with type payload_t. Contains all the information for stores.

◆ tile_store() [3/8]

template<cache_hint L1 = cache_hint::write_back, cache_hint L2 = cache_hint::write_back, typename tile_t , typename payload_t >
__XETLA_API std::enable_if_t< detail::check_store_type< tile_t, payload_t >::is_local_scatter_xe > gpu::xetla::subgroup::tile_store ( tile_t tile,
payload_t &  payload 
)

Is the function storing data from the register file to shared local memory, supporting the 2d memory surface scenario.

The destination memory layout is always row major.

Template Parameters
tile_t Is the tile_t struct containing the registers; these registers will be the source of the store operation.
payload_t Is the mem_payload_t struct describing the memory info; the payload indicates the destination of the store operation.
L1 Is the cache hint for the L1 cache.
L2 Is the cache hint for the L2 cache.
Parameters
tile Is the tile object with type tile_t; contains the data to be stored.
payload Is the payload object with type payload_t. Contains all the information for stores.

◆ tile_store() [4/8]

template<cache_hint L1 = cache_hint::write_back, cache_hint L2 = cache_hint::write_back, typename tile_t , typename payload_t >
__XETLA_API std::enable_if_t< detail::check_store_type< tile_t, payload_t >::is_local_scatter_vnni_col_xe > gpu::xetla::subgroup::tile_store ( tile_t tile,
payload_t &  payload 
)

Is the data store function from the register file to shared local memory, where the data in registers is vnni packed and col major.

The destination memory layout is always assumed to be row major.

Template Parameters
tile_t Is the tile_t struct containing the registers; these registers will be the source of the store operation.
payload_t Is the mem_payload_t struct describing the memory info; the payload indicates the destination of the store operation.
L1 Is the cache hint for the L1 cache.
L2 Is the cache hint for the L2 cache.
Parameters
tile Is the tile object with type tile_t; contains the data to be stored.
payload Is the payload object with type payload_t. Contains all the information for stores.

◆ tile_store() [5/8]

template<cache_hint L1 = cache_hint::write_back, cache_hint L2 = cache_hint::write_back, typename tile_t , typename payload_t >
__XETLA_API std::enable_if_t< detail::check_store_type< tile_t, payload_t >::is_local_block_1d_xe &&tile_t::block_size_y !=1 > gpu::xetla::subgroup::tile_store ( tile_t tile,
payload_t &  payload 
)

Is the data store function from the register file to shared local memory, supporting the 1d or 2d memory surface scenario; the destination memory layout is always assumed to be row major.

Template Parameters
tile_t Is the tile_t struct containing the registers; these registers will be the source of the store operation.
payload_t Is the mem_payload_t struct describing the memory info; the payload indicates the destination of the store operation.
L1 Is the cache hint for the L1 cache.
L2 Is the cache hint for the L2 cache.
Parameters
tile Is the tile object with type tile_t; contains the data to be stored.
payload Is the payload object with type payload_t. Contains all the information for stores.

◆ tile_store() [6/8]

template<cache_hint L1 = cache_hint::write_back, cache_hint L2 = cache_hint::write_back, typename tile_t , typename payload_t >
__XETLA_API std::enable_if_t< detail::check_store_type< tile_t, payload_t >::is_local_block_1d_xe &&tile_t::tile_size_y==1 &&tile_t::block_size_y==1 > gpu::xetla::subgroup::tile_store ( tile_t tile,
payload_t &  payload 
)

Is the function storing data from the register file to shared local memory; the data in registers is stored to SLM in 1d mode, and the destination memory layout is always assumed to be row major.

Template Parameters
tile_t Is the tile_t struct containing the registers; these registers will be the source of the store operation.
payload_t Is the mem_payload_t struct describing the memory info; the payload indicates the destination of the store operation.
L1 Is the cache hint for the L1 cache.
L2 Is the cache hint for the L2 cache.
Parameters
tile Is the tile object with type tile_t; contains the data to be stored.
payload Is the payload object with type payload_t. Contains all the information for stores.

◆ tile_store() [7/8]

template<cache_hint L1 = cache_hint::write_back, cache_hint L3 = cache_hint::write_back, typename tile_t , typename payload_t , typename oob_check_tag = global_atomic_oob_check_on_tag>
__XETLA_API std::enable_if_t< detail::check_store_type< tile_t, payload_t >::is_global_unaligned_2d_xe > gpu::xetla::subgroup::tile_store ( tile_t tile,
payload_t &  payload,
oob_check_tag  tag = {} 
)

Is the function storing data from the register file to an unaligned global memory surface.

Stores a rectangular region (X,Y)..(X+W,Y+H) into memory from registers.

Template Parameters
tile_t Is the tile_t struct containing the registers; these registers will be the source of the store operation.
payload_t Is the mem_payload_t struct describing the memory info; the payload indicates the destination of the store operation.
L1 Is the cache hint for the L1 cache.
L3 Is the cache hint for the L3 cache.
Parameters
tile Is the tile object with type tile_t; contains the data to be stored.
payload Is the payload object with type payload_t. Contains all the information for stores.
Returns
No return, updates in place.

◆ tile_store() [8/8]

template<cache_hint L1 = cache_hint::uncached, cache_hint L2 = cache_hint::write_back, typename tile_t , typename payload_t , typename oob_check_tag = global_atomic_oob_check_on_tag>
__XETLA_API std::enable_if_t< detail::check_store_type< tile_t, payload_t >::is_global_atomic_xe > gpu::xetla::subgroup::tile_store ( tile_t tile,
payload_t &  payload,
oob_check_tag  tag = {} 
)

Is the function storing data from the register file to global memory with atomic add enabled, so that data can be accumulated into the same buffer; the supported types are float32, float64, uint32_t, uint64_t and int.

Template Parameters
tile_t Is the tile_t struct containing the registers; these registers will be the source of the store operation.
payload_t Is the mem_payload_t struct describing the memory info; the payload indicates the destination of the store operation.
L1 Is the cache hint for the L1 cache.
L2 Is the cache hint for the L2 cache.
Parameters
tile Is the tile object with type tile_t; contains the data to be stored.
payload Is the payload object with type payload_t. Contains all the information for stores.
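The atomic-add store lets several producers accumulate partial tiles into one shared buffer. A CPU-side analogue of that accumulation semantics, with `std::atomic::fetch_add` standing in for the hardware atomic used by the GPU path (the helper name `atomic_accumulate` is hypothetical):

```cpp
#include <array>
#include <atomic>
#include <cstddef>

// Each call adds a partial result into the shared buffer; fetch_add makes
// concurrent calls from multiple threads safe without extra locking.
template <std::size_t N>
void atomic_accumulate(std::array<std::atomic<int>, N>& buf,
                       const std::array<int, N>& partial) {
    for (std::size_t i = 0; i < N; ++i)
        buf[i].fetch_add(partial[i], std::memory_order_relaxed);
}
```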

◆ vnni_convert()

template<typename T >
__XETLA_API std::enable_if_t< T::register_layout==reg_layout::vnni_tiled > gpu::xetla::subgroup::vnni_convert ( T &  mat_Acc)

Converts the tiled layout to the vnni_tiled layout format.

Template Parameters
T Is the tile data type.
Parameters
mat_Acc Is the reference of the tile object.
Returns
No return, updates the data in-place.
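The tiled to vnni_tiled conversion can be illustrated with a simplified runnable sketch. VNNI packing groups rows in blocks (2 for 16-bit types, 4 for 8-bit), so elements that share a column within a group become adjacent in memory; this mirrors the idea, not XeTLA's in-register implementation, and `vnni_pack` is a hypothetical name.

```cpp
#include <cstddef>
#include <vector>

// Pack a row-major `rows` x `cols` tile into VNNI order: for each group of
// `vnni` consecutive rows, emit column by column, interleaving the rows of
// the group so same-column elements land next to each other.
std::vector<int> vnni_pack(const std::vector<int>& tile, int rows, int cols, int vnni) {
    std::vector<int> out(tile.size());
    std::size_t k = 0;
    for (int g = 0; g < rows; g += vnni)      // row group
        for (int c = 0; c < cols; ++c)        // column within the group
            for (int r = 0; r < vnni; ++r)    // row within the group
                out[k++] = tile[(g + r) * cols + c];
    return out;
}
```

For a 2x2 tile {1, 2, 3, 4} with vnni = 2, the packed order interleaves the two rows column by column: {1, 3, 2, 4}.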

◆ vnni_reverse() [1/2]

template<typename T >
__XETLA_API std::enable_if_t< T::register_layout==reg_layout::tiled > gpu::xetla::subgroup::vnni_reverse ( T &  mat_Acc)

Converts the vnni_tiled layout format to the tiled layout.

Template Parameters
T Is the tile data type.
Parameters
mat_Acc Is the reference of the tile object.
Returns
No return, updates the data in-place.

◆ vnni_reverse() [2/2]

template<typename T >
__XETLA_API std::enable_if_t< T::register_layout==reg_layout::transpose_tiled > gpu::xetla::subgroup::vnni_reverse ( T &  mat_Acc)

Converts the vnni_tiled layout format to the transpose_tiled layout.

Template Parameters
T Is the tile data type.
Parameters
mat_Acc Is the reference of the tile object.
Returns
No return, updates the data in-place.
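The reverse direction (vnni_tiled back to plain tiled) can likewise be sketched in plain C++. In VNNI order, groups of `vnni` rows (2 for 16-bit types, 4 for 8-bit) are interleaved column by column; unpacking scatters those interleaved elements back to their row-major positions. This is an illustration only, and `vnni_unpack` is a hypothetical name.

```cpp
#include <cstddef>
#include <vector>

// Undo VNNI packing for a `rows` x `cols` tile: walk the packed stream in
// (group, column, row-in-group) order and write each element back to its
// row-major location.
std::vector<int> vnni_unpack(const std::vector<int>& packed, int rows, int cols, int vnni) {
    std::vector<int> out(packed.size());
    std::size_t k = 0;
    for (int g = 0; g < rows; g += vnni)
        for (int c = 0; c < cols; ++c)
            for (int r = 0; r < vnni; ++r)
                out[(g + r) * cols + c] = packed[k++];
    return out;
}
```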

◆ vnni_transform()

template<typename T_dst , typename T_src >
__XETLA_API std::enable_if_t< is_same_layout< T_dst, T_src >::value > gpu::xetla::subgroup::vnni_transform ( T_dst &  dst,
T_src &  src 
)

Changes the vnni layout.

Template Parameters
T_dst Is the destination tile data type.
T_src Is the source tile data type.
Parameters
dst Is the reference of the destination tile object.
src Is the reference of the source tile object.
Returns
No return, in-place update in the destination tile.

◆ XETLA_MARKER()

template<typename T_dst , typename T_src , bool accumulate = true, typename dtype_acc = float, uint32_t num_acc = 4>
gpu::xetla::subgroup::XETLA_MARKER ( "This is only for reduce  add,
and will be deprecated in future. " "Please use tile_reduce instead."   
)

Reduces the 2d src tile to a 1d tile, and outputs to the 1d dst.

Template Parameters
T_dst Is the destination tile data type, interpreted as 1d data.
T_src Is the source tile data type, interpreted as 2d data.
accumulate Indicates whether to accumulate with the old value or not.
dtype_acc Is the accumulation data type: src ==> convert to dtype_acc ==> reduction + accumulation ==> convert to dtype_dst.
Parameters
dst Is the reference of the destination tile object.
src Is the reference of the source tile object.
Returns
No return, in-place update in the destination tile.
Note
This is only for reduce add, and will be deprecated in the future. Please use tile_reduce instead.

Here we rely on the compiler to generate mixed mode for bf16.

The idea is: 1) allocate a temp buffer; 2) accumulate the entire tile into the temp buffer; 3) reduce within the temp buffer. This introduces additional instructions to initialize the temp buffer, but exposes more parallelism.
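The deprecated 2d to 1d reduce-add described above can be sketched in plain C++. The sketch assumes a per-column reduction of a row-major src tile (one output element per column) and a float accumulator standing in for dtype_acc; the reduction direction and the name `reduce_add_2d_to_1d` are assumptions for illustration.

```cpp
#include <vector>

// Sum each column of the row-major `rows` x `cols` src tile into an
// accumulator, then either overwrite dst or add to it, depending on the
// `accumulate` flag (mirroring the template parameter above).
void reduce_add_2d_to_1d(std::vector<float>& dst, const std::vector<float>& src,
                         int rows, int cols, bool accumulate) {
    for (int x = 0; x < cols; ++x) {
        float acc = 0.0f;                 // stands in for dtype_acc
        for (int y = 0; y < rows; ++y)
            acc += src[y * cols + x];
        dst[x] = accumulate ? dst[x] + acc : acc;
    }
}
```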

Variable Documentation

◆ msg_type_v

template<typename tile_desc_ , mem_space memory_space>
constexpr msg_type gpu::xetla::subgroup::msg_type_v = msg_type_query<tile_desc_, memory_space>::value
constexpr