Implements the tensor load store functionality using raw send instructions. More...

Collaboration diagram for Tensor load store API:

Functions
template<typename Ty , uint32_t block_width = 1, uint32_t block_height = 1, uint8_t array_len = 1>
__XETLA_API void	gpu::xetla::xetla_fill_tdesc (xetla_tdescriptor_ref tdesc, Ty *p, int tensor_width, int tensor_height, int tensor_pitch, int offset_x, int offset_y)
	Tensor descriptor construction(global memory version).

template<typename Ty >
__XETLA_API void	gpu::xetla::xetla_fill_tdesc (xetla_tdescriptor_ref tdesc, uint32_t base_address, int tensor_width, int tensor_height, int tensor_pitch, int offset_x, int offset_y)
	Tensor descriptor construction(local memory version).

template<typename Ty , uint32_t block_width = 1, uint32_t block_height = 1, uint8_t array_len = 1>
__XETLA_API xetla_tdescriptor	gpu::xetla::xetla_get_tdesc (Ty *p, int tensor_width, int tensor_height, int tensor_pitch, int offset_x, int offset_y)
	Generate a new tensor descriptor(global memory version).

template<typename Ty >
__XETLA_API xetla_tdescriptor	gpu::xetla::xetla_get_tdesc (uint32_t base_address, int tensor_width, int tensor_height, int tensor_pitch, int offset_x, int offset_y)
	Generate a new tensor descriptor(local memory version).

__XETLA_API void	gpu::xetla::xetla_update_tdesc_offsetx (xetla_tdescriptor_ref tdesc, int32_t doffset_x)
	Update the x coordinate in the given tensor descriptor.

__XETLA_API void	gpu::xetla::xetla_update_tdesc_offsety (xetla_tdescriptor_ref tdesc, int32_t doffset_y)
	Update the y coordinate in the given tensor descriptor.

template<typename Ty , uint32_t N, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, bool transpose = false, bool transform = false, gpu_arch arch_tag = gpu_arch::Xe>
__XETLA_API std::enable_if_t< arch_tag==gpu_arch::Xe, xetla_vector< Ty, N > >	gpu::xetla::xetla_tload_global (xetla_tdescriptor tdesc)
	Tensor load API.

template<typename Ty , uint32_t N, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, gpu_arch arch_tag = gpu_arch::Xe>
__XETLA_API std::enable_if_t< arch_tag==gpu_arch::Xe, void >	gpu::xetla::xetla_tstore_global (xetla_tdescriptor tdesc, xetla_vector< Ty, N > data)
	Tensor store API.

template<typename Ty , cache_hint L1H = cache_hint::cached, cache_hint L2H = cache_hint::cached, gpu_arch arch_tag = gpu_arch::Xe>
__XETLA_API std::enable_if_t< arch_tag==gpu_arch::Xe, void >	gpu::xetla::xetla_tprefetch_global (xetla_tdescriptor tdesc)
	Tensor prefetch API.

template<typename Ty , uint32_t N, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, atomic_op Op, gpu_arch arch_tag = gpu_arch::Xe, typename Toffset = uint32_t>
__XETLA_API std::enable_if_t< arch_tag==gpu_arch::Xe, void >	gpu::xetla::xetla_tatomic_store_global (uint64_t base_address, xetla_vector< Toffset, N > offset, xetla_vector< Ty, N > data, xetla_mask< N > pred=1)
	Tensor atomic store API.

Detailed Description

Implements the tensor load store functionality using raw send instructions.

Function Documentation

◆ xetla_fill_tdesc() [1/2]

template<typename Ty , uint32_t block_width = 1, uint32_t block_height = 1, uint8_t array_len = 1>

__XETLA_API void gpu::xetla::xetla_fill_tdesc	(	xetla_tdescriptor_ref	tdesc,
		Ty *	p,
		int	tensor_width,
		int	tensor_height,
		int	tensor_pitch,
		int	offset_x,
		int	offset_y
	)

Tensor descriptor construction(global memory version).

Constructs a tensor descriptor based on the given arguments, check here for more details.

Template Parameters

Ty	is the data type per element.
block_width	is the width of the block to be loaded.
block_height	is the height of the block to be loaded.
array_len	is the array length of the block to be loaded.

Parameters

tdesc	[in|out] is the reference of tensor descriptor.
p	[in] is the base address pointer of the tensor.
tensor_width	[in] is the width of the tensor.
tensor_height	[in] is the height of the tensor.
tensor_pitch	[in] is the pitch(physical width of tensor in memory).
offset_x	[in] is the x coordinate of the start point.
offset_y	[in] is the y coordinate of the start point.

◆ xetla_fill_tdesc() [2/2]

template<typename Ty >

__XETLA_API void gpu::xetla::xetla_fill_tdesc	(	xetla_tdescriptor_ref	tdesc,
		uint32_t	base_address,
		int	tensor_width,
		int	tensor_height,
		int	tensor_pitch,
		int	offset_x,
		int	offset_y
	)

Tensor descriptor construction(local memory version).

Constructs a tensor descriptor based on the given arguments, keeping the same format as the global memory version.

Template Parameters

Ty	is the data type per element.

Parameters

tdesc	[in|out] is the reference of tensor descriptor.
base_address	[in] is the local memory base address of the tensor.
tensor_width	[in] is the width of the tensor.
tensor_height	[in] is the height of the tensor.
tensor_pitch	[in] is the pitch(physical width of tensor in memory).
offset_x	[in] is the x coordinate of the start point.
offset_y	[in] is the y coordinate of the start point.

◆ xetla_get_tdesc() [1/2]

template<typename Ty , uint32_t block_width = 1, uint32_t block_height = 1, uint8_t array_len = 1>

__XETLA_API xetla_tdescriptor gpu::xetla::xetla_get_tdesc	(	Ty *	p,
		int	tensor_width,
		int	tensor_height,
		int	tensor_pitch,
		int	offset_x,
		int	offset_y
	)

Generate a new tensor descriptor(global memory version).

Generate a tensor descriptor based on the given arguments, check here for more details.

Template Parameters

Ty	is the data type per element.
block_width	is the width of the block to be loaded.
block_height	is the height of the block to be loaded.
array_len	is the array length of the block to be loaded.

Parameters

p	[in] is the base address pointer of the tensor.
tensor_width	[in] is the width of the tensor.
tensor_height	[in] is the height of the tensor.
tensor_pitch	[in] is the pitch(physical width of tensor in memory).
offset_x	[in] is the x coordinate of the start point.
offset_y	[in] is the y coordinate of the start point.

Returns: return a new tensor descriptor

◆ xetla_get_tdesc() [2/2]

template<typename Ty >

__XETLA_API xetla_tdescriptor gpu::xetla::xetla_get_tdesc	(	uint32_t	base_address,
		int	tensor_width,
		int	tensor_height,
		int	tensor_pitch,
		int	offset_x,
		int	offset_y
	)

Generate a new tensor descriptor(local memory version).

Generates a tensor descriptor based on the given arguments, keeping the same format as the global memory version.

Template Parameters

Ty	is the data type per element.

Parameters

base_address	[in] is the local memory base address of the tensor.
tensor_width	[in] is the width of the tensor.
tensor_height	[in] is the height of the tensor.
tensor_pitch	[in] is the pitch(physical width of tensor in memory).
offset_x	[in] is the x coordinate of the start point.
offset_y	[in] is the y coordinate of the start point.

Returns: return a new tensor descriptor

◆ xetla_tatomic_store_global()

template<typename Ty , uint32_t N, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, atomic_op Op, gpu_arch arch_tag = gpu_arch::Xe, typename Toffset = uint32_t>

__XETLA_API std::enable_if_t< arch_tag==gpu_arch::Xe, void > gpu::xetla::xetla_tatomic_store_global	(	uint64_t	base_address,
		xetla_vector< Toffset, N >	offset,
		xetla_vector< Ty, N >	data,
		xetla_mask< N >	pred = `1`
	)

Tensor atomic store API.

The tensor atomic store API stores an n-d (e.g. n=2) tensor into global memory. Check here for more details.

Template Parameters

Ty	is the data type per element.
N	is the number of elements to store.
L1H	is L1 cache hint.
L2H	is L2 cache hint.
Op	is the atomic operation type.
Toffset	is the offset data type.

Parameters

base_address	[in] is the 64bit base address of the surface.
offset	[in] is the address offset for each channel; the offset type is 32 bits by default.
data	[in] is tensor data to store.
pred	[in] is the predicate mask (defaults to 1).

Returns: none.

Note: only 64-bit addresses are supported.

◆ xetla_tload_global()

template<typename Ty , uint32_t N, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, bool transpose = false, bool transform = false, gpu_arch arch_tag = gpu_arch::Xe>

__XETLA_API std::enable_if_t< arch_tag==gpu_arch::Xe, xetla_vector< Ty, N > > gpu::xetla::xetla_tload_global ( xetla_tdescriptor tdesc )

Tensor load API.

This is tensor load API from global to registers. Check here for more details.

Template Parameters

Ty	is the data type per element.
N	is the total number of elements to load.
L1H	is L1 cache hint.
L2H	is L2 cache hint.
transpose	is a flag to indicate whether the data is transposed during load.
transform	is a flag to indicate whether the data is transformed (data pack inside dword) during load.

Parameters

tdesc [in] is tensor descriptor including tensor base address, tensor dimensions, block size, etc.

Returns: xetla_vector is data returned from the load.

◆ xetla_tprefetch_global()

template<typename Ty , cache_hint L1H = cache_hint::cached, cache_hint L2H = cache_hint::cached, gpu_arch arch_tag = gpu_arch::Xe>

__XETLA_API std::enable_if_t< arch_tag==gpu_arch::Xe, void > gpu::xetla::xetla_tprefetch_global ( xetla_tdescriptor tdesc )

Tensor prefetch API.

This is tensor prefetch API from global memory to L1$/L2$. Check here for more details.

Template Parameters

Ty	is the data type per element.
L1H	is L1 cache hint.
L2H	is L2 cache hint.

Parameters

tdesc	[in] is tensor descriptor including tensor base address, tensor dimensions, block size, etc.

Returns: none.

◆ xetla_tstore_global()

template<typename Ty , uint32_t N, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, gpu_arch arch_tag = gpu_arch::Xe>

__XETLA_API std::enable_if_t< arch_tag==gpu_arch::Xe, void > gpu::xetla::xetla_tstore_global	(	xetla_tdescriptor	tdesc,
		xetla_vector< Ty, N >	data
	)

Tensor store API.

The tensor store API stores an n-d (e.g. n=2) tensor into global memory using a tensor descriptor. Check here for more details.

Template Parameters

Ty	is the data type per element.
N	is the number of elements to store.
L1H	is L1 cache hint.
L2H	is L2 cache hint.

Parameters

tdesc	[in] is tensor descriptor including tensor base address, tensor dimensions, block size, etc.
data	[in] is tensor data to store.

Returns: none.

◆ xetla_update_tdesc_offsetx()

__XETLA_API void gpu::xetla::xetla_update_tdesc_offsetx	(	xetla_tdescriptor_ref	tdesc,
		int32_t	doffset_x
	)

Update the x coordinate in the given tensor descriptor.

Parameters

tdesc	[in|out] is the reference of tensor descriptor.
doffset_x	[in] is the offset (in number of data elements) in x direction.

◆ xetla_update_tdesc_offsety()

__XETLA_API void gpu::xetla::xetla_update_tdesc_offsety	(	xetla_tdescriptor_ref	tdesc,
		int32_t	doffset_y
	)

Update the y coordinate in the given tensor descriptor.

Parameters

tdesc	[in|out] is the reference of tensor descriptor.
doffset_y	[in] is the offset (in number of data elements) in y direction.

Functions

Detailed Description

Function Documentation

◆ xetla_fill_tdesc() [1/2]

◆ xetla_fill_tdesc() [2/2]

◆ xetla_get_tdesc() [1/2]

◆ xetla_get_tdesc() [2/2]

◆ xetla_tatomic_store_global()

◆ xetla_tload_global()

◆ xetla_tprefetch_global()

◆ xetla_tstore_global()

◆ xetla_update_tdesc_offsetx()

◆ xetla_update_tdesc_offsety()