Defines XeTLA APIs to access memory, including read, write and atomic. More...

Collaboration diagram for Memory access APIs:

Functions
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::cached, cache_hint L2H = cache_hint::cached, int N>
__XETLA_API void	gpu::xetla::xetla_prefetch_global (Ty *p, xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred=1)
	Stateless scattered prefetch.

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::cached, cache_hint L2H = cache_hint::cached>
__XETLA_API void	gpu::xetla::xetla_prefetch_global (Ty *p, uint64_t offset=0)
	Stateless block prefetch (transposed gather with 1 channel).

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, int N, typename Toffset = uint32_t>
__XETLA_API xetla_vector< Ty, N *NElts >	gpu::xetla::xetla_load_global (Ty *p, xetla_vector< Toffset, N > offsets, xetla_mask< N > pred=1)
	Stateless scattered load.

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API xetla_vector< Ty, NElts >	gpu::xetla::xetla_load_global (Ty *p, uint64_t offset=0)
	Stateless block load (transposed gather with 1 channel).

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, int N, typename Toffset = uint32_t>
__XETLA_API void	gpu::xetla::xetla_store_global (Ty *p, xetla_vector< Toffset, N > offsets, xetla_vector< Ty, N *NElts > vals, xetla_mask< N > pred=1)
	Stateless scattered store.

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API void	gpu::xetla::xetla_store_global (Ty *p, uint64_t offset, xetla_vector< Ty, NElts > vals)
	Stateless block store (transposed scatter with 1 channel).

template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API xetla_vector< T, N >	gpu::xetla::xetla_atomic_global (T *p, xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred)
	Stateless scattered atomic (0 src).

template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API xetla_vector< T, N >	gpu::xetla::xetla_atomic_global (T *p, xetla_vector< uint32_t, N > offsets, xetla_vector< T, N > src0, xetla_mask< N > pred)
	Stateless scattered atomic (1 src).

template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API xetla_vector< T, N >	gpu::xetla::xetla_atomic_global (T *p, xetla_vector< uint32_t, N > offsets, xetla_vector< T, N > src0, xetla_vector< T, N > src1, xetla_mask< N > pred)
	Stateless scattered atomic (2 src).

template<uint32_t SLMSize>
__XETLA_API void	gpu::xetla::xetla_local_init ()
	Declare per-work-group slm size.

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, int N>
__XETLA_API xetla_vector< Ty, N *NElts >	gpu::xetla::xetla_load_local (xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred=1)
	SLM scattered load.

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size>
__XETLA_API xetla_vector< Ty, NElts >	gpu::xetla::xetla_load_local (uint32_t offset)
	SLM block load.

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, int N>
__XETLA_API void	gpu::xetla::xetla_store_local (xetla_vector< uint32_t, N > offsets, xetla_vector< Ty, N *NElts > vals, xetla_mask< N > pred=1)
	SLM scattered store.

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size>
__XETLA_API void	gpu::xetla::xetla_store_local (uint32_t offset, xetla_vector< Ty, NElts > vals)
	SLM block store (transposed SLM scatter with 1 channel).

template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size>
__XETLA_API xetla_vector< T, N >	gpu::xetla::xetla_atomic_local (xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred)
	SLM scattered atomic (0 src).

template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size>
__XETLA_API xetla_vector< T, N >	gpu::xetla::xetla_atomic_local (xetla_vector< uint32_t, N > offsets, xetla_vector< T, N > src0, xetla_mask< N > pred)
	SLM scattered atomic (1 src).

template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size>
__XETLA_API xetla_vector< T, N >	gpu::xetla::xetla_atomic_local (xetla_vector< uint32_t, N > offsets, xetla_vector< T, N > src0, xetla_vector< T, N > src1, xetla_mask< N > pred)
	SLM scattered atomic (2 src).

template<memory_kind Kind = memory_kind::untyped_global, fence_op FenceOp = fence_op::none, fence_scope Scope = fence_scope::group, int N = 16>
__XETLA_API void	gpu::xetla::xetla_fence (xetla_mask< N > pred=1)
	Memory fence.

Detailed Description

Defines XeTLA APIs to access memory, including read, write and atomic.

Function Documentation

◆ xetla_atomic_global() [1/3]

template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>

__XETLA_API xetla_vector< T, N > gpu::xetla::xetla_atomic_global	(	T *	p,
		xetla_vector< uint32_t, N >	offsets,
		xetla_mask< N >	pred
	)

Stateless scattered atomic (0 src).

Supported platforms: DG2, PVC. VISA instruction: lsc_atomic_<OP>.ugm

Template Parameters

Op	is operation type.
T	is element type.
N	is the number of SIMD channels (platform dependent).
DS	is the data size.
L1H	is L1 cache hint.
L2H	is L2 cache hint.

Parameters

p	[in] is the base pointer.
offsets	[in] is the zero-based offsets.
pred	[in] is predicates.

◆ xetla_atomic_global() [2/3]

template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>

__XETLA_API xetla_vector< T, N > gpu::xetla::xetla_atomic_global	(	T *	p,
		xetla_vector< uint32_t, N >	offsets,
		xetla_vector< T, N >	src0,
		xetla_mask< N >	pred
	)

Stateless scattered atomic (1 src).

Supported platforms: DG2, PVC. VISA instruction: lsc_atomic_<OP>.ugm

Template Parameters

Op	is operation type.
T	is element type.
N	is the number of SIMD channels (platform dependent).
DS	is the data size.
L1H	is L1 cache hint.
L2H	is L2 cache hint.

Parameters

p	[in] is the base pointer.
offsets	[in] is the zero-based offsets.
src0	[in] is the first atomic operand.
pred	[in] is predicates.

◆ xetla_atomic_global() [3/3]

template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>

__XETLA_API xetla_vector< T, N > gpu::xetla::xetla_atomic_global	(	T *	p,
		xetla_vector< uint32_t, N >	offsets,
		xetla_vector< T, N >	src0,
		xetla_vector< T, N >	src1,
		xetla_mask< N >	pred
	)

Stateless scattered atomic (2 src).

Supported platforms: DG2, PVC. VISA instruction: lsc_atomic_<OP>.ugm

Template Parameters

Op	is operation type.
T	is element type.
N	is the number of SIMD channels (platform dependent).
DS	is the data size.
L1H	is L1 cache hint.
L2H	is L2 cache hint.

Parameters

p	[in] is the base pointer.
offsets	[in] is the zero-based offsets.
src0	[in] is the first atomic operand.
src1	[in] is the second atomic operand.
pred	[in] is predicates.

◆ xetla_atomic_local() [1/3]

template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size>

__XETLA_API xetla_vector< T, N > gpu::xetla::xetla_atomic_local	(	xetla_vector< uint32_t, N >	offsets,
		xetla_mask< N >	pred
	)

SLM scattered atomic (0 src).

Supported platforms: DG2, PVC. VISA instruction: lsc_atomic_<OP>.slm

Template Parameters

Op	is operation type.
T	is element type.
N	is the number of SIMD channels (platform dependent).
DS	is the data size.

Parameters

offsets	[in] is the zero-based offsets.
pred	[in] is predicates.

◆ xetla_atomic_local() [2/3]

template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size>

__XETLA_API xetla_vector< T, N > gpu::xetla::xetla_atomic_local	(	xetla_vector< uint32_t, N >	offsets,
		xetla_vector< T, N >	src0,
		xetla_mask< N >	pred
	)

SLM scattered atomic (1 src).

Supported platforms: DG2, PVC. VISA instruction: lsc_atomic_<OP>.slm

Template Parameters

Op	is operation type.
T	is element type.
N	is the number of SIMD channels (platform dependent).
DS	is the data size.

Parameters

offsets	[in] is the zero-based offsets.
src0	[in] is the first atomic operand.
pred	[in] is predicates.

◆ xetla_atomic_local() [3/3]

template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size>

__XETLA_API xetla_vector< T, N > gpu::xetla::xetla_atomic_local	(	xetla_vector< uint32_t, N >	offsets,
		xetla_vector< T, N >	src0,
		xetla_vector< T, N >	src1,
		xetla_mask< N >	pred
	)

SLM scattered atomic (2 src).

Supported platforms: DG2, PVC. VISA instruction: lsc_atomic_<OP>.slm

Template Parameters

Op	is operation type.
T	is element type.
N	is the number of SIMD channels (platform dependent).
DS	is the data size.

Parameters

offsets	[in] is the zero-based offsets.
src0	[in] is the first atomic operand.
src1	[in] is the second atomic operand.
pred	[in] is predicates.

◆ xetla_fence()

template<memory_kind Kind = memory_kind::untyped_global, fence_op FenceOp = fence_op::none, fence_scope Scope = fence_scope::group, int N = 16>

__XETLA_API void gpu::xetla::xetla_fence ( xetla_mask< N > pred = 1 )

Memory fence.

Supported platforms: DG2, PVC

Template Parameters

Kind	is the SFID shared function (memory kind).
FenceOp	is the fence operation.
Scope	is the operation scope.
N	is the number of SIMD channels (platform dependent).

Parameters

pred	[in] is predicates.

◆ xetla_load_global() [1/2]

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>

__XETLA_API xetla_vector< Ty, NElts > gpu::xetla::xetla_load_global	(	Ty *	p,
		uint64_t	offset = `0`
	)

Stateless block load (transposed gather with 1 channel).

Collects elements located at the specified address and returns them as a single xetla_vector object.

Supported platforms: DG2, PVC

VISA instruction: lsc_load.ugm

Template Parameters

Ty	is element type.
NElts	is the number of elements to load per address (i.e. vector_size per SIMD channel).
DS	is the data size.
L1H	is L1 cache hint.
L2H	is L2 cache hint.

Parameters

p	[in] is the base pointer.
offset	[in] is the zero-based offset in bytes.

Returns: a xetla_vector of type Ty and size NElts.

◆ xetla_load_global() [2/2]

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, int N, typename Toffset = uint32_t>

__XETLA_API xetla_vector< Ty, N *NElts > gpu::xetla::xetla_load_global	(	Ty *	p,
		xetla_vector< Toffset, N >	offsets,
		xetla_mask< N >	pred = `1`
	)

Stateless scattered load.

Collects elements located at the specified addresses and returns them as a single xetla_vector object.

Supported platforms: DG2, PVC

VISA instruction: lsc_load.ugm

Template Parameters

Ty	is element type.
NElts	is the number of elements to load per address (i.e. vector_size per SIMD channel).
DS	is the data size.
L1H	is L1 cache hint.
L2H	is L2 cache hint.
N	is the number of SIMD channels (platform dependent).

Parameters

p	[in] is the base pointer.
offsets	[in] is the zero-based offsets in bytes.
pred	[in] is predicates.

Returns: a xetla_vector of type Ty and size N * NElts.

◆ xetla_load_local() [1/2]

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size>

__XETLA_API xetla_vector< Ty, NElts > gpu::xetla::xetla_load_local ( uint32_t offset )

SLM block load.

This is a transposed gather with 1 channel. Collects elements located in SLM and returns them as a single xetla_vector object.

Supported platforms: DG2, PVC

VISA instruction: lsc_load.slm

Template Parameters

Ty	is element type.
NElts	is the number of elements to load per address (i.e. vector_size per SIMD channel).
DS	is the data size.

Parameters

offset [in] is the zero-based offset for SLM buffer in bytes.

Returns: a xetla_vector of type Ty and size NElts.

◆ xetla_load_local() [2/2]

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, int N>

__XETLA_API xetla_vector< Ty, N *NElts > gpu::xetla::xetla_load_local	(	xetla_vector< uint32_t, N >	offsets,
		xetla_mask< N >	pred = `1`
	)

SLM scattered load.

Collects elements located at slm and returns them as a single xetla_vector object.

Supported platforms: DG2, PVC

VISA instruction: lsc_load.slm

Template Parameters

Ty	is element type.
NElts	is the number of elements to load per address (i.e. vector_size per SIMD channel).
DS	is the data size.
N	is the number of SIMD channels (platform dependent).

Parameters

offsets	[in] is the zero-based offsets for SLM buffer in bytes.
pred	[in] is predicates.

Returns: a xetla_vector of type Ty and size N * NElts.

◆ xetla_local_init()

template<uint32_t SLMSize>

__XETLA_API void gpu::xetla::xetla_local_init ( )

Declare per-work-group slm size.

Template Parameters

SLMSize Shared Local Memory (SLM) size (in Bytes).

◆ xetla_prefetch_global() [1/2]

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::cached, cache_hint L2H = cache_hint::cached>

__XETLA_API void gpu::xetla::xetla_prefetch_global	(	Ty *	p,
		uint64_t	offset = `0`
	)

Stateless block prefetch (transposed gather with 1 channel).

Prefetches elements located at specified address.

Supported platforms: DG2, PVC

VISA instruction: lsc_load.ugm

Template Parameters

Ty	is element type.
NElts	is the number of elements to prefetch per address (i.e. vector_size per SIMD channel).
DS	is the data size.
L1H	is L1 cache hint.
L2H	is L2 cache hint.

Parameters

p	[in] is the base pointer.
offset	[in] is the zero-based offset in bytes.

◆ xetla_prefetch_global() [2/2]

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::cached, cache_hint L2H = cache_hint::cached, int N>

__XETLA_API void gpu::xetla::xetla_prefetch_global	(	Ty *	p,
		xetla_vector< uint32_t, N >	offsets,
		xetla_mask< N >	pred = `1`
	)

Stateless scattered prefetch.

Prefetches elements located at specified address.

Supported platforms: DG2, PVC

VISA instruction: lsc_load.ugm

Template Parameters

Ty	is element type.
NElts	is the number of elements to prefetch per address (i.e. vector_size per SIMD channel).
DS	is the data size.
L1H	is L1 cache hint.
L2H	is L2 cache hint.
N	is the number of SIMD channels (platform dependent).

Parameters

p	[in] is the base pointer.
offsets	[in] is the zero-based offsets in bytes.
pred	[in] is predicates.

◆ xetla_store_global() [1/2]

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>

__XETLA_API void gpu::xetla::xetla_store_global	(	Ty *	p,
		uint64_t	offset,
		xetla_vector< Ty, NElts >	vals
	)

Stateless block store (transposed scatter with 1 channel).

Writes elements to the specified address.

Supported platforms: DG2, PVC

VISA instruction: lsc_store.ugm

Template Parameters

Ty	is element type.
NElts	is the number of elements to store per address (i.e. vector_size per SIMD channel).
DS	is the data size.
L1H	is L1 cache hint.
L2H	is L2 cache hint.

Parameters

p	[in] is the base pointer.
offset	[in] is the zero-based offset in bytes.
vals	[in] is values to store.

◆ xetla_store_global() [2/2]

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, int N, typename Toffset = uint32_t>

__XETLA_API void gpu::xetla::xetla_store_global	(	Ty *	p,
		xetla_vector< Toffset, N >	offsets,
		xetla_vector< Ty, N *NElts >	vals,
		xetla_mask< N >	pred = `1`
	)

Stateless scattered store.

Writes elements to the specified addresses.

Supported platforms: DG2, PVC

VISA instruction: lsc_store.ugm

Template Parameters

Ty	is element type.
NElts	is the number of elements to store per address (i.e. vector_size per SIMD channel).
DS	is the data size.
L1H	is L1 cache hint.
L2H	is L2 cache hint.
N	is the number of SIMD channels (platform dependent).

Parameters

p	[in] is the base pointer.
offsets	[in] is the zero-based offsets in bytes.
vals	[in] is values to store.
pred	[in] is predicates.

◆ xetla_store_local() [1/2]

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size>

__XETLA_API void gpu::xetla::xetla_store_local	(	uint32_t	offset,
		xetla_vector< Ty, NElts >	vals
	)

SLM block store (transposed SLM scatter with 1 channel).

Stores elements to SLM.

Supported platforms: DG2, PVC

VISA instruction: lsc_store.slm

Template Parameters

Ty	is element type.
NElts	is the number of elements to store per address (i.e. vector_size per SIMD channel).
DS	is the data size.

Parameters

offset	[in] is the zero-based offset for SLM buffer in bytes.
vals	[in] is values to store.

◆ xetla_store_local() [2/2]

template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, int N>

__XETLA_API void gpu::xetla::xetla_store_local	(	xetla_vector< uint32_t, N >	offsets,
		xetla_vector< Ty, N *NElts >	vals,
		xetla_mask< N >	pred = `1`
	)

SLM scattered store.

Scatters elements to SLM.

Supported platforms: DG2, PVC

VISA instruction: lsc_store.slm

Template Parameters

Ty	is element type.
NElts	is the number of elements to store per address (i.e. vector_size per SIMD channel).
DS	is the data size.
N	is the number of SIMD channels (platform dependent).

Parameters

offsets	[in] is the zero-based offsets for SLM buffer in bytes.
vals	[in] is values to store.
pred	[in] is predicates.

Functions

Detailed Description

Function Documentation

◆ xetla_atomic_global() [1/3]

◆ xetla_atomic_global() [2/3]

◆ xetla_atomic_global() [3/3]

◆ xetla_atomic_local() [1/3]

◆ xetla_atomic_local() [2/3]

◆ xetla_atomic_local() [3/3]

◆ xetla_fence()

◆ xetla_load_global() [1/2]

◆ xetla_load_global() [2/2]

◆ xetla_load_local() [1/2]

◆ xetla_load_local() [2/2]

◆ xetla_local_init()

◆ xetla_prefetch_global() [1/2]

◆ xetla_prefetch_global() [2/2]

◆ xetla_store_global() [1/2]

◆ xetla_store_global() [2/2]

◆ xetla_store_local() [1/2]

◆ xetla_store_local() [2/2]