XeTLA v0.3.6
Intel® Xe Templates for Linear Algebra - API Definition Document
 
memory.hpp File Reference

C++ API.

[Include dependency graphs for memory.hpp omitted.]


Namespaces

namespace  gpu
 
namespace  gpu::xetla
 
namespace  gpu::xetla::detail
 

Functions

constexpr __ESIMD_ENS::cache_hint gpu::xetla::detail::get_cache_hint (gpu::xetla::cache_hint ch)
 Lookup table for cache hint.
 
constexpr __ESIMD_ENS::lsc_data_size gpu::xetla::detail::get_data_size (gpu::xetla::data_size ds)
 Lookup table for data size.
 
constexpr __ESIMD_ENS::lsc_memory_kind gpu::xetla::detail::get_memory_kind (gpu::xetla::memory_kind mk)
 Lookup table for memory kind.
 
constexpr __ESIMD_ENS::lsc_fence_op gpu::xetla::detail::get_fence_op (gpu::xetla::fence_op fo)
 Lookup table for fence op.
 
constexpr __ESIMD_ENS::lsc_scope gpu::xetla::detail::get_fence_scope (gpu::xetla::fence_scope fs)
 Lookup table for fence scope.
 
constexpr __ESIMD_NS::atomic_op gpu::xetla::detail::get_atomic_op (gpu::xetla::atomic_op ao)
 Lookup table for atomic op.
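The detail:: helpers above are compile-time mappings from XeTLA's portable enums to the underlying ESIMD/LSC enums. A minimal sketch of evaluating two of them at compile time; the umbrella include path is an assumption, and only enumerators that appear as defaults elsewhere in this header are used:

    #include <xetla.hpp> // assumed umbrella header; adjust to your include layout

    // Resolve XeTLA enums to their ESIMD counterparts at compile time.
    constexpr __ESIMD_ENS::cache_hint l1 =
            gpu::xetla::detail::get_cache_hint(gpu::xetla::cache_hint::cached);
    constexpr __ESIMD_ENS::lsc_data_size ds =
            gpu::xetla::detail::get_data_size(gpu::xetla::data_size::default_size);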
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::cached, cache_hint L2H = cache_hint::cached, int N>
__XETLA_API void gpu::xetla::xetla_prefetch_global (Ty *p, xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred=1)
 Stateless scattered prefetch.
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::cached, cache_hint L2H = cache_hint::cached>
__XETLA_API void gpu::xetla::xetla_prefetch_global (Ty *p, uint64_t offset=0)
 Stateless block prefetch (transposed gather with 1 channel).
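A hedged sketch of calling both prefetch overloads from device code (the include path, kernel context, and offset units are assumptions to check against the implementation):

    #include <xetla.hpp> // assumed umbrella header
    using namespace gpu::xetla;

    // Device-side fragment; XeTLA memory primitives are meant to run inside
    // an ESIMD kernel.
    inline void prefetch_sketch(float *p, xetla_vector<uint32_t, 16> offsets) {
        // Scattered prefetch: NElts = 1 element per channel at each offset,
        // using this overload's cached/cached default hints.
        xetla_prefetch_global<float, 1>(p, offsets);

        // Block prefetch: NElts contiguous elements starting at a scalar offset.
        xetla_prefetch_global<float, 16>(p, /*offset*/ 0);
    }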
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, int N, typename Toffset = uint32_t>
__XETLA_API xetla_vector< Ty, N *NElts > gpu::xetla::xetla_load_global (Ty *p, xetla_vector< Toffset, N > offsets, xetla_mask< N > pred=1)
 Stateless scattered load.
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API xetla_vector< Ty, NElts > gpu::xetla::xetla_load_global (Ty *p, uint64_t offset=0)
 Stateless block load (transposed gather with 1 channel).
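A hedged usage sketch for the two load overloads (include path, kernel context, and offset units are assumptions):

    #include <xetla.hpp> // assumed umbrella header
    using namespace gpu::xetla;

    // Device-side fragment (runs inside an ESIMD kernel).
    inline void load_sketch(float *p, xetla_vector<uint32_t, 16> offsets,
                            xetla_mask<16> pred) {
        // Scattered load: 16 channels x NElts = 1 element each, predicated;
        // cache hints default to cache_hint::none for loads.
        xetla_vector<float, 16> gathered =
                xetla_load_global<float, 1>(p, offsets, pred);

        // Block load: NElts = 8 contiguous elements from the base pointer.
        xetla_vector<float, 8> block = xetla_load_global<float, 8>(p);
        (void)gathered;
        (void)block;
    }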
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, int N, typename Toffset = uint32_t>
__XETLA_API void gpu::xetla::xetla_store_global (Ty *p, xetla_vector< Toffset, N > offsets, xetla_vector< Ty, N *NElts > vals, xetla_mask< N > pred=1)
 Stateless scattered store.
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API void gpu::xetla::xetla_store_global (Ty *p, uint64_t offset, xetla_vector< Ty, NElts > vals)
 Stateless block store (transposed scatter with 1 channel).
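A hedged usage sketch for the two store overloads (same assumptions as the load sketch above):

    #include <xetla.hpp> // assumed umbrella header
    using namespace gpu::xetla;

    // Device-side fragment (runs inside an ESIMD kernel).
    inline void store_sketch(float *p, xetla_vector<uint32_t, 16> offsets,
                             xetla_vector<float, 16> vals) {
        // Scattered store: one element per channel; the mask defaults to all-on.
        xetla_store_global<float, 1>(p, offsets, vals);

        // Block store: NElts = 16 contiguous elements at a scalar offset.
        xetla_store_global<float, 16>(p, /*offset*/ 0, vals);
    }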
 
template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API xetla_vector< T, N > gpu::xetla::xetla_atomic_global (T *p, xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred)
 Stateless scattered atomic (0 src).
 
template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API xetla_vector< T, N > gpu::xetla::xetla_atomic_global (T *p, xetla_vector< uint32_t, N > offsets, xetla_vector< T, N > src0, xetla_mask< N > pred)
 Stateless scattered atomic (1 src).
 
template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API xetla_vector< T, N > gpu::xetla::xetla_atomic_global (T *p, xetla_vector< uint32_t, N > offsets, xetla_vector< T, N > src0, xetla_vector< T, N > src1, xetla_mask< N > pred)
 Stateless scattered atomic (2 src).
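Because the atomic_op enumerators are not listed in this file, the sketch below leaves the operation as a template parameter; everything else follows the signatures above (include path and kernel context assumed):

    #include <xetla.hpp> // assumed umbrella header
    using namespace gpu::xetla;

    // Device-side fragment. Shows the 1-source form; the 0-source form drops
    // src0, and the 2-source (compare-exchange style) form adds a second
    // operand vector.
    template <atomic_op Op>
    inline xetla_vector<uint32_t, 16> atomic_global_sketch(
            uint32_t *p, xetla_vector<uint32_t, 16> offsets,
            xetla_vector<uint32_t, 16> src0, xetla_mask<16> pred) {
        return xetla_atomic_global<Op, uint32_t, 16>(p, offsets, src0, pred);
    }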
 
template<uint32_t SLMSize>
__XETLA_API void gpu::xetla::xetla_local_init ()
 Declare per-work-group SLM size.
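A minimal sketch of reserving SLM for the work-group; the size is presumed to be in bytes (not stated here) and the include path is an assumption:

    #include <xetla.hpp> // assumed umbrella header
    using namespace gpu::xetla;

    // Call once per work-group before any xetla_*_local access.
    inline void slm_init_sketch() {
        xetla_local_init<4 * 1024>(); // reserve 4 KB of SLM (unit presumed bytes)
    }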
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, int N>
__XETLA_API xetla_vector< Ty, N *NElts > gpu::xetla::xetla_load_local (xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred=1)
 SLM scattered load.
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size>
__XETLA_API xetla_vector< Ty, NElts > gpu::xetla::xetla_load_local (uint32_t offset)
 SLM block load.
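A hedged sketch for both SLM load overloads (assumes SLM was reserved with xetla_local_init and that offsets are byte-granular, which should be confirmed against the implementation):

    #include <xetla.hpp> // assumed umbrella header
    using namespace gpu::xetla;

    // Device-side fragment (runs inside an ESIMD kernel).
    inline void slm_load_sketch(xetla_vector<uint32_t, 16> offsets,
                                xetla_mask<16> pred) {
        // Scattered SLM load: one element per channel at each offset.
        xetla_vector<float, 16> gathered = xetla_load_local<float, 1>(offsets, pred);

        // Block SLM load: NElts = 8 contiguous elements at a scalar SLM offset.
        xetla_vector<float, 8> block = xetla_load_local<float, 8>(0);
        (void)gathered;
        (void)block;
    }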
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, int N>
__XETLA_API void gpu::xetla::xetla_store_local (xetla_vector< uint32_t, N > offsets, xetla_vector< Ty, N *NElts > vals, xetla_mask< N > pred=1)
 SLM scattered store.
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size>
__XETLA_API void gpu::xetla::xetla_store_local (uint32_t offset, xetla_vector< Ty, NElts > vals)
 SLM block store (transposed SLM scatter with 1 channel).
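A matching sketch for the SLM store overloads (same assumptions as the SLM load sketch):

    #include <xetla.hpp> // assumed umbrella header
    using namespace gpu::xetla;

    // Device-side fragment (runs inside an ESIMD kernel).
    inline void slm_store_sketch(xetla_vector<uint32_t, 16> offsets,
                                 xetla_vector<float, 16> vals) {
        // Scattered SLM store: one element per channel; mask defaults to all-on.
        xetla_store_local<float, 1>(offsets, vals);

        // Block SLM store: NElts = 16 contiguous elements at a scalar SLM offset.
        xetla_store_local<float, 16>(0u, vals);
    }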
 
template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size>
__XETLA_API xetla_vector< T, N > gpu::xetla::xetla_atomic_local (xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred)
 SLM scattered atomic (0 src).
 
template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size>
__XETLA_API xetla_vector< T, N > gpu::xetla::xetla_atomic_local (xetla_vector< uint32_t, N > offsets, xetla_vector< T, N > src0, xetla_mask< N > pred)
 SLM scattered atomic (1 src).
 
template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size>
__XETLA_API xetla_vector< T, N > gpu::xetla::xetla_atomic_local (xetla_vector< uint32_t, N > offsets, xetla_vector< T, N > src0, xetla_vector< T, N > src1, xetla_mask< N > pred)
 SLM scattered atomic (2 src).
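As with the global atomics, the operation is left as a template parameter in this sketch since the atomic_op enumerators are not listed here (include path and kernel context assumed):

    #include <xetla.hpp> // assumed umbrella header
    using namespace gpu::xetla;

    // Device-side fragment. Shows the 1-source form; the 0-source and
    // 2-source forms drop or add operand vectors as in the signatures above.
    template <atomic_op Op>
    inline xetla_vector<uint32_t, 16> atomic_local_sketch(
            xetla_vector<uint32_t, 16> offsets,
            xetla_vector<uint32_t, 16> src0, xetla_mask<16> pred) {
        return xetla_atomic_local<Op, uint32_t, 16>(offsets, src0, pred);
    }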
 
template<memory_kind Kind = memory_kind::untyped_global, fence_op FenceOp = fence_op::none, fence_scope Scope = fence_scope::group, int N = 16>
__XETLA_API void gpu::xetla::xetla_fence (xetla_mask< N > pred=1)
 Memory fence.
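A minimal sketch of issuing a fence with the defaults declared above (include path and kernel context assumed):

    #include <xetla.hpp> // assumed umbrella header
    using namespace gpu::xetla;

    // Device-side fragment (runs inside an ESIMD kernel).
    inline void fence_sketch() {
        // All defaults: untyped global memory, fence_op::none, group scope,
        // 16 channels with the predicate defaulted to all-on.
        xetla_fence();

        // The same call with the defaults spelled out explicitly.
        xetla_fence<memory_kind::untyped_global, fence_op::none,
                fence_scope::group>();
    }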
 
