XeTLA v0.3.6
Intel® Xe Templates for Linear Algebra - API Definition Document
 
memory.hpp File Reference

C++ API.

[Include dependency graphs for memory.hpp omitted.]


Namespaces

namespace  gpu
 
namespace  gpu::xetla
 
namespace  gpu::xetla::detail
 

Functions

constexpr __ESIMD_ENS::cache_hint gpu::xetla::detail::get_cache_hint (gpu::xetla::cache_hint ch)
 Lookup table for cache hint.
 
constexpr __ESIMD_ENS::lsc_data_size gpu::xetla::detail::get_data_size (gpu::xetla::data_size ds)
 Lookup table for data size.
 
constexpr __ESIMD_ENS::lsc_memory_kind gpu::xetla::detail::get_memory_kind (gpu::xetla::memory_kind mk)
 Lookup table for memory kind.
 
constexpr __ESIMD_ENS::lsc_fence_op gpu::xetla::detail::get_fence_op (gpu::xetla::fence_op fo)
 Lookup table for fence op.
 
constexpr __ESIMD_ENS::lsc_scope gpu::xetla::detail::get_fence_scope (gpu::xetla::fence_scope fs)
 Lookup table for fence scope.
 
constexpr __ESIMD_NS::atomic_op gpu::xetla::detail::get_atomic_op (gpu::xetla::atomic_op ao)
 Lookup table for atomic op.
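The detail:: helpers above are compile-time mappings from XeTLA's portable enums to the underlying ESIMD/LSC enums. A minimal sketch of evaluating two of them at compile time; the umbrella include path is an assumption, and only enumerators that appear as defaults elsewhere in this header are used:

    #include <xetla.hpp> // assumed umbrella header; adjust to your include layout

    // Resolve XeTLA enums to their ESIMD counterparts at compile time.
    constexpr __ESIMD_ENS::cache_hint l1 =
            gpu::xetla::detail::get_cache_hint(gpu::xetla::cache_hint::cached);
    constexpr __ESIMD_ENS::lsc_data_size ds =
            gpu::xetla::detail::get_data_size(gpu::xetla::data_size::default_size);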
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::cached, cache_hint L2H = cache_hint::cached, int N>
__XETLA_API void gpu::xetla::xetla_prefetch_global (Ty *p, xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred=1)
 Stateless scattered prefetch.
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::cached, cache_hint L2H = cache_hint::cached>
__XETLA_API void gpu::xetla::xetla_prefetch_global (Ty *p, uint64_t offset=0)
 Stateless block prefetch (transposed gather with 1 channel).
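A hedged sketch of calling both prefetch overloads from device code (the include path, kernel context, and offset units are assumptions to check against the implementation):

    #include <xetla.hpp> // assumed umbrella header
    using namespace gpu::xetla;

    // Device-side fragment; XeTLA memory primitives are meant to run inside
    // an ESIMD kernel.
    inline void prefetch_sketch(float *p, xetla_vector<uint32_t, 16> offsets) {
        // Scattered prefetch: NElts = 1 element per channel at each offset,
        // using this overload's cached/cached default hints.
        xetla_prefetch_global<float, 1>(p, offsets);

        // Block prefetch: NElts contiguous elements starting at a scalar offset.
        xetla_prefetch_global<float, 16>(p, /*offset*/ 0);
    }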
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, int N, typename Toffset = uint32_t>
__XETLA_API xetla_vector< Ty, N *NElts > gpu::xetla::xetla_load_global (Ty *p, xetla_vector< Toffset, N > offsets, xetla_mask< N > pred=1)
 Stateless scattered load.
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API xetla_vector< Ty, NElts > gpu::xetla::xetla_load_global (Ty *p, uint64_t offset=0)
 Stateless block load (transposed gather with 1 channel).
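A hedged usage sketch for the two load overloads (include path, kernel context, and offset units are assumptions):

    #include <xetla.hpp> // assumed umbrella header
    using namespace gpu::xetla;

    // Device-side fragment (runs inside an ESIMD kernel).
    inline void load_sketch(float *p, xetla_vector<uint32_t, 16> offsets,
                            xetla_mask<16> pred) {
        // Scattered load: 16 channels x NElts = 1 element each, predicated;
        // cache hints default to cache_hint::none for loads.
        xetla_vector<float, 16> gathered =
                xetla_load_global<float, 1>(p, offsets, pred);

        // Block load: NElts = 8 contiguous elements from the base pointer.
        xetla_vector<float, 8> block = xetla_load_global<float, 8>(p);
        (void)gathered;
        (void)block;
    }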
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, int N, typename Toffset = uint32_t>
__XETLA_API void gpu::xetla::xetla_store_global (Ty *p, xetla_vector< Toffset, N > offsets, xetla_vector< Ty, N *NElts > vals, xetla_mask< N > pred=1)
 Stateless scattered store.
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API void gpu::xetla::xetla_store_global (Ty *p, uint64_t offset, xetla_vector< Ty, NElts > vals)
 Stateless block store (transposed scatter with 1 channel).
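A hedged usage sketch for the two store overloads (same assumptions as the load sketch above):

    #include <xetla.hpp> // assumed umbrella header
    using namespace gpu::xetla;

    // Device-side fragment (runs inside an ESIMD kernel).
    inline void store_sketch(float *p, xetla_vector<uint32_t, 16> offsets,
                             xetla_vector<float, 16> vals) {
        // Scattered store: one element per channel; the mask defaults to all-on.
        xetla_store_global<float, 1>(p, offsets, vals);

        // Block store: NElts = 16 contiguous elements at a scalar offset.
        xetla_store_global<float, 16>(p, /*offset*/ 0, vals);
    }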
 
template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API xetla_vector< T, N > gpu::xetla::xetla_atomic_global (T *p, xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred)
 Stateless scattered atomic (0 src).
 
template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API xetla_vector< T, N > gpu::xetla::xetla_atomic_global (T *p, xetla_vector< uint32_t, N > offsets, xetla_vector< T, N > src0, xetla_mask< N > pred)
 Stateless scattered atomic (1 src).
 
template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API xetla_vector< T, N > gpu::xetla::xetla_atomic_global (T *p, xetla_vector< uint32_t, N > offsets, xetla_vector< T, N > src0, xetla_vector< T, N > src1, xetla_mask< N > pred)
 Stateless scattered atomic (2 src).
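Because the atomic_op enumerators are not listed in this file, the sketch below leaves the operation as a template parameter; everything else follows the signatures above (include path and kernel context assumed):

    #include <xetla.hpp> // assumed umbrella header
    using namespace gpu::xetla;

    // Device-side fragment. Shows the 1-source form; the 0-source form drops
    // src0, and the 2-source (compare-exchange style) form adds a second
    // operand vector.
    template <atomic_op Op>
    inline xetla_vector<uint32_t, 16> atomic_global_sketch(
            uint32_t *p, xetla_vector<uint32_t, 16> offsets,
            xetla_vector<uint32_t, 16> src0, xetla_mask<16> pred) {
        return xetla_atomic_global<Op, uint32_t, 16>(p, offsets, src0, pred);
    }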
 
template<uint32_t SLMSize>
__XETLA_API void gpu::xetla::xetla_local_init ()
 Declare per-work-group SLM size.
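A minimal sketch of reserving SLM for the work-group; the size is presumed to be in bytes (not stated here) and the include path is an assumption:

    #include <xetla.hpp> // assumed umbrella header
    using namespace gpu::xetla;

    // Call once per work-group before any xetla_*_local access.
    inline void slm_init_sketch() {
        xetla_local_init<4 * 1024>(); // reserve 4 KB of SLM (unit presumed bytes)
    }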
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, int N>
__XETLA_API xetla_vector< Ty, N *NElts > gpu::xetla::xetla_load_local (xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred=1)
 SLM scattered load.
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size>
__XETLA_API xetla_vector< Ty, NElts > gpu::xetla::xetla_load_local (uint32_t offset)
 SLM block load.
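A hedged sketch for both SLM load overloads (assumes SLM was reserved with xetla_local_init and that offsets are byte-granular, which should be confirmed against the implementation):

    #include <xetla.hpp> // assumed umbrella header
    using namespace gpu::xetla;

    // Device-side fragment (runs inside an ESIMD kernel).
    inline void slm_load_sketch(xetla_vector<uint32_t, 16> offsets,
                                xetla_mask<16> pred) {
        // Scattered SLM load: one element per channel at each offset.
        xetla_vector<float, 16> gathered = xetla_load_local<float, 1>(offsets, pred);

        // Block SLM load: NElts = 8 contiguous elements at a scalar SLM offset.
        xetla_vector<float, 8> block = xetla_load_local<float, 8>(0);
        (void)gathered;
        (void)block;
    }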
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, int N>
__XETLA_API void gpu::xetla::xetla_store_local (xetla_vector< uint32_t, N > offsets, xetla_vector< Ty, N *NElts > vals, xetla_mask< N > pred=1)
 SLM scattered store.
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size>
__XETLA_API void gpu::xetla::xetla_store_local (uint32_t offset, xetla_vector< Ty, NElts > vals)
 SLM block store (transposed SLM scatter with 1 channel).
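A matching sketch for the SLM store overloads (same assumptions as the SLM load sketch):

    #include <xetla.hpp> // assumed umbrella header
    using namespace gpu::xetla;

    // Device-side fragment (runs inside an ESIMD kernel).
    inline void slm_store_sketch(xetla_vector<uint32_t, 16> offsets,
                                 xetla_vector<float, 16> vals) {
        // Scattered SLM store: one element per channel; mask defaults to all-on.
        xetla_store_local<float, 1>(offsets, vals);

        // Block SLM store: NElts = 16 contiguous elements at a scalar SLM offset.
        xetla_store_local<float, 16>(0u, vals);
    }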
 
template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size>
__XETLA_API xetla_vector< T, N > gpu::xetla::xetla_atomic_local (xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred)
 SLM scattered atomic (0 src).
 
template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size>
__XETLA_API xetla_vector< T, N > gpu::xetla::xetla_atomic_local (xetla_vector< uint32_t, N > offsets, xetla_vector< T, N > src0, xetla_mask< N > pred)
 SLM scattered atomic (1 src).
 
template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size>
__XETLA_API xetla_vector< T, N > gpu::xetla::xetla_atomic_local (xetla_vector< uint32_t, N > offsets, xetla_vector< T, N > src0, xetla_vector< T, N > src1, xetla_mask< N > pred)
 SLM scattered atomic (2 src).
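As with the global atomics, the operation is left as a template parameter in this sketch since the atomic_op enumerators are not listed here (include path and kernel context assumed):

    #include <xetla.hpp> // assumed umbrella header
    using namespace gpu::xetla;

    // Device-side fragment. Shows the 1-source form; the 0-source and
    // 2-source forms drop or add operand vectors as in the signatures above.
    template <atomic_op Op>
    inline xetla_vector<uint32_t, 16> atomic_local_sketch(
            xetla_vector<uint32_t, 16> offsets,
            xetla_vector<uint32_t, 16> src0, xetla_mask<16> pred) {
        return xetla_atomic_local<Op, uint32_t, 16>(offsets, src0, pred);
    }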
 
template<memory_kind Kind = memory_kind::untyped_global, fence_op FenceOp = fence_op::none, fence_scope Scope = fence_scope::group, int N = 16>
__XETLA_API void gpu::xetla::xetla_fence (xetla_mask< N > pred=1)
 Memory fence.
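A minimal sketch of issuing a fence with the defaults declared above (include path and kernel context assumed):

    #include <xetla.hpp> // assumed umbrella header
    using namespace gpu::xetla;

    // Device-side fragment (runs inside an ESIMD kernel).
    inline void fence_sketch() {
        // All defaults: untyped global memory, fence_op::none, group scope,
        // 16 channels with the predicate defaulted to all-on.
        xetla_fence();

        // The same call with the defaults spelled out explicitly.
        xetla_fence<memory_kind::untyped_global, fence_op::none,
                fence_scope::group>();
    }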
 
