C++ API. More...
#include "common/core/base_ops.hpp"
#include "common/core/base_types.hpp"
#include "common/core/common.hpp"
#include "common/utils/limitation.hpp"

Go to the source code of this file.
Namespaces | |
| namespace | gpu |
| namespace | gpu::xetla |
| namespace | gpu::xetla::detail |
Functions | |
| constexpr __ESIMD_ENS::cache_hint | gpu::xetla::detail::get_cache_hint (gpu::xetla::cache_hint ch) |
| lookup table for cache hint. | |
| constexpr __ESIMD_ENS::lsc_data_size | gpu::xetla::detail::get_data_size (gpu::xetla::data_size ds) |
| lookup table for data size. | |
| constexpr __ESIMD_ENS::lsc_memory_kind | gpu::xetla::detail::get_memory_kind (gpu::xetla::memory_kind mk) |
| lookup table for memory kind. | |
| constexpr __ESIMD_ENS::lsc_fence_op | gpu::xetla::detail::get_fence_op (gpu::xetla::fence_op fo) |
| lookup table for fence op. | |
| constexpr __ESIMD_ENS::lsc_scope | gpu::xetla::detail::get_fence_scope (gpu::xetla::fence_scope fs) |
| lookup table for fence scope. | |
| constexpr __ESIMD_NS::atomic_op | gpu::xetla::detail::get_atomic_op (gpu::xetla::atomic_op ao) |
| lookup table for atomic op. | |
| template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::cached, cache_hint L2H = cache_hint::cached, int N> | |
| __XETLA_API void | gpu::xetla::xetla_prefetch_global (Ty *p, xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred=1) |
| Stateless scattered prefetch. | |
| template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::cached, cache_hint L2H = cache_hint::cached> | |
| __XETLA_API void | gpu::xetla::xetla_prefetch_global (Ty *p, uint64_t offset=0) |
| Stateless block prefetch (transposed gather with 1 channel). | |
| template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, int N, typename Toffset = uint32_t> | |
| __XETLA_API xetla_vector< Ty, N *NElts > | gpu::xetla::xetla_load_global (Ty *p, xetla_vector< Toffset, N > offsets, xetla_mask< N > pred=1) |
| Stateless scattered load. | |
| template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none> | |
| __XETLA_API xetla_vector< Ty, NElts > | gpu::xetla::xetla_load_global (Ty *p, uint64_t offset=0) |
| Stateless block load (transposed gather with 1 channel). | |
| template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, int N, typename Toffset = uint32_t> | |
| __XETLA_API void | gpu::xetla::xetla_store_global (Ty *p, xetla_vector< Toffset, N > offsets, xetla_vector< Ty, N *NElts > vals, xetla_mask< N > pred=1) |
| Stateless scattered store. | |
| template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none> | |
| __XETLA_API void | gpu::xetla::xetla_store_global (Ty *p, uint64_t offset, xetla_vector< Ty, NElts > vals) |
| Stateless block store (transposed scatter with 1 channel). | |
| template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none> | |
| __XETLA_API xetla_vector< T, N > | gpu::xetla::xetla_atomic_global (T *p, xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred) |
| Stateless scattered atomic (0 src). | |
| template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none> | |
| __XETLA_API xetla_vector< T, N > | gpu::xetla::xetla_atomic_global (T *p, xetla_vector< uint32_t, N > offsets, xetla_vector< T, N > src0, xetla_mask< N > pred) |
| Stateless scattered atomic (1 src). | |
| template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none> | |
| __XETLA_API xetla_vector< T, N > | gpu::xetla::xetla_atomic_global (T *p, xetla_vector< uint32_t, N > offsets, xetla_vector< T, N > src0, xetla_vector< T, N > src1, xetla_mask< N > pred) |
| Stateless scattered atomic (2 src). | |
| template<uint32_t SLMSize> | |
| __XETLA_API void | gpu::xetla::xetla_local_init () |
| Declare per-work-group slm size. | |
| template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, int N> | |
| __XETLA_API xetla_vector< Ty, N *NElts > | gpu::xetla::xetla_load_local (xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred=1) |
| SLM scattered load. | |
| template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size> | |
| __XETLA_API xetla_vector< Ty, NElts > | gpu::xetla::xetla_load_local (uint32_t offset) |
| SLM block load. | |
| template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, int N> | |
| __XETLA_API void | gpu::xetla::xetla_store_local (xetla_vector< uint32_t, N > offsets, xetla_vector< Ty, N *NElts > vals, xetla_mask< N > pred=1) |
| SLM scattered store. | |
| template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size> | |
| __XETLA_API void | gpu::xetla::xetla_store_local (uint32_t offset, xetla_vector< Ty, NElts > vals) |
| SLM block store (transposed SLM scatter with 1 channel). | |
| template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size> | |
| __XETLA_API xetla_vector< T, N > | gpu::xetla::xetla_atomic_local (xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred) |
| SLM scattered atomic (0 src). | |
| template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size> | |
| __XETLA_API xetla_vector< T, N > | gpu::xetla::xetla_atomic_local (xetla_vector< uint32_t, N > offsets, xetla_vector< T, N > src0, xetla_mask< N > pred) |
| SLM scattered atomic (1 src). | |
| template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size> | |
| __XETLA_API xetla_vector< T, N > | gpu::xetla::xetla_atomic_local (xetla_vector< uint32_t, N > offsets, xetla_vector< T, N > src0, xetla_vector< T, N > src1, xetla_mask< N > pred) |
| SLM scattered atomic (2 src). | |
| template<memory_kind Kind = memory_kind::untyped_global, fence_op FenceOp = fence_op::none, fence_scope Scope = fence_scope::group, int N = 16> | |
| __XETLA_API void | gpu::xetla::xetla_fence (xetla_mask< N > pred=1) |
| Memory fence. | |
C++ API.