XeTLA v0.3.6
Intel® Xe Templates for Linear Algebra - API Definition Document
 
memory.hpp
/*******************************************************************************
* Copyright (c) 2022-2023 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once

namespace gpu::xetla {

namespace detail {

/// @brief lookup table for cache hint.
constexpr __ESIMD_ENS::cache_hint get_cache_hint(gpu::xetla::cache_hint ch) {
    switch (ch) {
        case gpu::xetla::cache_hint::none: return __ESIMD_ENS::cache_hint::none;
        case gpu::xetla::cache_hint::uncached:
            return __ESIMD_ENS::cache_hint::uncached;
        case gpu::xetla::cache_hint::cached:
            return __ESIMD_ENS::cache_hint::cached;
        case gpu::xetla::cache_hint::write_back:
            return __ESIMD_ENS::cache_hint::write_back;
        case gpu::xetla::cache_hint::write_through:
            return __ESIMD_ENS::cache_hint::write_through;
        case gpu::xetla::cache_hint::streaming:
            return __ESIMD_ENS::cache_hint::streaming;
        case gpu::xetla::cache_hint::read_invalidate:
            return __ESIMD_ENS::cache_hint::read_invalidate;
    }
}

/// @brief lookup table for data size.
constexpr __ESIMD_ENS::lsc_data_size get_data_size(gpu::xetla::data_size ds) {
    switch (ds) {
        case gpu::xetla::data_size::default_size:
            return __ESIMD_ENS::lsc_data_size::default_size;
        case gpu::xetla::data_size::u8: return __ESIMD_ENS::lsc_data_size::u8;
        case gpu::xetla::data_size::u16: return __ESIMD_ENS::lsc_data_size::u16;
        case gpu::xetla::data_size::u32: return __ESIMD_ENS::lsc_data_size::u32;
        case gpu::xetla::data_size::u64: return __ESIMD_ENS::lsc_data_size::u64;
        case gpu::xetla::data_size::u8u32:
            return __ESIMD_ENS::lsc_data_size::u8u32;
        case gpu::xetla::data_size::u16u32:
            return __ESIMD_ENS::lsc_data_size::u16u32;
        case gpu::xetla::data_size::u16u32h:
            return __ESIMD_ENS::lsc_data_size::u16u32h;
    }
}

/// @brief lookup table for memory kind.
constexpr __ESIMD_ENS::lsc_memory_kind get_memory_kind(
        gpu::xetla::memory_kind mk) {
    switch (mk) {
        case gpu::xetla::memory_kind::untyped_global:
            return __ESIMD_ENS::lsc_memory_kind::untyped_global;
        case gpu::xetla::memory_kind::untyped_global_low_pri:
            return __ESIMD_ENS::lsc_memory_kind::untyped_global_low_pri;
        case gpu::xetla::memory_kind::typed_global:
            return __ESIMD_ENS::lsc_memory_kind::typed_global;
        case gpu::xetla::memory_kind::shared_local:
            return __ESIMD_ENS::lsc_memory_kind::shared_local;
    }
}

/// @brief lookup table for fence op.
constexpr __ESIMD_ENS::lsc_fence_op get_fence_op(gpu::xetla::fence_op fo) {
    switch (fo) {
        case gpu::xetla::fence_op::none: return __ESIMD_ENS::lsc_fence_op::none;
        case gpu::xetla::fence_op::evict:
            return __ESIMD_ENS::lsc_fence_op::evict;
        case gpu::xetla::fence_op::invalidate:
            return __ESIMD_ENS::lsc_fence_op::invalidate;
        case gpu::xetla::fence_op::discard:
            return __ESIMD_ENS::lsc_fence_op::discard;
        case gpu::xetla::fence_op::clean:
            return __ESIMD_ENS::lsc_fence_op::clean;
        case gpu::xetla::fence_op::flushl2:
            return __ESIMD_ENS::lsc_fence_op::flushl3;
    }
}

/// @brief lookup table for fence scope.
constexpr __ESIMD_ENS::lsc_scope get_fence_scope(gpu::xetla::fence_scope fs) {
    switch (fs) {
        case gpu::xetla::fence_scope::group:
            return __ESIMD_ENS::lsc_scope::group;
        case gpu::xetla::fence_scope::local:
            return __ESIMD_ENS::lsc_scope::local;
        case gpu::xetla::fence_scope::tile: return __ESIMD_ENS::lsc_scope::tile;
        case gpu::xetla::fence_scope::gpu: return __ESIMD_ENS::lsc_scope::gpu;
        case gpu::xetla::fence_scope::gpus: return __ESIMD_ENS::lsc_scope::gpus;
        case gpu::xetla::fence_scope::system:
            return __ESIMD_ENS::lsc_scope::system;
        case gpu::xetla::fence_scope::sysacq:
            return __ESIMD_ENS::lsc_scope::sysacq;
    }
}

/// @brief lookup table for atomic op.
constexpr __ESIMD_NS::atomic_op get_atomic_op(gpu::xetla::atomic_op ao) {
    switch (ao) {
        case gpu::xetla::atomic_op::iinc: return __ESIMD_NS::atomic_op::inc;
        case gpu::xetla::atomic_op::idec: return __ESIMD_NS::atomic_op::dec;
        case gpu::xetla::atomic_op::iadd: return __ESIMD_NS::atomic_op::add;
        case gpu::xetla::atomic_op::isub: return __ESIMD_NS::atomic_op::sub;
        case gpu::xetla::atomic_op::smin: return __ESIMD_NS::atomic_op::smin;
        case gpu::xetla::atomic_op::smax: return __ESIMD_NS::atomic_op::smax;
        case gpu::xetla::atomic_op::umin: return __ESIMD_NS::atomic_op::umin;
        case gpu::xetla::atomic_op::umax: return __ESIMD_NS::atomic_op::umax;
        case gpu::xetla::atomic_op::cmpxchg:
            return __ESIMD_NS::atomic_op::cmpxchg;
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
        case gpu::xetla::atomic_op::fadd: return __ESIMD_NS::atomic_op::fadd;
        case gpu::xetla::atomic_op::fsub: return __ESIMD_NS::atomic_op::fsub;
        case gpu::xetla::atomic_op::fmin: return __ESIMD_NS::atomic_op::fmin;
        case gpu::xetla::atomic_op::fmax: return __ESIMD_NS::atomic_op::fmax;
        case gpu::xetla::atomic_op::fcmpxchg:
            return __ESIMD_NS::atomic_op::fcmpwr;
#pragma clang diagnostic pop
        case gpu::xetla::atomic_op::bit_and:
            return __ESIMD_NS::atomic_op::bit_and;
        case gpu::xetla::atomic_op::bit_or:
            return __ESIMD_NS::atomic_op::bit_or;
        case gpu::xetla::atomic_op::bit_xor:
            return __ESIMD_NS::atomic_op::bit_xor;
        case gpu::xetla::atomic_op::load: return __ESIMD_NS::atomic_op::load;
        case gpu::xetla::atomic_op::store: return __ESIMD_NS::atomic_op::store;
    }
}
} // namespace detail

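// Illustrative compile-time checks, not part of the original header: because
// the detail lookup tables above are constexpr, each XeTLA-to-ESIMD enum
// mapping can be verified with a static_assert at namespace scope, e.g.:
static_assert(detail::get_cache_hint(cache_hint::write_back)
                == __ESIMD_ENS::cache_hint::write_back,
        "cache_hint must map to the same-named ESIMD hint");
static_assert(detail::get_data_size(data_size::u32)
                == __ESIMD_ENS::lsc_data_size::u32,
        "data_size must map to the same-named LSC data size");
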
/// @brief Stateless scattered prefetch.
template <typename Ty, uint8_t NElts = 1,
        data_size DS = data_size::default_size,
        cache_hint L1H = cache_hint::cached,
        cache_hint L2H = cache_hint::cached, int N>
__XETLA_API void xetla_prefetch_global(
        Ty *p, xetla_vector<uint32_t, N> offsets, xetla_mask<N> pred = 1) {
    using T = native_type_t<Ty>;
    __ESIMD_ENS::lsc_prefetch<T, NElts, gpu::xetla::detail::get_data_size(DS),
            gpu::xetla::detail::get_cache_hint(L1H),
            gpu::xetla::detail::get_cache_hint(L2H), N>((T *)p, offsets, pred);
}

/// @brief Stateless block prefetch.
template <typename Ty, uint8_t NElts = 1,
        data_size DS = data_size::default_size,
        cache_hint L1H = cache_hint::cached,
        cache_hint L2H = cache_hint::cached>
__XETLA_API void xetla_prefetch_global(Ty *p, uint64_t offset = 0) {
    using T = native_type_t<Ty>;
    __ESIMD_ENS::lsc_prefetch<T, NElts, gpu::xetla::detail::get_data_size(DS),
            gpu::xetla::detail::get_cache_hint(L1H),
            gpu::xetla::detail::get_cache_hint(L2H)>(
            (T *)p + (offset / sizeof(T)));
}

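// Illustrative usage sketch, not part of the original header: warm L1/L2 for
// 32 consecutive floats before they are loaded. `src` is a hypothetical
// global-memory pointer; the block-prefetch offset is in bytes.
inline void example_prefetch_row(float *src) {
    xetla_prefetch_global<float, 16>(src, 0);
    xetla_prefetch_global<float, 16>(src, 16 * sizeof(float));
}
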
/// @brief Stateless scattered load.
template <typename Ty, uint8_t NElts = 1,
        data_size DS = data_size::default_size,
        cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none,
        int N, typename Toffset = uint32_t>
__XETLA_API xetla_vector<Ty, N * NElts> xetla_load_global(
        Ty *p, xetla_vector<Toffset, N> offsets, xetla_mask<N> pred = 1) {
    using T = native_type_t<Ty>;
    DEBUG_INVOKE(dbg_level::core,
            core::general_1d<gpu_arch::Xe,
                    Ty>::template check_restriction<NElts, N>(offsets,
                    (uint64_t)p));

    return __ESIMD_ENS::lsc_gather<T, NElts,
            gpu::xetla::detail::get_data_size(DS),
            gpu::xetla::detail::get_cache_hint(L1H),
            gpu::xetla::detail::get_cache_hint(L2H), N>((T *)p, offsets, pred);
}

/// @brief Stateless block load.
template <typename Ty, uint8_t NElts = 1,
        data_size DS = data_size::default_size,
        cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API xetla_vector<Ty, NElts> xetla_load_global(
        Ty *p, uint64_t offset = 0) {
    using T = native_type_t<Ty>;
    DEBUG_INVOKE(dbg_level::core,
            core::general_1d<gpu_arch::Xe,
                    Ty>::template check_restriction<NElts>(offset,
                    (uint64_t)p));

    return __ESIMD_ENS::lsc_block_load<T, NElts,
            gpu::xetla::detail::get_data_size(DS),
            gpu::xetla::detail::get_cache_hint(L1H),
            gpu::xetla::detail::get_cache_hint(L2H)>(
            (T *)p + (offset / sizeof(T)));
}

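// Illustrative usage sketch, not part of the original header: gather 16
// floats through per-lane byte offsets, then load the same range again as a
// contiguous block. `src` is a hypothetical global-memory pointer.
inline xetla_vector<float, 16> example_load_row(float *src) {
    // Per-lane byte offsets 0, 4, 8, ..., 60 select consecutive floats.
    xetla_vector<uint32_t, 16> byte_offsets(0, sizeof(float));
    xetla_vector<float, 16> gathered = xetla_load_global<float, 1,
            data_size::default_size, cache_hint::cached, cache_hint::cached,
            16>(src, byte_offsets);
    // The same contiguous range as a single block load of 16 elements.
    xetla_vector<float, 16> block = xetla_load_global<float, 16>(src, 0);
    return gathered + block;
}
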
/// @brief Stateless scattered store.
template <typename Ty, uint8_t NElts = 1,
        data_size DS = data_size::default_size,
        cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none,
        int N, typename Toffset = uint32_t>
__XETLA_API void xetla_store_global(Ty *p, xetla_vector<Toffset, N> offsets,
        xetla_vector<Ty, N * NElts> vals, xetla_mask<N> pred = 1) {
    using T = native_type_t<Ty>;
    __ESIMD_ENS::lsc_scatter<T, NElts, gpu::xetla::detail::get_data_size(DS),
            gpu::xetla::detail::get_cache_hint(L1H),
            gpu::xetla::detail::get_cache_hint(L2H), N>(
            (T *)p, offsets, vals, pred);
}

/// @brief Stateless block store.
template <typename Ty, uint8_t NElts = 1,
        data_size DS = data_size::default_size,
        cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API void xetla_store_global(
        Ty *p, uint64_t offset, xetla_vector<Ty, NElts> vals) {
    using T = native_type_t<Ty>;
    __ESIMD_ENS::lsc_block_store<T, NElts,
            gpu::xetla::detail::get_data_size(DS),
            gpu::xetla::detail::get_cache_hint(L1H),
            gpu::xetla::detail::get_cache_hint(L2H)>(
            (T *)p + (offset / sizeof(T)), vals);
}

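// Illustrative usage sketch, not part of the original header: scatter one
// float per lane through byte offsets, then overwrite the same range with a
// single contiguous block store. `dst` is a hypothetical global pointer.
inline void example_store_row(float *dst, xetla_vector<float, 16> vals) {
    xetla_vector<uint32_t, 16> byte_offsets(0, sizeof(float));
    xetla_store_global<float, 1, data_size::default_size,
            cache_hint::write_back, cache_hint::write_back, 16>(
            dst, byte_offsets, vals);
    xetla_store_global<float, 16>(dst, 0, vals);
}
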
/// @brief Stateless scattered atomic (0 src).
template <atomic_op Op, typename T, int N,
        data_size DS = data_size::default_size,
        cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API xetla_vector<T, N> xetla_atomic_global(
        T *p, xetla_vector<uint32_t, N> offsets, xetla_mask<N> pred) {
    static_assert(!(is_internal_type<T>::value),
            "The internal types are not yet supported!");
    return __ESIMD_ENS::lsc_atomic_update<gpu::xetla::detail::get_atomic_op(Op),
            T, N, gpu::xetla::detail::get_data_size(DS),
            gpu::xetla::detail::get_cache_hint(L1H),
            gpu::xetla::detail::get_cache_hint(L2H)>(p, offsets, pred);
}

/// @brief Stateless scattered atomic (1 src).
template <atomic_op Op, typename T, int N,
        data_size DS = data_size::default_size,
        cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API xetla_vector<T, N> xetla_atomic_global(T *p,
        xetla_vector<uint32_t, N> offsets, xetla_vector<T, N> src0,
        xetla_mask<N> pred) {
    static_assert(!(is_internal_type<T>::value),
            "The internal types are not yet supported!");
    return __ESIMD_ENS::lsc_atomic_update<gpu::xetla::detail::get_atomic_op(Op),
            T, N, gpu::xetla::detail::get_data_size(DS),
            gpu::xetla::detail::get_cache_hint(L1H),
            gpu::xetla::detail::get_cache_hint(L2H)>(p, offsets, src0, pred);
}

/// @brief Stateless scattered atomic (2 src).
template <atomic_op Op, typename T, int N,
        data_size DS = data_size::default_size,
        cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API xetla_vector<T, N> xetla_atomic_global(T *p,
        xetla_vector<uint32_t, N> offsets, xetla_vector<T, N> src0,
        xetla_vector<T, N> src1, xetla_mask<N> pred) {
    static_assert(!(is_internal_type<T>::value),
            "The internal types are not yet supported!");
    return __ESIMD_ENS::lsc_atomic_update<gpu::xetla::detail::get_atomic_op(Op),
            T, N, gpu::xetla::detail::get_data_size(DS),
            gpu::xetla::detail::get_cache_hint(L1H),
            gpu::xetla::detail::get_cache_hint(L2H)>(
            p, offsets, src0, src1, pred);
}
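
// Illustrative usage sketch, not part of the original header: per-lane atomic
// float add of 1.0f into 16 global accumulators; fadd takes a single source
// operand, so the one-source overload applies. `acc` is hypothetical.
inline xetla_vector<float, 16> example_atomic_accumulate(float *acc) {
    xetla_vector<uint32_t, 16> byte_offsets(0, sizeof(float));
    xetla_vector<float, 16> ones(1.0f);
    xetla_mask<16> pred = 1;
    return xetla_atomic_global<atomic_op::fadd, float, 16>(
            acc, byte_offsets, ones, pred);
}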

/// @brief Declare per-work-group SLM size.
template <uint32_t SLMSize>
__XETLA_API void xetla_local_init() {
    if constexpr (SLMSize != 0) { __ESIMD_NS::slm_init(SLMSize); }
}

/// @brief SLM scattered load.
template <typename Ty, uint8_t NElts = 1,
        data_size DS = data_size::default_size, int N>
__XETLA_API xetla_vector<Ty, N * NElts> xetla_load_local(
        xetla_vector<uint32_t, N> offsets, xetla_mask<N> pred = 1) {
    using T = native_type_t<Ty>;
    DEBUG_INVOKE(dbg_level::core,
            core::general_1d<gpu_arch::Xe,
                    Ty>::template check_restriction<NElts, N>(offsets));

    return __ESIMD_ENS::lsc_slm_gather<T, NElts,
            gpu::xetla::detail::get_data_size(DS), N>(
            xetla_cvt<uint64_t, uint32_t>(offsets), pred);
}

/// @brief SLM block load.
template <typename Ty, uint8_t NElts = 1,
        data_size DS = data_size::default_size>
__XETLA_API xetla_vector<Ty, NElts> xetla_load_local(uint32_t offset) {
    using T = native_type_t<Ty>;
    DEBUG_INVOKE(dbg_level::core,
            core::general_1d<gpu_arch::Xe,
                    Ty>::template check_restriction<NElts>((uint64_t)offset));

    return __ESIMD_ENS::lsc_slm_block_load<T, NElts,
            gpu::xetla::detail::get_data_size(DS)>(offset);
}

/// @brief SLM scattered store.
template <typename Ty, uint8_t NElts = 1,
        data_size DS = data_size::default_size, int N>
__XETLA_API void xetla_store_local(xetla_vector<uint32_t, N> offsets,
        xetla_vector<Ty, N * NElts> vals, xetla_mask<N> pred = 1) {
    using T = native_type_t<Ty>;
    DEBUG_INVOKE(dbg_level::core,
            core::general_1d<gpu_arch::Xe, Ty>::template check_restriction<
                    NElts, N, uint32_t>(offsets));

    __ESIMD_ENS::lsc_slm_scatter<T, NElts,
            gpu::xetla::detail::get_data_size(DS), N>(offsets, vals, pred);
}

/// @brief SLM block store.
template <typename Ty, uint8_t NElts = 1,
        data_size DS = data_size::default_size>
__XETLA_API void xetla_store_local(
        uint32_t offset, xetla_vector<Ty, NElts> vals) {
    using T = native_type_t<Ty>;
    DEBUG_INVOKE(dbg_level::core,
            core::general_1d<gpu_arch::Xe,
                    Ty>::template check_restriction<NElts>(offset));

    __ESIMD_ENS::lsc_slm_block_store<T, NElts,
            gpu::xetla::detail::get_data_size(DS)>(offset, vals);
}

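// Illustrative usage sketch, not part of the original header: stage a vector
// through shared local memory with a block store followed by a block load.
// Assumes xetla_local_init<>() reserved sufficient SLM for the work-group.
inline xetla_vector<float, 16> example_slm_roundtrip(
        xetla_vector<float, 16> vals) {
    constexpr uint32_t slm_byte_offset = 0;
    xetla_store_local<float, 16>(slm_byte_offset, vals);
    return xetla_load_local<float, 16>(slm_byte_offset);
}
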
/// @brief SLM scattered atomic (0 src).
template <atomic_op Op, typename T, int N,
        data_size DS = data_size::default_size>
__XETLA_API xetla_vector<T, N> xetla_atomic_local(
        xetla_vector<uint32_t, N> offsets, xetla_mask<N> pred) {
    static_assert(!(is_internal_type<T>::value),
            "The internal types are not yet supported!");
    return __ESIMD_ENS::lsc_slm_atomic_update<gpu::xetla::detail::get_atomic_op(
                                                      Op),
            T, N, gpu::xetla::detail::get_data_size(DS)>(offsets, pred);
}

/// @brief SLM scattered atomic (1 src).
template <atomic_op Op, typename T, int N,
        data_size DS = data_size::default_size>
__XETLA_API xetla_vector<T, N> xetla_atomic_local(
        xetla_vector<uint32_t, N> offsets, xetla_vector<T, N> src0,
        xetla_mask<N> pred) {
    static_assert(!(is_internal_type<T>::value),
            "The internal types are not yet supported!");
    return __ESIMD_ENS::lsc_slm_atomic_update<gpu::xetla::detail::get_atomic_op(
                                                      Op),
            T, N, gpu::xetla::detail::get_data_size(DS)>(offsets, src0, pred);
}

/// @brief SLM scattered atomic (2 src).
template <atomic_op Op, typename T, int N,
        data_size DS = data_size::default_size>
__XETLA_API xetla_vector<T, N> xetla_atomic_local(
        xetla_vector<uint32_t, N> offsets, xetla_vector<T, N> src0,
        xetla_vector<T, N> src1, xetla_mask<N> pred) {
    static_assert(!(is_internal_type<T>::value),
            "The internal types are not yet supported!");
    return __ESIMD_ENS::lsc_slm_atomic_update<gpu::xetla::detail::get_atomic_op(
                                                      Op),
            T, N, gpu::xetla::detail::get_data_size(DS)>(
            offsets, src0, src1, pred);
}

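// Illustrative usage sketch, not part of the original header: per-lane
// unsigned-int atomic add into 16 SLM counters via the one-source overload.
inline xetla_vector<uint32_t, 16> example_slm_atomic_add(
        xetla_vector<uint32_t, 16> src0) {
    xetla_vector<uint32_t, 16> byte_offsets(0, sizeof(uint32_t));
    xetla_mask<16> pred = 1;
    return xetla_atomic_local<atomic_op::iadd, uint32_t, 16>(
            byte_offsets, src0, pred);
}
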
/// @brief Memory fence.
template <memory_kind Kind = memory_kind::untyped_global,
        fence_op FenceOp = fence_op::none,
        fence_scope Scope = fence_scope::group, int N = 16>
__XETLA_API void xetla_fence(xetla_mask<N> pred = 1) {
    __ESIMD_ENS::lsc_fence<gpu::xetla::detail::get_memory_kind(Kind),
            gpu::xetla::detail::get_fence_op(FenceOp),
            gpu::xetla::detail::get_fence_scope(Scope), N>(pred);
}
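
// Illustrative usage sketch, not part of the original header: make prior
// global writes visible to the whole work-group; a thread barrier (e.g.,
// __ESIMD_NS::barrier()) would typically follow in kernel code.
inline void example_fence_global() {
    xetla_fence<memory_kind::untyped_global, fence_op::none,
            fence_scope::group>();
}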

} // namespace gpu::xetla

#define __XETLA_API
Definition common.hpp:43
#define DEBUG_INVOKE(level,...)
Definition debug.hpp:180
typename native_type< T >::type native_type_t
Return the native data type of T.
Definition base_types.hpp:106
__ESIMD_NS::simd< native_type_t< Ty >, N > xetla_vector
wrapper for xetla_vector.
Definition base_types.hpp:149
__ESIMD_NS::simd_mask< N > xetla_mask
wrapper for xetla_mask.
Definition base_types.hpp:165
__XETLA_API xetla_vector< T, N > xetla_atomic_local(xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred)
SLM scattered atomic (0 src).
Definition memory.hpp:568
__XETLA_API void xetla_fence(xetla_mask< N > pred=1)
Memory fence.
Definition memory.hpp:638
__XETLA_API void xetla_prefetch_global(Ty *p, xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred=1)
Stateless scattered prefetch.
Definition memory.hpp:187
__XETLA_API void xetla_local_init()
Declare per-work-group slm size.
Definition memory.hpp:443
__XETLA_API xetla_vector< Ty, N *NElts > xetla_load_global(Ty *p, xetla_vector< Toffset, N > offsets, xetla_mask< N > pred=1)
Stateless scattered load.
Definition memory.hpp:245
__XETLA_API xetla_vector< Ty, N *NElts > xetla_load_local(xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred=1)
SLM scattered load.
Definition memory.hpp:464
__XETLA_API xetla_vector< T, N > xetla_atomic_global(T *p, xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred)
Stateless scattered atomic (0 src).
Definition memory.hpp:371
__XETLA_API void xetla_store_local(xetla_vector< uint32_t, N > offsets, xetla_vector< Ty, N *NElts > vals, xetla_mask< N > pred=1)
SLM scattered store.
Definition memory.hpp:518
__XETLA_API void xetla_store_global(Ty *p, xetla_vector< Toffset, N > offsets, xetla_vector< Ty, N *NElts > vals, xetla_mask< N > pred=1)
Stateless scattered store.
Definition memory.hpp:316
constexpr __ESIMD_ENS::lsc_scope get_fence_scope(gpu::xetla::fence_scope fs)
lookup table for fence scope.
Definition memory.hpp:111
constexpr __ESIMD_NS::atomic_op get_atomic_op(gpu::xetla::atomic_op ao)
lookup table for atomic op.
Definition memory.hpp:130
constexpr __ESIMD_ENS::lsc_data_size get_data_size(gpu::xetla::data_size ds)
lookup table for data size.
Definition memory.hpp:55
constexpr __ESIMD_ENS::lsc_fence_op get_fence_op(gpu::xetla::fence_op fo)
lookup table for fence op.
Definition memory.hpp:92
constexpr __ESIMD_ENS::lsc_memory_kind get_memory_kind(gpu::xetla::memory_kind mk)
lookup table for memory kind.
Definition memory.hpp:75
constexpr __ESIMD_ENS::cache_hint get_cache_hint(gpu::xetla::cache_hint ch)
lookup table for cache hint.
Definition memory.hpp:34
cache_hint
L1 or L2 cache hint kinds.
Definition common.hpp:89
data_size
Data size or format to read or store.
Definition common.hpp:100
@ u16u32h
load 16b into the high 16 bits of 32b; store the opposite
@ u16u32
load 16b, zero extend to 32b; store the opposite
@ u8u32
load 8b, zero extend to 32b; store the opposite
fence_op
The xetla_fence operation to apply to caches.
Definition common.hpp:120
@ clean
dirty lines are written to memory, but retained in cache
@ flushl2
flush only the L2 cache
@ discard
direct and clean lines are discarded w/o eviction
@ invalidate
invalidate all clean lines
@ evict
dirty lines evicted and invalidated from L1
@ none
no operation
fence_scope
The scope that xetla_fence operation should apply to.
Definition common.hpp:130
@ gpu
entire GPU, flush out to the GPUs LLC
@ tile
tile, flush out to several DSSs
@ gpus
all GPUs in the system, flush out to memory shared by all GPUs
@ sysacq
the entire system memory space, with system-acquire semantics
@ system
the entire system memory space
@ local
flush out to the local scope
@ group
flush out to the threadgroup's scope
memory_kind
The specific LSC shared function to fence with xetla_fence.
Definition common.hpp:112
@ typed_global
typed global memory
@ untyped_global_low_pri
low-priority untyped global memory
@ shared_local
shared local memory
@ untyped_global
untyped global memory
atomic_op
Represents an atomic operation.
Definition common.hpp:142
@ umin
Atomic store the unsigned int min of src1 and memory data and return the old value.
@ fsub
Atomic float subtract of src1 from memory data and return the old value.
@ bit_or
Atomic store the bitwise OR of src1 and memory data and return the old value.
@ iadd
Atomic signed int add of src1 to memory data and return the old value.
@ smin
Atomic store the signed int min of src1 and memory data and return the old value.
@ cmpxchg
Atomic bit-compare of src1_X and memory data; replace with src1_Y if equal. Returns the old value.
@ fmax
Atomic store the float max of src1 and memory data and return the old value.
@ fadd
Atomic float add of src1 to memory data and return the old value.
@ idec
Atomic decrement of memory data and return the old value.
@ umax
Atomic store the unsigned int max of src1 and memory data and return the old value.
@ store
Atomic store untyped data to memory.
@ fmin
Atomic store the float min of src1 and memory data and return the old value.
@ bit_and
Atomic store the bitwise AND of src1 and memory data and return the old value.
@ iinc
Atomic increment of memory data and return the old value.
@ smax
Atomic store the signed int max of src1 and memory data and return the old value.
@ bit_xor
Atomic store the bitwise XOR of src1 and memory data and return the old value.
@ isub
Atomic signed int subtract of src1 from memory data and return the old value.
@ fcmpxchg
Atomic float compare of src1_X and memory data; replace with src1_Y if equal. Returns the old value.
@ load
Atomic read of the memory data value, without modifying the data.
is_internal_type
Used to check if the type is an xetla internal data type.
Definition base_types.hpp:67