XeTLA v0.3.6
Intel® Xe Templates for Linear Algebra - API Definition Document
 
Loading...
Searching...
No Matches
common.hpp
Go to the documentation of this file.
/*******************************************************************************
 * Copyright (c) 2022-2023 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
16
19
20#pragma once
21
22#include "common/core/core.hpp"
23
24namespace gpu::xetla {
25namespace detail {
26
/// @brief Get the element size code.
/// Maps an element size in bytes to the 2-bit code used by the other
/// `*_code` helpers in this namespace (1->0, 2->1, 4->2, 8->3).
/// @tparam element_size element size in bytes; must be 1, 2, 4 or 8.
/// @return the encoded size.
template <uint32_t element_size>
constexpr uint32_t get_element_size_code() {
    static_assert(element_size == 1 || element_size == 2 || element_size == 4
                    || element_size == 8,
            "element_size not supported!");
    switch (element_size) {
        case 1: return 0;
        case 2: return 1;
        case 4: return 2;
        case 8: return 3;
    }
    // Unreachable: every size accepted by the static_assert is handled above.
    // Keeps constexpr evaluation well-formed and -Wreturn-type quiet.
    return 0;
}
45
/// @brief Kind of LSC (load/store cache) access being validated or encoded.
enum class lsc_action : uint8_t { prefetch, load, store, atomic };
47
48template <lsc_action Action, cache_hint L1H, cache_hint L2H, gpu_arch arch_tag>
49constexpr std::enable_if_t<arch_tag == gpu_arch::Xe, void>
51 if constexpr (Action == lsc_action::prefetch) {
52 // https://gfxspecs.intel.com/Predator/Home/Index/53560
53 static_assert(
55 && (L1H == cache_hint::uncached
56 || L1H == cache_hint::cached
57 || L1H == cache_hint::streaming)),
58 "cache hint type not supported!");
59 } else if constexpr (Action == lsc_action::load) {
60 // https://gfxspecs.intel.com/Predator/Home/Index/53560
61 static_assert((L1H == cache_hint::none && L2H == cache_hint::none)
62 || ((L2H == cache_hint::uncached)
63 && (L1H == cache_hint::uncached
64 || L1H == cache_hint::cached
65 || L1H == cache_hint::streaming))
66 || ((L2H == cache_hint::cached)
67 && (L1H == cache_hint::uncached
68 || L1H == cache_hint::cached
69 || L1H == cache_hint::streaming
71 "unsupported cache hint!");
72 } else if constexpr (Action == lsc_action::store) {
73 // https://gfxspecs.intel.com/Predator/Home/Index/53561
74 static_assert((L1H == cache_hint::none && L2H == cache_hint::none)
75 || ((L2H == cache_hint::uncached)
76 && (L1H == cache_hint::uncached
78 || L1H == cache_hint::streaming))
79 || ((L2H == cache_hint::write_back)
80 && (L1H == cache_hint::uncached
82 || L1H == cache_hint::streaming
83 || L1H == cache_hint::write_back)),
84 "unsupported cache hint!");
85 } else if constexpr (Action == lsc_action::atomic) {
86 // https://gfxspecs.intel.com/Predator/Home/Index/53561
87 static_assert((L1H == cache_hint::none && L2H == cache_hint::none)
88 || (L1H == cache_hint::uncached
89 && (L2H == cache_hint::uncached
90 || L2H == cache_hint::write_back)),
91 "unsupported cache hint!");
92 }
93}
94
95template <cache_hint L1H, cache_hint L2H, gpu_arch arch_tag>
96constexpr std::enable_if_t<arch_tag == gpu_arch::Xe, uint32_t>
98 check_lsc_cache_hint<lsc_action::load, L1H, L2H, arch_tag>();
99 if (L1H == cache_hint::none && L2H == cache_hint::none) {
100 return 0;
101 } else if (L2H == cache_hint::uncached) {
102 if (L1H == cache_hint::uncached) { return 1; }
103 if (L1H == cache_hint::cached) { return 3; }
104 if (L1H == cache_hint::streaming) { return 5; }
105 } else if (L2H == cache_hint::cached) {
106 if (L1H == cache_hint::uncached) { return 2; }
107 if (L1H == cache_hint::cached) { return 4; }
108 if (L1H == cache_hint::streaming) { return 6; }
109 if (L1H == cache_hint::read_invalidate) { return 7; }
110 }
111}
112
113template <cache_hint L1H, cache_hint L2H, gpu_arch arch_tag>
114constexpr std::enable_if_t<arch_tag == gpu_arch::Xe, uint32_t>
116 check_lsc_cache_hint<lsc_action::prefetch, L1H, L2H, arch_tag>();
117 if (L2H == cache_hint::uncached) {
118 if (L1H == cache_hint::uncached) { return 1; }
119 if (L1H == cache_hint::cached) { return 3; }
120 if (L1H == cache_hint::streaming) { return 5; }
121 } else if (L2H == cache_hint::cached) {
122 if (L1H == cache_hint::uncached) { return 2; }
123 if (L1H == cache_hint::cached) { return 4; }
124 if (L1H == cache_hint::streaming) { return 6; }
125 }
126}
127
128template <cache_hint L1H, cache_hint L2H, gpu_arch arch_tag>
129constexpr std::enable_if_t<arch_tag == gpu_arch::Xe, uint32_t>
131 check_lsc_cache_hint<lsc_action::store, L1H, L2H, arch_tag>();
132 if (L1H == cache_hint::none && L2H == cache_hint::none) {
133 return 0;
134 } else if (L2H == cache_hint::uncached) {
135 if (L1H == cache_hint::uncached) { return 1; }
136 if (L1H == cache_hint::write_through) { return 3; }
137 if (L1H == cache_hint::streaming) { return 5; }
138 } else if (L2H == cache_hint::write_back) {
139 if (L1H == cache_hint::uncached) { return 2; }
140 if (L1H == cache_hint::write_through) { return 4; }
141 if (L1H == cache_hint::streaming) { return 6; }
142 if (L1H == cache_hint::write_back) { return 7; }
143 }
144}
145
146template <cache_hint L1H, cache_hint L2H, gpu_arch arch_tag>
147constexpr std::enable_if_t<arch_tag == gpu_arch::Xe, uint32_t>
149 check_lsc_cache_hint<lsc_action::atomic, L1H, L2H, arch_tag>();
150 if (L1H == cache_hint::none && L2H == cache_hint::none) {
151 return 0;
152 } else if (L2H == cache_hint::uncached) {
153 if (L1H == cache_hint::uncached) { return 1; }
154 if (L1H == cache_hint::write_through) { return 3; }
155 if (L1H == cache_hint::streaming) { return 5; }
156 } else if (L2H == cache_hint::write_back) {
157 if (L1H == cache_hint::uncached) { return 2; }
158 if (L1H == cache_hint::write_through) { return 4; }
159 if (L1H == cache_hint::streaming) { return 6; }
160 if (L1H == cache_hint::write_back) { return 7; }
161 }
162}
163
/// @brief Maps a channel count to its execution-size code
/// (1->0, 2->1, 4->2, 8->3, 16->4, 32->5).
/// @tparam num_channel number of channels; must be a power of two in [1, 32].
/// @return the encoded execution size.
template <uint32_t num_channel>
constexpr uint32_t get_execSize_code() {
    static_assert(num_channel == 1 || num_channel == 2 || num_channel == 4
                    || num_channel == 8 || num_channel == 16
                    || num_channel == 32,
            "num_channel not supported!");
    switch (num_channel) {
        case 1: return 0;
        case 2: return 1;
        case 4: return 2;
        case 8: return 3;
        case 16: return 4;
        case 32: return 5;
    }
    // Unreachable: every count accepted by the static_assert is handled above.
    return 0;
}
179
180template <atomic_op Op>
181constexpr uint32_t get_atomic_opcode() {
182 static_assert(Op == atomic_op::fadd || Op == atomic_op::fmax
183 || Op == atomic_op::iadd,
184 "Other atomic op didn't added");
185 switch (Op) {
186 case atomic_op::fadd: return 19;
187 case atomic_op::fmax: return 22;
188 case atomic_op::iadd: return 12;
189 }
190}
191
192} // namespace detail
193
/// @brief Tile layout of data held in registers.
/// linear: linear layout within one tile; tiled: 2D blocks stacked in raster
/// order; vnni_tiled: VNNI-packed blocks.
enum class reg_layout : uint8_t {
    linear = 0,
    tiled = 1,
    vnni_tiled = 2,
    transpose_tiled = 3,
    // VNNI tiled format, but each block is stored in col-major order.
    // NOTE(review): this enumerator was dropped by the document extraction
    // and restored from the file's own Doxygen index — confirm upstream.
    vnni_tiled_col_major = 4
};
/// @brief Kind of store operation performed when writing a tile out.
enum class store_op : uint8_t {
    normal = 0,
    atomic_fadd = 1,
    atomic_iadd = 2,
    // NOTE(review): the enumerator with value 3 was dropped by the document
    // extraction; reconstructed as scatter_transpose per the upstream XeTLA
    // header — confirm before relying on it.
    scatter_transpose = 3,
    block_1d = 4
};
/// @brief Compute engine used for the MMA: systolic XMX array or FPU.
enum class mma_engine : uint8_t { xmx = 0, fpu = 1 };
// enum class trans_mode : uint8_t { none = 0, transpose = 1 };
/// @brief Direction of a memory access.
enum class memory_op : uint8_t { load = 0, store = 1 };
/// @brief Axis along which a tensor descriptor is updated.
enum class tdesc_update_dir : uint8_t { x_dir = 0, y_dir = 1 };
/// @brief Elementwise post-processing kind applied after the main op.
enum class post_kind : uint8_t {
    none = 0,
    relu = 1,
    gelu = 2,
    gelu_bwd_w = 3,
    sigmoid = 4,
    tanh = 5
};
/// @brief Elementwise pre-processing kind applied before the main op.
enum class pre_kind : uint8_t { none = 0, bias_add = 1, res_add = 2 };
/// @brief How per-access offsets advance between uses.
enum class offset_mode : uint8_t {
    const_offset = 0,
    cyclic_offset = 1,
    // NOTE(review): the enumerator with value 2 was dropped by the document
    // extraction; reconstructed as acyclic_offset per the upstream XeTLA
    // header — confirm before relying on it.
    acyclic_offset = 2,
};
243
246template <typename kernel_t>
248 xetla_nbarrier_init<kernel_t::get_barrier_count()>();
249 xetla_local_init<kernel_t::get_slm_size()>();
250}
251
255template <uint32_t slm_size, uint32_t nbarrier_count>
257 xetla_nbarrier_init<nbarrier_count>();
258 xetla_local_init<slm_size>();
259}
260
261} // namespace gpu::xetla
C++ API.
constexpr uint32_t get_execSize_code()
Definition common.hpp:165
constexpr std::enable_if_t< arch_tag==gpu_arch::Xe, uint32_t > get_store_cache_hint_code()
Definition common.hpp:130
constexpr std::enable_if_t< arch_tag==gpu_arch::Xe, uint32_t > get_atomic_cache_hint_code()
Definition common.hpp:148
constexpr std::enable_if_t< arch_tag==gpu_arch::Xe, uint32_t > get_load_cache_hint_code()
Definition common.hpp:97
constexpr std::enable_if_t< arch_tag==gpu_arch::Xe, uint32_t > get_prefetch_cache_hint_code()
Definition common.hpp:115
constexpr std::enable_if_t< arch_tag==gpu_arch::Xe, void > check_lsc_cache_hint()
Definition common.hpp:50
constexpr uint32_t get_atomic_opcode()
Definition common.hpp:181
lsc_action
Definition common.hpp:46
constexpr uint32_t get_element_size_code()
Get the element size code object.
Definition common.hpp:34
Definition arch_config.hpp:24
post_kind
Definition common.hpp:229
reg_layout
tile layout in register linear: linear layout with one tile tiled: 2d block stacked in raster order v...
Definition common.hpp:209
@ vnni_tiled_col_major
this is vnni tiled format, but for each block, they are stored in col major order
mma_engine
Definition common.hpp:225
pre_kind
Definition common.hpp:237
@ iadd
Atomic signed int add of src1 from memory data and return the old value. see
@ fmax
Atomic store the float max of src1 and memory data and return the old value. see
@ fadd
Atomic float add of src1 from memory data and return the old value. see
@ store
Atomic store untyped data to memory. see
@ load
Atomic read of the memory data value, without modifying the data. see
memory_op
Definition common.hpp:227
tdesc_update_dir
Definition common.hpp:228
offset_mode
Definition common.hpp:238
void slm_barrier_init()
Initial the local memory size and named barrier count with kernel_t.
Definition common.hpp:247
store_op
Definition common.hpp:218