/// @brief Tensor descriptor construction (global memory version), raw_send_load_store.hpp:52.
template <
        typename Ty, uint32_t block_width = 1, uint32_t block_height = 1,
        uint8_t array_len = 1>
__XETLA_API void xetla_fill_tdesc(xetla_tdescriptor_ref tdesc, Ty *p,
        int tensor_width, int tensor_height, int tensor_pitch, int offset_x,
        int offset_y) {
    // ... base address, width, height, pitch and offset setters elided in this listing ...
    uint32_t block_widthx_widthy_arrlen = (block_width - 1)
            | ((block_height - 1) << 8) | ((array_len - 1) << 16);
    xetla_set_block_widthx_widthy_arrlen(tdesc, block_widthx_widthy_arrlen);
}
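The packed block field is the only descriptor field fully visible in this listing: bits [7:0] hold block_width - 1, bits [15:8] hold block_height - 1, and bits [23:16] hold array_len - 1. A small standalone C++ check of that layout (the helper name pack_block is invented for illustration; no XeTLA headers are needed):

#include <cstdint>
#include <cstdio>

// Mirrors the packing in xetla_fill_tdesc: block width-1 in bits [7:0],
// block height-1 in bits [15:8], array length-1 in bits [23:16].
constexpr uint32_t pack_block(
        uint32_t block_width, uint32_t block_height, uint32_t array_len) {
    return (block_width - 1) | ((block_height - 1) << 8)
            | ((array_len - 1) << 16);
}

static_assert(pack_block(16, 8, 2) == (15u | (7u << 8) | (1u << 16)),
        "packed block field layout");

int main() {
    // A 32x16 block with array length 1 packs to 0x000F1F.
    std::printf("0x%06X\n", pack_block(32, 16, 1));
    return 0;
}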
/// A second descriptor-construction overload (raw_send_load_store.hpp:80) takes
/// a 32-bit base address instead of a raw pointer; the rest of its parameter
/// list mirrors the pointer version:
///     (..., uint32_t base_address, int tensor_width, int tensor_height,
///      int tensor_pitch, int offset_x, int offset_y)
/// @brief Generate a new tensor descriptor (global memory version), raw_send_load_store.hpp:106.
template <
        typename Ty, uint32_t block_width = 1, uint32_t block_height = 1,
        uint8_t array_len = 1>
__XETLA_API xetla_tdescriptor xetla_get_tdesc(Ty *p, int tensor_width,
        int tensor_height, int tensor_pitch, int offset_x, int offset_y) {
    xetla_tdescriptor tdesc;
    auto tdesc_ref = tdesc.xetla_format<uint32_t>();
    // ... base address, width, height, pitch and offset setters elided in this listing ...
    uint32_t block_widthx_widthy_arrlen = (block_width - 1)
            | ((block_height - 1) << 8) | ((array_len - 1) << 16);
    xetla_set_block_widthx_widthy_arrlen(tdesc_ref, block_widthx_widthy_arrlen);
    return tdesc;
}
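A hedged usage sketch of the descriptor constructor together with the offset updaters from the reference list at the bottom of this page. It assumes the XeTLA headers and the gpu::xetla namespace are in scope and that the code runs inside an ESIMD kernel; the tensor shape, block shape, and element type are invented:

// Sketch only. Assumes XeTLA headers / gpu::xetla namespace and an ESIMD
// kernel context; the 1024x512 float tensor and 16x8 block are invented.
void build_descriptor(float *ptr) {
    xetla_tdescriptor tdesc;
    // Row-major tensor whose pitch equals its width; block window starts at (0, 0).
    xetla_fill_tdesc<float, 16, 8>(tdesc.xetla_format<uint32_t>(), ptr,
            /*tensor_width*/ 1024, /*tensor_height*/ 512,
            /*tensor_pitch*/ 1024, /*offset_x*/ 0, /*offset_y*/ 0);

    // Slide the block window 16 columns right and 8 rows down for the next access.
    xetla_update_tdesc_offsetx(tdesc.xetla_format<uint32_t>(), 16);
    xetla_update_tdesc_offsety(tdesc.xetla_format<uint32_t>(), 8);
}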
/// A further descriptor helper (raw_send_load_store.hpp:134) is templated only on
/// the element type, takes the same shape parameters (tensor_width, tensor_height,
/// tensor_pitch, offset_x, ...), and likewise views the descriptor as uint32_t:
template <typename Ty>
// ... signature elided in this listing ...
    auto tdesc_ref = tdesc.xetla_format<uint32_t>();
/// @brief Tensor load API, raw_send_load_store.hpp:183.
/// (Template header with Ty, N, the L1H/L2H cache hints, the transpose/transform
///  flags, and arch_tag is elided in this listing.)
__XETLA_API std::enable_if_t<arch_tag == gpu_arch::Xe, xetla_vector<Ty, N>>
xetla_tload_global(xetla_tdescriptor tdesc) {
    // Destination register count: one per 64 bytes of data, capped at 31.
    constexpr uint32_t numDst = 31 < ((N * sizeof(Ty) + 63) / 64)
            ? 31
            : ((N * sizeof(Ty) + 63) / 64);
    uint32_t msg_desc = 3; // 2D block load opcode
    msg_desc |= (transform ? 1 : 0) << 7;
    msg_desc |= detail::get_element_size_code<sizeof(Ty)>() << 9;
    msg_desc |= (transpose ? 1 : 0) << 15;
    msg_desc |= detail::get_load_cache_hint_code<L1H, L2H, arch_tag>() << 17;
    msg_desc |= numDst << 20;

    constexpr uint32_t numSrc0 = 1;
    constexpr uint32_t execSize = 0;
    constexpr uint32_t sfid = 0xF;
    constexpr uint32_t exDesc = 0;

    // The raw send always returns at least a full 32-byte vector.
    constexpr uint32_t ret_N = (N * sizeof(Ty)) >= 32 ? N : 32 / sizeof(Ty);
    xetla_vector<Ty, ret_N> ret
            = xetla_raw_send<Ty, ret_N, uint32_t, 16, execSize, sfid, numSrc0,
                    numDst>(tdesc, exDesc, msg_desc);
    return ret.xetla_select<N, 1>(0);
}
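The message descriptor assembled above is a plain bitfield; the following host-side sketch reproduces the same field positions. The values passed for the element-size and cache-hint codes are placeholders for the detail:: helpers, whose encodings are not part of this listing:

#include <cstdint>
#include <cstdio>

// Rebuilds the load message descriptor with the shifts used above. The
// element_size_code and cache_hint_code arguments stand in for the detail::
// helpers, which are not reproduced on this page.
uint32_t load_msg_desc(bool transform, bool transpose,
        uint32_t element_size_code, uint32_t cache_hint_code, uint32_t numDst) {
    uint32_t msg_desc = 3;                   // 2D block load opcode
    msg_desc |= (transform ? 1u : 0u) << 7;  // transform flag
    msg_desc |= element_size_code << 9;      // element size field
    msg_desc |= (transpose ? 1u : 0u) << 15; // transpose flag
    msg_desc |= cache_hint_code << 17;       // L1/L2 cache hint field
    msg_desc |= numDst << 20;                // destination register count
    return msg_desc;
}

int main() {
    // 64 4-byte elements: numDst = min(31, (64 * 4 + 63) / 64) = 4.
    constexpr uint32_t N = 64, elem = 4;
    constexpr uint32_t numDst
            = 31 < (N * elem + 63) / 64 ? 31 : (N * elem + 63) / 64;
    std::printf("msg_desc = 0x%08X\n",
            load_msg_desc(false, false, /*element_size_code*/ 2,
                    /*cache_hint_code*/ 0, numDst));
    return 0;
}

The ret_N clamp right above serves a different purpose: it guarantees the raw send always returns at least a full 32-byte vector, from which the first N elements are then selected.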
/// @brief Tensor store API, raw_send_load_store.hpp:227.
/// (Template header with Ty, N, the L1H/L2H cache hints, and arch_tag is elided
///  in this listing.)
__XETLA_API std::enable_if_t<arch_tag == gpu_arch::Xe, void>
xetla_tstore_global(xetla_tdescriptor tdesc, xetla_vector<Ty, N> data) {
    uint32_t msg_desc = 7; // 2D block store opcode
    msg_desc |= detail::get_element_size_code<sizeof(Ty)>() << 9;
    msg_desc |= detail::get_store_cache_hint_code<L1H, L2H, arch_tag>() << 17;

    // Payload register count: one per 64 bytes of data.
    constexpr uint32_t numSrc1 = (N * sizeof(Ty) + 63) / 64;
    constexpr uint32_t numSrc0 = 1;
    constexpr uint32_t execSize = 0;
    constexpr uint32_t sfid = 0xF;
    constexpr uint32_t exDesc = 0;

    xetla_raw_send<uint32_t, 16, Ty, N, execSize, sfid, numSrc0, numSrc1>(
            tdesc, data, exDesc, msg_desc);
}
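A hedged end-to-end sketch of a block copy through these two entry points, using the signatures from the reference list below. The explicit <element type, element count> template argument order, the cache-hint defaults, and the tensor/block shapes are assumptions:

// Sketch only. Assumes XeTLA headers / gpu::xetla namespace, an ESIMD kernel
// context, and that Ty and N are the leading template parameters of the
// load/store entry points. Shapes and the float element type are invented.
void copy_block(float *src, float *dst) {
    constexpr uint32_t block_w = 16, block_h = 8;

    auto src_desc = xetla_get_tdesc<float, block_w, block_h>(
            src, 1024, 512, 1024, /*offset_x*/ 0, /*offset_y*/ 0);
    auto dst_desc = xetla_get_tdesc<float, block_w, block_h>(
            dst, 1024, 512, 1024, /*offset_x*/ 0, /*offset_y*/ 0);

    // One 16x8 tile in, one 16x8 tile out.
    auto tile = xetla_tload_global<float, block_w * block_h>(src_desc);
    xetla_tstore_global<float, block_w * block_h>(dst_desc, tile);
}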
/// @brief Tensor prefetch API, raw_send_load_store.hpp:258.
/// (Template header with Ty, the L1H/L2H cache hints, and arch_tag is elided in
///  this listing.)
__XETLA_API std::enable_if_t<arch_tag == gpu_arch::Xe, void>
xetla_tprefetch_global(xetla_tdescriptor tdesc) {
    uint32_t msg_desc = 3; // same opcode as the 2D block load, but with no destination
    msg_desc |= detail::get_element_size_code<sizeof(Ty)>() << 9;
    msg_desc |= detail::get_prefetch_cache_hint_code<L1H, L2H, arch_tag>()
            << 17;

    constexpr uint32_t numSrc0 = 1;
    constexpr uint32_t execSize = 0;
    constexpr uint32_t sfid = 0xF;
    constexpr uint32_t exDesc = 0;

    xetla_raw_send<uint32_t, 16, execSize, sfid, numSrc0>(
            tdesc, exDesc, msg_desc);
}
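Prefetch is normally issued some distance ahead of the load that will consume the data. A hedged sketch of that pattern with two descriptors over the same tensor, one leading the other by a block; the prefetch distance, shapes, element type, and the assumption that the element type is the leading template parameter are all invented for illustration:

// Sketch only. Assumes XeTLA headers / gpu::xetla namespace and an ESIMD
// kernel context; the 16x8 float block and one-block prefetch distance are
// invented.
void stream_row(float *src) {
    auto load_desc = xetla_get_tdesc<float, 16, 8>(
            src, 1024, 512, 1024, /*offset_x*/ 0, /*offset_y*/ 0);
    auto prefetch_desc = xetla_get_tdesc<float, 16, 8>(
            src, 1024, 512, 1024, /*offset_x*/ 16, /*offset_y*/ 0);

    for (int x = 0; x < 1024; x += 16) {
        // Warm the caches one block ahead of the consuming load.
        xetla_tprefetch_global<float>(prefetch_desc);
        auto tile = xetla_tload_global<float, 16 * 8>(load_desc);
        (void)tile; // placeholder for real work on the tile

        xetla_update_tdesc_offsetx(load_desc.xetla_format<uint32_t>(), 16);
        xetla_update_tdesc_offsetx(prefetch_desc.xetla_format<uint32_t>(), 16);
    }
}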
/// @brief Tensor atomic store API, raw_send_load_store.hpp:294.
/// (Template header with the atomic_op Op, Ty, N, Toffset, the L1H/L2H cache
///  hints, and arch_tag is elided in this listing.)
__XETLA_API std::enable_if_t<arch_tag == gpu_arch::Xe, void>
xetla_tatomic_store_global(uint64_t base_address,
        xetla_vector<Toffset, N> offset, xetla_vector<Ty, N> data,
        xetla_mask<N> pred = 1) {
    constexpr uint32_t numSrc0 = (N * sizeof(uint64_t) + 63) / 64;
    constexpr uint32_t numSrc1 = (N * sizeof(Ty) + 63) / 64;

    static_assert(sizeof(Ty) == 2 || sizeof(Ty) == 4 || sizeof(Ty) == 8,
            "element_size not supported!");
    uint32_t element_size_code;
    if constexpr (sizeof(Ty) == 2) {
        element_size_code = 5;
    } else if constexpr (sizeof(Ty) == 4) {
        element_size_code = 2;
    } else if constexpr (sizeof(Ty) == 8) {
        element_size_code = 3;
    }

    uint32_t msg_desc = detail::get_atomic_opcode<Op>();
    msg_desc |= element_size_code << 9;
    msg_desc |= detail::get_atomic_cache_hint_code<L1H, L2H, arch_tag>() << 17;
    msg_desc |= numSrc0 << 25;

    constexpr uint32_t execSize = gpu::xetla::detail::get_execSize_code<N>();
    constexpr uint32_t sfid = 0xF;
    constexpr uint32_t exDesc = 0;

    // ... per-lane 64-bit address vector (base_address plus offset) built here ...
    xetla_raw_send<uint64_t, N, Ty, N, execSize, sfid, numSrc0, numSrc1>(
            address, data, exDesc, msg_desc, pred);
}
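The atomic path follows the same descriptor recipe with two twists visible above: the element-size code is chosen inline (2-byte -> 5, 4-byte -> 2, 8-byte -> 3) and the address-payload register count is shifted into the field at bit 25. A standalone sketch of that assembly; the opcode and cache-hint values are placeholders for the detail:: helpers:

#include <cstdint>
#include <cstdio>

// Element-size codes used by the atomic path above.
constexpr uint32_t element_size_code(uint32_t elem_bytes) {
    return elem_bytes == 2 ? 5u : elem_bytes == 4 ? 2u : 3u;
}

// Rebuilds the atomic message descriptor with the shifts used above. The
// atomic_opcode and cache_hint_code arguments stand in for
// detail::get_atomic_opcode and detail::get_atomic_cache_hint_code.
uint32_t atomic_msg_desc(uint32_t atomic_opcode, uint32_t elem_bytes,
        uint32_t cache_hint_code, uint32_t N) {
    const uint32_t numSrc0 = (N * 8 + 63) / 64; // one 64-bit address per lane
    uint32_t msg_desc = atomic_opcode;
    msg_desc |= element_size_code(elem_bytes) << 9;
    msg_desc |= cache_hint_code << 17;
    msg_desc |= numSrc0 << 25;
    return msg_desc;
}

int main() {
    // 16 lanes of 4-byte data: the address payload spans (16 * 8 + 63) / 64 = 2
    // 64-byte chunks, the data payload (16 * 4 + 63) / 64 = 1.
    std::printf("0x%08X\n", atomic_msg_desc(/*opcode*/ 0x12, 4, 0, 16));
    return 0;
}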
Symbols referenced above, with their brief descriptions and definition sites:

#define __XETLA_API (common.hpp:43)
#define DEBUG_INVOKE(level, ...) (debug.hpp:180)
xetla_tdescriptor = xetla_vector<uint32_t, 16>: description of an nd tensor descriptor for load and store (base_types.hpp:155)
native_type_t<T> = typename native_type<T>::type: the native data type of T (base_types.hpp:106)
xetla_vector<Ty, N> = __ESIMD_NS::simd<native_type_t<Ty>, N>: wrapper around the ESIMD vector type (base_types.hpp:149)
#define xetla_tdescriptor_ref: alias to a xetla_vector<uint32_t, 16> reference (base_types.hpp:158)
xetla_mask<N> = __ESIMD_NS::simd_mask<N>: wrapper around the ESIMD mask type (base_types.hpp:165)
__XETLA_API void xetla_update_tdesc_offsetx(xetla_tdescriptor_ref tdesc, int32_t doffset_x): update the x coordinate in the given tensor descriptor (raw_send_load_store.hpp:152)
__XETLA_API std::enable_if_t<arch_tag == gpu_arch::Xe, void> xetla_tprefetch_global(xetla_tdescriptor tdesc): tensor prefetch API (raw_send_load_store.hpp:258)
__XETLA_API std::enable_if_t<arch_tag == gpu_arch::Xe, void> xetla_tatomic_store_global(uint64_t base_address, xetla_vector<Toffset, N> offset, xetla_vector<Ty, N> data, xetla_mask<N> pred = 1): tensor atomic store API (raw_send_load_store.hpp:294)
__XETLA_API void xetla_update_tdesc_offsety(xetla_tdescriptor_ref tdesc, int32_t doffset_y): update the y coordinate in the given tensor descriptor (raw_send_load_store.hpp:161)
__XETLA_API std::enable_if_t<arch_tag == gpu_arch::Xe, void> xetla_tstore_global(xetla_tdescriptor tdesc, xetla_vector<Ty, N> data): tensor store API (raw_send_load_store.hpp:227)
__XETLA_API std::enable_if_t<arch_tag == gpu_arch::Xe, xetla_vector<Ty, N>> xetla_tload_global(xetla_tdescriptor tdesc): tensor load API (raw_send_load_store.hpp:183)
__XETLA_API void xetla_fill_tdesc(xetla_tdescriptor_ref tdesc, Ty *p, int tensor_width, int tensor_height, int tensor_pitch, int offset_x, int offset_y): tensor descriptor construction, global memory version (raw_send_load_store.hpp:52)
__XETLA_API xetla_tdescriptor xetla_get_tdesc(Ty *p, int tensor_width, int tensor_height, int tensor_pitch, int offset_x, int offset_y): generate a new tensor descriptor, global memory version (raw_send_load_store.hpp:106)
__XETLA_API void xetla_set_tensor_offset_y(xetla_tdescriptor_ref desc, int32_t offset_y) (tensor_descriptor.hpp:71)
__XETLA_API void xetla_set_tensor_width_x(xetla_tdescriptor_ref desc, uint32_t width_x) (tensor_descriptor.hpp:39)
__XETLA_API void xetla_set_tensor_width_y(xetla_tdescriptor_ref desc, uint32_t width_y) (tensor_descriptor.hpp:47)
__XETLA_API void xetla_set_tensor_base_address(xetla_tdescriptor_ref desc, uint64_t base_address) (tensor_descriptor.hpp:27)
__XETLA_API void xetla_set_tensor_offset_x(xetla_tdescriptor_ref desc, int32_t offset_x) (tensor_descriptor.hpp:63)
__XETLA_API void xetla_set_tensor_pitch_x(xetla_tdescriptor_ref desc, uint32_t pitch_x) (tensor_descriptor.hpp:55)
__XETLA_API void xetla_set_block_widthx_widthy_arrlen(xetla_tdescriptor_ref desc, uint32_t block_widthx_widthy_arrlen) (tensor_descriptor.hpp:79)
__XETLA_API int32_t xetla_get_tensor_offset_x(xetla_tdescriptor desc) (tensor_descriptor.hpp:67)
__XETLA_API int32_t xetla_get_tensor_offset_y(xetla_tdescriptor desc) (tensor_descriptor.hpp:75)
cache_hint: L1 or L2 cache hint kinds (common.hpp:89)
atomic_op: represents an atomic operation (common.hpp:142)
gpu_arch (common.hpp:73)