xetla/row__reduction__fused__op__xe_8hpp_source.html

/*******************************************************************************

* Copyright (c) 2022-2023 Intel Corporation

*

* Licensed under the Apache License, Version 2.0 (the "License");

* you may not use this file except in compliance with the License.

* You may obtain a copy of the License at

*

*     http://www.apache.org/licenses/LICENSE-2.0

*

* Unless required by applicable law or agreed to in writing, software

* distributed under the License is distributed on an "AS IS" BASIS,

* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

* See the License for the specific language governing permissions and

* limitations under the License.

*******************************************************************************/


#pragma once


#include "experimental/group/fused_op/row_reduction_fused_op_api.hpp"


namespace gpu::xetla::group {


/// @brief Argument bundle consumed by the Xe row_reduction_fused_op_t
/// specializations in this file.
/// @tparam dtype_in Element type of the tensor read through gelu_bwd_w_ptr.
/// @tparam dtype_out Element type of the tensors written through
/// gelu_bwd_x_ptr and dropout_bwd_ptr.
/// @tparam dtype_acc Not referenced by any field of this struct.
template <typename dtype_in, typename dtype_out, typename dtype_acc>

struct xetla_row_reduction_fused_op_arguments_t {
    /// Base pointer of the tile loaded by the bias_gelu_w_bwd op and multiplied into the accumulator.
    dtype_in *gelu_bwd_w_ptr;
    /// Base pointer the bias_gelu_w_bwd op stores its converted result to.
    dtype_out *gelu_bwd_x_ptr;
    /// Base pointer the bias_dropout_bwd op stores its converted result to.
    dtype_out *dropout_bwd_ptr;
    /// Base pointer of the uint8_t mask loaded by the bias_dropout_bwd op.
    uint8_t *mask_ptr;
    /// The bias_dropout_bwd op skips the mask load and drop_out call when this equals 0.
    float dropout_prob;
    /// Forwarded as the last argument of drop_out by the bias_dropout_bwd op.
    float dropout_scale_inv;
    /// Second entry of the shape passed to every mem_desc_t::init call in this file.
    uint32_t matrix_m;
    /// First entry of the shape passed to every mem_desc_t::init call in this file.
    uint32_t matrix_n;
    /// Third shape entry for the load descriptors (w and mask); presumably the input leading dimension - confirm.
    uint32_t mat_in_ld;
    /// Third shape entry for the store descriptors; presumably the output leading dimension - confirm.
    uint32_t mat_out_ld;

};


/// @brief Generic Xe partial specialization of row_reduction_fused_op_t.
/// Every member function is an empty stub: construction, invocation and
/// descriptor updates all ignore their arguments. The kind-specific
/// specializations further down in this file supply real behavior.
/// @tparam fused_op_kind_ Fused operation selector.
/// @tparam dtype_in_ Input element type.
/// @tparam dtype_out_ Output element type.
/// @tparam dtype_acc_ Accumulator element type.
/// @tparam reduction_attr_ Reduction attribute type (not used here).
template <reduction_fused_kind fused_op_kind_, typename dtype_in_,
        typename dtype_out_, typename dtype_acc_, typename reduction_attr_>
struct row_reduction_fused_op_t<fused_op_kind_, dtype_in_, dtype_out_,
        dtype_acc_, reduction_attr_, gpu_arch::Xe> {
    using dtype_in = dtype_in_;
    using dtype_out = dtype_out_;
    using dtype_acc = dtype_acc_;
    using arguments_t = xetla_row_reduction_fused_op_arguments_t<dtype_in,
            dtype_out, dtype_acc>;

    static constexpr reduction_fused_kind fused_op_kind = fused_op_kind_;

    /// Accepts the same arguments as the real fused ops and discards them.
    __XETLA_API row_reduction_fused_op_t(
            [[maybe_unused]] arguments_t *args,
            [[maybe_unused]] int start_n = 0,
            [[maybe_unused]] int start_m = 0) {}

    /// Leaves the accumulator tile untouched.
    template <typename matAcc_t>
    __XETLA_API KERNEL_FUNC void operator()(
            [[maybe_unused]] matAcc_t &matAcc) {}

    /// Nothing to advance: this specialization owns no descriptors.
    __XETLA_API void update_tdesc(
            [[maybe_unused]] int offset_n = 0,
            [[maybe_unused]] int offset_m = 0) {}
};


/// @brief Xe specialization for reduction_fused_kind::bias_gelu_w_bwd.
/// On invocation it loads a tile through w_load_base_desc, converts it to
/// dtype_acc, multiplies it element-wise into the accumulator tile, and
/// stores the dtype_out-converted accumulator through x_store_base_desc.
/// @tparam dtype_in_ Element type of the loaded tile.
/// @tparam dtype_out_ Element type of the stored tile.
/// @tparam dtype_acc_ Accumulator element type; must equal matAcc_t::dtype.
/// @tparam reduction_attr_ Reduction attribute type (not used here).
template <typename dtype_in_, typename dtype_out_, typename dtype_acc_,
        typename reduction_attr_>
struct row_reduction_fused_op_t<reduction_fused_kind::bias_gelu_w_bwd,
        dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe> {
    using dtype_in = dtype_in_;
    using dtype_out = dtype_out_;
    using dtype_acc = dtype_acc_;
    using arguments_t = xetla_row_reduction_fused_op_arguments_t<dtype_in,
            dtype_out, dtype_acc>;

    static constexpr reduction_fused_kind fused_op_kind
            = reduction_fused_kind::bias_gelu_w_bwd;

    /// Descriptor of the global-memory tensor that is loaded.
    mem_desc_t<dtype_in, mem_layout::row_major, mem_space::global>
            w_load_base_desc;
    /// Descriptor of the global-memory tensor that is stored to.
    mem_desc_t<dtype_out, mem_layout::row_major, mem_space::global>
            x_store_base_desc;

    /// @brief Initializes both descriptors from the argument bundle.
    /// @param args Source of the base pointers, matrix shape and pitches.
    /// @param start_n First coordinate entry handed to both descriptors.
    /// @param start_m Second coordinate entry handed to both descriptors.
    __XETLA_API row_reduction_fused_op_t(
            arguments_t *args, int start_n = 0, int start_m = 0) {
        arguments_t &a = *args;
        w_load_base_desc.init({a.gelu_bwd_w_ptr},
                {a.matrix_n, a.matrix_m, a.mat_in_ld}, {start_n, start_m});
        x_store_base_desc.init({a.gelu_bwd_x_ptr},
                {a.matrix_n, a.matrix_m, a.mat_out_ld}, {start_n, start_m});
    }

    /// @brief Scales the accumulator by the loaded tile and writes it out.
    /// @param matAcc Accumulator tile; its registers are modified in place.
    template <typename matAcc_t>
    __XETLA_API KERNEL_FUNC void operator()(matAcc_t &matAcc) {
        static_assert(std::is_same_v<remove_const_t<dtype_acc>,
                              typename matAcc_t::dtype>,
                "dtype_acc should match with matAcc");

        // Mirror the accumulator's tiling so the three tiles line up
        // element for element.
        constexpr uint32_t tile_w = matAcc_t::tile_size_x;
        constexpr uint32_t tile_h = matAcc_t::tile_size_y;
        constexpr uint32_t block_w = matAcc_t::block_size_x;
        constexpr uint32_t block_h = matAcc_t::block_size_y;
        constexpr uint32_t elem_count = matAcc_t::tile_elems;

        using tile_shape_t = subgroup::tile_desc_t<tile_w, tile_h, block_w,
                block_h, reg_layout::tiled>;

        using w_tile_t = subgroup::tile_t<dtype_in, tile_shape_t>;
        using w_payload_t
                = subgroup::mem_payload_t<decltype(w_load_base_desc),
                        tile_shape_t,
                        subgroup::msg_type_v<tile_shape_t, mem_space::global>,
                        gpu_arch::Xe>;

        using x_tile_t = subgroup::tile_t<dtype_out, tile_shape_t>;
        // The store side is pinned to the block_2d message type.
        using x_payload_t
                = subgroup::mem_payload_t<decltype(x_store_base_desc),
                        tile_shape_t, msg_type::block_2d, gpu_arch::Xe>;

        // Fetch the tile at the current descriptor position.
        w_tile_t w_tile;
        w_payload_t w_payload(w_load_base_desc);
        subgroup::tile_load(w_tile, w_payload);

        // Widen to the accumulator type before multiplying.
        xetla_vector<dtype_acc, elem_count> w_as_acc
                = xetla_cvt<dtype_acc, dtype_in, elem_count>(w_tile.reg);
        matAcc.reg = matAcc.reg * w_as_acc;

        // Convert the product to dtype_out and write it back.
        x_tile_t x_tile;
        x_payload_t x_payload(x_store_base_desc);
        subgroup::elemwise_cvt(x_tile, matAcc);
        subgroup::tile_store<cache_hint::uncached>(x_tile, x_payload);
    }

    /// @brief Moves both descriptors by the same offsets.
    /// @param offset_n First offset forwarded to update_coord.
    /// @param offset_m Second offset forwarded to update_coord.
    __XETLA_API void update_tdesc(int offset_n = 0, int offset_m = 0) {
        w_load_base_desc.update_coord(offset_n, offset_m);
        x_store_base_desc.update_coord(offset_n, offset_m);
    }
};


/// @brief Xe specialization for reduction_fused_kind::bias_dropout_bwd.
/// On invocation, when dropout_prob is non-zero, it loads a uint8_t mask
/// tile and passes the accumulator, the mask and dropout_scale_inv through
/// drop_out. The accumulator is then always converted to dtype_out and
/// stored through dropout_bwd_store_base_desc.
/// @tparam dtype_in_ Input element type (only forwarded to arguments_t).
/// @tparam dtype_out_ Element type of the stored tile.
/// @tparam dtype_acc_ Accumulator element type; must equal matAcc_t::dtype.
/// @tparam reduction_attr_ Reduction attribute type (not used here).
template <typename dtype_in_, typename dtype_out_, typename dtype_acc_,
        typename reduction_attr_>
struct row_reduction_fused_op_t<reduction_fused_kind::bias_dropout_bwd,
        dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe> {
    using dtype_in = dtype_in_;
    using dtype_out = dtype_out_;
    using dtype_acc = dtype_acc_;
    using dtype_mask = uint8_t;
    using arguments_t = xetla_row_reduction_fused_op_arguments_t<dtype_in,
            dtype_out, dtype_acc>;

    static constexpr reduction_fused_kind fused_op_kind
            = reduction_fused_kind::bias_dropout_bwd;

    /// Descriptor of the global-memory mask tensor.
    mem_desc_t<dtype_mask, mem_layout::row_major, mem_space::global>
            mask_load_base_desc;
    /// Descriptor of the global-memory tensor that is stored to.
    mem_desc_t<dtype_out, mem_layout::row_major, mem_space::global>
            dropout_bwd_store_base_desc;
    /// Copied from the arguments; a value of 0 disables the mask step.
    float dropout_prob;
    /// Copied from the arguments; forwarded to drop_out.
    float dropout_scale_inv;

    /// @brief Captures the dropout scalars and initializes both descriptors.
    /// @param args Source of pointers, matrix shape, pitches and scalars.
    /// @param start_n First coordinate entry handed to both descriptors.
    /// @param start_m Second coordinate entry handed to both descriptors.
    __XETLA_API row_reduction_fused_op_t(
            arguments_t *args, int start_n = 0, int start_m = 0)
        : dropout_prob(args->dropout_prob)
        , dropout_scale_inv(args->dropout_scale_inv) {
        arguments_t &a = *args;
        mask_load_base_desc.init({a.mask_ptr},
                {a.matrix_n, a.matrix_m, a.mat_in_ld}, {start_n, start_m});
        dropout_bwd_store_base_desc.init({a.dropout_bwd_ptr},
                {a.matrix_n, a.matrix_m, a.mat_out_ld}, {start_n, start_m});
    }

    /// @brief Optionally applies the mask, then writes the accumulator out.
    /// @param matAcc Accumulator tile; modified in place when the mask
    /// step runs.
    template <typename matAcc_t>
    __XETLA_API KERNEL_FUNC void operator()(matAcc_t &matAcc) {
        static_assert(std::is_same_v<remove_const_t<dtype_acc>,
                              typename matAcc_t::dtype>,
                "dtype_acc should match with matAcc");

        // Mirror the accumulator's tiling for the mask and output tiles.
        constexpr uint32_t tile_w = matAcc_t::tile_size_x;
        constexpr uint32_t tile_h = matAcc_t::tile_size_y;
        constexpr uint32_t block_w = matAcc_t::block_size_x;
        constexpr uint32_t block_h = matAcc_t::block_size_y;

        using tile_shape_t = subgroup::tile_desc_t<tile_w, tile_h, block_w,
                block_h, reg_layout::tiled>;

        using mask_tile_t = subgroup::tile_t<dtype_mask, tile_shape_t>;
        using mask_payload_t
                = subgroup::mem_payload_t<decltype(mask_load_base_desc),
                        tile_shape_t,
                        subgroup::msg_type_v<tile_shape_t, mem_space::global>,
                        gpu_arch::Xe>;

        using out_tile_t = subgroup::tile_t<dtype_out, tile_shape_t>;
        using out_payload_t = subgroup::mem_payload_t<
                decltype(dropout_bwd_store_base_desc), tile_shape_t,
                subgroup::msg_type_v<tile_shape_t, mem_space::global>,
                gpu_arch::Xe>;

        // A zero probability skips the mask step entirely.
        const bool mask_enabled = (dropout_prob != 0);
        if (mask_enabled) {
            mask_tile_t mask_tile;
            mask_payload_t mask_payload(mask_load_base_desc);
            subgroup::tile_load(mask_tile, mask_payload);
            SW_BARRIER();
            matAcc.reg = drop_out<dtype_acc, tile_w * tile_h>(
                    matAcc.reg, mask_tile.reg, dropout_scale_inv);
        }

        // Convert to dtype_out and write the tile back.
        out_tile_t out_tile;
        out_payload_t out_payload(dropout_bwd_store_base_desc);
        subgroup::elemwise_cvt(out_tile, matAcc);
        subgroup::tile_store<cache_hint::uncached>(out_tile, out_payload);
    }

    /// @brief Moves both descriptors by the same offsets.
    /// @param offset_n First offset forwarded to update_coord.
    /// @param offset_m Second offset forwarded to update_coord.
    __XETLA_API void update_tdesc(int offset_n = 0, int offset_m = 0) {
        mask_load_base_desc.update_coord(offset_n, offset_m);
        dropout_bwd_store_base_desc.update_coord(offset_n, offset_m);
    }
};


} // namespace gpu::xetla::group

SW_BARRIER
#define SW_BARRIER()
SW_BARRIER inserts a software scheduling barrier, for better code control.
Definition common.hpp:227

__XETLA_API
#define __XETLA_API
Definition common.hpp:43

gpu::xetla::xetla_vector
__ESIMD_NS::simd< native_type_t< Ty >, N > xetla_vector
wrapper for xetla_vector.
Definition base_types.hpp:149

KERNEL_FUNC
#define KERNEL_FUNC
KERNEL_FUNC macro.
Definition common.hpp:39

gpu::xetla::group
Definition limitation.hpp:607

gpu::xetla::subgroup::elemwise_cvt
__XETLA_API std::enable_if_t<(T_src::register_layout !=reg_layout::linear) &&(T_dst::register_layout !=reg_layout::linear) &&is_same_layout< T_dst, T_src >::value &&(!is_floating_to_integer< T_dst, T_src >::value)> elemwise_cvt(T_dst &dst, T_src &src)
Element-wise data conversion; the src and dst tiles must have the same layout.
Definition op_function.hpp:40

gpu::xetla::subgroup::tile_load
__XETLA_API std::enable_if_t< detail::check_load_type< tile_t, payload_t >::is_global_2d_xe > tile_load(tile_t &tile, payload_t &payload)
This function loads data from 2D memory surface.
Definition load_xe.hpp:76

gpu::xetla::reg_layout::tiled
@ tiled

gpu::xetla::mem_space::global
@ global

gpu::xetla::gpu_arch
gpu_arch
Definition common.hpp:73

gpu::xetla::gpu_arch::Xe
@ Xe

gpu::xetla::msg_type::block_2d
@ block_2d

gpu::xetla::reduction_fused_kind
reduction_fused_kind
Definition row_reduction_fused_op_api.hpp:28

gpu::xetla::reduction_fused_kind::bias_gelu_w_bwd
@ bias_gelu_w_bwd

gpu::xetla::reduction_fused_kind::bias_dropout_bwd
@ bias_dropout_bwd

gpu::xetla::mem_layout::row_major
@ row_major

row_reduction_fused_op_api.hpp
C++ API.

gpu::xetla::group::row_reduction_fused_op_t< fused_op_kind_, dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe >::dtype_in
dtype_in_ dtype_in
Definition row_reduction_fused_op_xe.hpp:57

gpu::xetla::group::row_reduction_fused_op_t< fused_op_kind_, dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe >::dtype_out
dtype_out_ dtype_out
Definition row_reduction_fused_op_xe.hpp:58

gpu::xetla::group::row_reduction_fused_op_t< fused_op_kind_, dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe >::dtype_acc
dtype_acc_ dtype_acc
Definition row_reduction_fused_op_xe.hpp:59

gpu::xetla::group::row_reduction_fused_op_t< fused_op_kind_, dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe >::row_reduction_fused_op_t
__XETLA_API row_reduction_fused_op_t(arguments_t *args, int start_n=0, int start_m=0)
Definition row_reduction_fused_op_xe.hpp:62

gpu::xetla::group::row_reduction_fused_op_t< fused_op_kind_, dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe >::update_tdesc
__XETLA_API void update_tdesc(int offset_n=0, int offset_m=0)
Definition row_reduction_fused_op_xe.hpp:68

gpu::xetla::group::row_reduction_fused_op_t< fused_op_kind_, dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe >::operator()
__XETLA_API KERNEL_FUNC void operator()(matAcc_t &matAcc)
Definition row_reduction_fused_op_xe.hpp:66

gpu::xetla::group::row_reduction_fused_op_t< reduction_fused_kind::bias_dropout_bwd, dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe >::dropout_prob
float dropout_prob
Definition row_reduction_fused_op_xe.hpp:164

gpu::xetla::group::row_reduction_fused_op_t< reduction_fused_kind::bias_dropout_bwd, dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe >::dtype_out
dtype_out_ dtype_out
Definition row_reduction_fused_op_xe.hpp:155

gpu::xetla::group::row_reduction_fused_op_t< reduction_fused_kind::bias_dropout_bwd, dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe >::dropout_bwd_store_base_desc
mem_desc_t< dtype_out, mem_layout::row_major, mem_space::global > dropout_bwd_store_base_desc
Definition row_reduction_fused_op_xe.hpp:163

gpu::xetla::group::row_reduction_fused_op_t< reduction_fused_kind::bias_dropout_bwd, dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe >::dtype_in
dtype_in_ dtype_in
Definition row_reduction_fused_op_xe.hpp:154

gpu::xetla::group::row_reduction_fused_op_t< reduction_fused_kind::bias_dropout_bwd, dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe >::update_tdesc
__XETLA_API void update_tdesc(int offset_n=0, int offset_m=0)
Definition row_reduction_fused_op_xe.hpp:221

gpu::xetla::group::row_reduction_fused_op_t< reduction_fused_kind::bias_dropout_bwd, dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe >::dropout_scale_inv
float dropout_scale_inv
Definition row_reduction_fused_op_xe.hpp:165

gpu::xetla::group::row_reduction_fused_op_t< reduction_fused_kind::bias_dropout_bwd, dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe >::operator()
__XETLA_API KERNEL_FUNC void operator()(matAcc_t &matAcc)
Definition row_reduction_fused_op_xe.hpp:181

gpu::xetla::group::row_reduction_fused_op_t< reduction_fused_kind::bias_dropout_bwd, dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe >::mask_load_base_desc
mem_desc_t< dtype_mask, mem_layout::row_major, mem_space::global > mask_load_base_desc
Definition row_reduction_fused_op_xe.hpp:161

gpu::xetla::group::row_reduction_fused_op_t< reduction_fused_kind::bias_dropout_bwd, dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe >::row_reduction_fused_op_t
__XETLA_API row_reduction_fused_op_t(arguments_t *args, int start_n=0, int start_m=0)
Definition row_reduction_fused_op_xe.hpp:167

gpu::xetla::group::row_reduction_fused_op_t< reduction_fused_kind::bias_dropout_bwd, dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe >::dtype_mask
uint8_t dtype_mask
Definition row_reduction_fused_op_xe.hpp:157

gpu::xetla::group::row_reduction_fused_op_t< reduction_fused_kind::bias_dropout_bwd, dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe >::dtype_acc
dtype_acc_ dtype_acc
Definition row_reduction_fused_op_xe.hpp:156

gpu::xetla::group::row_reduction_fused_op_t< reduction_fused_kind::bias_gelu_w_bwd, dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe >::w_load_base_desc
mem_desc_t< dtype_in, mem_layout::row_major, mem_space::global > w_load_base_desc
Definition row_reduction_fused_op_xe.hpp:84

gpu::xetla::group::row_reduction_fused_op_t< reduction_fused_kind::bias_gelu_w_bwd, dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe >::dtype_in
dtype_in_ dtype_in
Definition row_reduction_fused_op_xe.hpp:78

gpu::xetla::group::row_reduction_fused_op_t< reduction_fused_kind::bias_gelu_w_bwd, dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe >::x_store_base_desc
mem_desc_t< dtype_out, mem_layout::row_major, mem_space::global > x_store_base_desc
Definition row_reduction_fused_op_xe.hpp:86

gpu::xetla::group::row_reduction_fused_op_t< reduction_fused_kind::bias_gelu_w_bwd, dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe >::update_tdesc
__XETLA_API void update_tdesc(int offset_n=0, int offset_m=0)
Definition row_reduction_fused_op_xe.hpp:142

gpu::xetla::group::row_reduction_fused_op_t< reduction_fused_kind::bias_gelu_w_bwd, dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe >::row_reduction_fused_op_t
__XETLA_API row_reduction_fused_op_t(arguments_t *args, int start_n=0, int start_m=0)
Definition row_reduction_fused_op_xe.hpp:87

gpu::xetla::group::row_reduction_fused_op_t< reduction_fused_kind::bias_gelu_w_bwd, dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe >::dtype_out
dtype_out_ dtype_out
Definition row_reduction_fused_op_xe.hpp:79

gpu::xetla::group::row_reduction_fused_op_t< reduction_fused_kind::bias_gelu_w_bwd, dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe >::operator()
__XETLA_API KERNEL_FUNC void operator()(matAcc_t &matAcc)
Definition row_reduction_fused_op_xe.hpp:103

gpu::xetla::group::row_reduction_fused_op_t< reduction_fused_kind::bias_gelu_w_bwd, dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe >::dtype_acc
dtype_acc_ dtype_acc
Definition row_reduction_fused_op_xe.hpp:80

gpu::xetla::group::row_reduction_fused_op_t
Additional Ops that can be fused with row reduction processing flow.
Definition row_reduction_fused_op_api.hpp:47

gpu::xetla::group::xetla_row_reduction_fused_op_arguments_t
Definition row_reduction_fused_op_xe.hpp:32

gpu::xetla::group::xetla_row_reduction_fused_op_arguments_t::mask_ptr
uint8_t * mask_ptr
Definition row_reduction_fused_op_xe.hpp:36

gpu::xetla::group::xetla_row_reduction_fused_op_arguments_t::mat_in_ld
uint32_t mat_in_ld
Definition row_reduction_fused_op_xe.hpp:41

gpu::xetla::group::xetla_row_reduction_fused_op_arguments_t::gelu_bwd_w_ptr
dtype_in * gelu_bwd_w_ptr
Definition row_reduction_fused_op_xe.hpp:33

gpu::xetla::group::xetla_row_reduction_fused_op_arguments_t::dropout_scale_inv
float dropout_scale_inv
Definition row_reduction_fused_op_xe.hpp:38

gpu::xetla::group::xetla_row_reduction_fused_op_arguments_t::matrix_n
uint32_t matrix_n
Definition row_reduction_fused_op_xe.hpp:40

gpu::xetla::group::xetla_row_reduction_fused_op_arguments_t::dropout_bwd_ptr
dtype_out * dropout_bwd_ptr
Definition row_reduction_fused_op_xe.hpp:35

gpu::xetla::group::xetla_row_reduction_fused_op_arguments_t::dropout_prob
float dropout_prob
Definition row_reduction_fused_op_xe.hpp:37

gpu::xetla::group::xetla_row_reduction_fused_op_arguments_t::matrix_m
uint32_t matrix_m
Definition row_reduction_fused_op_xe.hpp:39

gpu::xetla::group::xetla_row_reduction_fused_op_arguments_t::mat_out_ld
uint32_t mat_out_ld
Definition row_reduction_fused_op_xe.hpp:42

gpu::xetla::group::xetla_row_reduction_fused_op_arguments_t::gelu_bwd_x_ptr
dtype_out * gelu_bwd_x_ptr
Definition row_reduction_fused_op_xe.hpp:34

gpu::xetla::mem_desc_t
Definition memory_descriptor.hpp:139

gpu::xetla::subgroup::mem_payload_t
Describes the memory information.
Definition api.hpp:44

gpu::xetla::subgroup::tile_desc_t
Describes the tile information of a sub-matrix.
Definition api.hpp:64

gpu::xetla::subgroup::tile_t
A struct that contains some register-file storage.
Definition api.hpp:99