xetla/row__reduction__xe_8hpp_source.html

/*******************************************************************************

* Copyright (c) 2022-2023 Intel Corporation

*

* Licensed under the Apache License, Version 2.0 (the "License");

* you may not use this file except in compliance with the License.

* You may obtain a copy of the License at

*

*     http://www.apache.org/licenses/LICENSE-2.0

*

* Unless required by applicable law or agreed to in writing, software

* distributed under the License is distributed on an "AS IS" BASIS,

* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

* See the License for the specific language governing permissions and

* limitations under the License.

*******************************************************************************/


#pragma once


#include "experimental/group/fused_op/row_reduction_fused_op_xe.hpp"

#include "experimental/group/reduction/row_reduce_store_xe.hpp"

#include "experimental/kernel/reduction/api.hpp"

#include "experimental/kernel/reduction/common.hpp"

#include "experimental/kernel/reduction/config.hpp"


namespace gpu::xetla::kernel {


/// @brief Row-reduction kernel functor, partial specialization for gpu_arch::Xe.
///
/// Each subgroup repeatedly loads a sg_tile_m x sg_tile_n tile of the input,
/// converts it to dtype_acc, applies the fused op, and adds the result of
/// tile_reduce<reduce_op::sum, ..., 0> to a single-row accumulator
/// (mat_buffer_t, tile_size_x x 1). The per-subgroup accumulators are then
/// handed to group_row_reduce_store_t, which writes a 1 x matrix_n output.
///
/// @tparam dtype_in_ Element type of the input matrix.
/// @tparam dtype_out_ Element type of the output row.
/// @tparam dtype_acc_ Element type used for accumulation.
/// @tparam reduction_attr_ Provides wg_tile_m/n, sg_tile_m/n and is_dynamic_job.
/// @tparam fused_op_t_ Functor applied to every converted tile before it is
///         reduced; must expose arguments_t, operator()(matAcc_t &) and
///         update_tdesc(x, y).
template <typename dtype_in_, typename dtype_out_, typename dtype_acc_,

        typename reduction_attr_, typename fused_op_t_>

struct xetla_row_reduction_t<dtype_in_, dtype_out_, dtype_acc_, reduction_attr_,

        gpu_arch::Xe, fused_op_t_> {

    // Re-export the template parameters as member types.
    using dtype_in = dtype_in_;

    using dtype_out = dtype_out_;

    using dtype_acc = dtype_acc_;

    using reduction_attr = reduction_attr_;

    using fused_op_t = fused_op_t_;

    using fused_op_arguments_t = typename fused_op_t::arguments_t;


    // Work-group and subgroup tile extents, taken from reduction_attr.
    static constexpr uint32_t wg_tile_m = reduction_attr::wg_tile_m;

    static constexpr uint32_t wg_tile_n = reduction_attr::wg_tile_n;

    static constexpr uint32_t sg_tile_m = reduction_attr::sg_tile_m;

    static constexpr uint32_t sg_tile_n = reduction_attr::sg_tile_n;

    static constexpr bool is_dynamic_job = reduction_attr::is_dynamic_job;

    // Subgroups per work-group along x and y: ceiling division of the
    // work-group tile by the subgroup tile.
    static constexpr uint32_t wg_size_x

            = (wg_tile_n + sg_tile_n - 1) / sg_tile_n;

    static constexpr uint32_t wg_size_y

            = (wg_tile_m + sg_tile_m - 1) / sg_tile_m;

    // NOTE(review): this alias reuses the name of the template it refers to,
    // which changes the meaning of `work_group_t` inside the class scope; some
    // compilers diagnose that -- confirm with the supported toolchains.
    using work_group_t = work_group_t<wg_size_x * wg_size_y>;

    // Dynamic job distribution is only enabled when there is more than one
    // subgroup along y.
    static constexpr bool use_dynamic_job = is_dynamic_job && (wg_size_y > 1);

    // Limits of the block_2d load/store messages on Xe; the *_in_elem values
    // below convert the byte limits using sizeof(dtype_in) / sizeof(dtype_out).
    using load_store_attr = typename arch_attr_t<

            gpu_arch::Xe>::template load_store_attr<msg_type::block_2d>;

    static constexpr uint32_t max_load_height_in_elem

            = load_store_attr::max_load_height_in_elem;

    static constexpr uint32_t max_load_width_in_bytes

            = load_store_attr::max_load_width_in_bytes;

    static constexpr uint32_t max_store_width_in_bytes

            = load_store_attr::max_store_width_in_bytes;

    static constexpr uint32_t max_load_width_in_elem

            = max_load_width_in_bytes / sizeof(dtype_in);

    static constexpr uint32_t max_store_width_in_elem

            = max_store_width_in_bytes / sizeof(dtype_out);


    // The per-subgroup register tile has exactly the subgroup tile shape.
    static constexpr uint32_t tile_size_x = sg_tile_n;

    static constexpr uint32_t tile_size_y = sg_tile_m;


    // Forwarded to group_row_reduce_store_t as its last template argument.
    static constexpr uint32_t max_simd_len = max_store_width_in_elem;


    // Block width: the whole tile width when it is below the maximum load
    // width, otherwise detail::gcd of the tile width and the maximum load width.
    static constexpr uint32_t block_size_x

            = max_load_width_in_elem > tile_size_x

            ? tile_size_x

            : gpu::xetla::subgroup::detail::gcd<tile_size_x,

                    max_load_width_in_elem>::value;

    static_assert(block_size_x >= 8,

            "if block_size_x less than 8, the efficiency will be low. Please "

            "choose another tile_size_x");

    // Block height: the tile height, capped at the maximum load height.
    static constexpr uint32_t block_size_y

            = max_load_height_in_elem > tile_size_y ? tile_size_y

                                                    : max_load_height_in_elem;


    // Vector length of the SLM job-counter accesses in call(); only lane 0 of
    // those accesses is enabled.
    static constexpr uint32_t SIMD = 16;


    // Input tile (dtype_in) and the payload used to load it from row-major
    // global memory.
    using global_ld_tile_desc_t = subgroup::tile_desc_t<tile_size_x,

            tile_size_y, block_size_x, block_size_y, reg_layout::tiled>;

    using global_ld_t = subgroup::tile_t<dtype_in, global_ld_tile_desc_t>;

    using global_ld_payload_t = subgroup::mem_payload_t<

            mem_desc_t<dtype_in, mem_layout::row_major, mem_space::global>,

            global_ld_tile_desc_t,

            subgroup::msg_type_v<global_ld_tile_desc_t, mem_space::global>,

            gpu_arch::Xe>;

    // Single-row accumulator (tile_size_x x 1) in dtype_acc.
    using mat_buffer_t = subgroup::tile_t<dtype_acc,

            subgroup::tile_desc_t<tile_size_x, 1, block_size_x, 1,

                    reg_layout::tiled>>;

    // Same shape as the input tile, but holding dtype_acc values.
    using matAcc_t = subgroup::tile_t<dtype_acc, global_ld_tile_desc_t>;

    // Combines the per-subgroup accumulators and writes the output.
    using row_reduce_store_t = group::group_row_reduce_store_t<dtype_acc,

            dtype_out, sg_tile_n, wg_size_x, wg_size_y, max_simd_len>;


    /// @brief Runtime arguments of the kernel.
    struct arguments_t {

        // Base pointer of the input matrix.
        dtype_in *mat_in_ptr;

        // Base pointer of the output; call() stores a 1 x matrix_n surface here.
        dtype_out *mat_out_ptr;

        // Input extent along y (the reduced direction); bounds the job loop.
        uint32_t matrix_m;

        // Input extent along x; also used as output width and output pitch.
        uint32_t matrix_n;

        // Passed to the load payload after width and height; presumably the
        // input row pitch in elements -- TODO confirm against mem_payload_t.
        uint32_t mat_in_ld;

    };


    /// @brief Number of named barriers this kernel uses: one per subgroup
    /// x-index when wg_size_y > 1, otherwise none.
    struct get_barrier_count {

        static constexpr uint32_t count = (wg_size_y > 1) ? wg_size_x : 0;

    };


    // SLM bytes reserved for the dynamic job counters: SIMD ints per subgroup
    // x-index; zero when dynamic jobs are disabled.
    static constexpr uint32_t counter_size

            = use_dynamic_job ? SIMD * sizeof(int) * wg_size_x : 0;

    // SLM bytes for tile_size_x dtype_acc values per subgroup; zero when
    // wg_size_y == 1. Presumably the scratch area used by row_reduce_store_t,
    // which receives slm_base -- TODO confirm.
    static constexpr uint32_t row_buffer_size = (wg_size_y > 1)

            ? tile_size_x * wg_size_x * wg_size_y * sizeof(dtype_acc)

            : 0;


    /// @brief Total SLM bytes required: row buffer followed by job counters.
    struct get_slm_size {

        static constexpr uint32_t size = row_buffer_size + counter_size;

    };


    /// @brief Main execution function for row reduction.
    ///
    /// @param item SYCL nd_item; its local linear id selects the subgroup and
    ///        get_group(2) selects the work-group tile along x.
    /// @param args Kernel arguments (input/output pointers and extents).
    /// @param fused_op_args Arguments forwarded to the fused_op_t constructor.
    /// @param slm_base Base SLM offset. The job counters live at
    ///        slm_base + row_buffer_size; slm_base itself is passed on to
    ///        row_reduce_store_t.
    /// @param nbarrier_base First named-barrier id; barrier
    ///        nbarrier_base + sg_idx is used here and nbarrier_base is passed
    ///        on to row_reduce_store_t.
    __XETLA_API static void call(sycl::nd_item<3> &item, arguments_t *args,

            fused_op_arguments_t *fused_op_args = nullptr,

            uint32_t slm_base = 0, uint32_t nbarrier_base = 0) {

        work_group_t g;

        g.init(item.get_local_linear_id());

        // Subgroup coordinates inside a wg_size_x-wide grid.
        int sg_idx = g.get_id() % wg_size_x;

        int sg_idy = g.get_id() / wg_size_x;


        // Only x carries a work-group offset; along y every work-group starts
        // at row 0 and walks the whole matrix_m extent in the loops below.
        int global_start_x_in

                = item.get_group(2) * wg_tile_n + sg_idx * sg_tile_n;

        int global_start_y_in = sg_idy * sg_tile_m;

        // One barrier per subgroup x-index, shared by the wg_size_y subgroups
        // that have the same sg_idx.
        // NOTE(review): this is initialized even when wg_size_y == 1, where
        // get_barrier_count::count is 0 -- confirm that is harmless.
        xetla_nbarrier_t<wg_size_y, wg_size_y, gpu_arch::Xe> nbarrier;

        nbarrier.init_nbarrier(

                nbarrier_base + sg_idx, nbarrier_role::producer_consumer);

        if constexpr (use_dynamic_job) {

            // Address of this x-index's job counter, placed after the row
            // buffer in SLM. Only lane 0 is enabled through pred.
            xetla_vector<uint32_t, SIMD> offsets(

                    slm_base + row_buffer_size + sg_idx * SIMD * sizeof(int));

            xetla_mask<SIMD> pred(0);

            pred[0] = 1;

            if (sg_idy == 0) {

                // Seed the counter with wg_size_y: job ids 0..wg_size_y-1 are
                // taken implicitly, since each subgroup starts at job sg_idy.
                xetla_vector<int, SIMD> init(wg_size_y);

                xetla_store_local<int, 1, data_size::default_size, SIMD>(

                        offsets, init, pred);

                xetla_fence<memory_kind::shared_local>();

            }

            // Signal here; the matching wait() comes after the setup below and
            // before the first atomic access to the counter.
            nbarrier.arrive();

        }


        global_ld_t mat_global_ld;

        fused_op_t fused_op(

                fused_op_args, global_start_x_in, global_start_y_in);

        global_ld_payload_t mat_global_ld_payload(args->mat_in_ptr,

                args->matrix_n, args->matrix_m, args->mat_in_ld,

                global_start_x_in, global_start_y_in);

        // Accumulator starts at zero.
        mat_buffer_t mat_buffer(0);

        if constexpr (use_dynamic_job) {

            nbarrier.wait();

            int job_id = sg_idy;

            xetla_vector<uint32_t, SIMD> offsets(

                    slm_base + row_buffer_size + sg_idx * SIMD * sizeof(int));

            xetla_mask<SIMD> pred(0);

            pred[0] = 1;

            // A job is one tile_size_y-row tile; continue while the current
            // job starts inside the matrix.
            while (job_id * tile_size_y < args->matrix_m) {

                // Claim the next job id from the shared counter before working
                // on the current one. Presumably iinc returns the value before
                // the increment -- TODO confirm against xetla_atomic_local.
                xetla_vector<int, SIMD> next_job

                        = xetla_atomic_local<atomic_op::iinc, int, SIMD>(

                                offsets, pred);

                subgroup::tile_load(mat_global_ld, mat_global_ld_payload);

                matAcc_t matAcc;

                subgroup::elemwise_cvt<matAcc_t, global_ld_t>(

                        matAcc, mat_global_ld);

                fused_op(matAcc);

                mat_buffer.reg += subgroup::tile_reduce<reduce_op::sum,

                        dtype_acc, dtype_acc, 0>(matAcc);

                // Move the load descriptor and the fused op from the current
                // job to the claimed one.
                mat_global_ld_payload

                        .template update_tdesc<tdesc_update_dir::y_dir>(

                                (next_job[0] - job_id) * tile_size_y);

                fused_op.update_tdesc(0, (next_job[0] - job_id) * tile_size_y);

                job_id = next_job[0];

            }

        } else {

            // Static distribution: subgroup sg_idy handles jobs sg_idy,
            // sg_idy + wg_size_y, sg_idy + 2 * wg_size_y, ...
            for (int job_id = sg_idy; job_id * tile_size_y < args->matrix_m;

                    job_id += wg_size_y) {

                subgroup::tile_load(mat_global_ld, mat_global_ld_payload);

                matAcc_t matAcc;

                subgroup::elemwise_cvt<matAcc_t, global_ld_t>(

                        matAcc, mat_global_ld);

                fused_op(matAcc);

                mat_buffer.reg += subgroup::tile_reduce<reduce_op::sum,

                        dtype_acc, dtype_acc, 0>(matAcc);

                fused_op.update_tdesc(0, wg_size_y * tile_size_y);

                mat_global_ld_payload

                        .template update_tdesc<tdesc_update_dir::y_dir>(

                                wg_size_y * tile_size_y);

            }

        }


        // Hand this subgroup's accumulator to the group-level reduce + store.
        // It gets the same SLM and named-barrier bases that were used above.
        row_reduce_store_t row_reduce_store;

        uint32_t slm_row_reduce_base = slm_base;

        uint32_t nbarrier_row_reduce_base = nbarrier_base;

        row_reduce_store.init(

                sg_idx, sg_idy, slm_row_reduce_base, nbarrier_row_reduce_base);

        // Output surface: width matrix_n, height 1, pitch matrix_n, written at
        // x offset global_start_x_in and y offset 0.
        row_reduce_store(args->mat_out_ptr, args->matrix_n, 1, args->matrix_n,

                global_start_x_in, 0, mat_buffer.reg);

    }

};


} // namespace gpu::xetla::kernel

__XETLA_API
#define __XETLA_API
Definition common.hpp:43

api.hpp
C++ API.

common.hpp
C++ API.

SIMD
#define SIMD
Definition gemm_softmax.cpp:23

gpu::xetla::xetla_vector
__ESIMD_NS::simd< native_type_t< Ty >, N > xetla_vector
wrapper for xetla_vector.
Definition base_types.hpp:149

gpu::xetla::xetla_mask
__ESIMD_NS::simd_mask< N > xetla_mask
wrapper for xetla_mask.
Definition base_types.hpp:165

gpu::xetla::nbarrier_role::producer_consumer
@ producer_consumer

gpu::xetla::kernel
Definition limitation.hpp:734

gpu::xetla::subgroup::tile_load
__XETLA_API std::enable_if_t< detail::check_load_type< tile_t, payload_t >::is_global_2d_xe > tile_load(tile_t &tile, payload_t &payload)
This function loads data from 2D memory surface.
Definition load_xe.hpp:76

gpu::xetla::subgroup::tile_reduce
__XETLA_API std::enable_if_t<(dim==1), xetla_vector< dtype_out, mat_t::tile_size_y > > tile_reduce(mat_t &src)
Definition reduction.hpp:33

gpu::xetla::reg_layout::tiled
@ tiled

gpu::xetla::reduce_op::sum
@ sum

gpu::xetla::gpu_arch
gpu_arch
Definition common.hpp:73

gpu::xetla::gpu_arch::Xe
@ Xe

config.hpp
C++ API.

row_reduce_store_xe.hpp
C++ API.

row_reduction_fused_op_xe.hpp
C++ API.

gpu::xetla::arch_attr_t
Definition arch_config.hpp:72

gpu::xetla::group::group_row_reduce_store_t
This is the group row reduction (reduce_sum) plus cooperative write-out.
Definition reduction_api.hpp:39

gpu::xetla::kernel::xetla_row_reduction_t< dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe, fused_op_t_ >::load_store_attr
typename arch_attr_t< gpu_arch::Xe >::template load_store_attr< msg_type::block_2d > load_store_attr
Definition row_reduction_xe.hpp:63

gpu::xetla::kernel::xetla_row_reduction_t< dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe, fused_op_t_ >::dtype_out
dtype_out_ dtype_out
Definition row_reduction_xe.hpp:45

gpu::xetla::kernel::xetla_row_reduction_t< dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe, fused_op_t_ >::call
static __XETLA_API void call(sycl::nd_item< 3 > &item, arguments_t *args, fused_op_arguments_t *fused_op_args=nullptr, uint32_t slm_base=0, uint32_t nbarrier_base=0)
Main execution function for row reduction.
Definition row_reduction_xe.hpp:147

gpu::xetla::kernel::xetla_row_reduction_t< dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe, fused_op_t_ >::fused_op_t
fused_op_t_ fused_op_t
Definition row_reduction_xe.hpp:48

gpu::xetla::kernel::xetla_row_reduction_t< dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe, fused_op_t_ >::reduction_attr
reduction_attr_ reduction_attr
Definition row_reduction_xe.hpp:47

gpu::xetla::kernel::xetla_row_reduction_t< dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe, fused_op_t_ >::work_group_t
work_group_t< wg_size_x *wg_size_y > work_group_t
Definition row_reduction_xe.hpp:60

gpu::xetla::kernel::xetla_row_reduction_t< dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe, fused_op_t_ >::fused_op_arguments_t
typename fused_op_t::arguments_t fused_op_arguments_t
Definition row_reduction_xe.hpp:49

gpu::xetla::kernel::xetla_row_reduction_t< dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe, fused_op_t_ >::dtype_acc
dtype_acc_ dtype_acc
Definition row_reduction_xe.hpp:46

gpu::xetla::kernel::xetla_row_reduction_t< dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe, fused_op_t_ >::dtype_in
dtype_in_ dtype_in
Definition row_reduction_xe.hpp:44

gpu::xetla::kernel::xetla_row_reduction_t< dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe, fused_op_t_ >::arguments_t::matrix_n
uint32_t matrix_n
Definition row_reduction_xe.hpp:116

gpu::xetla::kernel::xetla_row_reduction_t< dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe, fused_op_t_ >::arguments_t::mat_out_ptr
dtype_out * mat_out_ptr
Definition row_reduction_xe.hpp:114

gpu::xetla::kernel::xetla_row_reduction_t< dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe, fused_op_t_ >::arguments_t::mat_in_ptr
dtype_in * mat_in_ptr
Definition row_reduction_xe.hpp:113

gpu::xetla::kernel::xetla_row_reduction_t< dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe, fused_op_t_ >::arguments_t::matrix_m
uint32_t matrix_m
Definition row_reduction_xe.hpp:115

gpu::xetla::kernel::xetla_row_reduction_t< dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe, fused_op_t_ >::arguments_t::mat_in_ld
uint32_t mat_in_ld
Definition row_reduction_xe.hpp:117

gpu::xetla::kernel::xetla_row_reduction_t
Is the row_reduction functor.
Definition api.hpp:39

gpu::xetla::mem_desc_t
Definition memory_descriptor.hpp:139

gpu::xetla::subgroup::detail::gcd
Definition common.hpp:80

gpu::xetla::subgroup::mem_payload_t
Is to illustrate the memory information.
Definition api.hpp:44

gpu::xetla::subgroup::tile_desc_t
Is to illustrate the tile information about a sub matrix.
Definition api.hpp:64

gpu::xetla::subgroup::tile_t
Is a struct that contains a register file.
Definition api.hpp:99

gpu::xetla::subgroup::tile_t::reg
xetla_vector< dtype, tile_desc::tile_elems > reg
Definition api.hpp:102

gpu::xetla::xetla_nbarrier_t
xetla nbarrier definition API.
Definition raw_send_nbarrier.hpp:43

gpu::xetla::xetla_nbarrier_t::arrive
__XETLA_API void arrive()
named barrier signal from subgroup.
Definition raw_send_nbarrier.hpp:65

gpu::xetla::xetla_nbarrier_t::init_nbarrier
__XETLA_API void init_nbarrier(uint8_t nbarrier_id, nbarrier_role role=nbarrier_role::producer_consumer)
Definition raw_send_nbarrier.hpp:55

gpu::xetla::xetla_nbarrier_t::wait
__XETLA_API void wait()
named barrier wait within subgroup.
Definition raw_send_nbarrier.hpp:76