#include "subgroup/subgroup.hpp"

/// @brief Cross group global reduction.
/// Primary template; specialized below per architecture and reduction count.
template <reduce_op reduce_kind, typename tile_shape_acc,
        typename tile_shape_cnt, typename mem_desc_acc_t,
        typename mem_desc_cnt_t, uint32_t num_group_reduction,
        uint32_t counter_size, gpu_arch arch_tag, class enable = void>
class global_reduce_t {};
/// @brief Cross group global reduction, specialized for reduce_op::sum on the Xe architecture.
template <typename tile_shape_acc_, typename tile_shape_cnt_,
        typename mem_desc_acc_t_, typename mem_desc_cnt_t_,
        uint32_t num_group_reduction, uint32_t counter_size,
        gpu_arch arch_tag_>
class global_reduce_t<reduce_op::sum, tile_shape_acc_, tile_shape_cnt_,
        mem_desc_acc_t_, mem_desc_cnt_t_, num_group_reduction, counter_size,
        arch_tag_, std::enable_if_t<(arch_tag_ == gpu_arch::Xe)>> {
public:
    static constexpr gpu_arch arch_tag = arch_tag_;
    using tile_shape_acc = tile_shape_acc_;
    using tile_shape_cnt = tile_shape_cnt_;
    using mem_desc_acc_t = mem_desc_acc_t_;
    using mem_desc_cnt_t = mem_desc_cnt_t_;
    using dtype_acc = typename mem_desc_acc_t::dtype;
    using dtype_cnt = typename mem_desc_cnt_t::dtype;

    // Per-subgroup tile sizes for the accumulation (acc) and counter (cnt) surfaces.
    static constexpr uint32_t acc_sg_tile_y = tile_shape_acc::sg_tile_size_y;
    static constexpr uint32_t acc_sg_tile_x = tile_shape_acc::sg_tile_size_x;
    static constexpr uint32_t cnt_sg_tile_y = tile_shape_cnt::sg_tile_size_y;
    static constexpr uint32_t cnt_sg_tile_x = tile_shape_cnt::sg_tile_size_x;
    static constexpr uint32_t wg_size_x = tile_shape_acc::wg_size_x;
    static constexpr uint32_t wg_size_y = tile_shape_acc::wg_size_y;
    static_assert((tile_shape_acc::wg_size_x == tile_shape_cnt::wg_size_x)
                    && (tile_shape_acc::wg_size_y == tile_shape_cnt::wg_size_y),
            "acc and cnt workgroup shapes need to match");
    using work_group_t = typename tile_shape_acc::work_group_t;
    // Move each subgroup's coordinates in the acc/cnt descriptors to its own tile.
    inline void update_sg_tile_tdesc(work_group_t &g,
            mem_desc_acc_t &mem_desc_acc, mem_desc_cnt_t &mem_desc_cnt) {
        int32_t sg_idx = g.get_id() % wg_size_x;
        int32_t sg_idy = g.get_id() / wg_size_x;
        int32_t acc_tile_offset_x = sg_idx * acc_sg_tile_x;
        int32_t acc_tile_offset_y = sg_idy * acc_sg_tile_y;
        mem_desc_acc.update_coord(acc_tile_offset_x, acc_tile_offset_y);
        int32_t cnt_tile_offset_x = sg_idx * cnt_sg_tile_x;
        int32_t cnt_tile_offset_y = sg_idy * cnt_sg_tile_y;
        mem_desc_cnt.update_coord(cnt_tile_offset_x, cnt_tile_offset_y);
    }
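    // Worked example (illustrative numbers, not from this header): with
    // wg_size_x = 4, acc_sg_tile_x = 16 and acc_sg_tile_y = 8, subgroup id 6
    // maps to sg_idx = 6 % 4 = 2 and sg_idy = 6 / 4 = 1, so its accumulator
    // tile starts at column 2 * 16 = 32 and row 1 * 8 = 8 of the workgroup tile.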
    // Atomically bump this tile's counter; returns the pre-increment value,
    // i.e. how many workgroups had already finished their partial store.
    inline uint32_t update_reduce_counter(mem_desc_cnt_t &mem_desc_cnt) {
        constexpr uint32_t SIMD = 16;
        uint32_t pitch_in_bytes
                = mem_desc_cnt.shape.stride * sizeof(dtype_cnt) * counter_size;
        uint32_t offset_x = mem_desc_cnt.coord.x;
        uint32_t offset_y = mem_desc_cnt.coord.y;
        uint64_t address = (uint64_t)mem_desc_cnt.base.base
                + offset_y * pitch_in_bytes
                + offset_x * sizeof(dtype_cnt) * counter_size;
        xetla_vector<uint32_t, SIMD> offsets
                = xetla_vector_gen<uint32_t, SIMD>(0, 1);
        offsets *= sizeof(dtype_cnt);
        // Only lane 0 participates; atomic_op::iinc returns the old counter value
        // (data-size / cache-hint template arguments omitted in this listing).
        xetla_mask<SIMD> pred(0);
        pred[0] = 1;
        xetla_vector<dtype_cnt, SIMD> ret
                = xetla_atomic_global<atomic_op::iinc, dtype_cnt, SIMD>(
                        (dtype_cnt *)address, offsets, pred);
        return ret[0];
    }
    // Reset this tile's counter to zero so the buffer can be reused.
    inline void clean_reduce_counter(mem_desc_cnt_t &mem_desc_cnt) {
        uint32_t pitch_in_bytes
                = mem_desc_cnt.shape.stride * sizeof(dtype_cnt) * counter_size;
        uint32_t offset_x = mem_desc_cnt.coord.x;
        uint32_t offset_y = mem_desc_cnt.coord.y;
        uint64_t address = (uint64_t)mem_desc_cnt.base.base
                + offset_y * pitch_in_bytes
                + offset_x * sizeof(dtype_cnt) * counter_size;
        xetla_vector<dtype_cnt, 1> zeros(0);
        // Scattered store of a single zero element
        // (data-size / cache-hint template arguments omitted in this listing).
        xetla_store_global<dtype_cnt, 1>(
                (dtype_cnt *)address, 0, zeros);
    }
    static constexpr uint32_t barrier_count = 0;
    static constexpr uint32_t slm_size = 0;
    uint32_t reduce_id = 0;

    // True only for the workgroup that performed the final accumulation.
    inline bool is_last_group() {
        return reduce_id == (num_group_reduction - 1);
    }
    /// @brief Global reduction.
    /// Stores this group's partial tile with atomic adds, then uses the counter
    /// to detect the last contributing group, which loads the fully reduced tile
    /// back into matAcc and resets the buffers.
    template <typename matAcc_t>
    __XETLA_API KERNEL_FUNC void operator()(work_group_t &g, matAcc_t &matAcc,
            mem_desc_acc_t mem_desc_acc, mem_desc_cnt_t mem_desc_cnt,
            [[maybe_unused]] uint32_t slm_base = 0,
            [[maybe_unused]] uint32_t nbarrier_base = 0) {
        static_assert(std::is_same<typename matAcc_t::dtype, dtype_acc>::value,
                "matAcc_t::dtype should match dtype_acc");
        update_sg_tile_tdesc(g, mem_desc_acc, mem_desc_cnt);
        using matAcc_tile_desc_t = typename matAcc_t::tile_desc;
        // Payload that scatters the partial tile into the accumulation buffer with atomic adds.
        using matAcc_store_payload_t = subgroup::mem_payload_t<mem_desc_acc_t,
                matAcc_tile_desc_t, msg_type::atomic_add, arch_tag>;
        matAcc_store_payload_t matAcc_store_payload(mem_desc_acc);
        subgroup::tile_store<cache_hint::uncached, cache_hint::write_back>(
                matAcc, matAcc_store_payload);
        // Make the partial-sum stores globally visible before touching the counter.
        xetla_fence<memory_kind::untyped_global>();
        SW_BARRIER();
        reduce_id = update_reduce_counter(mem_desc_cnt);
        if (reduce_id == (num_group_reduction - 1)) {
            // Last group: read back the fully reduced tile, then zero the
            // counter and the accumulation buffer for reuse.
            using matAcc_payload_t = subgroup::mem_payload_t<mem_desc_acc_t,
                    matAcc_tile_desc_t, msg_type::block_2d, arch_tag>;
            matAcc_payload_t matAcc_payload(mem_desc_acc);
            subgroup::tile_load(matAcc, matAcc_payload);
            clean_reduce_counter(mem_desc_cnt);
            matAcc_t mat_zero;
            mat_zero.reg = 0;
            subgroup::tile_store<cache_hint::uncached, cache_hint::write_back>(
                    mat_zero, matAcc_payload);
            SW_BARRIER();
        }
    }
};
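The control flow above is the familiar "last workgroup does the cleanup" pattern: every group atomically adds its partial tile into a scratch buffer, then atomically increments a per-tile counter, and the group that sees the counter at num_group_reduction - 1 knows all partial sums have landed. The host-side C++ sketch below illustrates only that protocol and assumes nothing about the XeTLA API; std::atomic::fetch_add stands in for the atomic_op::iinc counter update, and the thread count and partial values are made up for the example.

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
    constexpr int num_groups = 8;          // stands in for num_group_reduction
    std::atomic<int> accumulator{0};       // stands in for the acc scratch buffer
    std::atomic<int> counter{0};           // stands in for the cnt buffer

    std::vector<std::thread> groups;
    for (int g = 0; g < num_groups; ++g) {
        groups.emplace_back([&, g] {
            int partial = g + 1;                 // this group's partial result
            accumulator.fetch_add(partial);      // atomic-add store of the tile
            int old = counter.fetch_add(1);      // "update_reduce_counter"
            if (old == num_groups - 1) {         // "is_last_group"
                int total = accumulator.load();  // tile_load of the reduced result
                counter.store(0);                // "clean_reduce_counter"
                accumulator.store(0);            // store of mat_zero
                std::printf("reduced total = %d\n", total);
            }
        });
    }
    for (auto &t : groups) t.join();
    return 0;
}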
/// @brief Cross group global reduction, specialized for num_group_reduction == 1,
/// where a single workgroup owns each tile and no cross-group accumulation is needed.
template <typename tile_shape_acc_, typename tile_shape_cnt_,
        typename mem_desc_acc_t_, typename mem_desc_cnt_t_,
        uint32_t counter_size_, gpu_arch arch_tag_>
class global_reduce_t<reduce_op::sum, tile_shape_acc_, tile_shape_cnt_,
        mem_desc_acc_t_, mem_desc_cnt_t_, 1, counter_size_, arch_tag_,
        std::enable_if_t<(arch_tag_ == gpu_arch::Xe)>> {
public:
    using tile_shape_acc = tile_shape_acc_;
    using tile_shape_cnt = tile_shape_cnt_;
    using mem_desc_acc_t = mem_desc_acc_t_;
    using mem_desc_cnt_t = mem_desc_cnt_t_;
    using dtype_acc = typename mem_desc_acc_t::dtype;
    using work_group_t = typename tile_shape_acc::work_group_t;

    static constexpr uint32_t barrier_count = 0;
    static constexpr uint32_t slm_size = 0;

    inline bool is_last_group() { return true; }

    // No cross-group reduction to perform; the functor is a no-op.
    template <typename matAcc_t>
    KERNEL_FUNC void operator()([[maybe_unused]] work_group_t &g,
            [[maybe_unused]] matAcc_t &matAcc,
            [[maybe_unused]] mem_desc_acc_t mem_desc_acc,
            [[maybe_unused]] mem_desc_cnt_t mem_desc_cnt,
            [[maybe_unused]] uint32_t slm_base = 0,
            [[maybe_unused]] uint32_t nbarrier_base = 0) {}
};
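For orientation, here is a rough sketch of how a split-k style kernel might instantiate this reducer. The tile_shape_t and mem_desc_t spellings, argument orders, and the concrete tile sizes below are assumptions made for illustration (they are not defined in this header) and may differ between XeTLA releases.

// Illustrative instantiation only; tile_shape_t / mem_desc_t argument orders are assumed.
using tile_shape = gpu::xetla::group::tile_shape_t<256, 128, 64, 32>;  // wg and sg tile sizes (assumed order)
using mem_desc_acc = gpu::xetla::mem_desc_t<float,                     // partial-sum scratch buffer
        gpu::xetla::mem_layout::row_major, gpu::xetla::mem_space::global>;
using mem_desc_cnt = gpu::xetla::mem_desc_t<uint32_t,                  // per-tile counters
        gpu::xetla::mem_layout::row_major, gpu::xetla::mem_space::global>;

// Four workgroups (e.g. four k-slices) cooperate on each output tile.
using global_reduce = gpu::xetla::group::global_reduce_t<
        gpu::xetla::reduce_op::sum, tile_shape, tile_shape,
        mem_desc_acc, mem_desc_cnt,
        /*num_group_reduction*/ 4, /*counter_size*/ 1, gpu::xetla::gpu_arch::Xe>;

// Inside the kernel, after each group has produced its partial matAcc:
//     global_reduce reducer;
//     reducer(g, matAcc, mem_desc_acc, mem_desc_cnt);
//     if (reducer.is_last_group()) {
//         // matAcc now holds the fully reduced tile; write it to the output.
//     }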