xetla/utils_2misc_8hpp_source.html

/*******************************************************************************

* Copyright (c) 2022-2023 Intel Corporation

*

* Licensed under the Apache License, Version 2.0 (the "License");

* you may not use this file except in compliance with the License.

* You may obtain a copy of the License at

*

*     http://www.apache.org/licenses/LICENSE-2.0

*

* Unless required by applicable law or agreed to in writing, software

* distributed under the License is distributed on an "AS IS" BASIS,

* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

* See the License for the specific language governing permissions and

* limitations under the License.

*******************************************************************************/


#pragma once


#include "common/utils/common.hpp"


/// @brief Divides n by d and rounds the result up to the next whole number.
/// @param n Dividend.
/// @param d Divisor. Must be non-zero; division by zero is not guarded here.
/// @return The smallest value q such that q * d >= n.
/// @note Uses quotient plus a remainder correction rather than
///       (n + d - 1) / d, because the latter wraps around in 32-bit unsigned
///       arithmetic when n is close to UINT32_MAX and yields a wrong result.
__XETLA_API constexpr uint32_t div_round_up(uint32_t n, uint32_t d) {
    return n / d + (n % d != 0 ? 1u : 0u);
}


/// @brief Floored integer division: rounds the quotient toward negative
///        infinity instead of toward zero.
///        e.g. -2 / 3 ~ -0.666 -> -1, whereas built-in division gives 0.
/// @param n Dividend.
/// @param d Divisor. Must be non-zero; division by zero is not guarded here.
/// @return floor(n / d).
/// @note Built from the truncating quotient and remainder so that no
///       intermediate sum can overflow (the previous form evaluated
///       (n % d) + d and n - r, both of which overflow for operands near the
///       limits of int).
__XETLA_API constexpr int div_round_down(int n, int d) {
    const int quotient = n / d;
    const int remainder = n % d;
    // Built-in division truncates toward zero. The truncated quotient is one
    // too large exactly when the division is inexact and the true result is
    // negative, i.e. when the remainder and the divisor have opposite signs.
    const bool round_further_down
            = (remainder != 0) && ((remainder < 0) != (d < 0));
    return round_further_down ? quotient - 1 : quotient;
}


/// @brief Modulo based on the definition that uses floored division.
///        The result is zero or has the same sign as d.
/// @param n Dividend.
/// @param d Divisor. Must be non-zero; division by zero is not guarded here.
/// @return n - d * floor(n / d).
/// @note The built-in remainder takes the sign of n. It is shifted by d only
///       when it is non-zero and its sign differs from d; in that case the
///       two addends have opposite signs, so the sum cannot overflow (the
///       previous form evaluated d + (n % d) unconditionally, which
///       overflows for large |d|).
__XETLA_API constexpr int modulo(int n, int d) {
    const int remainder = n % d;
    const bool wrong_sign = (remainder != 0) && ((remainder < 0) != (d < 0));
    return wrong_sign ? remainder + d : remainder;
}


/// @brief Pads the given allocation size up to the nearest cache-line
///        multiple.
/// @param size Requested allocation size.
/// @return size rounded up to the next multiple of 256, converted to
///         uint32_t (values that do not fit in 32 bits are truncated by the
///         return type).
__XETLA_API constexpr uint32_t cacheline_align_up(size_t size) {
    constexpr size_t line_size = 256;
    // Bump past the next boundary, then drop whatever overshoots it.
    const size_t bumped = size + line_size - 1;
    return static_cast<uint32_t>(bumped - bumped % line_size);
}


namespace gpu::xetla {


/// @brief Returns the time-stamp vector.
/// @return A 4-lane uint32_t vector. As implemented here every lane is
///         initialised to 0; no counter is read in this function.
__XETLA_API xetla_vector<uint32_t, 4> get_time_stamp() {
    // Scalar 0 initialises the whole vector.
    xetla_vector<uint32_t, 4> stamp = 0;
    return stamp;
}


/// @brief Generates an xetla_vector from an initial value and a step.
/// @tparam Ty Element type of the generated vector.
/// @tparam N Number of lanes.
/// @param InitVal First constructor argument (initial value).
/// @param Step Second constructor argument (step).
/// @return The vector built by the two-argument (value, step) constructor;
///         presumably the progression InitVal, InitVal + Step, ... as in the
///         underlying ESIMD simd type -- confirm against base_types.hpp.
template <typename Ty, int N>
__XETLA_API xetla_vector<Ty, N> xetla_vector_gen(int InitVal, int Step) {
    return xetla_vector<Ty, N>(InitVal, Step);
}


/// @brief Builds an N-lane xetla_mask_int from a packed 32-bit mask value.
/// @tparam N Number of lanes in the resulting mask.
/// @param mask_val Packed mask handed to ESIMD unpack_mask<N>; presumably
///        one bit per lane -- confirm against the ESIMD documentation.
/// @return The unpacked mask.
template <uint32_t N>
__XETLA_API xetla_mask_int<N> xetla_mask_int_gen(uint32_t mask_val) {
    return sycl::ext::intel::esimd::unpack_mask<N>(mask_val);
}


/// @brief Scales a vector and then applies a drop-out mask to it.
///
/// Every lane of @p in is multiplied by @p scale. The result is then
/// processed in chunks of num_flag * 16 lanes: for each chunk a flag mask is
/// computed as (mask > 0) and merged with the constant 0. Presumably the
/// merge writes 0 into the lanes whose flag is set, i.e. a non-zero mask
/// entry drops the lane -- confirm against the xetla_merge definition.
///
/// @tparam dtype_acc Element type of the input/output vectors and of scale.
/// @tparam N Number of lanes in the vectors.
/// @tparam num_flag Chunk-size multiplier; each merge covers num_flag * 16 lanes.
/// @tparam dtype_mask Element type of the mask vector.
/// @param in Input vector.
/// @param mask Per-lane mask, compared against 0.
/// @param scale Scalar factor applied to every lane before masking.
/// @return The scaled vector after the masked merge with 0.
template <typename dtype_acc, uint32_t N, uint32_t num_flag = 4,

        typename dtype_mask = uint8_t>

__XETLA_API xetla_vector<dtype_acc, N> drop_out(xetla_vector<dtype_acc, N> in,

        xetla_vector<dtype_mask, N> mask, dtype_acc scale) {

    xetla_vector<dtype_acc, N> out = in * scale;

    constexpr uint32_t unroll_size = num_flag * 16;
    // Scheduling barrier between the scaling and the masked merges.
    SW_BARRIER();
    // Full chunks of unroll_size lanes.
#pragma unroll

    for (uint32_t i = 0; i < N / unroll_size; i++) {

        xetla_mask<unroll_size> mask_flag

                = mask.xetla_select<unroll_size, 1>(i * unroll_size) > 0;

        out.xetla_select<unroll_size, 1>(i * unroll_size)

                .xetla_merge(0, mask_flag);

    }
    // Tail: lanes left over when N is not a multiple of unroll_size.
    if constexpr (N % unroll_size != 0) {

        constexpr uint32_t remain_len = N % unroll_size;

        constexpr uint32_t remain_start = N / unroll_size * unroll_size;

        xetla_mask<remain_len> mask_flag

                = mask.xetla_select<remain_len, 1>(remain_start) > 0;

        out.xetla_select<remain_len, 1>(remain_start).xetla_merge(0, mask_flag);

    }

    return out;

}


/// @brief Element-wise combine step for reduce_op::sum.
/// @tparam reduce_kind Must be reduce_op::sum for this overload to be chosen.
/// @tparam dtype Element type.
/// @tparam size Number of lanes.
/// @param a First operand.
/// @param b Second operand.
/// @return a + b.
template <reduce_op reduce_kind, typename dtype, int size>
__XETLA_API typename std::enable_if_t<reduce_kind == reduce_op::sum,
        xetla_vector<dtype, size>>
reduce_helper(xetla_vector<dtype, size> a, xetla_vector<dtype, size> b) {
    xetla_vector<dtype, size> lane_sums = a + b;
    return lane_sums;
}


/// @brief Element-wise combine step for reduce_op::prod.
/// @tparam reduce_kind Must be reduce_op::prod for this overload to be chosen.
/// @tparam dtype Element type.
/// @tparam size Number of lanes.
/// @param a First operand.
/// @param b Second operand.
/// @return a * b.
template <reduce_op reduce_kind, typename dtype, int size>
__XETLA_API typename std::enable_if_t<reduce_kind == reduce_op::prod,
        xetla_vector<dtype, size>>
reduce_helper(xetla_vector<dtype, size> a, xetla_vector<dtype, size> b) {
    xetla_vector<dtype, size> lane_products = a * b;
    return lane_products;
}


/// @brief Element-wise combine step for reduce_op::max.
/// @tparam reduce_kind Must be reduce_op::max for this overload to be chosen.
/// @tparam dtype Element type.
/// @tparam size Number of lanes.
/// @param a First operand.
/// @param b Second operand.
/// @return The merge of a and b under the mask (a > b); presumably lanes
///         where the mask is set take a and all other lanes take b --
///         confirm against the xetla_merge definition.
template <reduce_op reduce_kind, typename dtype, int size>
__XETLA_API typename std::enable_if_t<reduce_kind == reduce_op::max,
        xetla_vector<dtype, size>>
reduce_helper(xetla_vector<dtype, size> a, xetla_vector<dtype, size> b) {
    xetla_mask<size> a_is_larger = a > b;
    xetla_vector<dtype, size> larger;
    larger.xetla_merge(a, b, a_is_larger);
    return larger;
}


/// @brief Element-wise combine step for reduce_op::min.
/// @tparam reduce_kind Must be reduce_op::min for this overload to be chosen.
/// @tparam dtype Element type.
/// @tparam size Number of lanes.
/// @param a First operand.
/// @param b Second operand.
/// @return The merge of a and b under the mask (a < b); presumably lanes
///         where the mask is set take a and all other lanes take b --
///         confirm against the xetla_merge definition.
template <reduce_op reduce_kind, typename dtype, int size>
__XETLA_API typename std::enable_if_t<reduce_kind == reduce_op::min,
        xetla_vector<dtype, size>>
reduce_helper(xetla_vector<dtype, size> a, xetla_vector<dtype, size> b) {
    xetla_mask<size> a_is_smaller = a < b;
    xetla_vector<dtype, size> smaller;
    smaller.xetla_merge(a, b, a_is_smaller);
    return smaller;
}


/// @brief Terminal case of the recursive row reduction, selected when
///        N_y == 1.
/// @tparam reduce_kind Reduction operation (unused in this overload).
/// @tparam dtype Element type.
/// @tparam N_x Number of elements in the vector.
/// @tparam N_y Must be 1 for this overload to be chosen.
/// @param in Vector of N_x elements.
/// @return in, unchanged.
template <reduce_op reduce_kind, typename dtype, int N_x, int N_y>

__XETLA_API typename std::enable_if_t<N_y == 1, xetla_vector<dtype, N_x>>

recur_row_reduce(xetla_vector<dtype, N_x> in) {

    return in;

}

/// @brief Recursive step of the row reduction, selected when N_y > 1.
///
/// Combines the first half of the input with the second half using
/// reduce_helper<reduce_kind>, then recurses with N_y halved until the
/// N_y == 1 overload returns the remaining N_x elements.
/// @tparam reduce_kind Reduction operation forwarded to reduce_helper.
/// @tparam dtype Element type.
/// @tparam N_x Number of elements in the final result.
/// @tparam N_y Number of N_x-sized segments in the input; must be a power of 2.
/// @param in Vector of N_x * N_y elements.
/// @return Vector of N_x elements.
template <reduce_op reduce_kind, typename dtype, int N_x, int N_y>
__XETLA_API typename std::enable_if_t<(N_y > 1), xetla_vector<dtype, N_x>>
recur_row_reduce(xetla_vector<dtype, N_x * N_y> in) {
    static_assert(((N_y) & (N_y - 1)) == 0, "N_y should be power of 2");
    // Length of each half of the input.
    constexpr int half_len = N_x * N_y / 2;
    xetla_vector<dtype, half_len> folded
            = reduce_helper<reduce_kind, dtype, half_len>(
                    in.xetla_select<half_len, 1>(0),
                    in.xetla_select<half_len, 1>(half_len));
    return recur_row_reduce<reduce_kind, dtype, N_x, N_y / 2>(folded);
}


/// @brief Terminal case of the recursive column reduction, selected when
///        N_x == 1.
/// @tparam reduce_kind Reduction operation (unused in this overload).
/// @tparam dtype Element type.
/// @tparam N_x Must be 1 for this overload to be chosen.
/// @tparam N_y Number of elements in the vector.
/// @param in Vector of N_y elements.
/// @return in, unchanged.
template <reduce_op reduce_kind, typename dtype, int N_x, int N_y>

__XETLA_API typename std::enable_if_t<N_x == 1, xetla_vector<dtype, N_y>>

recur_col_reduce(xetla_vector<dtype, N_y> in) {

    return in;

}

/// @brief Recursive step of the column reduction, selected when N_x > 1.
///
/// Reinterprets the input through xetla_format<dtype, N_y, N_x> (presumably
/// an N_y-row by N_x-column view -- confirm against xetla_format), combines
/// the two N_x / 2 wide regions starting at offsets (0, 0) and (0, N_x / 2)
/// with reduce_helper<reduce_kind>, and recurses with N_x halved until the
/// N_x == 1 overload returns the remaining N_y elements.
/// @tparam reduce_kind Reduction operation forwarded to reduce_helper.
/// @tparam dtype Element type.
/// @tparam N_x Width being reduced; must be a power of 2.
/// @tparam N_y Number of elements in the final result.
/// @param in Vector of N_x * N_y elements.
/// @return Vector of N_y elements.
template <reduce_op reduce_kind, typename dtype, int N_x, int N_y>
__XETLA_API typename std::enable_if_t<(N_x > 1), xetla_vector<dtype, N_y>>
recur_col_reduce(xetla_vector<dtype, N_x * N_y> in) {
    static_assert(((N_x) & (N_x - 1)) == 0, "N_x should be power of 2");
    constexpr int half_width = N_x / 2;
    // Element count left after one fold.
    constexpr int folded_len = N_x * N_y / 2;
    auto grid = in.xetla_format<dtype, N_y, N_x>();
    xetla_vector<dtype, folded_len> folded
            = reduce_helper<reduce_kind, dtype, folded_len>(
                    grid.xetla_select<N_y, 1, half_width, 1>(0, 0),
                    grid.xetla_select<N_y, 1, half_width, 1>(0, half_width));
    return recur_col_reduce<reduce_kind, dtype, half_width, N_y>(folded);
}


/// @brief Gets the linear group id over the last two dimensions of a
///        3-dimensional nd_item.
/// @param item The work-item descriptor to query.
/// @return group(2) + group(1) * group_range(2), converted to uint32_t.
///         Dimension 0 does not contribute.
__XETLA_API uint32_t get_2d_group_linear_id(sycl::nd_item<3> &item) {
    const auto id_dim2 = item.get_group(2);
    const auto id_dim1 = item.get_group(1);
    const auto range_dim2 = item.get_group_range(2);
    return id_dim2 + id_dim1 * range_dim2;
}


} // namespace gpu::xetla

SW_BARRIER
#define SW_BARRIER()
SW_BARRIER: inserts a software scheduling barrier for better code control.
Definition common.hpp:227

__XETLA_API
#define __XETLA_API
Definition common.hpp:43

common.hpp
C++ API.

xetla_merge
#define xetla_merge
xetla merge.
Definition base_ops.hpp:60

gpu::xetla::xetla_mask_int
__ESIMD_NS::simd_mask< N > xetla_mask_int
wrapper for xetla_mask_int.
Definition base_types.hpp:172

gpu::xetla::xetla_vector
__ESIMD_NS::simd< native_type_t< Ty >, N > xetla_vector
wrapper for xetla_vector.
Definition base_types.hpp:149

gpu::xetla::xetla_mask
__ESIMD_NS::simd_mask< N > xetla_mask
wrapper for xetla_mask.
Definition base_types.hpp:165

gpu::xetla::recur_col_reduce
__XETLA_API std::enable_if_t< N_x==1, xetla_vector< dtype, N_y > > recur_col_reduce(xetla_vector< dtype, N_y > in)
Definition misc.hpp:162

gpu::xetla::drop_out
__XETLA_API xetla_vector< dtype_acc, N > drop_out(xetla_vector< dtype_acc, N > in, xetla_vector< dtype_mask, N > mask, dtype_acc scale)
Definition misc.hpp:87

gpu::xetla::get_time_stamp
__XETLA_API xetla_vector< uint32_t, 4 > get_time_stamp()
Returns time stamp.
Definition misc.hpp:57

gpu::xetla::reduce_helper
__XETLA_API std::enable_if_t< reduce_kind==reduce_op::sum, xetla_vector< dtype, size > > reduce_helper(xetla_vector< dtype, size > a, xetla_vector< dtype, size > b)
Definition misc.hpp:112

gpu::xetla::recur_row_reduce
__XETLA_API std::enable_if_t< N_y==1, xetla_vector< dtype, N_x > > recur_row_reduce(xetla_vector< dtype, N_x > in)
Definition misc.hpp:145

gpu::xetla::xetla_mask_int_gen
__XETLA_API xetla_mask_int< N > xetla_mask_int_gen(uint32_t mask_val)
Definition misc.hpp:79

gpu::xetla::xetla_vector_gen
__XETLA_API xetla_vector< Ty, N > xetla_vector_gen(int InitVal, int Step)
xetla_vector generation.
Definition misc.hpp:73

gpu::xetla::get_2d_group_linear_id
__XETLA_API uint32_t get_2d_group_linear_id(sycl::nd_item< 3 > &item)
get linear group id of the last two dimensions.
Definition misc.hpp:180

gpu::xetla
Definition arch_config.hpp:24

gpu::xetla::reduce_op::sum
@ sum

gpu::xetla::reduce_op::max
@ max

gpu::xetla::reduce_op::prod
@ prod

gpu::xetla::reduce_op::min
@ min

div_round_down
__XETLA_API constexpr int div_round_down(int n, int d)
Definition misc.hpp:30

modulo
__XETLA_API constexpr int modulo(int n, int d)
Definition misc.hpp:37

div_round_up
__XETLA_API constexpr uint32_t div_round_up(uint32_t n, uint32_t d)
Definition misc.hpp:24

cacheline_align_up
__XETLA_API constexpr uint32_t cacheline_align_up(size_t size)
Definition misc.hpp:42