XeTLA v0.3.6
Intel® Xe Templates for Linear Algebra - API Definition Document
 
Loading...
Searching...
No Matches
gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ > Struct Template Reference

Is the epilogue functor specialized for stream_k. More...

#include <stream_k_op_xe.hpp>

Collaboration diagram for gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >:

Public Types

using epilogue_t = epilogue_t_
 
using mem_desc_d_t = mem_desc_d_t_
 
using mem_desc_c_t = typename epilogue_t::mem_desc_c_t
 
using mem_desc_atomic_sync_t = mem_desc_atomic_sync_t_
 
using tile_shape = tile_shape_
 
using epilogue_args_t = typename epilogue_t::arguments_t
 
using work_group_t = typename tile_shape::work_group_t
 
using dtype_d = typename mem_desc_d_t::dtype
 
using dtype_flag = typename mem_desc_atomic_sync_t::dtype
 
using residual_op_t = subgroup::elemwise_reduce_op_stream_k_t< reduce_op::sum, dtype_d >
 
using residual_op_args_t = typename residual_op_t::arguments_t
 

Public Member Functions

template<typename matAcc_t >
__XETLA_API KERNEL_FUNC void operator() (work_group_t &g, matAcc_t &matAcc, mem_desc_c_t mem_desc_c, mem_desc_d_t mem_desc_d, mem_desc_atomic_sync_t mem_desc_atomic_sync, int group_idx, int first_group_idx, bool tile_finished, bool tile_started, epilogue_args_t epilogue_args, uint32_t slm_base=0, uint32_t nbarrier_base=0)
 Epilogue for stream_k.
 

Static Public Member Functions

static __XETLA_API void update_sg_tile_tdesc (work_group_t &g, mem_desc_d_t &mem_desc_d)
 Updates tile base descriptor based on the tid.
 

Public Attributes

xetla_nbarrier_t< N_SG, N_SG, arch_tag > nbarrier
 

Static Public Attributes

static constexpr gpu_arch arch_tag = gpu_arch::Xe
 
static constexpr uint32_t wg_tile_m = tile_shape::wg_tile_size_y
 
static constexpr uint32_t wg_tile_n = tile_shape::wg_tile_size_x
 
static constexpr uint32_t sg_tile_m = tile_shape::sg_tile_size_y
 
static constexpr uint32_t sg_tile_n = tile_shape::sg_tile_size_x
 
static constexpr uint32_t wg_size_x = tile_shape::wg_size_x
 
static constexpr uint32_t wg_size_y = tile_shape::wg_size_y
 
static constexpr uint32_t barrier_count = 1
 
static constexpr uint32_t slm_size = mem_desc_c_t::is_local ? wg_tile_m * wg_tile_n : 0
 
static constexpr uint32_t N_SG = wg_size_x * wg_size_y
 
static constexpr mem_layout mem_layout_d = mem_desc_d_t::layout
 
static constexpr mem_space mem_space_d = mem_desc_d_t::space
 
static constexpr msg_type msg_type_d_block2d = msg_type::block_2d
 
static constexpr msg_type msg_type_d_atomic = msg_type::atomic_add
 

Detailed Description

template<typename tile_shape_, typename epilogue_t_, typename mem_desc_d_t_, typename mem_desc_atomic_sync_t_>
struct gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >

Is the epilogue functor specialized for stream_k.

Member Typedef Documentation

◆ dtype_d

template<typename tile_shape_ , typename epilogue_t_ , typename mem_desc_d_t_ , typename mem_desc_atomic_sync_t_ >
using gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::dtype_d = typename mem_desc_d_t::dtype

◆ dtype_flag

template<typename tile_shape_ , typename epilogue_t_ , typename mem_desc_d_t_ , typename mem_desc_atomic_sync_t_ >
using gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::dtype_flag = typename mem_desc_atomic_sync_t::dtype

◆ epilogue_args_t

template<typename tile_shape_ , typename epilogue_t_ , typename mem_desc_d_t_ , typename mem_desc_atomic_sync_t_ >
using gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::epilogue_args_t = typename epilogue_t::arguments_t

◆ epilogue_t

template<typename tile_shape_ , typename epilogue_t_ , typename mem_desc_d_t_ , typename mem_desc_atomic_sync_t_ >
using gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::epilogue_t = epilogue_t_

◆ mem_desc_atomic_sync_t

template<typename tile_shape_ , typename epilogue_t_ , typename mem_desc_d_t_ , typename mem_desc_atomic_sync_t_ >
using gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::mem_desc_atomic_sync_t = mem_desc_atomic_sync_t_

◆ mem_desc_c_t

template<typename tile_shape_ , typename epilogue_t_ , typename mem_desc_d_t_ , typename mem_desc_atomic_sync_t_ >
using gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::mem_desc_c_t = typename epilogue_t::mem_desc_c_t

◆ mem_desc_d_t

template<typename tile_shape_ , typename epilogue_t_ , typename mem_desc_d_t_ , typename mem_desc_atomic_sync_t_ >
using gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::mem_desc_d_t = mem_desc_d_t_

◆ residual_op_args_t

template<typename tile_shape_ , typename epilogue_t_ , typename mem_desc_d_t_ , typename mem_desc_atomic_sync_t_ >
using gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::residual_op_args_t = typename residual_op_t::arguments_t

◆ residual_op_t

template<typename tile_shape_ , typename epilogue_t_ , typename mem_desc_d_t_ , typename mem_desc_atomic_sync_t_ >
using gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::residual_op_t = subgroup::elemwise_reduce_op_stream_k_t<reduce_op::sum, dtype_d>

◆ tile_shape

template<typename tile_shape_ , typename epilogue_t_ , typename mem_desc_d_t_ , typename mem_desc_atomic_sync_t_ >
using gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::tile_shape = tile_shape_

◆ work_group_t

template<typename tile_shape_ , typename epilogue_t_ , typename mem_desc_d_t_ , typename mem_desc_atomic_sync_t_ >
using gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::work_group_t = typename tile_shape::work_group_t

Member Function Documentation

◆ operator()()

template<typename tile_shape_ , typename epilogue_t_ , typename mem_desc_d_t_ , typename mem_desc_atomic_sync_t_ >
template<typename matAcc_t >
__XETLA_API KERNEL_FUNC void gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::operator() ( work_group_t &  g,
matAcc_t &  matAcc,
mem_desc_c_t  mem_desc_c,
mem_desc_d_t  mem_desc_d,
mem_desc_atomic_sync_t  mem_desc_atomic_sync,
int  group_idx,
int  first_group_idx,
bool  tile_finished,
bool  tile_started,
epilogue_args_t  epilogue_args,
uint32_t  slm_base = 0,
uint32_t  nbarrier_base = 0 
)
inline

Epilogue for stream_k.

Differentiates between non-finishing SK groups, finishing SK groups, and DP groups. Initial SK groups perform atomic writes to scratch space. Final SK groups wait for their peers to finish, read the partial data from scratch space, and reduce it in GRF. DP groups and finishing SK groups perform the regular epilogue operations.

Template Parameters
matAcc_t  Is the type of the input tile.
Parameters
g  Is the workgroup of the current tile.
matAcc  Is the input tile.
mem_desc_c  Is the memory description of matC, including base, shape and coordinate.
dp_group  Indicates whether the current group is data-parallel or stream_k.

◆ update_sg_tile_tdesc()

template<typename tile_shape_ , typename epilogue_t_ , typename mem_desc_d_t_ , typename mem_desc_atomic_sync_t_ >
static __XETLA_API void gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::update_sg_tile_tdesc ( work_group_t &  g,
mem_desc_d_t &  mem_desc_d 
)
inline static

Updates tile base descriptor based on the tid.

Member Data Documentation

◆ arch_tag

template<typename tile_shape_ , typename epilogue_t_ , typename mem_desc_d_t_ , typename mem_desc_atomic_sync_t_ >
constexpr gpu_arch gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::arch_tag = gpu_arch::Xe
static constexpr

◆ barrier_count

template<typename tile_shape_ , typename epilogue_t_ , typename mem_desc_d_t_ , typename mem_desc_atomic_sync_t_ >
constexpr uint32_t gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::barrier_count = 1
static constexpr

◆ mem_layout_d

template<typename tile_shape_ , typename epilogue_t_ , typename mem_desc_d_t_ , typename mem_desc_atomic_sync_t_ >
constexpr mem_layout gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::mem_layout_d = mem_desc_d_t::layout
static constexpr

◆ mem_space_d

template<typename tile_shape_ , typename epilogue_t_ , typename mem_desc_d_t_ , typename mem_desc_atomic_sync_t_ >
constexpr mem_space gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::mem_space_d = mem_desc_d_t::space
static constexpr

◆ msg_type_d_atomic

template<typename tile_shape_ , typename epilogue_t_ , typename mem_desc_d_t_ , typename mem_desc_atomic_sync_t_ >
constexpr msg_type gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::msg_type_d_atomic = msg_type::atomic_add
static constexpr

◆ msg_type_d_block2d

template<typename tile_shape_ , typename epilogue_t_ , typename mem_desc_d_t_ , typename mem_desc_atomic_sync_t_ >
constexpr msg_type gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::msg_type_d_block2d = msg_type::block_2d
static constexpr

◆ N_SG

template<typename tile_shape_ , typename epilogue_t_ , typename mem_desc_d_t_ , typename mem_desc_atomic_sync_t_ >
constexpr uint32_t gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::N_SG = wg_size_x * wg_size_y
static constexpr

◆ nbarrier

template<typename tile_shape_ , typename epilogue_t_ , typename mem_desc_d_t_ , typename mem_desc_atomic_sync_t_ >
xetla_nbarrier_t<N_SG, N_SG, arch_tag> gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::nbarrier

◆ sg_tile_m

template<typename tile_shape_ , typename epilogue_t_ , typename mem_desc_d_t_ , typename mem_desc_atomic_sync_t_ >
constexpr uint32_t gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::sg_tile_m = tile_shape::sg_tile_size_y
static constexpr

◆ sg_tile_n

template<typename tile_shape_ , typename epilogue_t_ , typename mem_desc_d_t_ , typename mem_desc_atomic_sync_t_ >
constexpr uint32_t gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::sg_tile_n = tile_shape::sg_tile_size_x
static constexpr

◆ slm_size

template<typename tile_shape_ , typename epilogue_t_ , typename mem_desc_d_t_ , typename mem_desc_atomic_sync_t_ >
constexpr uint32_t gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::slm_size = mem_desc_c_t::is_local ? wg_tile_m * wg_tile_n : 0
static constexpr

◆ wg_size_x

template<typename tile_shape_ , typename epilogue_t_ , typename mem_desc_d_t_ , typename mem_desc_atomic_sync_t_ >
constexpr uint32_t gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::wg_size_x = tile_shape::wg_size_x
static constexpr

◆ wg_size_y

template<typename tile_shape_ , typename epilogue_t_ , typename mem_desc_d_t_ , typename mem_desc_atomic_sync_t_ >
constexpr uint32_t gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::wg_size_y = tile_shape::wg_size_y
static constexpr

◆ wg_tile_m

template<typename tile_shape_ , typename epilogue_t_ , typename mem_desc_d_t_ , typename mem_desc_atomic_sync_t_ >
constexpr uint32_t gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::wg_tile_m = tile_shape::wg_tile_size_y
static constexpr

◆ wg_tile_n

template<typename tile_shape_ , typename epilogue_t_ , typename mem_desc_d_t_ , typename mem_desc_atomic_sync_t_ >
constexpr uint32_t gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::wg_tile_n = tile_shape::wg_tile_size_x
static constexpr