Is the epilogue functor specialized for stream_k. More...
#include <stream_k_op_xe.hpp>

Public Types | |
| using | epilogue_t = epilogue_t_ |
| using | mem_desc_d_t = mem_desc_d_t_ |
| using | mem_desc_c_t = typename epilogue_t::mem_desc_c_t |
| using | mem_desc_atomic_sync_t = mem_desc_atomic_sync_t_ |
| using | tile_shape = tile_shape_ |
| using | epilogue_args_t = typename epilogue_t::arguments_t |
| using | work_group_t = typename tile_shape::work_group_t |
| using | dtype_d = typename mem_desc_d_t::dtype |
| using | dtype_flag = typename mem_desc_atomic_sync_t::dtype |
| using | residual_op_t = subgroup::elemwise_reduce_op_stream_k_t< reduce_op::sum, dtype_d > |
| using | residual_op_args_t = typename residual_op_t::arguments_t |
Public Member Functions | |
| template<typename matAcc_t > | |
| __XETLA_API KERNEL_FUNC void | operator() (work_group_t &g, matAcc_t &matAcc, mem_desc_c_t mem_desc_c, mem_desc_d_t mem_desc_d, mem_desc_atomic_sync_t mem_desc_atomic_sync, int group_idx, int first_group_idx, bool tile_finished, bool tile_started, epilogue_args_t epilogue_args, uint32_t slm_base=0, uint32_t nbarrier_base=0) |
| Epilogue for stream_k. | |
Static Public Member Functions | |
| static __XETLA_API void | update_sg_tile_tdesc (work_group_t &g, mem_desc_d_t &mem_desc_d) |
| Updates tile base descriptor based on the tid. | |
Public Attributes | |
| xetla_nbarrier_t< N_SG, N_SG, arch_tag > | nbarrier |
Static Public Attributes | |
| static constexpr gpu_arch | arch_tag = gpu_arch::Xe |
| static constexpr uint32_t | wg_tile_m = tile_shape::wg_tile_size_y |
| static constexpr uint32_t | wg_tile_n = tile_shape::wg_tile_size_x |
| static constexpr uint32_t | sg_tile_m = tile_shape::sg_tile_size_y |
| static constexpr uint32_t | sg_tile_n = tile_shape::sg_tile_size_x |
| static constexpr uint32_t | wg_size_x = tile_shape::wg_size_x |
| static constexpr uint32_t | wg_size_y = tile_shape::wg_size_y |
| static constexpr uint32_t | barrier_count = 1 |
| static constexpr uint32_t | slm_size = mem_desc_c_t::is_local ? wg_tile_m * wg_tile_n : 0 |
| static constexpr uint32_t | N_SG = wg_size_x * wg_size_y |
| static constexpr mem_layout | mem_layout_d = mem_desc_d_t::layout |
| static constexpr mem_space | mem_space_d = mem_desc_d_t::space |
| static constexpr msg_type | msg_type_d_block2d = msg_type::block_2d |
| static constexpr msg_type | msg_type_d_atomic = msg_type::atomic_add |
Is the epilogue functor specialized for stream_k.
| using gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::dtype_d = typename mem_desc_d_t::dtype |
| using gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::dtype_flag = typename mem_desc_atomic_sync_t::dtype |
| using gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::epilogue_args_t = typename epilogue_t::arguments_t |
| using gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::epilogue_t = epilogue_t_ |
| using gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::mem_desc_atomic_sync_t = mem_desc_atomic_sync_t_ |
| using gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::mem_desc_c_t = typename epilogue_t::mem_desc_c_t |
| using gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::mem_desc_d_t = mem_desc_d_t_ |
| using gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::residual_op_args_t = typename residual_op_t::arguments_t |
| using gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::residual_op_t = subgroup::elemwise_reduce_op_stream_k_t<reduce_op::sum, dtype_d> |
| using gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::tile_shape = tile_shape_ |
| using gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::work_group_t = typename tile_shape::work_group_t |
|
inline |
Epilogue for stream_k.
Differentiates between non-finishing stream_k (SK) groups, finishing SK groups, and data-parallel (DP) groups. Initial SK groups perform atomic writes to scratch space. Final SK groups wait for their peers to finish, read partial data from scratch space, and reduce it in GRF. DP groups and finishing SK groups perform regular epilogue operations.
| matAcc_t | Is the type of the input tile. |
| g | Is the workgroup of the current tile. |
| matAcc | Is the input tile. |
| mem_desc_c | Is the memory description of matC, including base, shape and coordinate. |
| dp_group | Indicates whether the current group is data-parallel or stream_k. |
|
inline static
Updates tile base descriptor based on the tid.
|
static constexpr
|
static constexpr
|
static constexpr
|
static constexpr
|
static constexpr
|
static constexpr
|
static constexpr
| xetla_nbarrier_t<N_SG, N_SG, arch_tag> gpu::xetla::group::epilogue_stream_k_t< tile_shape_, epilogue_t_, mem_desc_d_t_, mem_desc_atomic_sync_t_ >::nbarrier |
|
static constexpr
|
static constexpr
|
static constexpr
|
static constexpr
|
static constexpr
|
static constexpr
|
static constexpr