|
| struct | bias_add_op_t |
| | Is the bias_add op functor. More...
|
| |
| struct | bias_add_op_t< dtype_bias_, arch_tag, std::enable_if_t<(arch_tag==gpu_arch::Xe)> > |
| | Is the bias_add op functor, specialized for Xe architecture. More...
|
| |
| struct | chained_tile_op_arg_t |
| |
| struct | chained_tile_op_arg_t< idx, curr_args_t, remain_args_t... > |
| |
| struct | chained_tile_op_t |
| |
| struct | check_load |
| |
| struct | check_load< gpu_arch::Xe, dtype, mem_dtype > |
| |
| struct | check_store |
| |
| struct | check_store< gpu_arch::Xe, dtype, mem_dtype > |
| |
| class | cooperative_load_helper_t |
| | Helper to do the cooperative workgroups load. More...
|
| |
| class | cooperative_load_helper_t< matAcc_t_, mem_layout::col_major, num_cooperative_wg, arch_tag_, std::enable_if_t< gpu_arch::Xe==arch_tag_ > > |
| | Workgroups to do the cooperative load. Specialized for col_major and Xe architecture. More...
|
| |
| class | cooperative_load_helper_t< matAcc_t_, mem_layout::row_major, num_cooperative_wg, arch_tag_, std::enable_if_t< gpu_arch::Xe==arch_tag_ > > |
| | Workgroups to do the cooperative load. Specialized for row_major and Xe architecture. More...
|
| |
| struct | dequant_op_t |
| | Is the dequantization op functor. More...
|
| |
| struct | dequant_op_t< tile_op_t_, arch_tag, std::enable_if_t<(arch_tag==gpu_arch::Xe)> > |
| | Is the dequantization op functor, specialized for Xe architecture. More...
|
| |
| struct | dropout_op_t |
| | Is the dropout op functor. More...
|
| |
| struct | dropout_op_t< dtype_mask_, arch_tag, std::enable_if_t<(arch_tag==gpu_arch::Xe)> > |
| | Is the dropout op functor, specialized for Xe architecture. More...
|
| |
| struct | elemwise_reduce_op_stream_k_t |
| | Is the element-wise reduce op functor, specialized for stream_k dispatch: load partial sum from scratchspace, reduce in GRF, then store zero to scratchspace. Do these steps with smaller tiles to minimize GRF pressure. More...
|
| |
| struct | elemwise_reduce_op_stream_k_t< reduce_kind_, dtype_in_, gpu_arch::Xe > |
| | Is the element-wise reduce op functor, specialized for Xe architecture. More...
|
| |
| struct | elemwise_reduce_op_t |
| | Is the element-wise reduce op functor. More...
|
| |
| struct | elemwise_reduce_op_t< reduce_kind_, dtype_in_, arch_tag, std::enable_if_t<(arch_tag==gpu_arch::Xe)> > |
| | Is the element-wise reduce op functor, specialized for Xe architecture. More...
|
| |
| struct | gelu_bwd_op_t |
| | Is the element-wise gelu backward op functor. More...
|
| |
| struct | gelu_bwd_op_t< dtype_in_, arch_tag, std::enable_if_t<(arch_tag==gpu_arch::Xe)> > |
| | Is the element-wise gelu backward op functor, specialized for Xe architecture. More...
|
| |
| struct | gelu_fwd_op_t |
| | Is the element-wise gelu inference forward op functor. More...
|
| |
| struct | gelu_fwd_w_op_t |
| | Is the element-wise gelu training forward op functor. More...
|
| |
| struct | gelu_fwd_w_op_t< dtype_out_, arch_tag, std::enable_if_t<(arch_tag==gpu_arch::Xe)> > |
| | Is the element-wise gelu training forward op functor, specialized for Xe architecture. More...
|
| |
| struct | get_load_block_size_auto |
| |
| struct | get_load_block_size_auto< dtype, tile_size_x, tile_size_y, gpu_arch::Xe, mem_layout::row_major, reg_layout::tiled > |
| |
| struct | get_store_block_size_auto |
| |
| struct | get_store_block_size_auto< dtype, tile_size_x, tile_size_y, gpu_arch::Xe, mem_layout::row_major, reg_layout::tiled > |
| |
| struct | global_atomic_oob_check_off_tag |
| |
| struct | global_atomic_oob_check_on_tag |
| |
| struct | is_floating_to_integer |
| |
| struct | is_same_layout |
| |
| struct | linear_op_t |
| | Is the linear_op functor. More...
|
| |
| struct | linear_op_t< dtype_in_, arch_tag, std::enable_if_t<(arch_tag==gpu_arch::Xe)> > |
| | Is the linear_op functor, specialized for Xe architecture. More...
|
| |
| struct | mem_payload_t |
| | Is to illustrate the memory information. More...
|
| |
| struct | mem_payload_t< mem_desc_t< dtype_, mem_layout::row_major, mem_space::global, alignment_ >, tile_desc_, msg_type::atomic_add, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> > |
| | Is to describe the global memory surface for atomic store. For atomic store, we need to prepare necessary information for each simd channel. More...
|
| |
| struct | mem_payload_t< mem_desc_t< dtype_, mem_layout::row_major, mem_space::global, alignment_ >, tile_desc_, msg_type::block_1d, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> > |
| | Is to describe the global memory surface for block-1d load/store. For a block-1d payload message, we need to set the base address and offset of the surface. More...
|
| |
| struct | mem_payload_t< mem_desc_t< dtype_, mem_layout::row_major, mem_space::local, alignment_ >, tile_desc_, msg_type::block_1d, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> > |
| | Is to describe the shared local memory surface for block-1d load/store. More...
|
| |
| struct | mem_payload_t< mem_desc_t< dtype_, mem_layout::row_major, mem_space::local, alignment_ >, tile_desc_, msg_type::scatter, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> > |
| | Is to describe the shared local memory surface for scatter load/store. More...
|
| |
| struct | mem_payload_t< mem_desc_t< dtype_, mem_layout::row_major, mem_space::local, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout::vnni_tiled_col_major >, msg_type::scatter, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> > |
| | Is to describe the shared local memory surface for scattering store. More...
|
| |
| struct | mem_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_, msg_type::block_2d, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> > |
| | Is to describe the global memory surface for block-2d load/store; for each block in one tile, a payload message is prepared here. More...
|
| |
| struct | mem_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_, msg_type::unaligned_2d, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> > |
| | Is to describe the global memory surface for unaligned-2d load/store; for each block in one tile, a payload message is prepared here. More...
|
| |
| struct | msg_type_query |
| |
| struct | none_op_t |
| | Is none op functor, for placeholder purpose. More...
|
| |
| struct | polynomial_op_t |
| |
| struct | prefetch_payload_t |
| | Is to illustrate the memory information to prefetch data to cache. More...
|
| |
| struct | prefetch_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_t< tile_size_x_, 1, block_size_x_, 1, reg_layout_ >, num_coop_sg_, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> > |
| | Is to describe the global memory surface to prefetch data to cache; data in global memory will be prefetched into a 1d tile. More...
|
| |
| struct | prefetch_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout_ >, num_coop_sg_, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> > |
| | Is to describe the global memory surface to prefetch data to cache; data in global memory will be prefetched into a 2d tile. More...
|
| |
| struct | prefetch_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::local, alignment_ >, tile_desc_, num_coop_sg_, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> > |
| | Is to describe the memory information to prefetch data to cache; for data located in shared local memory, nothing will be done. More...
|
| |
| struct | quant_op_t |
| | Is the quantization op functor. More...
|
| |
| struct | quant_op_t< tile_op_t_, arch_tag, std::enable_if_t<(arch_tag==gpu_arch::Xe)> > |
| | Is the quantization op functor, specialized for Xe architecture. More...
|
| |
| struct | relu_op_t |
| | Is the element-wise relu op functor. More...
|
| |
| struct | rng_dropout_op_t |
| | Is the random number generator and dropout op functor. More...
|
| |
| struct | rng_dropout_op_t< dtype_mask_, arch_tag, std::enable_if_t<(arch_tag==gpu_arch::Xe)> > |
| | Is the random number generator and dropout op functor, specialized for Xe architecture. More...
|
| |
| struct | scalar_mul_op_t |
| | Is the scalar_multiply op functor. More...
|
| |
| struct | scalar_mul_op_t< dtype_in_, arch_tag, std::enable_if_t<(arch_tag==gpu_arch::Xe)> > |
| | Is the scalar_multiply op functor, specialized for Xe architecture. More...
|
| |
| struct | scale_v_offset_v_op_t |
| | Is MatAcc * vector scale + vector offset. More...
|
| |
| struct | scale_v_offset_v_op_t< scale_dtype_, offset_dtype_, arch_tag, std::enable_if_t<(arch_tag==gpu_arch::Xe)> > |
| | Is the scale_v_offset_v op functor, specialized for Xe architecture. More...
|
| |
| struct | scale_v_op_t |
| | Is MatAcc * vector scale. More...
|
| |
| struct | scale_v_op_t< scale_dtype_, arch_tag, std::enable_if_t<(arch_tag==gpu_arch::Xe)> > |
| | Is the scale_v op functor, specialized for Xe architecture. More...
|
| |
| struct | sigmoid_op_t |
| | Is the element-wise sigmoid op functor. More...
|
| |
| struct | tanh_op_t |
| | Is the element-wise tanh op functor. More...
|
| |
| struct | tile_desc_t |
| | Is to illustrate the tile information about a sub matrix. More...
|
| |
| struct | tile_div |
| |
| struct | tile_minus |
| |
| struct | tile_mma_t |
| | Is the xetla tile mma operation definition API. More...
|
| |
| struct | tile_mma_t< matAcc_dst_t_, matAcc_src_t_, matB_t_, matA_t_, mma_engine::fpu, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> > |
| | Is the tile mma operation functor, specialized for Xe and fpu engine. More...
|
| |
| struct | tile_mma_t< matAcc_dst_t_, matAcc_src_t_, matB_t_, matA_t_, mma_engine::xmx, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> > |
| | Is the tile mma operation functor, specialized for Xe and matrix engine. More...
|
| |
| struct | tile_op_arg_helper_t |
| |
| struct | tile_t |
| | Is a struct contains some register file. More...
|
| |
|
| template<cache_hint L1 = cache_hint::cached, cache_hint L2 = cache_hint::cached, typename tile_t , typename payload_t > |
| __XETLA_API std::enable_if_t< detail::check_load_type< tile_t, payload_t >::is_global_2d_xe > | tile_load (tile_t &tile, payload_t &payload) |
| | This function loads data from 2D memory surface.
|
| |
| template<cache_hint L1 = cache_hint::cached, cache_hint L2 = cache_hint::cached, typename tile_t , typename payload_t > |
| __XETLA_API std::enable_if_t< detail::check_load_type< tile_t, payload_t >::is_global_block_1d_xe > | tile_load (tile_t &tile, payload_t &payload) |
| | This function loads data from memory.
|
| |
| template<cache_hint L1 = cache_hint::cached, cache_hint L3 = cache_hint::cached, typename tile_t , typename payload_t , typename oob_check_tag = global_atomic_oob_check_on_tag> |
| __XETLA_API std::enable_if_t< detail::check_load_type< tile_t, payload_t >::is_global_unaligned_2d_xe > | tile_load (tile_t &tile, payload_t &payload, oob_check_tag tag={}) |
| | This function loads data from unaligned-2D memory surface.
|
| |
| template<cache_hint L1 = cache_hint::cached, cache_hint L2 = cache_hint::cached, typename tile_t , typename payload_t > |
| __XETLA_API std::enable_if_t< detail::check_load_type< tile_t, payload_t >::is_local_scatter_xe > | tile_load (tile_t &tile, payload_t &payload) |
| | Is the data load func from local shared memory to register file, which supports the memory surface is 1d or 2d scenario.
|
| |
| template<cache_hint L1 = cache_hint::cached, cache_hint L2 = cache_hint::cached, typename tile_t , typename payload_t > |
| __XETLA_API std::enable_if_t< detail::check_load_type< tile_t, payload_t >::is_local_block_1d_xe > | tile_load (tile_t &tile, payload_t &payload) |
| | Is the data load func from shared local memory to register file, which supports the memory surface is 1d scenario.
|
| |
| template<typename T_dst , typename T_src > |
| __XETLA_API std::enable_if_t<(T_src::register_layout !=reg_layout::linear) &&(T_dst::register_layout !=reg_layout::linear) &&is_same_layout< T_dst, T_src >::value &&(!is_floating_to_integer< T_dst, T_src >::value)> | elemwise_cvt (T_dst &dst, T_src &src) |
| | Is the element wise data conversion, the src and dst tile should have the same layout.
|
| |
| template<typename T_dst , typename T_src > |
| __XETLA_API std::enable_if_t<(T_src::register_layout !=reg_layout::linear) &&(T_dst::register_layout !=reg_layout::linear) &&is_same_layout< T_dst, T_src >::value &&is_floating_to_integer< T_dst, T_src >::value > | elemwise_cvt (T_dst &dst, T_src &src) |
| | Is the element wise data conversion from floating point to integral, the src and dst tile should have the same layout.
|
| |
| template<typename T_dst , typename T_src > |
| __XETLA_API std::enable_if_t<(T_src::register_layout !=reg_layout::linear) &&(T_dst::register_layout !=reg_layout::linear) &&is_same_layout< T_dst, T_src >::value > | elemwise_cvt (T_dst &dst, T_src &src, float scale) |
| | element wise data conversion with scaling, the src and dst tile should have the same layout.
|
| |
| template<typename T > |
| __XETLA_API std::enable_if_t< T::register_layout==reg_layout::vnni_tiled > | vnni_convert (T &mat_Acc) |
| | Converts tiled layout to vnni_tiled layout format.
|
| |
| template<typename T > |
| __XETLA_API std::enable_if_t< T::register_layout==reg_layout::tiled > | vnni_reverse (T &mat_Acc) |
| | Converts vnni_tiled layout format to tiled layout.
|
| |
| template<typename T > |
| __XETLA_API std::enable_if_t< T::register_layout==reg_layout::transpose_tiled > | vnni_reverse (T &mat_Acc) |
| | Converts vnni_tiled layout format to transpose_tiled layout.
|
| |
| template<typename T_dst , typename T_src > |
| __XETLA_API std::enable_if_t< is_same_layout< T_dst, T_src >::value > | vnni_transform (T_dst &dst, T_src &src) |
| | Changes vnni layout.
|
| |
| template<typename T_dst , typename T_src > |
| __XETLA_API std::enable_if_t<(T_dst::register_layout==reg_layout::tiled) &&(T_src::register_layout==reg_layout::tiled) &&(T_src::tile_size_x==T_dst::tile_size_x) &&(T_src::tile_size_y==1)> | row_broadcast (T_dst &dst, T_src &src) |
| | Broadcasts 1d src tile to the entire 2d tile, as well as do the data conversion.
|
| |
| template<typename T_dst , typename T_src > |
| __XETLA_API std::enable_if_t<(T_dst::register_layout==reg_layout::linear) &&(T_src::register_layout==reg_layout::tiled) &&(T_src::tile_size_x==T_dst::tile_size_x) &&(T_src::tile_size_y==T_dst::tile_size_y) &&(T_dst::tile_size_x==T_dst::block_size_x) &&(T_dst::tile_size_y==T_dst::block_size_y) &&(std::is_same< typename T_dst::dtype, typename T_src::dtype >::value)> | layout_convert (T_dst &dst, T_src &src) |
| | convert 2d tile in a tiled register layout to a 2d tile in a linear register layout
|
| |
| template<typename T_dst , typename T_src > |
| __XETLA_API std::enable_if_t<(T_dst::register_layout==reg_layout::tiled) &&(T_src::register_layout==reg_layout::linear) &&(T_dst::tile_size_x==T_src::tile_size_x) &&(T_dst::tile_size_y==T_src::tile_size_y) &&(T_src::tile_size_x==T_src::block_size_x) &&(T_src::tile_size_y==T_src::block_size_y) &&(std::is_same< typename T_dst::dtype, typename T_src::dtype >::value)> | layout_convert (T_dst &dst, T_src &src) |
| | convert 2d tile in a linear register layout to a 2d tile in a tiled register layout
|
| |
| template<cache_hint L1 = cache_hint::cached, cache_hint L2 = cache_hint::cached, typename payload_t > |
| __XETLA_API std::enable_if_t< detail::check_prefetch_type< payload_t >::is_global_2d_xe > | tile_prefetch (payload_t &payload) |
| | Is the prefetch data func: data located in global memory is prefetched to the cache, which has higher bandwidth.
|
| |
| template<cache_hint L1 = cache_hint::cached, cache_hint L2 = cache_hint::cached, typename payload_t > |
| __XETLA_API std::enable_if_t< detail::check_prefetch_type< payload_t >::is_global_block_1d_xe > | tile_prefetch (payload_t &payload) |
| | Is the prefetch data func: data located in global memory is prefetched to the cache, which has higher bandwidth.
|
| |
| template<cache_hint L1 = cache_hint::cached, cache_hint L2 = cache_hint::cached, typename payload_t > |
| __XETLA_API std::enable_if_t< detail::check_prefetch_type< payload_t >::is_local_xe > | tile_prefetch (payload_t &payload) |
| | Is prefetch data func.
|
| |
| template<reduce_op reduce_kind, typename dtype_out , typename dtype_acc , int dim, typename mat_t > |
| __XETLA_API std::enable_if_t<(dim==1), xetla_vector< dtype_out, mat_t::tile_size_y > > | tile_reduce (mat_t &src) |
| |
| template<reduce_op reduce_kind, typename dtype_out , typename dtype_acc , int dim, typename mat_t > |
| __XETLA_API std::enable_if_t<(dim==0), xetla_vector< dtype_out, mat_t::tile_size_x > > | tile_reduce (mat_t &src) |
| |
| template<typename T_dst , typename T_src , bool accumulate = true, typename dtype_acc = float, uint32_t num_acc = 4> |
| | XETLA_MARKER ("This is only for reduce add, and will be deprecated in future. " "Please use tile_reduce instead.") __XETLA_API typename std |
| | Reduce 2d src tile to the 1d tile, and output to 1d dst.
|
| |
| template<cache_hint L1 = cache_hint::write_back, cache_hint L2 = cache_hint::write_back, typename tile_t , typename payload_t > |
| __XETLA_API std::enable_if_t< detail::check_store_type< tile_t, payload_t >::is_global_2d_xe > | tile_store (tile_t &tile, payload_t &payload) |
| | Is the func storing data from register file to global memory.
|
| |
| template<cache_hint L1 = cache_hint::write_back, cache_hint L2 = cache_hint::write_back, typename tile_t , typename payload_t > |
| __XETLA_API std::enable_if_t< detail::check_store_type< tile_t, payload_t >::is_global_block_1d_xe > | tile_store (tile_t &tile, payload_t &payload) |
| | Is the func storing data from register file to global memory.
|
| |
| template<cache_hint L1 = cache_hint::write_back, cache_hint L3 = cache_hint::write_back, typename tile_t , typename payload_t , typename oob_check_tag = global_atomic_oob_check_on_tag> |
| __XETLA_API std::enable_if_t< detail::check_store_type< tile_t, payload_t >::is_global_unaligned_2d_xe > | tile_store (tile_t &tile, payload_t &payload, oob_check_tag tag={}) |
| | Is the func storing data from register file to unaligned global memory surface.
|
| |
| template<cache_hint L1 = cache_hint::uncached, cache_hint L2 = cache_hint::write_back, typename tile_t , typename payload_t , typename oob_check_tag = global_atomic_oob_check_on_tag> |
| __XETLA_API std::enable_if_t< detail::check_store_type< tile_t, payload_t >::is_global_atomic_xe > | tile_store (tile_t &tile, payload_t &payload, oob_check_tag tag={}) |
| | Is the func storing data from register file to global memory with atomic add into the same buffer; supports only float32, float64, uint32_t, uint64_t and int types.
|
| |
| template<cache_hint L1 = cache_hint::write_back, cache_hint L2 = cache_hint::write_back, typename tile_t , typename payload_t > |
| __XETLA_API std::enable_if_t< detail::check_store_type< tile_t, payload_t >::is_local_scatter_xe > | tile_store (tile_t &tile, payload_t &payload) |
| | Is the func storing data from register file to shared local memory, which supports the memory surface 2d scenario.
|
| |
| template<cache_hint L1 = cache_hint::write_back, cache_hint L2 = cache_hint::write_back, typename tile_t , typename payload_t > |
| __XETLA_API std::enable_if_t< detail::check_store_type< tile_t, payload_t >::is_local_scatter_vnni_col_xe > | tile_store (tile_t &tile, payload_t &payload) |
| | Is the data store func from register file to local shared memory, where the data in register is vnni packed and col major.
|
| |
| template<cache_hint L1 = cache_hint::write_back, cache_hint L2 = cache_hint::write_back, typename tile_t , typename payload_t > |
| __XETLA_API std::enable_if_t< detail::check_store_type< tile_t, payload_t >::is_local_block_1d_xe &&tile_t::block_size_y !=1 > | tile_store (tile_t &tile, payload_t &payload) |
| | Is the data store func from register file to shared local memory, where supports memory surface 1d or 2d scenario, and we always assume dst memory layout is row major.
|
| |
| template<cache_hint L1 = cache_hint::write_back, cache_hint L2 = cache_hint::write_back, typename tile_t , typename payload_t > |
| __XETLA_API std::enable_if_t< detail::check_store_type< tile_t, payload_t >::is_local_block_1d_xe &&tile_t::tile_size_y==1 &&tile_t::block_size_y==1 > | tile_store (tile_t &tile, payload_t &payload) |
| | Is the func storing data from register file to shared local memory, the data in registers will be stored to SLM in 1d mode, and we always assume dst memory layout is row major.
|
| |
| template<typename op , typename matAcc_t > |
| void | tile_broadcast_op (matAcc_t &matAcc, xetla_vector< typename matAcc_t::dtype, matAcc_t::tile_size_y > data) |
| |