|
| struct | compute_attr_t |
| | Compute attribute for gemm. More...
|
| |
| struct | compute_policy_default_fpu |
| | Compute policy for fpu engine. More...
|
| |
| struct | compute_policy_default_fpu< compute_attr_, perf_tuning_knob_, gpu_arch::Xe > |
| | Specialized for Xe architecture. More...
|
| |
| struct | compute_policy_default_xmx |
| | Compute policy for xmx engine. More...
|
| |
| struct | compute_policy_default_xmx< compute_attr_, perf_tuning_knob_, gpu_arch::Xe > |
| | Specialized for Xe architecture. More...
|
| |
| struct | compute_policy_int4_dequantize_xmx |
| |
| struct | compute_policy_int4_dequantize_xmx< compute_attr_, perf_tuning_knob_, dtype_scale_, dtype_zero_pt_, dequant_s_, gpu_arch::Xe > |
| |
| struct | compute_policy_unaligned_xmx |
| | Compute policy for unaligned shape and xmx engine. More...
|
| |
| struct | compute_policy_unaligned_xmx< compute_attr_, perf_tuning_knob_, gpu_arch::Xe > |
| | Specialized for Xe architecture. More...
|
| |
| class | cooperative_reduce_t |
| | Workgroups to do the cooperative reduction. More...
|
| |
| class | cooperative_reduce_t< reduce_kind, tile_shape_, matAcc_t, 1, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> > |
| | Workgroups to do the cooperative reduction. Specialized for num_cooperative_wg=1 and Xe architecture. More...
|
| |
| class | cooperative_reduce_t< reduce_kind, tile_shape_, matAcc_t, num_cooperative_wg, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> > |
| | Workgroups to do the cooperative reduction. Specialized for Xe architecture. More...
|
| |
| struct | default_epilogue_selector_config_t |
| |
| struct | default_epilogue_selector_t |
| |
| struct | default_gemm_selector_config_t |
| |
| struct | default_gemm_selector_t |
| |
| struct | epilogue_policy_default |
| | Default epilogue policy for store C. More...
|
| |
| struct | epilogue_policy_quant_op |
| | Epilogue functor, specialized for quantization operator. More...
|
| |
| struct | epilogue_policy_tile_op |
| | Epilogue policy for tile_op + store C fusion. More...
|
| |
| struct | epilogue_policy_unaligned |
| | Epilogue policy for store unaligned C. More...
|
| |
| struct | epilogue_stream_k_t |
| | Is the epilogue functor specialized for stream_k. More...
|
| |
| class | epilogue_t |
| | Is the epilogue functor. More...
|
| |
| class | epilogue_t< epilogue_policy_default< arch_tag_ >, tile_shape_, mem_desc_c_t_, std::enable_if_t<((arch_tag_==gpu_arch::Xe))> > |
| | Is the epilogue functor specialized for epilogue_policy_default and Xe architecture. More...
|
| |
| class | epilogue_t< epilogue_policy_quant_op< dequant_op_t_, tile_op_t_, quant_op_t_, arch_tag_, dtype_dequant_ >, tile_shape_, mem_desc_c_t_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> > |
| | Is the epilogue functor specialized for epilogue_policy_quant_op and Xe architecture. More...
|
| |
| class | epilogue_t< epilogue_policy_tile_op< tile_op_t_, arch_tag_ >, tile_shape_, mem_desc_c_t_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> > |
| | Is the epilogue functor specialized for epilogue_policy_tile_op and Xe architecture. More...
|
| |
| class | epilogue_t< epilogue_policy_unaligned< arch_tag_ >, tile_shape_, mem_desc_c_t_, std::enable_if_t<((arch_tag_==gpu_arch::Xe))> > |
| | Is the epilogue functor specialized for epilogue_policy_unaligned and Xe architecture. More...
|
| |
| struct | gemm |
| |
| struct | gemm< gpu_arch::Xe > |
| |
| class | gemm_selector_t |
| | Gemm selection functor. More...
|
| |
| class | gemm_selector_t< dtype_a, dtype_b, mem_layout_a, mem_layout_b, mem_space_a, mem_space_b, alignment_a, alignment_b, dtype_acc, tile_shape, k_stride, mma_engine::fpu, arch_tag, stages, sync_freq, std::enable_if_t< detail::check_2d_block_pitch_alignment< dtype_a, dtype_b, alignment_a, alignment_b, arch_tag >::value > > |
| | Selects 2d block and fpu based gemm. More...
|
| |
| class | gemm_selector_t< dtype_a, dtype_b, mem_layout_a, mem_layout_b, mem_space_a, mem_space_b, alignment_a, alignment_b, dtype_acc, tile_shape, k_stride, mma_engine::xmx, arch_tag, stages, sync_freq, std::enable_if_t< detail::check_2d_block_pitch_alignment< dtype_a, dtype_b, alignment_a, alignment_b, arch_tag >::value > > |
| | Selects 2d block and xmx based gemm. More...
|
| |
| class | gemm_selector_t< dtype_a, dtype_b, mem_layout_a, mem_layout_b, mem_space_a, mem_space_b, alignment_a, alignment_b, dtype_acc, tile_shape, k_stride, mma_engine::xmx, arch_tag, stages, sync_freq, std::enable_if_t<!detail::check_2d_block_pitch_alignment< dtype_a, dtype_b, alignment_a, alignment_b, arch_tag >::value > > |
| | Selects scatter and xmx based gemm. More...
|
| |
| class | gemm_t |
| | Gemm functor. More...
|
| |
| class | gemm_t< compute_policy_default_fpu< compute_attr_, perf_tuning_knob_, arch_tag_ >, tile_shape_, mem_desc_a_t_, mem_desc_b_t_, pre_processing_t_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> > |
| | Is the gemm functor for Xe architecture and vector engine. More...
|
| |
| class | gemm_t< compute_policy_default_xmx< compute_attr_, perf_tuning_knob_, arch_tag_ >, tile_shape_, mem_desc_a_t_, mem_desc_b_t_, pre_processing_t_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> > |
| | Is the gemm functor for Xe architecture and matrix engine. More...
|
| |
| class | gemm_t< compute_policy_int4_dequantize_xmx< compute_attr_, perf_tuning_knob_, dtype_scale_, dtype_zero_pt_, dequant_s_, gpu_arch::Xe >, tile_shape_, mem_desc_a_t_, mem_desc_b_t_, pre_processing_t_ > |
| | Is the gemm functor for int4 dequantization, Xe architecture and matrix engine. More...
|
| |
| class | gemm_t< compute_policy_unaligned_xmx< compute_attr_, perf_tuning_knob_, arch_tag_ >, tile_shape_, mem_desc_a_t_, mem_desc_b_t_, pre_processing_t_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> > |
| | Is the gemm functor for unaligned input, Xe architecture and matrix engine. More...
|
| |
| class | global_reduce_t |
| | Cross group global reduction. More...
|
| |
| class | global_reduce_t< reduce_op::sum, tile_shape_acc_, tile_shape_cnt_, mem_desc_acc_t_, mem_desc_cnt_t_, 1, counter_size_, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> > |
| | Cross group global reduction. Specialized for num_group_reduction=1 and Xe architecture. More...
|
| |
| class | global_reduce_t< reduce_op::sum, tile_shape_acc_, tile_shape_cnt_, mem_desc_acc_t_, mem_desc_cnt_t_, num_group_reduction, counter_size, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> > |
| | Cross group global reduction. Specialized for reduce_op::sum and Xe architecture. More...
|
| |
| struct | group_reduce_t |
| | This is the group reduction. More...
|
| |
| struct | group_reduce_t< T, SZ, N, Op, 1, is_all_reduce, gpu_arch::Xe > |
| |
| struct | group_reduce_t< T, SZ, N, Op, N_SG, is_all_reduce, gpu_arch::Xe > |
| |
| struct | group_row_reduce_store_t |
| | This is the group row reduction (reduce_sum) + cooperative write out. More...
|
| |
| struct | group_row_reduce_store_t< dtype_acc, dtype_out, row_size, wg_size_x, 1, max_simd_len, gpu_arch::Xe > |
| |
| struct | group_row_reduce_store_t< dtype_acc, dtype_out, row_size, wg_size_x, wg_size_y, max_simd_len, gpu_arch::Xe > |
| |
| struct | ln_bwd_fused_op_arguments_t |
| |
| struct | ln_bwd_fused_op_t |
| |
| struct | ln_bwd_fused_op_t< ln_bwd_fused_kind::bias_dropout_resAdd_ln, dtype_in_, dtype_out_, dtype_acc_, layer_norm_attr_, gpu_arch::Xe > |
| |
| struct | ln_bwd_fused_op_t< ln_bwd_fused_kind::ln_dropout, dtype_in_, dtype_out_, dtype_acc_, layer_norm_attr_, gpu_arch::Xe > |
| |
| struct | ln_bwd_fused_op_t< ln_bwd_fused_kind::ln_dropout_gradAdd, dtype_in_, dtype_out_, dtype_acc_, layer_norm_attr_, gpu_arch::Xe > |
| |
| struct | ln_bwd_fused_op_t< ln_fused_op_kind_, dtype_in_, dtype_out_, dtype_acc_, layer_norm_attr_, gpu_arch::Xe > |
| |
| struct | ln_fwd_fused_op_arguments_t |
| |
| struct | ln_fwd_fused_op_t |
| |
| struct | ln_fwd_fused_op_t< ln_fused_op_kind_, dtype_in_, dtype_out_, dtype_acc_, layer_norm_attr_, gpu_arch::Xe > |
| |
| struct | ln_fwd_fused_op_t< ln_fwd_fused_kind::bias_dropout_resAdd_ln, dtype_in_, dtype_out_, dtype_acc_, layer_norm_attr_, gpu_arch::Xe > |
| |
| struct | ln_fwd_fused_op_t< ln_fwd_fused_kind::bias_rng_dropout_resAdd_ln, dtype_in_, dtype_out_, dtype_acc_, layer_norm_attr_, gpu_arch::Xe > |
| |
| struct | ln_fwd_fused_op_t< ln_fwd_fused_kind::ln_dropout, dtype_in_, dtype_out_, dtype_acc_, layer_norm_attr_, gpu_arch::Xe > |
| |
| struct | ln_fwd_fused_op_t< ln_fwd_fused_kind::ln_rng_dropout, dtype_in_, dtype_out_, dtype_acc_, layer_norm_attr_, gpu_arch::Xe > |
| |
| struct | mask_gen_t |
| |
| struct | perf_tuning_knob_t |
| | Fine-tune knobs for gemm. More...
|
| |
| struct | pre_processing_default_t |
| | Gemm default pre_processing functor. More...
|
| |
| class | pre_processing_default_t< tile_shape_, arch_tag, std::enable_if_t<(arch_tag==gpu_arch::Xe)> > |
| | Gemm default pre_processing functor. Specialized for Xe architecture. More...
|
| |
| struct | pre_processing_matA_neg_filter_t |
| | Gemm pre_processing functor that applies the relu op to matA. More...
|
| |
| class | pre_processing_matA_neg_filter_t< tile_shape_, arch_tag, std::enable_if_t<(arch_tag==gpu_arch::Xe)> > |
| | Gemm pre_processing functor that applies the relu op to matA. Specialized for Xe architecture. More...
|
| |
| struct | row_reduction_fused_op_t |
| | Additional Ops that can be fused with row reduction processing flow. More...
|
| |
| struct | row_reduction_fused_op_t< fused_op_kind_, dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe > |
| |
| struct | row_reduction_fused_op_t< reduction_fused_kind::bias_dropout_bwd, dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe > |
| |
| struct | row_reduction_fused_op_t< reduction_fused_kind::bias_gelu_w_bwd, dtype_in_, dtype_out_, dtype_acc_, reduction_attr_, gpu_arch::Xe > |
| |
| struct | softmax_policy_bwd |
| |
| struct | softmax_policy_fwd |
| |
| class | softmax_t |
| |
| class | softmax_t< softmax_policy_bwd< dtype_in_, dtype_acc_, gpu_arch::Xe >, tile_shape_ > |
| |
| class | softmax_t< softmax_policy_fwd< dtype_acc_, gpu_arch::Xe >, tile_shape_ > |
| |
| struct | tile_shape_t |
| | Workgroup level tile shape description. More...
|
| |
| struct | xetla_row_reduction_fused_op_arguments_t |
| |