22#include "group/gemm/common.hpp"
33template <
typename compute_attr_,
typename perf_tuning_knob_,
38template <
typename compute_attr_,
typename perf_tuning_knob_>
43 static constexpr int k_stride = perf_tuning_knob::k_stride;
44 static constexpr int stages = perf_tuning_knob::stages;
45 static constexpr int sync_freq = perf_tuning_knob::sync_freq;
51 static constexpr uint32_t block_bytes_x_a = 32;
52 static constexpr uint32_t block_size_x_a
54 static constexpr uint32_t block_size_y_a = 16;
56 static constexpr uint32_t block_size_x_b = 16;
57 static constexpr uint32_t block_bytes_y_b = 32;
58 static constexpr uint32_t block_size_y_b
60 static_assert(block_size_x_a == block_size_y_b,
61 "mat_a x need to match with mat_b y");
68template <
typename compute_attr_,
typename perf_tuning_knob_,
73template <
typename compute_attr_,
typename perf_tuning_knob_>
78 static constexpr int k_stride = perf_tuning_knob::k_stride;
79 static constexpr int stages = perf_tuning_knob::stages;
80 static constexpr int sync_freq = perf_tuning_knob::sync_freq;
86 static constexpr uint32_t block_bytes_x_a = 32;
87 static constexpr uint32_t block_size_x_a
89 static constexpr uint32_t block_size_y_a = 16;
91 static constexpr uint32_t block_size_x_b = 16;
92 static constexpr uint32_t block_bytes_y_b = 32;
93 static constexpr uint32_t block_size_y_b
95 static_assert(block_size_x_a == block_size_y_b,
96 "mat_a x need to match with mat_b y");
103template <
typename compute_attr_,
typename perf_tuning_knob_,
108template <
typename compute_attr_,
typename perf_tuning_knob_>
113 static constexpr int k_stride = perf_tuning_knob::k_stride;
114 static constexpr int stages = perf_tuning_knob::stages;
115 static constexpr int sync_freq = perf_tuning_knob::sync_freq;
121 static constexpr uint32_t block_bytes_x_a = 32;
122 static constexpr uint32_t block_size_x_a
124 static constexpr uint32_t block_size_y_a = 16;
125 static constexpr uint32_t block_bytes_x_b = 64;
126 static constexpr uint32_t block_size_x_b
128 static constexpr uint32_t block_size_y_b = block_size_x_a;
Definition limitation.hpp:607
gpu_arch
Definition common.hpp:73
typename compute_attr::dtype_acc dtype_mma_acc
Definition compute_policy.hpp:117
perf_tuning_knob_ perf_tuning_knob
Definition compute_policy.hpp:112
typename compute_attr::dtype_b dtype_mma_b
Definition compute_policy.hpp:119
compute_attr_ compute_attr
Definition compute_policy.hpp:111
typename compute_attr::dtype_a dtype_mma_a
Definition compute_policy.hpp:118
Compute policy for fpu engine.
Definition compute_policy.hpp:105
compute_attr_ compute_attr
Definition compute_policy.hpp:41
typename compute_attr::dtype_b dtype_mma_b
Definition compute_policy.hpp:49
perf_tuning_knob_ perf_tuning_knob
Definition compute_policy.hpp:42
typename compute_attr::dtype_acc dtype_mma_acc
Definition compute_policy.hpp:47
typename compute_attr::dtype_a dtype_mma_a
Definition compute_policy.hpp:48
Compute policy for xmx engine.
Definition compute_policy.hpp:35
typename compute_attr::dtype_a dtype_mma_a
Definition compute_policy.hpp:83
typename compute_attr::dtype_acc dtype_mma_acc
Definition compute_policy.hpp:82
typename compute_attr::dtype_b dtype_mma_b
Definition compute_policy.hpp:84
compute_attr_ compute_attr
Definition compute_policy.hpp:76
perf_tuning_knob_ perf_tuning_knob
Definition compute_policy.hpp:77
Compute policy for unaligned shape and xmx engine.
Definition compute_policy.hpp:70