XeTLA v0.3.6
Intel® Xe Templates for Linear Algebra - API Definition Document
 
Loading...
Searching...
No Matches
common.hpp
Go to the documentation of this file.
/*******************************************************************************
 * Copyright (c) 2022-2023 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
16
19
20#pragma once
21
22#include "common/core/core.hpp"
23
24namespace gpu::xetla {
25namespace detail {
26
/// @brief Get the element size code.
/// Maps an element size in bytes to the 2-bit code used by the other
/// `*_code` helpers in this namespace (1->0, 2->1, 4->2, 8->3).
/// @tparam element_size element size in bytes; must be 1, 2, 4 or 8.
/// @return the encoded size.
template <uint32_t element_size>
constexpr uint32_t get_element_size_code() {
    static_assert(element_size == 1 || element_size == 2 || element_size == 4
                    || element_size == 8,
            "element_size not supported!");
    switch (element_size) {
        case 1: return 0;
        case 2: return 1;
        case 4: return 2;
        case 8: return 3;
    }
    // Unreachable: every size accepted by the static_assert is handled above.
    // Keeps constexpr evaluation well-formed and -Wreturn-type quiet.
    return 0;
}
45
/// @brief Kind of LSC (load/store cache) access being validated or encoded.
enum class lsc_action : uint8_t { prefetch, load, store, atomic };
47
48template <lsc_action Action, cache_hint L1H, cache_hint L2H, gpu_arch arch_tag>
49constexpr std::enable_if_t<arch_tag == gpu_arch::Xe, void>
51 if constexpr (Action == lsc_action::prefetch) {
52 // https://gfxspecs.intel.com/Predator/Home/Index/53560
53 static_assert(
55 && (L1H == cache_hint::uncached
56 || L1H == cache_hint::cached
57 || L1H == cache_hint::streaming)),
58 "cache hint type not supported!");
59 } else if constexpr (Action == lsc_action::load) {
60 // https://gfxspecs.intel.com/Predator/Home/Index/53560
61 static_assert((L1H == cache_hint::none && L2H == cache_hint::none)
62 || ((L2H == cache_hint::uncached)
63 && (L1H == cache_hint::uncached
64 || L1H == cache_hint::cached
65 || L1H == cache_hint::streaming))
66 || ((L2H == cache_hint::cached)
67 && (L1H == cache_hint::uncached
68 || L1H == cache_hint::cached
69 || L1H == cache_hint::streaming
71 "unsupported cache hint!");
72 } else if constexpr (Action == lsc_action::store) {
73 // https://gfxspecs.intel.com/Predator/Home/Index/53561
74 static_assert((L1H == cache_hint::none && L2H == cache_hint::none)
75 || ((L2H == cache_hint::uncached)
76 && (L1H == cache_hint::uncached
78 || L1H == cache_hint::streaming))
79 || ((L2H == cache_hint::write_back)
80 && (L1H == cache_hint::uncached
82 || L1H == cache_hint::streaming
83 || L1H == cache_hint::write_back)),
84 "unsupported cache hint!");
85 } else if constexpr (Action == lsc_action::atomic) {
86 // https://gfxspecs.intel.com/Predator/Home/Index/53561
87 static_assert((L1H == cache_hint::none && L2H == cache_hint::none)
88 || (L1H == cache_hint::uncached
89 && (L2H == cache_hint::uncached
90 || L2H == cache_hint::write_back)),
91 "unsupported cache hint!");
92 }
93}
94
95template <cache_hint L1H, cache_hint L2H, gpu_arch arch_tag>
96constexpr std::enable_if_t<arch_tag == gpu_arch::Xe, uint32_t>
98 check_lsc_cache_hint<lsc_action::load, L1H, L2H, arch_tag>();
99 if (L1H == cache_hint::none && L2H == cache_hint::none) {
100 return 0;
101 } else if (L2H == cache_hint::uncached) {
102 if (L1H == cache_hint::uncached) { return 1; }
103 if (L1H == cache_hint::cached) { return 3; }
104 if (L1H == cache_hint::streaming) { return 5; }
105 } else if (L2H == cache_hint::cached) {
106 if (L1H == cache_hint::uncached) { return 2; }
107 if (L1H == cache_hint::cached) { return 4; }
108 if (L1H == cache_hint::streaming) { return 6; }
109 if (L1H == cache_hint::read_invalidate) { return 7; }
110 }
111}
112
113template <cache_hint L1H, cache_hint L2H, gpu_arch arch_tag>
114constexpr std::enable_if_t<arch_tag == gpu_arch::Xe, uint32_t>
116 check_lsc_cache_hint<lsc_action::prefetch, L1H, L2H, arch_tag>();
117 if (L2H == cache_hint::uncached) {
118 if (L1H == cache_hint::uncached) { return 1; }
119 if (L1H == cache_hint::cached) { return 3; }
120 if (L1H == cache_hint::streaming) { return 5; }
121 } else if (L2H == cache_hint::cached) {
122 if (L1H == cache_hint::uncached) { return 2; }
123 if (L1H == cache_hint::cached) { return 4; }
124 if (L1H == cache_hint::streaming) { return 6; }
125 }
126}
127
128template <cache_hint L1H, cache_hint L2H, gpu_arch arch_tag>
129constexpr std::enable_if_t<arch_tag == gpu_arch::Xe, uint32_t>
131 check_lsc_cache_hint<lsc_action::store, L1H, L2H, arch_tag>();
132 if (L1H == cache_hint::none && L2H == cache_hint::none) {
133 return 0;
134 } else if (L2H == cache_hint::uncached) {
135 if (L1H == cache_hint::uncached) { return 1; }
136 if (L1H == cache_hint::write_through) { return 3; }
137 if (L1H == cache_hint::streaming) { return 5; }
138 } else if (L2H == cache_hint::write_back) {
139 if (L1H == cache_hint::uncached) { return 2; }
140 if (L1H == cache_hint::write_through) { return 4; }
141 if (L1H == cache_hint::streaming) { return 6; }
142 if (L1H == cache_hint::write_back) { return 7; }
143 }
144}
145
146template <cache_hint L1H, cache_hint L2H, gpu_arch arch_tag>
147constexpr std::enable_if_t<arch_tag == gpu_arch::Xe, uint32_t>
149 check_lsc_cache_hint<lsc_action::atomic, L1H, L2H, arch_tag>();
150 if (L1H == cache_hint::none && L2H == cache_hint::none) {
151 return 0;
152 } else if (L2H == cache_hint::uncached) {
153 if (L1H == cache_hint::uncached) { return 1; }
154 if (L1H == cache_hint::write_through) { return 3; }
155 if (L1H == cache_hint::streaming) { return 5; }
156 } else if (L2H == cache_hint::write_back) {
157 if (L1H == cache_hint::uncached) { return 2; }
158 if (L1H == cache_hint::write_through) { return 4; }
159 if (L1H == cache_hint::streaming) { return 6; }
160 if (L1H == cache_hint::write_back) { return 7; }
161 }
162}
163
/// @brief Maps a channel count to its execution-size code
/// (1->0, 2->1, 4->2, 8->3, 16->4, 32->5).
/// @tparam num_channel number of channels; must be a power of two in [1, 32].
/// @return the encoded execution size.
template <uint32_t num_channel>
constexpr uint32_t get_execSize_code() {
    static_assert(num_channel == 1 || num_channel == 2 || num_channel == 4
                    || num_channel == 8 || num_channel == 16
                    || num_channel == 32,
            "num_channel not supported!");
    switch (num_channel) {
        case 1: return 0;
        case 2: return 1;
        case 4: return 2;
        case 8: return 3;
        case 16: return 4;
        case 32: return 5;
    }
    // Unreachable: every count accepted by the static_assert is handled above.
    return 0;
}
179
180template <atomic_op Op>
181constexpr uint32_t get_atomic_opcode() {
182 static_assert(Op == atomic_op::fadd || Op == atomic_op::fmax
183 || Op == atomic_op::iadd,
184 "Other atomic op didn't added");
185 switch (Op) {
186 case atomic_op::fadd: return 19;
187 case atomic_op::fmax: return 22;
188 case atomic_op::iadd: return 12;
189 }
190}
191
192} // namespace detail
193
/// @brief Tile layout of data held in registers.
/// linear: linear layout within one tile; tiled: 2D blocks stacked in raster
/// order; vnni_tiled: VNNI-packed blocks.
enum class reg_layout : uint8_t {
    linear = 0,
    tiled = 1,
    vnni_tiled = 2,
    transpose_tiled = 3,
    // VNNI tiled format, but each block is stored in col-major order.
    // NOTE(review): this enumerator was dropped by the document extraction
    // and restored from the file's own Doxygen index — confirm upstream.
    vnni_tiled_col_major = 4
};
/// @brief Kind of store operation performed when writing a tile out.
enum class store_op : uint8_t {
    normal = 0,
    atomic_fadd = 1,
    atomic_iadd = 2,
    // NOTE(review): the enumerator with value 3 was dropped by the document
    // extraction; reconstructed as scatter_transpose per the upstream XeTLA
    // header — confirm before relying on it.
    scatter_transpose = 3,
    block_1d = 4
};
/// @brief Compute engine used for the MMA: systolic XMX array or FPU.
enum class mma_engine : uint8_t { xmx = 0, fpu = 1 };
// enum class trans_mode : uint8_t { none = 0, transpose = 1 };
/// @brief Direction of a memory access.
enum class memory_op : uint8_t { load = 0, store = 1 };
/// @brief Axis along which a tensor descriptor is updated.
enum class tdesc_update_dir : uint8_t { x_dir = 0, y_dir = 1 };
/// @brief Elementwise post-processing kind applied after the main op.
enum class post_kind : uint8_t {
    none = 0,
    relu = 1,
    gelu = 2,
    gelu_bwd_w = 3,
    sigmoid = 4,
    tanh = 5
};
/// @brief Elementwise pre-processing kind applied before the main op.
enum class pre_kind : uint8_t { none = 0, bias_add = 1, res_add = 2 };
/// @brief How per-access offsets advance between uses.
enum class offset_mode : uint8_t {
    const_offset = 0,
    cyclic_offset = 1,
    // NOTE(review): the enumerator with value 2 was dropped by the document
    // extraction; reconstructed as acyclic_offset per the upstream XeTLA
    // header — confirm before relying on it.
    acyclic_offset = 2,
};
243
246template <typename kernel_t>
248 xetla_nbarrier_init<kernel_t::get_barrier_count()>();
249 xetla_local_init<kernel_t::get_slm_size()>();
250}
251
255template <uint32_t slm_size, uint32_t nbarrier_count>
257 xetla_nbarrier_init<nbarrier_count>();
258 xetla_local_init<slm_size>();
259}
260
261} // namespace gpu::xetla
C++ API.
constexpr uint32_t get_execSize_code()
Definition common.hpp:165
constexpr std::enable_if_t< arch_tag==gpu_arch::Xe, uint32_t > get_store_cache_hint_code()
Definition common.hpp:130
constexpr std::enable_if_t< arch_tag==gpu_arch::Xe, uint32_t > get_atomic_cache_hint_code()
Definition common.hpp:148
constexpr std::enable_if_t< arch_tag==gpu_arch::Xe, uint32_t > get_load_cache_hint_code()
Definition common.hpp:97
constexpr std::enable_if_t< arch_tag==gpu_arch::Xe, uint32_t > get_prefetch_cache_hint_code()
Definition common.hpp:115
constexpr std::enable_if_t< arch_tag==gpu_arch::Xe, void > check_lsc_cache_hint()
Definition common.hpp:50
constexpr uint32_t get_atomic_opcode()
Definition common.hpp:181
lsc_action
Definition common.hpp:46
constexpr uint32_t get_element_size_code()
Get the element size code object.
Definition common.hpp:34
Definition arch_config.hpp:24
post_kind
Definition common.hpp:229
reg_layout
tile layout in register linear: linear layout with one tile tiled: 2d block stacked in raster order v...
Definition common.hpp:209
@ vnni_tiled_col_major
this is vnni tiled format, but for each block, they are stored in col major order
mma_engine
Definition common.hpp:225
pre_kind
Definition common.hpp:237
@ iadd
Atomic signed int add of src1 from memory data and return the old value. see
@ fmax
Atomic store the float max of src1 and memory data and return the old value. see
@ fadd
Atomic float add of src1 from memory data and return the old value. see
@ store
Atomic store untyped data to memory. see
@ load
Atomic read of the memory data value, without modifying the data. see
memory_op
Definition common.hpp:227
tdesc_update_dir
Definition common.hpp:228
offset_mode
Definition common.hpp:238
void slm_barrier_init()
Initial the local memory size and named barrier count with kernel_t.
Definition common.hpp:247
store_op
Definition common.hpp:218