XeTLA v0.3.6
Intel® Xe Templates for Linear Algebra - API Definition Document
 
Loading...
Searching...
No Matches
raw_send_load_store.hpp
Go to the documentation of this file.
1/*******************************************************************************
2* Copyright (c) 2022-2023 Intel Corporation
3*
4* Licensed under the Apache License, Version 2.0 (the "License");
5* you may not use this file except in compliance with the License.
6* You may obtain a copy of the License at
7*
8* http://www.apache.org/licenses/LICENSE-2.0
9*
10* Unless required by applicable law or agreed to in writing, software
11* distributed under the License is distributed on an "AS IS" BASIS,
12* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13* See the License for the specific language governing permissions and
14* limitations under the License.
15*******************************************************************************/
16
19
20#pragma once
21
25
26namespace gpu::xetla {
27
30
32
35
// xetla_fill_tdesc (pointer overload) -- per the cross-reference footer:
// "Tensor descriptor construction (global memory version)". Fills an existing
// descriptor `tdesc` in place from raw pointer `p`.
// NOTE(review): this Doxygen listing dropped the hyperlinked declaration line
// (original line 52); the name/signature above comes from the footer index.
50template <typename Ty, uint32_t block_width = 1, uint32_t block_height = 1,
51 uint8_t array_len = 1>
53 int tensor_width, int tensor_height, int tensor_pitch, int offset_x,
54 int offset_y) {
// Base address is the raw pointer value widened to 64 bits.
55 detail::xetla_set_tensor_base_address(tdesc, (uint64_t)p);
// Width and pitch are converted from elements to bytes, and width/height/
// pitch are stored minus one. NOTE(review): the base_address overload below
// stores the same fields WITHOUT the -1 -- confirm the asymmetry is intended.
56 detail::xetla_set_tensor_width_x(tdesc, tensor_width * sizeof(Ty) - 1);
57 detail::xetla_set_tensor_width_y(tdesc, tensor_height - 1);
58 detail::xetla_set_tensor_pitch_x(tdesc, tensor_pitch * sizeof(Ty) - 1);
59 detail::xetla_set_tensor_offset_x(tdesc, offset_x);
60 detail::xetla_set_tensor_offset_y(tdesc, offset_y);
// Pack the block shape into one dword:
//   bits [7:0]  = block_width - 1
//   bits [15:8] = block_height - 1
//   bits [23:16] = array_len - 1
61 uint32_t block_widthx_widthy_arrlen = (block_width - 1)
62 | ((block_height - 1) << 8) | ((array_len - 1) << 16);
// (Original line 63 -- the call to xetla_set_block_widthx_widthy_arrlen,
// see footer -- was a hyperlink dropped by the listing.)
64 tdesc, block_widthx_widthy_arrlen);
65}
66
// xetla_fill_tdesc overload taking a 32-bit base_address instead of a raw
// pointer. (The declaration line, original 79, was a hyperlink dropped by
// the Doxygen listing.)
78template <typename Ty>
80 uint32_t base_address, int tensor_width, int tensor_height,
81 int tensor_pitch, int offset_x, int offset_y) {
82 detail::xetla_set_tensor_base_address(tdesc, base_address);
// NOTE(review): unlike the pointer overload, width/height/pitch here are
// stored WITHOUT the -1 adjustment, and no block width/height/array-length
// field is written -- verify this difference is intentional.
83 detail::xetla_set_tensor_width_x(tdesc, tensor_width * sizeof(Ty));
84 detail::xetla_set_tensor_width_y(tdesc, tensor_height);
85 detail::xetla_set_tensor_pitch_x(tdesc, tensor_pitch * sizeof(Ty));
86 detail::xetla_set_tensor_offset_x(tdesc, offset_x);
87 detail::xetla_set_tensor_offset_y(tdesc, offset_y);
88}
89
// xetla_get_tdesc (pointer overload) -- per the footer: "Generate a new
// tensor descriptor (global memory version)". Same field encoding as the
// pointer form of xetla_fill_tdesc (byte widths, -1 adjustment, packed block
// shape), but constructs and RETURNS a fresh xetla_tdescriptor instead of
// filling a caller-supplied one. (Declaration line, original 106, dropped.)
104template <typename Ty, uint32_t block_width = 1, uint32_t block_height = 1,
105 uint8_t array_len = 1>
107 int tensor_height, int tensor_pitch, int offset_x, int offset_y) {
108 xetla_tdescriptor tdesc;
// View the descriptor as uint32_t lanes so the detail setters can write it.
109 auto tdesc_ref = tdesc.xetla_format<uint32_t>();
110 detail::xetla_set_tensor_base_address(tdesc_ref, (uint64_t)p);
// Element counts converted to bytes; stored minus one, as in xetla_fill_tdesc.
111 detail::xetla_set_tensor_width_x(tdesc_ref, tensor_width * sizeof(Ty) - 1);
112 detail::xetla_set_tensor_width_y(tdesc_ref, tensor_height - 1);
113 detail::xetla_set_tensor_pitch_x(tdesc_ref, tensor_pitch * sizeof(Ty) - 1);
114 detail::xetla_set_tensor_offset_x(tdesc_ref, offset_x);
115 detail::xetla_set_tensor_offset_y(tdesc_ref, offset_y);
// Packed block shape: width-1 in [7:0], height-1 in [15:8], arr_len-1 in [23:16].
116 uint32_t block_widthx_widthy_arrlen = (block_width - 1)
117 | ((block_height - 1) << 8) | ((array_len - 1) << 16);
// (Original line 118, the xetla_set_block_widthx_widthy_arrlen call, dropped.)
119 tdesc_ref, block_widthx_widthy_arrlen);
120 return tdesc;
121}
122
// xetla_get_tdesc overload taking a 32-bit base_address. Builds and returns
// a new descriptor. (Declaration line, original 135, dropped by the listing.)
134template <typename Ty>
136 int tensor_width, int tensor_height, int tensor_pitch, int offset_x,
137 int offset_y) {
138 xetla_tdescriptor tdesc;
139 auto tdesc_ref = tdesc.xetla_format<uint32_t>();
140 detail::xetla_set_tensor_base_address(tdesc_ref, base_address);
// NOTE(review): as with the base_address xetla_fill_tdesc overload, no -1
// adjustment and no block-shape field here -- confirm intended.
141 detail::xetla_set_tensor_width_x(tdesc_ref, tensor_width * sizeof(Ty));
142 detail::xetla_set_tensor_width_y(tdesc_ref, tensor_height);
143 detail::xetla_set_tensor_pitch_x(tdesc_ref, tensor_pitch * sizeof(Ty));
144 detail::xetla_set_tensor_offset_x(tdesc_ref, offset_x);
145 detail::xetla_set_tensor_offset_y(tdesc_ref, offset_y);
146 return tdesc;
147}
148
// xetla_update_tdesc_offsetx (footer: "Update the x coordinate in the given
// tensor descriptor"): read-modify-write that adds the signed delta
// `doffset_x` to the descriptor's current x offset. (The declaration line,
// original 152, and the xetla_set_tensor_offset_x call line, original 154,
// were hyperlinks dropped by the listing.)
153 xetla_tdescriptor_ref tdesc, int32_t doffset_x) {
155 tdesc, detail::xetla_get_tensor_offset_x(tdesc) + doffset_x);
156}
157
// xetla_update_tdesc_offsety (footer: "Update the y coordinate in the given
// tensor descriptor"): adds the signed delta `doffset_y` to the current y
// offset. (Declaration and setter-call lines, originals 161 and 163, dropped.)
162 xetla_tdescriptor_ref tdesc, int32_t doffset_y) {
164 tdesc, detail::xetla_get_tensor_offset_y(tdesc) + doffset_y);
165}
166
// xetla_tload_global (footer: "Tensor load API"): loads a 2D tile described
// by `tdesc` via a raw send, returning xetla_vector<Ty, N>. Xe-only
// (enable_if on arch_tag). (The declaration line, original 183, with the
// xetla_tdescriptor parameter, was dropped by the listing.)
179template <typename Ty, uint32_t N, cache_hint L1H = cache_hint::none,
180 cache_hint L2H = cache_hint::none, bool transpose = false,
181 bool transform = false, gpu_arch arch_tag = gpu_arch::Xe>
182__XETLA_API std::enable_if_t<arch_tag == gpu_arch::Xe, xetla_vector<Ty, N>>
// Debug-only validation of the load shape against block_2d limits.
184 DEBUG_INVOKE(dbg_level::core,
185 core::block_2d<arch_tag, Ty>::template check_load<transpose,
186 transform>(tdesc));
187
// Destination length: ceil(N * sizeof(Ty) / 64) 64-byte chunks, clamped to 31.
188 constexpr uint32_t numDst = 31 < ((N * sizeof(Ty) + 63) / 64)
189 ? 31
190 : ((N * sizeof(Ty) + 63) / 64);
// Assemble the raw-send message descriptor (base value 3; the store path
// below uses 7):
//   bit  7      : transform flag
//   bits 9+     : element-size code for sizeof(Ty)
//   bit  15     : transpose flag
//   bits 17+    : L1/L2 load cache-hint code
//   bit  25     : set unconditionally (meaning not visible here)
//   bits 20+    : numDst
191 uint32_t msg_desc = 3;
192 msg_desc |= (transform ? 1 : 0) << 7;
193 msg_desc |= detail::get_element_size_code<sizeof(Ty)>() << 9;
194 msg_desc |= (transpose ? 1 : 0) << 15;
195 msg_desc |= detail::get_load_cache_hint_code<L1H, L2H, arch_tag>() << 17;
196 msg_desc |= 1 << 25;
197 msg_desc |= numDst << 20;
198
// Fixed send parameters: one source payload (the 16-dword descriptor),
// execSize code 0, shared-function id 0xF, no extended descriptor bits.
199 constexpr uint32_t numSrc0 = 1;
200 constexpr uint32_t execSize = 0;
201 constexpr uint32_t sfid = 0xF;
202 constexpr uint32_t exDesc = 0;
203
// Pad the return vector to at least 32 bytes (32 / sizeof(Ty) elements).
204 constexpr uint32_t ret_N = (N * sizeof(Ty)) >= 32 ? N : 32 / sizeof(Ty);
// (Original line 205 -- presumably the declaration of `ret`, an
// xetla_vector<Ty, ret_N> -- was dropped by the listing.)
206
207 xetla_raw_send<Ty, ret_N, uint32_t, 16, execSize, sfid, numSrc0, numDst>(
208 ret.xetla_format<native_type_t<Ty>>(), tdesc, exDesc, msg_desc);
209
// Return only the first N elements; any ret_N padding is discarded.
210 return ret.xetla_select<N, 1>(0);
211}
212
// xetla_tstore_global (footer: "Tensor store API"): stores xetla_vector<Ty, N>
// `data` to the 2D tile described by `tdesc` via a raw send. Xe-only.
// (Template-parameter continuation line 225 and declaration lines 227-228
// were dropped by the listing; the footer shows the signature as
// xetla_tstore_global(xetla_tdescriptor tdesc, xetla_vector<Ty, N> data).)
224template <typename Ty, uint32_t N, cache_hint L1H = cache_hint::none,
226__XETLA_API std::enable_if_t<arch_tag == gpu_arch::Xe, void>
// Debug-only validation of the store shape against block_2d limits.
229 dbg_level::core, core::block_2d<arch_tag, Ty>::check_store(tdesc));
230
// Message descriptor: base value 7 (store), element-size code at bit 9,
// L1/L2 store cache-hint code at bit 17, bit 25 set as in the load path.
231 uint32_t msg_desc = 7; // store operation
232 msg_desc |= detail::get_element_size_code<sizeof(Ty)>() << 9;
233 msg_desc |= detail::get_store_cache_hint_code<L1H, L2H, arch_tag>() << 17;
234 msg_desc |= 1 << 25;
235
// Second source payload length: ceil(N * sizeof(Ty) / 64) 64-byte chunks
// of data; first source is the 16-dword descriptor.
236 constexpr uint32_t numSrc1 = (N * sizeof(Ty) + 63) / 64;
237 constexpr uint32_t numSrc0 = 1;
238 constexpr uint32_t execSize = 0;
239 constexpr uint32_t sfid = 0xF;
240 constexpr uint32_t exDesc = 0;
241
242 xetla_raw_send<uint32_t, 16, Ty, N, execSize, sfid, numSrc0, numSrc1>(
243 tdesc, data, exDesc, msg_desc);
244}
245
// xetla_tprefetch_global (footer: "Tensor prefetch API"): issues a raw send
// for the tile described by `tdesc` with no destination payload, i.e. the
// data is fetched but not returned. Defaults L1 to cache_hint::cached.
// Xe-only. (Template continuation line 256 and declaration line 258 dropped;
// footer signature: xetla_tprefetch_global(xetla_tdescriptor tdesc).)
255template <typename Ty, cache_hint L1H = cache_hint::cached,
257__XETLA_API std::enable_if_t<arch_tag == gpu_arch::Xe, void>
259
// Same base descriptor value (3) as the load path, with the bit-7 and
// bit-15 flags (transform/transpose in xetla_tload_global) forced to 0,
// and the prefetch variant of the cache-hint code.
260 uint32_t msg_desc = 3;
261 msg_desc |= 0 << 7;
262 msg_desc |= detail::get_element_size_code<sizeof(Ty)>() << 9;
263 msg_desc |= 0 << 15;
264 msg_desc |= detail::get_prefetch_cache_hint_code<L1H, L2H, arch_tag>()
265 << 17;
266 msg_desc |= 1 << 25;
267
268 constexpr uint32_t numSrc0 = 1;
269 constexpr uint32_t execSize = 0;
270 constexpr uint32_t sfid = 0xF;
271 constexpr uint32_t exDesc = 0;
272
// Raw send with only the descriptor payload -- no destination, no data.
273 xetla_raw_send<uint32_t, 16, execSize, sfid, numSrc0>(
274 tdesc, exDesc, msg_desc);
275}
276
// xetla_tatomic_store_global (footer: "Tensor atomic store API"): performs
// an atomic operation Op on N scattered 64-bit addresses (base_address +
// per-lane offset), sending `data` as the payload, under mask `pred`
// (default: all lanes enabled). Xe-only. (Template continuation line 291
// -- which per the get_atomic_opcode<Op> use below carries the atomic_op Op
// parameter -- and parameter line 295 (offset, data vectors per the footer
// signature) were dropped by the listing.)
290template <typename Ty, uint32_t N, cache_hint L1H = cache_hint::none,
292 gpu_arch arch_tag = gpu_arch::Xe, typename Toffset = uint32_t>
293__XETLA_API std::enable_if_t<arch_tag == gpu_arch::Xe, void>
294xetla_tatomic_store_global(uint64_t base_address,
296 xetla_mask<N> pred = 1) {
297
// Payload sizes in 64-byte chunks: numSrc0 covers the N 64-bit addresses,
// numSrc1 covers the N Ty-sized data elements.
298 constexpr uint32_t numSrc0 = (N * sizeof(uint64_t) + 63) / 64;
299 constexpr uint32_t numSrc1 = (N * sizeof(Ty) + 63) / 64;
300
// Only 2-, 4- and 8-byte element types are supported by this path.
301 static_assert(sizeof(Ty) == 2 || sizeof(Ty) == 4 || sizeof(Ty) == 8,
302 "element_size not supported!");
// Hardware element-size code; note the mapping is non-monotonic
// (2 bytes -> 5, 4 bytes -> 2, 8 bytes -> 3).
303 uint32_t element_size_code;
304 if constexpr (sizeof(Ty) == 2) {
305 element_size_code = 5;
306 } else if constexpr (sizeof(Ty) == 4) {
307 element_size_code = 2;
308 } else if constexpr (sizeof(Ty) == 8) {
309 element_size_code = 3;
310 }
311
// Message descriptor: atomic opcode from Op, a constant 3 at bit 7 (meaning
// not visible here), element-size code at bit 9, atomic cache-hint code at
// bit 17, and the address-payload length at bit 25. (Original line 313 was
// dropped by the listing.)
312 uint32_t msg_desc = detail::get_atomic_opcode<Op>();
314 msg_desc |= 3 << 7;
315 msg_desc |= element_size_code << 9;
316 msg_desc |= detail::get_atomic_cache_hint_code<L1H, L2H, arch_tag>() << 17;
317 msg_desc |= numSrc0 << 25;
318
// Execution size derives from N here, unlike the block load/store paths
// which hard-code execSize = 0.
319 constexpr uint32_t execSize = gpu::xetla::detail::get_execSize_code<N>();
320 constexpr uint32_t sfid = 0xF;
321 constexpr uint32_t exDesc = 0;
322
// Per-lane absolute addresses: scalar base broadcast + offset vector.
323 xetla_vector<uint64_t, N> address = base_address + offset;
324
325 xetla_raw_send<uint64_t, N, Ty, N, execSize, sfid, numSrc0, numSrc1>(
326 address, data, exDesc, msg_desc, pred);
327}
328
330
331} // namespace gpu::xetla
Definition limitation.hpp:33
#define __XETLA_API
Definition common.hpp:43
C++ API.
#define DEBUG_INVOKE(level,...)
Definition debug.hpp:180
xetla_vector< uint32_t, 16 > xetla_tdescriptor
Description of nd tensor descriptor for load and store.
Definition base_types.hpp:155
typename native_type< T >::type native_type_t
Return the native data type of T.
Definition base_types.hpp:106
__ESIMD_NS::simd< native_type_t< Ty >, N > xetla_vector
wrapper for xetla_vector.
Definition base_types.hpp:149
#define xetla_tdescriptor_ref
Alias to xetla_vector<uint32_t, 16> reference.
Definition base_types.hpp:158
__ESIMD_NS::simd_mask< N > xetla_mask
wrapper for xetla_mask.
Definition base_types.hpp:165
__XETLA_API void xetla_update_tdesc_offsetx(xetla_tdescriptor_ref tdesc, int32_t doffset_x)
Update the x coordinate in the given tensor descriptor.
Definition raw_send_load_store.hpp:152
__XETLA_API std::enable_if_t< arch_tag==gpu_arch::Xe, void > xetla_tprefetch_global(xetla_tdescriptor tdesc)
Tensor prefetch API.
Definition raw_send_load_store.hpp:258
__XETLA_API std::enable_if_t< arch_tag==gpu_arch::Xe, void > xetla_tatomic_store_global(uint64_t base_address, xetla_vector< Toffset, N > offset, xetla_vector< Ty, N > data, xetla_mask< N > pred=1)
Tensor atomic store API.
Definition raw_send_load_store.hpp:294
__XETLA_API void xetla_update_tdesc_offsety(xetla_tdescriptor_ref tdesc, int32_t doffset_y)
Update the y coordinate in the given tensor descriptor.
Definition raw_send_load_store.hpp:161
__XETLA_API std::enable_if_t< arch_tag==gpu_arch::Xe, void > xetla_tstore_global(xetla_tdescriptor tdesc, xetla_vector< Ty, N > data)
Tensor store API.
Definition raw_send_load_store.hpp:227
__XETLA_API std::enable_if_t< arch_tag==gpu_arch::Xe, xetla_vector< Ty, N > > xetla_tload_global(xetla_tdescriptor tdesc)
Tensor load API.
Definition raw_send_load_store.hpp:183
__XETLA_API void xetla_fill_tdesc(xetla_tdescriptor_ref tdesc, Ty *p, int tensor_width, int tensor_height, int tensor_pitch, int offset_x, int offset_y)
Tensor descriptor construction (global memory version).
Definition raw_send_load_store.hpp:52
__XETLA_API xetla_tdescriptor xetla_get_tdesc(Ty *p, int tensor_width, int tensor_height, int tensor_pitch, int offset_x, int offset_y)
Generate a new tensor descriptor (global memory version).
Definition raw_send_load_store.hpp:106
__XETLA_API void xetla_set_tensor_offset_y(xetla_tdescriptor_ref desc, int32_t offset_y)
Definition tensor_descriptor.hpp:71
__XETLA_API void xetla_set_tensor_width_x(xetla_tdescriptor_ref desc, uint32_t width_x)
Definition tensor_descriptor.hpp:39
__XETLA_API void xetla_set_tensor_width_y(xetla_tdescriptor_ref desc, uint32_t width_y)
Definition tensor_descriptor.hpp:47
__XETLA_API void xetla_set_tensor_base_address(xetla_tdescriptor_ref desc, uint64_t base_address)
Definition tensor_descriptor.hpp:27
__XETLA_API void xetla_set_tensor_offset_x(xetla_tdescriptor_ref desc, int32_t offset_x)
Definition tensor_descriptor.hpp:63
__XETLA_API void xetla_set_tensor_pitch_x(xetla_tdescriptor_ref desc, uint32_t pitch_x)
Definition tensor_descriptor.hpp:55
__XETLA_API void xetla_set_block_widthx_widthy_arrlen(xetla_tdescriptor_ref desc, uint32_t block_widthx_widthy_arrlen)
Definition tensor_descriptor.hpp:79
__XETLA_API int32_t xetla_get_tensor_offset_x(xetla_tdescriptor desc)
Definition tensor_descriptor.hpp:67
__XETLA_API int32_t xetla_get_tensor_offset_y(xetla_tdescriptor desc)
Definition tensor_descriptor.hpp:75
Definition arch_config.hpp:24
cache_hint
L1 or L2 cache hint kinds.
Definition common.hpp:89
atomic_op
Represents an atomic operation.
Definition common.hpp:142
gpu_arch
Definition common.hpp:73