/// @brief Tensor descriptor construction (global memory version), raw_send_load_store.hpp:52.
template <
        typename Ty, uint32_t block_width = 1, uint32_t block_height = 1,
        uint8_t array_len = 1>
__XETLA_API void xetla_fill_tdesc(xetla_tdescriptor_ref tdesc, Ty *p,
        int tensor_width, int tensor_height, int tensor_pitch, int offset_x,
        int offset_y) {
    // ... base address, width, height, pitch and offset setters elided in this listing ...
    uint32_t block_widthx_widthy_arrlen = (block_width - 1)
            | ((block_height - 1) << 8) | ((array_len - 1) << 16);
    xetla_set_block_widthx_widthy_arrlen(tdesc, block_widthx_widthy_arrlen);
}
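The packed block field is the only descriptor field fully visible in this listing: bits [7:0] hold block_width - 1, bits [15:8] hold block_height - 1, and bits [23:16] hold array_len - 1. A small standalone C++ check of that layout (the helper name pack_block is invented for illustration; no XeTLA headers are needed):

#include <cstdint>
#include <cstdio>

// Mirrors the packing in xetla_fill_tdesc: block width-1 in bits [7:0],
// block height-1 in bits [15:8], array length-1 in bits [23:16].
constexpr uint32_t pack_block(
        uint32_t block_width, uint32_t block_height, uint32_t array_len) {
    return (block_width - 1) | ((block_height - 1) << 8)
            | ((array_len - 1) << 16);
}

static_assert(pack_block(16, 8, 2) == (15u | (7u << 8) | (1u << 16)),
        "packed block field layout");

int main() {
    // A 32x16 block with array length 1 packs to 0x000F1F.
    std::printf("0x%06X\n", pack_block(32, 16, 1));
    return 0;
}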
/// A second descriptor-construction overload (raw_send_load_store.hpp:80) takes
/// a 32-bit base address instead of a raw pointer; the rest of its parameter
/// list mirrors the pointer version:
///     (..., uint32_t base_address, int tensor_width, int tensor_height,
///      int tensor_pitch, int offset_x, int offset_y)
/// @brief Generate a new tensor descriptor (global memory version), raw_send_load_store.hpp:106.
template <
        typename Ty, uint32_t block_width = 1, uint32_t block_height = 1,
        uint8_t array_len = 1>
__XETLA_API xetla_tdescriptor xetla_get_tdesc(Ty *p, int tensor_width,
        int tensor_height, int tensor_pitch, int offset_x, int offset_y) {
    xetla_tdescriptor tdesc;
    auto tdesc_ref = tdesc.xetla_format<uint32_t>();
    // ... base address, width, height, pitch and offset setters elided in this listing ...
    uint32_t block_widthx_widthy_arrlen = (block_width - 1)
            | ((block_height - 1) << 8) | ((array_len - 1) << 16);
    xetla_set_block_widthx_widthy_arrlen(tdesc_ref, block_widthx_widthy_arrlen);
    return tdesc;
}
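A hedged usage sketch of the descriptor constructor together with the offset updaters from the reference list at the bottom of this page. It assumes the XeTLA headers and the gpu::xetla namespace are in scope and that the code runs inside an ESIMD kernel; the tensor shape, block shape, and element type are invented:

// Sketch only. Assumes XeTLA headers / gpu::xetla namespace and an ESIMD
// kernel context; the 1024x512 float tensor and 16x8 block are invented.
void build_descriptor(float *ptr) {
    xetla_tdescriptor tdesc;
    // Row-major tensor whose pitch equals its width; block window starts at (0, 0).
    xetla_fill_tdesc<float, 16, 8>(tdesc.xetla_format<uint32_t>(), ptr,
            /*tensor_width*/ 1024, /*tensor_height*/ 512,
            /*tensor_pitch*/ 1024, /*offset_x*/ 0, /*offset_y*/ 0);

    // Slide the block window 16 columns right and 8 rows down for the next access.
    xetla_update_tdesc_offsetx(tdesc.xetla_format<uint32_t>(), 16);
    xetla_update_tdesc_offsety(tdesc.xetla_format<uint32_t>(), 8);
}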
/// A further descriptor helper (raw_send_load_store.hpp:134) is templated only on
/// the element type, takes the same shape parameters (tensor_width, tensor_height,
/// tensor_pitch, offset_x, ...), and likewise views the descriptor as uint32_t:
template <typename Ty>
// ... signature elided in this listing ...
    auto tdesc_ref = tdesc.xetla_format<uint32_t>();
/// @brief Tensor load API, raw_send_load_store.hpp:183.
/// (Template header with Ty, N, the L1H/L2H cache hints, the transpose/transform
///  flags, and arch_tag is elided in this listing.)
__XETLA_API std::enable_if_t<arch_tag == gpu_arch::Xe, xetla_vector<Ty, N>>
xetla_tload_global(xetla_tdescriptor tdesc) {
    // Destination register count: one per 64 bytes of data, capped at 31.
    constexpr uint32_t numDst = 31 < ((N * sizeof(Ty) + 63) / 64)
            ? 31
            : ((N * sizeof(Ty) + 63) / 64);
    uint32_t msg_desc = 3; // 2D block load opcode
    msg_desc |= (transform ? 1 : 0) << 7;
    msg_desc |= detail::get_element_size_code<sizeof(Ty)>() << 9;
    msg_desc |= (transpose ? 1 : 0) << 15;
    msg_desc |= detail::get_load_cache_hint_code<L1H, L2H, arch_tag>() << 17;
    msg_desc |= numDst << 20;

    constexpr uint32_t numSrc0 = 1;
    constexpr uint32_t execSize = 0;
    constexpr uint32_t sfid = 0xF;
    constexpr uint32_t exDesc = 0;

    // The raw send always returns at least a full 32-byte vector.
    constexpr uint32_t ret_N = (N * sizeof(Ty)) >= 32 ? N : 32 / sizeof(Ty);
    xetla_vector<Ty, ret_N> ret
            = xetla_raw_send<Ty, ret_N, uint32_t, 16, execSize, sfid, numSrc0,
                    numDst>(tdesc, exDesc, msg_desc);
    return ret.xetla_select<N, 1>(0);
}
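The message descriptor assembled above is a plain bitfield; the following host-side sketch reproduces the same field positions. The values passed for the element-size and cache-hint codes are placeholders for the detail:: helpers, whose encodings are not part of this listing:

#include <cstdint>
#include <cstdio>

// Rebuilds the load message descriptor with the shifts used above. The
// element_size_code and cache_hint_code arguments stand in for the detail::
// helpers, which are not reproduced on this page.
uint32_t load_msg_desc(bool transform, bool transpose,
        uint32_t element_size_code, uint32_t cache_hint_code, uint32_t numDst) {
    uint32_t msg_desc = 3;                   // 2D block load opcode
    msg_desc |= (transform ? 1u : 0u) << 7;  // transform flag
    msg_desc |= element_size_code << 9;      // element size field
    msg_desc |= (transpose ? 1u : 0u) << 15; // transpose flag
    msg_desc |= cache_hint_code << 17;       // L1/L2 cache hint field
    msg_desc |= numDst << 20;                // destination register count
    return msg_desc;
}

int main() {
    // 64 4-byte elements: numDst = min(31, (64 * 4 + 63) / 64) = 4.
    constexpr uint32_t N = 64, elem = 4;
    constexpr uint32_t numDst
            = 31 < (N * elem + 63) / 64 ? 31 : (N * elem + 63) / 64;
    std::printf("msg_desc = 0x%08X\n",
            load_msg_desc(false, false, /*element_size_code*/ 2,
                    /*cache_hint_code*/ 0, numDst));
    return 0;
}

The ret_N clamp right above serves a different purpose: it guarantees the raw send always returns at least a full 32-byte vector, from which the first N elements are then selected.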
/// @brief Tensor store API, raw_send_load_store.hpp:227.
/// (Template header with Ty, N, the L1H/L2H cache hints, and arch_tag is elided
///  in this listing.)
__XETLA_API std::enable_if_t<arch_tag == gpu_arch::Xe, void>
xetla_tstore_global(xetla_tdescriptor tdesc, xetla_vector<Ty, N> data) {
    uint32_t msg_desc = 7; // 2D block store opcode
    msg_desc |= detail::get_element_size_code<sizeof(Ty)>() << 9;
    msg_desc |= detail::get_store_cache_hint_code<L1H, L2H, arch_tag>() << 17;

    // Payload register count: one per 64 bytes of data.
    constexpr uint32_t numSrc1 = (N * sizeof(Ty) + 63) / 64;
    constexpr uint32_t numSrc0 = 1;
    constexpr uint32_t execSize = 0;
    constexpr uint32_t sfid = 0xF;
    constexpr uint32_t exDesc = 0;

    xetla_raw_send<uint32_t, 16, Ty, N, execSize, sfid, numSrc0, numSrc1>(
            tdesc, data, exDesc, msg_desc);
}
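A hedged end-to-end sketch of a block copy through these two entry points, using the signatures from the reference list below. The explicit <element type, element count> template argument order, the cache-hint defaults, and the tensor/block shapes are assumptions:

// Sketch only. Assumes XeTLA headers / gpu::xetla namespace, an ESIMD kernel
// context, and that Ty and N are the leading template parameters of the
// load/store entry points. Shapes and the float element type are invented.
void copy_block(float *src, float *dst) {
    constexpr uint32_t block_w = 16, block_h = 8;

    auto src_desc = xetla_get_tdesc<float, block_w, block_h>(
            src, 1024, 512, 1024, /*offset_x*/ 0, /*offset_y*/ 0);
    auto dst_desc = xetla_get_tdesc<float, block_w, block_h>(
            dst, 1024, 512, 1024, /*offset_x*/ 0, /*offset_y*/ 0);

    // One 16x8 tile in, one 16x8 tile out.
    auto tile = xetla_tload_global<float, block_w * block_h>(src_desc);
    xetla_tstore_global<float, block_w * block_h>(dst_desc, tile);
}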
/// @brief Tensor prefetch API, raw_send_load_store.hpp:258.
/// (Template header with Ty, the L1H/L2H cache hints, and arch_tag is elided in
///  this listing.)
__XETLA_API std::enable_if_t<arch_tag == gpu_arch::Xe, void>
xetla_tprefetch_global(xetla_tdescriptor tdesc) {
    uint32_t msg_desc = 3; // same opcode as the 2D block load, but with no destination
    msg_desc |= detail::get_element_size_code<sizeof(Ty)>() << 9;
    msg_desc |= detail::get_prefetch_cache_hint_code<L1H, L2H, arch_tag>()
            << 17;

    constexpr uint32_t numSrc0 = 1;
    constexpr uint32_t execSize = 0;
    constexpr uint32_t sfid = 0xF;
    constexpr uint32_t exDesc = 0;

    xetla_raw_send<uint32_t, 16, execSize, sfid, numSrc0>(
            tdesc, exDesc, msg_desc);
}
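Prefetch is normally issued some distance ahead of the load that will consume the data. A hedged sketch of that pattern with two descriptors over the same tensor, one leading the other by a block; the prefetch distance, shapes, element type, and the assumption that the element type is the leading template parameter are all invented for illustration:

// Sketch only. Assumes XeTLA headers / gpu::xetla namespace and an ESIMD
// kernel context; the 16x8 float block and one-block prefetch distance are
// invented.
void stream_row(float *src) {
    auto load_desc = xetla_get_tdesc<float, 16, 8>(
            src, 1024, 512, 1024, /*offset_x*/ 0, /*offset_y*/ 0);
    auto prefetch_desc = xetla_get_tdesc<float, 16, 8>(
            src, 1024, 512, 1024, /*offset_x*/ 16, /*offset_y*/ 0);

    for (int x = 0; x < 1024; x += 16) {
        // Warm the caches one block ahead of the consuming load.
        xetla_tprefetch_global<float>(prefetch_desc);
        auto tile = xetla_tload_global<float, 16 * 8>(load_desc);
        (void)tile; // placeholder for real work on the tile

        xetla_update_tdesc_offsetx(load_desc.xetla_format<uint32_t>(), 16);
        xetla_update_tdesc_offsetx(prefetch_desc.xetla_format<uint32_t>(), 16);
    }
}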
/// @brief Tensor atomic store API, raw_send_load_store.hpp:294.
/// (Template header with the atomic_op Op, Ty, N, Toffset, the L1H/L2H cache
///  hints, and arch_tag is elided in this listing.)
__XETLA_API std::enable_if_t<arch_tag == gpu_arch::Xe, void>
xetla_tatomic_store_global(uint64_t base_address,
        xetla_vector<Toffset, N> offset, xetla_vector<Ty, N> data,
        xetla_mask<N> pred = 1) {
    constexpr uint32_t numSrc0 = (N * sizeof(uint64_t) + 63) / 64;
    constexpr uint32_t numSrc1 = (N * sizeof(Ty) + 63) / 64;

    static_assert(sizeof(Ty) == 2 || sizeof(Ty) == 4 || sizeof(Ty) == 8,
            "element_size not supported!");
    uint32_t element_size_code;
    if constexpr (sizeof(Ty) == 2) {
        element_size_code = 5;
    } else if constexpr (sizeof(Ty) == 4) {
        element_size_code = 2;
    } else if constexpr (sizeof(Ty) == 8) {
        element_size_code = 3;
    }

    uint32_t msg_desc = detail::get_atomic_opcode<Op>();
    msg_desc |= element_size_code << 9;
    msg_desc |= detail::get_atomic_cache_hint_code<L1H, L2H, arch_tag>() << 17;
    msg_desc |= numSrc0 << 25;

    constexpr uint32_t execSize = gpu::xetla::detail::get_execSize_code<N>();
    constexpr uint32_t sfid = 0xF;
    constexpr uint32_t exDesc = 0;

    // ... per-lane 64-bit address vector (base_address plus offset) built here ...
    xetla_raw_send<uint64_t, N, Ty, N, execSize, sfid, numSrc0, numSrc1>(
            address, data, exDesc, msg_desc, pred);
}
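The atomic path follows the same descriptor recipe with two twists visible above: the element-size code is chosen inline (2-byte -> 5, 4-byte -> 2, 8-byte -> 3) and the address-payload register count is shifted into the field at bit 25. A standalone sketch of that assembly; the opcode and cache-hint values are placeholders for the detail:: helpers:

#include <cstdint>
#include <cstdio>

// Element-size codes used by the atomic path above.
constexpr uint32_t element_size_code(uint32_t elem_bytes) {
    return elem_bytes == 2 ? 5u : elem_bytes == 4 ? 2u : 3u;
}

// Rebuilds the atomic message descriptor with the shifts used above. The
// atomic_opcode and cache_hint_code arguments stand in for
// detail::get_atomic_opcode and detail::get_atomic_cache_hint_code.
uint32_t atomic_msg_desc(uint32_t atomic_opcode, uint32_t elem_bytes,
        uint32_t cache_hint_code, uint32_t N) {
    const uint32_t numSrc0 = (N * 8 + 63) / 64; // one 64-bit address per lane
    uint32_t msg_desc = atomic_opcode;
    msg_desc |= element_size_code(elem_bytes) << 9;
    msg_desc |= cache_hint_code << 17;
    msg_desc |= numSrc0 << 25;
    return msg_desc;
}

int main() {
    // 16 lanes of 4-byte data: the address payload spans (16 * 8 + 63) / 64 = 2
    // 64-byte chunks, the data payload (16 * 4 + 63) / 64 = 1.
    std::printf("0x%08X\n", atomic_msg_desc(/*opcode*/ 0x12, 4, 0, 16));
    return 0;
}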
Symbols referenced above, with their brief descriptions and definition sites:

#define __XETLA_API (common.hpp:43)
#define DEBUG_INVOKE(level, ...) (debug.hpp:180)
xetla_tdescriptor = xetla_vector<uint32_t, 16>: description of an nd tensor descriptor for load and store (base_types.hpp:155)
native_type_t<T> = typename native_type<T>::type: the native data type of T (base_types.hpp:106)
xetla_vector<Ty, N> = __ESIMD_NS::simd<native_type_t<Ty>, N>: wrapper around the ESIMD vector type (base_types.hpp:149)
#define xetla_tdescriptor_ref: alias to a xetla_vector<uint32_t, 16> reference (base_types.hpp:158)
xetla_mask<N> = __ESIMD_NS::simd_mask<N>: wrapper around the ESIMD mask type (base_types.hpp:165)
__XETLA_API void xetla_update_tdesc_offsetx(xetla_tdescriptor_ref tdesc, int32_t doffset_x): update the x coordinate in the given tensor descriptor (raw_send_load_store.hpp:152)
__XETLA_API std::enable_if_t<arch_tag == gpu_arch::Xe, void> xetla_tprefetch_global(xetla_tdescriptor tdesc): tensor prefetch API (raw_send_load_store.hpp:258)
__XETLA_API std::enable_if_t<arch_tag == gpu_arch::Xe, void> xetla_tatomic_store_global(uint64_t base_address, xetla_vector<Toffset, N> offset, xetla_vector<Ty, N> data, xetla_mask<N> pred = 1): tensor atomic store API (raw_send_load_store.hpp:294)
__XETLA_API void xetla_update_tdesc_offsety(xetla_tdescriptor_ref tdesc, int32_t doffset_y): update the y coordinate in the given tensor descriptor (raw_send_load_store.hpp:161)
__XETLA_API std::enable_if_t<arch_tag == gpu_arch::Xe, void> xetla_tstore_global(xetla_tdescriptor tdesc, xetla_vector<Ty, N> data): tensor store API (raw_send_load_store.hpp:227)
__XETLA_API std::enable_if_t<arch_tag == gpu_arch::Xe, xetla_vector<Ty, N>> xetla_tload_global(xetla_tdescriptor tdesc): tensor load API (raw_send_load_store.hpp:183)
__XETLA_API void xetla_fill_tdesc(xetla_tdescriptor_ref tdesc, Ty *p, int tensor_width, int tensor_height, int tensor_pitch, int offset_x, int offset_y): tensor descriptor construction, global memory version (raw_send_load_store.hpp:52)
__XETLA_API xetla_tdescriptor xetla_get_tdesc(Ty *p, int tensor_width, int tensor_height, int tensor_pitch, int offset_x, int offset_y): generate a new tensor descriptor, global memory version (raw_send_load_store.hpp:106)
__XETLA_API void xetla_set_tensor_offset_y(xetla_tdescriptor_ref desc, int32_t offset_y) (tensor_descriptor.hpp:71)
__XETLA_API void xetla_set_tensor_width_x(xetla_tdescriptor_ref desc, uint32_t width_x) (tensor_descriptor.hpp:39)
__XETLA_API void xetla_set_tensor_width_y(xetla_tdescriptor_ref desc, uint32_t width_y) (tensor_descriptor.hpp:47)
__XETLA_API void xetla_set_tensor_base_address(xetla_tdescriptor_ref desc, uint64_t base_address) (tensor_descriptor.hpp:27)
__XETLA_API void xetla_set_tensor_offset_x(xetla_tdescriptor_ref desc, int32_t offset_x) (tensor_descriptor.hpp:63)
__XETLA_API void xetla_set_tensor_pitch_x(xetla_tdescriptor_ref desc, uint32_t pitch_x) (tensor_descriptor.hpp:55)
__XETLA_API void xetla_set_block_widthx_widthy_arrlen(xetla_tdescriptor_ref desc, uint32_t block_widthx_widthy_arrlen) (tensor_descriptor.hpp:79)
__XETLA_API int32_t xetla_get_tensor_offset_x(xetla_tdescriptor desc) (tensor_descriptor.hpp:67)
__XETLA_API int32_t xetla_get_tensor_offset_y(xetla_tdescriptor desc) (tensor_descriptor.hpp:75)
cache_hint: L1 or L2 cache hint kinds (common.hpp:89)
atomic_op: represents an atomic operation (common.hpp:142)
gpu_arch (common.hpp:73)