29template <
typename tile_t,
typename payload_t>
38 && (tile_t::tile_size_y == 1) && (tile_t::block_size_y == 1)
57 || payload_t::tile_desc::register_layout
64 && (payload_t::tile_desc::register_layout
95 static constexpr uint32_t tile_size_x = tile_desc::tile_size_x;
96 static constexpr uint32_t block_size_x = tile_desc::block_size_x;
97 static constexpr uint32_t block_size_y = tile_desc::block_size_y;
98 static constexpr uint32_t remained_size_y = tile_desc::remained_size_y;
100 static constexpr uint32_t block_elems = tile_desc::block_elems;
102 static constexpr uint32_t num_block_x = tile_desc::num_block_x;
103 static constexpr uint32_t num_block_y = tile_desc::num_block_y;
104 static constexpr uint32_t num_block = tile_desc::num_block;
107 payload_t::arch_tag>::template load_store_attr<msg_type::block_2d>;
109 static constexpr int32_t max_block_width
110 = load_store_attr::max_load_width_in_bytes /
sizeof(dtype);
111 static constexpr int32_t max_store_block_height
112 = load_store_attr::max_store_height_in_elem;
113 static_assert((max_block_width % block_size_x) == 0,
114 "max_block_width should be a multiply of block size x.");
115 static constexpr uint32_t elems_per_CL
116 = load_store_attr::cache_line_size_in_bytes /
sizeof(dtype);
117 static constexpr uint32_t st_blk_size_y
118 = block_size_y > max_store_block_height ? max_store_block_height
121 static constexpr uint32_t st_block_x
122 = (((tile_size_x % elems_per_CL) == 0) ? elems_per_CL
125 static constexpr uint8_t arr_len = st_block_x / block_size_x;
127 auto payload_2d = payload.payloads.xetla_format<uint32_t, num_block, 16>();
129 for (uint32_t i = 0; i < num_block_y; ++i) {
130 constexpr uint32_t store_block_elems = block_elems * arr_len;
131 auto payload_row = payload_2d.xetla_select<num_block_x, 1, 16, 1>(
133 detail::reset_tile_desc_core<num_block_x, block_size_x * arr_len,
134 st_blk_size_y, 1, 1,
false>(payload_row);
136 for (uint32_t j = 0; j < num_block_x; j += arr_len) {
138 auto reg_blk =
tile.reg.xetla_select<store_block_elems, 1>(
139 (i * num_block_x + j) * block_elems);
142 block_size_y, block_size_x * arr_len>();
144 for (uint32_t combine_i = 0; combine_i < arr_len; ++combine_i) {
145 combine_blk_2d.xetla_select<block_size_y, 1, block_size_x, 1>(
146 0, combine_i * block_size_x)
147 = reg_blk.xetla_select<block_elems, 1>(
148 combine_i * block_elems);
151 for (uint32_t ii = 0; ii < block_size_y / st_blk_size_y; ++ii) {
152 constexpr uint32_t store_elems
153 = st_blk_size_y * block_size_x * arr_len;
154 auto st_blk = combine_blk.xetla_select<store_elems, 1>(
157 payload_t::arch_tag>(tdesc, st_blk);
159 tdesc.xetla_format<uint32_t>(), st_blk_size_y);
162 if constexpr ((block_size_y % st_blk_size_y) != 0) {
163 constexpr uint32_t blk_remained_start = block_size_y
164 / st_blk_size_y * st_blk_size_y * block_size_x
166 constexpr uint8_t blk_remained_y = block_size_y % st_blk_size_y;
167 constexpr uint8_t blk_remained_elems
168 = blk_remained_y * block_size_x * arr_len;
169 auto st_blk = combine_blk.xetla_select<blk_remained_elems, 1>(
171 constexpr uint32_t block_widthx_widthy_arrlen
172 = (block_size_x * arr_len - 1)
173 | ((blk_remained_y - 1) << 8);
175 tdesc.xetla_format<uint32_t>(),
176 block_widthx_widthy_arrlen);
178 payload_t::arch_tag>(tdesc, st_blk);
183 if constexpr (remained_size_y > 0) {
184 constexpr uint32_t remained_block_elems
185 = block_size_x * remained_size_y;
186 constexpr uint32_t processed_elems
187 = num_block_y * num_block_x * block_elems;
188 constexpr uint32_t remained_st_blk_size_y
189 = st_blk_size_y > remained_size_y ? remained_size_y
191 auto payload_row = payload_2d.xetla_select<num_block_x, 1, 16, 1>(
192 num_block_y * num_block_x, 0);
193 detail::reset_tile_desc_core<num_block_x, block_size_x * arr_len,
194 remained_st_blk_size_y, 1, 1,
false>(payload_row);
196 for (uint32_t j = 0; j < num_block_x; j += arr_len) {
199 =
tile.reg.xetla_select<remained_block_elems * arr_len, 1>(
200 processed_elems + j * remained_block_elems);
204 remained_size_y, block_size_x * arr_len>();
206 for (uint32_t combine_i = 0; combine_i < arr_len; ++combine_i) {
208 .xetla_select<remained_size_y, 1, block_size_x, 1>(
209 0, combine_i * block_size_x)
210 = reg_blk.xetla_select<remained_block_elems, 1>(
211 combine_i * remained_block_elems);
214 for (uint32_t ii = 0; ii < remained_size_y / remained_st_blk_size_y;
216 constexpr uint32_t store_elems
217 = remained_st_blk_size_y * block_size_x * arr_len;
218 auto st_blk = combine_blk.xetla_select<store_elems, 1>(
221 payload_t::arch_tag>(tdesc, st_blk);
223 tdesc.xetla_format<uint32_t>(), remained_st_blk_size_y);
225 constexpr uint32_t final_st_blk_size_y
226 = remained_size_y % remained_st_blk_size_y;
227 if constexpr (final_st_blk_size_y != 0) {
228 constexpr uint32_t final_start = remained_size_y
229 / remained_st_blk_size_y * remained_st_blk_size_y
230 * block_size_x * arr_len;
231 constexpr uint32_t final_store_elems
232 = final_st_blk_size_y * block_size_x * arr_len;
233 auto st_blk = combine_blk.xetla_select<final_store_elems, 1>(
235 constexpr uint32_t block_widthx_widthy_arrlen
236 = (block_size_x * arr_len - 1)
237 | ((final_st_blk_size_y - 1) << 8);
239 tdesc.xetla_format<uint32_t>(),
240 block_widthx_widthy_arrlen);
242 payload_t::arch_tag>(tdesc, st_blk);
265 using store_dtype =
typename payload_t::mem_dtype;
267 static constexpr uint32_t tile_size_x = tile_t::tile_size_x;
268 static constexpr uint32_t scale_factor = payload_t::scale_factor;
270 constexpr uint32_t store_len = tile_size_x / scale_factor;
271 if constexpr (store_len >= 64) {
273 for (uint32_t i = 0; i < store_len / 64; i++) {
274 uint32_t offset_x = i * 64 * scale_factor;
276 =
tile.reg.xetla_select<64 * scale_factor, 1>(offset_x);
277 uint32_t address_offset = offset_x *
sizeof(dtype);
280 L2>(payload.base_ptr, payload.base_offset + address_offset,
281 reg_sub.xetla_format<store_dtype>());
284 constexpr uint32_t tail_len = store_len % 64;
285 uint32_t tail_offset = store_len / 64 * 64 * scale_factor;
286 detail::process_1d_tail<tail_len, 32, detail::process_flag::store, L1, L2>(
287 tile, payload, tail_offset);
304 typename oob_check_tag = global_atomic_oob_check_on_tag>
308 [[maybe_unused]] oob_check_tag tag = {}) {
309 constexpr bool oob_check = std::is_same<oob_check_tag,
310 global_atomic_oob_check_on_tag>::value;
311 using dtype =
typename payload_t::dtype;
312 using tile_desc =
typename payload_t::tile_desc;
313 using store_dtype =
typename payload_t::mem_dtype;
314 constexpr uint32_t num_channel_y = payload_t::num_channel_y;
315 constexpr uint32_t load_elems = num_channel_y * payload_t::num_channel_x;
316 constexpr uint32_t scale_factor = payload_t::scale_factor;
319 for (uint32_t i = 0; i < tile_desc::tile_size_y / tile_desc::block_size_y;
321 uint32_t offset_y = i * tile_desc::block_size_y;
323 for (uint32_t j = 0; j < tile_desc::num_block_x; j++) {
324 uint32_t offset_x = j * tile_desc::block_size_x;
325 auto reg_sub =
tile.reg.xetla_select<tile_desc::block_elems, 1>(
326 (i * tile_desc::num_block_x + j) * tile_desc::block_elems);
328 ? payload.step_x + payload.base_x + offset_x
329 < payload.width_in_elems
332 for (uint32_t sub_block_y = 0;
333 sub_block_y < tile_desc::block_size_y;
334 sub_block_y += num_channel_y) {
336 ? payload.step_y + payload.base_y + offset_y
338 < payload.height_in_elems
341 uint32_t address_offset = offset_x *
sizeof(dtype)
342 + (offset_y + sub_block_y) * payload.pitch_in_bytes;
344 L3, load_elems>(payload.base_ptr,
345 (payload.base_offset + address_offset
346 + payload.channel_offset),
347 reg_sub.xetla_select<load_elems * scale_factor, 1>(
348 sub_block_y * tile_desc::block_size_x)
349 .xetla_format<store_dtype>(),
355 if constexpr ((tile_desc::tile_size_y % tile_desc::block_size_y) != 0) {
356 constexpr uint32_t remained_size_y = tile_desc::remained_size_y;
357 constexpr uint32_t offset_y = tile_desc::tile_size_y - remained_size_y;
358 constexpr uint32_t processed_elems = offset_y * tile_desc::tile_size_x;
359 constexpr uint32_t remain_block_elems
360 = remained_size_y * tile_desc::block_size_x;
362 for (uint32_t j = 0; j < tile_desc::num_block_x; j++) {
363 uint32_t offset_x = j * tile_desc::block_size_x;
364 auto reg_sub =
tile.reg.xetla_select<remain_block_elems, 1>(
365 processed_elems + j * remain_block_elems);
366 xetla_mask<load_elems> pred_x = oob_check
367 ? payload.step_x + payload.base_x + offset_x
368 < payload.width_in_elems
371 for (uint32_t sub_block_y = 0; sub_block_y < remained_size_y;
372 sub_block_y += num_channel_y) {
373 xetla_mask<load_elems> pred_y = oob_check
374 ? payload.step_y + payload.base_y + offset_y
376 < payload.height_in_elems
379 uint32_t address_offset = offset_x *
sizeof(dtype)
380 + (offset_y + sub_block_y) * payload.pitch_in_bytes;
382 L3, load_elems>(payload.base_ptr,
383 (payload.base_offset + address_offset
384 + payload.channel_offset),
385 reg_sub.xetla_select<load_elems * scale_factor, 1>(
386 sub_block_y * tile_desc::block_size_x)
387 .xetla_format<store_dtype>(),
408 typename oob_check_tag = global_atomic_oob_check_on_tag>
412 [[maybe_unused]] oob_check_tag tag = {}) {
413 constexpr bool oob_check = std::is_same<oob_check_tag,
414 global_atomic_oob_check_on_tag>::value;
418 static constexpr uint32_t tile_size_x = tile_desc::tile_size_x;
419 static constexpr uint32_t tile_size_y = tile_desc::tile_size_y;
420 static constexpr uint32_t block_size_x = tile_desc::block_size_x;
421 static constexpr uint32_t block_size_y = tile_desc::block_size_y;
422 static constexpr uint32_t block_elems = tile_desc::block_elems;
423 static constexpr uint32_t num_block_x = tile_desc::num_block_x;
426 = (std::is_same<remove_const_t<dtype>,
float>::value
427 || std::is_same<remove_const_t<dtype>,
double>::value)
432 for (uint32_t i = 0; i < tile_size_y / block_size_y; i++) {
433 uint32_t offset_y = i * block_size_y;
435 for (uint32_t j = 0; j < num_block_x; j++) {
436 uint32_t offset_x = j * block_size_x;
437 auto reg_sub =
tile.reg.xetla_select<block_elems, 1>(
438 (i * num_block_x + j) * block_elems);
440 ? (payload.step_x + offset_x + payload.base_x)
441 < payload.width_in_elems
444 for (uint32_t sub_block_y = 0; sub_block_y < block_size_y;
445 sub_block_y += payload_t::num_channel_y) {
447 ? (payload.step_y + offset_y + payload.base_y
449 < payload.height_in_elems
451 uint64_t address_offset = offset_x *
sizeof(dtype)
452 + (sub_block_y + offset_y) * payload.pitch_in_bytes;
455 L2, op_kind, payload_t::arch_tag>(
456 payload.base_pointer + address_offset,
457 payload.channel_offset,
458 reg_sub.xetla_select<payload_t::store_elems, 1>(
459 sub_block_y * block_size_x),
465 if constexpr ((tile_size_y % block_size_y) != 0) {
466 constexpr uint32_t remained_size_y = tile_desc::remained_size_y;
467 constexpr uint32_t offset_y = tile_size_y - remained_size_y;
468 constexpr uint32_t processed_elems = offset_y * tile_size_x;
469 constexpr uint32_t remain_block_elems = remained_size_y * block_size_x;
471 for (uint32_t j = 0; j < num_block_x; j++) {
472 uint32_t offset_x = j * block_size_x;
473 auto reg_sub =
tile.reg.xetla_select<remain_block_elems, 1>(
474 processed_elems + j * remain_block_elems);
475 xetla_mask<payload_t::num_channel> pred_x = oob_check
476 ? (payload.step_x + offset_x + payload.base_x)
477 < payload.width_in_elems
480 for (uint32_t sub_block_y = 0; sub_block_y < remained_size_y;
481 sub_block_y += payload_t::num_channel_y) {
482 xetla_mask<payload_t::num_channel> pred_y = oob_check
483 ? (payload.step_y + offset_y + payload.base_y
485 < payload.height_in_elems
487 uint64_t address_offset = offset_x *
sizeof(dtype)
488 + (sub_block_y + offset_y) * payload.pitch_in_bytes;
491 L2, op_kind, payload_t::arch_tag>(
492 (uint64_t)payload.base_pointer + address_offset,
493 payload.channel_offset,
494 reg_sub.xetla_select<payload_t::store_elems, 1>(
495 sub_block_y * block_size_x),
521 using store_dtype =
typename payload_t::mem_dtype;
523 constexpr uint32_t num_channel_y = payload_t::num_channel_y;
524 constexpr uint32_t store_elems = num_channel_y * tile_desc::block_size_x;
526 for (uint32_t i = 0; i < tile_desc::tile_size_y / tile_desc::block_size_y;
528 uint32_t offset_y = i * tile_desc::block_size_y;
530 for (uint32_t j = 0; j < tile_desc::num_block_x; j++) {
531 uint32_t offset_x = j * tile_desc::block_size_x;
532 auto reg_sub =
tile.reg.xetla_select<tile_desc::block_elems, 1>(
533 (i * tile_desc::num_block_x + j) * tile_desc::block_elems);
535 for (uint32_t sub_block_y = 0;
536 sub_block_y < tile_desc::block_size_y;
537 sub_block_y += num_channel_y) {
538 uint32_t address_offset = offset_x *
sizeof(dtype)
539 + (sub_block_y + offset_y) * payload.pitch_in_bytes;
540 xetla_store_local<store_dtype>(payload.address + address_offset,
541 reg_sub.xetla_select<store_elems, 1>(
542 sub_block_y * tile_desc::block_size_x)
543 .xetla_format<store_dtype>());
// Tail handling for the local-scatter store: runs only when tile_size_y is not
// a whole multiple of block_size_y, i.e. a partial block row is left over.
548 if constexpr ((tile_desc::tile_size_y % tile_desc::block_size_y) != 0) {
549 constexpr uint32_t remained_size_y = tile_desc::remained_size_y;
// Rows already covered by the full-height blocks. This must be measured
// against the whole tile (tile_size_y), not a single block (block_size_y):
// offset_y drives both the register offset (processed_elems) and the
// memory row offset below, matching the other tile_store tails in this file.
550 constexpr uint32_t offset_y = tile_desc::tile_size_y - remained_size_y;
// Register elements consumed by the full block rows that precede the tail.
551 constexpr uint32_t processed_elems = offset_y * tile_desc::tile_size_x;
552 constexpr uint32_t remain_block_elems
553 = remained_size_y * tile_desc::block_size_x;
555 for (uint32_t j = 0; j < tile_desc::num_block_x; j++) {
556 uint32_t offset_x = j * tile_desc::block_size_x;
// Register slice holding the j-th tail block.
557 auto reg_sub =
tile.reg.xetla_select<remain_block_elems, 1>(
558 processed_elems + j * remain_block_elems);
// Step through the tail rows num_channel_y at a time.
560 for (uint32_t sub_block_y = 0; sub_block_y < remained_size_y;
561 sub_block_y += num_channel_y) {
// Byte offset: column offset in elements * sizeof(dtype), plus the
// absolute row (tail start + current sub-row) * pitch in bytes.
562 uint32_t address_offset = offset_x *
sizeof(dtype)
563 + (sub_block_y + offset_y) * payload.pitch_in_bytes;
564 xetla_store_local<store_dtype>(payload.address + address_offset,
565 reg_sub.xetla_select<store_elems, 1>(
566 sub_block_y * tile_desc::block_size_x)
567 .xetla_format<store_dtype>());
587__XETLA_API typename std::enable_if_t<detail::check_store_type<tile_t,
588 payload_t>::is_local_scatter_vnni_col_xe>
592 using store_dtype =
typename payload_t::store_dtype;
594 constexpr uint32_t vnni_scale_factor = payload_t::vnni_scale_factor;
595 constexpr uint32_t num_vector_size = payload_t::num_vector_size;
596 constexpr uint32_t num_channel_y = payload_t::num_channel_y;
597 constexpr uint32_t store_elems = payload_t::store_elems;
599 for (uint32_t i = 0; i < tile_desc::tile_size_y / tile_desc::block_size_y;
601 uint32_t offset_y = i * tile_desc::block_size_y;
603 for (uint32_t j = 0; j < tile_desc::num_block_x; j++) {
604 uint32_t offset_x = j * tile_desc::block_size_x;
605 auto reg_sub =
tile.reg.xetla_select<tile_desc::block_elems, 1>(
606 (i * tile_desc::num_block_x + j) * tile_desc::block_elems);
608 for (uint32_t sub_block_y = 0;
609 sub_block_y < tile_desc::block_size_y; sub_block_y
610 += num_channel_y * num_vector_size * vnni_scale_factor) {
611 uint32_t address_offset = offset_x * payload.pitch_in_bytes
612 + (sub_block_y + offset_y) *
sizeof(dtype);
613 xetla_store_local<store_dtype, num_vector_size>(
614 payload.address + address_offset,
615 reg_sub.xetla_select<store_elems, 1>(
616 sub_block_y * tile_desc::block_size_x)
617 .xetla_format<store_dtype>());
622 if constexpr ((tile_desc::tile_size_y % tile_desc::block_size_y) != 0) {
623 constexpr uint32_t remained_size_y = tile_desc::remained_size_y;
624 constexpr uint32_t offset_y = tile_desc::tile_size_y - remained_size_y;
625 constexpr uint32_t processed_elems = offset_y * tile_desc::tile_size_x;
626 constexpr uint32_t remain_block_elems
627 = remained_size_y * tile_desc::block_size_x;
629 for (uint32_t j = 0; j < tile_desc::num_block_x; j++) {
630 uint32_t offset_x = j * tile_desc::block_size_x;
631 auto reg_sub =
tile.reg.xetla_select<remain_block_elems, 1>(
632 processed_elems + j * remain_block_elems);
634 for (uint32_t sub_block_y = 0; sub_block_y < remained_size_y;
636 += num_channel_y * num_vector_size * vnni_scale_factor) {
637 uint32_t address_offset = offset_x * payload.pitch_in_bytes
638 + (sub_block_y + offset_y) *
sizeof(dtype);
639 xetla_store_local<store_dtype, num_vector_size>(
640 payload.address + address_offset,
641 reg_sub.xetla_select<store_elems, 1>(
642 sub_block_y * tile_desc::block_size_x)
643 .xetla_format<store_dtype>());
665 && tile_t::block_size_y != 1>
669 using store_dtype =
typename payload_t::mem_dtype;
671 constexpr uint32_t vector_size
672 = payload_t::bytes_per_row /
sizeof(store_dtype);
675 for (uint32_t i = 0; i < tile_desc::tile_size_y / tile_desc::block_size_y;
677 uint32_t offset_y = i * tile_desc::block_size_y;
679 for (uint32_t j = 0; j < tile_desc::num_block_x; j++) {
680 uint32_t offset_x = j * tile_desc::block_size_x;
681 auto reg_sub =
tile.reg.xetla_select<tile_desc::block_elems, 1>(
682 (i * tile_desc::num_block_x + j) * tile_desc::block_elems);
683 auto reg_sub_2d = reg_sub.xetla_format<dtype,
684 tile_desc::block_size_y, tile_desc::block_size_x>();
685 uint32_t address_offset = offset_x *
sizeof(dtype)
686 + offset_y * payload.pitch_in_bytes;
688 for (uint32_t row_i = 0; row_i < tile_desc::block_size_y; row_i++) {
689 xetla_store_local<store_dtype, vector_size>(payload.address
691 + row_i * payload.pitch_in_bytes,
692 reg_sub_2d.row(row_i).xetla_format<store_dtype>());
697 if constexpr ((tile_desc::tile_size_y % tile_desc::block_size_y) != 0) {
698 constexpr uint32_t remained_size_y = tile_desc::remained_size_y;
699 constexpr uint32_t offset_y = tile_desc::tile_size_y - remained_size_y;
700 constexpr uint32_t processed_elems = offset_y * tile_desc::tile_size_x;
701 constexpr uint32_t remain_block_elems
702 = remained_size_y * tile_desc::block_size_x;
704 for (uint32_t j = 0; j < tile_desc::num_block_x; j++) {
705 uint32_t offset_x = j * tile_desc::block_size_x;
706 auto reg_sub =
tile.reg.xetla_select<remain_block_elems, 1>(
707 processed_elems + j * remain_block_elems);
708 auto reg_sub_2d = reg_sub.xetla_format<dtype, remained_size_y,
709 tile_desc::block_size_x>();
710 uint32_t address_offset = offset_x *
sizeof(dtype)
711 + offset_y * payload.pitch_in_bytes;
713 for (uint32_t row_i = 0; row_i < remained_size_y; row_i++) {
714 xetla_store_local<store_dtype, vector_size>(payload.address
716 + row_i * payload.pitch_in_bytes,
717 reg_sub_2d.row(row_i).xetla_format<store_dtype>());
739 && tile_t::tile_size_y == 1 && tile_t::block_size_y == 1>
742 using tile_desc =
typename payload_t::tile_desc;
743 using store_dtype =
typename payload_t::mem_dtype;
745 constexpr uint32_t scale_factor = payload_t::scale_factor;
746 constexpr uint32_t store_len = tile_desc::tile_size_x / scale_factor;
747 if constexpr (store_len >= 64) {
749 for (uint32_t j = 0; j < store_len / 64; j++) {
750 uint32_t offset_x = j * 64 * scale_factor;
752 =
tile.reg.xetla_select<64 * scale_factor, 1>(offset_x);
753 uint32_t address_offset = offset_x *
sizeof(dtype);
754 xetla_store_local<store_dtype, 64>(payload.address + address_offset,
755 reg_sub.xetla_format<store_dtype>());
759 L2>(
tile, payload, store_len / 64 * 64 * scale_factor);
#define __XETLA_API
Definition common.hpp:43
xetla_vector< uint32_t, 16 > xetla_tdescriptor
Description of nd tensor descriptor for load and store.
Definition base_types.hpp:155
typename native_type< T >::type native_type_t
Return the native data type of T.
Definition base_types.hpp:106
__ESIMD_NS::simd< native_type_t< Ty >, N > xetla_vector
wrapper for xetla_vector.
Definition base_types.hpp:149
__ESIMD_NS::simd_mask< N > xetla_mask
wrapper for xetla_mask.
Definition base_types.hpp:165
__XETLA_API void xetla_store_global(Ty *p, xetla_vector< Toffset, N > offsets, xetla_vector< Ty, N *NElts > vals, xetla_mask< N > pred=1)
Stateless scattered store.
Definition memory.hpp:316
__XETLA_API std::enable_if_t< arch_tag==gpu_arch::Xe, void > xetla_tatomic_store_global(uint64_t base_address, xetla_vector< Toffset, N > offset, xetla_vector< Ty, N > data, xetla_mask< N > pred=1)
Tensor atomic store API.
Definition raw_send_load_store.hpp:294
__XETLA_API void xetla_update_tdesc_offsety(xetla_tdescriptor_ref tdesc, int32_t doffset_y)
Update the y coordinate in the given tensor descriptor.
Definition raw_send_load_store.hpp:161
__XETLA_API std::enable_if_t< arch_tag==gpu_arch::Xe, void > xetla_tstore_global(xetla_tdescriptor tdesc, xetla_vector< Ty, N > data)
Tensor store API.
Definition raw_send_load_store.hpp:227
__XETLA_API void xetla_set_block_widthx_widthy_arrlen(xetla_tdescriptor_ref desc, uint32_t block_widthx_widthy_arrlen)
Definition tensor_descriptor.hpp:79
__XETLA_API std::enable_if_t< base_len==0 > process_1d_tail(tile_t &tile, payload_t &payload, uint32_t offset)
Definition common.hpp:96
Definition limitation.hpp:457
__XETLA_API std::enable_if_t< detail::check_store_type< tile_t, payload_t >::is_global_2d_xe > tile_store(tile_t &tile, payload_t &payload)
Stores data from the register file to global memory.
Definition store_xe.hpp:91
cache_hint
L1 or L2 cache hint kinds.
Definition common.hpp:89
@ vnni_tiled_col_major
this is vnni tiled format, but for each block, they are stored in col major order
@ tile
flush out to the local scope
atomic_op
Represents an atomic operation.
Definition common.hpp:142
@ iadd
Atomic signed int add of src1 from memory data and return the old value. see
@ fadd
Atomic float add of src1 from memory data and return the old value. see
Definition arch_config.hpp:72
Definition store_xe.hpp:30
static constexpr bool is_global_block_1d_xe
Definition store_xe.hpp:37
static constexpr bool is_local_block_1d_xe
Definition store_xe.hpp:67
static constexpr bool is_local_scatter_xe
Definition store_xe.hpp:52
static constexpr bool is_global_atomic_xe
Definition store_xe.hpp:48
static constexpr bool is_global_unaligned_2d_xe
Definition store_xe.hpp:43
static constexpr bool is_local_scatter_vnni_col_xe
Definition store_xe.hpp:61
static constexpr bool is_global_2d_xe
Definition store_xe.hpp:32
Is a struct that contains some register file.
Definition api.hpp:99
tile_desc_ tile_desc
Definition api.hpp:101
dtype_ dtype
Definition api.hpp:100