36template <
typename dtype_,
typename tile_desc_,
mem_layout mem_layout_,
37 gpu_arch arch_tag_, uint32_t alignment_>
41 std::enable_if_t<(arch_tag_ == gpu_arch::Xe)>> {
49 static constexpr gpu_arch arch_tag = arch_tag_;
52 static constexpr uint32_t block_size_x = tile_desc::block_size_x;
53 static constexpr uint32_t block_size_y = tile_desc::block_size_y;
54 static constexpr uint32_t num_block_x = tile_desc::num_block_x;
55 static constexpr uint32_t num_block_y = tile_desc::num_block_y;
56 static constexpr uint32_t num_block = tile_desc::num_block;
57 static constexpr uint32_t remained_size_y = tile_desc::remained_size_y;
62 static constexpr bool mem_transpose
65 static constexpr reg_layout register_layout = tile_desc::register_layout;
66 static constexpr bool reg_transpose
68 static constexpr bool trans = mem_transpose ^ reg_transpose;
70 static constexpr bool mem_transform = (
sizeof(
dtype) < 4) && !mem_transpose
73 static constexpr bool mem_dword_transpose = (
sizeof(
dtype) < 4) && trans;
75 using mem_dtype =
typename std::conditional<mem_dword_transpose, uint32_t,
82 this->payload = rhs.payload;
89 / int32_t(scale_factor);
91 base_tdesc.xetla_format<uint32_t>(), offset);
92 prepare_tdesc(base_tdesc);
96 uint32_t surface_height, uint32_t surface_pitch,
97 int32_t surface_offset_x = 0, int32_t surface_offset_y = 0) {
100 surface_height, surface_pitch,
101 surface_offset_x / int32_t(scale_factor), surface_offset_y);
102 prepare_tdesc(base_tdesc);
109 / int32_t(scale_factor);
111 base_tdesc.xetla_format<uint32_t>(), offset);
112 prepare_tdesc(base_tdesc);
116 uint32_t surface_height, uint32_t surface_pitch,
117 int32_t surface_offset_x = 0, int32_t surface_offset_y = 0) {
120 surface_height, surface_pitch,
121 surface_offset_x / int32_t(scale_factor), surface_offset_y);
122 prepare_tdesc(base_tdesc);
131 this->payload = rhs.payload;
135 template <tdesc_update_dir update_dir = tdesc_update_dir::x_dir>
137 auto payloads_2d = payloads.xetla_format<uint32_t, num_block, 16>();
140 for (uint32_t i = 0; i < num_block; i++) {
142 payloads_2d.row(i), offset / int32_t(scale_factor));
146 for (uint32_t i = 0; i < num_block; i++) {
154 auto payloads_2d = payloads.xetla_format<uint32_t, num_block, 16>();
155 uint32_t base_offset_y = 0;
157 for (uint32_t i = 0; i < num_block_y; i++) {
158 auto tdesc_row_2d = payloads_2d.xetla_select<num_block_x, 1, 16, 1>(
160 prepare_tile_desc_core<num_block_x, block_size_x, block_size_y, 1,
161 mem_transpose>(tdesc_row_2d, base_tdesc, base_offset_y);
162 base_offset_y += block_size_y;
165 if constexpr (remained_size_y > 0) {
166 auto tdesc_row_2d = payloads_2d.xetla_select<num_block_x, 1, 16, 1>(
167 num_block_y * num_block_x, 0);
168 prepare_tile_desc_core<num_block_x, block_size_x, remained_size_y,
169 1, mem_transpose>(tdesc_row_2d, base_tdesc, base_offset_y);
173 template <uint32_t num_tdesc, uint32_t size_x, uint32_t size_y,
174 uint8_t arr_len,
bool trans>
176 xetla_matrix_ref<uint32_t, num_tdesc, 16>
__REF__ payloads_row_2d,
178 uint32_t base_offset_x = 0;
180 for (uint32_t j = 0; j < num_tdesc; j++) {
181 payloads_row_2d.row(j) = base_tdesc;
183 constexpr uint8_t block_width
184 = trans ? (size_y / scale_factor) : (size_x / scale_factor);
185 constexpr uint8_t block_height = trans ? size_x : size_y;
186 constexpr uint32_t block_widthx_widthy_arrlen = (block_width - 1)
187 | ((block_height - 1) << 8) | ((arr_len - 1) << 16);
189 payloads_row_2d.row(j), block_widthx_widthy_arrlen);
192 uint32_t offset_width = trans
193 ? (base_offset_y / int32_t(scale_factor))
194 : (base_offset_x / int32_t(scale_factor));
195 uint32_t offset_height = trans ? base_offset_x : base_offset_y;
199 base_offset_x += size_x * arr_len;
212template <
typename dtype_,
typename tile_desc_,
gpu_arch arch_tag_,
217 std::enable_if_t<(arch_tag_ == gpu_arch::Xe)>> {
225 static constexpr uint32_t alignment_in_bytes
226 = mem_desc_t::alignment_in_bytes;
228 static_assert((alignment_in_bytes %
sizeof(uint32_t)) == 0,
229 "alignment should at least DW aligned");
232 static constexpr uint32_t tile_size_x = tile_desc::tile_size_x;
233 static constexpr uint32_t tile_size_y = tile_desc::tile_size_y;
234 static_assert(tile_size_y == 1,
235 "For tile_size_y > 1 case, please use 2d block message! ");
240 static constexpr uint32_t bytes_per_row = tile_size_x *
sizeof(
dtype);
242 (bytes_per_row %
sizeof(uint64_t) == 0)
243 && (alignment_in_bytes %
sizeof(uint64_t) == 0),
245 typename std::conditional<(bytes_per_row %
sizeof(uint32_t) == 0),
246 uint32_t,
dtype>::type>::type;
254 pitch_in_bytes = mem_tdesc.shape.stride *
sizeof(
dtype);
255 uint32_t offset_x = mem_tdesc.coord.x;
256 uint32_t offset_y = mem_tdesc.coord.y;
257 base_offset = offset_y * pitch_in_bytes + offset_x *
sizeof(
dtype);
258 base_ptr = (
mem_dtype *)mem_tdesc.base.base;
262 [[maybe_unused]]
int surface_height,
int surface_pitch,
263 int surface_offset_x,
int surface_offset_y) {
264 pitch_in_bytes = surface_pitch *
sizeof(
dtype);
265 uint32_t offset_x = surface_offset_x;
266 uint32_t offset_y = surface_offset_y;
267 base_offset = offset_y * pitch_in_bytes + offset_x *
sizeof(
dtype);
272 pitch_in_bytes = mem_tdesc.shape.stride *
sizeof(
dtype);
273 uint32_t offset_x = mem_tdesc.coord.x;
274 uint32_t offset_y = mem_tdesc.coord.y;
275 base_offset = offset_y * pitch_in_bytes + offset_x *
sizeof(
dtype);
276 base_ptr = (
mem_dtype *)mem_tdesc.base.base;
280 [[maybe_unused]]
int surface_height,
int surface_pitch,
281 int surface_offset_x,
int surface_offset_y) {
282 pitch_in_bytes = surface_pitch *
sizeof(
dtype);
283 uint32_t offset_x = surface_offset_x;
284 uint32_t offset_y = surface_offset_y;
285 base_offset = offset_y * pitch_in_bytes + offset_x *
sizeof(
dtype);
290 this->base_offset = rhs.base_offset;
291 this->base_ptr = rhs.base_ptr;
292 this->pitch_in_bytes = rhs.pitch_in_bytes;
297 this->base_offset = rhs.base_offset;
298 this->base_ptr = rhs.base_ptr;
299 this->pitch_in_bytes = rhs.pitch_in_bytes;
303 template <tdesc_update_dir update_dir = tdesc_update_dir::x_dir>
306 base_offset += int64_t(offset) *
sizeof(
dtype);
308 base_offset += int64_t(offset) * pitch_in_bytes;
319template <
typename dtype_,
typename tile_desc_,
gpu_arch arch_tag_,
324 std::enable_if_t<(arch_tag_ == gpu_arch::Xe)>> {
332 static constexpr uint32_t alignment_in_bytes
333 = mem_desc_t::alignment_in_bytes;
336 sizeof(
dtype) >= 4,
"for atomic add, we only support DW or QW");
339 static constexpr uint32_t tile_size_x = tile_desc::tile_size_x;
340 static constexpr uint32_t tile_size_y = tile_desc::tile_size_y;
341 static constexpr uint32_t block_size_x = tile_desc::block_size_x;
342 static constexpr uint32_t block_size_y = tile_desc::block_size_y;
347 static constexpr uint32_t tile_bytes
348 = tile_size_x * tile_size_y *
sizeof(
dtype);
349 static constexpr uint32_t block_bytes
350 = block_size_x * block_size_y *
sizeof(
dtype);
353 static constexpr uint32_t min_store_bytes = 16 *
sizeof(
dtype);
354 static constexpr uint32_t max_store_bytes = 32 *
sizeof(
dtype);
355 static constexpr uint32_t num_channel
356 = ((tile_bytes % max_store_bytes) == 0
357 && (block_bytes % max_store_bytes) == 0)
361 static constexpr uint32_t num_channel_x = block_size_x;
362 static constexpr uint32_t num_channel_y = num_channel / num_channel_x;
363 static constexpr uint32_t store_elems = num_channel_y * block_size_x;
376 pitch_in_bytes = mem_tdesc.shape.stride *
sizeof(
dtype);
377 base_x = mem_tdesc.coord.x;
378 base_y = mem_tdesc.coord.y;
379 width_in_elems = mem_tdesc.shape.x;
380 height_in_elems = mem_tdesc.shape.y;
381 base_pointer = (uint64_t)mem_tdesc.base.base;
382 base_pointer += base_y * pitch_in_bytes + base_x *
sizeof(
dtype);
384 = xetla_vector_gen<uint32_t, num_channel>(0, 1);
385 step_x = channel_index % num_channel_x;
386 step_y = channel_index / num_channel_x;
387 channel_offset = step_x *
sizeof(
dtype) + step_y * pitch_in_bytes;
391 int surface_pitch,
int surface_offset_x,
int surface_offset_y) {
392 pitch_in_bytes = surface_pitch *
sizeof(
dtype);
393 base_x = surface_offset_x;
394 base_y = surface_offset_y;
395 width_in_elems = surface_width;
396 height_in_elems = surface_height;
397 base_pointer = (uint64_t)p;
398 base_pointer += base_y * pitch_in_bytes + base_x *
sizeof(
dtype);
400 = xetla_vector_gen<uint32_t, num_channel>(0, 1);
401 step_x = channel_index % num_channel_x;
402 step_y = channel_index / num_channel_x;
403 channel_offset = step_x *
sizeof(
dtype) + step_y * pitch_in_bytes;
407 int surface_pitch,
int surface_offset_x,
int surface_offset_y) {
408 pitch_in_bytes = surface_pitch *
sizeof(
dtype);
409 base_x = surface_offset_x;
410 base_y = surface_offset_y;
411 width_in_elems = surface_width;
412 height_in_elems = surface_height;
413 base_pointer = (uint64_t)p;
414 base_pointer += base_y * pitch_in_bytes + base_x *
sizeof(
dtype);
416 = xetla_vector_gen<uint32_t, num_channel>(0, 1);
417 step_x = channel_index % num_channel_x;
418 step_y = channel_index / num_channel_x;
419 channel_offset = step_x *
sizeof(
dtype) + step_y * pitch_in_bytes;
423 pitch_in_bytes = mem_tdesc.shape.stride *
sizeof(
dtype);
424 base_x = mem_tdesc.coord.x;
425 base_y = mem_tdesc.coord.y;
426 width_in_elems = mem_tdesc.shape.x;
427 height_in_elems = mem_tdesc.shape.y;
428 base_pointer = (uint64_t)mem_tdesc.base.base;
429 base_pointer += base_y * pitch_in_bytes + base_x *
sizeof(
dtype);
431 = xetla_vector_gen<uint32_t, num_channel>(0, 1);
432 step_x = channel_index % num_channel_x;
433 step_y = channel_index / num_channel_x;
434 channel_offset = step_x *
sizeof(
dtype) + step_y * pitch_in_bytes;
438 this->pitch_in_bytes = rhs.pitch_in_bytes;
439 this->width_in_elems = rhs.width_in_elems;
440 this->height_in_elems = rhs.height_in_elems;
441 this->base_x = rhs.base_x;
442 this->base_y = rhs.base_y;
443 this->base_pointer = rhs.base_pointer;
444 this->channel_offset = rhs.channel_offset;
445 this->step_x = rhs.step_x;
446 this->step_y = rhs.step_y;
451 this->pitch_in_bytes = rhs.pitch_in_bytes;
452 this->width_in_elems = rhs.width_in_elems;
453 this->height_in_elems = rhs.height_in_elems;
454 this->base_x = rhs.base_x;
455 this->base_y = rhs.base_y;
456 this->base_pointer = rhs.base_pointer;
457 this->channel_offset = rhs.channel_offset;
458 this->step_x = rhs.step_x;
459 this->step_y = rhs.step_y;
463 template <tdesc_update_dir update_dir = tdesc_update_dir::x_dir>
466 base_pointer += int64_t(offset) *
sizeof(
dtype);
469 base_pointer += int64_t(offset) * pitch_in_bytes;
480template <
typename dtype_,
typename tile_desc_,
gpu_arch arch_tag_,
485 std::enable_if_t<(arch_tag_ == gpu_arch::Xe)>> {
493 static constexpr uint32_t alignment_in_bytes
494 = mem_desc_t::alignment_in_bytes;
495 static_assert((alignment_in_bytes %
sizeof(uint32_t)) == 0,
496 "alignment should at least DW aligned");
500 static constexpr uint32_t tile_size_x = tile_desc::tile_size_x;
501 static constexpr uint32_t tile_size_y = tile_desc::tile_size_y;
502 static constexpr uint32_t block_size_x = tile_desc::block_size_x;
503 static constexpr uint32_t block_size_y = tile_desc::block_size_y;
508 static constexpr uint32_t tile_bytes
509 = tile_size_x * tile_size_y *
sizeof(
dtype);
510 static constexpr uint32_t block_bytes
511 = block_size_x * block_size_y *
sizeof(
dtype);
512 static constexpr uint32_t bytes_per_row = block_size_x *
sizeof(
dtype);
514 (bytes_per_row %
sizeof(uint64_t) == 0)
515 && (alignment_in_bytes %
sizeof(uint64_t) == 0),
517 typename std::conditional<(bytes_per_row %
sizeof(uint32_t) == 0),
518 uint32_t,
dtype>::type>::type;
525 pitch_in_bytes = mem_tdesc.shape.stride *
sizeof(
dtype);
526 uint32_t offset_x = mem_tdesc.coord.x;
527 uint32_t offset_y = mem_tdesc.coord.y;
528 address = mem_tdesc.base.base + offset_y * pitch_in_bytes
529 + offset_x *
sizeof(
dtype);
532 [[maybe_unused]]
int surface_height,
int surface_pitch,
533 int surface_offset_x,
int surface_offset_y) {
534 uint32_t offset_x = surface_offset_x;
535 uint32_t offset_y = surface_offset_y;
536 pitch_in_bytes = surface_pitch *
sizeof(
dtype);
537 address = base + offset_y * pitch_in_bytes + offset_x *
sizeof(
dtype);
541 pitch_in_bytes = mem_tdesc.shape.stride *
sizeof(
dtype);
542 uint32_t offset_x = mem_tdesc.coord.x;
543 uint32_t offset_y = mem_tdesc.coord.y;
544 address = mem_tdesc.base.base + offset_y * pitch_in_bytes
545 + offset_x *
sizeof(
dtype);
549 [[maybe_unused]]
int surface_height,
int surface_pitch,
550 int surface_offset_x,
int surface_offset_y) {
551 uint32_t offset_x = surface_offset_x;
552 uint32_t offset_y = surface_offset_y;
553 pitch_in_bytes = surface_pitch *
sizeof(
dtype);
554 address = base + offset_y * pitch_in_bytes + offset_x *
sizeof(
dtype);
558 this->address = rhs.address;
559 this->pitch_in_bytes = rhs.pitch_in_bytes;
564 this->address = rhs.address;
565 this->pitch_in_bytes = rhs.pitch_in_bytes;
569 template <tdesc_update_dir update_dir = tdesc_update_dir::x_dir>
572 address += offset *
sizeof(
dtype);
574 address += offset * pitch_in_bytes;
588template <
typename dtype_,
typename tile_desc_,
mem_layout mem_layout_,
589 uint32_t alignment_,
gpu_arch arch_tag_>
593 std::enable_if_t<(arch_tag_ == gpu_arch::Xe)>> {
601 static constexpr uint32_t alignment_in_bytes
602 = mem_desc_t::alignment_in_bytes;
606 static constexpr uint32_t tile_size_x = tile_desc::tile_size_x;
607 static constexpr uint32_t tile_size_y = tile_desc::tile_size_y;
608 static constexpr uint32_t block_size_x = tile_desc::block_size_x;
609 static constexpr uint32_t block_size_y = tile_desc::block_size_y;
615 static constexpr bool mem_transpose
618 static constexpr reg_layout register_layout = tile_desc::register_layout;
619 static constexpr bool reg_transpose
621 static constexpr bool trans = mem_transpose ^ reg_transpose;
623 static constexpr bool mem_transform = (
sizeof(
dtype) < 4)
627 static constexpr uint32_t tile_bytes
628 = tile_size_x * tile_size_y *
sizeof(
dtype);
629 static constexpr uint32_t block_bytes
630 = block_size_x * block_size_y *
sizeof(
dtype);
633 (alignment_in_bytes %
sizeof(uint64_t) == 0), uint64_t,
634 typename std::conditional<(alignment_in_bytes %
sizeof(uint32_t)
636 uint32_t,
dtype>::type>::type;
640 static constexpr uint32_t min_store_bytes = 16 *
sizeof(
dtype);
641 static constexpr uint32_t max_store_bytes = 32 *
sizeof(
dtype);
642 static constexpr uint32_t num_channel
643 = ((tile_bytes % max_store_bytes) == 0
644 && (block_bytes % max_store_bytes) == 0)
648 static constexpr uint32_t num_channel_x
650 static constexpr uint32_t num_channel_y = num_channel / num_channel_x;
666 pitch_in_bytes = mem_tdesc.shape.stride *
sizeof(
dtype);
667 base_x = mem_tdesc.coord.x;
668 base_y = mem_tdesc.coord.y;
669 width_in_elems = mem_tdesc.shape.x;
670 height_in_elems = mem_tdesc.shape.y;
671 base_offset = trans ? base_x * pitch_in_bytes + base_y *
sizeof(
dtype)
672 : base_y * pitch_in_bytes + base_x *
sizeof(
dtype);
673 base_ptr = (
mem_dtype *)mem_tdesc.base.base;
676 = xetla_vector_gen<uint32_t, num_channel>(0, 1);
677 step_x = channel_index % num_channel_x;
678 step_y = channel_index / num_channel_x;
679 channel_offset = trans
680 ? step_y *
sizeof(
mem_dtype) + step_x * pitch_in_bytes
681 : step_x *
sizeof(
mem_dtype) + step_y * pitch_in_bytes;
685 int surface_pitch,
int surface_offset_x,
int surface_offset_y) {
686 pitch_in_bytes = surface_pitch *
sizeof(
dtype);
687 base_x = surface_offset_x;
688 base_y = surface_offset_y;
689 width_in_elems = surface_width;
690 height_in_elems = surface_height;
691 base_offset = trans ? base_x * pitch_in_bytes + base_y *
sizeof(
dtype)
692 : base_y * pitch_in_bytes + base_x *
sizeof(
dtype);
696 = xetla_vector_gen<uint32_t, num_channel>(0, 1);
697 step_x = channel_index % num_channel_x;
698 step_y = channel_index / num_channel_x;
699 channel_offset = trans
700 ? step_y *
sizeof(
mem_dtype) + step_x * pitch_in_bytes
701 : step_x *
sizeof(
mem_dtype) + step_y * pitch_in_bytes;
705 pitch_in_bytes = mem_tdesc.shape.stride *
sizeof(
dtype);
706 base_x = mem_tdesc.coord.x;
707 base_y = mem_tdesc.coord.y;
708 width_in_elems = mem_tdesc.shape.x;
709 height_in_elems = mem_tdesc.shape.y;
710 base_offset = trans ? base_x * pitch_in_bytes + base_y *
sizeof(
dtype)
711 : base_y * pitch_in_bytes + base_x *
sizeof(
dtype);
712 base_ptr = (
mem_dtype *)mem_tdesc.base.base;
715 = xetla_vector_gen<uint32_t, num_channel>(0, 1);
716 step_x = channel_index % num_channel_x;
717 step_y = channel_index / num_channel_x;
718 channel_offset = trans
719 ? step_y *
sizeof(
mem_dtype) + step_x * pitch_in_bytes
720 : step_x *
sizeof(
mem_dtype) + step_y * pitch_in_bytes;
724 int surface_pitch,
int surface_offset_x,
int surface_offset_y) {
725 pitch_in_bytes = surface_pitch *
sizeof(
dtype);
726 base_x = surface_offset_x;
727 base_y = surface_offset_y;
728 width_in_elems = surface_width;
729 height_in_elems = surface_height;
730 base_offset = trans ? base_x * pitch_in_bytes + base_y *
sizeof(
dtype)
731 : base_y * pitch_in_bytes + base_x *
sizeof(
dtype);
735 = xetla_vector_gen<uint32_t, num_channel>(0, 1);
736 step_x = channel_index % num_channel_x;
737 step_y = channel_index / num_channel_x;
738 channel_offset = trans
739 ? step_y *
sizeof(
mem_dtype) + step_x * pitch_in_bytes
740 : step_x *
sizeof(
mem_dtype) + step_y * pitch_in_bytes;
744 this->base_offset = rhs.base_offset;
745 this->base_ptr = rhs.base_ptr;
746 this->pitch_in_bytes = rhs.pitch_in_bytes;
747 this->base_x = rhs.base_x;
748 this->base_y = rhs.base_y;
749 this->width_in_elems = rhs.width_in_elems;
750 this->height_in_elems = rhs.height_in_elems;
752 this->step_x = rhs.step_x;
753 this->step_y = rhs.step_y;
755 this->channel_offset = rhs.channel_offset;
760 this->base_offset = rhs.base_offset;
761 this->base_ptr = rhs.base_ptr;
762 this->pitch_in_bytes = rhs.pitch_in_bytes;
763 this->base_x = rhs.base_x;
764 this->base_y = rhs.base_y;
765 this->width_in_elems = rhs.width_in_elems;
766 this->height_in_elems = rhs.height_in_elems;
768 this->step_x = rhs.step_x;
769 this->step_y = rhs.step_y;
770 this->channel_offset = rhs.channel_offset;
775 template <tdesc_update_dir update_dir = tdesc_update_dir::x_dir>
778 base_offset += int64_t(offset) *
sizeof(
dtype);
779 trans ? base_y += offset : base_x += offset;
781 base_offset += int64_t(offset) * pitch_in_bytes;
782 trans ? base_x += offset : base_y += offset;
792template <
typename dtype_,
typename tile_desc_,
gpu_arch arch_tag_,
797 std::enable_if_t<(arch_tag_ == gpu_arch::Xe)>> {
805 static constexpr uint32_t alignment_in_bytes
806 = mem_desc_t::alignment_in_bytes;
810 static constexpr uint32_t tile_size_x = tile_desc::tile_size_x;
811 static constexpr uint32_t tile_size_y = tile_desc::tile_size_y;
812 static constexpr uint32_t block_size_x = tile_desc::block_size_x;
813 static constexpr uint32_t block_size_y = tile_desc::block_size_y;
818 static constexpr reg_layout register_layout = tile_desc::register_layout;
819 static constexpr bool mem_transform
822 static constexpr uint32_t tile_bytes
823 = tile_size_x * tile_size_y *
sizeof(
dtype);
824 static constexpr uint32_t block_bytes
825 = block_size_x * block_size_y *
sizeof(
dtype);
827 (block_bytes % (16 *
sizeof(uint64_t)) == 0), uint64_t,
828 typename std::conditional<(block_bytes % (16 *
sizeof(uint32_t))
830 uint32_t,
dtype>::type>::type;
832 static constexpr uint32_t min_bytes = 16 *
sizeof(
mem_dtype);
833 static constexpr uint32_t max_bytes = 32 *
sizeof(
mem_dtype);
835 static constexpr uint32_t num_channel
836 = ((tile_bytes % max_bytes) == 0 && (block_bytes % max_bytes) == 0)
839 static constexpr uint32_t num_channel_x
841 static constexpr uint32_t num_channel_y = num_channel / num_channel_x;
849 pitch_in_bytes = base_tdesc[4];
850 wg_width_in_bytes = base_tdesc[2];
851 wg_height_in_elems = base_tdesc[3];
852 uint32_t offset_x = base_tdesc[5];
853 uint32_t offset_y = base_tdesc[6];
854 uint32_t start_address = base_tdesc[0];
855 start_address += offset_y * pitch_in_bytes + offset_x *
sizeof(
dtype);
857 = xetla_vector_gen<uint32_t, num_channel>(0, 1);
858 address = start_address
859 + (channel_index % num_channel_x) *
sizeof(
mem_dtype)
860 + (channel_index / num_channel_x) * pitch_in_bytes;
864 int surface_pitch,
int surface_offset_x,
int surface_offset_y) {
865 pitch_in_bytes = surface_pitch *
sizeof(
dtype);
866 wg_width_in_bytes = surface_width *
sizeof(
dtype);
867 wg_height_in_elems = surface_height;
868 uint32_t offset_x = surface_offset_x;
869 uint32_t offset_y = surface_offset_y;
870 uint32_t start_address = base;
871 start_address += offset_y * pitch_in_bytes + offset_x *
sizeof(
dtype);
873 = xetla_vector_gen<uint32_t, num_channel>(0, 1);
874 address = start_address
875 + (channel_index % num_channel_x) *
sizeof(
mem_dtype)
876 + (channel_index / num_channel_x) * pitch_in_bytes;
880 int surface_pitch,
int surface_offset_x,
int surface_offset_y) {
881 pitch_in_bytes = surface_pitch *
sizeof(
dtype);
882 wg_width_in_bytes = surface_width *
sizeof(
dtype);
883 wg_height_in_elems = surface_height;
884 uint32_t offset_x = surface_offset_x;
885 uint32_t offset_y = surface_offset_y;
886 uint32_t start_address = base;
887 start_address += offset_y * pitch_in_bytes + offset_x *
sizeof(
dtype);
889 = xetla_vector_gen<uint32_t, num_channel>(0, 1);
890 address = start_address
891 + (channel_index % num_channel_x) *
sizeof(
mem_dtype)
892 + (channel_index / num_channel_x) * pitch_in_bytes;
897 pitch_in_bytes = base_tdesc[4];
898 wg_width_in_bytes = base_tdesc[2];
899 wg_height_in_elems = base_tdesc[3];
900 uint32_t offset_x = base_tdesc[5];
901 uint32_t offset_y = base_tdesc[6];
902 uint32_t start_address = base_tdesc[0];
903 start_address += offset_y * pitch_in_bytes + offset_x *
sizeof(
dtype);
905 = xetla_vector_gen<uint32_t, num_channel>(0, 1);
906 address = start_address
907 + (channel_index % num_channel_x) *
sizeof(
mem_dtype)
908 + (channel_index / num_channel_x) * pitch_in_bytes;
912 this->address = rhs.address;
913 this->pitch_in_bytes = rhs.pitch_in_bytes;
914 this->wg_width_in_bytes = rhs.wg_width_in_bytes;
915 this->wg_height_in_elems = rhs.wg_height_in_elems;
920 this->address = rhs.address;
921 this->pitch_in_bytes = rhs.pitch_in_bytes;
922 this->wg_width_in_bytes = rhs.wg_width_in_bytes;
923 this->wg_height_in_elems = rhs.wg_height_in_elems;
927 template <tdesc_update_dir update_dir = tdesc_update_dir::x_dir>
930 address += offset *
sizeof(
dtype);
932 address += offset * pitch_in_bytes;
943template <
typename dtype_, uint32_t tile_size_x_, uint32_t tile_size_y_,
944 uint32_t block_size_x_, uint32_t block_size_y_,
gpu_arch arch_tag_,
948 tile_desc_t<tile_size_x_, tile_size_y_, block_size_x_, block_size_y_,
949 reg_layout::vnni_tiled_col_major>,
951 std::enable_if_t<(arch_tag_ == gpu_arch::Xe)>> {
960 static constexpr uint32_t alignment_in_bytes
961 = mem_desc_t::alignment_in_bytes;
965 static constexpr uint32_t tile_size_x = tile_desc::tile_size_x;
966 static constexpr uint32_t tile_size_y = tile_desc::tile_size_y;
967 static constexpr uint32_t block_size_x = tile_desc::block_size_x;
968 static constexpr uint32_t block_size_y = tile_desc::block_size_y;
973 static constexpr uint32_t tile_bytes
974 = tile_size_x * tile_size_y *
sizeof(
dtype);
975 static constexpr uint32_t block_bytes
976 = block_size_x * block_size_y *
sizeof(
dtype);
978 static constexpr uint32_t vnni_scale_factor
980 static constexpr uint32_t is_simd16_vec
981 = (block_size_x == 16) && ((tile_size_y & (tile_size_y - 1)) == 0);
982 static constexpr uint32_t num_vector_size = is_simd16_vec
983 ?
detail::gcd<tile_size_y / vnni_scale_factor, 8>::value
986 static constexpr uint32_t min_store_bytes = 16 *
sizeof(
store_dtype);
987 static constexpr uint32_t max_store_bytes = 32 *
sizeof(
store_dtype);
988 static constexpr uint32_t num_channel = is_simd16_vec
990 : (((tile_bytes % max_store_bytes) == 0
991 && (block_bytes % max_store_bytes) == 0)
994 static constexpr uint32_t num_channel_x = block_size_x;
995 static constexpr uint32_t num_channel_y
996 = is_simd16_vec ? 1 : num_channel / num_channel_x;
997 static constexpr uint32_t store_elems = num_channel_y * num_vector_size
998 * vnni_scale_factor * block_size_x;
1011 pitch_in_bytes = base_tdesc[4];
1012 wg_width_in_bytes = base_tdesc[2];
1013 wg_height_in_elems = base_tdesc[3];
1014 uint32_t offset_x = base_tdesc[5];
1015 uint32_t offset_y = base_tdesc[6];
1016 uint32_t start_address = base_tdesc[0];
1017 start_address += offset_y * pitch_in_bytes + offset_x *
sizeof(
dtype);
1019 = xetla_vector_gen<uint32_t, num_channel>(0, 1);
1020 address = start_address
1021 + (channel_index % num_channel_x) * pitch_in_bytes
1022 + (channel_index / num_channel_x) *
sizeof(
store_dtype);
1026 int surface_pitch,
int surface_offset_x,
int surface_offset_y) {
1027 pitch_in_bytes = surface_pitch *
sizeof(
dtype);
1028 wg_width_in_bytes = surface_width *
sizeof(
dtype);
1029 wg_height_in_elems = surface_height;
1030 uint32_t offset_x = surface_offset_x;
1031 uint32_t offset_y = surface_offset_y;
1032 uint32_t start_address = base;
1033 start_address += offset_y * pitch_in_bytes + offset_x *
sizeof(
dtype);
1035 = xetla_vector_gen<uint32_t, num_channel>(0, 1);
1036 address = start_address
1037 + ((channel_index % num_channel_x) * pitch_in_bytes
1038 + (channel_index / num_channel_x)
1044 int surface_pitch,
int surface_offset_x,
int surface_offset_y) {
1045 pitch_in_bytes = surface_pitch *
sizeof(
dtype);
1046 wg_width_in_bytes = surface_width *
sizeof(
dtype);
1047 wg_height_in_elems = surface_height;
1048 uint32_t offset_x = surface_offset_x;
1049 uint32_t offset_y = surface_offset_y;
1050 uint32_t start_address = base;
1051 start_address += offset_y * pitch_in_bytes + offset_x *
sizeof(
dtype);
1053 = xetla_vector_gen<uint32_t, num_channel>(0, 1);
1054 address = start_address
1055 + ((channel_index % num_channel_x) * pitch_in_bytes
1056 + (channel_index / num_channel_x)
1064 pitch_in_bytes = base_tdesc[4];
1065 wg_width_in_bytes = base_tdesc[2];
1066 wg_height_in_elems = base_tdesc[3];
1067 uint32_t offset_x = base_tdesc[5];
1068 uint32_t offset_y = base_tdesc[6];
1069 uint32_t start_address = base_tdesc[0];
1070 start_address += offset_y * pitch_in_bytes + offset_x *
sizeof(
dtype);
1072 = xetla_vector_gen<uint32_t, num_channel>(0, 1);
1073 address = start_address
1074 + (channel_index % num_channel_x) * pitch_in_bytes
1075 + (channel_index / num_channel_x) *
sizeof(
store_dtype);
1079 this->address = rhs.address;
1080 this->pitch_in_bytes = rhs.pitch_in_bytes;
1081 this->cyclic_count = 0;
1082 this->wg_width_in_bytes = rhs.wg_width_in_bytes;
1083 this->wg_height_in_elems = rhs.wg_height_in_elems;
1088 this->address = rhs.address;
1089 this->pitch_in_bytes = rhs.pitch_in_bytes;
1090 this->cyclic_count = 0;
1091 this->wg_width_in_bytes = rhs.wg_width_in_bytes;
1092 this->wg_height_in_elems = rhs.wg_height_in_elems;
1096 template <tdesc_update_dir update_dir = tdesc_update_dir::x_dir>
1099 address += offset *
sizeof(
dtype);
1101 address += offset * pitch_in_bytes;
1112template <
typename dtype_, uint32_t tile_size_x_, uint32_t tile_size_y_,
1113 uint32_t block_size_x_, uint32_t block_size_y_,
mem_layout mem_layout_,
1114 uint32_t alignment_, uint32_t num_coop_sg_,
reg_layout reg_layout_,
1118 tile_desc_t<tile_size_x_, tile_size_y_, block_size_x_, block_size_y_,
1120 num_coop_sg_, arch_tag_,
1121 std::enable_if_t<(arch_tag_ == gpu_arch::Xe)>> {
1126 block_size_y_, reg_layout_>;
1132 static constexpr uint32_t tile_size_x = tile_desc::tile_size_x;
1133 static constexpr uint32_t tile_size_y = tile_desc::tile_size_y;
1135 static constexpr uint32_t mem_tile_size_w
1136 = is_col_major ? tile_size_y : tile_size_x;
1137 static constexpr uint32_t mem_tile_size_h
1138 = is_col_major ? tile_size_x : tile_size_y;
1140 arch_tag>::template load_store_attr<msg_type::block_2d>;
1141 static constexpr uint32_t special_prefetch_width
1142 = load_store_attr::special_prefetch_width_in_bytes /
sizeof(
dtype);
1143 static constexpr uint32_t normal_prefetch_width
1144 = load_store_attr::max_load_width_in_bytes /
sizeof(
dtype);
1145 static constexpr bool is_special_prefetch
1146 = (mem_tile_size_w % special_prefetch_width) == 0;
1148 static constexpr uint32_t block_size_w = is_special_prefetch
1149 ? special_prefetch_width
1150 : (normal_prefetch_width > mem_tile_size_w ? mem_tile_size_w
1151 : normal_prefetch_width);
1152 static constexpr uint32_t block_size_h
1153 = load_store_attr::max_load_height_in_elem;
1155 static constexpr uint32_t max_num_block_w
1156 = (mem_tile_size_w + block_size_w - 1) / block_size_w;
1157 static constexpr uint32_t num_coop_sg = num_coop_sg_;
1158 static constexpr uint32_t num_coop_sg_w
1160 static constexpr uint32_t num_coop_sg_h = num_coop_sg / num_coop_sg_w;
1162 static constexpr uint32_t num_block_w = max_num_block_w / num_coop_sg_w;
1163 static constexpr uint32_t tile_size_w = block_size_w * num_block_w;
1164 static constexpr uint32_t tile_size_h
1165 = (mem_tile_size_h + num_coop_sg_h - 1) / num_coop_sg_h;
1166 static constexpr uint32_t num_block_h
1167 = (tile_size_h + block_size_h - 1) / block_size_h;
1172 static constexpr uint32_t num_tdesc = num_block_w * num_block_h;
1176 this->tdesc_prefetch = rhs.tdesc_prefetch;
1182 this->tdesc_prefetch = rhs.tdesc_prefetch;
1188 uint32_t coop_id_x = coop_id % num_coop_sg_w;
1189 uint32_t coop_id_y = coop_id / num_coop_sg_w;
1191 base_tdesc.xetla_format<uint32_t>(), coop_id_x * tile_size_w);
1193 base_tdesc.xetla_format<uint32_t>(), coop_id_y * tile_size_h);
1194 prepare_tdesc(base_tdesc);
1198 int surface_pitch,
int surface_offset_x,
int surface_offset_y,
1199 uint32_t coop_id = 0) {
1200 uint32_t coop_id_x = coop_id % num_coop_sg_w;
1201 uint32_t coop_id_y = coop_id / num_coop_sg_w;
1204 surface_height, surface_pitch,
1205 surface_offset_x + coop_id_x * tile_size_w,
1206 surface_offset_y + coop_id_y * tile_size_h);
1207 prepare_tdesc(base_tdesc);
1213 template <tdesc_update_dir update_dir = tdesc_update_dir::x_dir>
1215 auto tdesc_2d = tdesc_prefetch.xetla_format<uint32_t, num_tdesc, 16>();
1218 for (uint32_t i = 0; i < num_tdesc; i++) {
1223 for (uint32_t i = 0; i < num_tdesc; i++) {
1231 auto tdesc_2d = tdesc_prefetch.xetla_format<uint32_t, num_tdesc, 16>();
1232 uint32_t base_offset_y = 0;
1234 for (uint32_t i = 0; i < tile_size_h / block_size_h; i++) {
1235 auto tdesc_row_2d = tdesc_2d.xetla_select<num_block_w, 1, 16, 1>(
1236 i * num_block_w, 0);
1237 prepare_tile_desc_core<num_block_w, block_size_w, block_size_h>(
1238 tdesc_row_2d, base_tdesc, base_offset_y);
1239 base_offset_y += block_size_h;
1241 if constexpr ((tile_size_h % block_size_h) != 0) {
1242 constexpr int i = tile_size_h / block_size_h;
1243 auto tdesc_row_2d = tdesc_2d.xetla_select<num_block_w, 1, 16, 1>(
1244 i * num_block_w, 0);
1245 constexpr uint32_t remain_size_y = tile_size_h % block_size_h;
1246 prepare_tile_desc_core<num_block_w, block_size_w, remain_size_y>(
1247 tdesc_row_2d, base_tdesc, base_offset_y);
1251 template <
int32_t num_tdesc, u
int32_t size_x, u
int32_t size_y>
1253 xetla_matrix_ref<uint32_t, num_tdesc, 16>
__REF__ tdesc_2d,
1255 uint32_t base_offset_x = 0;
1257 for (
int j = 0; j < num_tdesc; j++) {
1258 tdesc_2d.row(j) = base_tdesc;
1260 constexpr uint32_t block_widthx_widthy_arrlen
1261 = (size_x - 1) | ((size_y - 1) << 8);
1263 tdesc_2d.row(j), block_widthx_widthy_arrlen);
1267 base_offset_x += size_x;
1278template <
typename dtype_, uint32_t tile_size_x_, uint32_t block_size_x_,
1279 mem_layout mem_layout_, uint32_t alignment_, uint32_t num_coop_sg_,
1283 tile_desc_t<tile_size_x_, 1, block_size_x_, 1, reg_layout_>,
1284 num_coop_sg_, arch_tag_,
1285 std::enable_if_t<(arch_tag_ == gpu_arch::Xe)>> {
1299 static constexpr uint32_t cacheline_elems = 64 /
sizeof(
dtype);
1300 static constexpr uint32_t mem_block_nums
1301 = (tile_desc::tile_size_x + cacheline_elems - 1) / cacheline_elems;
1302 static constexpr uint32_t num_coop_sg = num_coop_sg_;
1308 static constexpr uint32_t mem_tile_size_x = mem_block_nums > num_coop_sg
1309 ? (mem_block_nums + num_coop_sg - 1) / num_coop_sg *cacheline_elems
1319 static constexpr uint32_t scale_factor
1326 this->base_offset = rhs.base_offset;
1327 this->base_ptr = rhs.base_ptr;
1328 this->pitch_in_bytes = rhs.pitch_in_bytes;
1334 this->base_offset = rhs.base_offset;
1335 this->base_ptr = rhs.base_ptr;
1336 this->pitch_in_bytes = rhs.pitch_in_bytes;
1341 pitch_in_bytes = mem_desc.shape.stride *
sizeof(
dtype);
1342 uint32_t offset_x = mem_desc.coord.x;
1343 uint32_t offset_y = mem_desc.coord.y;
1344 base_offset = offset_y * pitch_in_bytes + offset_x *
sizeof(
dtype);
1345 uint64_t ptr_temp = (uint64_t)mem_desc.base.base;
1347 + (coop_id % num_coop_sg) * mem_tile_size_x;
1351 [[maybe_unused]]
int surface_height,
int surface_pitch,
1352 int surface_offset_x,
int surface_offset_y, uint32_t coop_id = 0) {
1353 pitch_in_bytes = surface_pitch *
sizeof(
dtype);
1354 uint32_t offset_x = surface_offset_x;
1355 uint32_t offset_y = surface_offset_y;
1356 base_offset = offset_y * pitch_in_bytes + offset_x *
sizeof(
dtype);
1358 + (coop_id % num_coop_sg) * mem_tile_size_x;
1361 template <tdesc_update_dir update_dir = tdesc_update_dir::x_dir>
1364 base_offset += offset *
sizeof(
dtype);
1366 base_offset += offset * pitch_in_bytes;
1377template <
typename dtype_,
typename tile_desc_,
mem_layout mem_layout_,
1378 uint32_t alignment_, uint32_t num_coop_sg_,
gpu_arch arch_tag_>
1381 tile_desc_, num_coop_sg_, arch_tag_,
1382 std::enable_if_t<(arch_tag_ == gpu_arch::Xe)>> {
1392 [[maybe_unused]] uint32_t coop_id = 0) {}
1395 [[maybe_unused]]
int surface_width,
1396 [[maybe_unused]]
int surface_height,
1397 [[maybe_unused]]
int surface_pitch,
1398 [[maybe_unused]]
int surface_offset_x,
1399 [[maybe_unused]]
int surface_offset_y,
1400 [[maybe_unused]] uint32_t coop_id = 0) {}
1402 template <tdesc_update_dir update_dir = tdesc_update_dir::x_dir>
#define __XETLA_API
Definition common.hpp:43
#define __REF__
Workaround for ESIMD reference usage.
Definition base_types.hpp:177
xetla_vector< uint32_t, 16 > xetla_tdescriptor
Description of nd tensor descriptor for load and store.
Definition base_types.hpp:155
__ESIMD_NS::simd< native_type_t< Ty >, N > xetla_vector
wrapper for xetla_vector.
Definition base_types.hpp:149
__XETLA_API void xetla_update_tdesc_offsetx(xetla_tdescriptor_ref tdesc, int32_t doffset_x)
Update the x coordinate in the given tensor descriptor.
Definition raw_send_load_store.hpp:152
__XETLA_API void xetla_update_tdesc_offsety(xetla_tdescriptor_ref tdesc, int32_t doffset_y)
Update the y coordinate in the given tensor descriptor.
Definition raw_send_load_store.hpp:161
__XETLA_API void xetla_fill_tdesc(xetla_tdescriptor_ref tdesc, Ty *p, int tensor_width, int tensor_height, int tensor_pitch, int offset_x, int offset_y)
Tensor descriptor construction (global memory version).
Definition raw_send_load_store.hpp:52
__XETLA_API void xetla_set_tensor_offset_x(xetla_tdescriptor_ref desc, int32_t offset_x)
Definition tensor_descriptor.hpp:63
__XETLA_API void xetla_set_block_widthx_widthy_arrlen(xetla_tdescriptor_ref desc, uint32_t block_widthx_widthy_arrlen)
Definition tensor_descriptor.hpp:79
__XETLA_API int32_t xetla_get_tensor_offset_x(xetla_tdescriptor desc)
Definition tensor_descriptor.hpp:67
Definition limitation.hpp:457
reg_layout
Tile layout in register. linear: linear layout with one tile; tiled: 2D blocks stacked in raster order; v...
Definition common.hpp:209
@ vnni_tiled_col_major
VNNI tiled format in which the elements of each block are stored in column-major order.
mem_space
Definition common.hpp:77
gpu_arch
Definition common.hpp:73
msg_type
Definition common.hpp:78
mem_layout
Definition common.hpp:76
Definition arch_config.hpp:72
uint32_t base_y
Definition payload_xe.hpp:372
xetla_vector< uint32_t, num_channel > step_y
Definition payload_xe.hpp:367
this_payload_t & operator=(const this_payload_t &rhs)
Definition payload_xe.hpp:450
dtype_ dtype
Definition payload_xe.hpp:327
__XETLA_API void init(mem_desc_t &mem_tdesc)
Definition payload_xe.hpp:422
mem_payload_t(dtype *p, int surface_width, int surface_height, int surface_pitch, int surface_offset_x, int surface_offset_y)
Definition payload_xe.hpp:390
uint32_t pitch_in_bytes
Definition payload_xe.hpp:368
uint32_t base_x
Definition payload_xe.hpp:371
tile_desc_ tile_desc
Definition payload_xe.hpp:328
xetla_vector< uint32_t, num_channel > channel_offset
Definition payload_xe.hpp:365
uint32_t height_in_elems
Definition payload_xe.hpp:370
xetla_vector< uint32_t, num_channel > step_x
Definition payload_xe.hpp:366
mem_desc_t< dtype_, mem_layout::row_major, mem_space::global, alignment_ > mem_desc_t
Definition payload_xe.hpp:326
__XETLA_API void update_tdesc(int offset)
Definition payload_xe.hpp:464
uint64_t base_pointer
Definition payload_xe.hpp:373
__XETLA_API void init(dtype *p, int surface_width, int surface_height, int surface_pitch, int surface_offset_x, int surface_offset_y)
Definition payload_xe.hpp:406
uint32_t width_in_elems
Definition payload_xe.hpp:369
mem_payload_t(const this_payload_t &rhs)
Definition payload_xe.hpp:437
mem_payload_t(mem_desc_t &mem_tdesc)
Definition payload_xe.hpp:375
uint32_t pitch_in_bytes
Definition payload_xe.hpp:663
mem_payload_t(dtype *p, int surface_width, int surface_height, int surface_pitch, int surface_offset_x, int surface_offset_y)
Definition payload_xe.hpp:684
uint32_t base_y
Definition payload_xe.hpp:658
uint32_t base_x
Definition payload_xe.hpp:657
xetla_vector< uint32_t, num_channel > step_y
Definition payload_xe.hpp:654
uint32_t width_in_elems
Definition payload_xe.hpp:659
mem_payload_t(const this_payload_t &rhs)
Definition payload_xe.hpp:743
this_payload_t & operator=(const this_payload_t &rhs)
Definition payload_xe.hpp:759
uint64_t base_offset
Definition payload_xe.hpp:656
__XETLA_API void update_tdesc(int offset)
Definition payload_xe.hpp:776
__XETLA_API void init(mem_desc_t &mem_tdesc)
Definition payload_xe.hpp:704
dtype_ dtype
Definition payload_xe.hpp:594
mem_payload_t(mem_desc_t &mem_tdesc)
Definition payload_xe.hpp:665
mem_dtype * base_ptr
Definition payload_xe.hpp:662
xetla_vector< uint32_t, num_channel > channel_offset
Definition payload_xe.hpp:652
uint32_t height_in_elems
Definition payload_xe.hpp:660
xetla_vector< uint32_t, num_channel > step_x
Definition payload_xe.hpp:653
typename std::conditional<(alignment_in_bytes % sizeof(uint64_t)==0), uint64_t, typename std::conditional<(alignment_in_bytes % sizeof(uint32_t)==0), uint32_t, dtype >::type >::type mem_dtype
Definition payload_xe.hpp:636
mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ > mem_desc_t
Definition payload_xe.hpp:596
__XETLA_API void init(dtype *p, int surface_width, int surface_height, int surface_pitch, int surface_offset_x, int surface_offset_y)
Definition payload_xe.hpp:723
tile_desc_ tile_desc
Definition payload_xe.hpp:597
xetla_vector< uint32_t, 16 *num_block > payloads
Definition payload_xe.hpp:79
tile_desc_ tile_desc
Definition payload_xe.hpp:42
typename std::conditional< mem_dword_transpose, uint32_t, dtype >::type mem_dtype
Definition payload_xe.hpp:76
mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ > mem_desc_t
Definition payload_xe.hpp:44
mem_payload_t(dtype *p, uint32_t surface_width, uint32_t surface_height, uint32_t surface_pitch, int32_t surface_offset_x=0, int32_t surface_offset_y=0)
Definition payload_xe.hpp:95
__XETLA_API void update_tdesc(int offset)
Definition payload_xe.hpp:136
mem_payload_t(mem_desc_t &mem_desc)
Definition payload_xe.hpp:85
mem_payload_t(const this_payload_t &rhs)
Definition payload_xe.hpp:81
dtype_ dtype
Definition payload_xe.hpp:45
__XETLA_API void init(dtype *p, uint32_t surface_width, uint32_t surface_height, uint32_t surface_pitch, int32_t surface_offset_x=0, int32_t surface_offset_y=0)
Definition payload_xe.hpp:115
this_payload_t & operator=(const this_payload_t &rhs)
Definition payload_xe.hpp:130
__XETLA_API void init(mem_desc_t &mem_desc)
Definition payload_xe.hpp:105
__XETLA_API void init(mem_desc_t &mem_tdesc)
Definition payload_xe.hpp:271
__XETLA_API void init(dtype *p, int surface_width, int surface_height, int surface_pitch, int surface_offset_x, int surface_offset_y)
Definition payload_xe.hpp:279
mem_payload_t(dtype *p, int surface_width, int surface_height, int surface_pitch, int surface_offset_x, int surface_offset_y)
Definition payload_xe.hpp:261
tile_desc_ tile_desc
Definition payload_xe.hpp:221
dtype_ dtype
Definition payload_xe.hpp:220
typename std::conditional<(bytes_per_row % sizeof(uint64_t)==0) &&(alignment_in_bytes % sizeof(uint64_t)==0), uint64_t, typename std::conditional<(bytes_per_row % sizeof(uint32_t)==0), uint32_t, dtype >::type >::type mem_dtype
Definition payload_xe.hpp:246
mem_payload_t(const this_payload_t &rhs)
Definition payload_xe.hpp:289
mem_desc_t< dtype_, mem_layout::row_major, mem_space::global, alignment_ > mem_desc_t
Definition payload_xe.hpp:219
__XETLA_API void update_tdesc(int offset)
Definition payload_xe.hpp:304
mem_dtype * base_ptr
Definition payload_xe.hpp:250
this_payload_t & operator=(const this_payload_t &rhs)
Definition payload_xe.hpp:296
mem_payload_t(mem_desc_t &mem_tdesc)
Definition payload_xe.hpp:253
uint64_t base_offset
Definition payload_xe.hpp:249
uint32_t pitch_in_bytes
Definition payload_xe.hpp:251
gpu::xetla::subgroup::mem_payload_t< mem_desc_t< dtype_, mem_layout::row_major, mem_space::local, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout::vnni_tiled_col_major >, msg_type::scatter, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::store_dtype uint32_t store_dtype
Definition payload_xe.hpp:977
gpu::xetla::subgroup::mem_payload_t< mem_desc_t< dtype_, mem_layout::row_major, mem_space::local, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout::vnni_tiled_col_major >, msg_type::scatter, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::init __XETLA_API void init(mem_desc_t mem_tdesc)
Definition payload_xe.hpp:1061
gpu::xetla::subgroup::mem_payload_t< mem_desc_t< dtype_, mem_layout::row_major, mem_space::local, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout::vnni_tiled_col_major >, msg_type::scatter, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::init __XETLA_API void init(uint32_t base, int surface_width, int surface_height, int surface_pitch, int surface_offset_x, int surface_offset_y)
Definition payload_xe.hpp:1043
gpu::xetla::subgroup::mem_payload_t< mem_desc_t< dtype_, mem_layout::row_major, mem_space::local, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout::vnni_tiled_col_major >, msg_type::scatter, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::pitch_in_bytes uint32_t pitch_in_bytes
Definition payload_xe.hpp:1000
gpu::xetla::subgroup::mem_payload_t< mem_desc_t< dtype_, mem_layout::row_major, mem_space::local, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout::vnni_tiled_col_major >, msg_type::scatter, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::mem_payload_t mem_payload_t(uint32_t base, int surface_width, int surface_height, int surface_pitch, int surface_offset_x, int surface_offset_y)
Definition payload_xe.hpp:1025
gpu::xetla::subgroup::mem_payload_t< mem_desc_t< dtype_, mem_layout::row_major, mem_space::local, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout::vnni_tiled_col_major >, msg_type::scatter, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::wg_width_in_bytes uint32_t wg_width_in_bytes
Definition payload_xe.hpp:1002
gpu::xetla::subgroup::mem_payload_t< mem_desc_t< dtype_, mem_layout::row_major, mem_space::local, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout::vnni_tiled_col_major >, msg_type::scatter, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::operator= this_payload_t & operator=(const this_payload_t &rhs)
Definition payload_xe.hpp:1087
gpu::xetla::subgroup::mem_payload_t< mem_desc_t< dtype_, mem_layout::row_major, mem_space::local, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout::vnni_tiled_col_major >, msg_type::scatter, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::address xetla_vector< uint32_t, num_channel > address
Definition payload_xe.hpp:999
gpu::xetla::subgroup::mem_payload_t< mem_desc_t< dtype_, mem_layout::row_major, mem_space::local, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout::vnni_tiled_col_major >, msg_type::scatter, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::update_tdesc __XETLA_API void update_tdesc(int offset)
Definition payload_xe.hpp:1097
gpu::xetla::subgroup::mem_payload_t< mem_desc_t< dtype_, mem_layout::row_major, mem_space::local, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout::vnni_tiled_col_major >, msg_type::scatter, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::mem_payload_t mem_payload_t(const this_payload_t &rhs)
Definition payload_xe.hpp:1078
gpu::xetla::subgroup::mem_payload_t< mem_desc_t< dtype_, mem_layout::row_major, mem_space::local, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout::vnni_tiled_col_major >, msg_type::scatter, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::cyclic_count uint32_t cyclic_count
Definition payload_xe.hpp:1001
gpu::xetla::subgroup::mem_payload_t< mem_desc_t< dtype_, mem_layout::row_major, mem_space::local, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout::vnni_tiled_col_major >, msg_type::scatter, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::mem_desc_t mem_desc_t< dtype_, mem_layout::row_major, mem_space::local, alignment_ > mem_desc_t
Definition payload_xe.hpp:953
gpu::xetla::subgroup::mem_payload_t< mem_desc_t< dtype_, mem_layout::row_major, mem_space::local, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout::vnni_tiled_col_major >, msg_type::scatter, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::dtype dtype_ dtype
Definition payload_xe.hpp:954
gpu::xetla::subgroup::mem_payload_t< mem_desc_t< dtype_, mem_layout::row_major, mem_space::local, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout::vnni_tiled_col_major >, msg_type::scatter, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::wg_height_in_elems uint32_t wg_height_in_elems
Definition payload_xe.hpp:1003
gpu::xetla::subgroup::mem_payload_t< mem_desc_t< dtype_, mem_layout::row_major, mem_space::local, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout::vnni_tiled_col_major >, msg_type::scatter, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::mem_payload_t mem_payload_t(mem_desc_t mem_tdesc)
Definition payload_xe.hpp:1008
gpu::xetla::subgroup::mem_payload_t< mem_desc_t< dtype_, mem_layout::row_major, mem_space::local, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout::vnni_tiled_col_major >, msg_type::scatter, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::mem_payload_t mem_payload_t()=default
mem_payload_t(uint32_t base, int surface_width, int surface_height, int surface_pitch, int surface_offset_x, int surface_offset_y)
Definition payload_xe.hpp:863
typename std::conditional<(block_bytes %(16 *sizeof(uint64_t))==0), uint64_t, typename std::conditional<(block_bytes %(16 *sizeof(uint32_t))==0), uint32_t, dtype >::type >::type mem_dtype
Definition payload_xe.hpp:830
__XETLA_API void init(mem_desc_t &mem_tdesc)
Definition payload_xe.hpp:895
__XETLA_API void init(uint32_t base, int surface_width, int surface_height, int surface_pitch, int surface_offset_x, int surface_offset_y)
Definition payload_xe.hpp:879
uint32_t pitch_in_bytes
Definition payload_xe.hpp:843
mem_payload_t(mem_desc_t &mem_tdesc)
Definition payload_xe.hpp:847
mem_payload_t(const this_payload_t &rhs)
Definition payload_xe.hpp:911
__XETLA_API void update_tdesc(int offset)
Definition payload_xe.hpp:928
uint32_t wg_width_in_bytes
Definition payload_xe.hpp:844
this_payload_t & operator=(const this_payload_t &rhs)
Definition payload_xe.hpp:919
xetla_vector< uint32_t, num_channel > address
Definition payload_xe.hpp:842
mem_desc_t< dtype_, mem_layout::row_major, mem_space::local, alignment_ > mem_desc_t
Definition payload_xe.hpp:799
tile_desc_ tile_desc
Definition payload_xe.hpp:801
dtype_ dtype
Definition payload_xe.hpp:800
uint32_t wg_height_in_elems
Definition payload_xe.hpp:845
mem_desc_t< dtype_, mem_layout::row_major, mem_space::local, alignment_ > mem_desc_t
Definition payload_xe.hpp:487
__XETLA_API void update_tdesc(int offset)
Definition payload_xe.hpp:570
dtype_ dtype
Definition payload_xe.hpp:488
mem_payload_t(mem_desc_t &mem_tdesc)
Definition payload_xe.hpp:524
uint32_t pitch_in_bytes
Definition payload_xe.hpp:522
typename std::conditional<(bytes_per_row % sizeof(uint64_t)==0) &&(alignment_in_bytes % sizeof(uint64_t)==0), uint64_t, typename std::conditional<(bytes_per_row % sizeof(uint32_t)==0), uint32_t, dtype >::type >::type mem_dtype
Definition payload_xe.hpp:518
this_payload_t & operator=(const this_payload_t &rhs)
Definition payload_xe.hpp:563
__XETLA_API void init(mem_desc_t &mem_tdesc)
Definition payload_xe.hpp:540
__XETLA_API void init(uint32_t base, int surface_width, int surface_height, int surface_pitch, int surface_offset_x, int surface_offset_y)
Definition payload_xe.hpp:548
uint32_t address
Definition payload_xe.hpp:521
mem_payload_t(uint32_t base, int surface_width, int surface_height, int surface_pitch, int surface_offset_x, int surface_offset_y)
Definition payload_xe.hpp:531
mem_payload_t(const this_payload_t &rhs)
Definition payload_xe.hpp:557
tile_desc_ tile_desc
Definition payload_xe.hpp:489
Describes the memory information.
Definition api.hpp:44
prefetch_payload_t(dtype *p, int surface_width, int surface_height, int surface_pitch, int surface_offset_x, int surface_offset_y, uint32_t coop_id=0)
Definition payload_xe.hpp:1394
tile_desc_ tile_desc
Definition payload_xe.hpp:1386
dtype_ dtype
Definition payload_xe.hpp:1383
mem_desc_t< dtype_, mem_layout_, mem_space::local, alignment_ > mem_desc_t
Definition payload_xe.hpp:1385
prefetch_payload_t(mem_desc_t &mem_desc, uint32_t coop_id=0)
Definition payload_xe.hpp:1391
__XETLA_API void update_tdesc(int offset)
Definition payload_xe.hpp:1403
gpu::xetla::subgroup::prefetch_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout_ >, num_coop_sg_, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::update_tdesc __XETLA_API void update_tdesc(int offset)
Definition payload_xe.hpp:1214
gpu::xetla::subgroup::prefetch_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout_ >, num_coop_sg_, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::tdesc_prefetch xetla_vector< uint32_t, num_tdesc *16 > tdesc_prefetch
Definition payload_xe.hpp:1173
gpu::xetla::subgroup::prefetch_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout_ >, num_coop_sg_, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::prefetch_payload_t prefetch_payload_t()=default
gpu::xetla::subgroup::prefetch_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout_ >, num_coop_sg_, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::dtype dtype_ dtype
Definition payload_xe.hpp:1122
gpu::xetla::subgroup::prefetch_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout_ >, num_coop_sg_, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::prefetch_payload_t prefetch_payload_t(mem_desc_t &mem_desc, uint32_t coop_id=0)
Definition payload_xe.hpp:1186
gpu::xetla::subgroup::prefetch_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout_ >, num_coop_sg_, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::mem_desc_t mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ > mem_desc_t
Definition payload_xe.hpp:1124
gpu::xetla::subgroup::prefetch_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout_ >, num_coop_sg_, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::operator= this_payload_t & operator=(const this_payload_t &rhs)
Definition payload_xe.hpp:1181
gpu::xetla::subgroup::prefetch_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout_ >, num_coop_sg_, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::prefetch_payload_t prefetch_payload_t(const this_payload_t &rhs)
Definition payload_xe.hpp:1175
gpu::xetla::subgroup::prefetch_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_t< tile_size_x_, tile_size_y_, block_size_x_, block_size_y_, reg_layout_ >, num_coop_sg_, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::prefetch_payload_t prefetch_payload_t(dtype *p, int surface_width, int surface_height, int surface_pitch, int surface_offset_x, int surface_offset_y, uint32_t coop_id=0)
Definition payload_xe.hpp:1197
gpu::xetla::subgroup::prefetch_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_t< tile_size_x_, 1, block_size_x_, 1, reg_layout_ >, num_coop_sg_, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::pitch_in_bytes uint32_t pitch_in_bytes
Definition payload_xe.hpp:1323
gpu::xetla::subgroup::prefetch_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_t< tile_size_x_, 1, block_size_x_, 1, reg_layout_ >, num_coop_sg_, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::base_offset uint32_t base_offset
Definition payload_xe.hpp:1321
gpu::xetla::subgroup::prefetch_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_t< tile_size_x_, 1, block_size_x_, 1, reg_layout_ >, num_coop_sg_, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::mem_desc_t mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ > mem_desc_t
Definition payload_xe.hpp:1288
gpu::xetla::subgroup::prefetch_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_t< tile_size_x_, 1, block_size_x_, 1, reg_layout_ >, num_coop_sg_, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::dtype dtype_ dtype
Definition payload_xe.hpp:1286
gpu::xetla::subgroup::prefetch_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_t< tile_size_x_, 1, block_size_x_, 1, reg_layout_ >, num_coop_sg_, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::update_tdesc __XETLA_API void update_tdesc(int offset)
Definition payload_xe.hpp:1362
gpu::xetla::subgroup::prefetch_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_t< tile_size_x_, 1, block_size_x_, 1, reg_layout_ >, num_coop_sg_, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::prefetch_dtype uint64_t prefetch_dtype
Definition payload_xe.hpp:1290
gpu::xetla::subgroup::prefetch_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_t< tile_size_x_, 1, block_size_x_, 1, reg_layout_ >, num_coop_sg_, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::prefetch_payload_t prefetch_payload_t(const this_payload_t &rhs)
Definition payload_xe.hpp:1325
gpu::xetla::subgroup::prefetch_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_t< tile_size_x_, 1, block_size_x_, 1, reg_layout_ >, num_coop_sg_, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::prefetch_payload_t prefetch_payload_t(dtype *p, int surface_width, int surface_height, int surface_pitch, int surface_offset_x, int surface_offset_y, uint32_t coop_id=0)
Definition payload_xe.hpp:1350
gpu::xetla::subgroup::prefetch_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_t< tile_size_x_, 1, block_size_x_, 1, reg_layout_ >, num_coop_sg_, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::prefetch_payload_t prefetch_payload_t()=default
gpu::xetla::subgroup::prefetch_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_t< tile_size_x_, 1, block_size_x_, 1, reg_layout_ >, num_coop_sg_, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::operator= this_payload_t & operator=(const this_payload_t &rhs)
Definition payload_xe.hpp:1333
gpu::xetla::subgroup::prefetch_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_t< tile_size_x_, 1, block_size_x_, 1, reg_layout_ >, num_coop_sg_, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::prefetch_payload_t prefetch_payload_t(mem_desc_t &mem_desc, uint32_t coop_id=0)
Definition payload_xe.hpp:1340
gpu::xetla::subgroup::prefetch_payload_t< mem_desc_t< dtype_, mem_layout_, mem_space::global, alignment_ >, tile_desc_t< tile_size_x_, 1, block_size_x_, 1, reg_layout_ >, num_coop_sg_, arch_tag_, std::enable_if_t<(arch_tag_==gpu_arch::Xe)> >::base_ptr prefetch_dtype * base_ptr
Definition payload_xe.hpp:1322
Describes the memory information used to prefetch data into the cache.
Definition api.hpp:53
Describes the tile information of a sub-matrix.
Definition api.hpp:64