DPC++ Runtime
Runtime libraries for oneAPI DPC++
util.hpp
Go to the documentation of this file.
1 /***************************************************************************
2  *
3  * Copyright (C) Codeplay Software Ltd.
4  *
5  * Part of the LLVM Project, under the Apache License v2.0 with LLVM
6  * Exceptions. See https://llvm.org/LICENSE.txt for license information.
7  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  *
15  * SYCL compatibility extension
16  *
17  * util.hpp
18  *
19  * Description:
20  * util functionality for the SYCL compatibility extension
21  **************************************************************************/
22 
23 // The original source was under the license below:
24 //==---- util.hpp ---------------------------------*- C++ -*----------------==//
25 //
26 // Copyright (C) Intel Corporation
27 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
28 // See https://llvm.org/LICENSE.txt for license information.
29 //
30 //===----------------------------------------------------------------------===//
31 
32 #pragma once
33 
34 #include <cassert>
35 #include <type_traits>
36 
37 #include <sycl/atomic_ref.hpp>
38 #include <sycl/group_barrier.hpp>
39 
40 #include <syclcompat/math.hpp>
41 #include <syclcompat/memory.hpp>
42 
43 #if defined(__NVPTX__)
45 #endif
46 
47 // TODO: Remove these function definitions once they exist in the DPC++ compiler
#if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
// Forward declarations of the SPIR-V non-uniform shuffle builtins used by the
// masked sub-group utilities in syclcompat::experimental below.
template <typename T>
__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT
    __attribute__((noduplicate)) T
    __spirv_GroupNonUniformShuffle(__spv::Scope::Flag, T, unsigned) noexcept;

template <typename T>
__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT
    __attribute__((noduplicate)) T
    __spirv_GroupNonUniformShuffleDown(__spv::Scope::Flag, T,
                                       unsigned) noexcept;

template <typename T>
__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT
    __attribute__((noduplicate)) T
    __spirv_GroupNonUniformShuffleUp(__spv::Scope::Flag, T, unsigned) noexcept;
#endif
65 
66 namespace syclcompat {
67 
68 namespace detail {
69 
/// Strongly-typed wrapper around a value of type \p T. The \p tag parameter
/// makes each instantiation a distinct type while still converting
/// implicitly to and from the underlying value.
template <typename tag, typename T> class generic_error_type {
public:
  generic_error_type() = default;
  generic_error_type(T value) : value_{value} {}
  /// Implicit read access to the wrapped value.
  operator T() const { return value_; }

private:
  T value_;
};
79 
// Maps an element type to the type used for element-size computations in
// matrix_mem_copy. Primary template: the type maps to itself.
template <typename T> struct DataType {
  using T2 = T;
};
// Specialization for two-element sycl::vec.
// NOTE(review): the member alias of this specialization is missing from this
// extraction of the file — restore it from the original source.
template <typename T> struct DataType<sycl::vec<T, 2>> {
};
86 
87 inline void matrix_mem_copy(void *to_ptr, const void *from_ptr, int to_ld,
88  int from_ld, int rows, int cols, int elem_size,
90  bool async = false) {
91  if (to_ptr == from_ptr && to_ld == from_ld) {
92  return;
93  }
94 
95  if (to_ld == from_ld) {
96  size_t copy_size = elem_size * ((cols - 1) * (size_t)to_ld + rows);
97  if (async)
98  detail::memcpy(queue, (void *)to_ptr, (void *)from_ptr, copy_size);
99  else
100  detail::memcpy(queue, (void *)to_ptr, (void *)from_ptr, copy_size).wait();
101  } else {
102  if (async)
103  detail::memcpy(queue, to_ptr, from_ptr, elem_size * to_ld,
104  elem_size * from_ld, elem_size * rows, cols);
105  else
106  sycl::event::wait(detail::memcpy(queue, to_ptr, from_ptr,
107  elem_size * to_ld, elem_size * from_ld,
108  elem_size * rows, cols));
109  }
110 }
111 
122 template <typename T>
123 inline void matrix_mem_copy(T *to_ptr, const T *from_ptr, int to_ld,
124  int from_ld, int rows, int cols,
125  sycl::queue queue = get_default_queue(),
126  bool async = false) {
127  using Ty = typename DataType<T>::T2;
128  matrix_mem_copy((void *)to_ptr, (void *)from_ptr, to_ld, from_ld, rows, cols,
129  sizeof(Ty), queue, async);
130 }
131 } // namespace detail
132 
135 
140 inline int cast_double_to_int(double d, bool use_high32 = true) {
141  sycl::vec<double, 1> v0{d};
142  auto v1 = v0.as<sycl::int2>();
143  if (use_high32)
144  return v1[0];
145  return v1[1];
146 }
147 
152 inline double cast_ints_to_double(int high32, int low32) {
153  sycl::int2 v0{high32, low32};
154  auto v1 = v0.as<sycl::vec<double, 1>>();
155  return v1;
156 }
157 
/// Reverse the bit order of an unsigned integer.
/// \param [in] a Input unsigned integer value.
/// \returns Value of \p a with its bits in reverse order.
///
/// Fix: all intermediate arithmetic is now performed in T. The previous code
/// relied on `~mask >> count` and `mask << count` with a T-typed `mask`;
/// for types narrower than int (uint8_t, uint16_t), integer promotion turned
/// `~mask` into a negative int, producing a wrong swap mask and therefore a
/// wrong result (e.g. reverse_bits<uint8_t>(0x01) returned 0x08, not 0x80).
/// Behavior for unsigned int and wider types is unchanged.
template <typename T> inline T reverse_bits(T a) {
  static_assert(std::is_unsigned<T>::value && std::is_integral<T>::value,
                "unsigned integer required");
  if (!a)
    return 0;
  size_t count = 4 * sizeof(T);
  // All-ones in T, then keep only the low half: the initial swap mask.
  T mask = static_cast<T>(~static_cast<T>(0)) >> count;
  while (count) {
    // Swap the two halves selected by mask, halving the swap width each pass.
    a = static_cast<T>(((a & mask) << count) |
                       ((a & static_cast<T>(~mask)) >> count));
    count = count >> 1;
    mask = static_cast<T>(mask ^ (mask << count));
  }
  return a;
}
176 
/// Select four bytes from the 64-bit value formed by \p b (high 32 bits) and
/// \p a (low 32 bits), as directed by the selector \p s: the low 3 bits of
/// each nibble of \p s (nibbles 0..3) choose the source byte for result
/// bytes 0..3.
/// \param [in] a Low 32 bits of the source value.
/// \param [in] b High 32 bits of the source value.
/// \param [in] s Per-byte selector, one nibble per output byte.
/// \returns The permuted 32-bit value.
inline unsigned int byte_level_permute(unsigned int a, unsigned int b,
                                       unsigned int s) {
  const std::uint64_t combined = ((std::uint64_t)b << 32) | a;
  unsigned int result = 0;
  for (unsigned int i = 0; i < 4; ++i) {
    // Nibble i of s picks which of the 8 source bytes lands in byte i.
    const unsigned int sel = (s >> (i * 4)) & 0x7;
    result |= (unsigned int)((combined >> (sel * 8)) & 0xff) << (i * 8);
  }
  return result;
}
192 
198 template <typename T> inline int ffs(T a) {
199  static_assert(std::is_integral<T>::value, "integer required");
200  return (sycl::ctz(a) + 1) % (sizeof(T) * 8 + 1);
201 }
202 
217 template <typename T>
218 T select_from_sub_group(sycl::sub_group g, T x, int remote_local_id,
219  int logical_sub_group_size = 32) {
220  unsigned int start_index =
221  g.get_local_linear_id() / logical_sub_group_size * logical_sub_group_size;
223  g, x, start_index + remote_local_id % logical_sub_group_size);
224 }
225 
241 template <typename T>
242 T shift_sub_group_left(sycl::sub_group g, T x, unsigned int delta,
243  int logical_sub_group_size = 32) {
244  unsigned int id = g.get_local_linear_id();
245  unsigned int end_index =
246  (id / logical_sub_group_size + 1) * logical_sub_group_size;
247  T result = sycl::shift_group_left(g, x, delta);
248  if ((id + delta) >= end_index) {
249  result = x;
250  }
251  return result;
252 }
253 
269 template <typename T>
270 T shift_sub_group_right(sycl::sub_group g, T x, unsigned int delta,
271  int logical_sub_group_size = 32) {
272  unsigned int id = g.get_local_linear_id();
273  unsigned int start_index =
274  id / logical_sub_group_size * logical_sub_group_size;
275  T result = sycl::shift_group_right(g, x, delta);
276  if ((id - start_index) < delta) {
277  result = x;
278  }
279  return result;
280 }
281 
297 template <typename T>
298 T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask,
299  int logical_sub_group_size = 32) {
300  unsigned int id = g.get_local_linear_id();
301  unsigned int start_index =
302  id / logical_sub_group_size * logical_sub_group_size;
303  unsigned int target_offset = (id % logical_sub_group_size) ^ mask;
304  return sycl::select_from_group(g, x,
305  target_offset < logical_sub_group_size
306  ? start_index + target_offset
307  : id);
308 }
309 
310 namespace experimental {
/// Masked version of select_from_sub_group: only the work-items named in
/// \p member_mask are expected to participate (CUDA path); throws on
/// unsupported backends and on the host.
/// \tparam T Input value type.
/// \param [in] member_mask Bitmask of participating lanes.
/// \param [in] g Input sub_group.
/// \param [in] x Input value.
/// \param [in] remote_local_id Lane to read from within the logical
/// sub-group.
/// \param [in] logical_sub_group_size Width of each logical sub-group.
/// \throws sycl::exception on non-SPIR-V/non-CUDA device backends, and on
/// host or non-Intel compilers.
template <typename T>
T select_from_sub_group(unsigned int member_mask, sycl::sub_group g, T x,
                        int remote_local_id, int logical_sub_group_size = 32) {
  // First lane of the logical sub-group containing this work-item.
  unsigned int start_index =
      g.get_local_linear_id() / logical_sub_group_size * logical_sub_group_size;
  unsigned logical_remote_id =
      start_index + remote_local_id % logical_sub_group_size;
#if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
#if defined(__SPIR__)
  // NOTE(review): member_mask is not applied on the SPIR-V path; the shuffle
  // runs across the whole sub-group — confirm this is intended.
  return __spirv_GroupNonUniformShuffle(__spv::Scope::Subgroup, x,
                                        logical_remote_id);
#elif defined(__NVPTX__)
  // Control value presumably follows the CUDA __shfl_sync encoding
  // (width field in bits 12:8, lane clamp in the low bits) — confirm.
  int cVal = ((32 - logical_sub_group_size) << 8) | 31;
  return cuda_shfl_sync_idx_i32(member_mask, x, remote_local_id, cVal);
#else
  throw sycl::exception(sycl::errc::runtime,
                        "[SYCLcompat] Masked version of select_from_sub_group "
                        "only supports SPIR-V or cuda backends.");
#endif // __SPIR__
#else
  // Host / non-Intel compilation: the intrinsics above are unavailable.
  (void)g;
  (void)x;
  (void)remote_local_id;
  (void)logical_sub_group_size;
  (void)member_mask;
  throw sycl::exception(
      sycl::errc::runtime,
      "[SYCLcompat] Masked version of select_from_sub_group not "
      "supported on host device and non intel compiler.");
#endif // __SYCL_DEVICE_ONLY__ && __INTEL_LLVM_COMPILER
}
355 
/// Masked version of shift_sub_group_left: only the work-items named in
/// \p member_mask are expected to participate (CUDA path); throws on
/// unsupported backends and on the host.
/// \tparam T Input value type.
/// \param [in] member_mask Bitmask of participating lanes.
/// \param [in] g Input sub_group.
/// \param [in] x Input value.
/// \param [in] delta Number of lanes to shift by.
/// \param [in] logical_sub_group_size Width of each logical sub-group.
/// \throws sycl::exception on non-SPIR-V/non-CUDA device backends, and on
/// host or non-Intel compilers.
template <typename T>
T shift_sub_group_left(unsigned int member_mask, sycl::sub_group g, T x,
                       unsigned int delta, int logical_sub_group_size = 32) {
  unsigned int id = g.get_local_linear_id();
  // One past the last lane of this work-item's logical sub-group.
  unsigned int end_index =
      (id / logical_sub_group_size + 1) * logical_sub_group_size;
#if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
#if defined(__SPIR__)
  T result =
      __spirv_GroupNonUniformShuffleDown(__spv::Scope::Subgroup, x, delta);
  // Lanes whose source lies outside the logical sub-group keep their value.
  if ((id + delta) >= end_index) {
    result = x;
  }
  return result;
#elif defined(__NVPTX__)
  // Control value presumably follows the CUDA __shfl_sync encoding — confirm.
  int cVal = ((32 - logical_sub_group_size) << 8) | 31;
  return cuda_shfl_sync_down_i32(member_mask, x, delta, cVal);
#else
  throw sycl::exception(sycl::errc::runtime,
                        "[SYCLcompat] Masked version of shift_sub_group_left "
                        "only supports SPIR-V or cuda backends.");
#endif // __SPIR__
#else
  // Host / non-Intel compilation: the intrinsics above are unavailable.
  (void)g;
  (void)x;
  (void)delta;
  (void)logical_sub_group_size;
  (void)member_mask;
  throw sycl::exception(
      sycl::errc::runtime,
      "[SYCLcompat] Masked version of shift_sub_group_left not "
      "supported on host device and non intel compiler.");
#endif // __SYCL_DEVICE_ONLY__ && __INTEL_LLVM_COMPILER
}
403 
417 template <typename T>
418 T shift_sub_group_right(unsigned int member_mask, sycl::sub_group g, T x,
419  unsigned int delta, int logical_sub_group_size = 32) {
420  unsigned int id = g.get_local_linear_id();
421  unsigned int start_index =
422  id / logical_sub_group_size * logical_sub_group_size;
423 #if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
424 #if defined(__SPIR__)
425  T result = __spirv_GroupNonUniformShuffleUp(__spv::Scope::Subgroup, x, delta);
426  if ((id - start_index) < delta) {
427  result = x;
428  }
429  return result;
430 #elif defined(__NVPTX__)
431  int cVal = ((32 - logical_sub_group_size) << 8);
432  return cuda_shfl_sync_up_i32(member_mask, x, delta, cVal);
433 #else
434  throw sycl::exception(sycl::errc::runtime,
435  "Masked version of shift_sub_group_right "
436  "only supports SPIR-V or cuda backends.");
437 #endif // __SPIR__
438 #else
439  (void)g;
440  (void)x;
441  (void)delta;
442  (void)logical_sub_group_size;
443  (void)member_mask;
444  throw sycl::exception(sycl::errc::runtime,
445  "Masked version of shift_sub_group_right not "
446  "supported on host device and non intel compiler.");
447 #endif // __SYCL_DEVICE_ONLY && __INTEL_LLVM_COMPILER
448 }
449 
463 template <typename T>
464 T permute_sub_group_by_xor(unsigned int member_mask, sycl::sub_group g, T x,
465  unsigned int mask, int logical_sub_group_size = 32) {
466  unsigned int id = g.get_local_linear_id();
467  unsigned int start_index =
468  id / logical_sub_group_size * logical_sub_group_size;
469  unsigned int target_offset = (id % logical_sub_group_size) ^ mask;
470  unsigned logical_remote_id = (target_offset < logical_sub_group_size)
471  ? start_index + target_offset
472  : id;
473 #if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
474 #if defined(__SPIR__)
475  return __spirv_GroupNonUniformShuffle(__spv::Scope::Subgroup, x,
476  logical_remote_id);
477 #elif defined(__NVPTX__)
478  int cVal = ((32 - logical_sub_group_size) << 8) | 31;
479  return cuda_shfl_sync_bfly_i32(member_mask, x, mask, cVal);
480 #else
481  throw sycl::exception(
482  sycl::errc::runtime,
483  "[SYCLcompat] Masked version of permute_sub_group_by_xor "
484  "only supports SPIR-V or cuda backends.");
485 #endif // __SPIR__
486 #else
487  (void)g;
488  (void)x;
489  (void)mask;
490  (void)logical_sub_group_size;
491  (void)member_mask;
492  throw sycl::exception(
493  sycl::errc::runtime,
494  "[SYCLcompat]Masked version of permute_sub_group_by_xor not "
495  "supported on host device and non intel compiler.");
496 #endif // __SYCL_DEVICE_ONLY__ && __INTEL_LLVM_COMPILER
497 }
498 } // namespace experimental
499 
#ifdef SYCL_LANGUAGE_VERSION
  return SYCL_LANGUAGE_VERSION;
#else
  // Fallback value when the compiler does not define SYCL_LANGUAGE_VERSION.
  return 202000;
#endif
}
509 
/// Compares \p value across the work-items of a sub-group. For each
/// participating work-item (those whose bit is set in \p member_mask) the
/// result is the bitmask of participants holding an equal value; 0 is
/// returned when \p member_mask is empty, and for non-participants the
/// initial 0 is kept unless their value matches a broadcast round.
/// \tparam T Arithmetic value type being compared.
/// \param [in] g Input sub_group.
/// \param [in] member_mask Bitmask of participating lanes.
/// \param [in] value Value to compare.
template <typename T>
unsigned int match_any_over_sub_group(sycl::sub_group g, unsigned member_mask,
                                      T value) {
  static_assert(std::is_arithmetic_v<T>, "Value type must be arithmetic type.");
  if (!member_mask) {
    return 0;
  }
  unsigned int id = g.get_local_linear_id();
  unsigned int flag = 0, result = 0, reduce_result = 0;
  // Bit identifying this work-item inside the 32-bit masks.
  unsigned int bit_index = 0x1 << id;
  bool is_participate = member_mask & bit_index;
  T broadcast_value = 0;
  bool matched = false;
  // Loop until every participant's bit has been accumulated into `flag`:
  // each round broadcasts the value of the lowest not-yet-processed
  // participant and reduces the mask of participants holding that value.
  while (flag != member_mask) {
    // ctz picks the lowest unprocessed participant as the broadcast source.
    broadcast_value =
        sycl::select_from_group(g, value, sycl::ctz((~flag & member_mask)));
    reduce_result = sycl::reduce_over_group(
        g, is_participate ? (broadcast_value == value ? bit_index : 0) : 0,
        sycl::plus<>());
    flag |= reduce_result;
    matched = reduce_result & bit_index;
    // Branchless select: adopt this round's mask only on matching lanes.
    result = matched * reduce_result + (1 - matched) * result;
  }
  return result;
}
546 
/// Compares \p value across the work-items of a sub-group. Returns
/// \p member_mask (and sets *\p pred to true) for participating work-items
/// when ALL participants hold an equal value; otherwise returns 0 and sets
/// *\p pred to false. Returns 0 immediately for an empty \p member_mask.
/// \tparam T Arithmetic value type being compared.
/// \param [in] g Input sub_group.
/// \param [in] member_mask Bitmask of participating lanes.
/// \param [in] value Value to compare.
/// \param [out] pred Set to whether this lane participates in a full match.
template <typename T>
unsigned int match_all_over_sub_group(sycl::sub_group g, unsigned member_mask,
                                      T value, int *pred) {
  static_assert(std::is_arithmetic_v<T>, "Value type must be arithmetic type.");
  if (!member_mask) {
    return 0;
  }
  unsigned int id = g.get_local_linear_id();
  // Bit identifying this work-item inside the 32-bit masks.
  unsigned int bit_index = 0x1 << id;
  bool is_participate = member_mask & bit_index;
  // Broadcast the value held by the lowest-indexed participant.
  T broadcast_value = sycl::select_from_group(g, value, sycl::ctz(member_mask));
  // Accumulate the bit of every participant whose value equals the broadcast.
  unsigned int reduce_result = sycl::reduce_over_group(
      g,
      (member_mask & bit_index) ? (broadcast_value == value ? bit_index : 0)
                                : 0,
      sycl::plus<>());
  bool all_equal = (reduce_result == member_mask);
  *pred = is_participate & all_equal;
  return (is_participate & all_equal) * member_mask;
}
580 
581 namespace experimental {
582 
583 // FIXME(@intel/syclcompat-lib-reviewers): unify once supported in the CUDA and
584 // AMD backends.
#if defined(__AMDGPU__) || defined(__NVPTX__)
// Weaker order used on these backends until seq_cst is supported there
// (see the FIXME above).
constexpr sycl::memory_order barrier_memory_order = sycl::memory_order::acq_rel;
#else
constexpr sycl::memory_order barrier_memory_order = sycl::memory_order::seq_cst;
#endif
590 
598 template <int dimensions = 3>
599 inline void nd_range_barrier(
600  const sycl::nd_item<dimensions> &item,
603  sycl::access::address_space::global_space> &counter) {
604 
605  static_assert(dimensions == 3, "dimensions must be 3.");
606  constexpr unsigned int MSB32_MASK = 0x80000000;
607 
608  unsigned int num_groups = item.get_group_range(2) * item.get_group_range(1) *
609  item.get_group_range(0);
610 
611  item.barrier();
612 
613  if (item.get_local_linear_id() == 0) {
614  unsigned int inc = 1;
615  unsigned int old_arrive = 0;
616  bool is_group0 =
617  (item.get_group(2) + item.get_group(1) + item.get_group(0) == 0);
618  if (is_group0) {
619  inc = MSB32_MASK - (num_groups - 1);
620  }
621 
622  old_arrive = counter.fetch_add(inc);
623  // Synchronize all the work groups
624  while (((old_arrive ^ counter.load()) & MSB32_MASK) == 0)
625  ;
626  }
627 
628  item.barrier();
629 }
630 
638 template <>
639 inline void nd_range_barrier(
640  const sycl::nd_item<1> &item,
643  sycl::access::address_space::global_space> &counter) {
644  unsigned int num_groups = item.get_group_range(0);
645  constexpr unsigned int MSB32_MASK = 0x80000000;
646 
647  item.barrier();
648 
649  if (item.get_local_linear_id() == 0) {
650  unsigned int inc = 1;
651  unsigned int old_arrive = 0;
652  bool is_group0 = (item.get_group(0) == 0);
653  if (is_group0) {
654  inc = MSB32_MASK - (num_groups - 1);
655  }
656 
657  old_arrive = counter.fetch_add(inc);
658  // Synchronize all the work groups
659  while (((old_arrive ^ counter.load()) & MSB32_MASK) == 0)
660  ;
661  }
662 
663  item.barrier();
664 }
665 
670 template <int dimensions = 3> class logical_group {
673  uint32_t _logical_group_size;
674  uint32_t _group_linear_range_in_parent;
675 
676 public:
682  sycl::group<dimensions> parent_group, uint32_t size)
683  : _item(item), _g(parent_group), _logical_group_size(size) {
684  _group_linear_range_in_parent =
685  (_g.get_local_linear_range() - 1) / _logical_group_size + 1;
686  }
688  : _item(item), _g(item.get_group()) {}
690  uint32_t get_local_linear_id() const {
691  return _item.get_local_linear_id() % _logical_group_size;
692  }
694  uint32_t get_group_linear_id() const {
695  return _item.get_local_linear_id() / _logical_group_size;
696  }
698  uint32_t get_local_linear_range() const {
699  if (_g.get_local_linear_range() % _logical_group_size == 0) {
700  return _logical_group_size;
701  }
702  uint32_t last_item_group_id =
703  _g.get_local_linear_range() / _logical_group_size;
704  uint32_t first_of_last_group = last_item_group_id * _logical_group_size;
705  if (_item.get_local_linear_id() >= first_of_last_group) {
706  return _g.get_local_linear_range() - first_of_last_group;
707  } else {
708  return _logical_group_size;
709  }
710  }
712  uint32_t get_group_linear_range() const {
713  return _group_linear_range_in_parent;
714  }
715 };
716 
717 // The original source of the functions calculate_max_active_wg_per_xecore and
718 // calculate_max_potential_wg were under the license below:
719 //
720 // Copyright (C) Intel Corporation
721 //
722 // Permission is hereby granted, free of charge, to any person obtaining a copy
723 // of this software and associated documentation files (the "Software"), to deal
724 // in the Software without restriction, including without limitation the rights
725 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
726 // copies of the Software, and to permit persons to whom the Software is
727 // furnished to do so, subject to the following conditions:
728 //
729 // The above copyright notice and this permission notice shall be included in
730 // all copies or substantial portions of the Software.
731 //
732 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
733 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
734 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
735 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
736 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
737 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
738 // SOFTWARE.
739 //
752 inline int calculate_max_active_wg_per_xecore(int *num_wg, int wg_size,
753  int slm_size = 0,
754  int sg_size = 32,
755  bool used_barrier = false,
756  bool used_large_grf = false) {
757  int ret = 0;
758  const int slm_size_per_xe_core = 64 * 1024;
759  const int max_barrier_registers = 32;
761 
762  size_t max_wg_size = dev.get_info<sycl::info::device::max_work_group_size>();
763  if (wg_size > max_wg_size) {
764  wg_size = max_wg_size;
765  ret = -1;
766  }
767 
768  int num_threads_ss = 56;
769  int max_num_wg = 56;
770  if (dev.has(sycl::aspect::ext_intel_gpu_eu_count_per_subslice) &&
771  dev.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
772  auto eu_count =
773  dev.get_info<sycl::info::device::ext_intel_gpu_eu_count_per_subslice>();
774  auto threads_count =
775  dev.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
776  num_threads_ss = eu_count * threads_count;
777  max_num_wg = eu_count * threads_count;
778  }
779 
780  if (used_barrier) {
781  max_num_wg = max_barrier_registers;
782  }
783 
784  // Calculate num_wg_slm
785  int num_wg_slm = 0;
786  if (slm_size == 0) {
787  num_wg_slm = max_num_wg;
788  } else {
789  num_wg_slm = std::floor((float)slm_size_per_xe_core / slm_size);
790  }
791 
792  // Calculate num_wg_threads
793  if (used_large_grf)
794  num_threads_ss = num_threads_ss / 2;
795  int num_threads = std::ceil((float)wg_size / sg_size);
796  int num_wg_threads = std::floor((float)num_threads_ss / num_threads);
797 
798  // Calculate num_wg
799  *num_wg = std::min(num_wg_slm, num_wg_threads);
800  *num_wg = std::min(*num_wg, max_num_wg);
801  return ret;
802 }
803 
817 inline int calculate_max_potential_wg(int *num_wg, int *wg_size,
818  int max_wg_size_for_device_code,
819  int slm_size = 0, int sg_size = 32,
820  bool used_barrier = false,
821  bool used_large_grf = false) {
823  size_t max_wg_size = dev.get_info<sycl::info::device::max_work_group_size>();
824  if (max_wg_size_for_device_code == 0 ||
825  max_wg_size_for_device_code >= max_wg_size)
826  *wg_size = (int)max_wg_size;
827  else
828  *wg_size = max_wg_size_for_device_code;
829  calculate_max_active_wg_per_xecore(num_wg, *wg_size, slm_size, sg_size,
830  used_barrier, used_large_grf);
831  std::uint32_t num_ss = 1;
832  if (dev.has(sycl::aspect::ext_intel_gpu_slices) &&
833  dev.has(sycl::aspect::ext_intel_gpu_subslices_per_slice)) {
834  num_ss =
835  dev.get_info<sycl::ext::intel::info::device::gpu_slices>() *
836  dev.get_info<sycl::ext::intel::info::device::gpu_subslices_per_slice>();
837  }
838  num_wg[0] = num_ss * num_wg[0];
839  return 0;
840 }
841 
844 
/// The group_base will dispatch the function call to the specific interface
/// based on the group type.
/// NOTE(review): several lines of this class are missing from this extraction
/// of the file — the constructor head, the getter signatures, the
/// switch-case labels (presumably on a group_type member), and the member
/// declarations — reconstruct them from the original source before relying
/// on this listing.
template <int dimensions = 3> class group_base {
public:
      : nd_item(item), logical_group(item) {}
  // Returns the number of work-items in the group (dispatched on type).
  switch (type) {
    return nd_item.get_group().get_local_linear_range();
    return nd_item.get_sub_group().get_local_linear_range();
    default:
      return -1; // Unknown group type
    }
  }
  // Returns the index of the work-item within the group (dispatched on type).
  switch (type) {
    return nd_item.get_group().get_local_linear_id();
    return nd_item.get_sub_group().get_local_linear_id();
    default:
      return -1; // Unknown group type
    }
  }
  /// Wait for all the elements within the group to complete their execution
  /// before proceeding.
  void barrier() {
    switch (type) {
      sycl::group_barrier(nd_item.get_group());
      break;
      sycl::group_barrier(nd_item.get_sub_group());
      break;
    default:
      break;
    }
  }

protected:
};
899 
/// Container type that can store supported group_types.
/// NOTE(review): the constructor head and parts of the if-constexpr chain
/// (including the type-tag assignments) are missing from this extraction —
/// reconstruct them from the original source before relying on this listing.
template <typename GroupT, int dimensions = 3>
class group : public group_base<dimensions> {

public:
      : group_base<dimensions>(item) {
    if constexpr (std::is_same_v<GroupT, sycl::sub_group>) {
    } else if constexpr (std::is_same_v<GroupT, sycl::group<dimensions>>) {
    } else if constexpr (std::is_same_v<
      logical_group = g;
    }
  }
};
920 } // namespace experimental
921 
924 inline queue_ptr int_as_queue_ptr(uintptr_t x) {
926  : reinterpret_cast<queue_ptr>(x);
927 }
928 
/// Primary template for args_selector; only the R(Ts...) function-type
/// partial specialization below is defined.
/// NOTE(review): the declaration line was missing from this extraction and
/// was restored to match the specialization that follows.
template <int n_nondefault_params, int n_default_params, typename T>
class args_selector;
/// Extracts the arguments of a kernel with signature R(Ts...) from either an
/// array of per-argument pointers (`kernel_params`) or a packed argument
/// buffer (`extra`), mirroring the CUDA driver's kernel-launch conventions.
/// \tparam n_nondefault_params Number of leading arguments being extracted
/// before the trailing defaulted ones.
/// \tparam n_default_params Number of trailing defaulted arguments of the
/// kernel that indices past n_nondefault_params map onto.
template <int n_nondefault_params, int n_default_params, typename R,
          typename... Ts>
class args_selector<n_nondefault_params, n_default_params, R(Ts...)> {
private:
  // Exactly one of these is non-null, depending on which constructor input
  // was provided (kernel_params wins when both are given — see get()).
  void **kernel_params;
  char *args_buffer;

  // Maps the caller-visible index i onto the kernel's parameter list: indices
  // >= n_nondefault_params address the trailing defaulted parameters.
  template <int i> static constexpr int account_for_default_params() {
    constexpr int n_total_params = sizeof...(Ts);
    if constexpr (i >= n_nondefault_params) {
      return n_total_params - n_default_params + (i - n_nondefault_params);
    } else {
      return i;
    }
  }

public:
  /// Get the type of the ith argument of R(Ts...).
  template <int i>
  using arg_type =
      std::tuple_element_t<account_for_default_params<i>(), std::tuple<Ts...>>;

private:
  // Byte offset of argument i inside args_buffer, computed at compile time by
  // laying arguments out in order with natural alignment.
  template <int i> static constexpr int get_offset() {
    if constexpr (i == 0) {
      // we can assume args_buffer is properly aligned to the
      // first argument
      return 0;
    } else {
      constexpr int prev_off = get_offset<i - 1>();
      constexpr int prev_past_end = prev_off + sizeof(arg_type<i - 1>);
      using T = arg_type<i>;
      // is the past-the-end of the i-1st element properly aligned
      // with the ith element's alignment?
      if constexpr (prev_past_end % alignof(T) == 0) {
        return prev_past_end;
      }
      // otherwise bump prev_past_end to match alignment
      else {
        return prev_past_end + (alignof(T) - (prev_past_end % alignof(T)));
      }
    }
  }

  // Scans the `extra` tag/value list for the argument-buffer tag (value 1)
  // and returns the buffer pointer that follows it; nullptr when absent.
  // The list is terminated by a 0 tag.
  static char *get_args_buffer(void **extra) {
    if (!extra)
      return nullptr;
    for (; (std::size_t)*extra != 0; ++extra) {
      if ((std::size_t)*extra == 1) {
        return static_cast<char *>(*(extra + 1));
      }
    }
    return nullptr;
  }

public:
  /// If \p kernel_params is nonnull, arguments are extracted from it;
  /// otherwise they are extracted from the packed buffer found in \p extra.
  args_selector(void **kernel_params, void **extra)
      : kernel_params(kernel_params), args_buffer(get_args_buffer(extra)) {}

  /// Get a reference to the ith argument extracted from kernel_params or
  /// extra.
  template <int i> arg_type<i> &get() {
    if (kernel_params) {
      return *static_cast<arg_type<i> *>(kernel_params[i]);
    } else {
      return *reinterpret_cast<arg_type<i> *>(args_buffer + get_offset<i>());
    }
  }
};
1030 
1031 } // namespace syclcompat
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Definition: device.hpp:64
detail::is_device_info_desc< Param >::return_type get_info() const
Queries this SYCL device for information requested by the template parameter param.
Definition: device.hpp:215
bool has(aspect Aspect) const __SYCL_WARN_IMAGE_ASPECT(Aspect)
Indicates if the SYCL device has the given feature.
Definition: device.cpp:207
void wait()
Wait for the event.
Definition: event.cpp:41
Identifies an instance of the function object executing at each point in an nd_range.
Definition: nd_item.hpp:48
size_t get_local_linear_id() const
Definition: nd_item.hpp:97
group< Dimensions > get_group() const
Definition: nd_item.hpp:113
range< Dimensions > get_group_range() const
Definition: nd_item.hpp:144
void barrier(access::fence_space accessSpace=access::fence_space::global_and_local) const
Definition: nd_item.hpp:200
Encapsulates a single SYCL queue which schedules kernels on a SYCL device.
Definition: queue.hpp:110
arg_type< i > & get()
Get a reference to the ith argument extracted from kernel_params or extra.
Definition: util.hpp:1022
args_selector(void **kernel_params, void **extra)
If kernel_params is nonnull, then args_selector will extract arguments from kernel_params.
Definition: util.hpp:1015
std::tuple_element_t< account_for_default_params< i >(), std::tuple< Ts... > > arg_type
Get the type of the ith argument of R(Ts...)
Definition: util.hpp:973
static dev_mgr & instance()
Returns the instance of device manager singleton.
Definition: device.hpp:813
device_ext & current_device()
Definition: device.hpp:703
device extension
Definition: device.hpp:338
queue_ptr default_queue()
Definition: device.hpp:564
The group_base will dispatch the function call to the specific interface based on the group type.
Definition: util.hpp:847
size_t get_local_linear_range()
Returns the number of work-items in the group.
Definition: util.hpp:853
logical_group< dimensions > logical_group
Definition: util.hpp:895
group_base(sycl::nd_item< dimensions > item)
Definition: util.hpp:849
void barrier()
Wait for all the elements within the group to complete their execution before proceeding.
Definition: util.hpp:880
sycl::nd_item< dimensions > nd_item
Definition: util.hpp:896
size_t get_local_linear_id()
Returns the index of the work-item within the group.
Definition: util.hpp:866
Container type that can store supported group_types.
Definition: util.hpp:902
group(GroupT g, sycl::nd_item< dimensions > item)
Definition: util.hpp:907
The logical-group is a logical collection of some work-items within a work-group.
Definition: util.hpp:670
uint32_t get_group_linear_range() const
Returns the number of logical-group in the parent group.
Definition: util.hpp:712
uint32_t get_local_linear_range() const
Returns the number of work-items in the logical-group.
Definition: util.hpp:698
logical_group(sycl::nd_item< dimensions > item)
Definition: util.hpp:687
logical_group(sycl::nd_item< dimensions > item, sycl::group< dimensions > parent_group, uint32_t size)
Dividing parent_group into several logical-groups.
Definition: util.hpp:681
uint32_t get_group_linear_id() const
Returns the index of the logical-group in the parent group.
Definition: util.hpp:694
uint32_t get_local_linear_id() const
Returns the index of the work-item within the logical-group.
Definition: util.hpp:690
#define SYCL_EXTERNAL
__ESIMD_API std::enable_if_t<(sizeof(T) *N >=2)> wait(sycl::ext::intel::esimd::simd< T, N > value)
Create explicit scoreboard dependency to avoid device code motion across this call and preserve the v...
Definition: memory.hpp:213
__attribute__((always_inline)) auto invoke_simd(sycl
The invoke_simd free function invokes a SIMD function using all work-items in a sub_group.
float ceil(float)
std::enable_if_t<((std::is_same_v< std::decay_t< Group >, sub_group >||sycl::ext::oneapi::experimental::is_user_constructed_group_v< std::decay_t< Group >>) &&(std::is_trivially_copyable_v< T >||detail::is_vec< T >::value)), T > shift_group_left(Group g, T x, typename Group::linear_id_type delta=1)
std::enable_if_t<((std::is_same_v< std::decay_t< Group >, sub_group >||sycl::ext::oneapi::experimental::is_user_constructed_group_v< std::decay_t< Group >>) &&(std::is_trivially_copyable_v< T >||detail::is_vec< T >::value)), T > shift_group_right(Group g, T x, typename Group::linear_id_type delta=1)
void group_barrier(ext::oneapi::experimental::root_group< dimensions > G, memory_scope FenceScope=decltype(G)::fence_scope)
Definition: root_group.hpp:100
class __SYCL_EBO vec
Definition: aliases.hpp:18
std::plus< T > plus
Definition: functional.hpp:18
std::enable_if_t<(is_group_v< std::decay_t< Group >> &&(detail::is_scalar_arithmetic< T >::value||(detail::is_complex< T >::value &&detail::is_multiplies< T, BinaryOperation >::value)) &&detail::is_native_op< T, BinaryOperation >::value), T > reduce_over_group(Group g, T x, BinaryOperation binary_op)
std::enable_if_t<((std::is_same_v< std::decay_t< Group >, sub_group >||sycl::ext::oneapi::experimental::is_user_constructed_group_v< std::decay_t< Group >>) &&(std::is_trivially_copyable_v< T >||detail::is_vec< T >::value)), T > select_from_group(Group g, T x, typename Group::id_type local_id)
float floor(float)
Definition: access.hpp:18
static sycl::event memcpy(sycl::queue q, void *to_ptr, const void *from_ptr, size_t size, const std::vector< sycl::event > &dep_events={})
Definition: memory.hpp:315
detail::complex_namespace::complex< ValueT > complex_type
Definition: math.hpp:47
void matrix_mem_copy(void *to_ptr, const void *from_ptr, int to_ld, int from_ld, int rows, int cols, int elem_size, sycl::queue queue=syclcompat::get_default_queue(), bool async=false)
Definition: util.hpp:87
static size_t get_offset(sycl::id< 3 > id, size_t slice, size_t pitch)
Definition: memory.hpp:329
int calculate_max_active_wg_per_xecore(int *num_wg, int wg_size, int slm_size=0, int sg_size=32, bool used_barrier=false, bool used_large_grf=false)
This function is used for occupancy calculation, it computes the max active work-group number per Xe-...
Definition: util.hpp:752
T shift_sub_group_left(unsigned int member_mask, sycl::sub_group g, T x, unsigned int delta, int logical_sub_group_size=32)
Masked version of shift_sub_group_left, which executes a masked sub-group operation.
Definition: util.hpp:370
constexpr sycl::memory_order barrier_memory_order
Definition: util.hpp:588
group_type
Supported group types.
Definition: util.hpp:843
int calculate_max_potential_wg(int *num_wg, int *wg_size, int max_wg_size_for_device_code, int slm_size=0, int sg_size=32, bool used_barrier=false, bool used_large_grf=false)
This function is used for occupancy calculation, it computes the work-group number and the work-group...
Definition: util.hpp:817
T select_from_sub_group(unsigned int member_mask, sycl::sub_group g, T x, int remote_local_id, int logical_sub_group_size=32)
Masked version of select_from_sub_group, which executes a masked sub-group operation.
Definition: util.hpp:325
T permute_sub_group_by_xor(unsigned int member_mask, sycl::sub_group g, T x, unsigned int mask, int logical_sub_group_size=32)
Masked version of permute_sub_group_by_xor, which executes a masked sub-group operation.
Definition: util.hpp:464
T shift_sub_group_right(unsigned int member_mask, sycl::sub_group g, T x, unsigned int delta, int logical_sub_group_size=32)
Masked version of shift_sub_group_right, which executes a masked sub-group operation.
Definition: util.hpp:418
void nd_range_barrier(const sycl::nd_item< dimensions > &item, sycl::atomic_ref< unsigned int, barrier_memory_order, sycl::memory_scope::device, sycl::access::address_space::global_space > &counter)
Synchronize work items from all work groups within a SYCL kernel.
Definition: util.hpp:599
double cast_ints_to_double(int high32, int low32)
Combine two integers, the first as the high 32 bits and the second as the low 32 bits,...
Definition: util.hpp:152
T select_from_sub_group(sycl::sub_group g, T x, int remote_local_id, int logical_sub_group_size=32)
select_from_sub_group allows work-items to obtain a copy of a value held by any other work-item in th...
Definition: util.hpp:218
queue_ptr int_as_queue_ptr(uintptr_t x)
If x <= 2, then return a pointer to the default queue; otherwise, return x reinterpreted as a queue_p...
Definition: util.hpp:924
unsigned int match_any_over_sub_group(sycl::sub_group g, unsigned member_mask, T value)
The function match_any_over_sub_group conducts a comparison of values across work-items within a sub-...
Definition: util.hpp:522
unsigned int byte_level_permute(unsigned int a, unsigned int b, unsigned int s)
Definition: util.hpp:182
static sycl::queue get_default_queue()
Util function to get the default queue of current device in device manager.
Definition: device.hpp:872
static device_ext & get_current_device()
Util function to get the current device.
Definition: device.hpp:900
unsigned int match_all_over_sub_group(sycl::sub_group g, unsigned member_mask, T value, int *pred)
The function match_all_over_sub_group conducts a comparison of values across work-items within a sub-...
Definition: util.hpp:561
T reverse_bits(T a)
Reverse the bit order of an unsigned integer.
Definition: util.hpp:161
int get_sycl_language_version()
Inherited from the original SYCLomatic compatibility headers.
Definition: util.hpp:502
int cast_double_to_int(double d, bool use_high32=true)
Cast the high or low 32 bits of a double to an integer.
Definition: util.hpp:140
T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask, int logical_sub_group_size=32)
permute_sub_group_by_xor permutes values by exchanging values held by pairs of work-items identified ...
Definition: util.hpp:298
T shift_sub_group_right(sycl::sub_group g, T x, unsigned int delta, int logical_sub_group_size=32)
shift_sub_group_right move values held by the work-items in a sub_group directly to another work-item...
Definition: util.hpp:270
T shift_sub_group_left(sycl::sub_group g, T x, unsigned int delta, int logical_sub_group_size=32)
shift_sub_group_left move values held by the work-items in a sub_group directly to another work-item ...
Definition: util.hpp:242
int ffs(T a)
Find position of first least significant set bit in an integer.
Definition: util.hpp:198
#define __SYCL_CONVERGENT__
Definition: spirv_ops.hpp:23
_Abi const simd< _Tp, _Abi > & noexcept
Definition: simd.hpp:1324
linear_id_type get_local_linear_id() const
Definition: sub_group.hpp:153