DPC++ Runtime
Runtime libraries for oneAPI DPC++
util.hpp
Go to the documentation of this file.
1 /***************************************************************************
2  *
3  * Copyright (C) Codeplay Software Ltd.
4  *
5  * Part of the LLVM Project, under the Apache License v2.0 with LLVM
6  * Exceptions. See https://llvm.org/LICENSE.txt for license information.
7  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  *
15  * SYCL compatibility extension
16  *
17  * util.hpp
18  *
19  * Description:
20  * util functionality for the SYCL compatibility extension
21  **************************************************************************/
22 
23 // The original source was under the license below:
24 //==---- util.hpp ---------------------------------*- C++ -*----------------==//
25 //
26 // Copyright (C) Intel Corporation
27 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
28 // See https://llvm.org/LICENSE.txt for license information.
29 //
30 //===----------------------------------------------------------------------===//
31 
32 #pragma once
33 
34 #include <cassert>
35 #include <type_traits>
36 
37 #include <sycl/sycl.hpp>
38 
39 #include <syclcompat/math.hpp>
40 #include <syclcompat/memory.hpp>
41 
42 #if defined(__NVPTX__)
44 #endif
45 
46 // TODO: Remove these function definitions once they exist in the DPC++ compiler
#if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
// Forward declarations of the SPIR-V non-uniform shuffle built-ins used by
// the masked sub-group helpers in syclcompat::experimental below. They are
// only visible in device compilation with the Intel LLVM compiler.
template <typename T>
__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT
    __attribute__((noduplicate)) T
    __spirv_GroupNonUniformShuffle(__spv::Scope::Flag, T, unsigned) noexcept;

template <typename T>
__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT
    __attribute__((noduplicate)) T
    __spirv_GroupNonUniformShuffleDown(__spv::Scope::Flag, T,
                                       unsigned) noexcept;

template <typename T>
__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT
    __attribute__((noduplicate)) T
    __spirv_GroupNonUniformShuffleUp(__spv::Scope::Flag, T, unsigned) noexcept;
#endif
64 
65 namespace syclcompat {
66 
67 namespace detail {
68 
/// Thin wrapper around a value of type T, tagged with `tag` so that distinct
/// error categories sharing the same representation remain distinct C++
/// types. Converts implicitly to and from T by design.
template <typename tag, typename T> class generic_error_type {
private:
  T value_;

public:
  generic_error_type() = default;
  generic_error_type(T value) : value_{value} {}
  /// Implicit conversion back to the wrapped representation.
  operator T() const { return value_; }
};
78 
/// Maps a type T to the element type used when computing matrix copy sizes.
/// Primary template: the type maps to itself.
template <typename T> struct DataType {
  using T2 = T;
};
/// Specialization for 2-element sycl::vec.
/// NOTE(review): the body of this specialization appears to have been lost
/// in extraction — it presumably defines a `using T2 = ...;` alias (likely a
/// complex type, cf. the complex_type alias in math.hpp); verify against the
/// upstream syclcompat header.
template <typename T> struct DataType<sycl::vec<T, 2>> {
};
85 
86 inline void matrix_mem_copy(void *to_ptr, const void *from_ptr, int to_ld,
87  int from_ld, int rows, int cols, int elem_size,
89  bool async = false) {
90  if (to_ptr == from_ptr && to_ld == from_ld) {
91  return;
92  }
93 
94  if (to_ld == from_ld) {
95  size_t copy_size = elem_size * ((cols - 1) * (size_t)to_ld + rows);
96  if (async)
97  detail::memcpy(queue, (void *)to_ptr, (void *)from_ptr, copy_size);
98  else
99  detail::memcpy(queue, (void *)to_ptr, (void *)from_ptr, copy_size).wait();
100  } else {
101  if (async)
102  detail::memcpy(queue, to_ptr, from_ptr, elem_size * to_ld,
103  elem_size * from_ld, elem_size * rows, cols);
104  else
105  sycl::event::wait(detail::memcpy(queue, to_ptr, from_ptr,
106  elem_size * to_ld, elem_size * from_ld,
107  elem_size * rows, cols));
108  }
109 }
110 
121 template <typename T>
122 inline void matrix_mem_copy(T *to_ptr, const T *from_ptr, int to_ld,
123  int from_ld, int rows, int cols,
124  sycl::queue queue = get_default_queue(),
125  bool async = false) {
126  using Ty = typename DataType<T>::T2;
127  matrix_mem_copy((void *)to_ptr, (void *)from_ptr, to_ld, from_ld, rows, cols,
128  sizeof(Ty), queue, async);
129 }
130 } // namespace detail
131 
134 
139 inline int cast_double_to_int(double d, bool use_high32 = true) {
140  sycl::vec<double, 1> v0{d};
141  auto v1 = v0.as<sycl::int2>();
142  if (use_high32)
143  return v1[0];
144  return v1[1];
145 }
146 
151 inline double cast_ints_to_double(int high32, int low32) {
152  sycl::int2 v0{high32, low32};
153  auto v1 = v0.as<sycl::vec<double, 1>>();
154  return v1;
155 }
156 
/// Reverse the bit order of an unsigned integer.
/// \tparam T An unsigned integral type.
/// \param a Value to reverse.
/// \return `a` with its bits mirrored (bit 0 becomes the most significant).
template <typename T> inline T reverse_bits(T a) {
  static_assert(std::is_unsigned<T>::value && std::is_integral<T>::value,
                "unsigned integer required");
  if (!a)
    return 0;
  // Shift each source bit out of the low end and into the low end of the
  // result, which pushes earlier bits toward the high end.
  T reversed = 0;
  for (unsigned bit = 0; bit < sizeof(T) * 8; ++bit) {
    reversed = (T)((reversed << 1) | (a & 1));
    a >>= 1;
  }
  return reversed;
}
175 
/// Assemble a 32-bit word by picking four bytes out of the 64-bit
/// concatenation [b:a]. Each nibble of `s` (from least to most significant)
/// selects one byte for the corresponding result byte; only the low 3 bits
/// of each nibble are used, giving byte indices 0..7.
inline unsigned int byte_level_permute(unsigned int a, unsigned int b,
                                       unsigned int s) {
  const std::uint64_t combined = ((std::uint64_t)b << 32) | a;
  auto pick_byte = [combined](unsigned int sel) -> unsigned int {
    return (combined >> ((sel & 0x7) * 8)) & 0xff;
  };
  return pick_byte(s) | (pick_byte(s >> 4) << 8) | (pick_byte(s >> 8) << 16) |
         (pick_byte(s >> 12) << 24);
}
191 
197 template <typename T> inline int ffs(T a) {
198  static_assert(std::is_integral<T>::value, "integer required");
199  return (sycl::ctz(a) + 1) % (sizeof(T) * 8 + 1);
200 }
201 
216 template <typename T>
217 T select_from_sub_group(sycl::sub_group g, T x, int remote_local_id,
218  int logical_sub_group_size = 32) {
219  unsigned int start_index =
220  g.get_local_linear_id() / logical_sub_group_size * logical_sub_group_size;
222  g, x, start_index + remote_local_id % logical_sub_group_size);
223 }
224 
240 template <typename T>
241 T shift_sub_group_left(sycl::sub_group g, T x, unsigned int delta,
242  int logical_sub_group_size = 32) {
243  unsigned int id = g.get_local_linear_id();
244  unsigned int end_index =
245  (id / logical_sub_group_size + 1) * logical_sub_group_size;
246  T result = sycl::shift_group_left(g, x, delta);
247  if ((id + delta) >= end_index) {
248  result = x;
249  }
250  return result;
251 }
252 
268 template <typename T>
269 T shift_sub_group_right(sycl::sub_group g, T x, unsigned int delta,
270  int logical_sub_group_size = 32) {
271  unsigned int id = g.get_local_linear_id();
272  unsigned int start_index =
273  id / logical_sub_group_size * logical_sub_group_size;
274  T result = sycl::shift_group_right(g, x, delta);
275  if ((id - start_index) < delta) {
276  result = x;
277  }
278  return result;
279 }
280 
296 template <typename T>
297 T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask,
298  int logical_sub_group_size = 32) {
299  unsigned int id = g.get_local_linear_id();
300  unsigned int start_index =
301  id / logical_sub_group_size * logical_sub_group_size;
302  unsigned int target_offset = (id % logical_sub_group_size) ^ mask;
303  return sycl::select_from_group(g, x,
304  target_offset < logical_sub_group_size
305  ? start_index + target_offset
306  : id);
307 }
308 
309 namespace experimental {
/// Masked version of select_from_sub_group, restricted to the work-items in
/// member_mask. Only supported on the SPIR-V and CUDA backends when
/// compiling device code with the Intel LLVM compiler; otherwise throws.
/// \param member_mask Bitmask of participating work-items.
/// \param g The sub-group to operate on.
/// \param x The value contributed by this work-item.
/// \param remote_local_id Index within the logical sub-group to read from.
/// \param logical_sub_group_size Size of each logical sub-group partition.
template <typename T>
T select_from_sub_group(unsigned int member_mask, sycl::sub_group g, T x,
                        int remote_local_id, int logical_sub_group_size = 32) {
  unsigned int start_index =
      g.get_local_linear_id() / logical_sub_group_size * logical_sub_group_size;
  unsigned logical_remote_id =
      start_index + remote_local_id % logical_sub_group_size;
#if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
#if defined(__SPIR__)
  // SPIR-V path: note member_mask is not consulted here; the shuffle runs
  // over the whole sub-group.
  return __spirv_GroupNonUniformShuffle(__spv::Scope::Subgroup, x,
                                        logical_remote_id);
#elif defined(__NVPTX__)
  // CUDA path: encode the logical sub-group bounds into the shfl control
  // word (segment width in the high byte, clamp value 31 in the low byte).
  int cVal = ((32 - logical_sub_group_size) << 8) | 31;
  return cuda_shfl_sync_idx_i32(member_mask, x, remote_local_id, cVal);
#else
  throw sycl::exception(sycl::errc::runtime,
                        "[SYCLcompat] Masked version of select_from_sub_group "
                        "only supports SPIR-V or cuda backends.");
#endif // __SPIR__
#else
  (void)g;
  (void)x;
  (void)remote_local_id;
  (void)logical_sub_group_size;
  (void)member_mask;
  throw sycl::exception(
      sycl::errc::runtime,
      "[SYCLcompat] Masked version of select_from_sub_group not "
      "supported on host device and non intel compiler.");
#endif // __SYCL_DEVICE_ONLY__ && __INTEL_LLVM_COMPILER
}
354 
/// Masked version of shift_sub_group_left, restricted to the work-items in
/// member_mask. Only supported on the SPIR-V and CUDA backends when
/// compiling device code with the Intel LLVM compiler; otherwise throws.
/// \param member_mask Bitmask of participating work-items.
/// \param g The sub-group to operate on.
/// \param x The value contributed by this work-item.
/// \param delta Shift distance.
/// \param logical_sub_group_size Size of each logical sub-group partition.
template <typename T>
T shift_sub_group_left(unsigned int member_mask, sycl::sub_group g, T x,
                       unsigned int delta, int logical_sub_group_size = 32) {
  unsigned int id = g.get_local_linear_id();
  unsigned int end_index =
      (id / logical_sub_group_size + 1) * logical_sub_group_size;
#if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
#if defined(__SPIR__)
  T result =
      __spirv_GroupNonUniformShuffleDown(__spv::Scope::Subgroup, x, delta);
  // Work-items whose source index falls past their logical sub-group keep
  // their own value.
  if ((id + delta) >= end_index) {
    result = x;
  }
  return result;
#elif defined(__NVPTX__)
  // CUDA path: the shfl control word performs the partition clamping that
  // the SPIR-V path does manually above.
  int cVal = ((32 - logical_sub_group_size) << 8) | 31;
  return cuda_shfl_sync_down_i32(member_mask, x, delta, cVal);
#else
  throw sycl::exception(sycl::errc::runtime,
                        "[SYCLcompat] Masked version of shift_sub_group_left "
                        "only supports SPIR-V or cuda backends.");
#endif // __SPIR__
#else
  (void)g;
  (void)x;
  (void)delta;
  (void)logical_sub_group_size;
  (void)member_mask;
  throw sycl::exception(
      sycl::errc::runtime,
      "[SYCLcompat] Masked version of shift_sub_group_left not "
      "supported on host device and non intel compiler.");
#endif // __SYCL_DEVICE_ONLY__ && __INTEL_LLVM_COMPILER
}
402 
416 template <typename T>
417 T shift_sub_group_right(unsigned int member_mask, sycl::sub_group g, T x,
418  unsigned int delta, int logical_sub_group_size = 32) {
419  unsigned int id = g.get_local_linear_id();
420  unsigned int start_index =
421  id / logical_sub_group_size * logical_sub_group_size;
422 #if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
423 #if defined(__SPIR__)
424  T result = __spirv_GroupNonUniformShuffleUp(__spv::Scope::Subgroup, x, delta);
425  if ((id - start_index) < delta) {
426  result = x;
427  }
428  return result;
429 #elif defined(__NVPTX__)
430  int cVal = ((32 - logical_sub_group_size) << 8);
431  return cuda_shfl_sync_up_i32(member_mask, x, delta, cVal);
432 #else
433  throw sycl::exception(sycl::errc::runtime,
434  "Masked version of shift_sub_group_right "
435  "only supports SPIR-V or cuda backends.");
436 #endif // __SPIR__
437 #else
438  (void)g;
439  (void)x;
440  (void)delta;
441  (void)logical_sub_group_size;
442  (void)member_mask;
443  throw sycl::exception(sycl::errc::runtime,
444  "Masked version of shift_sub_group_right not "
445  "supported on host device and non intel compiler.");
446 #endif // __SYCL_DEVICE_ONLY && __INTEL_LLVM_COMPILER
447 }
448 
462 template <typename T>
463 T permute_sub_group_by_xor(unsigned int member_mask, sycl::sub_group g, T x,
464  unsigned int mask, int logical_sub_group_size = 32) {
465  unsigned int id = g.get_local_linear_id();
466  unsigned int start_index =
467  id / logical_sub_group_size * logical_sub_group_size;
468  unsigned int target_offset = (id % logical_sub_group_size) ^ mask;
469  unsigned logical_remote_id = (target_offset < logical_sub_group_size)
470  ? start_index + target_offset
471  : id;
472 #if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
473 #if defined(__SPIR__)
474  return __spirv_GroupNonUniformShuffle(__spv::Scope::Subgroup, x,
475  logical_remote_id);
476 #elif defined(__NVPTX__)
477  int cVal = ((32 - logical_sub_group_size) << 8) | 31;
478  return cuda_shfl_sync_bfly_i32(member_mask, x, mask, cVal);
479 #else
480  throw sycl::exception(
481  sycl::errc::runtime,
482  "[SYCLcompat] Masked version of permute_sub_group_by_xor "
483  "only supports SPIR-V or cuda backends.");
484 #endif // __SPIR__
485 #else
486  (void)g;
487  (void)x;
488  (void)mask;
489  (void)logical_sub_group_size;
490  (void)member_mask;
491  throw sycl::exception(
492  sycl::errc::runtime,
493  "[SYCLcompat]Masked version of permute_sub_group_by_xor not "
494  "supported on host device and non intel compiler.");
495 #endif // __SYCL_DEVICE_ONLY__ && __INTEL_LLVM_COMPILER
496 }
497 } // namespace experimental
498 
/// Inherited from the original SYCLomatic compatibility headers.
/// \return The compiler-reported SYCL language version, or 202000 when the
/// SYCL_LANGUAGE_VERSION macro is not defined.
inline int get_sycl_language_version() {
#ifdef SYCL_LANGUAGE_VERSION
  return SYCL_LANGUAGE_VERSION;
#else
  return 202000;
#endif
}
508 
/// Compares `value` across the work-items of a sub-group selected by
/// member_mask. Returns, for each participating work-item, the bitmask of
/// participating work-items whose `value` compares equal to its own;
/// non-participating work-items receive 0 (their `result` is never updated).
/// \param g The sub-group to operate on.
/// \param member_mask Bitmask selecting participating work-items.
/// \param value The value to compare across work-items.
template <typename T>
unsigned int match_any_over_sub_group(sycl::sub_group g, unsigned member_mask,
                                      T value) {
  static_assert(std::is_arithmetic_v<T>, "Value type must be arithmetic type.");
  if (!member_mask) {
    return 0;
  }
  unsigned int id = g.get_local_linear_id();
  unsigned int flag = 0, result = 0, reduce_result = 0;
  unsigned int bit_index = 0x1 << id;
  bool is_participate = member_mask & bit_index;
  T broadcast_value = 0;
  bool matched = false;
  // Each iteration broadcasts the value of the lowest not-yet-grouped
  // participant (ctz of the remaining mask), then reduces the bitmask of
  // work-items holding an equal value. `flag` accumulates grouped items
  // until every participant has been assigned to some equality class.
  while (flag != member_mask) {
    broadcast_value =
        sycl::select_from_group(g, value, sycl::ctz((~flag & member_mask)));
    reduce_result = sycl::reduce_over_group(
        g, is_participate ? (broadcast_value == value ? bit_index : 0) : 0,
        sycl::plus<>());
    flag |= reduce_result;
    matched = reduce_result & bit_index;
    // Branchless select: adopt the newly found class mask only on the
    // work-items that belong to it.
    result = matched * reduce_result + (1 - matched) * result;
  }
  return result;
}
545 
/// Compares `value` across the work-items of a sub-group selected by
/// member_mask. Returns member_mask when this work-item participates and
/// every participating work-item holds an equal value, otherwise 0.
/// \param g The sub-group to operate on.
/// \param member_mask Bitmask selecting participating work-items.
/// \param value The value to compare across work-items.
/// \param [out] pred Set to 1 when this work-item participates and all
/// participants matched, 0 otherwise.
template <typename T>
unsigned int match_all_over_sub_group(sycl::sub_group g, unsigned member_mask,
                                      T value, int *pred) {
  static_assert(std::is_arithmetic_v<T>, "Value type must be arithmetic type.");
  if (!member_mask) {
    return 0;
  }
  unsigned int id = g.get_local_linear_id();
  unsigned int bit_index = 0x1 << id;
  bool is_participate = member_mask & bit_index;
  // Broadcast the value held by the lowest-indexed participant as reference.
  T broadcast_value = sycl::select_from_group(g, value, sycl::ctz(member_mask));
  // Sum of the bit indices of all participants that match the reference;
  // equals member_mask exactly when every participant matched.
  unsigned int reduce_result = sycl::reduce_over_group(
      g,
      (member_mask & bit_index) ? (broadcast_value == value ? bit_index : 0)
                                : 0,
      sycl::plus<>());
  bool all_equal = (reduce_result == member_mask);
  *pred = is_participate & all_equal;
  return (is_participate & all_equal) * member_mask;
}
579 
580 namespace experimental {
581 
582 // FIXME(@intel/syclcompat-lib-reviewers): unify once supported in the AMD
583 // backend.
#if defined(__AMDGPU__)
// Per the FIXME above: the AMD backend does not yet support the stronger
// ordering here, so fall back to acq_rel until it does.
constexpr sycl::memory_order barrier_memory_order = sycl::memory_order::acq_rel;
#else
constexpr sycl::memory_order barrier_memory_order = sycl::memory_order::seq_cst;
#endif
589 
597 template <int dimensions = 3>
598 inline void nd_range_barrier(
599  const sycl::nd_item<dimensions> &item,
602  sycl::access::address_space::global_space> &counter) {
603 
604  static_assert(dimensions == 3, "dimensions must be 3.");
605  constexpr unsigned int MSB32_MASK = 0x80000000;
606 
607  unsigned int num_groups = item.get_group_range(2) * item.get_group_range(1) *
608  item.get_group_range(0);
609 
610  item.barrier();
611 
612  if (item.get_local_linear_id() == 0) {
613  unsigned int inc = 1;
614  unsigned int old_arrive = 0;
615  bool is_group0 =
616  (item.get_group(2) + item.get_group(1) + item.get_group(0) == 0);
617  if (is_group0) {
618  inc = MSB32_MASK - (num_groups - 1);
619  }
620 
621  old_arrive = counter.fetch_add(inc);
622  // Synchronize all the work groups
623  while (((old_arrive ^ counter.load()) & MSB32_MASK) == 0)
624  ;
625  }
626 
627  item.barrier();
628 }
629 
637 template <>
638 inline void nd_range_barrier(
639  const sycl::nd_item<1> &item,
642  sycl::access::address_space::global_space> &counter) {
643  unsigned int num_groups = item.get_group_range(0);
644  constexpr unsigned int MSB32_MASK = 0x80000000;
645 
646  item.barrier();
647 
648  if (item.get_local_linear_id() == 0) {
649  unsigned int inc = 1;
650  unsigned int old_arrive = 0;
651  bool is_group0 = (item.get_group(0) == 0);
652  if (is_group0) {
653  inc = MSB32_MASK - (num_groups - 1);
654  }
655 
656  old_arrive = counter.fetch_add(inc);
657  // Synchronize all the work groups
658  while (((old_arrive ^ counter.load()) & MSB32_MASK) == 0)
659  ;
660  }
661 
662  item.barrier();
663 }
664 
669 template <int dimensions = 3> class logical_group {
672  uint32_t _logical_group_size;
673  uint32_t _group_linear_range_in_parent;
674 
675 public:
681  sycl::group<dimensions> parent_group, uint32_t size)
682  : _item(item), _g(parent_group), _logical_group_size(size) {
683  _group_linear_range_in_parent =
684  (_g.get_local_linear_range() - 1) / _logical_group_size + 1;
685  }
687  : _item(item), _g(item.get_group()) {}
689  uint32_t get_local_linear_id() const {
690  return _item.get_local_linear_id() % _logical_group_size;
691  }
693  uint32_t get_group_linear_id() const {
694  return _item.get_local_linear_id() / _logical_group_size;
695  }
697  uint32_t get_local_linear_range() const {
698  if (_g.get_local_linear_range() % _logical_group_size == 0) {
699  return _logical_group_size;
700  }
701  uint32_t last_item_group_id =
702  _g.get_local_linear_range() / _logical_group_size;
703  uint32_t first_of_last_group = last_item_group_id * _logical_group_size;
704  if (_item.get_local_linear_id() >= first_of_last_group) {
705  return _g.get_local_linear_range() - first_of_last_group;
706  } else {
707  return _logical_group_size;
708  }
709  }
711  uint32_t get_group_linear_range() const {
712  return _group_linear_range_in_parent;
713  }
714 };
715 
716 // The original source of the functions calculate_max_active_wg_per_xecore and
717 // calculate_max_potential_wg were under the license below:
718 //
719 // Copyright (C) Intel Corporation
720 //
721 // Permission is hereby granted, free of charge, to any person obtaining a copy
722 // of this software and associated documentation files (the "Software"), to deal
723 // in the Software without restriction, including without limitation the rights
724 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
725 // copies of the Software, and to permit persons to whom the Software is
726 // furnished to do so, subject to the following conditions:
727 //
728 // The above copyright notice and this permission notice shall be included in
729 // all copies or substantial portions of the Software.
730 //
731 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
732 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
733 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
734 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
735 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
736 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
737 // SOFTWARE.
738 //
751 inline int calculate_max_active_wg_per_xecore(int *num_wg, int wg_size,
752  int slm_size = 0,
753  int sg_size = 32,
754  bool used_barrier = false,
755  bool used_large_grf = false) {
756  int ret = 0;
757  const int slm_size_per_xe_core = 64 * 1024;
758  const int max_barrier_registers = 32;
760 
761  size_t max_wg_size = dev.get_info<sycl::info::device::max_work_group_size>();
762  if (wg_size > max_wg_size) {
763  wg_size = max_wg_size;
764  ret = -1;
765  }
766 
767  int num_threads_ss = 56;
768  int max_num_wg = 56;
769  if (dev.has(sycl::aspect::ext_intel_gpu_eu_count_per_subslice) &&
770  dev.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
771  auto eu_count =
772  dev.get_info<sycl::info::device::ext_intel_gpu_eu_count_per_subslice>();
773  auto threads_count =
774  dev.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
775  num_threads_ss = eu_count * threads_count;
776  max_num_wg = eu_count * threads_count;
777  }
778 
779  if (used_barrier) {
780  max_num_wg = max_barrier_registers;
781  }
782 
783  // Calculate num_wg_slm
784  int num_wg_slm = 0;
785  if (slm_size == 0) {
786  num_wg_slm = max_num_wg;
787  } else {
788  num_wg_slm = std::floor((float)slm_size_per_xe_core / slm_size);
789  }
790 
791  // Calculate num_wg_threads
792  if (used_large_grf)
793  num_threads_ss = num_threads_ss / 2;
794  int num_threads = std::ceil((float)wg_size / sg_size);
795  int num_wg_threads = std::floor((float)num_threads_ss / num_threads);
796 
797  // Calculate num_wg
798  *num_wg = std::min(num_wg_slm, num_wg_threads);
799  *num_wg = std::min(*num_wg, max_num_wg);
800  return ret;
801 }
802 
816 inline int calculate_max_potential_wg(int *num_wg, int *wg_size,
817  int max_wg_size_for_device_code,
818  int slm_size = 0, int sg_size = 32,
819  bool used_barrier = false,
820  bool used_large_grf = false) {
822  size_t max_wg_size = dev.get_info<sycl::info::device::max_work_group_size>();
823  if (max_wg_size_for_device_code == 0 ||
824  max_wg_size_for_device_code >= max_wg_size)
825  *wg_size = (int)max_wg_size;
826  else
827  *wg_size = max_wg_size_for_device_code;
828  calculate_max_active_wg_per_xecore(num_wg, *wg_size, slm_size, sg_size,
829  used_barrier, used_large_grf);
830  std::uint32_t num_ss = 1;
831  if (dev.has(sycl::aspect::ext_intel_gpu_slices) &&
832  dev.has(sycl::aspect::ext_intel_gpu_subslices_per_slice)) {
833  num_ss =
834  dev.get_info<sycl::ext::intel::info::device::gpu_slices>() *
835  dev.get_info<sycl::ext::intel::info::device::gpu_subslices_per_slice>();
836  }
837  num_wg[0] = num_ss * num_wg[0];
838  return 0;
839 }
840 
843 
/// The group_base dispatches each call to the specific interface based on
/// the stored group type.
/// NOTE(review): several lines of this class were lost in extraction — the
/// constructor and accessor signatures, the `case group_type::...:` labels
/// inside each switch, and the protected data members (the reference listing
/// shows `logical_group<dimensions> logical_group;` and
/// `sycl::nd_item<dimensions> nd_item;`). Restore them from the upstream
/// syclcompat header before compiling.
template <int dimensions = 3> class group_base {
public:
  // NOTE(review): constructor signature elided; per the reference listing it
  // is `group_base(sycl::nd_item<dimensions> item)`.
  : nd_item(item), logical_group(item) {}
  // Returns the number of work-items in the group.
  // NOTE(review): method signature elided; listing shows
  // `size_t get_local_linear_range()`.
  switch (type) {
  // NOTE(review): case labels elided here and in the switches below.
   return nd_item.get_group().get_local_linear_range();
   return nd_item.get_sub_group().get_local_linear_range();
  default:
   return -1; // Unknown group type
  }
 }
  // Returns the index of the work-item within the group.
  // NOTE(review): method signature elided; listing shows
  // `size_t get_local_linear_id()`.
  switch (type) {
   return nd_item.get_group().get_local_linear_id();
   return nd_item.get_sub_group().get_local_linear_id();
  default:
   return -1; // Unknown group type
  }
 }
 /// Wait for all work-items within the group to arrive before proceeding.
 void barrier() {
  switch (type) {
   sycl::group_barrier(nd_item.get_group());
   break;
   sycl::group_barrier(nd_item.get_sub_group());
   break;
  default:
   break;
  }
 }

protected:
  // NOTE(review): protected member declarations elided in extraction.
};
898 
/// Container type that can store the supported group types, dispatching via
/// group_base.
/// NOTE(review): lines of this class were lost in extraction — the
/// constructor signature (per the reference listing,
/// `group(GroupT g, sycl::nd_item<dimensions> item)`), the statements inside
/// the first two `if constexpr` branches (presumably assignments to the
/// inherited type tag), and the tail of the third `is_same_v` condition.
/// Restore them from the upstream syclcompat header before compiling.
template <typename GroupT, int dimensions = 3>
class group : public group_base<dimensions> {

public:
  : group_base<dimensions>(item) {
 if constexpr (std::is_same_v<GroupT, sycl::sub_group>) {
 } else if constexpr (std::is_same_v<GroupT, sycl::group<dimensions>>) {
 } else if constexpr (std::is_same_v<
  logical_group = g;
 }
 }
};
919 } // namespace experimental
920 
923 inline queue_ptr int_as_queue_ptr(uintptr_t x) {
924  return x <= 2 ? &get_default_queue() : reinterpret_cast<queue_ptr>(x);
925 }
926 
/// Primary template, declaration only; the R(Ts...) partial specialization
/// below provides the definition. (This declaration is required for the
/// specialization to compile.)
template <int n_nondefault_params, int n_default_params, typename T>
class args_selector;

/// Extracts typed kernel-launch arguments from either an array of argument
/// pointers (kernel_params) or a packed argument buffer (extra), as produced
/// by CUDA-style launch APIs.
/// \tparam n_nondefault_params Number of leading, explicitly passed params.
/// \tparam n_default_params Number of trailing defaulted params of R(Ts...).
template <int n_nondefault_params, int n_default_params, typename R,
          typename... Ts>
class args_selector<n_nondefault_params, n_default_params, R(Ts...)> {
private:
  void **kernel_params;
  char *args_buffer;

  /// Map logical index i onto the parameter pack: the first
  /// n_nondefault_params indices map directly; later indices skip over the
  /// omitted defaulted parameters.
  template <int i> static constexpr int account_for_default_params() {
    constexpr int n_total_params = sizeof...(Ts);
    if constexpr (i >= n_nondefault_params) {
      return n_total_params - n_default_params + (i - n_nondefault_params);
    } else {
      return i;
    }
  }

public:
  /// Get the type of the ith argument of R(Ts...).
  template <int i>
  using arg_type =
      std::tuple_element_t<account_for_default_params<i>(), std::tuple<Ts...>>;

private:
  /// Byte offset of argument i inside args_buffer, laying the arguments out
  /// in declaration order at their natural alignment.
  template <int i> static constexpr int get_offset() {
    if constexpr (i == 0) {
      // we can assume args_buffer is properly aligned to the
      // first argument
      return 0;
    } else {
      constexpr int prev_off = get_offset<i - 1>();
      constexpr int prev_past_end = prev_off + sizeof(arg_type<i - 1>);
      using T = arg_type<i>;
      // is the past-the-end of the i-1st element properly aligned
      // with the ith element's alignment?
      if constexpr (prev_past_end % alignof(T) == 0) {
        return prev_past_end;
      }
      // otherwise bump prev_past_end to match alignment
      else {
        return prev_past_end + (alignof(T) - (prev_past_end % alignof(T)));
      }
    }
  }

  /// Scan the null-terminated extra array for the tag value 1; the entry
  /// following the tag points at the packed argument buffer. Returns nullptr
  /// when extra is null or carries no buffer.
  static char *get_args_buffer(void **extra) {
    if (!extra)
      return nullptr;
    for (; (std::size_t)*extra != 0; ++extra) {
      if ((std::size_t)*extra == 1) {
        return static_cast<char *>(*(extra + 1));
      }
    }
    return nullptr;
  }

public:
  /// If kernel_params is nonnull, arguments are read through it; otherwise
  /// they are unpacked from the buffer located via extra.
  args_selector(void **kernel_params, void **extra)
      : kernel_params(kernel_params), args_buffer(get_args_buffer(extra)) {}

  /// Get a reference to the ith argument extracted from kernel_params or
  /// the extra argument buffer.
  template <int i> arg_type<i> &get() {
    if (kernel_params) {
      return *static_cast<arg_type<i> *>(kernel_params[i]);
    } else {
      return *reinterpret_cast<arg_type<i> *>(args_buffer + get_offset<i>());
    }
  }
};
1028 
1029 } // namespace syclcompat
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Definition: device.hpp:64
detail::is_device_info_desc< Param >::return_type get_info() const
Queries this SYCL device for information requested by the template parameter param.
Definition: device.hpp:223
bool has(aspect Aspect) const __SYCL_WARN_IMAGE_ASPECT(Aspect)
Indicates if the SYCL device has the given feature.
Definition: device.cpp:219
void wait()
Wait for the event.
Definition: event.cpp:46
Identifies an instance of the function object executing at each point in an nd_range.
Definition: nd_item.hpp:48
size_t get_local_linear_id() const
Definition: nd_item.hpp:101
group< Dimensions > get_group() const
Definition: nd_item.hpp:117
range< Dimensions > get_group_range() const
Definition: nd_item.hpp:148
void barrier(access::fence_space accessSpace=access::fence_space::global_and_local) const
Definition: nd_item.hpp:212
Encapsulates a single SYCL queue which schedules kernels on a SYCL device.
Definition: queue.hpp:111
class sycl::vec ///////////////////////// Provides a cross-patform vector class template that works e...
arg_type< i > & get()
Get a reference to the ith argument extracted from kernel_params or extra.
Definition: util.hpp:1020
args_selector(void **kernel_params, void **extra)
If kernel_params is nonnull, then args_selector will extract arguments from kernel_params.
Definition: util.hpp:1013
std::tuple_element_t< account_for_default_params< i >(), std::tuple< Ts... > > arg_type
Get the type of the ith argument of R(Ts...)
Definition: util.hpp:971
device extension
Definition: device.hpp:265
The group_base will dispatch the function call to the specific interface based on the group type.
Definition: util.hpp:846
size_t get_local_linear_range()
Returns the number of work-items in the group.
Definition: util.hpp:852
logical_group< dimensions > logical_group
Definition: util.hpp:894
group_base(sycl::nd_item< dimensions > item)
Definition: util.hpp:848
void barrier()
Wait for all the elements within the group to complete their execution before proceeding.
Definition: util.hpp:879
sycl::nd_item< dimensions > nd_item
Definition: util.hpp:895
size_t get_local_linear_id()
Returns the index of the work-item within the group.
Definition: util.hpp:865
Container type that can store supported group_types.
Definition: util.hpp:901
group(GroupT g, sycl::nd_item< dimensions > item)
Definition: util.hpp:906
The logical-group is a logical collection of some work-items within a work-group.
Definition: util.hpp:669
uint32_t get_group_linear_range() const
Returns the number of logical-group in the parent group.
Definition: util.hpp:711
uint32_t get_local_linear_range() const
Returns the number of work-items in the logical-group.
Definition: util.hpp:697
logical_group(sycl::nd_item< dimensions > item)
Definition: util.hpp:686
logical_group(sycl::nd_item< dimensions > item, sycl::group< dimensions > parent_group, uint32_t size)
Dividing parent_group into several logical-groups.
Definition: util.hpp:680
uint32_t get_group_linear_id() const
Returns the index of the logical-group in the parent group.
Definition: util.hpp:693
uint32_t get_local_linear_id() const
Returns the index of the work-item within the logical-group.
Definition: util.hpp:689
#define SYCL_EXTERNAL
__ESIMD_API std::enable_if_t<(sizeof(T) *N >=2)> wait(sycl::ext::intel::esimd::simd< T, N > value)
Create explicit scoreboard dependency to avoid device code motion across this call and preserve the v...
Definition: memory.hpp:422
__attribute__((always_inline)) auto invoke_simd(sycl
The invoke_simd free function invokes a SIMD function using all work-items in a sub_group.
float ceil(float)
std::enable_if_t<((std::is_same_v< std::decay_t< Group >, sub_group >||sycl::ext::oneapi::experimental::is_user_constructed_group_v< std::decay_t< Group >>) &&(std::is_trivially_copyable_v< T >||detail::is_vec< T >::value)), T > shift_group_left(Group g, T x, typename Group::linear_id_type delta=1)
std::enable_if_t<((std::is_same_v< std::decay_t< Group >, sub_group >||sycl::ext::oneapi::experimental::is_user_constructed_group_v< std::decay_t< Group >>) &&(std::is_trivially_copyable_v< T >||detail::is_vec< T >::value)), T > shift_group_right(Group g, T x, typename Group::linear_id_type delta=1)
void group_barrier(ext::oneapi::experimental::root_group< dimensions > G, memory_scope FenceScope=decltype(G)::fence_scope)
Definition: root_group.hpp:102
std::plus< T > plus
Definition: functional.hpp:18
std::enable_if_t<(is_group_v< std::decay_t< Group >> &&(detail::is_scalar_arithmetic< T >::value||(detail::is_complex< T >::value &&detail::is_multiplies< T, BinaryOperation >::value)) &&detail::is_native_op< T, BinaryOperation >::value), T > reduce_over_group(Group g, T x, BinaryOperation binary_op)
std::enable_if_t<((std::is_same_v< std::decay_t< Group >, sub_group >||sycl::ext::oneapi::experimental::is_user_constructed_group_v< std::decay_t< Group >>) &&(std::is_trivially_copyable_v< T >||detail::is_vec< T >::value)), T > select_from_group(Group g, T x, typename Group::id_type local_id)
float floor(float)
Definition: access.hpp:18
static sycl::event memcpy(sycl::queue q, void *to_ptr, const void *from_ptr, size_t size, const std::vector< sycl::event > &dep_events={})
Definition: memory.hpp:283
detail::complex_namespace::complex< ValueT > complex_type
Definition: math.hpp:48
void matrix_mem_copy(void *to_ptr, const void *from_ptr, int to_ld, int from_ld, int rows, int cols, int elem_size, sycl::queue queue=syclcompat::get_default_queue(), bool async=false)
Definition: util.hpp:86
static size_t get_offset(sycl::id< 3 > id, size_t slice, size_t pitch)
Definition: memory.hpp:297
int calculate_max_active_wg_per_xecore(int *num_wg, int wg_size, int slm_size=0, int sg_size=32, bool used_barrier=false, bool used_large_grf=false)
This function is used for occupancy calculation, it computes the max active work-group number per Xe-...
Definition: util.hpp:751
T shift_sub_group_left(unsigned int member_mask, sycl::sub_group g, T x, unsigned int delta, int logical_sub_group_size=32)
Masked version of shift_sub_group_left, which execute masked sub-group operation.
Definition: util.hpp:369
constexpr sycl::memory_order barrier_memory_order
Definition: util.hpp:587
group_type
Supported group types.
Definition: util.hpp:842
int calculate_max_potential_wg(int *num_wg, int *wg_size, int max_wg_size_for_device_code, int slm_size=0, int sg_size=32, bool used_barrier=false, bool used_large_grf=false)
This function is used for occupancy calculation, it computes the work-group number and the work-group...
Definition: util.hpp:816
T select_from_sub_group(unsigned int member_mask, sycl::sub_group g, T x, int remote_local_id, int logical_sub_group_size=32)
Masked version of select_from_sub_group, which execute masked sub-group operation.
Definition: util.hpp:324
T permute_sub_group_by_xor(unsigned int member_mask, sycl::sub_group g, T x, unsigned int mask, int logical_sub_group_size=32)
Masked version of permute_sub_group_by_xor, which execute masked sub-group operation.
Definition: util.hpp:463
T shift_sub_group_right(unsigned int member_mask, sycl::sub_group g, T x, unsigned int delta, int logical_sub_group_size=32)
Masked version of shift_sub_group_right, which execute masked sub-group operation.
Definition: util.hpp:417
void nd_range_barrier(const sycl::nd_item< dimensions > &item, sycl::atomic_ref< unsigned int, barrier_memory_order, sycl::memory_scope::device, sycl::access::address_space::global_space > &counter)
Synchronize work items from all work groups within a SYCL kernel.
Definition: util.hpp:598
double cast_ints_to_double(int high32, int low32)
Combine two integers, the first as the high 32 bits and the second as the low 32 bits,...
Definition: util.hpp:151
T select_from_sub_group(sycl::sub_group g, T x, int remote_local_id, int logical_sub_group_size=32)
select_from_sub_group allows work-items to obtain a copy of a value held by any other work-item in th...
Definition: util.hpp:217
queue_ptr int_as_queue_ptr(uintptr_t x)
If x <= 2, then return a pointer to the default queue; otherwise, return x reinterpreted as a queue_p...
Definition: util.hpp:923
unsigned int match_any_over_sub_group(sycl::sub_group g, unsigned member_mask, T value)
The function match_any_over_sub_group conducts a comparison of values across work-items within a sub-...
Definition: util.hpp:521
unsigned int byte_level_permute(unsigned int a, unsigned int b, unsigned int s)
Definition: util.hpp:181
static sycl::queue get_default_queue()
Util function to get the default queue of current device in device manager.
Definition: device.hpp:744
static device_ext & get_current_device()
Util function to get the current device.
Definition: device.hpp:772
unsigned int match_all_over_sub_group(sycl::sub_group g, unsigned member_mask, T value, int *pred)
The function match_all_over_sub_group conducts a comparison of values across work-items within a sub-...
Definition: util.hpp:560
T reverse_bits(T a)
Reverse the bit order of an unsigned integer.
Definition: util.hpp:160
int get_sycl_language_version()
Inherited from the original SYCLomatic compatibility headers.
Definition: util.hpp:501
int cast_double_to_int(double d, bool use_high32=true)
Cast the high or low 32 bits of a double to an integer.
Definition: util.hpp:139
T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask, int logical_sub_group_size=32)
permute_sub_group_by_xor permutes values by exchanging values held by pairs of work-items identified ...
Definition: util.hpp:297
T shift_sub_group_right(sycl::sub_group g, T x, unsigned int delta, int logical_sub_group_size=32)
shift_sub_group_right move values held by the work-items in a sub_group directly to another work-item...
Definition: util.hpp:269
T shift_sub_group_left(sycl::sub_group g, T x, unsigned int delta, int logical_sub_group_size=32)
shift_sub_group_left move values held by the work-items in a sub_group directly to another work-item ...
Definition: util.hpp:241
int ffs(T a)
Find position of first least significant set bit in an integer.
Definition: util.hpp:197
#define __SYCL_CONVERGENT__
Definition: spirv_ops.hpp:23
_Abi const simd< _Tp, _Abi > & noexcept
Definition: simd.hpp:1324
linear_id_type get_local_linear_id() const
Definition: sub_group.hpp:159