35 #include <type_traits>
42 #if defined(__NVPTX__)
47 #if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
73 operator T()
const {
return value; }
87 int from_ld,
int rows,
int cols,
int elem_size,
90 if (to_ptr == from_ptr && to_ld == from_ld) {
94 if (to_ld == from_ld) {
95 size_t copy_size = elem_size * ((cols - 1) * (
size_t)to_ld + rows);
97 detail::memcpy(queue, (
void *)to_ptr, (
void *)from_ptr, copy_size);
103 elem_size * from_ld, elem_size * rows, cols);
106 elem_size * to_ld, elem_size * from_ld,
107 elem_size * rows, cols));
121 template <
typename T>
123 int from_ld,
int rows,
int cols,
125 bool async =
false) {
127 matrix_mem_copy((
void *)to_ptr, (
void *)from_ptr, to_ld, from_ld, rows, cols,
128 sizeof(Ty), queue, async);
141 auto v1 = v0.as<sycl::int2>();
152 sycl::int2 v0{high32, low32};
161 static_assert(std::is_unsigned<T>::value && std::is_integral<T>::value,
162 "unsigned integer required");
166 size_t count = 4 *
sizeof(T);
167 mask = ~mask >> count;
169 a = ((a & mask) << count) | ((a & ~mask) >> count);
171 mask = mask ^ (mask << count);
185 ((((std::uint64_t)b << 32 | a) >> (s & 0x7) * 8) & 0xff) |
186 (((((std::uint64_t)b << 32 | a) >> ((s >> 4) & 0x7) * 8) & 0xff) << 8) |
187 (((((std::uint64_t)b << 32 | a) >> ((s >> 8) & 0x7) * 8) & 0xff) << 16) |
188 (((((std::uint64_t)b << 32 | a) >> ((s >> 12) & 0x7) * 8) & 0xff) << 24);
197 template <
typename T>
inline int ffs(T a) {
198 static_assert(std::is_integral<T>::value,
"integer required");
199 return (sycl::ctz(a) + 1) % (
sizeof(T) * 8 + 1);
216 template <
typename T>
218 int logical_sub_group_size = 32) {
219 unsigned int start_index =
222 g, x, start_index + remote_local_id % logical_sub_group_size);
240 template <
typename T>
242 int logical_sub_group_size = 32) {
244 unsigned int end_index =
245 (
id / logical_sub_group_size + 1) * logical_sub_group_size;
247 if ((
id + delta) >= end_index) {
268 template <
typename T>
270 int logical_sub_group_size = 32) {
272 unsigned int start_index =
273 id / logical_sub_group_size * logical_sub_group_size;
275 if ((
id - start_index) < delta) {
296 template <
typename T>
298 int logical_sub_group_size = 32) {
300 unsigned int start_index =
301 id / logical_sub_group_size * logical_sub_group_size;
302 unsigned int target_offset = (
id % logical_sub_group_size) ^ mask;
304 target_offset < logical_sub_group_size
305 ? start_index + target_offset
309 namespace experimental {
323 template <
typename T>
325 int remote_local_id,
int logical_sub_group_size = 32) {
326 unsigned int start_index =
328 unsigned logical_remote_id =
329 start_index + remote_local_id % logical_sub_group_size;
330 #if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
331 #if defined(__SPIR__)
334 #elif defined(__NVPTX__)
335 int cVal = ((32 - logical_sub_group_size) << 8) | 31;
336 return cuda_shfl_sync_idx_i32(member_mask, x, remote_local_id, cVal);
339 "[SYCLcompat] Masked version of select_from_sub_group "
340 "only supports SPIR-V or cuda backends.");
345 (void)remote_local_id;
346 (void)logical_sub_group_size;
350 "[SYCLcompat] Masked version of select_from_sub_group not "
351 "supported on host device and non intel compiler.");
368 template <
typename T>
370 unsigned int delta,
int logical_sub_group_size = 32) {
372 unsigned int end_index =
373 (
id / logical_sub_group_size + 1) * logical_sub_group_size;
374 #if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
375 #if defined(__SPIR__)
378 if ((
id + delta) >= end_index) {
382 #elif defined(__NVPTX__)
383 int cVal = ((32 - logical_sub_group_size) << 8) | 31;
384 return cuda_shfl_sync_down_i32(member_mask, x, delta, cVal);
387 "[SYCLcompat] Masked version of shift_sub_group_left "
388 "only supports SPIR-V or cuda backends.");
394 (void)logical_sub_group_size;
398 "[SYCLcompat] Masked version of shift_sub_group_left not "
399 "supported on host device and non intel compiler.");
416 template <
typename T>
418 unsigned int delta,
int logical_sub_group_size = 32) {
420 unsigned int start_index =
421 id / logical_sub_group_size * logical_sub_group_size;
422 #if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
423 #if defined(__SPIR__)
425 if ((
id - start_index) < delta) {
429 #elif defined(__NVPTX__)
430 int cVal = ((32 - logical_sub_group_size) << 8);
431 return cuda_shfl_sync_up_i32(member_mask, x, delta, cVal);
434 "Masked version of shift_sub_group_right "
435 "only supports SPIR-V or cuda backends.");
441 (void)logical_sub_group_size;
444 "Masked version of shift_sub_group_right not "
445 "supported on host device and non intel compiler.");
462 template <
typename T>
464 unsigned int mask,
int logical_sub_group_size = 32) {
466 unsigned int start_index =
467 id / logical_sub_group_size * logical_sub_group_size;
468 unsigned int target_offset = (
id % logical_sub_group_size) ^ mask;
469 unsigned logical_remote_id = (target_offset < logical_sub_group_size)
470 ? start_index + target_offset
472 #if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
473 #if defined(__SPIR__)
476 #elif defined(__NVPTX__)
477 int cVal = ((32 - logical_sub_group_size) << 8) | 31;
478 return cuda_shfl_sync_bfly_i32(member_mask, x, mask, cVal);
482 "[SYCLcompat] Masked version of permute_sub_group_by_xor "
483 "only supports SPIR-V or cuda backends.");
489 (void)logical_sub_group_size;
493 "[SYCLcompat]Masked version of permute_sub_group_by_xor not "
494 "supported on host device and non intel compiler.");
502 #ifdef SYCL_LANGUAGE_VERSION
503 return SYCL_LANGUAGE_VERSION;
520 template <
typename T>
523 static_assert(std::is_arithmetic_v<T>,
"Value type must be arithmetic type.");
528 unsigned int flag = 0, result = 0, reduce_result = 0;
529 unsigned int bit_index = 0x1 << id;
530 bool is_participate = member_mask & bit_index;
531 T broadcast_value = 0;
532 bool matched =
false;
533 while (flag != member_mask) {
537 g, is_participate ? (broadcast_value == value ? bit_index : 0) : 0,
539 flag |= reduce_result;
540 matched = reduce_result & bit_index;
541 result = matched * reduce_result + (1 - matched) * result;
559 template <
typename T>
561 T value,
int *pred) {
562 static_assert(std::is_arithmetic_v<T>,
"Value type must be arithmetic type.");
567 unsigned int bit_index = 0x1 << id;
568 bool is_participate = member_mask & bit_index;
572 (member_mask & bit_index) ? (broadcast_value == value ? bit_index : 0)
575 bool all_equal = (reduce_result == member_mask);
576 *pred = is_participate & all_equal;
577 return (is_participate & all_equal) * member_mask;
580 namespace experimental {
584 #if defined(__AMDGPU__)
597 template <
int dimensions = 3>
602 sycl::access::address_space::global_space> &counter) {
604 static_assert(dimensions == 3,
"dimensions must be 3.");
605 constexpr
unsigned int MSB32_MASK = 0x80000000;
613 unsigned int inc = 1;
614 unsigned int old_arrive = 0;
618 inc = MSB32_MASK - (num_groups - 1);
621 old_arrive = counter.fetch_add(inc);
623 while (((old_arrive ^ counter.load()) & MSB32_MASK) == 0)
642 sycl::access::address_space::global_space> &counter) {
644 constexpr
unsigned int MSB32_MASK = 0x80000000;
649 unsigned int inc = 1;
650 unsigned int old_arrive = 0;
651 bool is_group0 = (item.
get_group(0) == 0);
653 inc = MSB32_MASK - (num_groups - 1);
656 old_arrive = counter.fetch_add(inc);
658 while (((old_arrive ^ counter.load()) & MSB32_MASK) == 0)
672 uint32_t _logical_group_size;
673 uint32_t _group_linear_range_in_parent;
682 : _item(item), _g(parent_group), _logical_group_size(size) {
683 _group_linear_range_in_parent =
684 (_g.get_local_linear_range() - 1) / _logical_group_size + 1;
687 : _item(item), _g(item.get_group()) {}
690 return _item.get_local_linear_id() % _logical_group_size;
694 return _item.get_local_linear_id() / _logical_group_size;
698 if (_g.get_local_linear_range() % _logical_group_size == 0) {
699 return _logical_group_size;
701 uint32_t last_item_group_id =
702 _g.get_local_linear_range() / _logical_group_size;
703 uint32_t first_of_last_group = last_item_group_id * _logical_group_size;
704 if (_item.get_local_linear_id() >= first_of_last_group) {
705 return _g.get_local_linear_range() - first_of_last_group;
707 return _logical_group_size;
712 return _group_linear_range_in_parent;
754 bool used_barrier =
false,
755 bool used_large_grf =
false) {
757 const int slm_size_per_xe_core = 64 * 1024;
758 const int max_barrier_registers = 32;
761 size_t max_wg_size = dev.get_info<sycl::info::device::max_work_group_size>();
762 if (wg_size > max_wg_size) {
763 wg_size = max_wg_size;
767 int num_threads_ss = 56;
769 if (dev.has(sycl::aspect::ext_intel_gpu_eu_count_per_subslice) &&
770 dev.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
772 dev.get_info<sycl::info::device::ext_intel_gpu_eu_count_per_subslice>();
774 dev.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
775 num_threads_ss = eu_count * threads_count;
776 max_num_wg = eu_count * threads_count;
780 max_num_wg = max_barrier_registers;
786 num_wg_slm = max_num_wg;
788 num_wg_slm =
std::floor((
float)slm_size_per_xe_core / slm_size);
793 num_threads_ss = num_threads_ss / 2;
794 int num_threads =
std::ceil((
float)wg_size / sg_size);
795 int num_wg_threads =
std::floor((
float)num_threads_ss / num_threads);
798 *num_wg = std::min(num_wg_slm, num_wg_threads);
799 *num_wg = std::min(*num_wg, max_num_wg);
817 int max_wg_size_for_device_code,
818 int slm_size = 0,
int sg_size = 32,
819 bool used_barrier =
false,
820 bool used_large_grf =
false) {
822 size_t max_wg_size = dev.
get_info<sycl::info::device::max_work_group_size>();
823 if (max_wg_size_for_device_code == 0 ||
824 max_wg_size_for_device_code >= max_wg_size)
825 *wg_size = (int)max_wg_size;
827 *wg_size = max_wg_size_for_device_code;
829 used_barrier, used_large_grf);
830 std::uint32_t num_ss = 1;
831 if (dev.
has(sycl::aspect::ext_intel_gpu_slices) &&
832 dev.
has(sycl::aspect::ext_intel_gpu_subslices_per_slice)) {
834 dev.
get_info<sycl::ext::intel::info::device::gpu_slices>() *
835 dev.
get_info<sycl::ext::intel::info::device::gpu_subslices_per_slice>();
837 num_wg[0] = num_ss * num_wg[0];
855 return nd_item.get_group().get_local_linear_range();
857 return nd_item.get_sub_group().get_local_linear_range();
868 return nd_item.get_group().get_local_linear_id();
870 return nd_item.get_sub_group().get_local_linear_id();
900 template <
typename GroupT,
int dimensions = 3>
908 if constexpr (std::is_same_v<GroupT, sycl::sub_group>) {
912 }
else if constexpr (std::is_same_v<
924 return x <= 2 ? &get_default_queue() : reinterpret_cast<queue_ptr>(x);
927 template <
int n_nondefault_params,
int n_default_params,
typename T>
949 template <
int n_nondefault_params,
int n_default_params,
typename R,
953 void **kernel_params;
956 template <
int i>
static constexpr
int account_for_default_params() {
957 constexpr
int n_total_params =
sizeof...(Ts);
958 if constexpr (i >= n_nondefault_params) {
959 return n_total_params - n_default_params + (i - n_nondefault_params);
971 std::tuple_element_t<account_for_default_params<i>(), std::tuple<Ts...>>;
974 template <
int i>
static constexpr
int get_offset() {
975 if constexpr (i == 0) {
981 constexpr
int prev_past_end = prev_off +
sizeof(
arg_type<i - 1>);
985 if constexpr (prev_past_end %
alignof(T) == 0) {
986 return prev_past_end;
990 return prev_past_end + (
alignof(T) - (prev_past_end %
alignof(T)));
995 static char *get_args_buffer(
void **extra) {
998 for (; (std::size_t)*extra != 0; ++extra) {
999 if ((std::size_t)*extra == 1) {
1000 return static_cast<char *
>(*(extra + 1));
1014 : kernel_params(kernel_params), args_buffer(get_args_buffer(extra)) {}
1021 if (kernel_params) {
1022 return *
static_cast<arg_type<i> *
>(kernel_params[i]);
1024 return *
reinterpret_cast<arg_type<i> *
>(args_buffer + get_offset<i>());
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
detail::is_device_info_desc< Param >::return_type get_info() const
Queries this SYCL device for information requested by the template parameter param.
bool has(aspect Aspect) const __SYCL_WARN_IMAGE_ASPECT(Aspect)
Indicates if the SYCL device has the given feature.
void wait()
Wait for the event.
Identifies an instance of the function object executing at each point in an nd_range.
size_t get_local_linear_id() const
group< Dimensions > get_group() const
range< Dimensions > get_group_range() const
void barrier(access::fence_space accessSpace=access::fence_space::global_and_local) const
Encapsulates a single SYCL queue which schedules kernels on a SYCL device.
class sycl::vec ///////////////////////// Provides a cross-platform vector class template that works e...
arg_type< i > & get()
Get a reference to the ith argument extracted from kernel_params or extra.
args_selector(void **kernel_params, void **extra)
If kernel_params is nonnull, then args_selector will extract arguments from kernel_params.
std::tuple_element_t< account_for_default_params< i >(), std::tuple< Ts... > > arg_type
Get the type of the ith argument of R(Ts...)
generic_error_type(T value)
generic_error_type()=default
The group_base will dispatch the function call to the specific interface based on the group type.
size_t get_local_linear_range()
Returns the number of work-items in the group.
logical_group< dimensions > logical_group
group_base(sycl::nd_item< dimensions > item)
void barrier()
Wait for all the elements within the group to complete their execution before proceeding.
sycl::nd_item< dimensions > nd_item
size_t get_local_linear_id()
Returns the index of the work-item within the group.
Container type that can store supported group_types.
group(GroupT g, sycl::nd_item< dimensions > item)
The logical-group is a logical collection of some work-items within a work-group.
uint32_t get_group_linear_range() const
Returns the number of logical-group in the parent group.
uint32_t get_local_linear_range() const
Returns the number of work-items in the logical-group.
logical_group(sycl::nd_item< dimensions > item)
logical_group(sycl::nd_item< dimensions > item, sycl::group< dimensions > parent_group, uint32_t size)
Dividing parent_group into several logical-groups.
uint32_t get_group_linear_id() const
Returns the index of the logical-group in the parent group.
uint32_t get_local_linear_id() const
Returns the index of the work-item within the logical-group.
__ESIMD_API std::enable_if_t<(sizeof(T) *N >=2)> wait(sycl::ext::intel::esimd::simd< T, N > value)
Create explicit scoreboard dependency to avoid device code motion across this call and preserve the v...
__attribute__((always_inline)) auto invoke_simd(sycl
The invoke_simd free function invokes a SIMD function using all work-items in a sub_group.
std::enable_if_t<((std::is_same_v< std::decay_t< Group >, sub_group >||sycl::ext::oneapi::experimental::is_user_constructed_group_v< std::decay_t< Group >>) &&(std::is_trivially_copyable_v< T >||detail::is_vec< T >::value)), T > shift_group_left(Group g, T x, typename Group::linear_id_type delta=1)
std::enable_if_t<((std::is_same_v< std::decay_t< Group >, sub_group >||sycl::ext::oneapi::experimental::is_user_constructed_group_v< std::decay_t< Group >>) &&(std::is_trivially_copyable_v< T >||detail::is_vec< T >::value)), T > shift_group_right(Group g, T x, typename Group::linear_id_type delta=1)
void group_barrier(ext::oneapi::experimental::root_group< dimensions > G, memory_scope FenceScope=decltype(G)::fence_scope)
std::enable_if_t<(is_group_v< std::decay_t< Group >> &&(detail::is_scalar_arithmetic< T >::value||(detail::is_complex< T >::value &&detail::is_multiplies< T, BinaryOperation >::value)) &&detail::is_native_op< T, BinaryOperation >::value), T > reduce_over_group(Group g, T x, BinaryOperation binary_op)
std::enable_if_t<((std::is_same_v< std::decay_t< Group >, sub_group >||sycl::ext::oneapi::experimental::is_user_constructed_group_v< std::decay_t< Group >>) &&(std::is_trivially_copyable_v< T >||detail::is_vec< T >::value)), T > select_from_group(Group g, T x, typename Group::id_type local_id)
static sycl::event memcpy(sycl::queue q, void *to_ptr, const void *from_ptr, size_t size, const std::vector< sycl::event > &dep_events={})
detail::complex_namespace::complex< ValueT > complex_type
void matrix_mem_copy(void *to_ptr, const void *from_ptr, int to_ld, int from_ld, int rows, int cols, int elem_size, sycl::queue queue=syclcompat::get_default_queue(), bool async=false)
static size_t get_offset(sycl::id< 3 > id, size_t slice, size_t pitch)
int calculate_max_active_wg_per_xecore(int *num_wg, int wg_size, int slm_size=0, int sg_size=32, bool used_barrier=false, bool used_large_grf=false)
This function is used for occupancy calculation, it computes the max active work-group number per Xe-...
T shift_sub_group_left(unsigned int member_mask, sycl::sub_group g, T x, unsigned int delta, int logical_sub_group_size=32)
Masked version of shift_sub_group_left, which executes a masked sub-group operation.
constexpr sycl::memory_order barrier_memory_order
group_type
Supported group types.
int calculate_max_potential_wg(int *num_wg, int *wg_size, int max_wg_size_for_device_code, int slm_size=0, int sg_size=32, bool used_barrier=false, bool used_large_grf=false)
This function is used for occupancy calculation, it computes the work-group number and the work-group...
T select_from_sub_group(unsigned int member_mask, sycl::sub_group g, T x, int remote_local_id, int logical_sub_group_size=32)
Masked version of select_from_sub_group, which executes a masked sub-group operation.
T permute_sub_group_by_xor(unsigned int member_mask, sycl::sub_group g, T x, unsigned int mask, int logical_sub_group_size=32)
Masked version of permute_sub_group_by_xor, which executes a masked sub-group operation.
T shift_sub_group_right(unsigned int member_mask, sycl::sub_group g, T x, unsigned int delta, int logical_sub_group_size=32)
Masked version of shift_sub_group_right, which executes a masked sub-group operation.
void nd_range_barrier(const sycl::nd_item< dimensions > &item, sycl::atomic_ref< unsigned int, barrier_memory_order, sycl::memory_scope::device, sycl::access::address_space::global_space > &counter)
Synchronize work items from all work groups within a SYCL kernel.
double cast_ints_to_double(int high32, int low32)
Combine two integers, the first as the high 32 bits and the second as the low 32 bits,...
T select_from_sub_group(sycl::sub_group g, T x, int remote_local_id, int logical_sub_group_size=32)
select_from_sub_group allows work-items to obtain a copy of a value held by any other work-item in th...
queue_ptr int_as_queue_ptr(uintptr_t x)
If x <= 2, then return a pointer to the default queue; otherwise, return x reinterpreted as a queue_p...
unsigned int match_any_over_sub_group(sycl::sub_group g, unsigned member_mask, T value)
The function match_any_over_sub_group conducts a comparison of values across work-items within a sub-...
unsigned int byte_level_permute(unsigned int a, unsigned int b, unsigned int s)
static sycl::queue get_default_queue()
Util function to get the default queue of current device in device manager.
static device_ext & get_current_device()
Util function to get the current device.
unsigned int match_all_over_sub_group(sycl::sub_group g, unsigned member_mask, T value, int *pred)
The function match_all_over_sub_group conducts a comparison of values across work-items within a sub-...
T reverse_bits(T a)
Reverse the bit order of an unsigned integer.
int get_sycl_language_version()
Inherited from the original SYCLomatic compatibility headers.
int cast_double_to_int(double d, bool use_high32=true)
Cast the high or low 32 bits of a double to an integer.
T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask, int logical_sub_group_size=32)
permute_sub_group_by_xor permutes values by exchanging values held by pairs of work-items identified ...
T shift_sub_group_right(sycl::sub_group g, T x, unsigned int delta, int logical_sub_group_size=32)
shift_sub_group_right move values held by the work-items in a sub_group directly to another work-item...
T shift_sub_group_left(sycl::sub_group g, T x, unsigned int delta, int logical_sub_group_size=32)
shift_sub_group_left move values held by the work-items in a sub_group directly to another work-item ...
int ffs(T a)
Find the position of the least significant set bit in an integer.
#define __SYCL_CONVERGENT__
_Abi const simd< _Tp, _Abi > & noexcept
linear_id_type get_local_linear_id() const
detail::complex_type< T > T2