35 #include <type_traits>
43 #if defined(__NVPTX__)
48 #if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
74 operator T()
const {
return value; }
88 int from_ld,
int rows,
int cols,
int elem_size,
91 if (to_ptr == from_ptr && to_ld == from_ld) {
95 if (to_ld == from_ld) {
96 size_t copy_size = elem_size * ((cols - 1) * (
size_t)to_ld + rows);
98 detail::memcpy(queue, (
void *)to_ptr, (
void *)from_ptr, copy_size);
104 elem_size * from_ld, elem_size * rows, cols);
107 elem_size * to_ld, elem_size * from_ld,
108 elem_size * rows, cols));
122 template <
typename T>
124 int from_ld,
int rows,
int cols,
126 bool async =
false) {
128 matrix_mem_copy((
void *)to_ptr, (
void *)from_ptr, to_ld, from_ld, rows, cols,
129 sizeof(Ty), queue, async);
142 auto v1 = v0.as<sycl::int2>();
153 sycl::int2 v0{high32, low32};
162 static_assert(std::is_unsigned<T>::value && std::is_integral<T>::value,
163 "unsigned integer required");
167 size_t count = 4 *
sizeof(T);
168 mask = ~mask >> count;
170 a = ((a & mask) << count) | ((a & ~mask) >> count);
172 mask = mask ^ (mask << count);
186 ((((std::uint64_t)b << 32 | a) >> (s & 0x7) * 8) & 0xff) |
187 (((((std::uint64_t)b << 32 | a) >> ((s >> 4) & 0x7) * 8) & 0xff) << 8) |
188 (((((std::uint64_t)b << 32 | a) >> ((s >> 8) & 0x7) * 8) & 0xff) << 16) |
189 (((((std::uint64_t)b << 32 | a) >> ((s >> 12) & 0x7) * 8) & 0xff) << 24);
198 template <
typename T>
inline int ffs(T a) {
199 static_assert(std::is_integral<T>::value,
"integer required");
200 return (sycl::ctz(a) + 1) % (
sizeof(T) * 8 + 1);
217 template <
typename T>
219 int logical_sub_group_size = 32) {
220 unsigned int start_index =
223 g, x, start_index + remote_local_id % logical_sub_group_size);
241 template <
typename T>
243 int logical_sub_group_size = 32) {
245 unsigned int end_index =
246 (
id / logical_sub_group_size + 1) * logical_sub_group_size;
248 if ((
id + delta) >= end_index) {
269 template <
typename T>
271 int logical_sub_group_size = 32) {
273 unsigned int start_index =
274 id / logical_sub_group_size * logical_sub_group_size;
276 if ((
id - start_index) < delta) {
297 template <
typename T>
299 int logical_sub_group_size = 32) {
301 unsigned int start_index =
302 id / logical_sub_group_size * logical_sub_group_size;
303 unsigned int target_offset = (
id % logical_sub_group_size) ^ mask;
305 target_offset < logical_sub_group_size
306 ? start_index + target_offset
310 namespace experimental {
324 template <
typename T>
326 int remote_local_id,
int logical_sub_group_size = 32) {
327 unsigned int start_index =
329 unsigned logical_remote_id =
330 start_index + remote_local_id % logical_sub_group_size;
331 #if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
332 #if defined(__SPIR__)
335 #elif defined(__NVPTX__)
336 int cVal = ((32 - logical_sub_group_size) << 8) | 31;
337 return cuda_shfl_sync_idx_i32(member_mask, x, remote_local_id, cVal);
340 "[SYCLcompat] Masked version of select_from_sub_group "
341 "only supports SPIR-V or cuda backends.");
346 (void)remote_local_id;
347 (void)logical_sub_group_size;
351 "[SYCLcompat] Masked version of select_from_sub_group not "
352 "supported on host device and non intel compiler.");
369 template <
typename T>
371 unsigned int delta,
int logical_sub_group_size = 32) {
373 unsigned int end_index =
374 (
id / logical_sub_group_size + 1) * logical_sub_group_size;
375 #if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
376 #if defined(__SPIR__)
379 if ((
id + delta) >= end_index) {
383 #elif defined(__NVPTX__)
384 int cVal = ((32 - logical_sub_group_size) << 8) | 31;
385 return cuda_shfl_sync_down_i32(member_mask, x, delta, cVal);
388 "[SYCLcompat] Masked version of shift_sub_group_left "
389 "only supports SPIR-V or cuda backends.");
395 (void)logical_sub_group_size;
399 "[SYCLcompat] Masked version of shift_sub_group_left not "
400 "supported on host device and non intel compiler.");
417 template <
typename T>
419 unsigned int delta,
int logical_sub_group_size = 32) {
421 unsigned int start_index =
422 id / logical_sub_group_size * logical_sub_group_size;
423 #if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
424 #if defined(__SPIR__)
426 if ((
id - start_index) < delta) {
430 #elif defined(__NVPTX__)
431 int cVal = ((32 - logical_sub_group_size) << 8);
432 return cuda_shfl_sync_up_i32(member_mask, x, delta, cVal);
435 "Masked version of shift_sub_group_right "
436 "only supports SPIR-V or cuda backends.");
442 (void)logical_sub_group_size;
445 "Masked version of shift_sub_group_right not "
446 "supported on host device and non intel compiler.");
463 template <
typename T>
465 unsigned int mask,
int logical_sub_group_size = 32) {
467 unsigned int start_index =
468 id / logical_sub_group_size * logical_sub_group_size;
469 unsigned int target_offset = (
id % logical_sub_group_size) ^ mask;
470 unsigned logical_remote_id = (target_offset < logical_sub_group_size)
471 ? start_index + target_offset
473 #if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
474 #if defined(__SPIR__)
477 #elif defined(__NVPTX__)
478 int cVal = ((32 - logical_sub_group_size) << 8) | 31;
479 return cuda_shfl_sync_bfly_i32(member_mask, x, mask, cVal);
483 "[SYCLcompat] Masked version of permute_sub_group_by_xor "
484 "only supports SPIR-V or cuda backends.");
490 (void)logical_sub_group_size;
494 "[SYCLcompat]Masked version of permute_sub_group_by_xor not "
495 "supported on host device and non intel compiler.");
503 #ifdef SYCL_LANGUAGE_VERSION
504 return SYCL_LANGUAGE_VERSION;
521 template <
typename T>
524 static_assert(std::is_arithmetic_v<T>,
"Value type must be arithmetic type.");
529 unsigned int flag = 0, result = 0, reduce_result = 0;
530 unsigned int bit_index = 0x1 << id;
531 bool is_participate = member_mask & bit_index;
532 T broadcast_value = 0;
533 bool matched =
false;
534 while (flag != member_mask) {
538 g, is_participate ? (broadcast_value == value ? bit_index : 0) : 0,
540 flag |= reduce_result;
541 matched = reduce_result & bit_index;
542 result = matched * reduce_result + (1 - matched) * result;
560 template <
typename T>
562 T value,
int *pred) {
563 static_assert(std::is_arithmetic_v<T>,
"Value type must be arithmetic type.");
568 unsigned int bit_index = 0x1 << id;
569 bool is_participate = member_mask & bit_index;
573 (member_mask & bit_index) ? (broadcast_value == value ? bit_index : 0)
576 bool all_equal = (reduce_result == member_mask);
577 *pred = is_participate & all_equal;
578 return (is_participate & all_equal) * member_mask;
581 namespace experimental {
585 #if defined(__AMDGPU__) || defined(__NVPTX__)
598 template <
int dimensions = 3>
603 sycl::access::address_space::global_space> &counter) {
605 static_assert(dimensions == 3,
"dimensions must be 3.");
606 constexpr
unsigned int MSB32_MASK = 0x80000000;
614 unsigned int inc = 1;
615 unsigned int old_arrive = 0;
619 inc = MSB32_MASK - (num_groups - 1);
622 old_arrive = counter.fetch_add(inc);
624 while (((old_arrive ^ counter.load()) & MSB32_MASK) == 0)
643 sycl::access::address_space::global_space> &counter) {
645 constexpr
unsigned int MSB32_MASK = 0x80000000;
650 unsigned int inc = 1;
651 unsigned int old_arrive = 0;
652 bool is_group0 = (item.
get_group(0) == 0);
654 inc = MSB32_MASK - (num_groups - 1);
657 old_arrive = counter.fetch_add(inc);
659 while (((old_arrive ^ counter.load()) & MSB32_MASK) == 0)
673 uint32_t _logical_group_size;
674 uint32_t _group_linear_range_in_parent;
683 : _item(item), _g(parent_group), _logical_group_size(size) {
684 _group_linear_range_in_parent =
685 (_g.get_local_linear_range() - 1) / _logical_group_size + 1;
688 : _item(item), _g(item.get_group()) {}
691 return _item.get_local_linear_id() % _logical_group_size;
695 return _item.get_local_linear_id() / _logical_group_size;
699 if (_g.get_local_linear_range() % _logical_group_size == 0) {
700 return _logical_group_size;
702 uint32_t last_item_group_id =
703 _g.get_local_linear_range() / _logical_group_size;
704 uint32_t first_of_last_group = last_item_group_id * _logical_group_size;
705 if (_item.get_local_linear_id() >= first_of_last_group) {
706 return _g.get_local_linear_range() - first_of_last_group;
708 return _logical_group_size;
713 return _group_linear_range_in_parent;
755 bool used_barrier =
false,
756 bool used_large_grf =
false) {
758 const int slm_size_per_xe_core = 64 * 1024;
759 const int max_barrier_registers = 32;
762 size_t max_wg_size = dev.get_info<sycl::info::device::max_work_group_size>();
763 if (wg_size > max_wg_size) {
764 wg_size = max_wg_size;
768 int num_threads_ss = 56;
770 if (dev.has(sycl::aspect::ext_intel_gpu_eu_count_per_subslice) &&
771 dev.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
773 dev.get_info<sycl::info::device::ext_intel_gpu_eu_count_per_subslice>();
775 dev.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
776 num_threads_ss = eu_count * threads_count;
777 max_num_wg = eu_count * threads_count;
781 max_num_wg = max_barrier_registers;
787 num_wg_slm = max_num_wg;
789 num_wg_slm =
std::floor((
float)slm_size_per_xe_core / slm_size);
794 num_threads_ss = num_threads_ss / 2;
795 int num_threads =
std::ceil((
float)wg_size / sg_size);
796 int num_wg_threads =
std::floor((
float)num_threads_ss / num_threads);
799 *num_wg = std::min(num_wg_slm, num_wg_threads);
800 *num_wg = std::min(*num_wg, max_num_wg);
818 int max_wg_size_for_device_code,
819 int slm_size = 0,
int sg_size = 32,
820 bool used_barrier =
false,
821 bool used_large_grf =
false) {
823 size_t max_wg_size = dev.
get_info<sycl::info::device::max_work_group_size>();
824 if (max_wg_size_for_device_code == 0 ||
825 max_wg_size_for_device_code >= max_wg_size)
826 *wg_size = (int)max_wg_size;
828 *wg_size = max_wg_size_for_device_code;
830 used_barrier, used_large_grf);
831 std::uint32_t num_ss = 1;
832 if (dev.
has(sycl::aspect::ext_intel_gpu_slices) &&
833 dev.
has(sycl::aspect::ext_intel_gpu_subslices_per_slice)) {
835 dev.
get_info<sycl::ext::intel::info::device::gpu_slices>() *
836 dev.
get_info<sycl::ext::intel::info::device::gpu_subslices_per_slice>();
838 num_wg[0] = num_ss * num_wg[0];
856 return nd_item.get_group().get_local_linear_range();
858 return nd_item.get_sub_group().get_local_linear_range();
869 return nd_item.get_group().get_local_linear_id();
871 return nd_item.get_sub_group().get_local_linear_id();
901 template <
typename GroupT,
int dimensions = 3>
909 if constexpr (std::is_same_v<GroupT, sycl::sub_group>) {
913 }
else if constexpr (std::is_same_v<
929 template <
int n_nondefault_params,
int n_default_params,
typename T>
951 template <
int n_nondefault_params,
int n_default_params,
typename R,
955 void **kernel_params;
958 template <
int i>
static constexpr
int account_for_default_params() {
959 constexpr
int n_total_params =
sizeof...(Ts);
960 if constexpr (i >= n_nondefault_params) {
961 return n_total_params - n_default_params + (i - n_nondefault_params);
973 std::tuple_element_t<account_for_default_params<i>(), std::tuple<Ts...>>;
976 template <
int i>
static constexpr
int get_offset() {
977 if constexpr (i == 0) {
983 constexpr
int prev_past_end = prev_off +
sizeof(
arg_type<i - 1>);
987 if constexpr (prev_past_end %
alignof(T) == 0) {
988 return prev_past_end;
992 return prev_past_end + (
alignof(T) - (prev_past_end %
alignof(T)));
997 static char *get_args_buffer(
void **extra) {
1000 for (; (std::size_t)*extra != 0; ++extra) {
1001 if ((std::size_t)*extra == 1) {
1002 return static_cast<char *
>(*(extra + 1));
1016 : kernel_params(kernel_params), args_buffer(get_args_buffer(extra)) {}
1023 if (kernel_params) {
1024 return *
static_cast<arg_type<i> *
>(kernel_params[i]);
1026 return *
reinterpret_cast<arg_type<i> *
>(args_buffer + get_offset<i>());
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
detail::is_device_info_desc< Param >::return_type get_info() const
Queries this SYCL device for information requested by the template parameter param.
bool has(aspect Aspect) const __SYCL_WARN_IMAGE_ASPECT(Aspect)
Indicates if the SYCL device has the given feature.
void wait()
Wait for the event.
Identifies an instance of the function object executing at each point in an nd_range.
size_t get_local_linear_id() const
group< Dimensions > get_group() const
range< Dimensions > get_group_range() const
void barrier(access::fence_space accessSpace=access::fence_space::global_and_local) const
Encapsulates a single SYCL queue which schedules kernels on a SYCL device.
arg_type< i > & get()
Get a reference to the ith argument extracted from kernel_params or extra.
args_selector(void **kernel_params, void **extra)
If kernel_params is nonnull, then args_selector will extract arguments from kernel_params.
std::tuple_element_t< account_for_default_params< i >(), std::tuple< Ts... > > arg_type
Get the type of the ith argument of R(Ts...)
static dev_mgr & instance()
Returns the instance of device manager singleton.
device_ext & current_device()
generic_error_type(T value)
generic_error_type()=default
queue_ptr default_queue()
The group_base will dispatch the function call to the specific interface based on the group type.
size_t get_local_linear_range()
Returns the number of work-items in the group.
logical_group< dimensions > logical_group
group_base(sycl::nd_item< dimensions > item)
void barrier()
Wait for all the elements within the group to complete their execution before proceeding.
sycl::nd_item< dimensions > nd_item
size_t get_local_linear_id()
Returns the index of the work-item within the group.
Container type that can store supported group_types.
group(GroupT g, sycl::nd_item< dimensions > item)
The logical-group is a logical collection of some work-items within a work-group.
uint32_t get_group_linear_range() const
Returns the number of logical-group in the parent group.
uint32_t get_local_linear_range() const
Returns the number of work-items in the logical-group.
logical_group(sycl::nd_item< dimensions > item)
logical_group(sycl::nd_item< dimensions > item, sycl::group< dimensions > parent_group, uint32_t size)
Dividing parent_group into several logical-groups.
uint32_t get_group_linear_id() const
Returns the index of the logical-group in the parent group.
uint32_t get_local_linear_id() const
Returns the index of the work-item within the logical-group.
__ESIMD_API std::enable_if_t<(sizeof(T) *N >=2)> wait(sycl::ext::intel::esimd::simd< T, N > value)
Create explicit scoreboard dependency to avoid device code motion across this call and preserve the v...
__attribute__((always_inline)) auto invoke_simd(sycl
The invoke_simd free function invokes a SIMD function using all work-items in a sub_group.
std::enable_if_t<((std::is_same_v< std::decay_t< Group >, sub_group >||sycl::ext::oneapi::experimental::is_user_constructed_group_v< std::decay_t< Group >>) &&(std::is_trivially_copyable_v< T >||detail::is_vec< T >::value)), T > shift_group_left(Group g, T x, typename Group::linear_id_type delta=1)
std::enable_if_t<((std::is_same_v< std::decay_t< Group >, sub_group >||sycl::ext::oneapi::experimental::is_user_constructed_group_v< std::decay_t< Group >>) &&(std::is_trivially_copyable_v< T >||detail::is_vec< T >::value)), T > shift_group_right(Group g, T x, typename Group::linear_id_type delta=1)
void group_barrier(ext::oneapi::experimental::root_group< dimensions > G, memory_scope FenceScope=decltype(G)::fence_scope)
std::enable_if_t<(is_group_v< std::decay_t< Group >> &&(detail::is_scalar_arithmetic< T >::value||(detail::is_complex< T >::value &&detail::is_multiplies< T, BinaryOperation >::value)) &&detail::is_native_op< T, BinaryOperation >::value), T > reduce_over_group(Group g, T x, BinaryOperation binary_op)
std::enable_if_t<((std::is_same_v< std::decay_t< Group >, sub_group >||sycl::ext::oneapi::experimental::is_user_constructed_group_v< std::decay_t< Group >>) &&(std::is_trivially_copyable_v< T >||detail::is_vec< T >::value)), T > select_from_group(Group g, T x, typename Group::id_type local_id)
static sycl::event memcpy(sycl::queue q, void *to_ptr, const void *from_ptr, size_t size, const std::vector< sycl::event > &dep_events={})
detail::complex_namespace::complex< ValueT > complex_type
void matrix_mem_copy(void *to_ptr, const void *from_ptr, int to_ld, int from_ld, int rows, int cols, int elem_size, sycl::queue queue=syclcompat::get_default_queue(), bool async=false)
static size_t get_offset(sycl::id< 3 > id, size_t slice, size_t pitch)
int calculate_max_active_wg_per_xecore(int *num_wg, int wg_size, int slm_size=0, int sg_size=32, bool used_barrier=false, bool used_large_grf=false)
This function is used for occupancy calculation; it computes the max active work-group number per Xe-core.
T shift_sub_group_left(unsigned int member_mask, sycl::sub_group g, T x, unsigned int delta, int logical_sub_group_size=32)
Masked version of shift_sub_group_left, which executes a masked sub-group operation.
constexpr sycl::memory_order barrier_memory_order
group_type
Supported group types.
int calculate_max_potential_wg(int *num_wg, int *wg_size, int max_wg_size_for_device_code, int slm_size=0, int sg_size=32, bool used_barrier=false, bool used_large_grf=false)
This function is used for occupancy calculation; it computes the work-group number and the work-group size.
T select_from_sub_group(unsigned int member_mask, sycl::sub_group g, T x, int remote_local_id, int logical_sub_group_size=32)
Masked version of select_from_sub_group, which executes a masked sub-group operation.
T permute_sub_group_by_xor(unsigned int member_mask, sycl::sub_group g, T x, unsigned int mask, int logical_sub_group_size=32)
Masked version of permute_sub_group_by_xor, which executes a masked sub-group operation.
T shift_sub_group_right(unsigned int member_mask, sycl::sub_group g, T x, unsigned int delta, int logical_sub_group_size=32)
Masked version of shift_sub_group_right, which executes a masked sub-group operation.
void nd_range_barrier(const sycl::nd_item< dimensions > &item, sycl::atomic_ref< unsigned int, barrier_memory_order, sycl::memory_scope::device, sycl::access::address_space::global_space > &counter)
Synchronize work items from all work groups within a SYCL kernel.
double cast_ints_to_double(int high32, int low32)
Combine two integers, the first as the high 32 bits and the second as the low 32 bits,...
T select_from_sub_group(sycl::sub_group g, T x, int remote_local_id, int logical_sub_group_size=32)
select_from_sub_group allows work-items to obtain a copy of a value held by any other work-item in the sub-group.
queue_ptr int_as_queue_ptr(uintptr_t x)
If x <= 2, then return a pointer to the default queue; otherwise, return x reinterpreted as a queue_ptr.
unsigned int match_any_over_sub_group(sycl::sub_group g, unsigned member_mask, T value)
The function match_any_over_sub_group conducts a comparison of values across work-items within a sub-group.
unsigned int byte_level_permute(unsigned int a, unsigned int b, unsigned int s)
static sycl::queue get_default_queue()
Util function to get the default queue of current device in device manager.
static device_ext & get_current_device()
Util function to get the current device.
unsigned int match_all_over_sub_group(sycl::sub_group g, unsigned member_mask, T value, int *pred)
The function match_all_over_sub_group conducts a comparison of values across work-items within a sub-group.
T reverse_bits(T a)
Reverse the bit order of an unsigned integer.
int get_sycl_language_version()
Inherited from the original SYCLomatic compatibility headers.
int cast_double_to_int(double d, bool use_high32=true)
Cast the high or low 32 bits of a double to an integer.
T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask, int logical_sub_group_size=32)
permute_sub_group_by_xor permutes values by exchanging values held by pairs of work-items identified ...
T shift_sub_group_right(sycl::sub_group g, T x, unsigned int delta, int logical_sub_group_size=32)
shift_sub_group_right moves values held by the work-items in a sub_group directly to another work-item in the sub-group.
T shift_sub_group_left(sycl::sub_group g, T x, unsigned int delta, int logical_sub_group_size=32)
shift_sub_group_left moves values held by the work-items in a sub_group directly to another work-item in the sub-group.
int ffs(T a)
Find position of first least significant set bit in an integer.
#define __SYCL_CONVERGENT__
_Abi const simd< _Tp, _Abi > & noexcept
linear_id_type get_local_linear_id() const
detail::complex_type< T > T2