DPC++ Runtime
Runtime libraries for oneAPI DPC++
util.hpp
Go to the documentation of this file.
1 /***************************************************************************
2  *
3  * Copyright (C) Codeplay Software Ltd.
4  *
5  * Part of the LLVM Project, under the Apache License v2.0 with LLVM
6  * Exceptions. See https://llvm.org/LICENSE.txt for license information.
7  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  *
15  * SYCL compatibility extension
16  *
17  * util.hpp
18  *
19  * Description:
20  * util functionality for the SYCL compatibility extension
21  **************************************************************************/
22 
23 // The original source was under the license below:
24 //==---- util.hpp ---------------------------------*- C++ -*----------------==//
25 //
26 // Copyright (C) Intel Corporation
27 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
28 // See https://llvm.org/LICENSE.txt for license information.
29 //
30 //===----------------------------------------------------------------------===//
31 
32 #pragma once
33 
34 #define SYCL_EXT_ONEAPI_COMPLEX
35 
36 #include <cassert>
37 #include <complex>
39 #include <sycl/sycl.hpp>
40 #include <type_traits>
41 
42 #include <syclcompat/memory.hpp>
43 
44 namespace syclcompat {
45 
46 namespace detail {
47 template <typename T> struct DataType {
48  using T2 = T;
49 };
50 template <typename T> struct DataType<sycl::vec<T, 2>> {
52 };
53 
54 inline void matrix_mem_copy(void *to_ptr, const void *from_ptr, int to_ld,
55  int from_ld, int rows, int cols, int elem_size,
57  bool async = false) {
58  if (to_ptr == from_ptr && to_ld == from_ld) {
59  return;
60  }
61 
62  if (to_ld == from_ld) {
63  size_t cpoy_size = elem_size * ((cols - 1) * to_ld + rows);
64  if (async)
65  detail::memcpy(queue, (void *)to_ptr, (void *)from_ptr, cpoy_size);
66  else
67  detail::memcpy(queue, (void *)to_ptr, (void *)from_ptr, cpoy_size).wait();
68  } else {
69  if (async)
70  detail::memcpy(queue, to_ptr, from_ptr, elem_size * to_ld,
71  elem_size * from_ld, elem_size * rows, cols);
72  else
73  sycl::event::wait(detail::memcpy(queue, to_ptr, from_ptr,
74  elem_size * to_ld, elem_size * from_ld,
75  elem_size * rows, cols));
76  }
77 }
78 
89 template <typename T>
90 inline void matrix_mem_copy(T *to_ptr, const T *from_ptr, int to_ld,
91  int from_ld, int rows, int cols,
93  bool async = false) {
94  using Ty = typename DataType<T>::T2;
95  matrix_mem_copy((void *)to_ptr, (void *)from_ptr, to_ld, from_ld, rows, cols,
96  sizeof(Ty), queue, async);
97 }
98 } // namespace detail
99 
104 inline int cast_double_to_int(double d, bool use_high32 = true) {
105  sycl::vec<double, 1> v0{d};
106  auto v1 = v0.as<sycl::int2>();
107  if (use_high32)
108  return v1[0];
109  return v1[1];
110 }
111 
116 inline double cast_ints_to_double(int high32, int low32) {
117  sycl::int2 v0{high32, low32};
118  auto v1 = v0.as<sycl::vec<double, 1>>();
119  return v1;
120 }
121 
126 inline float fast_length(const float *a, int len) {
127  switch (len) {
128  case 1:
129  return sycl::fast_length(a[0]);
130  case 2:
131  return sycl::fast_length(sycl::float2(a[0], a[1]));
132  case 3:
133  return sycl::fast_length(sycl::float3(a[0], a[1], a[2]));
134  case 4:
135  return sycl::fast_length(sycl::float4(a[0], a[1], a[2], a[3]));
136  case 0:
137  return 0;
138  default:
139  float f = 0;
140  for (int i = 0; i < len; ++i)
141  f += a[i] * a[i];
142  return sycl::sqrt(f);
143  }
144 }
145 
153 template <typename S, typename T> inline T vectorized_max(T a, T b) {
154  sycl::vec<T, 1> v0{a}, v1{b};
155  auto v2 = v0.template as<S>();
156  auto v3 = v1.template as<S>();
157  v2 = sycl::max(v2, v3);
158  v0 = v2.template as<sycl::vec<T, 1>>();
159  return v0;
160 }
161 
169 template <typename S, typename T> inline T vectorized_min(T a, T b) {
170  sycl::vec<T, 1> v0{a}, v1{b};
171  auto v2 = v0.template as<S>();
172  auto v3 = v1.template as<S>();
173  v2 = sycl::min(v2, v3);
174  v0 = v2.template as<sycl::vec<T, 1>>();
175  return v0;
176 }
177 
185 template <typename S, typename T> inline T vectorized_isgreater(T a, T b) {
186  sycl::vec<T, 1> v0{a}, v1{b};
187  auto v2 = v0.template as<S>();
188  auto v3 = v1.template as<S>();
189  auto v4 = sycl::isgreater(v2, v3);
190  v0 = v4.template as<sycl::vec<T, 1>>();
191  return v0;
192 }
193 
199 template <>
200 inline unsigned vectorized_isgreater<sycl::ushort2, unsigned>(unsigned a,
201  unsigned b) {
202  sycl::vec<unsigned, 1> v0{a}, v1{b};
203  auto v2 = v0.template as<sycl::ushort2>();
204  auto v3 = v1.template as<sycl::ushort2>();
205  sycl::ushort2 v4;
206  v4[0] = v2[0] > v3[0];
207  v4[1] = v2[1] > v3[1];
208  v0 = v4.template as<sycl::vec<unsigned, 1>>();
209  return v0;
210 }
211 
215 template <typename T> inline T reverse_bits(T a) {
216  static_assert(std::is_unsigned<T>::value && std::is_integral<T>::value,
217  "unsigned integer required");
218  if (!a)
219  return 0;
220  T mask = 0;
221  size_t count = 4 * sizeof(T);
222  mask = ~mask >> count;
223  while (count) {
224  a = ((a & mask) << count) | ((a & ~mask) >> count);
225  count = count >> 1;
226  mask = mask ^ (mask << count);
227  }
228  return a;
229 }
230 
236 inline unsigned int byte_level_permute(unsigned int a, unsigned int b,
237  unsigned int s) {
238  unsigned int ret;
239  ret =
240  ((((std::uint64_t)b << 32 | a) >> (s & 0x7) * 8) & 0xff) |
241  (((((std::uint64_t)b << 32 | a) >> ((s >> 4) & 0x7) * 8) & 0xff) << 8) |
242  (((((std::uint64_t)b << 32 | a) >> ((s >> 8) & 0x7) * 8) & 0xff) << 16) |
243  (((((std::uint64_t)b << 32 | a) >> ((s >> 12) & 0x7) * 8) & 0xff) << 24);
244  return ret;
245 }
246 
252 template <typename T> inline int ffs(T a) {
253  static_assert(std::is_integral<T>::value, "integer required");
254  return (sycl::ctz(a) + 1) % (sizeof(T) * 8 + 1);
255 }
256 
271 template <typename T>
272 T select_from_sub_group(sycl::sub_group g, T x, int remote_local_id,
273  int logical_sub_group_size = 32) {
274  unsigned int start_index =
275  g.get_local_linear_id() / logical_sub_group_size * logical_sub_group_size;
277  g, x, start_index + remote_local_id % logical_sub_group_size);
278 }
279 
295 template <typename T>
296 T shift_sub_group_left(sycl::sub_group g, T x, unsigned int delta,
297  int logical_sub_group_size = 32) {
298  unsigned int id = g.get_local_linear_id();
299  unsigned int end_index =
300  (id / logical_sub_group_size + 1) * logical_sub_group_size;
301  T result = sycl::shift_group_left(g, x, delta);
302  if ((id + delta) >= end_index) {
303  result = x;
304  }
305  return result;
306 }
307 
323 template <typename T>
324 T shift_sub_group_right(sycl::sub_group g, T x, unsigned int delta,
325  int logical_sub_group_size = 32) {
326  unsigned int id = g.get_local_linear_id();
327  unsigned int start_index =
328  id / logical_sub_group_size * logical_sub_group_size;
329  T result = sycl::shift_group_right(g, x, delta);
330  if ((id - start_index) < delta) {
331  result = x;
332  }
333  return result;
334 }
335 
351 template <typename T>
352 T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask,
353  int logical_sub_group_size = 32) {
354  unsigned int id = g.get_local_linear_id();
355  unsigned int start_index =
356  id / logical_sub_group_size * logical_sub_group_size;
357  unsigned int target_offset = (id % logical_sub_group_size) ^ mask;
358  return sycl::select_from_group(g, x,
359  target_offset < logical_sub_group_size
360  ? start_index + target_offset
361  : id);
362 }
363 
369 template <typename T>
371  sycl::ext::oneapi::experimental::complex<T> t1(x[0], x[1]), t2(y[0], y[1]);
372  t1 = t1 * t2;
373  return sycl::vec<T, 2>(t1.real(), t1.imag());
374 }
375 
381 template <typename T>
383  sycl::ext::oneapi::experimental::complex<T> t1(x[0], x[1]), t2(y[0], y[1]);
384  t1 = t1 / t2;
385  return sycl::vec<T, 2>(t1.real(), t1.imag());
386 }
387 
392 template <typename T> T cabs(sycl::vec<T, 2> x) {
394  return abs(t);
395 }
396 
401 template <typename T> sycl::vec<T, 2> conj(sycl::vec<T, 2> x) {
403  t = conj(t);
404  return sycl::vec<T, 2>(t.real(), t.imag());
405 }
406 
410 #ifdef SYCL_LANGUAGE_VERSION
411  return SYCL_LANGUAGE_VERSION;
412 #else
413  return 202000;
414 #endif
415 }
416 
417 namespace experimental {
425 template <int dimensions = 3>
426 inline void nd_range_barrier(
428  sycl::atomic_ref<unsigned int, sycl::memory_order::acq_rel,
430  sycl::access::address_space::global_space> &counter) {
431 
432  static_assert(dimensions == 3, "dimensions must be 3.");
433  constexpr unsigned int MSB32_MASK = 0x80000000;
434 
435  unsigned int num_groups = item.get_group_range(2) * item.get_group_range(1) *
436  item.get_group_range(0);
437 
438  item.barrier();
439 
440  if (item.get_local_linear_id() == 0) {
441  unsigned int inc = 1;
442  unsigned int old_arrive = 0;
443  bool is_group0 =
444  (item.get_group(2) + item.get_group(1) + item.get_group(0) == 0);
445  if (is_group0) {
446  inc = MSB32_MASK - (num_groups - 1);
447  }
448 
449  old_arrive = counter.fetch_add(inc);
450  // Synchronize all the work groups
451  while (((old_arrive ^ counter.load()) & MSB32_MASK) == 0)
452  ;
453  }
454 
455  item.barrier();
456 }
457 
465 template <>
466 inline void nd_range_barrier(
467  sycl::nd_item<1> item,
468  sycl::atomic_ref<unsigned int, sycl::memory_order::acq_rel,
470  sycl::access::address_space::global_space> &counter) {
471  unsigned int num_groups = item.get_group_range(0);
472  constexpr unsigned int MSB32_MASK = 0x80000000;
473 
474  item.barrier();
475 
476  if (item.get_local_linear_id() == 0) {
477  unsigned int inc = 1;
478  unsigned int old_arrive = 0;
479  bool is_group0 = (item.get_group(0) == 0);
480  if (is_group0) {
481  inc = MSB32_MASK - (num_groups - 1);
482  }
483 
484  old_arrive = counter.fetch_add(inc);
485  // Synchronize all the work groups
486  while (((old_arrive ^ counter.load()) & MSB32_MASK) == 0)
487  ;
488  }
489 
490  item.barrier();
491 }
492 
498  sycl::nd_item<3> _item;
499  sycl::group<3> _g;
500  uint32_t _logical_group_size;
501  uint32_t _group_linear_range_in_parent;
502 
503 public:
509  uint32_t size)
510  : _item(item), _g(parent_group), _logical_group_size(size) {
511  _group_linear_range_in_parent =
512  (_g.get_local_linear_range() - 1) / _logical_group_size + 1;
513  }
515  uint32_t get_local_linear_id() const {
516  return _item.get_local_linear_id() % _logical_group_size;
517  }
519  uint32_t get_group_linear_id() const {
520  return _item.get_local_linear_id() / _logical_group_size;
521  }
523  uint32_t get_local_linear_range() const {
524  if (_g.get_local_linear_range() % _logical_group_size == 0) {
525  return _logical_group_size;
526  }
527  uint32_t last_item_group_id =
528  _g.get_local_linear_range() / _logical_group_size;
529  uint32_t first_of_last_group = last_item_group_id * _logical_group_size;
530  if (_item.get_local_linear_id() >= first_of_last_group) {
531  return _g.get_local_linear_range() - first_of_last_group;
532  } else {
533  return _logical_group_size;
534  }
535  }
537  uint32_t get_group_linear_range() const {
538  return _group_linear_range_in_parent;
539  }
540 };
541 
542 } // namespace experimental
543 } // namespace syclcompat
void wait()
Wait for the event.
Definition: event.cpp:47
Identifies an instance of the function object executing at each point in an nd_range.
Definition: nd_item.hpp:544
size_t get_local_linear_id() const
Definition: nd_item.hpp:572
group< Dimensions > get_group() const
Definition: nd_item.hpp:578
range< Dimensions > get_group_range() const
Definition: nd_item.hpp:594
void barrier(access::fence_space accessSpace=access::fence_space::global_and_local) const
Definition: nd_item.hpp:622
Encapsulates a single SYCL queue which schedules kernels on a SYCL device.
Definition: queue.hpp:119
defined(__INTEL_PREVIEW_BREAKING_CHANGES)
Definition: types.hpp:346
The logical-group is a logical collection of some work-items within a work-group.
Definition: util.hpp:497
uint32_t get_group_linear_id() const
Returns the index of the logical-group in the parent group.
Definition: util.hpp:519
uint32_t get_group_linear_range() const
Returns the number of logical-group in the parent group.
Definition: util.hpp:537
uint32_t get_local_linear_range() const
Returns the number of work-items in the logical-group.
Definition: util.hpp:523
logical_group(sycl::nd_item< 3 > item, sycl::group< 3 > parent_group, uint32_t size)
Dividing parent_group into several logical-groups.
Definition: util.hpp:508
uint32_t get_local_linear_id() const
Returns the index of the work-item within the logical-group.
Definition: util.hpp:515
__ESIMD_API std::enable_if_t<(sizeof(T) *N >=2)> wait(sycl::ext::intel::esimd::simd< T, N > value)
Create explicit scoreboard dependency to avoid device code motion across this call and preserve the v...
Definition: memory.hpp:415
std::enable_if_t<(std::is_same_v< std::decay_t< Group >, sub_group > &&(std::is_trivially_copyable_v< T >||detail::is_vec< T >::value)), T > shift_group_right(Group, T x, typename Group::linear_id_type delta=1)
std::enable_if_t<(std::is_same_v< std::decay_t< Group >, sub_group > &&(std::is_trivially_copyable_v< T >||detail::is_vec< T >::value)), T > select_from_group(Group, T x, typename Group::id_type local_id)
std::enable_if_t< __FAST_MATH_GENFLOAT(T), T > sqrt(T x)
std::enable_if_t<(std::is_same_v< std::decay_t< Group >, sub_group > &&(std::is_trivially_copyable_v< T >||detail::is_vec< T >::value)), T > shift_group_left(Group, T x, typename Group::linear_id_type delta=1)
std::enable_if_t< detail::is_ugeninteger_v< T >, T > abs(T x)
std::enable_if_t< detail::is_geninteger_v< T >, T > ctz(T x)
detail::common_rel_ret_t< T > isgreater(T x, T y)
Definition: access.hpp:18
static sycl::event memcpy(sycl::queue q, void *to_ptr, const void *from_ptr, size_t size, const std::vector< sycl::event > &dep_events={})
Definition: memory.hpp:259
void matrix_mem_copy(void *to_ptr, const void *from_ptr, int to_ld, int from_ld, int rows, int cols, int elem_size, sycl::queue queue=get_default_queue(), bool async=false)
Definition: util.hpp:54
void nd_range_barrier(sycl::nd_item< dimensions > item, sycl::atomic_ref< unsigned int, sycl::memory_order::acq_rel, sycl::memory_scope::device, sycl::access::address_space::global_space > &counter)
Synchronize work items from all work groups within a SYCL kernel.
Definition: util.hpp:426
double cast_ints_to_double(int high32, int low32)
Combine two integers, the first as the high 32 bits and the second as the low 32 bits,...
Definition: util.hpp:116
T select_from_sub_group(sycl::sub_group g, T x, int remote_local_id, int logical_sub_group_size=32)
select_from_sub_group allows work-items to obtain a copy of a value held by any other work-item in th...
Definition: util.hpp:272
unsigned int byte_level_permute(unsigned int a, unsigned int b, unsigned int s)
Definition: util.hpp:236
T vectorized_max(T a, T b)
Compute vectorized max for two values, with each value treated as a vector type S.
Definition: util.hpp:153
static sycl::queue get_default_queue()
Util function to get the default queue of current device in device manager.
Definition: device.hpp:516
float fast_length(const float *a, int len)
Compute fast_length for variable-length array.
Definition: util.hpp:126
T cabs(sycl::vec< T, 2 > x)
Computes the magnitude of a complex number.
Definition: util.hpp:392
sycl::vec< T, 2 > conj(sycl::vec< T, 2 > x)
Computes the complex conjugate of a complex number.
Definition: util.hpp:401
T reverse_bits(T a)
Reverse the bit order of an unsigned integer.
Definition: util.hpp:215
sycl::vec< T, 2 > cdiv(sycl::vec< T, 2 > x, sycl::vec< T, 2 > y)
Computes the division of two complex numbers.
Definition: util.hpp:382
T vectorized_min(T a, T b)
Compute vectorized min for two values, with each value treated as a vector type S.
Definition: util.hpp:169
int get_sycl_language_version()
Inherited from the original SYCLomatic compatibility headers.
Definition: util.hpp:409
int cast_double_to_int(double d, bool use_high32=true)
Cast the high or low 32 bits of a double to an integer.
Definition: util.hpp:104
T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask, int logical_sub_group_size=32)
permute_sub_group_by_xor permutes values by exchanging values held by pairs of work-items identified ...
Definition: util.hpp:352
T shift_sub_group_right(sycl::sub_group g, T x, unsigned int delta, int logical_sub_group_size=32)
shift_sub_group_right move values held by the work-items in a sub_group directly to another work-item...
Definition: util.hpp:324
T shift_sub_group_left(sycl::sub_group g, T x, unsigned int delta, int logical_sub_group_size=32)
shift_sub_group_left move values held by the work-items in a sub_group directly to another work-item ...
Definition: util.hpp:296
int ffs(T a)
Find position of first least significant set bit in an integer.
Definition: util.hpp:252
T vectorized_isgreater(T a, T b)
Compute vectorized isgreater for two values, with each value treated as a vector type S.
Definition: util.hpp:185
sycl::vec< T, 2 > cmul(sycl::vec< T, 2 > x, sycl::vec< T, 2 > y)
Computes the multiplication of two complex numbers.
Definition: util.hpp:370
linear_id_type get_local_linear_id() const
Definition: sub_group.hpp:162