35 #include <type_traits>
43 #if defined(__NVPTX__)
48 #if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
74 operator T()
const {
return value; }
88 int from_ld,
int rows,
int cols,
int elem_size,
91 if (to_ptr == from_ptr && to_ld == from_ld) {
95 if (to_ld == from_ld) {
96 size_t copy_size = elem_size * ((cols - 1) * (
size_t)to_ld + rows);
98 detail::memcpy(queue, (
void *)to_ptr, (
void *)from_ptr, copy_size);
104 elem_size * from_ld, elem_size * rows, cols);
107 elem_size * to_ld, elem_size * from_ld,
108 elem_size * rows, cols));
122 template <
typename T>
124 int from_ld,
int rows,
int cols,
126 bool async =
false) {
128 matrix_mem_copy((
void *)to_ptr, (
void *)from_ptr, to_ld, from_ld, rows, cols,
129 sizeof(Ty), queue, async);
142 auto v1 = v0.as<sycl::int2>();
153 sycl::int2 v0{high32, low32};
162 static_assert(std::is_unsigned<T>::value && std::is_integral<T>::value,
163 "unsigned integer required");
167 size_t count = 4 *
sizeof(T);
168 mask = ~mask >> count;
170 a = ((a & mask) << count) | ((a & ~mask) >> count);
172 mask = mask ^ (mask << count);
186 ((((std::uint64_t)b << 32 | a) >> (s & 0x7) * 8) & 0xff) |
187 (((((std::uint64_t)b << 32 | a) >> ((s >> 4) & 0x7) * 8) & 0xff) << 8) |
188 (((((std::uint64_t)b << 32 | a) >> ((s >> 8) & 0x7) * 8) & 0xff) << 16) |
189 (((((std::uint64_t)b << 32 | a) >> ((s >> 12) & 0x7) * 8) & 0xff) << 24);
198 template <
typename T>
inline int ffs(T a) {
199 static_assert(std::is_integral<T>::value,
"integer required");
200 return (sycl::ctz(a) + 1) % (
sizeof(T) * 8 + 1);
217 template <
typename T>
219 int logical_sub_group_size = 32) {
220 unsigned int start_index =
223 g, x, start_index + remote_local_id % logical_sub_group_size);
241 template <
typename T>
243 int logical_sub_group_size = 32) {
245 unsigned int end_index =
246 (
id / logical_sub_group_size + 1) * logical_sub_group_size;
248 if ((
id + delta) >= end_index) {
269 template <
typename T>
271 int logical_sub_group_size = 32) {
273 unsigned int start_index =
274 id / logical_sub_group_size * logical_sub_group_size;
276 if ((
id - start_index) < delta) {
297 template <
typename T>
299 int logical_sub_group_size = 32) {
301 unsigned int start_index =
302 id / logical_sub_group_size * logical_sub_group_size;
303 unsigned int target_offset = (
id % logical_sub_group_size) ^ mask;
305 target_offset < logical_sub_group_size
306 ? start_index + target_offset
310 namespace experimental {
324 template <
typename T>
326 int remote_local_id,
int logical_sub_group_size = 32) {
327 unsigned int start_index =
329 unsigned logical_remote_id =
330 start_index + remote_local_id % logical_sub_group_size;
331 #if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
332 #if defined(__SPIR__)
335 #elif defined(__NVPTX__)
336 int cVal = ((32 - logical_sub_group_size) << 8) | 31;
337 return cuda_shfl_sync_idx_i32(member_mask, x, remote_local_id, cVal);
340 "[SYCLcompat] Masked version of select_from_sub_group "
341 "only supports SPIR-V or cuda backends.");
346 (void)remote_local_id;
347 (void)logical_sub_group_size;
351 "[SYCLcompat] Masked version of select_from_sub_group not "
352 "supported on host device and non intel compiler.");
369 template <
typename T>
371 unsigned int delta,
int logical_sub_group_size = 32) {
373 unsigned int end_index =
374 (
id / logical_sub_group_size + 1) * logical_sub_group_size;
375 #if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
376 #if defined(__SPIR__)
379 if ((
id + delta) >= end_index) {
383 #elif defined(__NVPTX__)
384 int cVal = ((32 - logical_sub_group_size) << 8) | 31;
385 return cuda_shfl_sync_down_i32(member_mask, x, delta, cVal);
388 "[SYCLcompat] Masked version of shift_sub_group_left "
389 "only supports SPIR-V or cuda backends.");
395 (void)logical_sub_group_size;
399 "[SYCLcompat] Masked version of shift_sub_group_left not "
400 "supported on host device and non intel compiler.");
417 template <
typename T>
419 unsigned int delta,
int logical_sub_group_size = 32) {
421 unsigned int start_index =
422 id / logical_sub_group_size * logical_sub_group_size;
423 #if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
424 #if defined(__SPIR__)
426 if ((
id - start_index) < delta) {
430 #elif defined(__NVPTX__)
431 int cVal = ((32 - logical_sub_group_size) << 8);
432 return cuda_shfl_sync_up_i32(member_mask, x, delta, cVal);
435 "Masked version of shift_sub_group_right "
436 "only supports SPIR-V or cuda backends.");
442 (void)logical_sub_group_size;
445 "Masked version of shift_sub_group_right not "
446 "supported on host device and non intel compiler.");
463 template <
typename T>
465 unsigned int mask,
int logical_sub_group_size = 32) {
467 unsigned int start_index =
468 id / logical_sub_group_size * logical_sub_group_size;
469 unsigned int target_offset = (
id % logical_sub_group_size) ^ mask;
470 unsigned logical_remote_id = (target_offset < logical_sub_group_size)
471 ? start_index + target_offset
473 #if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
474 #if defined(__SPIR__)
477 #elif defined(__NVPTX__)
478 int cVal = ((32 - logical_sub_group_size) << 8) | 31;
479 return cuda_shfl_sync_bfly_i32(member_mask, x, mask, cVal);
483 "[SYCLcompat] Masked version of permute_sub_group_by_xor "
484 "only supports SPIR-V or cuda backends.");
490 (void)logical_sub_group_size;
494 "[SYCLcompat]Masked version of permute_sub_group_by_xor not "
495 "supported on host device and non intel compiler.");
503 #ifdef SYCL_LANGUAGE_VERSION
504 return SYCL_LANGUAGE_VERSION;
521 template <
typename T>
524 static_assert(std::is_arithmetic_v<T>,
"Value type must be arithmetic type.");
529 unsigned int flag = 0, result = 0, reduce_result = 0;
530 unsigned int bit_index = 0x1 << id;
531 bool is_participate = member_mask & bit_index;
532 T broadcast_value = 0;
533 bool matched =
false;
534 while (flag != member_mask) {
538 g, is_participate ? (broadcast_value == value ? bit_index : 0) : 0,
540 flag |= reduce_result;
541 matched = reduce_result & bit_index;
542 result = matched * reduce_result + (1 - matched) * result;
560 template <
typename T>
562 T value,
int *pred) {
563 static_assert(std::is_arithmetic_v<T>,
"Value type must be arithmetic type.");
568 unsigned int bit_index = 0x1 << id;
569 bool is_participate = member_mask & bit_index;
573 (member_mask & bit_index) ? (broadcast_value == value ? bit_index : 0)
576 bool all_equal = (reduce_result == member_mask);
577 *pred = is_participate & all_equal;
578 return (is_participate & all_equal) * member_mask;
581 namespace experimental {
585 #if defined(__AMDGPU__) || defined(__NVPTX__)
598 template <
int dimensions = 3>
603 sycl::access::address_space::global_space> &counter) {
605 static_assert(dimensions == 3,
"dimensions must be 3.");
606 constexpr
unsigned int MSB32_MASK = 0x80000000;
614 unsigned int inc = 1;
615 unsigned int old_arrive = 0;
619 inc = MSB32_MASK - (num_groups - 1);
622 old_arrive = counter.fetch_add(inc);
624 while (((old_arrive ^ counter.load()) & MSB32_MASK) == 0)
643 sycl::access::address_space::global_space> &counter) {
645 constexpr
unsigned int MSB32_MASK = 0x80000000;
650 unsigned int inc = 1;
651 unsigned int old_arrive = 0;
652 bool is_group0 = (item.
get_group(0) == 0);
654 inc = MSB32_MASK - (num_groups - 1);
657 old_arrive = counter.fetch_add(inc);
659 while (((old_arrive ^ counter.load()) & MSB32_MASK) == 0)
673 uint32_t _logical_group_size;
674 uint32_t _group_linear_range_in_parent;
683 : _item(item), _g(parent_group), _logical_group_size(size) {
684 _group_linear_range_in_parent =
685 (_g.get_local_linear_range() - 1) / _logical_group_size + 1;
688 : _item(item), _g(item.get_group()) {}
691 return _item.get_local_linear_id() % _logical_group_size;
695 return _item.get_local_linear_id() / _logical_group_size;
699 if (_g.get_local_linear_range() % _logical_group_size == 0) {
700 return _logical_group_size;
702 uint32_t last_item_group_id =
703 _g.get_local_linear_range() / _logical_group_size;
704 uint32_t first_of_last_group = last_item_group_id * _logical_group_size;
705 if (_item.get_local_linear_id() >= first_of_last_group) {
706 return _g.get_local_linear_range() - first_of_last_group;
708 return _logical_group_size;
713 return _group_linear_range_in_parent;
755 bool used_barrier =
false,
756 bool used_large_grf =
false) {
758 const int slm_size_per_xe_core = 64 * 1024;
759 const int max_barrier_registers = 32;
762 size_t max_wg_size = dev.get_info<sycl::info::device::max_work_group_size>();
763 if (wg_size > max_wg_size) {
764 wg_size = max_wg_size;
768 int num_threads_ss = 56;
770 if (dev.has(sycl::aspect::ext_intel_gpu_eu_count_per_subslice) &&
771 dev.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
773 dev.get_info<sycl::info::device::ext_intel_gpu_eu_count_per_subslice>();
775 dev.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
776 num_threads_ss = eu_count * threads_count;
777 max_num_wg = eu_count * threads_count;
781 max_num_wg = max_barrier_registers;
787 num_wg_slm = max_num_wg;
789 num_wg_slm =
std::floor((
float)slm_size_per_xe_core / slm_size);
794 num_threads_ss = num_threads_ss / 2;
795 int num_threads =
std::ceil((
float)wg_size / sg_size);
796 int num_wg_threads =
std::floor((
float)num_threads_ss / num_threads);
799 *num_wg = std::min(num_wg_slm, num_wg_threads);
800 *num_wg = std::min(*num_wg, max_num_wg);
818 int max_wg_size_for_device_code,
819 int slm_size = 0,
int sg_size = 32,
820 bool used_barrier =
false,
821 bool used_large_grf =
false) {
823 size_t max_wg_size = dev.
get_info<sycl::info::device::max_work_group_size>();
824 if (max_wg_size_for_device_code == 0 ||
825 max_wg_size_for_device_code >= max_wg_size)
826 *wg_size = (int)max_wg_size;
828 *wg_size = max_wg_size_for_device_code;
830 used_barrier, used_large_grf);
831 std::uint32_t num_ss = 1;
832 if (dev.
has(sycl::aspect::ext_intel_gpu_slices) &&
833 dev.
has(sycl::aspect::ext_intel_gpu_subslices_per_slice)) {
835 dev.
get_info<sycl::ext::intel::info::device::gpu_slices>() *
836 dev.
get_info<sycl::ext::intel::info::device::gpu_subslices_per_slice>();
838 num_wg[0] = num_ss * num_wg[0];
856 return nd_item.get_group().get_local_linear_range();
858 return nd_item.get_sub_group().get_local_linear_range();
869 return nd_item.get_group().get_local_linear_id();
871 return nd_item.get_sub_group().get_local_linear_id();
901 template <
typename GroupT,
int dimensions = 3>
909 if constexpr (std::is_same_v<GroupT, sycl::sub_group>) {
913 }
else if constexpr (std::is_same_v<
929 template <
int n_nondefault_params,
int n_default_params,
typename T>
951 template <
int n_nondefault_params,
int n_default_params,
typename R,
955 void **kernel_params;
958 template <
int i>
static constexpr
int account_for_default_params() {
959 constexpr
int n_total_params =
sizeof...(Ts);
960 if constexpr (i >= n_nondefault_params) {
961 return n_total_params - n_default_params + (i - n_nondefault_params);
973 std::tuple_element_t<account_for_default_params<i>(), std::tuple<Ts...>>;
976 template <
int i>
static constexpr
int get_offset() {
977 if constexpr (i == 0) {
983 constexpr
int prev_past_end = prev_off +
sizeof(
arg_type<i - 1>);
987 if constexpr (prev_past_end %
alignof(T) == 0) {
988 return prev_past_end;
992 return prev_past_end + (
alignof(T) - (prev_past_end %
alignof(T)));
997 static char *get_args_buffer(
void **extra) {
1000 for (; (std::size_t)*extra != 0; ++extra) {
1001 if ((std::size_t)*extra == 1) {
1002 return static_cast<char *
>(*(extra + 1));
1016 : kernel_params(kernel_params), args_buffer(get_args_buffer(extra)) {}
1023 if (kernel_params) {
1024 return *
static_cast<arg_type<i> *
>(kernel_params[i]);
1026 return *
reinterpret_cast<arg_type<i> *
>(args_buffer + get_offset<i>());
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
detail::is_device_info_desc< Param >::return_type get_info() const
Queries this SYCL device for information requested by the template parameter param.
bool has(aspect Aspect) const __SYCL_WARN_IMAGE_ASPECT(Aspect)
Indicates if the SYCL device has the given feature.
void wait()
Wait for the event.
Identifies an instance of the function object executing at each point in an nd_range.
size_t get_local_linear_id() const
group< Dimensions > get_group() const
range< Dimensions > get_group_range() const
void barrier(access::fence_space accessSpace=access::fence_space::global_and_local) const
Encapsulates a single SYCL queue which schedules kernels on a SYCL device.
arg_type< i > & get()
Get a reference to the ith argument extracted from kernel_params or extra.
args_selector(void **kernel_params, void **extra)
If kernel_params is nonnull, then args_selector will extract arguments from kernel_params.
std::tuple_element_t< account_for_default_params< i >(), std::tuple< Ts... > > arg_type
Get the type of the ith argument of R(Ts...)
static dev_mgr & instance()
Returns the instance of device manager singleton.
device_ext & current_device()
generic_error_type(T value)
generic_error_type()=default
queue_ptr default_queue()
The group_base will dispatch the function call to the specific interface based on the group type.
size_t get_local_linear_range()
Returns the number of work-items in the group.
logical_group< dimensions > logical_group
group_base(sycl::nd_item< dimensions > item)
void barrier()
Wait for all the elements within the group to complete their execution before proceeding.
sycl::nd_item< dimensions > nd_item
size_t get_local_linear_id()
Returns the index of the work-item within the group.
Container type that can store supported group_types.
group(GroupT g, sycl::nd_item< dimensions > item)
The logical-group is a logical collection of some work-items within a work-group.
uint32_t get_group_linear_range() const
Returns the number of logical-group in the parent group.
uint32_t get_local_linear_range() const
Returns the number of work-items in the logical-group.
logical_group(sycl::nd_item< dimensions > item)
logical_group(sycl::nd_item< dimensions > item, sycl::group< dimensions > parent_group, uint32_t size)
Dividing parent_group into several logical-groups.
uint32_t get_group_linear_id() const
Returns the index of the logical-group in the parent group.
uint32_t get_local_linear_id() const
Returns the index of the work-item within the logical-group.
__ESIMD_API std::enable_if_t<(sizeof(T) *N >=2)> wait(sycl::ext::intel::esimd::simd< T, N > value)
Create explicit scoreboard dependency to avoid device code motion across this call and preserve the v...
__attribute__((always_inline)) auto invoke_simd(sycl
The invoke_simd free function invokes a SIMD function using all work-items in a sub_group.
std::enable_if_t<((std::is_same_v< std::decay_t< Group >, sub_group >||sycl::ext::oneapi::experimental::is_user_constructed_group_v< std::decay_t< Group >>) &&(std::is_trivially_copyable_v< T >||detail::is_vec< T >::value)), T > shift_group_left(Group g, T x, typename Group::linear_id_type delta=1)
std::enable_if_t<((std::is_same_v< std::decay_t< Group >, sub_group >||sycl::ext::oneapi::experimental::is_user_constructed_group_v< std::decay_t< Group >>) &&(std::is_trivially_copyable_v< T >||detail::is_vec< T >::value)), T > shift_group_right(Group g, T x, typename Group::linear_id_type delta=1)
void group_barrier(ext::oneapi::experimental::root_group< dimensions > G, memory_scope FenceScope=decltype(G)::fence_scope)
std::enable_if_t<(is_group_v< std::decay_t< Group >> &&(detail::is_scalar_arithmetic< T >::value||(detail::is_complex< T >::value &&detail::is_multiplies< T, BinaryOperation >::value)) &&detail::is_native_op< T, BinaryOperation >::value), T > reduce_over_group(Group g, T x, BinaryOperation binary_op)
std::enable_if_t<((std::is_same_v< std::decay_t< Group >, sub_group >||sycl::ext::oneapi::experimental::is_user_constructed_group_v< std::decay_t< Group >>) &&(std::is_trivially_copyable_v< T >||detail::is_vec< T >::value)), T > select_from_group(Group g, T x, typename Group::id_type local_id)
static sycl::event memcpy(sycl::queue q, void *to_ptr, const void *from_ptr, size_t size, const std::vector< sycl::event > &dep_events={})
detail::complex_namespace::complex< ValueT > complex_type
void matrix_mem_copy(void *to_ptr, const void *from_ptr, int to_ld, int from_ld, int rows, int cols, int elem_size, sycl::queue queue=syclcompat::get_default_queue(), bool async=false)
static size_t get_offset(sycl::id< 3 > id, size_t slice, size_t pitch)
int calculate_max_active_wg_per_xecore(int *num_wg, int wg_size, int slm_size=0, int sg_size=32, bool used_barrier=false, bool used_large_grf=false)
This function is used for occupancy calculation; it computes the max active work-group number per Xe-core.
T shift_sub_group_left(unsigned int member_mask, sycl::sub_group g, T x, unsigned int delta, int logical_sub_group_size=32)
Masked version of shift_sub_group_left, which executes a masked sub-group operation.
constexpr sycl::memory_order barrier_memory_order
group_type
Supported group types.
int calculate_max_potential_wg(int *num_wg, int *wg_size, int max_wg_size_for_device_code, int slm_size=0, int sg_size=32, bool used_barrier=false, bool used_large_grf=false)
This function is used for occupancy calculation; it computes the work-group number and the work-group size.
T select_from_sub_group(unsigned int member_mask, sycl::sub_group g, T x, int remote_local_id, int logical_sub_group_size=32)
Masked version of select_from_sub_group, which executes a masked sub-group operation.
T permute_sub_group_by_xor(unsigned int member_mask, sycl::sub_group g, T x, unsigned int mask, int logical_sub_group_size=32)
Masked version of permute_sub_group_by_xor, which executes a masked sub-group operation.
T shift_sub_group_right(unsigned int member_mask, sycl::sub_group g, T x, unsigned int delta, int logical_sub_group_size=32)
Masked version of shift_sub_group_right, which executes a masked sub-group operation.
void nd_range_barrier(const sycl::nd_item< dimensions > &item, sycl::atomic_ref< unsigned int, barrier_memory_order, sycl::memory_scope::device, sycl::access::address_space::global_space > &counter)
Synchronize work items from all work groups within a SYCL kernel.
double cast_ints_to_double(int high32, int low32)
Combine two integers, the first as the high 32 bits and the second as the low 32 bits,...
T select_from_sub_group(sycl::sub_group g, T x, int remote_local_id, int logical_sub_group_size=32)
select_from_sub_group allows work-items to obtain a copy of a value held by any other work-item in the sub-group.
queue_ptr int_as_queue_ptr(uintptr_t x)
If x <= 2, then return a pointer to the default queue; otherwise, return x reinterpreted as a queue_ptr.
unsigned int match_any_over_sub_group(sycl::sub_group g, unsigned member_mask, T value)
The function match_any_over_sub_group conducts a comparison of values across work-items within a sub-group.
unsigned int byte_level_permute(unsigned int a, unsigned int b, unsigned int s)
static sycl::queue get_default_queue()
Util function to get the default queue of current device in device manager.
static device_ext & get_current_device()
Util function to get the current device.
unsigned int match_all_over_sub_group(sycl::sub_group g, unsigned member_mask, T value, int *pred)
The function match_all_over_sub_group conducts a comparison of values across work-items within a sub-group.
T reverse_bits(T a)
Reverse the bit order of an unsigned integer.
int get_sycl_language_version()
Inherited from the original SYCLomatic compatibility headers.
int cast_double_to_int(double d, bool use_high32=true)
Cast the high or low 32 bits of a double to an integer.
T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask, int logical_sub_group_size=32)
permute_sub_group_by_xor permutes values by exchanging values held by pairs of work-items identified ...
T shift_sub_group_right(sycl::sub_group g, T x, unsigned int delta, int logical_sub_group_size=32)
shift_sub_group_right moves values held by the work-items in a sub_group directly to another work-item in the sub-group.
T shift_sub_group_left(sycl::sub_group g, T x, unsigned int delta, int logical_sub_group_size=32)
shift_sub_group_left moves values held by the work-items in a sub_group directly to another work-item in the sub-group.
int ffs(T a)
Find position of first least significant set bit in an integer.
#define __SYCL_CONVERGENT__
_Abi const simd< _Tp, _Abi > & noexcept
linear_id_type get_local_linear_id() const
detail::complex_type< T > T2