DPC++ Runtime
Runtime libraries for oneAPI DPC++
reduction.cpp
Go to the documentation of this file.
1 //==---------------- reduction.cpp - SYCL reduction ------------*- C++ -*---==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include <detail/queue_impl.hpp>
11 
13 namespace sycl {
14 namespace ext {
15 namespace oneapi {
16 namespace detail {
17 
18 // TODO: The algorithm of choosing the work-group size is definitely
19 // imperfect now and can be improved.
20 __SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
21  size_t &NWorkGroups) {
22  size_t WGSize = MaxWGSize;
23  if (NWorkItems <= WGSize) {
24  NWorkGroups = 1;
25  WGSize = NWorkItems;
26  } else {
27  NWorkGroups = NWorkItems / WGSize;
28  size_t Rem = NWorkItems % WGSize;
29  if (Rem != 0) {
30  // Let's suppose MaxWGSize = 128 and NWorkItems = (128+32).
31  // It seems better to have 5 groups 32 work-items each than 2 groups with
32  // 128 work-items in the 1st group and 32 work-items in the 2nd group.
33  size_t NWorkGroupsAlt = NWorkItems / Rem;
34  size_t RemAlt = NWorkItems % Rem;
35  if (RemAlt == 0 && NWorkGroupsAlt <= MaxWGSize) {
36  // Choose smaller uniform work-groups.
37  // The condition 'NWorkGroupsAlt <= MaxWGSize' was checked to ensure
38  // that choosing smaller groups will not cause the need in additional
39  // invocations of the kernel.
40  NWorkGroups = NWorkGroupsAlt;
41  WGSize = Rem;
42  } else {
43  // Add 1 more group to process the remaining elements and proceed
44  // with bigger non-uniform work-groups
45  NWorkGroups++;
46  }
47  }
48  }
49  return WGSize;
50 }
51 
52 // Returns the estimated number of physical threads on the device associated
53 // with the given queue.
54 __SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(
55  std::shared_ptr<sycl::detail::queue_impl> Queue) {
56  device Dev = Queue->get_device();
57  uint32_t NumThreads = Dev.get_info<info::device::max_compute_units>();
58  // TODO: The heuristics here require additional tuning for various devices
59  // and vendors. For now this code assumes that execution units have about
60  // 8 working threads, which gives good results on some known/supported
61  // GPU devices.
62  if (Dev.is_gpu())
63  NumThreads *= 8;
64  return NumThreads;
65 }
66 
67 __SYCL_EXPORT size_t
68 reduGetMaxWGSize(std::shared_ptr<sycl::detail::queue_impl> Queue,
69  size_t LocalMemBytesPerWorkItem) {
70  device Dev = Queue->get_device();
71  size_t MaxWGSize = Dev.get_info<info::device::max_work_group_size>();
72  size_t WGSizePerMem = MaxWGSize * 2;
73  size_t WGSize = MaxWGSize;
74  if (LocalMemBytesPerWorkItem != 0) {
75  size_t MemSize = Dev.get_info<info::device::local_mem_size>();
76  WGSizePerMem = MemSize / LocalMemBytesPerWorkItem;
77 
78  // If the work group size is NOT power of two, then an additional element
79  // in local memory is needed for the reduction algorithm and thus the real
80  // work-group size requirement per available memory is stricter.
81  if ((WGSizePerMem & (WGSizePerMem - 1)) != 0)
82  WGSizePerMem--;
83  WGSize = (std::min)(WGSizePerMem, WGSize);
84  }
85  // TODO: This is a temporary workaround for a big problem of detecting
86  // the maximal usable work-group size. The detection method used above
87  // is based on maximal work-group size possible on the device is too risky
88  // as may return too big value. Even though it also tries using the memory
89  // factor into consideration, it is too rough estimation. For example,
90  // if (WGSize * LocalMemBytesPerWorkItem) is equal to local_mem_size, then
91  // the reduction local accessor takes all available local memory for it needs
92  // not leaving any local memory for other kernel needs (barriers,
93  // builtin calls, etc), which often leads to crushes with CL_OUT_OF_RESOURCES
94  // error, or in even worse cases it may cause silent writes/clobbers of
95  // the local memory assigned to one work-group by code in another work-group.
96  // It seems the only good solution for this work-group detection problem is
97  // kernel precompilation and querying the kernel properties.
98  if (WGSize >= 4) {
99  // Let's return a twice smaller number, but... do that only if the kernel
100  // is limited by memory, or the kernel uses opencl:cpu backend, which
101  // surprisingly uses lots of resources to run the kernels with reductions
102  // and often causes CL_OUT_OF_RESOURCES error even when reduction
103  // does not use local accessors.
104  if (WGSizePerMem < MaxWGSize * 2 ||
105  (Queue->get_device().is_cpu() &&
106  Queue->get_device().get_platform().get_backend() == backend::opencl))
107  WGSize /= 2;
108  }
109 
110  return WGSize;
111 }
112 
113 } // namespace detail
114 } // namespace oneapi
115 } // namespace ext
116 
117 namespace __SYCL2020_DEPRECATED("use 'ext::oneapi' instead") ONEAPI {
118  using namespace ext::oneapi;
119  namespace detail {
  // Deprecated ONEAPI-namespace shim: forwards to the ext::oneapi
  // implementation so existing callers of the old symbol keep working.
  __SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
                                         size_t &NWorkGroups) {
    return ext::oneapi::detail::reduComputeWGSize(NWorkItems, MaxWGSize,
                                                  NWorkGroups);
  }
125 
126  __SYCL_EXPORT size_t
127  reduGetMaxWGSize(std::shared_ptr<sycl::detail::queue_impl> Queue,
128  size_t LocalMemBytesPerWorkItem) {
130  LocalMemBytesPerWorkItem);
131  }
132  } // namespace detail
133 } // namespace ONEAPI
134 } // namespace sycl
135 } // __SYCL_INLINE_NAMESPACE(cl)
__SYCL2020_DEPRECATED
#define __SYCL2020_DEPRECATED(message)
Definition: defines_elementary.hpp:56
cl::sycl::device::is_gpu
bool is_gpu() const
Check if device is a GPU device.
Definition: device.cpp:106
sycl
Definition: invoke_simd.hpp:68
queue_impl.hpp
cl::sycl::ext::oneapi::detail::reduGetMaxNumConcurrentWorkGroups
uint32_t reduGetMaxNumConcurrentWorkGroups(std::shared_ptr< sycl::detail::queue_impl > Queue)
Definition: reduction.cpp:54
cl::sycl::device
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Definition: device.hpp:35
cl
We provide new interfaces for matrix multiply in this patch:
Definition: access.hpp:13
reduction.hpp
cl::sycl::device::get_info
info::param_traits< info::device, param >::return_type get_info() const
Queries this SYCL device for information requested by the template parameter param.
Definition: device.cpp:147
cl::sycl::ext::oneapi::detail::reduGetMaxWGSize
size_t reduGetMaxWGSize(std::shared_ptr< sycl::detail::queue_impl > Queue, size_t LocalMemBytesPerWorkItem)
Definition: reduction.cpp:68
cl::sycl::instead
std::uint8_t instead
Definition: aliases.hpp:68
cl::sycl::ext::oneapi::detail::reduComputeWGSize
size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize, size_t &NWorkGroups)
Definition: reduction.cpp:20
__SYCL_INLINE_NAMESPACE
#define __SYCL_INLINE_NAMESPACE(X)
Definition: defines_elementary.hpp:12