DPC++ Runtime
Runtime libraries for oneAPI DPC++
reduction.cpp
//==---------------- reduction.cpp - SYCL reduction ------------*- C++ -*---==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <detail/config.hpp>
#include <detail/queue_impl.hpp>
#include <sycl/reduction.hpp>

namespace sycl {
inline namespace _V1 {
namespace detail {

// TODO: The algorithm for choosing the work-group size is definitely
// imperfect now and can be improved.
__SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
                                       size_t &NWorkGroups) {
  size_t WGSize = MaxWGSize;
  if (NWorkItems <= WGSize) {
    NWorkGroups = 1;
    WGSize = NWorkItems;
  } else {
    NWorkGroups = NWorkItems / WGSize;
    size_t Rem = NWorkItems % WGSize;
    if (Rem != 0) {
      // Let's suppose MaxWGSize = 128 and NWorkItems = (128+32).
      // It seems better to have 5 groups of 32 work-items each than 2 groups
      // with 128 work-items in the 1st group and 32 work-items in the 2nd.
      size_t NWorkGroupsAlt = NWorkItems / Rem;
      size_t RemAlt = NWorkItems % Rem;
      if (RemAlt == 0 && NWorkGroupsAlt <= MaxWGSize) {
        // Choose smaller uniform work-groups. The condition
        // 'NWorkGroupsAlt <= MaxWGSize' was checked to ensure that choosing
        // smaller groups will not create the need for additional invocations
        // of the kernel.
        NWorkGroups = NWorkGroupsAlt;
        WGSize = Rem;
      } else {
        // Add 1 more group to process the remaining elements and proceed
        // with bigger non-uniform work-groups.
        NWorkGroups++;
      }
    }
  }
  return WGSize;
}

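// Illustration (not part of the runtime; hypothetical inputs): tracing the
// heuristic above with NWorkItems = 160 and MaxWGSize = 128:
//   NWorkGroups    = 160 / 128 = 1,  Rem    = 160 % 128 = 32
//   NWorkGroupsAlt = 160 /  32 = 5,  RemAlt = 160 %  32 = 0
// RemAlt == 0 and NWorkGroupsAlt (5) <= MaxWGSize (128), so the function
// returns WGSize = 32 with NWorkGroups = 5: five uniform groups of 32 rather
// than one group of 128 plus a non-uniform trailing group of 32.
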
// Returns the estimated number of physical threads on the device associated
// with the given queue.
__SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(
    std::shared_ptr<sycl::detail::queue_impl> Queue) {
  // TODO: The explicit API of the Graphs extension uses a handler with no
  // queue attached, so return some value here. In the future we should have
  // access to the device, so this can be removed.
  //
  // The value 8 was chosen as the hardcoded fallback because it is the value
  // returned for sycl::info::device::max_compute_units on the Intel HD
  // Graphics devices used with the L0 backend during development.
  if (Queue == nullptr) {
    return 8;
  }
  device Dev = Queue->get_device();
  uint32_t NumThreads = Dev.get_info<sycl::info::device::max_compute_units>();
  // TODO: The heuristics here require additional tuning for various devices
  // and vendors. Also, it would be better to check vendor/generation/etc.
  if (Dev.is_gpu() && Dev.get_info<sycl::info::device::host_unified_memory>())
    NumThreads *= 8;
  return NumThreads;
}

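// Illustration (hypothetical device properties): an integrated GPU reporting
// max_compute_units = 24 and host_unified_memory = true would be estimated at
// 24 * 8 = 192 concurrent work-groups; a discrete GPU or a CPU reporting the
// same compute-unit count would keep the unscaled estimate of 24.
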
__SYCL_EXPORT size_t
reduGetMaxWGSize(std::shared_ptr<sycl::detail::queue_impl> Queue,
                 size_t LocalMemBytesPerWorkItem) {
  device Dev = Queue->get_device();
  size_t MaxWGSize = Dev.get_info<sycl::info::device::max_work_group_size>();

  size_t WGSizePerMem = MaxWGSize * 2;
  size_t WGSize = MaxWGSize;
  if (LocalMemBytesPerWorkItem != 0) {
    size_t MemSize = Dev.get_info<sycl::info::device::local_mem_size>();
    WGSizePerMem = MemSize / LocalMemBytesPerWorkItem;

    // If the work-group size is NOT a power of two, then an additional
    // element in local memory is needed for the reduction algorithm, and thus
    // the real work-group size requirement per available memory is stricter.
    if ((WGSizePerMem & (WGSizePerMem - 1)) != 0)
      WGSizePerMem--;
    WGSize = (std::min)(WGSizePerMem, WGSize);
  }
  // TODO: This is a temporary workaround for a bigger problem: detecting the
  // maximal usable work-group size. The detection method used above, which
  // starts from the maximal work-group size possible on the device, is risky
  // because it may return too big a value. Even though it tries to take the
  // memory factor into consideration, the estimate is too rough. For example,
  // if (WGSize * LocalMemBytesPerWorkItem) is equal to local_mem_size, then
  // the reduction's local accessor takes all available local memory for its
  // needs, leaving no local memory for other kernel needs (barriers, builtin
  // calls, etc.), which often leads to crashes with a CL_OUT_OF_RESOURCES
  // error, or in even worse cases may cause silent writes/clobbers of the
  // local memory assigned to one work-group by code in another work-group.
  // It seems the only good solution for this work-group size detection
  // problem is kernel precompilation and querying the kernel properties.
  if (WGSize >= 4 && WGSizePerMem < MaxWGSize * 2) {
    // Let's return a twice smaller number, but... do that only if the kernel
    // is limited by memory.
    WGSize /= 2;
  }

  return WGSize;
}

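// Illustration (hypothetical device properties): with local_mem_size = 65536,
// LocalMemBytesPerWorkItem = 256 and max_work_group_size = 512:
//   WGSizePerMem = 65536 / 256 = 256 (a power of two, so no decrement)
//   WGSize       = min(256, 512) = 256
// Since WGSize >= 4 and WGSizePerMem (256) < MaxWGSize * 2 (1024), the kernel
// is considered memory-limited and the function returns 256 / 2 = 128.
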
__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
                                            size_t LocalMemBytesPerWorkItem) {
  // TODO: The explicit API of the Graphs extension uses a handler with a null
  // queue to process CGFs; in the future we should have access to the device
  // so we can calculate this correctly.
  //
  // The value 32 was chosen as the hardcoded fallback because it is the value
  // returned for SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE on the Intel HD
  // Graphics devices used with the L0 backend during development.
  if (Queue == nullptr) {
    return 32;
  }
  device Dev = Queue->get_device();

  // The maximum WGSize returned by CPU devices is very large and does not
  // help the reduction implementation: since all work associated with a
  // work-group is typically assigned to one CPU thread, selecting a large
  // work-group size unnecessarily increases the number of accumulators.
  // The default of 16 was chosen based on empirical benchmarking results;
  // an environment variable is provided to allow users to override this
  // behavior.
  using PrefWGConfig = sycl::detail::SYCLConfig<
      sycl::detail::SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE>;
  if (Dev.is_cpu()) {
    size_t CPUMaxWGSize = PrefWGConfig::get(sycl::info::device_type::cpu);
    if (CPUMaxWGSize == 0)
      return 16;
    size_t DevMaxWGSize =
        Dev.get_info<sycl::info::device::max_work_group_size>();
    return std::min(CPUMaxWGSize, DevMaxWGSize);
  }

  // If the user has specified an explicit preferred work-group size, we use
  // that.
  if (Dev.is_gpu() && PrefWGConfig::get(sycl::info::device_type::gpu)) {
    size_t DevMaxWGSize =
        Dev.get_info<sycl::info::device::max_work_group_size>();
    return std::min(PrefWGConfig::get(sycl::info::device_type::gpu),
                    DevMaxWGSize);
  }

  if (Dev.is_accelerator() &&
      PrefWGConfig::get(sycl::info::device_type::accelerator)) {
    size_t DevMaxWGSize =
        Dev.get_info<sycl::info::device::max_work_group_size>();
    return std::min(PrefWGConfig::get(sycl::info::device_type::accelerator),
                    DevMaxWGSize);
  }

  // Use the maximum work-group size otherwise.
  return reduGetMaxWGSize(Queue, LocalMemBytesPerWorkItem);
}

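// Illustration (the exact environment-variable syntax is an assumption based
// on the per-device-type lookups above): with a setting along the lines of
// SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE=cpu:32,gpu:256, a CPU device would
// return min(32, max_work_group_size) and a GPU min(256, max_work_group_size);
// with the variable unset, a CPU falls back to 16 while GPUs and accelerators
// fall through to reduGetMaxWGSize(Queue, LocalMemBytesPerWorkItem).
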
} // namespace detail
} // namespace _V1
} // namespace sycl
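
// Illustration (not part of this file): a minimal user-level SYCL 2020
// reduction whose launch configuration the helpers above ultimately size.
// This is a sketch; the queue, buffer size, and binary operation are
// arbitrary choices.
//
//   #include <sycl/sycl.hpp>
//
//   int main() {
//     sycl::queue Q;
//     sycl::buffer<int> In{1024};   // assumed to be filled elsewhere
//     sycl::buffer<int> Sum{1};
//     Q.submit([&](sycl::handler &CGH) {
//       sycl::accessor InAcc{In, CGH, sycl::read_only};
//       auto Red = sycl::reduction(Sum, CGH, sycl::plus<int>());
//       CGH.parallel_for(sycl::range<1>{1024}, Red,
//                        [=](sycl::id<1> I, auto &Acc) { Acc += InAcc[I]; });
//     });
//   }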