DPC++ Runtime
Runtime libraries for oneAPI DPC++
reduction.cpp
Go to the documentation of this file.
1 //==---------------- reduction.cpp - SYCL reduction ------------*- C++ -*---==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include <detail/config.hpp>
10 #include <detail/queue_impl.hpp>
11 #include <sycl/reduction.hpp>
12 
13 namespace sycl {
15 namespace detail {
16 
17 // TODO: The algorithm of choosing the work-group size is definitely
18 // imperfect now and can be improved.
19 __SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
20  size_t &NWorkGroups) {
21  size_t WGSize = MaxWGSize;
22  if (NWorkItems <= WGSize) {
23  NWorkGroups = 1;
24  WGSize = NWorkItems;
25  } else {
26  NWorkGroups = NWorkItems / WGSize;
27  size_t Rem = NWorkItems % WGSize;
28  if (Rem != 0) {
29  // Let's suppose MaxWGSize = 128 and NWorkItems = (128+32).
30  // It seems better to have 5 groups 32 work-items each than 2 groups with
31  // 128 work-items in the 1st group and 32 work-items in the 2nd group.
32  size_t NWorkGroupsAlt = NWorkItems / Rem;
33  size_t RemAlt = NWorkItems % Rem;
34  if (RemAlt == 0 && NWorkGroupsAlt <= MaxWGSize) {
35  // Choose smaller uniform work-groups.
36  // The condition 'NWorkGroupsAlt <= MaxWGSize' was checked to ensure
37  // that choosing smaller groups will not cause the need in additional
38  // invocations of the kernel.
39  NWorkGroups = NWorkGroupsAlt;
40  WGSize = Rem;
41  } else {
42  // Add 1 more group to process the remaining elements and proceed
43  // with bigger non-uniform work-groups
44  NWorkGroups++;
45  }
46  }
47  }
48  return WGSize;
49 }
50 
51 // Returns the estimated number of physical threads on the device associated
52 // with the given queue.
53 __SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(
54  std::shared_ptr<sycl::detail::queue_impl> Queue) {
55  device Dev = Queue->get_device();
56  uint32_t NumThreads = Dev.get_info<sycl::info::device::max_compute_units>();
57  // TODO: The heuristics here require additional tuning for various devices
58  // and vendors. Also, it would be better to check vendor/generation/etc.
59  if (Dev.is_gpu() && Dev.get_info<sycl::info::device::host_unified_memory>())
60  NumThreads *= 8;
61  return NumThreads;
62 }
63 
64 __SYCL_EXPORT size_t
65 reduGetMaxWGSize(std::shared_ptr<sycl::detail::queue_impl> Queue,
66  size_t LocalMemBytesPerWorkItem) {
67  device Dev = Queue->get_device();
68  size_t MaxWGSize = Dev.get_info<sycl::info::device::max_work_group_size>();
69 
70  size_t WGSizePerMem = MaxWGSize * 2;
71  size_t WGSize = MaxWGSize;
72  if (LocalMemBytesPerWorkItem != 0) {
73  size_t MemSize = Dev.get_info<sycl::info::device::local_mem_size>();
74  WGSizePerMem = MemSize / LocalMemBytesPerWorkItem;
75 
76  // If the work group size is NOT power of two, then an additional element
77  // in local memory is needed for the reduction algorithm and thus the real
78  // work-group size requirement per available memory is stricter.
79  if ((WGSizePerMem & (WGSizePerMem - 1)) != 0)
80  WGSizePerMem--;
81  WGSize = (std::min)(WGSizePerMem, WGSize);
82  }
83  // TODO: This is a temporary workaround for a big problem of detecting
84  // the maximal usable work-group size. The detection method used above
85  // is based on maximal work-group size possible on the device is too risky
86  // as may return too big value. Even though it also tries using the memory
87  // factor into consideration, it is too rough estimation. For example,
88  // if (WGSize * LocalMemBytesPerWorkItem) is equal to local_mem_size, then
89  // the reduction local accessor takes all available local memory for it needs
90  // not leaving any local memory for other kernel needs (barriers,
91  // builtin calls, etc), which often leads to crushes with CL_OUT_OF_RESOURCES
92  // error, or in even worse cases it may cause silent writes/clobbers of
93  // the local memory assigned to one work-group by code in another work-group.
94  // It seems the only good solution for this work-group detection problem is
95  // kernel precompilation and querying the kernel properties.
96  if (WGSize >= 4 && WGSizePerMem < MaxWGSize * 2) {
97  // Let's return a twice smaller number, but... do that only if the kernel
98  // is limited by memory.
99  WGSize /= 2;
100  }
101 
102  return WGSize;
103 }
104 
105 __SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
106  size_t LocalMemBytesPerWorkItem) {
107  device Dev = Queue->get_device();
108 
109  // The maximum WGSize returned by CPU devices is very large and does not
110  // help the reduction implementation: since all work associated with a
111  // work-group is typically assigned to one CPU thread, selecting a large
112  // work-group size unnecessarily increases the number of accumulators.
113  // The default of 16 was chosen based on empirical benchmarking results;
114  // an environment variable is provided to allow users to override this
115  // behavior.
116  using PrefWGConfig = sycl::detail::SYCLConfig<
117  sycl::detail::SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE>;
118  if (Dev.is_cpu()) {
119  size_t CPUMaxWGSize = PrefWGConfig::get(sycl::info::device_type::cpu);
120  if (CPUMaxWGSize == 0)
121  return 16;
122  size_t DevMaxWGSize =
123  Dev.get_info<sycl::info::device::max_work_group_size>();
124  return std::min(CPUMaxWGSize, DevMaxWGSize);
125  }
126 
127  // If the user has specified an explicit preferred work-group size we use
128  // that.
129  if (Dev.is_gpu() && PrefWGConfig::get(sycl::info::device_type::gpu)) {
130  size_t DevMaxWGSize =
131  Dev.get_info<sycl::info::device::max_work_group_size>();
132  return std::min(PrefWGConfig::get(sycl::info::device_type::gpu),
133  DevMaxWGSize);
134  }
135 
136  if (Dev.is_accelerator() &&
137  PrefWGConfig::get(sycl::info::device_type::accelerator)) {
138  size_t DevMaxWGSize =
139  Dev.get_info<sycl::info::device::max_work_group_size>();
140  return std::min(PrefWGConfig::get(sycl::info::device_type::accelerator),
141  DevMaxWGSize);
142  }
143 
144  // Use the maximum work-group size otherwise.
145  return reduGetMaxWGSize(Queue, LocalMemBytesPerWorkItem);
146 }
147 
148 } // namespace detail
149 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
150 } // namespace sycl
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Definition: device.hpp:49
bool is_accelerator() const
Check if device is an accelerator device.
Definition: device.cpp:87
bool is_gpu() const
Check if device is a GPU device.
Definition: device.cpp:85
detail::is_device_info_desc< Param >::return_type get_info() const
Queries this SYCL device for information requested by the template parameter param.
Definition: device.cpp:126
bool is_cpu() const
Check if device is a CPU device.
Definition: device.cpp:83
#define __SYCL_INLINE_VER_NAMESPACE(X)
constexpr tuple_element< I, tuple< Types... > >::type & get(sycl::detail::tuple< Types... > &Arg) noexcept
Definition: tuple.hpp:199
uint32_t reduGetMaxNumConcurrentWorkGroups(std::shared_ptr< sycl::detail::queue_impl > Queue)
Definition: reduction.cpp:53
size_t reduGetMaxWGSize(std::shared_ptr< sycl::detail::queue_impl > Queue, size_t LocalMemBytesPerWorkItem)
Definition: reduction.cpp:65
size_t reduGetPreferredWGSize(std::shared_ptr< queue_impl > &Queue, size_t LocalMemBytesPerWorkItem)
Definition: reduction.cpp:105
size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize, size_t &NWorkGroups)
Definition: reduction.cpp:19
---— Error handling, matching OpenCL plugin semantics.
Definition: access.hpp:14