DPC++ Runtime
Runtime libraries for oneAPI DPC++
reduction.cpp
//==---------------- reduction.cpp - SYCL reduction ------------*- C++ -*---==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <detail/config.hpp>
#include <detail/event_impl.hpp>
#include <detail/queue_impl.hpp>
#include <sycl/reduction.hpp>

namespace sycl {
inline namespace _V1 {
namespace detail {

// TODO: The algorithm for choosing the work-group size is definitely
// imperfect now and can be improved.
__SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
                                       size_t &NWorkGroups) {
  size_t WGSize = MaxWGSize;
  if (NWorkItems <= WGSize) {
    NWorkGroups = 1;
    WGSize = NWorkItems;
  } else {
    NWorkGroups = NWorkItems / WGSize;
    size_t Rem = NWorkItems % WGSize;
    if (Rem != 0) {
      // Let's suppose MaxWGSize = 128 and NWorkItems = (128+32).
      // It seems better to have 5 groups of 32 work-items each than 2 groups
      // with 128 work-items in the 1st group and 32 work-items in the 2nd.
      size_t NWorkGroupsAlt = NWorkItems / Rem;
      size_t RemAlt = NWorkItems % Rem;
      if (RemAlt == 0 && NWorkGroupsAlt <= MaxWGSize) {
        // Choose smaller uniform work-groups.
        // The condition 'NWorkGroupsAlt <= MaxWGSize' was checked to ensure
        // that choosing smaller groups will not create the need for
        // additional invocations of the kernel.
        NWorkGroups = NWorkGroupsAlt;
        WGSize = Rem;
      } else {
        // Add 1 more group to process the remaining elements and proceed
        // with bigger non-uniform work-groups.
        NWorkGroups++;
      }
    }
  }
  return WGSize;
}
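
// Worked example (hypothetical caller, not part of this file): with
// NWorkItems = 160 and MaxWGSize = 128, the first pass gives NWorkGroups = 1
// and Rem = 32; the alternative split NWorkGroupsAlt = 160 / 32 = 5 divides
// evenly (RemAlt == 0) and 5 <= 128, so the function returns WGSize = 32 with
// NWorkGroups = 5 uniform groups. With NWorkItems = 200 instead, Rem = 72
// does not divide 200 evenly, so the non-uniform path is taken: WGSize stays
// 128 and NWorkGroups becomes 2.
//
//   size_t NWorkGroups = 0;
//   size_t WGSize = reduComputeWGSize(/*NWorkItems=*/160, /*MaxWGSize=*/128,
//                                     NWorkGroups); // WGSize == 32, 5 groups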

// Returns the estimated number of physical threads on the device associated
// with the given queue.
__SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(
    std::shared_ptr<sycl::detail::queue_impl> Queue) {
  // TODO: The graphs extension explicit API uses a handler with no queue
  // attached, so return some value here. In the future we should have access
  // to the device so we can remove this.
  //
  // The hardcoded value of 8 was chosen because it is the value returned for
  // sycl::info::device::max_compute_units on the Intel HD Graphics devices
  // used with the Level Zero backend during development.
  if (Queue == nullptr) {
    return 8;
  }
  device Dev = Queue->get_device();
  uint32_t NumThreads = Dev.get_info<sycl::info::device::max_compute_units>();
  // TODO: The heuristics here require additional tuning for various devices
  // and vendors. Also, it would be better to check vendor/generation/etc.
  if (Dev.is_gpu() && Dev.get_info<sycl::info::device::host_unified_memory>())
    NumThreads *= 8;
  return NumThreads;
}
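
// Illustration (device numbers are assumptions, not measurements): a discrete
// GPU reporting 64 compute units yields 64, while an integrated GPU with
// host-unified memory reporting 24 compute units yields 24 * 8 = 192,
// reflecting the heuristic that such devices can keep several work-groups in
// flight per compute unit.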
74 
75 __SYCL_EXPORT size_t
76 reduGetMaxWGSize(std::shared_ptr<sycl::detail::queue_impl> Queue,
77  size_t LocalMemBytesPerWorkItem) {
78  device Dev = Queue->get_device();
79  size_t MaxWGSize = Dev.get_info<sycl::info::device::max_work_group_size>();
80 
81  size_t WGSizePerMem = MaxWGSize * 2;
82  size_t WGSize = MaxWGSize;
83  if (LocalMemBytesPerWorkItem != 0) {
84  size_t MemSize = Dev.get_info<sycl::info::device::local_mem_size>();
85  WGSizePerMem = MemSize / LocalMemBytesPerWorkItem;
86 
87  // If the work group size is NOT power of two, then an additional element
88  // in local memory is needed for the reduction algorithm and thus the real
89  // work-group size requirement per available memory is stricter.
90  if ((WGSizePerMem & (WGSizePerMem - 1)) != 0)
91  WGSizePerMem--;
92  WGSize = (std::min)(WGSizePerMem, WGSize);
93  }
94  // TODO: This is a temporary workaround for a big problem of detecting
95  // the maximal usable work-group size. The detection method used above
96  // is based on maximal work-group size possible on the device is too risky
97  // as may return too big value. Even though it also tries using the memory
98  // factor into consideration, it is too rough estimation. For example,
99  // if (WGSize * LocalMemBytesPerWorkItem) is equal to local_mem_size, then
100  // the reduction local accessor takes all available local memory for it needs
101  // not leaving any local memory for other kernel needs (barriers,
102  // builtin calls, etc), which often leads to crushes with CL_OUT_OF_RESOURCES
103  // error, or in even worse cases it may cause silent writes/clobbers of
104  // the local memory assigned to one work-group by code in another work-group.
105  // It seems the only good solution for this work-group detection problem is
106  // kernel precompilation and querying the kernel properties.
107  if (WGSize >= 4 && WGSizePerMem < MaxWGSize * 2) {
108  // Let's return a twice smaller number, but... do that only if the kernel
109  // is limited by memory.
110  WGSize /= 2;
111  }
112 
113  return WGSize;
114 }
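
// Worked example (assumed device numbers): with max_work_group_size = 256,
// local_mem_size = 64 KiB, and LocalMemBytesPerWorkItem = 512, the memory
// bound is WGSizePerMem = 65536 / 512 = 128 (a power of two, so no
// decrement), giving WGSize = min(128, 256) = 128. Since WGSize >= 4 and
// WGSizePerMem (128) < MaxWGSize * 2 (512), the memory-limited result is
// halved to 64 to leave headroom for other local-memory users.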

__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
                                            size_t LocalMemBytesPerWorkItem) {
  // TODO: The graphs extension explicit API uses a handler with a null queue
  // to process CGFs; in the future we should have access to the device so we
  // can calculate this correctly.
  //
  // The hardcoded value of 32 was chosen because it is the value returned for
  // SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE on the Intel HD Graphics devices
  // used with the Level Zero backend during development.
  if (Queue == nullptr) {
    return 32;
  }
  device Dev = Queue->get_device();

  // The maximum WGSize returned by CPU devices is very large and does not
  // help the reduction implementation: since all work associated with a
  // work-group is typically assigned to one CPU thread, selecting a large
  // work-group size unnecessarily increases the number of accumulators.
  // The default of 16 was chosen based on empirical benchmarking results;
  // an environment variable is provided to allow users to override this
  // behavior.
  using PrefWGConfig = sycl::detail::SYCLConfig<
      sycl::detail::SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE>;
  if (Dev.is_cpu()) {
    size_t CPUMaxWGSize = PrefWGConfig::get(sycl::info::device_type::cpu);
    if (CPUMaxWGSize == 0)
      return 16;
    size_t DevMaxWGSize =
        Dev.get_info<sycl::info::device::max_work_group_size>();
    return std::min(CPUMaxWGSize, DevMaxWGSize);
  }

  // If the user has specified an explicit preferred work-group size, use
  // that.
  if (Dev.is_gpu() && PrefWGConfig::get(sycl::info::device_type::gpu)) {
    size_t DevMaxWGSize =
        Dev.get_info<sycl::info::device::max_work_group_size>();
    return std::min(PrefWGConfig::get(sycl::info::device_type::gpu),
                    DevMaxWGSize);
  }

  if (Dev.is_accelerator() &&
      PrefWGConfig::get(sycl::info::device_type::accelerator)) {
    size_t DevMaxWGSize =
        Dev.get_info<sycl::info::device::max_work_group_size>();
    return std::min(PrefWGConfig::get(sycl::info::device_type::accelerator),
                    DevMaxWGSize);
  }

  // Otherwise, use the maximum work-group size.
  return reduGetMaxWGSize(Queue, LocalMemBytesPerWorkItem);
}
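
// Resolution order, illustrated with assumed values (see the
// SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE documentation for the exact
// accepted syntax of the environment variable): on a CPU device with no
// override configured the function returns the default of 16; with a CPU
// override of 32 and a device maximum of 8192 it returns min(32, 8192) = 32;
// on a GPU with no override it falls through to reduGetMaxWGSize(), which
// additionally accounts for local-memory pressure.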

__SYCL_EXPORT void
addCounterInit(handler &CGH, std::shared_ptr<sycl::detail::queue_impl> &Queue,
               std::shared_ptr<int> &Counter) {
  auto EventImpl = std::make_shared<detail::event_impl>(Queue);
  EventImpl->setContextImpl(detail::getSyclObjImpl(Queue->get_context()));
  EventImpl->setStateIncomplete();
  MemoryManager::fill_usm(Counter.get(), Queue, sizeof(int), {0}, {},
                          &EventImpl->getHandleRef(), EventImpl);
  CGH.depends_on(createSyclObjFromImpl<event>(EventImpl));
}
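
// In effect, addCounterInit zero-initializes the USM-allocated counter used
// by the reduction implementation via an asynchronous fill and registers the
// fill's event as a dependency of the command group, so the reduction kernel
// starts only after the counter is initialized. A hypothetical call site
// (not part of this file) would look like:
//   detail::addCounterInit(CGH, QueueImpl, Counter); // Counter: USM int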

} // namespace detail
} // namespace _V1
} // namespace sycl