DPC++ Runtime
Runtime libraries for oneAPI DPC++
queue_impl.cpp
Go to the documentation of this file.
1 //==------------------ queue_impl.cpp - SYCL queue -------------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include <CL/sycl/context.hpp>
11 #include <CL/sycl/detail/pi.hpp>
12 #include <CL/sycl/device.hpp>
13 #include <detail/event_impl.hpp>
14 #include <detail/queue_impl.hpp>
15 
16 #include <cstring>
17 #include <utility>
18 
19 #ifdef XPTI_ENABLE_INSTRUMENTATION
20 #include "xpti/xpti_trace_framework.hpp"
21 #include <detail/xpti_registry.hpp>
22 #include <sstream>
23 #endif
24 
26 namespace sycl {
27 namespace detail {
28 template <> cl_uint queue_impl::get_info<info::queue::reference_count>() const {
29  RT::PiResult result = PI_SUCCESS;
30  if (!is_host())
32  MQueues[0], PI_QUEUE_INFO_REFERENCE_COUNT, sizeof(result), &result,
33  nullptr);
34  return result;
35 }
36 
37 template <> context queue_impl::get_info<info::queue::context>() const {
38  return get_context();
39 }
40 
41 template <> device queue_impl::get_info<info::queue::device>() const {
42  return get_device();
43 }
44 
45 static event
46 prepareUSMEvent(const std::shared_ptr<detail::queue_impl> &QueueImpl,
47  RT::PiEvent NativeEvent) {
48  auto EventImpl = std::make_shared<detail::event_impl>(QueueImpl);
49  EventImpl->getHandleRef() = NativeEvent;
50  EventImpl->setContextImpl(detail::getSyclObjImpl(QueueImpl->get_context()));
51  return detail::createSyclObjFromImpl<event>(EventImpl);
52 }
53 
54 static event createDiscardedEvent() {
55  EventImplPtr EventImpl =
56  std::make_shared<event_impl>(event_impl::HES_Discarded);
57  return createSyclObjFromImpl<event>(EventImpl);
58 }
59 
60 event queue_impl::memset(const std::shared_ptr<detail::queue_impl> &Self,
61  void *Ptr, int Value, size_t Count,
62  const std::vector<event> &DepEvents) {
63  if (MHasDiscardEventsSupport) {
64  MemoryManager::fill_usm(Ptr, Self, Count, Value,
65  getOrWaitEvents(DepEvents, MContext), nullptr);
66  return createDiscardedEvent();
67  }
68  RT::PiEvent NativeEvent{};
69  MemoryManager::fill_usm(Ptr, Self, Count, Value,
70  getOrWaitEvents(DepEvents, MContext), &NativeEvent);
71 
72  if (MContext->is_host())
73  return MDiscardEvents ? createDiscardedEvent() : event();
74 
75  event ResEvent = prepareUSMEvent(Self, NativeEvent);
76  // Track only if we won't be able to handle it with piQueueFinish.
77  if (!MSupportOOO)
78  addSharedEvent(ResEvent);
79  return MDiscardEvents ? createDiscardedEvent() : ResEvent;
80 }
81 
82 event queue_impl::memcpy(const std::shared_ptr<detail::queue_impl> &Self,
83  void *Dest, const void *Src, size_t Count,
84  const std::vector<event> &DepEvents) {
85  if (MHasDiscardEventsSupport) {
86  MemoryManager::copy_usm(Src, Self, Count, Dest,
87  getOrWaitEvents(DepEvents, MContext), nullptr);
88  return createDiscardedEvent();
89  }
90  RT::PiEvent NativeEvent{};
91  MemoryManager::copy_usm(Src, Self, Count, Dest,
92  getOrWaitEvents(DepEvents, MContext), &NativeEvent);
93 
94  if (MContext->is_host())
95  return MDiscardEvents ? createDiscardedEvent() : event();
96 
97  event ResEvent = prepareUSMEvent(Self, NativeEvent);
98  // Track only if we won't be able to handle it with piQueueFinish.
99  if (!MSupportOOO)
100  addSharedEvent(ResEvent);
101  return MDiscardEvents ? createDiscardedEvent() : ResEvent;
102 }
103 
104 event queue_impl::mem_advise(const std::shared_ptr<detail::queue_impl> &Self,
105  const void *Ptr, size_t Length,
106  pi_mem_advice Advice,
107  const std::vector<event> &DepEvents) {
108  if (MHasDiscardEventsSupport) {
109  MemoryManager::advise_usm(Ptr, Self, Length, Advice,
110  getOrWaitEvents(DepEvents, MContext), nullptr);
111  return createDiscardedEvent();
112  }
113  RT::PiEvent NativeEvent{};
114  MemoryManager::advise_usm(Ptr, Self, Length, Advice,
115  getOrWaitEvents(DepEvents, MContext), &NativeEvent);
116 
117  if (MContext->is_host())
118  return MDiscardEvents ? createDiscardedEvent() : event();
119 
120  event ResEvent = prepareUSMEvent(Self, NativeEvent);
121  // Track only if we won't be able to handle it with piQueueFinish.
122  if (!MSupportOOO)
123  addSharedEvent(ResEvent);
124  return MDiscardEvents ? createDiscardedEvent() : ResEvent;
125 }
126 
127 void queue_impl::addEvent(const event &Event) {
128  EventImplPtr EImpl = getSyclObjImpl(Event);
129  assert(EImpl && "Event implementation is missing");
130  auto *Cmd = static_cast<Command *>(EImpl->getCommand());
131  if (!Cmd) {
132  // if there is no command on the event, we cannot track it with MEventsWeak
133  // as that will leave it with no owner. Track in MEventsShared only if we're
134  // unable to call piQueueFinish during wait.
135  if (is_host() || !MSupportOOO)
136  addSharedEvent(Event);
137  }
138  // As long as the queue supports piQueueFinish we only need to store events
139  // with command nodes in the following cases:
140  // 1. Unenqueued commands, since they aren't covered by piQueueFinish.
141  // 2. Kernels with streams, since they are not supported by post enqueue
142  // cleanup.
143  // 3. Host tasks, for both reasons.
144  else if (is_host() || !MSupportOOO || EImpl->getHandleRef() == nullptr ||
145  EImpl->needsCleanupAfterWait()) {
146  std::weak_ptr<event_impl> EventWeakPtr{EImpl};
147  std::lock_guard<std::mutex> Lock{MMutex};
148  MEventsWeak.push_back(std::move(EventWeakPtr));
149  }
150 }
151 
155 void queue_impl::addSharedEvent(const event &Event) {
156  assert(is_host() || !MSupportOOO);
157  std::lock_guard<std::mutex> Lock(MMutex);
158  // Events stored in MEventsShared are not released anywhere else aside from
159  // calls to queue::wait/wait_and_throw, which a user application might not
160  // make, and ~queue_impl(). If the number of events grows large enough,
161  // there's a good chance that most of them are already completed and ownership
162  // of them can be released.
163  const size_t EventThreshold = 128;
164  if (MEventsShared.size() >= EventThreshold) {
165  // Generally, the vector is ordered so that the oldest events are in the
166  // front and the newer events are in the end. So, search to find the first
167  // event that isn't yet complete. All the events prior to that can be
168  // erased. This could leave some few events further on that have completed
169  // not yet erased, but that is OK. This cleanup doesn't have to be perfect.
170  // This also keeps the algorithm linear rather than quadratic because it
171  // doesn't continually recheck things towards the back of the list that
172  // really haven't had time to complete.
173  MEventsShared.erase(
174  MEventsShared.begin(),
175  std::find_if(
176  MEventsShared.begin(), MEventsShared.end(), [](const event &E) {
177  return E.get_info<info::event::command_execution_status>() !=
178  info::event_command_status::complete;
179  }));
180  }
181  MEventsShared.push_back(Event);
182 }
183 
184 void *queue_impl::instrumentationProlog(const detail::code_location &CodeLoc,
185  std::string &Name, int32_t StreamID,
186  uint64_t &IId) {
187  void *TraceEvent = nullptr;
188  (void)CodeLoc;
189  (void)Name;
190  (void)StreamID;
191  (void)IId;
192 #ifdef XPTI_ENABLE_INSTRUMENTATION
193  xpti::trace_event_data_t *WaitEvent = nullptr;
194  if (!xptiTraceEnabled())
195  return TraceEvent;
196 
197  xpti::payload_t Payload;
198  bool HasSourceInfo = false;
199  // We try to create a unique string for the wait() call by combining it with
200  // the queue address
201  xpti::utils::StringHelper NG;
202  Name = NG.nameWithAddress<queue_impl *>("queue.wait", this);
203 
204  if (CodeLoc.fileName()) {
205  // We have source code location information
206  Payload =
207  xpti::payload_t(Name.c_str(), CodeLoc.fileName(), CodeLoc.lineNumber(),
208  CodeLoc.columnNumber(), (void *)this);
209  HasSourceInfo = true;
210  } else {
211  // We have no location information, so we'll use the address of the queue
212  Payload = xpti::payload_t(Name.c_str(), (void *)this);
213  }
214  // wait() calls could be at different user-code locations; We create a new
215  // event based on the code location info and if this has been seen before, a
216  // previously created event will be returned.
217  uint64_t QWaitInstanceNo = 0;
218  WaitEvent = xptiMakeEvent(Name.c_str(), &Payload, xpti::trace_graph_event,
219  xpti_at::active, &QWaitInstanceNo);
220  IId = QWaitInstanceNo;
221  if (WaitEvent) {
222  device D = get_device();
223  std::string DevStr;
224  if (D.is_host())
225  DevStr = "HOST";
226  else if (D.is_cpu())
227  DevStr = "CPU";
228  else if (D.is_gpu())
229  DevStr = "GPU";
230  else if (D.is_accelerator())
231  DevStr = "ACCELERATOR";
232  else
233  DevStr = "UNKNOWN";
234  xpti::addMetadata(WaitEvent, "sycl_device", DevStr);
235  if (HasSourceInfo) {
236  xpti::addMetadata(WaitEvent, "sym_function_name", CodeLoc.functionName());
237  xpti::addMetadata(WaitEvent, "sym_source_file_name", CodeLoc.fileName());
238  xpti::addMetadata(WaitEvent, "sym_line_no",
239  static_cast<int32_t>((CodeLoc.lineNumber())));
240  xpti::addMetadata(WaitEvent, "sym_column_no",
241  static_cast<int32_t>((CodeLoc.columnNumber())));
242  }
243  xptiNotifySubscribers(StreamID, xpti::trace_wait_begin, nullptr, WaitEvent,
244  QWaitInstanceNo,
245  static_cast<const void *>(Name.c_str()));
246  TraceEvent = (void *)WaitEvent;
247  }
248 #endif
249  return TraceEvent;
250 }
251 
252 void queue_impl::instrumentationEpilog(void *TelemetryEvent, std::string &Name,
253  int32_t StreamID, uint64_t IId) {
254  (void)TelemetryEvent;
255  (void)Name;
256  (void)StreamID;
257  (void)IId;
258 #ifdef XPTI_ENABLE_INSTRUMENTATION
259  if (!(xptiTraceEnabled() && TelemetryEvent))
260  return;
261  // Close the wait() scope
262  xpti::trace_event_data_t *TraceEvent =
263  (xpti::trace_event_data_t *)TelemetryEvent;
264  xptiNotifySubscribers(StreamID, xpti::trace_wait_end, nullptr, TraceEvent,
265  IId, static_cast<const void *>(Name.c_str()));
266 #endif
267 }
268 
269 void queue_impl::wait(const detail::code_location &CodeLoc) {
270  (void)CodeLoc;
271 #ifdef XPTI_ENABLE_INSTRUMENTATION
272  void *TelemetryEvent = nullptr;
273  uint64_t IId;
274  std::string Name;
275  int32_t StreamID = xptiRegisterStream(SYCL_STREAM_NAME);
276  TelemetryEvent = instrumentationProlog(CodeLoc, Name, StreamID, IId);
277 #endif
278 
279  std::vector<std::weak_ptr<event_impl>> WeakEvents;
280  std::vector<event> SharedEvents;
281  {
282  std::lock_guard<std::mutex> Lock(MMutex);
283  WeakEvents.swap(MEventsWeak);
284  SharedEvents.swap(MEventsShared);
285  }
286  // If the queue is either a host one or does not support OOO (and we use
287  // multiple in-order queues as a result of that), wait for each event
288  // directly. Otherwise, only wait for unenqueued or host task events, starting
289  // from the latest submitted task in order to minimize total amount of calls,
290  // then handle the rest with piQueueFinish.
291  const bool SupportsPiFinish = !is_host() && MSupportOOO;
292  for (auto EventImplWeakPtrIt = WeakEvents.rbegin();
293  EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) {
294  if (std::shared_ptr<event_impl> EventImplSharedPtr =
295  EventImplWeakPtrIt->lock()) {
296  // A nullptr PI event indicates that piQueueFinish will not cover it,
297  // either because it's a host task event or an unenqueued one.
298  if (!SupportsPiFinish || nullptr == EventImplSharedPtr->getHandleRef()) {
299  EventImplSharedPtr->wait(EventImplSharedPtr);
300  }
301  }
302  }
303  if (SupportsPiFinish) {
304  const detail::plugin &Plugin = getPlugin();
305  Plugin.call<detail::PiApiKind::piQueueFinish>(getHandleRef());
306  for (std::weak_ptr<event_impl> &EventImplWeakPtr : WeakEvents)
307  if (std::shared_ptr<event_impl> EventImplSharedPtr =
308  EventImplWeakPtr.lock())
309  if (EventImplSharedPtr->needsCleanupAfterWait())
310  EventImplSharedPtr->cleanupCommand(EventImplSharedPtr);
311  assert(SharedEvents.empty() && "Queues that support calling piQueueFinish "
312  "shouldn't have shared events");
313  } else {
314  for (event &Event : SharedEvents)
315  Event.wait();
316  }
317 #ifdef XPTI_ENABLE_INSTRUMENTATION
318  instrumentationEpilog(TelemetryEvent, Name, StreamID, IId);
319 #endif
320 }
321 
322 pi_native_handle queue_impl::getNative() const {
323  const detail::plugin &Plugin = getPlugin();
324  if (Plugin.getBackend() == backend::opencl)
325  Plugin.call<PiApiKind::piQueueRetain>(MQueues[0]);
326  pi_native_handle Handle{};
327  Plugin.call<PiApiKind::piextQueueGetNativeHandle>(MQueues[0], &Handle);
328  return Handle;
329 }
330 
331 } // namespace detail
332 } // namespace sycl
333 } // __SYCL_INLINE_NAMESPACE(cl)
event_impl.hpp
cl::sycl::detail::pi::getPlugin
const plugin & getPlugin()
Definition: pi.cpp:511
PI_SUCCESS
@ PI_SUCCESS
Definition: pi.h:86
cl::sycl::info::device
device
Definition: info_desc.hpp:53
cl::sycl::event
An event object can be used to synchronize memory transfers, enqueues of kernels and signaling barrie...
Definition: event.hpp:31
device.hpp
xpti_registry.hpp
_pi_mem_advice
_pi_mem_advice
Definition: pi.h:459
_pi_result
_pi_result
Definition: pi.h:85
piextQueueGetNativeHandle
pi_result piextQueueGetNativeHandle(pi_queue queue, pi_native_handle *nativeHandle)
Gets the native handle of a PI queue object.
Definition: pi_esimd_emulator.cpp:998
context.hpp
sycl
Definition: invoke_simd.hpp:68
queue_impl.hpp
cl::sycl::detail::code_location
Definition: common.hpp:54
pi.hpp
cl::sycl::detail::SYCL_STREAM_NAME
constexpr const char * SYCL_STREAM_NAME
Definition: xpti_registry.hpp:28
cl::sycl::detail::createDiscardedEvent
static event createDiscardedEvent()
Definition: queue_impl.cpp:54
cl::sycl::detail::memcpy
void memcpy(void *Dst, const void *Src, std::size_t Size)
cl::sycl::detail::plugin::getBackend
backend getBackend(void) const
Definition: plugin.hpp:229
cl::sycl::device
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Definition: device.hpp:35
cl::sycl::detail::EventImplPtr
std::shared_ptr< detail::event_impl > EventImplPtr
Definition: memory_manager.hpp:31
cl::sycl::detail::plugin::call
void call(ArgsT... Args) const
Calls the API, traces the call, checks the result.
Definition: plugin.hpp:217
piQueueRetain
pi_result piQueueRetain(pi_queue command_queue)
Definition: pi_esimd_emulator.cpp:962
cl::sycl::detail::Command
The Command class represents some action that needs to be performed on one or more memory objects.
Definition: commands.hpp:95
cl
We provide new interfaces for matrix multiply in this patch:
Definition: access.hpp:13
cl::sycl::detail::getOrWaitEvents
std::vector< RT::PiEvent > getOrWaitEvents(std::vector< cl::sycl::event > DepEvents, std::shared_ptr< cl::sycl::detail::context_impl > Context)
cl::sycl::detail::plugin
The plugin class provides a unified interface to the underlying low-level runtimes for the device-agn...
Definition: plugin.hpp:90
pi_native_handle
uintptr_t pi_native_handle
Definition: pi.h:76
piQueueFinish
pi_result piQueueFinish(pi_queue command_queue)
Definition: pi_esimd_emulator.cpp:984
cl::sycl::detail::prepareUSMEvent
static event prepareUSMEvent(const std::shared_ptr< detail::queue_impl > &QueueImpl, RT::PiEvent NativeEvent)
Definition: queue_impl.cpp:46
cl::sycl::detail::getSyclObjImpl
decltype(Obj::impl) getSyclObjImpl(const Obj &SyclObject)
Definition: common.hpp:198
piQueueGetInfo
pi_result piQueueGetInfo(pi_queue command_queue, pi_queue_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_esimd_emulator.cpp:958
_pi_event
PI Event mapping to CUevent.
Definition: pi_cuda.hpp:458
cl::sycl::context
The context class represents a SYCL context on which kernel functions may be executed.
Definition: context.hpp:35
cl::sycl::cl_uint
std::uint32_t cl_uint
Definition: aliases.hpp:83
PI_QUEUE_INFO_REFERENCE_COUNT
@ PI_QUEUE_INFO_REFERENCE_COUNT
Definition: pi.h:355
__SYCL_INLINE_NAMESPACE
#define __SYCL_INLINE_NAMESPACE(X)
Definition: defines_elementary.hpp:12
memory_manager.hpp