DPC++ Runtime
Runtime libraries for oneAPI DPC++
queue_impl.cpp
//==------------------ queue_impl.cpp - SYCL queue -------------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <detail/event_impl.hpp>
#include <detail/memory_manager.hpp>
#include <detail/queue_impl.hpp>
#include <sycl/context.hpp>
#include <sycl/detail/common.hpp>
#include <sycl/detail/ur.hpp>
#include <sycl/device.hpp>

#include <cstring>
#include <utility>

#ifdef XPTI_ENABLE_INSTRUMENTATION
#include "xpti/xpti_trace_framework.hpp"
#include <detail/xpti_registry.hpp>
#include <sstream>
#endif

namespace sycl {
inline namespace _V1 {
namespace detail {
// Treat 0 as reserved for host task traces
std::atomic<unsigned long long> queue_impl::MNextAvailableQueueID = 1;

thread_local bool NestedCallsDetector = false;
class NestedCallsTracker {
public:
  NestedCallsTracker() {
    if (NestedCallsDetector)
      throw sycl::exception(
          make_error_code(errc::invalid),
          "Calls to sycl::queue::submit cannot be nested. Command group "
          "function objects should use the sycl::handler API instead.");
    NestedCallsDetector = true;
  }

  ~NestedCallsTracker() { NestedCallsDetector = false; }
};
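// Illustrative sketch (not part of the runtime): the pattern this RAII guard
// rejects. The constructor trips on the thread-local flag set by an enclosing
// submission; the destructor clears it so later, non-nested submits succeed.
//   Q.submit([&](sycl::handler &CGH) {
//     Q.submit([&](sycl::handler &) { /* ... */ }); // throws: nested submit
//   });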

static std::vector<ur_event_handle_t>
getUrEvents(const std::vector<sycl::event> &DepEvents) {
  std::vector<ur_event_handle_t> RetUrEvents;
  for (const sycl::event &Event : DepEvents) {
    const EventImplPtr &EventImpl = detail::getSyclObjImpl(Event);
    if (EventImpl->getHandleRef() != nullptr)
      RetUrEvents.push_back(EventImpl->getHandleRef());
  }
  return RetUrEvents;
}
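// Note: dependencies without a native UR handle (host-task events and
// commands that have not been enqueued yet) are skipped above; the backend
// can only wait on real UR events, and the scheduler enforces the rest.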

template <>
uint32_t queue_impl::get_info<info::queue::reference_count>() const {
  ur_result_t result = UR_RESULT_SUCCESS;
  getPlugin()->call(urQueueGetInfo, MQueues[0], UR_QUEUE_INFO_REFERENCE_COUNT,
                    sizeof(result), &result, nullptr);
  return result;
}

template <> context queue_impl::get_info<info::queue::context>() const {
  return get_context();
}

template <> device queue_impl::get_info<info::queue::device>() const {
  return get_device();
}

template <>
typename info::platform::version::return_type
queue_impl::get_backend_info<info::platform::version>() const {
  if (getContextImplPtr()->getBackend() != backend::opencl) {
    throw sycl::exception(errc::backend_mismatch,
                          "the info::platform::version info descriptor can "
                          "only be queried with an OpenCL backend");
  }
  return get_device().get_platform().get_info<info::platform::version>();
}

template <>
typename info::device::version::return_type
queue_impl::get_backend_info<info::device::version>() const {
  if (getContextImplPtr()->getBackend() != backend::opencl) {
    throw sycl::exception(errc::backend_mismatch,
                          "the info::device::version info descriptor can only "
                          "be queried with an OpenCL backend");
  }
  return get_device().get_info<info::device::version>();
}

template <>
typename info::device::backend_version::return_type
queue_impl::get_backend_info<info::device::backend_version>() const {
  if (getContextImplPtr()->getBackend() != backend::ext_oneapi_level_zero) {
    throw sycl::exception(errc::backend_mismatch,
                          "the info::device::backend_version info descriptor "
                          "can only be queried with a Level Zero backend");
  }
  return "";
  // Currently the Level Zero backend does not define the value of this
  // information descriptor, and implementations are encouraged to return the
  // empty string as per the specification.
}

static event prepareSYCLEventAssociatedWithQueue(
    const std::shared_ptr<detail::queue_impl> &QueueImpl) {
  auto EventImpl = std::make_shared<detail::event_impl>(QueueImpl);
  EventImpl->setContextImpl(detail::getSyclObjImpl(QueueImpl->get_context()));
  EventImpl->setStateIncomplete();
  return detail::createSyclObjFromImpl<event>(EventImpl);
}

static event createDiscardedEvent() {
  EventImplPtr EventImpl =
      std::make_shared<event_impl>(event_impl::HES_Discarded);
  return createSyclObjFromImpl<event>(EventImpl);
}
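// A discarded event carries no backend handle; it is returned when the queue
// discards events (MDiscardEvents) or when the caller does not need an event,
// as in the scheduler-bypass path of submitMemOpHelper below.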

const std::vector<event> &
queue_impl::getExtendDependencyList(const std::vector<event> &DepEvents,
                                    std::vector<event> &MutableVec,
                                    std::unique_lock<std::mutex> &QueueLock) {
  if (!isInOrder())
    return DepEvents;

  QueueLock.lock();
  EventImplPtr ExtraEvent = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr
                                             : MExtGraphDeps.LastEventPtr;
  std::optional<event> ExternalEvent = popExternalEvent();

  if (!ExternalEvent && !ExtraEvent)
    return DepEvents;

  MutableVec = DepEvents;
  if (ExternalEvent)
    MutableVec.push_back(*ExternalEvent);
  if (ExtraEvent)
    MutableVec.push_back(detail::createSyclObjFromImpl<event>(ExtraEvent));
  return MutableVec;
}
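// Note: for in-order queues, QueueLock is acquired here and deliberately left
// held on return; the caller (submitMemOpHelper) keeps it locked so that the
// submission and the last-event update happen atomically with respect to
// other submissions on the same queue.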

event queue_impl::memset(const std::shared_ptr<detail::queue_impl> &Self,
                         void *Ptr, int Value, size_t Count,
                         const std::vector<event> &DepEvents,
                         bool CallerNeedsEvent) {
#if XPTI_ENABLE_INSTRUMENTATION
  // We need a code pointer value and we use the object ptr; if code location
  // information is available, we will have the function name and source file
  // information as well.
  XPTIScope PrepareNotify((void *)this,
                          (uint16_t)xpti::trace_point_type_t::node_create,
                          SYCL_STREAM_NAME, "memory_transfer_node");
  PrepareNotify.addMetadata([&](auto TEvent) {
    xpti::addMetadata(TEvent, "sycl_device",
                      reinterpret_cast<size_t>(MDevice->getHandleRef()));
    xpti::addMetadata(TEvent, "memory_ptr", reinterpret_cast<size_t>(Ptr));
    xpti::addMetadata(TEvent, "value_set", Value);
    xpti::addMetadata(TEvent, "memory_size", Count);
    xpti::addMetadata(TEvent, "queue_id", MQueueID);
  });
  // Before we notify the subscribers, we broadcast the 'queue_id', which was
  // added as a metadata entry, to TLS for use by callback handlers.
  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, MQueueID);
  // Notify XPTI about the memset submission
  PrepareNotify.notify();
  // Emit a begin/end scope for this call
  PrepareNotify.scopedNotify((uint16_t)xpti::trace_point_type_t::task_begin);
#endif
  const std::vector<unsigned char> Pattern{static_cast<unsigned char>(Value)};
  return submitMemOpHelper(
      Self, DepEvents, CallerNeedsEvent,
      [&](handler &CGH) { CGH.memset(Ptr, Value, Count); },
      [](const auto &...Args) { MemoryManager::fill_usm(Args...); }, Ptr, Self,
      Count, Pattern);
}
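// Illustrative only: the user-facing path that lands here looks roughly like
// the following (N is a hypothetical element count):
//   sycl::queue Q;
//   int *Ptr = sycl::malloc_device<int>(N, Q);
//   Q.memset(Ptr, 0, N * sizeof(int)); // routed to queue_impl::memset
// Note that fill_usm takes a byte pattern, so the single byte of Value is
// replicated across the range.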

void report(const code_location &CodeLoc) {
  std::cout << "Exception caught at ";
  if (CodeLoc.fileName())
    std::cout << "File: " << CodeLoc.fileName();
  if (CodeLoc.functionName())
    std::cout << " | Function: " << CodeLoc.functionName();
  if (CodeLoc.lineNumber())
    std::cout << " | Line: " << CodeLoc.lineNumber();
  if (CodeLoc.columnNumber())
    std::cout << " | Column: " << CodeLoc.columnNumber();
  std::cout << '\n';
}

event queue_impl::memcpy(const std::shared_ptr<detail::queue_impl> &Self,
                         void *Dest, const void *Src, size_t Count,
                         const std::vector<event> &DepEvents,
                         bool CallerNeedsEvent, const code_location &CodeLoc) {
#if XPTI_ENABLE_INSTRUMENTATION
  // We need a code pointer value and we use the object ptr; if code location
  // information is available, we use the source file information along with
  // the object pointer.
  XPTIScope PrepareNotify((void *)this,
                          (uint16_t)xpti::trace_point_type_t::node_create,
                          SYCL_STREAM_NAME, "memory_transfer_node");
  PrepareNotify.addMetadata([&](auto TEvent) {
    xpti::addMetadata(TEvent, "sycl_device",
                      reinterpret_cast<size_t>(MDevice->getHandleRef()));
    xpti::addMetadata(TEvent, "src_memory_ptr", reinterpret_cast<size_t>(Src));
    xpti::addMetadata(TEvent, "dest_memory_ptr",
                      reinterpret_cast<size_t>(Dest));
    xpti::addMetadata(TEvent, "memory_size", Count);
    xpti::addMetadata(TEvent, "queue_id", MQueueID);
  });
  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, MQueueID);
  // Notify XPTI about the memcpy submission
  PrepareNotify.notify();
  // Emit a begin/end scope for this call
  PrepareNotify.scopedNotify((uint16_t)xpti::trace_point_type_t::task_begin);
#endif

  if ((!Src || !Dest) && Count != 0) {
    report(CodeLoc);
    throw exception(make_error_code(errc::invalid),
                    "NULL pointer argument in memory copy operation.");
  }
  return submitMemOpHelper(
      Self, DepEvents, CallerNeedsEvent,
      [&](handler &CGH) { CGH.memcpy(Dest, Src, Count); },
      [](const auto &...Args) { MemoryManager::copy_usm(Args...); }, Src, Self,
      Count, Dest);
}

event queue_impl::mem_advise(const std::shared_ptr<detail::queue_impl> &Self,
                             const void *Ptr, size_t Length,
                             ur_usm_advice_flags_t Advice,
                             const std::vector<event> &DepEvents,
                             bool CallerNeedsEvent) {
  return submitMemOpHelper(
      Self, DepEvents, CallerNeedsEvent,
      [&](handler &CGH) { CGH.mem_advise(Ptr, Length, Advice); },
      [](const auto &...Args) { MemoryManager::advise_usm(Args...); }, Ptr,
      Self, Length, Advice);
}

event queue_impl::memcpyToDeviceGlobal(
    const std::shared_ptr<detail::queue_impl> &Self, void *DeviceGlobalPtr,
    const void *Src, bool IsDeviceImageScope, size_t NumBytes, size_t Offset,
    const std::vector<event> &DepEvents, bool CallerNeedsEvent) {
  return submitMemOpHelper(
      Self, DepEvents, CallerNeedsEvent,
      [&](handler &CGH) {
        CGH.memcpyToDeviceGlobal(DeviceGlobalPtr, Src, IsDeviceImageScope,
                                 NumBytes, Offset);
      },
      [](const auto &...Args) {
        MemoryManager::copy_to_device_global(Args...);
      },
      DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Src);
}

event queue_impl::memcpyFromDeviceGlobal(
    const std::shared_ptr<detail::queue_impl> &Self, void *Dest,
    const void *DeviceGlobalPtr, bool IsDeviceImageScope, size_t NumBytes,
    size_t Offset, const std::vector<event> &DepEvents, bool CallerNeedsEvent) {
  return submitMemOpHelper(
      Self, DepEvents, CallerNeedsEvent,
      [&](handler &CGH) {
        CGH.memcpyFromDeviceGlobal(Dest, DeviceGlobalPtr, IsDeviceImageScope,
                                   NumBytes, Offset);
      },
      [](const auto &...Args) {
        MemoryManager::copy_from_device_global(Args...);
      },
      DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Dest);
}

event queue_impl::getLastEvent() {
  {
    // The external event is required to finish last if set, so it is
    // considered the last event if present.
    std::lock_guard<std::mutex> Lock(MInOrderExternalEventMtx);
    if (MInOrderExternalEvent)
      return *MInOrderExternalEvent;
  }

  std::lock_guard<std::mutex> Lock{MMutex};
  if (MDiscardEvents)
    return createDiscardedEvent();
  if (!MGraph.expired() && MExtGraphDeps.LastEventPtr)
    return detail::createSyclObjFromImpl<event>(MExtGraphDeps.LastEventPtr);
  if (!MDefaultGraphDeps.LastEventPtr)
    MDefaultGraphDeps.LastEventPtr = std::make_shared<event_impl>(std::nullopt);
  return detail::createSyclObjFromImpl<event>(MDefaultGraphDeps.LastEventPtr);
}

void queue_impl::addEvent(const event &Event) {
  EventImplPtr EImpl = getSyclObjImpl(Event);
  assert(EImpl && "Event implementation is missing");
  auto *Cmd = static_cast<Command *>(EImpl->getCommand());
  if (!Cmd) {
    // If there is no command on the event, we cannot track it with MEventsWeak
    // as that would leave it with no owner. Track it in MEventsShared only if
    // we are unable to call urQueueFinish during wait.
    if (MEmulateOOO)
      addSharedEvent(Event);
  }
  // As long as the queue supports urQueueFinish we only need to store events
  // for unenqueued commands and host tasks.
  else if (MEmulateOOO || EImpl->getHandleRef() == nullptr) {
    std::weak_ptr<event_impl> EventWeakPtr{EImpl};
    std::lock_guard<std::mutex> Lock{MMutex};
    MEventsWeak.push_back(std::move(EventWeakPtr));
  }
}

/// queue_impl::addEvent tracks events with weak pointers, but some events have
/// no other owner. In this case, addSharedEvent will have the queue track the
/// events via a shared pointer.
void queue_impl::addSharedEvent(const event &Event) {
  assert(MEmulateOOO);
  std::lock_guard<std::mutex> Lock(MMutex);
  // Events stored in MEventsShared are not released anywhere else aside from
  // calls to queue::wait/wait_and_throw, which a user application might not
  // make, and ~queue_impl(). If the number of events grows large enough,
  // there's a good chance that most of them are already completed and
  // ownership of them can be released.
  const size_t EventThreshold = 128;
  if (MEventsShared.size() >= EventThreshold) {
    // Generally, the vector is ordered so that the oldest events are in the
    // front and the newer events are in the end. So, search to find the first
    // event that isn't yet complete. All the events prior to that can be
    // erased. This could leave a few events further on that have completed
    // not yet erased, but that is OK. This cleanup doesn't have to be perfect.
    // This also keeps the algorithm linear rather than quadratic because it
    // doesn't continually recheck things towards the back of the list that
    // really haven't had time to complete.
    MEventsShared.erase(
        MEventsShared.begin(),
        std::find_if(
            MEventsShared.begin(), MEventsShared.end(), [](const event &E) {
              return E.get_info<info::event::command_execution_status>() !=
                     info::event_command_status::complete;
            }));
  }
  MEventsShared.push_back(Event);
}

event queue_impl::submit_impl(const std::function<void(handler &)> &CGF,
                              const std::shared_ptr<queue_impl> &Self,
                              const std::shared_ptr<queue_impl> &PrimaryQueue,
                              const std::shared_ptr<queue_impl> &SecondaryQueue,
                              bool CallerNeedsEvent,
                              const detail::code_location &Loc,
                              const SubmitPostProcessF *PostProcess) {
  handler Handler(Self, PrimaryQueue, SecondaryQueue, CallerNeedsEvent);
  Handler.saveCodeLoc(Loc);

  {
    NestedCallsTracker tracker;
    CGF(Handler);
  }

  // The scheduler will later omit events that are not required to execute the
  // tasks. Host and interop tasks, however, are not submitted to low-level
  // runtimes and require separate dependency management.
  const CGType Type = detail::getSyclObjImpl(Handler)->MCGType;
  event Event = detail::createSyclObjFromImpl<event>(
      std::make_shared<detail::event_impl>());
  std::vector<StreamImplPtr> Streams;
  if (Type == CGType::Kernel)
    Streams = std::move(Handler.MStreamStorage);

  if (PostProcess) {
    bool IsKernel = Type == CGType::Kernel;
    bool KernelUsesAssert = false;

    if (IsKernel)
      // A kernel only uses assert if it is a non-interop one.
      KernelUsesAssert = !(Handler.MKernel && Handler.MKernel->isInterop()) &&
                         ProgramManager::getInstance().kernelUsesAssert(
                             Handler.MKernelName.c_str());
    finalizeHandler(Handler, Event);

    (*PostProcess)(IsKernel, KernelUsesAssert, Event);
  } else
    finalizeHandler(Handler, Event);

  addEvent(Event);

  auto EventImpl = detail::getSyclObjImpl(Event);
  for (auto &Stream : Streams) {
    // We don't want stream flushing to be a blocking operation, so we submit a
    // host task to print the stream buffer. It will fire as soon as the kernel
    // finishes execution.
    event FlushEvent = submit_impl(
        [&](handler &ServiceCGH) { Stream->generateFlushCommand(ServiceCGH); },
        Self, PrimaryQueue, SecondaryQueue, /*CallerNeedsEvent*/ true, Loc, {});
    EventImpl->attachEventToComplete(detail::getSyclObjImpl(FlushEvent));
    registerStreamServiceEvent(detail::getSyclObjImpl(FlushEvent));
  }

  return Event;
}

template <typename HandlerFuncT>
event queue_impl::submitWithHandler(const std::shared_ptr<queue_impl> &Self,
                                    const std::vector<event> &DepEvents,
                                    HandlerFuncT HandlerFunc) {
  return submit(
      [&](handler &CGH) {
        CGH.depends_on(DepEvents);
        HandlerFunc(CGH);
      },
      Self, {});
}
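// The handler-based fallback: explicit dependencies become depends_on() calls
// on the command group, so memory operations that cannot bypass the scheduler
// still honor DepEvents.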

template <typename HandlerFuncT, typename MemOpFuncT, typename... MemOpArgTs>
event queue_impl::submitMemOpHelper(const std::shared_ptr<queue_impl> &Self,
                                    const std::vector<event> &DepEvents,
                                    bool CallerNeedsEvent,
                                    HandlerFuncT HandlerFunc,
                                    MemOpFuncT MemOpFunc,
                                    MemOpArgTs... MemOpArgs) {
  // We need to submit the command and update the last event under the same
  // lock if we have an in-order queue.
  {
    std::unique_lock<std::mutex> Lock(MMutex, std::defer_lock);

    std::vector<event> MutableDepEvents;
    const std::vector<event> &ExpandedDepEvents =
        getExtendDependencyList(DepEvents, MutableDepEvents, Lock);

    // If we have a command graph set we need to capture the op through the
    // handler rather than bypassing the scheduler.
    if (MGraph.expired() && Scheduler::areEventsSafeForSchedulerBypass(
                                ExpandedDepEvents, MContext)) {
      if ((MDiscardEvents || !CallerNeedsEvent) &&
          supportsDiscardingPiEvents()) {
        NestedCallsTracker tracker;
        MemOpFunc(MemOpArgs..., getUrEvents(ExpandedDepEvents),
                  /*PiEvent*/ nullptr, /*EventImplPtr*/ nullptr);
        return createDiscardedEvent();
      }

      event ResEvent = prepareSYCLEventAssociatedWithQueue(Self);
      auto EventImpl = detail::getSyclObjImpl(ResEvent);
      {
        NestedCallsTracker tracker;
        MemOpFunc(MemOpArgs..., getUrEvents(ExpandedDepEvents),
                  &EventImpl->getHandleRef(), EventImpl);
      }

      if (isInOrder()) {
        auto &EventToStoreIn = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr
                                                : MExtGraphDeps.LastEventPtr;
        EventToStoreIn = EventImpl;
      }
      // Track only if we won't be able to handle it with urQueueFinish.
      if (MEmulateOOO)
        addSharedEvent(ResEvent);
      return discard_or_return(ResEvent);
    }
  }
  return submitWithHandler(Self, DepEvents, HandlerFunc);
}
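// Summary of the three paths above: (1) scheduler bypass with the event
// discarded when nobody needs it, (2) scheduler bypass with a real event, and
// (3) the handler/scheduler path whenever a command graph is recording or the
// dependencies are not safe to bypass.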

void *queue_impl::instrumentationProlog(const detail::code_location &CodeLoc,
                                        std::string &Name, int32_t StreamID,
                                        uint64_t &IId) {
  void *TraceEvent = nullptr;
  (void)CodeLoc;
  (void)Name;
  (void)StreamID;
  (void)IId;
#ifdef XPTI_ENABLE_INSTRUMENTATION
  constexpr uint16_t NotificationTraceType = xpti::trace_wait_begin;
  if (!xptiCheckTraceEnabled(StreamID, NotificationTraceType))
    return TraceEvent;

  xpti::payload_t Payload;
  bool HasSourceInfo = false;
  // We try to create a unique string for the wait() call by combining it with
  // the queue address.
  xpti::utils::StringHelper NG;
  Name = NG.nameWithAddress<queue_impl *>("queue.wait", this);

  if (CodeLoc.fileName()) {
    // We have source code location information.
    Payload =
        xpti::payload_t(Name.c_str(), CodeLoc.fileName(), CodeLoc.lineNumber(),
                        CodeLoc.columnNumber(), (void *)this);
    HasSourceInfo = true;
  } else {
    // We have no location information, so we'll use the address of the queue.
    Payload = xpti::payload_t(Name.c_str(), (void *)this);
  }
  // wait() calls could be at different user-code locations; we create a new
  // event based on the code location info, and if this location has been seen
  // before, a previously created event will be returned.
  uint64_t QWaitInstanceNo = 0;
  xpti::trace_event_data_t *WaitEvent =
      xptiMakeEvent(Name.c_str(), &Payload, xpti::trace_graph_event,
                    xpti_at::active, &QWaitInstanceNo);
  IId = QWaitInstanceNo;
  if (WaitEvent) {
    xpti::addMetadata(WaitEvent, "sycl_device_type", queueDeviceToString(this));
    if (HasSourceInfo) {
      xpti::addMetadata(WaitEvent, "sym_function_name", CodeLoc.functionName());
      xpti::addMetadata(WaitEvent, "sym_source_file_name", CodeLoc.fileName());
      xpti::addMetadata(WaitEvent, "sym_line_no",
                        static_cast<int32_t>((CodeLoc.lineNumber())));
      xpti::addMetadata(WaitEvent, "sym_column_no",
                        static_cast<int32_t>((CodeLoc.columnNumber())));
    }
    xptiNotifySubscribers(StreamID, xpti::trace_wait_begin, nullptr, WaitEvent,
                          QWaitInstanceNo,
                          static_cast<const void *>(Name.c_str()));
    TraceEvent = (void *)WaitEvent;
  }
#endif
  return TraceEvent;
}

void queue_impl::instrumentationEpilog(void *TelemetryEvent, std::string &Name,
                                       int32_t StreamID, uint64_t IId) {
  (void)TelemetryEvent;
  (void)Name;
  (void)StreamID;
  (void)IId;
#ifdef XPTI_ENABLE_INSTRUMENTATION
  constexpr uint16_t NotificationTraceType = xpti::trace_wait_end;
  if (!(xptiCheckTraceEnabled(StreamID, NotificationTraceType) &&
        TelemetryEvent))
    return;
  // Close the wait() scope
  xpti::trace_event_data_t *TraceEvent =
      (xpti::trace_event_data_t *)TelemetryEvent;
  xptiNotifySubscribers(StreamID, NotificationTraceType, nullptr, TraceEvent,
                        IId, static_cast<const void *>(Name.c_str()));
#endif
}

void queue_impl::wait(const detail::code_location &CodeLoc) {
  (void)CodeLoc;
#ifdef XPTI_ENABLE_INSTRUMENTATION
  void *TelemetryEvent = nullptr;
  uint64_t IId;
  std::string Name;
  int32_t StreamID = xptiRegisterStream(SYCL_STREAM_NAME);
  TelemetryEvent = instrumentationProlog(CodeLoc, Name, StreamID, IId);
#endif

  if (MGraph.lock()) {
    throw sycl::exception(make_error_code(errc::invalid),
                          "wait cannot be called for a queue which is "
                          "recording to a command graph.");
  }

  // If there is an external event set, we know we are using an in-order queue
  // and the event is required to finish after the last event in the queue. As
  // such, we can just wait for it and finish.
  std::optional<event> ExternalEvent = popExternalEvent();
  if (ExternalEvent) {
    ExternalEvent->wait();

    // Additionally, we can clean up the event lists that we would have
    // otherwise cleared.
    if (!MEventsWeak.empty() || !MEventsShared.empty()) {
      std::lock_guard<std::mutex> Lock(MMutex);
      MEventsWeak.clear();
      MEventsShared.clear();
    }
    if (!MStreamsServiceEvents.empty()) {
      std::lock_guard<std::mutex> Lock(MStreamsServiceEventsMutex);
      MStreamsServiceEvents.clear();
    }
  }

  std::vector<std::weak_ptr<event_impl>> WeakEvents;
  std::vector<event> SharedEvents;
  {
    std::lock_guard<std::mutex> Lock(MMutex);
    WeakEvents.swap(MEventsWeak);
    SharedEvents.swap(MEventsShared);

    {
      std::lock_guard<std::mutex> RequestLock(MMissedCleanupRequestsMtx);
      for (auto &UpdatedGraph : MMissedCleanupRequests)
        doUnenqueuedCommandCleanup(UpdatedGraph);
      MMissedCleanupRequests.clear();
    }
  }
  // If the queue is either a host one or does not support OOO (and we use
  // multiple in-order queues as a result of that), wait for each event
  // directly. Otherwise, only wait for unenqueued or host task events, starting
  // from the latest submitted task in order to minimize the total number of
  // calls, then handle the rest with urQueueFinish.
  const bool SupportsPiFinish = !MEmulateOOO;
  for (auto EventImplWeakPtrIt = WeakEvents.rbegin();
       EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) {
    if (std::shared_ptr<event_impl> EventImplSharedPtr =
            EventImplWeakPtrIt->lock()) {
      // A nullptr UR event indicates that urQueueFinish will not cover it,
      // either because it's a host task event or an unenqueued one.
      if (!SupportsPiFinish || nullptr == EventImplSharedPtr->getHandleRef()) {
        EventImplSharedPtr->wait(EventImplSharedPtr);
      }
    }
  }
  if (SupportsPiFinish) {
    const PluginPtr &Plugin = getPlugin();
    Plugin->call(urQueueFinish, getHandleRef());
    assert(SharedEvents.empty() && "Queues that support calling piQueueFinish "
                                   "shouldn't have shared events");
  } else {
    for (event &Event : SharedEvents)
      Event.wait();
  }

  std::vector<EventImplPtr> StreamsServiceEvents;
  {
    std::lock_guard<std::mutex> Lock(MStreamsServiceEventsMutex);
    StreamsServiceEvents.swap(MStreamsServiceEvents);
  }
  for (const EventImplPtr &Event : StreamsServiceEvents)
    Event->wait(Event);

#ifdef XPTI_ENABLE_INSTRUMENTATION
  instrumentationEpilog(TelemetryEvent, Name, StreamID, IId);
#endif
}

ur_native_handle_t queue_impl::getNative(int32_t &NativeHandleDesc) const {
  const PluginPtr &Plugin = getPlugin();
  if (getContextImplPtr()->getBackend() == backend::opencl)
    Plugin->call(urQueueRetain, MQueues[0]);
  ur_native_handle_t Handle{};
  ur_queue_native_desc_t UrNativeDesc{UR_STRUCTURE_TYPE_QUEUE_NATIVE_DESC,
                                      nullptr, nullptr};
  UrNativeDesc.pNativeData = &NativeHandleDesc;

  Plugin->call(urQueueGetNativeHandle, MQueues[0], &UrNativeDesc, &Handle);
  return Handle;
}

void queue_impl::cleanup_fusion_cmd() {
  // Clean up only if a scheduler instance exists.
  if (detail::Scheduler::isInstanceAlive())
    detail::Scheduler::getInstance().cleanUpCmdFusion(this);
}

bool queue_impl::queue_empty() const {
  // If we have an in-order queue where events are not discarded then just
  // check the status of the last event.
  if (isInOrder() && !MDiscardEvents) {
    std::lock_guard<std::mutex> Lock(MMutex);
    // If there is no last event we know that no work has been submitted, so it
    // must be trivially empty.
    if (!MDefaultGraphDeps.LastEventPtr)
      return true;
    // Otherwise, check if the last event is finished.
    // Note that we fall back to the backend query if the event was discarded,
    // which may happen despite the queue not being a discard event queue.
    if (!MDefaultGraphDeps.LastEventPtr->isDiscarded())
      return MDefaultGraphDeps.LastEventPtr
                 ->get_info<info::event::command_execution_status>() ==
             info::event_command_status::complete;
  }

  // Check the status of the backend queue if this is not a host queue.
  ur_bool_t IsReady = false;
  getPlugin()->call(urQueueGetInfo, MQueues[0], UR_QUEUE_INFO_EMPTY,
                    sizeof(IsReady), &IsReady, nullptr);
  if (!IsReady)
    return false;

  // We may have events like host tasks which are not submitted to the backend
  // queue, so we need to get their status separately.
  std::lock_guard<std::mutex> Lock(MMutex);
  for (event Event : MEventsShared)
    if (Event.get_info<info::event::command_execution_status>() !=
        info::event_command_status::complete)
      return false;

  for (auto EventImplWeakPtrIt = MEventsWeak.begin();
       EventImplWeakPtrIt != MEventsWeak.end(); ++EventImplWeakPtrIt)
    if (std::shared_ptr<event_impl> EventImplSharedPtr =
            EventImplWeakPtrIt->lock())
      if (EventImplSharedPtr->isHost() &&
          EventImplSharedPtr
                  ->get_info<info::event::command_execution_status>() !=
              info::event_command_status::complete)
        return false;

  // If we didn't exit early above then it means that all events in the queue
  // are completed.
  return true;
}
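// Note: the emptiness query is inherently a snapshot; another thread may
// submit work immediately after these checks, so callers should not treat a
// true result as a lasting guarantee.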

event queue_impl::discard_or_return(const event &Event) {
  if (!(MDiscardEvents))
    return Event;
  return createDiscardedEvent();
}

void queue_impl::revisitUnenqueuedCommandsState(
    const EventImplPtr &CompletedHostTask) {
  if (MIsInorder)
    return;
  std::unique_lock<std::mutex> Lock{MMutex, std::try_to_lock};
  if (Lock.owns_lock())
    doUnenqueuedCommandCleanup(CompletedHostTask->getCommandGraph());
  else {
    std::lock_guard<std::mutex> RequestLock(MMissedCleanupRequestsMtx);
    MMissedCleanupRequests.push_back(CompletedHostTask->getCommandGraph());
  }
}

void queue_impl::doUnenqueuedCommandCleanup(
    const std::shared_ptr<ext::oneapi::experimental::detail::graph_impl>
        &Graph) {
  auto tryToCleanup = [](DependencyTrackingItems &Deps) {
    if (Deps.LastBarrier && Deps.LastBarrier->isEnqueued()) {
      Deps.LastBarrier = nullptr;
      Deps.UnenqueuedCmdEvents.clear();
    } else {
      if (Deps.UnenqueuedCmdEvents.empty())
        return;
      Deps.UnenqueuedCmdEvents.erase(
          std::remove_if(
              Deps.UnenqueuedCmdEvents.begin(), Deps.UnenqueuedCmdEvents.end(),
              [](const EventImplPtr &CommandEvent) {
                return (CommandEvent->isHost() ? CommandEvent->isCompleted()
                                               : CommandEvent->isEnqueued());
              }),
          Deps.UnenqueuedCmdEvents.end());
    }
  };
  // Barrier enqueueing could be significantly postponed due to a host task
  // dependency, if any. There is no guarantee that it will happen while the
  // same graph deps are still recording.
  if (Graph && Graph == getCommandGraph())
    tryToCleanup(MExtGraphDeps);
  else
    tryToCleanup(MDefaultGraphDeps);
}

} // namespace detail
} // namespace _V1
} // namespace sycl