DPC++ Runtime
Runtime libraries for oneAPI DPC++
queue_impl.cpp
//==------------------ queue_impl.cpp - SYCL queue -------------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <detail/event_impl.hpp>
#include <detail/memory_manager.hpp>
#include <detail/queue_impl.hpp>
#include <sycl/context.hpp>
#include <sycl/detail/common.hpp>
#include <sycl/detail/pi.hpp>
#include <sycl/device.hpp>

#include <cstring>
#include <utility>

#ifdef XPTI_ENABLE_INSTRUMENTATION
#include "xpti/xpti_trace_framework.hpp"
#include <detail/xpti_registry.hpp>
#include <sstream>
#endif

namespace sycl {
inline namespace _V1 {
namespace detail {
// Treat 0 as reserved for host task traces
std::atomic<unsigned long long> queue_impl::MNextAvailableQueueID = 1;

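// NestedCallsTracker below is an RAII guard: its constructor rejects a
// submission that is already running inside another submission on the same
// thread, and its destructor clears the thread-local flag. A hypothetical
// usage sketch of the misuse it guards against (user code, not part of this
// file):
//
//   sycl::queue Q;
//   Q.submit([&](sycl::handler &CGH) {
//     // Calling Q.submit(...) here would throw; enqueue nested work
//     // through the sycl::handler API (CGH) instead.
//   });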
thread_local bool NestedCallsDetector = false;
class NestedCallsTracker {
public:
  NestedCallsTracker() {
    if (NestedCallsDetector)
      throw sycl::exception(
          make_error_code(errc::invalid),
          "Calls to sycl::queue::submit cannot be nested. Command group "
          "function objects should use the sycl::handler API instead.");
    NestedCallsDetector = true;
  }

  ~NestedCallsTracker() { NestedCallsDetector = false; }
};

static std::vector<sycl::detail::pi::PiEvent>
getPIEvents(const std::vector<sycl::event> &DepEvents) {
  std::vector<sycl::detail::pi::PiEvent> RetPiEvents;
  for (const sycl::event &Event : DepEvents) {
    const EventImplPtr &EventImpl = detail::getSyclObjImpl(Event);
    if (EventImpl->getHandleRef() != nullptr)
      RetPiEvents.push_back(EventImpl->getHandleRef());
  }
  return RetPiEvents;
}

template <>
uint32_t queue_impl::get_info<info::queue::reference_count>() const {
  sycl::detail::pi::PiResult result = PI_SUCCESS;
  getPlugin()->call<PiApiKind::piQueueGetInfo>(
      MQueues[0], PI_QUEUE_INFO_REFERENCE_COUNT, sizeof(result), &result,
      nullptr);
  return result;
}

template <> context queue_impl::get_info<info::queue::context>() const {
  return get_context();
}

template <> device queue_impl::get_info<info::queue::device>() const {
  return get_device();
}

template <>
typename info::platform::version::return_type
queue_impl::get_backend_info<info::platform::version>() const {
  if (getContextImplPtr()->getBackend() != backend::opencl) {
    throw sycl::exception(make_error_code(errc::backend_mismatch),
                          "the info::platform::version info descriptor can "
                          "only be queried with an OpenCL backend");
  }
  return get_device().get_platform().get_info<info::platform::version>();
}

template <>
typename info::device::version::return_type
queue_impl::get_backend_info<info::device::version>() const {
  if (getContextImplPtr()->getBackend() != backend::opencl) {
    throw sycl::exception(make_error_code(errc::backend_mismatch),
                          "the info::device::version info descriptor can only "
                          "be queried with an OpenCL backend");
  }
  return get_device().get_info<info::device::version>();
}

template <>
typename info::device::backend_version::return_type
queue_impl::get_backend_info<info::device::backend_version>() const {
  if (getContextImplPtr()->getBackend() != backend::ext_oneapi_level_zero) {
    throw sycl::exception(make_error_code(errc::backend_mismatch),
                          "the info::device::backend_version info descriptor "
                          "can only be queried with a Level Zero backend");
  }
  return "";
  // Currently the Level Zero backend does not define the value of this
  // information descriptor, and implementations are encouraged to return the
  // empty string as per the specification.
}

static event prepareSYCLEventAssociatedWithQueue(
    const std::shared_ptr<detail::queue_impl> &QueueImpl) {
  auto EventImpl = std::make_shared<detail::event_impl>(QueueImpl);
  EventImpl->setContextImpl(detail::getSyclObjImpl(QueueImpl->get_context()));
  EventImpl->setStateIncomplete();
  return detail::createSyclObjFromImpl<event>(EventImpl);
}

static event createDiscardedEvent() {
  EventImplPtr EventImpl =
      std::make_shared<event_impl>(event_impl::HES_Discarded);
  return createSyclObjFromImpl<event>(EventImpl);
}

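// For in-order queues, submissions are serialized, so the dependency list
// passed to the backend is extended (under the queue lock) with the queue's
// last recorded event and any external in-order event. Out-of-order queues
// return the caller's list unchanged.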
const std::vector<event> &
queue_impl::getExtendDependencyList(const std::vector<event> &DepEvents,
                                    std::vector<event> &MutableVec,
                                    std::unique_lock<std::mutex> &QueueLock) {
  if (!isInOrder())
    return DepEvents;

  QueueLock.lock();
  EventImplPtr ExtraEvent = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr
                                             : MExtGraphDeps.LastEventPtr;
  std::optional<event> ExternalEvent = popExternalEvent();

  if (!ExternalEvent && !ExtraEvent)
    return DepEvents;

  MutableVec = DepEvents;
  if (ExternalEvent)
    MutableVec.push_back(*ExternalEvent);
  if (ExtraEvent)
    MutableVec.push_back(detail::createSyclObjFromImpl<event>(ExtraEvent));
  return MutableVec;
}

event queue_impl::memset(const std::shared_ptr<detail::queue_impl> &Self,
                         void *Ptr, int Value, size_t Count,
                         const std::vector<event> &DepEvents,
                         bool CallerNeedsEvent) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  // We need a code pointer value, and we use the object pointer; if code
  // location information is available, we will also have the function name and
  // source file information.
  XPTIScope PrepareNotify((void *)this,
                          (uint16_t)xpti::trace_point_type_t::node_create,
                          SYCL_STREAM_NAME, "memory_transfer_node");
  PrepareNotify.addMetadata([&](auto TEvent) {
    xpti::addMetadata(TEvent, "sycl_device",
                      reinterpret_cast<size_t>(MDevice->getHandleRef()));
    xpti::addMetadata(TEvent, "memory_ptr", reinterpret_cast<size_t>(Ptr));
    xpti::addMetadata(TEvent, "value_set", Value);
    xpti::addMetadata(TEvent, "memory_size", Count);
    xpti::addMetadata(TEvent, "queue_id", MQueueID);
  });
  // Before we notify the subscribers, we broadcast the 'queue_id' metadata
  // entry to TLS for use by callback handlers.
  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, MQueueID);
  // Notify XPTI about the memset submission
  PrepareNotify.notify();
  // Emit a begin/end scope for this call
  PrepareNotify.scopedNotify((uint16_t)xpti::trace_point_type_t::task_begin);
#endif
  const std::vector<unsigned char> Pattern{static_cast<unsigned char>(Value)};
  return submitMemOpHelper(
      Self, DepEvents, CallerNeedsEvent,
      [&](handler &CGH) { CGH.memset(Ptr, Value, Count); },
      [](const auto &...Args) { MemoryManager::fill_usm(Args...); }, Ptr, Self,
      Count, Pattern);
}

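// Prints the user code location of a failed memory operation to stdout before
// the corresponding exception is thrown.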
void report(const code_location &CodeLoc) {
  std::cout << "Exception caught at ";
  if (CodeLoc.fileName())
    std::cout << "File: " << CodeLoc.fileName();
  if (CodeLoc.functionName())
    std::cout << " | Function: " << CodeLoc.functionName();
  if (CodeLoc.lineNumber())
    std::cout << " | Line: " << CodeLoc.lineNumber();
  if (CodeLoc.columnNumber())
    std::cout << " | Column: " << CodeLoc.columnNumber();
  std::cout << '\n';
}

event queue_impl::memcpy(const std::shared_ptr<detail::queue_impl> &Self,
                         void *Dest, const void *Src, size_t Count,
                         const std::vector<event> &DepEvents,
                         bool CallerNeedsEvent, const code_location &CodeLoc) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  // We need a code pointer value, and we use the object pointer; if code
  // location information is available, we use the source file information
  // along with the object pointer.
  XPTIScope PrepareNotify((void *)this,
                          (uint16_t)xpti::trace_point_type_t::node_create,
                          SYCL_STREAM_NAME, "memory_transfer_node");
  PrepareNotify.addMetadata([&](auto TEvent) {
    xpti::addMetadata(TEvent, "sycl_device",
                      reinterpret_cast<size_t>(MDevice->getHandleRef()));
    xpti::addMetadata(TEvent, "src_memory_ptr", reinterpret_cast<size_t>(Src));
    xpti::addMetadata(TEvent, "dest_memory_ptr",
                      reinterpret_cast<size_t>(Dest));
    xpti::addMetadata(TEvent, "memory_size", Count);
    xpti::addMetadata(TEvent, "queue_id", MQueueID);
  });
  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, MQueueID);
  // Notify XPTI about the memcpy submission
  PrepareNotify.notify();
  // Emit a begin/end scope for this call
  PrepareNotify.scopedNotify((uint16_t)xpti::trace_point_type_t::task_begin);
#endif

  if ((!Src || !Dest) && Count != 0) {
    report(CodeLoc);
    throw exception(make_error_code(errc::invalid),
                    "NULL pointer argument in memory copy operation.");
  }
  return submitMemOpHelper(
      Self, DepEvents, CallerNeedsEvent,
      [&](handler &CGH) { CGH.memcpy(Dest, Src, Count); },
      [](const auto &...Args) { MemoryManager::copy_usm(Args...); }, Src, Self,
      Count, Dest);
}

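// A hypothetical user-level sketch of the USM paths implemented above
// (illustrative only, not part of this file): queue::memcpy and queue::memset
// funnel into submitMemOpHelper, which may bypass the scheduler entirely.
//
//   sycl::queue Q;
//   int *Dst = sycl::malloc_device<int>(1024, Q);
//   std::vector<int> Src(1024, 42);
//   sycl::event E = Q.memcpy(Dst, Src.data(), 1024 * sizeof(int));
//   Q.memset(Dst, 0, 1024 * sizeof(int), E).wait();
//   sycl::free(Dst, Q);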
event queue_impl::mem_advise(const std::shared_ptr<detail::queue_impl> &Self,
                             const void *Ptr, size_t Length,
                             pi_mem_advice Advice,
                             const std::vector<event> &DepEvents,
                             bool CallerNeedsEvent) {
  return submitMemOpHelper(
      Self, DepEvents, CallerNeedsEvent,
      [&](handler &CGH) { CGH.mem_advise(Ptr, Length, Advice); },
      [](const auto &...Args) { MemoryManager::advise_usm(Args...); }, Ptr,
      Self, Length, Advice);
}

event queue_impl::memcpyToDeviceGlobal(
    const std::shared_ptr<detail::queue_impl> &Self, void *DeviceGlobalPtr,
    const void *Src, bool IsDeviceImageScope, size_t NumBytes, size_t Offset,
    const std::vector<event> &DepEvents, bool CallerNeedsEvent) {
  return submitMemOpHelper(
      Self, DepEvents, CallerNeedsEvent,
      [&](handler &CGH) {
        CGH.memcpyToDeviceGlobal(DeviceGlobalPtr, Src, IsDeviceImageScope,
                                 NumBytes, Offset);
      },
      [](const auto &...Args) {
        MemoryManager::copy_to_device_global(Args...);
      },
      DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Src);
}

event queue_impl::memcpyFromDeviceGlobal(
    const std::shared_ptr<detail::queue_impl> &Self, void *Dest,
    const void *DeviceGlobalPtr, bool IsDeviceImageScope, size_t NumBytes,
    size_t Offset, const std::vector<event> &DepEvents, bool CallerNeedsEvent) {
  return submitMemOpHelper(
      Self, DepEvents, CallerNeedsEvent,
      [&](handler &CGH) {
        CGH.memcpyFromDeviceGlobal(Dest, DeviceGlobalPtr, IsDeviceImageScope,
                                   NumBytes, Offset);
      },
      [](const auto &...Args) {
        MemoryManager::copy_from_device_global(Args...);
      },
      DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Dest);
}

event queue_impl::getLastEvent() {
  {
    // The external event is required to finish last if set, so it is
    // considered the last event if present.
    std::lock_guard<std::mutex> Lock(MInOrderExternalEventMtx);
    if (MInOrderExternalEvent)
      return *MInOrderExternalEvent;
  }

  std::lock_guard<std::mutex> Lock{MMutex};
  if (MDiscardEvents)
    return createDiscardedEvent();
  if (!MGraph.expired() && MExtGraphDeps.LastEventPtr)
    return detail::createSyclObjFromImpl<event>(MExtGraphDeps.LastEventPtr);
  if (!MDefaultGraphDeps.LastEventPtr)
    MDefaultGraphDeps.LastEventPtr = std::make_shared<event_impl>(std::nullopt);
  return detail::createSyclObjFromImpl<event>(MDefaultGraphDeps.LastEventPtr);
}

void queue_impl::addEvent(const event &Event) {
  EventImplPtr EImpl = getSyclObjImpl(Event);
  assert(EImpl && "Event implementation is missing");
  auto *Cmd = static_cast<Command *>(EImpl->getCommand());
  if (!Cmd) {
    // If there is no command on the event, we cannot track it with MEventsWeak
    // as that would leave it with no owner. Track in MEventsShared only if
    // we're unable to call piQueueFinish during wait.
    if (MEmulateOOO)
      addSharedEvent(Event);
  }
  // As long as the queue supports piQueueFinish we only need to store events
  // for unenqueued commands and host tasks.
  else if (MEmulateOOO || EImpl->getHandleRef() == nullptr) {
    std::weak_ptr<event_impl> EventWeakPtr{EImpl};
    std::lock_guard<std::mutex> Lock{MMutex};
    MEventsWeak.push_back(std::move(EventWeakPtr));
  }
}

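/// addSharedEvent - queue_impl tracks events with weak pointers
/// but some events have no other owner. In this case,
/// addSharedEvent will have the queue track them via a shared pointer.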
void queue_impl::addSharedEvent(const event &Event) {
  assert(MEmulateOOO);
  std::lock_guard<std::mutex> Lock(MMutex);
  // Events stored in MEventsShared are not released anywhere else aside from
  // calls to queue::wait/wait_and_throw, which a user application might not
  // make, and ~queue_impl(). If the number of events grows large enough,
  // there's a good chance that most of them are already completed and
  // ownership of them can be released.
  const size_t EventThreshold = 128;
  if (MEventsShared.size() >= EventThreshold) {
    // Generally, the vector is ordered so that the oldest events are in the
    // front and the newest events are at the end. So, search to find the first
    // event that isn't yet complete. All the events prior to that can be
    // erased. This could leave a few completed events not yet erased, but that
    // is OK. This cleanup doesn't have to be perfect. It also keeps the
    // algorithm linear rather than quadratic because it doesn't continually
    // recheck things towards the back of the list that really haven't had time
    // to complete.
    MEventsShared.erase(
        MEventsShared.begin(),
        std::find_if(
            MEventsShared.begin(), MEventsShared.end(), [](const event &E) {
              return E.get_info<info::event::command_execution_status>() !=
                     info::event_command_status::complete;
            }));
  }
  MEventsShared.push_back(Event);
}

event queue_impl::submit_impl(const std::function<void(handler &)> &CGF,
                              const std::shared_ptr<queue_impl> &Self,
                              const std::shared_ptr<queue_impl> &PrimaryQueue,
                              const std::shared_ptr<queue_impl> &SecondaryQueue,
                              bool CallerNeedsEvent,
                              const detail::code_location &Loc,
                              const SubmitPostProcessF *PostProcess) {
  handler Handler(Self, PrimaryQueue, SecondaryQueue, CallerNeedsEvent);
  Handler.saveCodeLoc(Loc);

  {
    NestedCallsTracker tracker;
    CGF(Handler);
  }

  // The scheduler will later omit events that are not required to execute the
  // tasks. Host and interop tasks, however, are not submitted to low-level
  // runtimes and require separate dependency management.
  const CGType Type = detail::getSyclObjImpl(Handler)->MCGType;
  event Event = detail::createSyclObjFromImpl<event>(
      std::make_shared<detail::event_impl>());
  std::vector<StreamImplPtr> Streams;
  if (Type == CGType::Kernel)
    Streams = std::move(Handler.MStreamStorage);

  if (PostProcess) {
    bool IsKernel = Type == CGType::Kernel;
    bool KernelUsesAssert = false;

    if (IsKernel)
      // A kernel only uses assert if it is not an interop kernel.
      KernelUsesAssert = !(Handler.MKernel && Handler.MKernel->isInterop()) &&
                         ProgramManager::getInstance().kernelUsesAssert(
                             Handler.MKernelName.c_str());
    finalizeHandler(Handler, Event);

    (*PostProcess)(IsKernel, KernelUsesAssert, Event);
  } else
    finalizeHandler(Handler, Event);

  addEvent(Event);

  auto EventImpl = detail::getSyclObjImpl(Event);
  for (auto &Stream : Streams) {
    // We don't want stream flushing to be a blocking operation, so we submit a
    // host task to print the stream buffer. It will fire as soon as the kernel
    // finishes execution.
    event FlushEvent = submit_impl(
        [&](handler &ServiceCGH) { Stream->generateFlushCommand(ServiceCGH); },
        Self, PrimaryQueue, SecondaryQueue, /*CallerNeedsEvent*/ true, Loc, {});
    EventImpl->attachEventToComplete(detail::getSyclObjImpl(FlushEvent));
    registerStreamServiceEvent(detail::getSyclObjImpl(FlushEvent));
  }

  return Event;
}

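/// Helper for submitting a memory operation through a handler when the
/// scheduler cannot be bypassed; registers DepEvents via handler::depends_on
/// before invoking the handler-based operation.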
template <typename HandlerFuncT>
event queue_impl::submitWithHandler(const std::shared_ptr<queue_impl> &Self,
                                    const std::vector<event> &DepEvents,
                                    HandlerFuncT HandlerFunc) {
  return submit(
      [&](handler &CGH) {
        CGH.depends_on(DepEvents);
        HandlerFunc(CGH);
      },
      Self, {});
}

template <typename HandlerFuncT, typename MemOpFuncT, typename... MemOpArgTs>
event queue_impl::submitMemOpHelper(const std::shared_ptr<queue_impl> &Self,
                                    const std::vector<event> &DepEvents,
                                    bool CallerNeedsEvent,
                                    HandlerFuncT HandlerFunc,
                                    MemOpFuncT MemOpFunc,
                                    MemOpArgTs... MemOpArgs) {
  // We need to submit the command and update the last event under the same
  // lock if we have an in-order queue.
  {
    std::unique_lock<std::mutex> Lock(MMutex, std::defer_lock);

    std::vector<event> MutableDepEvents;
    const std::vector<event> &ExpandedDepEvents =
        getExtendDependencyList(DepEvents, MutableDepEvents, Lock);

    // If we have a command graph set we need to capture the op through the
    // handler rather than bypassing the scheduler.
    if (MGraph.expired() && Scheduler::areEventsSafeForSchedulerBypass(
                                ExpandedDepEvents, MContext)) {
      if ((MDiscardEvents || !CallerNeedsEvent) &&
          supportsDiscardingPiEvents()) {
        NestedCallsTracker tracker;
        MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents),
                  /*PiEvent*/ nullptr, /*EventImplPtr*/ nullptr);
        return createDiscardedEvent();
      }

      event ResEvent = prepareSYCLEventAssociatedWithQueue(Self);
      auto EventImpl = detail::getSyclObjImpl(ResEvent);
      {
        NestedCallsTracker tracker;
        MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents),
                  &EventImpl->getHandleRef(), EventImpl);
      }

      if (isInOrder()) {
        auto &EventToStoreIn = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr
                                                : MExtGraphDeps.LastEventPtr;
        EventToStoreIn = EventImpl;
      }
      // Track only if we won't be able to handle it with piQueueFinish.
      if (MEmulateOOO)
        addSharedEvent(ResEvent);
      return discard_or_return(ResEvent);
    }
  }
  return submitWithHandler(Self, DepEvents, HandlerFunc);
}

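// instrumentationProlog/instrumentationEpilog bracket queue::wait() with XPTI
// trace_wait_begin/trace_wait_end notifications when tracing is enabled; the
// prolog returns the trace event that the epilog later closes.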
void *queue_impl::instrumentationProlog(const detail::code_location &CodeLoc,
                                        std::string &Name, int32_t StreamID,
                                        uint64_t &IId) {
  void *TraceEvent = nullptr;
  (void)CodeLoc;
  (void)Name;
  (void)StreamID;
  (void)IId;
#ifdef XPTI_ENABLE_INSTRUMENTATION
  constexpr uint16_t NotificationTraceType = xpti::trace_wait_begin;
  if (!xptiCheckTraceEnabled(StreamID, NotificationTraceType))
    return TraceEvent;

  xpti::payload_t Payload;
  bool HasSourceInfo = false;
  // We try to create a unique string for the wait() call by combining it with
  // the queue address.
  xpti::utils::StringHelper NG;
  Name = NG.nameWithAddress<queue_impl *>("queue.wait", this);

  if (CodeLoc.fileName()) {
    // We have source code location information.
    Payload =
        xpti::payload_t(Name.c_str(), CodeLoc.fileName(), CodeLoc.lineNumber(),
                        CodeLoc.columnNumber(), (void *)this);
    HasSourceInfo = true;
  } else {
    // We have no location information, so we'll use the address of the queue.
    Payload = xpti::payload_t(Name.c_str(), (void *)this);
  }
  // wait() calls could be at different user-code locations; we create a new
  // event based on the code location info, and if this has been seen before, a
  // previously created event will be returned.
  uint64_t QWaitInstanceNo = 0;
  xpti::trace_event_data_t *WaitEvent =
      xptiMakeEvent(Name.c_str(), &Payload, xpti::trace_graph_event,
                    xpti_at::active, &QWaitInstanceNo);
  IId = QWaitInstanceNo;
  if (WaitEvent) {
    xpti::addMetadata(WaitEvent, "sycl_device_type", queueDeviceToString(this));
    if (HasSourceInfo) {
      xpti::addMetadata(WaitEvent, "sym_function_name", CodeLoc.functionName());
      xpti::addMetadata(WaitEvent, "sym_source_file_name", CodeLoc.fileName());
      xpti::addMetadata(WaitEvent, "sym_line_no",
                        static_cast<int32_t>((CodeLoc.lineNumber())));
      xpti::addMetadata(WaitEvent, "sym_column_no",
                        static_cast<int32_t>((CodeLoc.columnNumber())));
    }
    xptiNotifySubscribers(StreamID, xpti::trace_wait_begin, nullptr, WaitEvent,
                          QWaitInstanceNo,
                          static_cast<const void *>(Name.c_str()));
    TraceEvent = (void *)WaitEvent;
  }
#endif
  return TraceEvent;
}

void queue_impl::instrumentationEpilog(void *TelemetryEvent, std::string &Name,
                                       int32_t StreamID, uint64_t IId) {
  (void)TelemetryEvent;
  (void)Name;
  (void)StreamID;
  (void)IId;
#ifdef XPTI_ENABLE_INSTRUMENTATION
  constexpr uint16_t NotificationTraceType = xpti::trace_wait_end;
  if (!(xptiCheckTraceEnabled(StreamID, NotificationTraceType) &&
        TelemetryEvent))
    return;
  // Close the wait() scope
  xpti::trace_event_data_t *TraceEvent =
      (xpti::trace_event_data_t *)TelemetryEvent;
  xptiNotifySubscribers(StreamID, NotificationTraceType, nullptr, TraceEvent,
                        IId, static_cast<const void *>(Name.c_str()));
#endif
}

void queue_impl::wait(const detail::code_location &CodeLoc) {
  (void)CodeLoc;
#ifdef XPTI_ENABLE_INSTRUMENTATION
  void *TelemetryEvent = nullptr;
  uint64_t IId;
  std::string Name;
  int32_t StreamID = xptiRegisterStream(SYCL_STREAM_NAME);
  TelemetryEvent = instrumentationProlog(CodeLoc, Name, StreamID, IId);
#endif

  if (MGraph.lock()) {
    throw sycl::exception(make_error_code(errc::invalid),
                          "wait cannot be called for a queue which is "
                          "recording to a command graph.");
  }

  // If there is an external event set, we know we are using an in-order queue
  // and the event is required to finish after the last event in the queue. As
  // such, we can just wait for it and finish.
  std::optional<event> ExternalEvent = popExternalEvent();
  if (ExternalEvent) {
    ExternalEvent->wait();

    // Additionally, we can clean up the event lists that we would have
    // otherwise cleared.
    if (!MEventsWeak.empty() || !MEventsShared.empty()) {
      std::lock_guard<std::mutex> Lock(MMutex);
      MEventsWeak.clear();
      MEventsShared.clear();
    }
    if (!MStreamsServiceEvents.empty()) {
      std::lock_guard<std::mutex> Lock(MStreamsServiceEventsMutex);
      MStreamsServiceEvents.clear();
    }
  }

  std::vector<std::weak_ptr<event_impl>> WeakEvents;
  std::vector<event> SharedEvents;
  {
    std::lock_guard<std::mutex> Lock(MMutex);
    WeakEvents.swap(MEventsWeak);
    SharedEvents.swap(MEventsShared);

    {
      std::lock_guard<std::mutex> RequestLock(MMissedCleanupRequestsMtx);
      for (auto &UpdatedGraph : MMissedCleanupRequests)
        doUnenqueuedCommandCleanup(UpdatedGraph);
      MMissedCleanupRequests.clear();
    }
  }
  // If the queue is either a host one or does not support OOO (and we use
  // multiple in-order queues as a result of that), wait for each event
  // directly. Otherwise, only wait for unenqueued or host task events, starting
  // from the latest submitted task in order to minimize the total number of
  // calls, then handle the rest with piQueueFinish.
  const bool SupportsPiFinish = !MEmulateOOO;
  for (auto EventImplWeakPtrIt = WeakEvents.rbegin();
       EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) {
    if (std::shared_ptr<event_impl> EventImplSharedPtr =
            EventImplWeakPtrIt->lock()) {
      // A nullptr PI event indicates that piQueueFinish will not cover it,
      // either because it's a host task event or an unenqueued one.
      if (!SupportsPiFinish || nullptr == EventImplSharedPtr->getHandleRef()) {
        EventImplSharedPtr->wait(EventImplSharedPtr);
      }
    }
  }
  if (SupportsPiFinish) {
    const PluginPtr &Plugin = getPlugin();
    Plugin->call<PiApiKind::piQueueFinish>(getHandleRef());
    assert(SharedEvents.empty() && "Queues that support calling piQueueFinish "
                                   "shouldn't have shared events");
  } else {
    for (event &Event : SharedEvents)
      Event.wait();
  }

  std::vector<EventImplPtr> StreamsServiceEvents;
  {
    std::lock_guard<std::mutex> Lock(MStreamsServiceEventsMutex);
    StreamsServiceEvents.swap(MStreamsServiceEvents);
  }
  for (const EventImplPtr &Event : StreamsServiceEvents)
    Event->wait(Event);

#ifdef XPTI_ENABLE_INSTRUMENTATION
  instrumentationEpilog(TelemetryEvent, Name, StreamID, IId);
#endif
}

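// Note: for the OpenCL backend the queue is retained before its native handle
// is returned, presumably so that ownership of the returned handle can pass
// to the caller.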
pi_native_handle queue_impl::getNative(int32_t &NativeHandleDesc) const {
  const PluginPtr &Plugin = getPlugin();
  if (getContextImplPtr()->getBackend() == backend::opencl)
    Plugin->call<PiApiKind::piQueueRetain>(MQueues[0]);
  pi_native_handle Handle{};
  Plugin->call<PiApiKind::piextQueueGetNativeHandle>(MQueues[0], &Handle,
                                                     &NativeHandleDesc);
  return Handle;
}

void queue_impl::cleanup_fusion_cmd() {
  // Clean up only if a scheduler instance exists.
  if (detail::Scheduler::isInstanceAlive())
    detail::Scheduler::getInstance().cleanUpCmdFusion(this);
}

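// Returns true if all work submitted to this queue has completed. Fast path:
// for an in-order queue that does not discard events, only the last event
// needs checking; otherwise the backend queue is queried and host-task events
// are inspected individually.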
bool queue_impl::queue_empty() const {
  // If we have an in-order queue where events are not discarded, then just
  // check the status of the last event.
  if (isInOrder() && !MDiscardEvents) {
    std::lock_guard<std::mutex> Lock(MMutex);
    // If there is no last event, we know that no work has been submitted, so
    // the queue must be trivially empty.
    if (!MDefaultGraphDeps.LastEventPtr)
      return true;
    // Otherwise, check if the last event is finished.
    // Note that we fall back to the backend query if the event was discarded,
    // which may happen despite the queue not being a discard-event queue.
    if (!MDefaultGraphDeps.LastEventPtr->isDiscarded())
      return MDefaultGraphDeps.LastEventPtr
                 ->get_info<info::event::command_execution_status>() ==
             info::event_command_status::complete;
  }

  // Check the status of the backend queue.
  pi_bool IsReady = false;
  getPlugin()->call<PiApiKind::piQueueGetInfo>(
      MQueues[0], PI_EXT_ONEAPI_QUEUE_INFO_EMPTY, sizeof(pi_bool), &IsReady,
      nullptr);
  if (!IsReady)
    return false;

  // We may have events like host tasks which are not submitted to the backend
  // queue, so we need to get their status separately.
  std::lock_guard<std::mutex> Lock(MMutex);
  for (event Event : MEventsShared)
    if (Event.get_info<info::event::command_execution_status>() !=
        info::event_command_status::complete)
      return false;

  for (auto EventImplWeakPtrIt = MEventsWeak.begin();
       EventImplWeakPtrIt != MEventsWeak.end(); ++EventImplWeakPtrIt)
    if (std::shared_ptr<event_impl> EventImplSharedPtr =
            EventImplWeakPtrIt->lock())
      if (EventImplSharedPtr->isHost() &&
          EventImplSharedPtr
                  ->get_info<info::event::command_execution_status>() !=
              info::event_command_status::complete)
        return false;

  // If we didn't exit early above, then it means that all events in the queue
  // are completed.
  return true;
}

event queue_impl::discard_or_return(const event &Event) {
  if (!MDiscardEvents)
    return Event;
  return createDiscardedEvent();
}

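// Called when a host task completes: performs unenqueued-command cleanup
// immediately if the queue lock can be taken without blocking, otherwise
// records the request in MMissedCleanupRequests to be drained later in wait().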
void queue_impl::revisitUnenqueuedCommandsState(
    const EventImplPtr &CompletedHostTask) {
  if (MIsInorder)
    return;
  std::unique_lock<std::mutex> Lock{MMutex, std::try_to_lock};
  if (Lock.owns_lock())
    doUnenqueuedCommandCleanup(CompletedHostTask->getCommandGraph());
  else {
    std::lock_guard<std::mutex> RequestLock(MMissedCleanupRequestsMtx);
    MMissedCleanupRequests.push_back(CompletedHostTask->getCommandGraph());
  }
}

void queue_impl::doUnenqueuedCommandCleanup(
    const std::shared_ptr<ext::oneapi::experimental::detail::graph_impl>
        &Graph) {
  auto tryToCleanup = [](DependencyTrackingItems &Deps) {
    if (Deps.LastBarrier && Deps.LastBarrier->isEnqueued()) {
      Deps.LastBarrier = nullptr;
      Deps.UnenqueuedCmdEvents.clear();
    } else {
      if (Deps.UnenqueuedCmdEvents.empty())
        return;
      Deps.UnenqueuedCmdEvents.erase(
          std::remove_if(
              Deps.UnenqueuedCmdEvents.begin(), Deps.UnenqueuedCmdEvents.end(),
              [](const EventImplPtr &CommandEvent) {
                return (CommandEvent->isHost() ? CommandEvent->isCompleted()
                                               : CommandEvent->isEnqueued());
              }),
          Deps.UnenqueuedCmdEvents.end());
    }
  };
  // Barrier enqueueing could be significantly postponed due to a host task
  // dependency, if any. There is no guarantee that it will happen while the
  // same graph deps are still recording.
  if (Graph && Graph == getCommandGraph())
    tryToCleanup(MExtGraphDeps);
  else
    tryToCleanup(MDefaultGraphDeps);
}

} // namespace detail
} // namespace _V1
} // namespace sycl