DPC++ Runtime
Runtime libraries for oneAPI DPC++
queue_impl.hpp
Go to the documentation of this file.
1 //==------------------ queue_impl.hpp - SYCL queue -------------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #pragma once
10 
11 #include <detail/config.hpp>
12 #include <detail/context_impl.hpp>
13 #include <detail/device_impl.hpp>
14 #include <detail/device_info.hpp>
15 #include <detail/event_impl.hpp>
17 #include <detail/handler_impl.hpp>
18 #include <detail/kernel_impl.hpp>
19 #include <detail/plugin.hpp>
21 #include <detail/stream_impl.hpp>
22 #include <detail/thread_pool.hpp>
23 #include <sycl/context.hpp>
25 #include <sycl/detail/ur.hpp>
26 #include <sycl/device.hpp>
27 #include <sycl/event.hpp>
28 #include <sycl/exception.hpp>
29 #include <sycl/exception_list.hpp>
30 #include <sycl/handler.hpp>
32 #include <sycl/property_list.hpp>
33 #include <sycl/queue.hpp>
34 
35 #include "detail/graph_impl.hpp"
36 
37 #include <utility>
38 
39 #ifdef XPTI_ENABLE_INSTRUMENTATION
40 #include "xpti/xpti_trace_framework.hpp"
41 #include <detail/xpti_registry.hpp>
42 #endif
43 
44 namespace sycl {
45 inline namespace _V1 {
46 
47 // forward declaration
48 
49 namespace ext::oneapi::experimental::detail {
50 class graph_impl;
51 }
52 
53 namespace detail {
54 
55 using ContextImplPtr = std::shared_ptr<detail::context_impl>;
56 using DeviceImplPtr = std::shared_ptr<detail::device_impl>;
57 
// Upper bound on the number of entries kept in queue_impl::MQueues. Once
// this many queues exist, getExclusiveUrQueueHandleRef() reuses an existing
// entry instead of appending a new one.
59 static constexpr size_t MaxNumQueues = 256;
60 
63 enum class CUDAContextT : char { primary, custom };
64 
67 
69 
70 class queue_impl {
71 public:
72  // \return a default context for the platform if it includes the device
73  // passed and default contexts are enabled, a new context otherwise.
77  context{createSyclObjFromImpl<device>(Device), {}, {}});
78 
79  ContextImplPtr DefaultContext = detail::getSyclObjImpl(
80  Device->get_platform().ext_oneapi_get_default_context());
81  if (DefaultContext->isDeviceValid(Device))
82  return DefaultContext;
84  context{createSyclObjFromImpl<device>(Device), {}, {}});
85  }
93  queue_impl(const DeviceImplPtr &Device, const async_handler &AsyncHandler,
94  const property_list &PropList)
95  : queue_impl(Device, getDefaultOrNew(Device), AsyncHandler, PropList) {};
96 
106  queue_impl(const DeviceImplPtr &Device, const ContextImplPtr &Context,
107  const async_handler &AsyncHandler, const property_list &PropList)
108  : MDevice(Device), MContext(Context), MAsyncHandler(AsyncHandler),
109  MPropList(PropList),
110  MIsInorder(has_property<property::queue::in_order>()),
112  has_property<ext::oneapi::property::queue::discard_events>()),
113  MIsProfilingEnabled(has_property<property::queue::enable_profiling>()),
114  MQueueID{
116  if (has_property<property::queue::enable_profiling>()) {
117  if (has_property<ext::oneapi::property::queue::discard_events>())
119  "Queue cannot be constructed with both of "
120  "discard_events and enable_profiling.");
121  // fallback profiling support. See MFallbackProfiling
122  if (MDevice->has(aspect::queue_profiling)) {
123  // When urDeviceGetGlobalTimestamps is not supported (the OpenCL version
124  // < 2.1 case), use the fallback profiling implementation.
125  if (!getDeviceImplPtr()->isGetDeviceAndHostTimerSupported())
126  MFallbackProfiling = true;
127  } else {
129  "Cannot enable profiling, the associated device "
130  "does not have the queue_profiling aspect");
131  }
132  }
133  if (has_property<ext::intel::property::queue::compute_index>()) {
134  int Idx = get_property<ext::intel::property::queue::compute_index>()
135  .get_index();
136  int NumIndices =
137  createSyclObjFromImpl<device>(Device)
138  .get_info<ext::intel::info::device::max_compute_queue_indices>();
139  if (Idx < 0 || Idx >= NumIndices)
140  throw sycl::exception(
142  "Queue compute index must be a non-negative number less than "
143  "device's number of available compute queue indices.");
144  }
145  if (!Context->isDeviceValid(Device)) {
146  if (Context->getBackend() == backend::opencl)
147  throw sycl::exception(
149  "Queue cannot be constructed with the given context and device "
150  "since the device is not a member of the context (descendants of "
151  "devices from the context are not supported on OpenCL yet).");
152  throw sycl::exception(
154  "Queue cannot be constructed with the given context and device "
155  "since the device is neither a member of the context nor a "
156  "descendant of its member.");
157  }
158  const QueueOrder QOrder =
160  MQueues.push_back(createQueue(QOrder));
161  // This section is the second part of the instrumentation that uses the
162  // tracepoint information and notifies
163 
164  // We enable XPTI tracing events using the TLS mechanism; if the code
165  // location data is available, then the tracing data will be rich.
166 #if XPTI_ENABLE_INSTRUMENTATION
167  // Emit a trace event for queue creation; we currently do not get code
168  // location information, so all queues will have the same UID with a
169  // different instance ID until this gets added.
171 #endif
172  }
173 
174  event getLastEvent();
175 
176 private:
177  void queue_impl_interop(ur_queue_handle_t UrQueue) {
178  if (has_property<ext::oneapi::property::queue::discard_events>() &&
179  has_property<property::queue::enable_profiling>()) {
181  "Queue cannot be constructed with both of "
182  "discard_events and enable_profiling.");
183  }
184 
185  MQueues.push_back(UrQueue);
186 
187  ur_device_handle_t DeviceUr{};
188  const PluginPtr &Plugin = getPlugin();
189  // TODO catch an exception and put it to list of asynchronous exceptions
190  Plugin->call<UrApiKind::urQueueGetInfo>(
191  MQueues[0], UR_QUEUE_INFO_DEVICE, sizeof(DeviceUr), &DeviceUr, nullptr);
192  MDevice = MContext->findMatchingDeviceImpl(DeviceUr);
193  if (MDevice == nullptr) {
194  throw sycl::exception(
196  "Device provided by native Queue not found in Context.");
197  }
198  // The following commented section provides a guideline on how to use the
199  // TLS enabled mechanism to create a tracepoint and notify using XPTI. This
200  // is the prolog section and the epilog section will initiate the
201  // notification.
202 #if XPTI_ENABLE_INSTRUMENTATION
203  // Emit a trace event for queue creation; we currently do not get code
204  // location information, so all queues will have the same UID with a
205  // different instance ID until this gets added.
207 #endif
208  }
209 
210 public:
217  queue_impl(ur_queue_handle_t UrQueue, const ContextImplPtr &Context,
218  const async_handler &AsyncHandler)
219  : MContext(Context), MAsyncHandler(AsyncHandler),
220  MIsInorder(has_property<property::queue::in_order>()),
222  has_property<ext::oneapi::property::queue::discard_events>()),
223  MIsProfilingEnabled(has_property<property::queue::enable_profiling>()),
224  MQueueID{
226  queue_impl_interop(UrQueue);
227  }
228 
236  queue_impl(ur_queue_handle_t UrQueue, const ContextImplPtr &Context,
237  const async_handler &AsyncHandler, const property_list &PropList)
238  : MContext(Context), MAsyncHandler(AsyncHandler), MPropList(PropList),
239  MIsInorder(has_property<property::queue::in_order>()),
241  has_property<ext::oneapi::property::queue::discard_events>()),
242  MIsProfilingEnabled(has_property<property::queue::enable_profiling>()),
243  MQueueID{
245  queue_impl_interop(UrQueue);
246  }
247 
249  try {
250 #if XPTI_ENABLE_INSTRUMENTATION
251  // The trace event created in the constructor should be active through the
252  // lifetime of the queue object as member variable. We will send a
253  // notification and destroy the trace event for this queue.
255 #endif
257  getPlugin()->call<UrApiKind::urQueueRelease>(MQueues[0]);
258  } catch (std::exception &e) {
259  __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~queue_impl", e);
260  }
261  }
262 
264 
/// Returns the native handle of the first underlying UR queue, cast to an
/// OpenCL cl_command_queue.
///
/// \return the native handle of MQueues[0] as a cl_command_queue.
265  cl_command_queue get() {
  // The UR queue is retained before its native handle is handed out.
  // NOTE(review): presumably the caller then owns a reference it must
  // release - confirm against the interop contract.
266  getPlugin()->call<UrApiKind::urQueueRetain>(MQueues[0]);
267  ur_native_handle_t nativeHandle = 0;
  // No native descriptor is requested (second argument is nullptr).
268  getPlugin()->call<UrApiKind::urQueueGetNativeHandle>(MQueues[0], nullptr,
269  &nativeHandle);
270  return ur::cast<cl_command_queue>(nativeHandle);
271  }
272 
275  return createSyclObjFromImpl<context>(MContext);
276  }
277 
/// \return the plugin of the context this queue is associated with.
278  const PluginPtr &getPlugin() const { return MContext->getPlugin(); }
279 
/// \return the context implementation this queue was constructed with.
280  const ContextImplPtr &getContextImplPtr() const { return MContext; }
281 
/// \return the device implementation associated with this queue.
282  const DeviceImplPtr &getDeviceImplPtr() const { return MDevice; }
283 
/// \return a SYCL device object created from this queue's device
/// implementation.
285  device get_device() const { return createSyclObjFromImpl<device>(MDevice); }
286 
/// \return the value of MDiscardEvents, i.e. whether the queue was
/// constructed with the discard_events property.
288  bool hasDiscardEventsProperty() const { return MDiscardEvents; }
289 
/// \return MIsInorder; as written, this returns the same value as
/// isInOrder().
291  bool supportsDiscardingPiEvents() const { return MIsInorder; }
292 
/// \return true if the queue was constructed as an in-order queue.
293  bool isInOrder() const { return MIsInorder; }
294 
298  template <typename Param> typename Param::return_type get_info() const;
299 
303  template <typename Param>
304  typename Param::return_type get_backend_info() const;
305 
309  void flush() {
310  if (MGraph.lock()) {
312  "flush cannot be called for a queue which is "
313  "recording to a command graph.");
314  }
315  for (const auto &queue : MQueues) {
316  getPlugin()->call<UrApiKind::urQueueFlush>(queue);
317  }
318  }
319 
320  using SubmitPostProcessF = std::function<void(bool, bool, event &)>;
321 
/// Submits the command group function \p CGF to this queue and retries on
/// \p SecondQueue if the first submission throws.
///
/// \param CGF command group function object to submit.
/// \param Self shared pointer passed to submit_impl as both the Self and
///        PrimaryQueue arguments of the first attempt.
/// \param SecondQueue queue on which the submission is retried when the
///        first attempt throws.
/// \param Loc code location forwarded to submit_impl.
/// \param PostProcess optional post-processing callback forwarded to
///        submit_impl.
/// \return the result of discard_or_return() applied to the event produced
///         by whichever submission succeeded.
335  event submit(const std::function<void(handler &)> &CGF,
336  const std::shared_ptr<queue_impl> &Self,
337  const std::shared_ptr<queue_impl> &SecondQueue,
338  const detail::code_location &Loc,
339  const SubmitPostProcessF *PostProcess = nullptr) {
340  event ResEvent;
341  try {
342  ResEvent = submit_impl(CGF, Self, Self, SecondQueue,
343  /*CallerNeedsEvent=*/true, Loc, PostProcess);
344  } catch (...) {
  // Any exception from the first attempt triggers the retry. The retry runs
  // outside the try block, so an exception it throws propagates to the
  // caller. SecondQueue is dereferenced without a null check here.
345  ResEvent =
346  SecondQueue->submit_impl(CGF, SecondQueue, Self, SecondQueue,
347  /*CallerNeedsEvent=*/true, Loc, PostProcess);
348  }
349  return discard_or_return(ResEvent);
350  }
351 
/// Submits the command group function \p CGF to this queue with no secondary
/// queue (nullptr is passed as the SecondaryQueue argument of submit_impl).
///
/// \param CGF command group function object to submit.
/// \param Self shared pointer passed to submit_impl as both the Self and
///        PrimaryQueue arguments.
/// \param Loc code location forwarded to submit_impl.
/// \param PostProcess optional post-processing callback forwarded to
///        submit_impl.
/// \return the result of discard_or_return() applied to the submission
///         event.
360  event submit(const std::function<void(handler &)> &CGF,
361  const std::shared_ptr<queue_impl> &Self,
362  const detail::code_location &Loc,
363  const SubmitPostProcessF *PostProcess = nullptr) {
364  auto ResEvent = submit_impl(CGF, Self, Self, nullptr,
365  /*CallerNeedsEvent=*/true, Loc, PostProcess);
366  return discard_or_return(ResEvent);
367  }
368 
/// Submits the command group function \p CGF like the single-queue submit(),
/// but passes CallerNeedsEvent=false to submit_impl and ignores the event it
/// returns.
///
/// \param CGF command group function object to submit.
/// \param Self shared pointer passed to submit_impl as both the Self and
///        PrimaryQueue arguments.
/// \param Loc code location forwarded to submit_impl.
/// \param PostProcess optional post-processing callback forwarded to
///        submit_impl.
369  void submit_without_event(const std::function<void(handler &)> &CGF,
370  const std::shared_ptr<queue_impl> &Self,
371  const detail::code_location &Loc,
372  const SubmitPostProcessF *PostProcess = nullptr) {
373  submit_impl(CGF, Self, Self, nullptr, /*CallerNeedsEvent=*/false, Loc,
374  PostProcess);
375  }
376 
382  void wait(const detail::code_location &Loc = {});
383 
386 
388  void wait_and_throw(const detail::code_location &Loc = {}) {
389  wait(Loc);
391  }
392 
401  if (!MAsyncHandler)
402  return;
403 
404  exception_list Exceptions;
405  {
406  std::lock_guard<std::mutex> Lock(MMutex);
407  std::swap(Exceptions, MExceptions);
408  }
409  // Unlock the mutex before calling user-provided handler to avoid
410  // potential deadlock if the same queue is somehow referenced in the
411  // handler.
412  if (Exceptions.size())
413  MAsyncHandler(std::move(Exceptions));
414  }
415 
421  static ur_queue_flags_t createUrQueueFlags(const property_list &PropList,
422  QueueOrder Order) {
423  ur_queue_flags_t CreationFlags = 0;
424 
425  if (Order == QueueOrder::OOO) {
426  CreationFlags = UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE;
427  }
428  if (PropList.has_property<property::queue::enable_profiling>()) {
429  CreationFlags |= UR_QUEUE_FLAG_PROFILING_ENABLE;
430  }
431  if (PropList.has_property<
432  ext::oneapi::cuda::property::queue::use_default_stream>()) {
433  CreationFlags |= UR_QUEUE_FLAG_USE_DEFAULT_STREAM;
434  }
435  if (PropList.has_property<ext::oneapi::property::queue::discard_events>()) {
436  // Pass this flag to the Level Zero plugin to be able to check it from
437  // queue property.
438  CreationFlags |= UR_QUEUE_FLAG_DISCARD_EVENTS;
439  }
440  // Track that priority settings are not ambiguous.
441  bool PrioritySeen = false;
442  if (PropList
443  .has_property<ext::oneapi::property::queue::priority_normal>()) {
444  // Normal is the default priority, don't pass anything.
445  PrioritySeen = true;
446  }
447  if (PropList.has_property<ext::oneapi::property::queue::priority_low>()) {
448  if (PrioritySeen) {
449  throw sycl::exception(
451  "Queue cannot be constructed with different priorities.");
452  }
453  CreationFlags |= UR_QUEUE_FLAG_PRIORITY_LOW;
454  PrioritySeen = true;
455  }
456  if (PropList.has_property<ext::oneapi::property::queue::priority_high>()) {
457  if (PrioritySeen) {
458  throw sycl::exception(
460  "Queue cannot be constructed with different priorities.");
461  }
462  CreationFlags |= UR_QUEUE_FLAG_PRIORITY_HIGH;
463  }
464  // Track that submission modes do not conflict.
465  bool SubmissionSeen = false;
466  if (PropList.has_property<
467  ext::intel::property::queue::no_immediate_command_list>()) {
468  SubmissionSeen = true;
469  CreationFlags |= UR_QUEUE_FLAG_SUBMISSION_BATCHED;
470  }
471  if (PropList.has_property<
472  ext::intel::property::queue::immediate_command_list>()) {
473  if (SubmissionSeen) {
474  throw sycl::exception(
476  "Queue cannot be constructed with different submission modes.");
477  }
478  SubmissionSeen = true;
479  CreationFlags |= UR_QUEUE_FLAG_SUBMISSION_IMMEDIATE;
480  }
481  return CreationFlags;
482  }
483 
488  ur_queue_handle_t createQueue(QueueOrder Order) {
489  ur_queue_handle_t Queue{};
490  ur_context_handle_t Context = MContext->getHandleRef();
491  ur_device_handle_t Device = MDevice->getHandleRef();
492  const PluginPtr &Plugin = getPlugin();
493  /*
494  sycl::detail::pi::PiQueueProperties Properties[] = {
495  PI_QUEUE_FLAGS, createPiQueueProperties(MPropList, Order), 0, 0, 0};
496  */
497  ur_queue_properties_t Properties = {UR_STRUCTURE_TYPE_QUEUE_PROPERTIES,
498  nullptr, 0};
499  Properties.flags = createUrQueueFlags(MPropList, Order);
500  ur_queue_index_properties_t IndexProperties = {
501  UR_STRUCTURE_TYPE_QUEUE_INDEX_PROPERTIES, nullptr, 0};
502  if (has_property<ext::intel::property::queue::compute_index>()) {
503  IndexProperties.computeIndex =
504  get_property<ext::intel::property::queue::compute_index>()
505  .get_index();
506  Properties.pNext = &IndexProperties;
507  }
508  ur_result_t Error = Plugin->call_nocheck<UrApiKind::urQueueCreate>(
509  Context, Device, &Properties, &Queue);
510 
511  // If creating out-of-order queue failed and this property is not
512  // supported (for example, on FPGA), it will return
513  // UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES and will try to create in-order
514  // queue.
515  if (!MEmulateOOO && Error == UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES) {
516  MEmulateOOO = true;
518  } else {
519  Plugin->checkUrResult(Error);
520  }
521 
522  return Queue;
523  }
524 
527  ur_queue_handle_t &getExclusiveUrQueueHandleRef() {
528  ur_queue_handle_t *PIQ = nullptr;
529  bool ReuseQueue = false;
530  {
531  std::lock_guard<std::mutex> Lock(MMutex);
532 
533  // To achieve parallelism for FPGA with in order execution model with
534  // possibility of two kernels to share data with each other we shall
535  // create a queue for every kernel enqueued.
536  if (MQueues.size() < MaxNumQueues) {
537  MQueues.push_back({});
538  PIQ = &MQueues.back();
539  } else {
540  // If the limit of OpenCL queues is going to be exceeded - take the
541  // earliest used queue, wait until it finished and then reuse it.
542  PIQ = &MQueues[MNextQueueIdx];
544  ReuseQueue = true;
545  }
546  }
547 
548  if (!ReuseQueue)
550  else
551  getPlugin()->call<UrApiKind::urQueueFinish>(*PIQ);
552 
553  return *PIQ;
554  }
555 
558  ur_queue_handle_t &getHandleRef() {
559  if (!MEmulateOOO)
560  return MQueues[0];
561 
563  }
564 
/// Checks the property list stored in MPropList for property \p propertyT.
///
/// \return whatever MPropList.has_property<propertyT>() returns.
567  template <typename propertyT> bool has_property() const noexcept {
568  return MPropList.has_property<propertyT>();
569  }
570 
/// Fetches property \p propertyT from the property list stored in MPropList.
///
/// \return a copy of the property, as returned by
///         MPropList.get_property<propertyT>().
574  template <typename propertyT> propertyT get_property() const {
575  return MPropList.get_property<propertyT>();
576  }
577 
588  event memset(const std::shared_ptr<queue_impl> &Self, void *Ptr, int Value,
589  size_t Count, const std::vector<event> &DepEvents,
590  bool CallerNeedsEvent);
602  event memcpy(const std::shared_ptr<queue_impl> &Self, void *Dest,
603  const void *Src, size_t Count,
604  const std::vector<event> &DepEvents, bool CallerNeedsEvent,
605  const code_location &CodeLoc);
617  event mem_advise(const std::shared_ptr<queue_impl> &Self, const void *Ptr,
618  size_t Length, ur_usm_advice_flags_t Advice,
619  const std::vector<event> &DepEvents, bool CallerNeedsEvent);
620 
/// Appends \p ExceptionPtr to the queue's list of pending exceptions
/// (MExceptions) while holding MMutex.
///
/// \param ExceptionPtr exception pointer to record.
624  void reportAsyncException(const std::exception_ptr &ExceptionPtr) {
625  std::lock_guard<std::mutex> Lock(MMutex);
626  MExceptions.PushBack(ExceptionPtr);
627  }
628 
631  }
632 
636  ur_native_handle_t getNative(int32_t &NativeHandleDesc) const;
637 
639  std::lock_guard<std::mutex> Lock(MStreamsServiceEventsMutex);
640  MStreamsServiceEvents.push_back(Event);
641  }
642 
643  bool ext_oneapi_empty() const;
644 
645  event memcpyToDeviceGlobal(const std::shared_ptr<queue_impl> &Self,
646  void *DeviceGlobalPtr, const void *Src,
647  bool IsDeviceImageScope, size_t NumBytes,
648  size_t Offset, const std::vector<event> &DepEvents,
649  bool CallerNeedsEvent);
650  event memcpyFromDeviceGlobal(const std::shared_ptr<queue_impl> &Self,
651  void *Dest, const void *DeviceGlobalPtr,
652  bool IsDeviceImageScope, size_t NumBytes,
653  size_t Offset,
654  const std::vector<event> &DepEvents,
655  bool CallerNeedsEvent);
656 
658 
660  std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> Graph) {
661  std::lock_guard<std::mutex> Lock(MMutex);
662  MGraph = Graph;
664  }
665 
/// \return a shared pointer obtained by locking the weak reference MGraph;
///         it is empty when the referenced graph no longer exists.
666  std::shared_ptr<ext::oneapi::experimental::detail::graph_impl>
667  getCommandGraph() const {
668  return MGraph.lock();
669  }
670 
/// \return the numeric ID stored in MQueueID.
671  unsigned long long getQueueID() { return MQueueID; }
672 
/// \return the opaque trace event pointer stored in MTraceEvent (nullptr by
///         default).
673  void *getTraceEvent() { return MTraceEvent; }
674 
/// Stores \p Event in MInOrderExternalEvent under MInOrderExternalEventMtx,
/// replacing any previously stored event.
///
/// \param Event the external event to store.
675  void setExternalEvent(const event &Event) {
676  std::lock_guard<std::mutex> Lock(MInOrderExternalEventMtx);
677  MInOrderExternalEvent = Event;
678  }
679 
/// Takes the stored external event, if any, and leaves the member empty.
///
/// \return the event previously held in MInOrderExternalEvent, or
///         std::nullopt if none was set.
680  std::optional<event> popExternalEvent() {
681  std::lock_guard<std::mutex> Lock(MInOrderExternalEventMtx);
682  std::optional<event> Result = std::nullopt;
  // Swapping with an empty optional both extracts the stored value and
  // resets the member while the lock is held.
683  std::swap(Result, MInOrderExternalEvent);
684  return Result;
685  }
686 
687  const std::vector<event> &
688  getExtendDependencyList(const std::vector<event> &DepEvents,
689  std::vector<event> &MutableVec,
690  std::unique_lock<std::mutex> &QueueLock);
691 
692  // Helps to manage host tasks presence in scenario with barrier usage.
693  // Approach that tracks almost all tasks to provide barrier sync for both ur
694  // tasks and host tasks is applicable for out of order queues only. No-op
695  // for in order ones.
696  void tryToResetEnqueuedBarrierDep(const EventImplPtr &EnqueuedBarrierEvent);
697 
698  // Called on host task completion that could block some kernels from enqueue.
699  // Approach that tracks almost all tasks to provide barrier sync for both ur
700  // tasks and host tasks is applicable for out of order queues only. Not needed
701  // for in order ones.
702  void revisitUnenqueuedCommandsState(const EventImplPtr &CompletedHostTask);
703 
/// Null-safe accessor for a queue's context implementation.
///
/// \param Queue queue to query; may be null.
/// \return Queue's context implementation pointer, or nullptr when \p Queue
///         is null.
704  static ContextImplPtr getContext(const QueueImplPtr &Queue) {
705  return Queue ? Queue->getContextImplPtr() : nullptr;
706  }
707 
708  // Must be called under MMutex protection
710  const std::shared_ptr<ext::oneapi::experimental::detail::graph_impl>
711  &Graph);
712 
713  const property_list &getPropList() const { return MPropList; }
714 
715 protected:
716  event discard_or_return(const event &Event);
717 
/// Enqueues a barrier on the handler's queue and wraps the resulting UR event
/// in a new event_impl.
///
/// \param Handler handler whose MQueue receives the barrier.
/// \return a new event_impl, constructed from Handler.MQueue, holding the UR
///         event produced by the barrier enqueue.
718  template <typename HandlerType = handler>
719  EventImplPtr insertHelperBarrier(const HandlerType &Handler) {
720  auto ResEvent = std::make_shared<detail::event_impl>(Handler.MQueue);
721  ur_event_handle_t UREvent = nullptr;
  // The barrier is enqueued with an empty wait list (count 0, nullptr).
722  getPlugin()->call<UrApiKind::urEnqueueEventsWaitWithBarrier>(
723  Handler.MQueue->getHandleRef(), 0, nullptr, &UREvent);
724  ResEvent->setHandle(UREvent);
725  return ResEvent;
726  }
727 
728  // template is needed for proper unit testing
729  template <typename HandlerType = handler>
730  void finalizeHandler(HandlerType &Handler, event &EventRet) {
731  if (MIsInorder) {
732  // Accessing and changing of an event isn't atomic operation.
733  // Hence, here is the lock for thread-safety.
734  std::lock_guard<std::mutex> Lock{MMutex};
735 
736  auto &EventToBuildDeps = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr
738 
739  // This dependency is needed for the following purposes:
740  // - host tasks are handled by the runtime and cannot be implicitly
741  // synchronized by the backend.
742  // - to prevent the 2nd kernel enqueue when the 1st kernel is blocked
743  // by a host task. This dependency allows to build the enqueue order in
744  // the RT but will not be passed to the backend. See getPIEvents in
745  // Command.
746  if (EventToBuildDeps) {
747  // In the case where the last event was discarded and we are to run a
748  // host_task, we insert a barrier into the queue and use the resulting
749  // event as the dependency for the host_task.
750  // Note that host_task events can never be discarded, so this will not
751  // insert barriers between host_task enqueues.
752  if (EventToBuildDeps->isDiscarded() &&
753  getSyclObjImpl(Handler)->MCGType == CGType::CodeplayHostTask)
754  EventToBuildDeps = insertHelperBarrier(Handler);
755 
756  if (!EventToBuildDeps->isDiscarded())
757  Handler.depends_on(EventToBuildDeps);
758  }
759 
760  // If there is an external event set, add it as a dependency and clear it.
761  // We do not need to hold the lock as MLastEventMtx will ensure the last
762  // event reflects the corresponding external event dependence as well.
763  std::optional<event> ExternalEvent = popExternalEvent();
764  if (ExternalEvent)
765  Handler.depends_on(*ExternalEvent);
766 
767  EventRet = Handler.finalize();
768  EventToBuildDeps = getSyclObjImpl(EventRet);
769  } else {
770  const CGType Type = getSyclObjImpl(Handler)->MCGType;
771  std::lock_guard<std::mutex> Lock{MMutex};
772  // The following code supports barrier synchronization if host task is
773  // involved in the scenario. Native barriers cannot handle host task
774  // dependency so in the case where some commands were not enqueued
775  // (blocked), we track them to prevent barrier from being enqueued
776  // earlier.
777  {
778  std::lock_guard<std::mutex> RequestLock(MMissedCleanupRequestsMtx);
779  for (auto &UpdatedGraph : MMissedCleanupRequests)
780  doUnenqueuedCommandCleanup(UpdatedGraph);
781  MMissedCleanupRequests.clear();
782  }
783  auto &Deps = MGraph.expired() ? MDefaultGraphDeps : MExtGraphDeps;
784  if (Type == CGType::Barrier && !Deps.UnenqueuedCmdEvents.empty()) {
785  Handler.depends_on(Deps.UnenqueuedCmdEvents);
786  }
787  if (Deps.LastBarrier && (Type == CGType::CodeplayHostTask ||
788  (!Deps.LastBarrier->isEnqueued())))
789  Handler.depends_on(Deps.LastBarrier);
790 
791  EventRet = Handler.finalize();
792  EventImplPtr EventRetImpl = getSyclObjImpl(EventRet);
793  if (Type == CGType::CodeplayHostTask)
794  Deps.UnenqueuedCmdEvents.push_back(EventRetImpl);
795  else if (Type == CGType::Barrier || Type == CGType::BarrierWaitlist) {
796  Deps.LastBarrier = EventRetImpl;
797  Deps.UnenqueuedCmdEvents.clear();
798  } else if (!EventRetImpl->isEnqueued()) {
799  Deps.UnenqueuedCmdEvents.push_back(EventRetImpl);
800  }
801  }
802  }
803 
816  event submit_impl(const std::function<void(handler &)> &CGF,
817  const std::shared_ptr<queue_impl> &Self,
818  const std::shared_ptr<queue_impl> &PrimaryQueue,
819  const std::shared_ptr<queue_impl> &SecondaryQueue,
820  bool CallerNeedsEvent, const detail::code_location &Loc,
821  const SubmitPostProcessF *PostProcess);
822 
828  template <typename HandlerFuncT>
829  event submitWithHandler(const std::shared_ptr<queue_impl> &Self,
830  const std::vector<event> &DepEvents,
831  HandlerFuncT HandlerFunc);
832 
848  template <typename HandlerFuncT, typename MemMngrFuncT,
849  typename... MemMngrArgTs>
850  event submitMemOpHelper(const std::shared_ptr<queue_impl> &Self,
851  const std::vector<event> &DepEvents,
852  bool CallerNeedsEvent, HandlerFuncT HandlerFunc,
853  MemMngrFuncT MemMngrFunc, MemMngrArgTs... MemOpArgs);
854 
855  // When instrumentation is enabled emits trace event for wait begin and
856  // returns the telemetry event generated for the wait
857  void *instrumentationProlog(const detail::code_location &CodeLoc,
858  std::string &Name, int32_t StreamID,
859  uint64_t &iid);
860  // Uses events generated by the Prolog and emits wait done event
861  void instrumentationEpilog(void *TelementryEvent, std::string &Name,
862  int32_t StreamID, uint64_t IId);
863 
864  // We need to emit a queue_create notification when a queue object is created
866 
867  // We need to emit a queue_destroy notification when a queue object is
868  // destroyed
869  void destructorNotification();
870 
876  void addSharedEvent(const event &Event);
877 
881  void addEvent(const event &Event);
882 
884  mutable std::mutex MMutex;
885 
888 
890  std::vector<std::weak_ptr<event_impl>> MEventsWeak;
891 
895  std::vector<event> MEventsShared;
899 
901  std::vector<ur_queue_handle_t> MQueues;
903  size_t MNextQueueIdx = 0;
904 
907  bool MEmulateOOO = false;
908 
909  // Access should be guarded with MMutex
911  // This event is employed for enhanced dependency tracking with in-order
912  // queue
914  // The following two items are employed for proper out of order enqueue
915  // ordering
916  std::vector<EventImplPtr> UnenqueuedCmdEvents;
918 
919  void reset() {
920  LastEventPtr = nullptr;
921  UnenqueuedCmdEvents.clear();
922  LastBarrier = nullptr;
923  }
925 
926  const bool MIsInorder;
927 
928  std::vector<EventImplPtr> MStreamsServiceEvents;
930 
931  // All member variable defined here are needed for the SYCL instrumentation
932  // layer. Do not guard these variables below with XPTI_ENABLE_INSTRUMENTATION
933  // to ensure we have the same object layout when the macro in the library and
934  // SYCL app are not the same.
935  void *MTraceEvent = nullptr;
937  uint8_t MStreamID = 0;
939  uint64_t MInstanceID = 0;
940 
941  // the fallback implementation of profiling info
942  bool MFallbackProfiling = false;
943 
944  // This event can be optionally provided by users for in-order queues to add
945  // an additional dependency for the subsequent submission in to the queue.
946  // Access to the event should be guarded with MInOrderExternalEventMtx.
947  // NOTE: std::optional must not be exposed in the ABI.
948  std::optional<event> MInOrderExternalEvent;
949  mutable std::mutex MInOrderExternalEventMtx;
950 
951 public:
952  // Queue constructed with the discard_events property
953  const bool MDiscardEvents;
955 
956 protected:
957  // Command graph which is associated with this queue for the purposes of
958  // recording commands to it.
959  std::weak_ptr<ext::oneapi::experimental::detail::graph_impl> MGraph{};
960 
961  unsigned long long MQueueID;
962  static std::atomic<unsigned long long> MNextAvailableQueueID;
963 
964  std::deque<std::shared_ptr<ext::oneapi::experimental::detail::graph_impl>>
967 
969 };
970 
971 } // namespace detail
972 } // namespace _V1
973 } // namespace sycl
The context class represents a SYCL context on which kernel functions may be executed.
Definition: context.hpp:50
static GlobalHandler & instance()
event discard_or_return(const event &Event)
Definition: queue_impl.cpp:758
std::deque< std::shared_ptr< ext::oneapi::experimental::detail::graph_impl > > MMissedCleanupRequests
Definition: queue_impl.hpp:965
const property_list MPropList
Definition: queue_impl.hpp:898
bool hasDiscardEventsProperty() const
Definition: queue_impl.hpp:288
static ContextImplPtr getContext(const QueueImplPtr &Queue)
Definition: queue_impl.hpp:704
queue_impl(ur_queue_handle_t UrQueue, const ContextImplPtr &Context, const async_handler &AsyncHandler)
Constructs a SYCL queue from plugin interoperability handle.
Definition: queue_impl.hpp:217
queue_impl(ur_queue_handle_t UrQueue, const ContextImplPtr &Context, const async_handler &AsyncHandler, const property_list &PropList)
Constructs a SYCL queue from plugin interoperability handle.
Definition: queue_impl.hpp:236
event submit(const std::function< void(handler &)> &CGF, const std::shared_ptr< queue_impl > &Self, const std::shared_ptr< queue_impl > &SecondQueue, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess=nullptr)
Submits a command group function object to the queue, in order to be scheduled for execution on the d...
Definition: queue_impl.hpp:335
void revisitUnenqueuedCommandsState(const EventImplPtr &CompletedHostTask)
Definition: queue_impl.cpp:764
uint64_t MInstanceID
The instance ID of the trace event for queue object.
Definition: queue_impl.hpp:939
std::vector< EventImplPtr > MStreamsServiceEvents
Definition: queue_impl.hpp:928
const property_list & getPropList() const
Definition: queue_impl.hpp:713
void wait_and_throw(const detail::code_location &Loc={})
Definition: queue_impl.hpp:388
event submitMemOpHelper(const std::shared_ptr< queue_impl > &Self, const std::vector< event > &DepEvents, bool CallerNeedsEvent, HandlerFuncT HandlerFunc, MemMngrFuncT MemMngrFunc, MemMngrArgTs... MemOpArgs)
Performs submission of a memory operation directly if scheduler can be bypassed, or with a handler ot...
Param::return_type get_info() const
Queries SYCL queue for information.
std::optional< event > MInOrderExternalEvent
Definition: queue_impl.hpp:948
std::optional< event > popExternalEvent()
Definition: queue_impl.hpp:680
void registerStreamServiceEvent(const EventImplPtr &Event)
Definition: queue_impl.hpp:638
static std::atomic< unsigned long long > MNextAvailableQueueID
Definition: queue_impl.hpp:962
void addEvent(const event &Event)
Stores an event that should be associated with the queue.
Definition: queue_impl.cpp:299
struct sycl::_V1::detail::queue_impl::DependencyTrackingItems MExtGraphDeps
void tryToResetEnqueuedBarrierDep(const EventImplPtr &EnqueuedBarrierEvent)
unsigned long long MQueueID
Definition: queue_impl.hpp:961
std::vector< std::weak_ptr< event_impl > > MEventsWeak
These events are tracked, but not owned, by the queue.
Definition: queue_impl.hpp:890
bool has_property() const noexcept
Definition: queue_impl.hpp:567
event submit(const std::function< void(handler &)> &CGF, const std::shared_ptr< queue_impl > &Self, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess=nullptr)
Submits a command group function object to the queue, in order to be scheduled for execution on the d...
Definition: queue_impl.hpp:360
std::vector< event > MEventsShared
Events without data dependencies (such as USM) need an owner, additionally, USM operations are not ad...
Definition: queue_impl.hpp:895
ur_native_handle_t getNative(int32_t &NativeHandleDesc) const
Gets the native handle of the SYCL queue.
Definition: queue_impl.cpp:696
std::mutex MMutex
Protects all the fields that can be changed by class' methods.
Definition: queue_impl.hpp:884
void submit_without_event(const std::function< void(handler &)> &CGF, const std::shared_ptr< queue_impl > &Self, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess=nullptr)
Definition: queue_impl.hpp:369
void setCommandGraph(std::shared_ptr< ext::oneapi::experimental::detail::graph_impl > Graph)
Definition: queue_impl.hpp:659
event memcpyToDeviceGlobal(const std::shared_ptr< queue_impl > &Self, void *DeviceGlobalPtr, const void *Src, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, const std::vector< event > &DepEvents, bool CallerNeedsEvent)
Definition: queue_impl.cpp:248
bool supportsDiscardingPiEvents() const
Definition: queue_impl.hpp:291
static ThreadPool & getThreadPool()
Definition: queue_impl.hpp:629
event submitWithHandler(const std::shared_ptr< queue_impl > &Self, const std::vector< event > &DepEvents, HandlerFuncT HandlerFunc)
Helper function for submitting a memory operation with a handler.
Definition: queue_impl.cpp:409
void doUnenqueuedCommandCleanup(const std::shared_ptr< ext::oneapi::experimental::detail::graph_impl > &Graph)
Definition: queue_impl.cpp:777
event submit_impl(const std::function< void(handler &)> &CGF, const std::shared_ptr< queue_impl > &Self, const std::shared_ptr< queue_impl > &PrimaryQueue, const std::shared_ptr< queue_impl > &SecondaryQueue, bool CallerNeedsEvent, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess)
Performs command group submission to the queue.
Definition: queue_impl.cpp:351
std::function< void(bool, bool, event &)> SubmitPostProcessF
Definition: queue_impl.hpp:320
event mem_advise(const std::shared_ptr< queue_impl > &Self, const void *Ptr, size_t Length, ur_usm_advice_flags_t Advice, const std::vector< event > &DepEvents, bool CallerNeedsEvent)
Provides additional information to the underlying runtime about how different allocations are used.
Definition: queue_impl.cpp:236
event memcpyFromDeviceGlobal(const std::shared_ptr< queue_impl > &Self, void *Dest, const void *DeviceGlobalPtr, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, const std::vector< event > &DepEvents, bool CallerNeedsEvent)
Definition: queue_impl.cpp:264
void addSharedEvent(const event &Event)
queue_impl::addEvent tracks events with weak pointers, but some events have no other owners.
Definition: queue_impl.cpp:322
const ContextImplPtr MContext
Definition: queue_impl.hpp:887
std::vector< ur_queue_handle_t > MQueues
List of queues created for FPGA device from a single SYCL queue.
Definition: queue_impl.hpp:901
void finalizeHandler(HandlerType &Handler, event &EventRet)
Definition: queue_impl.hpp:730
void setExternalEvent(const event &Event)
Definition: queue_impl.hpp:675
void throw_asynchronous()
Performs a blocking wait for the completion of all enqueued tasks in the queue.
Definition: queue_impl.hpp:400
queue_impl(const DeviceImplPtr &Device, const ContextImplPtr &Context, const async_handler &AsyncHandler, const property_list &PropList)
Constructs a SYCL queue with an async_handler and property_list provided from a device and a context.
Definition: queue_impl.hpp:106
void flush()
Provides a hint to the backend to execute previously issued commands on this queue.
Definition: queue_impl.hpp:309
const PluginPtr & getPlugin() const
Definition: queue_impl.hpp:278
const DeviceImplPtr & getDeviceImplPtr() const
Definition: queue_impl.hpp:282
const async_handler MAsyncHandler
Definition: queue_impl.hpp:897
exception_list getExceptionList() const
Definition: queue_impl.hpp:385
ur_queue_handle_t createQueue(QueueOrder Order)
Creates UR queue.
Definition: queue_impl.hpp:488
uint8_t MStreamID
The stream under which the traces are emitted from the queue object.
Definition: queue_impl.hpp:937
ur_queue_handle_t & getExclusiveUrQueueHandleRef()
Definition: queue_impl.hpp:527
void reportAsyncException(const std::exception_ptr &ExceptionPtr)
Puts an exception into the list of asynchronous exceptions.
Definition: queue_impl.hpp:624
void wait(const detail::code_location &Loc={})
Performs a blocking wait for the completion of all enqueued tasks in the queue.
Definition: queue_impl.cpp:548
std::shared_ptr< ext::oneapi::experimental::detail::graph_impl > getCommandGraph() const
Definition: queue_impl.hpp:667
struct sycl::_V1::detail::queue_impl::DependencyTrackingItems MDefaultGraphDeps
EventImplPtr insertHelperBarrier(const HandlerType &Handler)
Definition: queue_impl.hpp:719
bool MEmulateOOO
Indicates that a native out-of-order queue could not be created and we need to emulate it with multip...
Definition: queue_impl.hpp:907
void instrumentationEpilog(void *TelementryEvent, std::string &Name, int32_t StreamID, uint64_t IId)
Definition: queue_impl.cpp:529
const ContextImplPtr & getContextImplPtr() const
Definition: queue_impl.hpp:280
void * instrumentationProlog(const detail::code_location &CodeLoc, std::string &Name, int32_t StreamID, uint64_t &iid)
Definition: queue_impl.cpp:472
ur_queue_handle_t & getHandleRef()
Definition: queue_impl.hpp:558
size_t MNextQueueIdx
Iterator through MQueues.
Definition: queue_impl.hpp:903
std::weak_ptr< ext::oneapi::experimental::detail::graph_impl > MGraph
Definition: queue_impl.hpp:959
event memset(const std::shared_ptr< queue_impl > &Self, void *Ptr, int Value, size_t Count, const std::vector< event > &DepEvents, bool CallerNeedsEvent)
Fills the memory pointed by a USM pointer with the value specified.
Definition: queue_impl.cpp:149
event memcpy(const std::shared_ptr< queue_impl > &Self, void *Dest, const void *Src, size_t Count, const std::vector< event > &DepEvents, bool CallerNeedsEvent, const code_location &CodeLoc)
Copies data from one memory region to another, both pointed by USM pointers.
Definition: queue_impl.cpp:197
Param::return_type get_backend_info() const
Queries SYCL queue for SYCL backend-specific information.
const std::vector< event > & getExtendDependencyList(const std::vector< event > &DepEvents, std::vector< event > &MutableVec, std::unique_lock< std::mutex > &QueueLock)
Definition: queue_impl.cpp:127
queue_impl(const DeviceImplPtr &Device, const async_handler &AsyncHandler, const property_list &PropList)
Constructs a SYCL queue from a device using an async_handler and property_list provided.
Definition: queue_impl.hpp:93
static ContextImplPtr getDefaultOrNew(const DeviceImplPtr &Device)
Definition: queue_impl.hpp:74
unsigned long long getQueueID()
Definition: queue_impl.hpp:671
static ur_queue_flags_t createUrQueueFlags(const property_list &PropList, QueueOrder Order)
Creates UR properties array.
Definition: queue_impl.hpp:421
propertyT get_property() const
Definition: queue_impl.hpp:574
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Definition: device.hpp:64
An event object can be used to synchronize memory transfers, enqueues of kernels and signaling barriers.
Definition: event.hpp:44
A list of asynchronous exceptions.
Implementation of node class from SYCL_EXT_ONEAPI_GRAPH.
Definition: graph_impl.hpp:81
Command group handler class.
Definition: handler.hpp:478
Objects of the property_list class are containers for the SYCL properties.
bool has_property() const noexcept
Encapsulates a single SYCL queue which schedules kernels on a SYCL device.
Definition: queue.hpp:110
#define __SYCL_REPORT_EXCEPTION_TO_STREAM(str, e)
Definition: common.hpp:364
CUDAContextT
Possible CUDA context types supported by UR CUDA backend TODO: Implement this as a property once ther...
Definition: queue_impl.hpp:63
decltype(Obj::impl) const & getSyclObjImpl(const Obj &SyclObject)
Definition: impl_utils.hpp:31
std::shared_ptr< sycl::detail::context_impl > ContextImplPtr
Definition: event_impl.hpp:32
std::shared_ptr< event_impl > EventImplPtr
Definition: handler.hpp:183
std::shared_ptr< plugin > PluginPtr
Definition: ur.hpp:107
std::shared_ptr< device_impl > DeviceImplPtr
static constexpr size_t MaxNumQueues
Sets max number of queues supported by FPGA RT.
Definition: queue_impl.hpp:59
CGType
Type of the command group.
Definition: cg_types.hpp:42
std::shared_ptr< sycl::detail::queue_impl > QueueImplPtr
Definition: helpers.hpp:45
constexpr CUDAContextT DefaultContextType
Default context type created for CUDA backend.
Definition: queue_impl.hpp:66
constexpr auto memory_order_relaxed
std::function< void(sycl::exception_list)> async_handler
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using E and sycl_category().
Definition: exception.cpp:65
Definition: access.hpp:18
_Abi const simd< _Tp, _Abi > & noexcept
Definition: simd.hpp:1324
C++ utilities for Unified Runtime integration.