DPC++ Runtime
Runtime libraries for oneAPI DPC++
queue_impl.hpp
Go to the documentation of this file.
1 //==------------------ queue_impl.hpp - SYCL queue -------------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #pragma once
10 
11 #include <detail/config.hpp>
12 #include <detail/context_impl.hpp>
13 #include <detail/device_impl.hpp>
14 #include <detail/device_info.hpp>
15 #include <detail/event_impl.hpp>
17 #include <detail/handler_impl.hpp>
18 #include <detail/kernel_impl.hpp>
19 #include <detail/plugin.hpp>
21 #include <detail/stream_impl.hpp>
22 #include <detail/thread_pool.hpp>
23 #include <sycl/context.hpp>
26 #include <sycl/device.hpp>
27 #include <sycl/event.hpp>
28 #include <sycl/exception.hpp>
29 #include <sycl/exception_list.hpp>
31 #include <sycl/handler.hpp>
33 #include <sycl/property_list.hpp>
34 #include <sycl/queue.hpp>
35 
36 #include "detail/graph_impl.hpp"
37 
38 #include <utility>
39 
40 #ifdef XPTI_ENABLE_INSTRUMENTATION
41 #include "xpti/xpti_trace_framework.hpp"
42 #include <detail/xpti_registry.hpp>
43 #endif
44 
45 namespace sycl {
46 inline namespace _V1 {
47 
48 // forward declaration
49 
50 namespace ext::oneapi::experimental::detail {
51 class graph_impl;
52 }
53 
54 namespace detail {
55 
// Shared-pointer aliases for the context_impl and device_impl implementation
// classes; queue_impl stores its context and device through these types.
56 using ContextImplPtr = std::shared_ptr<detail::context_impl>;
57 using DeviceImplPtr = std::shared_ptr<detail::device_impl>;
58 
// Upper bound on the number of entries kept in queue_impl::MQueues. Once the
// vector has reached this size, an existing PI queue is reused instead of a new
// entry being appended.
60 static constexpr size_t MaxNumQueues = 256;
61 
// Two-valued tag stored in a char.
// NOTE(review): presumably distinguishes the CUDA primary context from a
// custom (user-created) one; it is not referenced anywhere in this listing, so
// confirm against its users before relying on that reading.
64 enum class CUDAContextT : char { primary, custom };
65 
68 
70 
71 class queue_impl {
72 public:
73  // \return a default context for the platform if it includes the device
74  // passed and default contexts are enabled, a new context otherwise.
78  context{createSyclObjFromImpl<device>(Device), {}, {}});
79 
80  ContextImplPtr DefaultContext = detail::getSyclObjImpl(
81  Device->get_platform().ext_oneapi_get_default_context());
82  if (DefaultContext->isDeviceValid(Device))
83  return DefaultContext;
85  context{createSyclObjFromImpl<device>(Device), {}, {}});
86  }
// Constructs a queue for \p Device without an explicitly supplied context.
// Delegates to the (Device, Context, AsyncHandler, PropList) constructor,
// passing the result of getDefaultOrNew(Device) as the context.
//
// \param Device is the device implementation the queue is created for.
// \param AsyncHandler is forwarded to the delegated constructor, which stores
// it as the queue's asynchronous exception handler.
// \param PropList is the property list the queue is created with.
94  queue_impl(const DeviceImplPtr &Device, const async_handler &AsyncHandler,
95  const property_list &PropList)
96  : queue_impl(Device, getDefaultOrNew(Device), AsyncHandler, PropList) {}
97 
107  queue_impl(const DeviceImplPtr &Device, const ContextImplPtr &Context,
108  const async_handler &AsyncHandler, const property_list &PropList)
109  : MDevice(Device), MContext(Context), MAsyncHandler(AsyncHandler),
110  MPropList(PropList),
111  MIsInorder(has_property<property::queue::in_order>()),
113  has_property<ext::oneapi::property::queue::discard_events>()),
114  MIsProfilingEnabled(has_property<property::queue::enable_profiling>()),
115  MQueueID{
117  if (has_property<property::queue::enable_profiling>()) {
118  if (has_property<ext::oneapi::property::queue::discard_events>())
120  "Queue cannot be constructed with both of "
121  "discard_events and enable_profiling.");
122  // fallback profiling support. See MFallbackProfiling
123  if (MDevice->has(aspect::queue_profiling)) {
124  // When piGetDeviceAndHostTimer is not supported (the OpenCL
125  // version < 2.1 case), use the fallback profiling implementation.
126  if (!getDeviceImplPtr()->isGetDeviceAndHostTimerSupported())
127  MFallbackProfiling = true;
128  } else {
130  "Cannot enable profiling, the associated device "
131  "does not have the queue_profiling aspect");
132  }
133  }
134  if (has_property<ext::intel::property::queue::compute_index>()) {
135  int Idx = get_property<ext::intel::property::queue::compute_index>()
136  .get_index();
137  int NumIndices =
138  createSyclObjFromImpl<device>(Device)
139  .get_info<ext::intel::info::device::max_compute_queue_indices>();
140  if (Idx < 0 || Idx >= NumIndices)
141  throw sycl::exception(
143  "Queue compute index must be a non-negative number less than "
144  "device's number of available compute queue indices.");
145  }
146  if (has_property<
148  !MDevice->get_info<
149  ext::codeplay::experimental::info::device::supports_fusion>()) {
150  throw sycl::exception(
152  "Cannot enable fusion if device does not support fusion");
153  }
154  if (!Context->isDeviceValid(Device)) {
155  if (Context->getBackend() == backend::opencl)
156  throw sycl::exception(
158  "Queue cannot be constructed with the given context and device "
159  "since the device is not a member of the context (descendants of "
160  "devices from the context are not supported on OpenCL yet).");
161  throw sycl::exception(
163  "Queue cannot be constructed with the given context and device "
164  "since the device is neither a member of the context nor a "
165  "descendant of its member.");
166  }
167 
168  const QueueOrder QOrder =
170  MQueues.push_back(createQueue(QOrder));
171  // This section is the second part of the instrumentation that uses the
172  // tracepoint information and notifies the XPTI subscribers.
173 
174  // We enable XPTI tracing events using the TLS mechanism; if the code
175  // location data is available, then the tracing data will be rich.
176 #if XPTI_ENABLE_INSTRUMENTATION
177  constexpr uint16_t NotificationTraceType =
178  static_cast<uint16_t>(xpti::trace_point_type_t::queue_create);
179  // Using the instance override constructor for use with queues as queues
180  // maintain instance IDs in the object
181  XPTIScope PrepareNotify((void *)this, NotificationTraceType,
182  SYCL_STREAM_NAME, MQueueID, "queue_create");
183  // Cache the trace event, stream id and instance IDs for the destructor
184  if (xptiCheckTraceEnabled(PrepareNotify.streamID(),
185  NotificationTraceType)) {
186  MTraceEvent = (void *)PrepareNotify.traceEvent();
187  MStreamID = PrepareNotify.streamID();
188  MInstanceID = PrepareNotify.instanceID();
189  // Add the function to capture meta data for the XPTI trace event
190  PrepareNotify.addMetadata([&](auto TEvent) {
191  xpti::addMetadata(TEvent, "sycl_context",
192  reinterpret_cast<size_t>(MContext->getHandleRef()));
193  if (MDevice) {
194  xpti::addMetadata(TEvent, "sycl_device_name",
195  MDevice->getDeviceName());
196  xpti::addMetadata(TEvent, "sycl_device",
197  reinterpret_cast<size_t>(MDevice->getHandleRef()));
198  }
199  xpti::addMetadata(TEvent, "is_inorder", MIsInorder);
200  xpti::addMetadata(TEvent, "queue_id", MQueueID);
201  xpti::addMetadata(TEvent, "queue_handle",
202  reinterpret_cast<size_t>(getHandleRef()));
203  });
204  // Also publish to TLS
205  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, MQueueID);
206  PrepareNotify.notify();
207  }
208 #endif
209  }
210 
211  event getLastEvent();
212 
213 private:
214  void queue_impl_interop(sycl::detail::pi::PiQueue PiQueue) {
215  if (has_property<ext::oneapi::property::queue::discard_events>() &&
216  has_property<property::queue::enable_profiling>()) {
218  "Queue cannot be constructed with both of "
219  "discard_events and enable_profiling.");
220  }
221 
222  MQueues.push_back(pi::cast<sycl::detail::pi::PiQueue>(PiQueue));
223 
224  sycl::detail::pi::PiDevice DevicePI{};
225  const PluginPtr &Plugin = getPlugin();
226  // TODO catch an exception and put it to list of asynchronous exceptions
227  Plugin->call<PiApiKind::piQueueGetInfo>(
228  MQueues[0], PI_QUEUE_INFO_DEVICE, sizeof(DevicePI), &DevicePI, nullptr);
229  MDevice = MContext->findMatchingDeviceImpl(DevicePI);
230  if (MDevice == nullptr) {
231  throw sycl::exception(
233  "Device provided by native Queue not found in Context.");
234  }
235  // The following commented section provides a guideline on how to use the
236  // TLS enabled mechanism to create a tracepoint and notify using XPTI. This
237  // is the prolog section and the epilog section will initiate the
238  // notification.
239 #if XPTI_ENABLE_INSTRUMENTATION
240  constexpr uint16_t NotificationTraceType =
241  static_cast<uint16_t>(xpti::trace_point_type_t::queue_create);
242  XPTIScope PrepareNotify((void *)this, NotificationTraceType,
243  SYCL_STREAM_NAME, MQueueID, "queue_create");
244  if (xptiCheckTraceEnabled(PrepareNotify.streamID(),
245  NotificationTraceType)) {
246  // Cache the trace event, stream id and instance IDs for the destructor
247  MTraceEvent = (void *)PrepareNotify.traceEvent();
248  MStreamID = PrepareNotify.streamID();
249  MInstanceID = PrepareNotify.instanceID();
250 
251  // Add the function to capture meta data for the XPTI trace event
252  PrepareNotify.addMetadata([&](auto TEvent) {
253  xpti::addMetadata(TEvent, "sycl_context",
254  reinterpret_cast<size_t>(MContext->getHandleRef()));
255  if (MDevice) {
256  xpti::addMetadata(TEvent, "sycl_device_name",
257  MDevice->getDeviceName());
258  xpti::addMetadata(TEvent, "sycl_device",
259  reinterpret_cast<size_t>(MDevice->getHandleRef()));
260  }
261  xpti::addMetadata(TEvent, "is_inorder", MIsInorder);
262  xpti::addMetadata(TEvent, "queue_id", MQueueID);
263  xpti::addMetadata(TEvent, "queue_handle", getHandleRef());
264  });
265  // Also publish to TLS before notification
266  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, MQueueID);
267  PrepareNotify.notify();
268  }
269 #endif
270  }
271 
272 public:
280  const async_handler &AsyncHandler)
281  : MContext(Context), MAsyncHandler(AsyncHandler),
282  MIsInorder(has_property<property::queue::in_order>()),
284  has_property<ext::oneapi::property::queue::discard_events>()),
285  MIsProfilingEnabled(has_property<property::queue::enable_profiling>()),
286  MQueueID{
288  queue_impl_interop(PiQueue);
289  }
290 
299  const async_handler &AsyncHandler, const property_list &PropList)
300  : MContext(Context), MAsyncHandler(AsyncHandler), MPropList(PropList),
301  MIsInorder(has_property<property::queue::in_order>()),
303  has_property<ext::oneapi::property::queue::discard_events>()),
304  MIsProfilingEnabled(has_property<property::queue::enable_profiling>()) {
305  queue_impl_interop(PiQueue);
306  }
307 
309  try {
310  // The trace event created in the constructor should be active through the
311  // lifetime of the queue object as member variables when ABI breakage is
312  // allowed. This example shows MTraceEvent as a member variable.
313 #if XPTI_ENABLE_INSTRUMENTATION
314  constexpr uint16_t NotificationTraceType =
315  static_cast<uint16_t>(xpti::trace_point_type_t::queue_destroy);
316  if (xptiCheckTraceEnabled(MStreamID, NotificationTraceType)) {
317  // Use the information cached in member variables
318  xptiNotifySubscribers(MStreamID, NotificationTraceType, nullptr,
319  (xpti::trace_event_data_t *)MTraceEvent,
320  MInstanceID,
321  static_cast<const void *>("queue_destroy"));
322  xptiReleaseEvent((xpti::trace_event_data_t *)MTraceEvent);
323  }
324 #endif
328  } catch (std::exception &e) {
329  __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~queue_impl", e);
330  }
331  }
332 
334  cl_command_queue get() {
336  return pi::cast<cl_command_queue>(MQueues[0]);
337  }
338 
341  return createSyclObjFromImpl<context>(MContext);
342  }
343 
// \return the plugin of the context this queue is bound to (forwards to
// MContext->getPlugin()).
344  const PluginPtr &getPlugin() const { return MContext->getPlugin(); }
345 
// \return a reference to the stored context implementation pointer (MContext).
346  const ContextImplPtr &getContextImplPtr() const { return MContext; }
347 
// \return a reference to the stored device implementation pointer (MDevice).
348  const DeviceImplPtr &getDeviceImplPtr() const { return MDevice; }
349 
// \return a device object created from the stored device implementation
// (MDevice) via createSyclObjFromImpl.
351  device get_device() const { return createSyclObjFromImpl<device>(MDevice); }
352 
// \return the value of MDiscardEvents, the flag recording that the queue was
// constructed with the discard_events property.
354  bool hasDiscardEventsProperty() const { return MDiscardEvents; }
355 
// \return MIsInorder, i.e. the same value as isInOrder(): as written here,
// discarding PI events is reported as supported exactly for in-order queues.
357  bool supportsDiscardingPiEvents() const { return MIsInorder; }
358 
// \return MIsInorder, which the constructors initialize from the presence of
// property::queue::in_order in the queue's property list.
359  bool isInOrder() const { return MIsInorder; }
360 
364  template <typename Param> typename Param::return_type get_info() const;
365 
369  template <typename Param>
370  typename Param::return_type get_backend_info() const;
371 
375  void flush() {
376  if (MGraph.lock()) {
378  "flush cannot be called for a queue which is "
379  "recording to a command graph.");
380  }
381  for (const auto &queue : MQueues) {
383  }
384  }
385 
386  using SubmitPostProcessF = std::function<void(bool, bool, event &)>;
387 
// Submits a command group function object to the queue, falling back to a
// secondary queue if the submission to this queue throws.
//
// \param CGF is the command group function object to submit.
// \param Self is a shared pointer to this queue; it is passed to submit_impl
// both as the submitting queue and as the primary queue.
// \param SecondQueue is the queue used for the retry; it is also passed to
// submit_impl as the secondary queue.
// \param Loc is the code location forwarded to submit_impl.
// \param PostProcess is an optional post-processing callback forwarded to
// submit_impl.
// \return the result of passing the submission event through
// discard_or_return().
401  event submit(const std::function<void(handler &)> &CGF,
402  const std::shared_ptr<queue_impl> &Self,
403  const std::shared_ptr<queue_impl> &SecondQueue,
404  const detail::code_location &Loc,
405  const SubmitPostProcessF *PostProcess = nullptr) {
406  event ResEvent;
407  try {
408  ResEvent = submit_impl(CGF, Self, Self, SecondQueue,
409  /*CallerNeedsEvent=*/true, Loc, PostProcess);
410  } catch (...) {
// Any exception from the first attempt is dropped here and the command
// group is resubmitted on SecondQueue; an exception thrown by this second
// attempt propagates to the caller.
// NOTE(review): SecondQueue is dereferenced without a null check, so this
// overload appears to expect a non-null secondary queue; confirm at callers.
411  ResEvent =
412  SecondQueue->submit_impl(CGF, SecondQueue, Self, SecondQueue,
413  /*CallerNeedsEvent=*/true, Loc, PostProcess);
414  }
415  return discard_or_return(ResEvent);
416  }
417 
// Submits a command group function object to the queue; no secondary queue is
// involved (nullptr is passed to submit_impl in its place).
//
// \param CGF is the command group function object to submit.
// \param Self is a shared pointer to this queue; it is passed to submit_impl
// both as the submitting queue and as the primary queue.
// \param Loc is the code location forwarded to submit_impl.
// \param PostProcess is an optional post-processing callback forwarded to
// submit_impl.
// \return the result of passing the submission event through
// discard_or_return().
426  event submit(const std::function<void(handler &)> &CGF,
427  const std::shared_ptr<queue_impl> &Self,
428  const detail::code_location &Loc,
429  const SubmitPostProcessF *PostProcess = nullptr) {
430  auto ResEvent = submit_impl(CGF, Self, Self, nullptr,
431  /*CallerNeedsEvent=*/true, Loc, PostProcess);
432  return discard_or_return(ResEvent);
433  }
434 
// Same submission path as the single-queue submit(), but tells submit_impl
// that the caller does not need the resulting event (CallerNeedsEvent is
// false) and ignores whatever submit_impl returns.
//
// \param CGF is the command group function object to submit.
// \param Self is a shared pointer to this queue; it is passed to submit_impl
// both as the submitting queue and as the primary queue.
// \param Loc is the code location forwarded to submit_impl.
// \param PostProcess is an optional post-processing callback forwarded to
// submit_impl.
435  void submit_without_event(const std::function<void(handler &)> &CGF,
436  const std::shared_ptr<queue_impl> &Self,
437  const detail::code_location &Loc,
438  const SubmitPostProcessF *PostProcess = nullptr) {
439  submit_impl(CGF, Self, Self, nullptr, /*CallerNeedsEvent=*/false, Loc,
440  PostProcess);
441  }
442 
448  void wait(const detail::code_location &Loc = {});
449 
452 
454  void wait_and_throw(const detail::code_location &Loc = {}) {
455  wait(Loc);
457  }
458 
467  if (!MAsyncHandler)
468  return;
469 
470  exception_list Exceptions;
471  {
472  std::lock_guard<std::mutex> Lock(MMutex);
473  std::swap(Exceptions, MExceptions);
474  }
475  // Unlock the mutex before calling user-provided handler to avoid
476  // potential deadlock if the same queue is somehow referenced in the
477  // handler.
478  if (Exceptions.size())
479  MAsyncHandler(std::move(Exceptions));
480  }
481 
489  sycl::detail::pi::PiQueueProperties CreationFlags = 0;
490 
491  if (Order == QueueOrder::OOO) {
493  }
494  if (PropList.has_property<property::queue::enable_profiling>()) {
495  CreationFlags |= PI_QUEUE_FLAG_PROFILING_ENABLE;
496  }
497  if (PropList.has_property<
498  ext::oneapi::cuda::property::queue::use_default_stream>()) {
499  CreationFlags |= __SYCL_PI_CUDA_USE_DEFAULT_STREAM;
500  }
501  if (PropList.has_property<ext::oneapi::property::queue::discard_events>()) {
502  // Pass this flag to the Level Zero plugin to be able to check it from
503  // queue property.
505  }
506  // Track that priority settings are not ambiguous.
507  bool PrioritySeen = false;
508  if (PropList
509  .has_property<ext::oneapi::property::queue::priority_normal>()) {
510  // Normal is the default priority, don't pass anything.
511  PrioritySeen = true;
512  }
513  if (PropList.has_property<ext::oneapi::property::queue::priority_low>()) {
514  if (PrioritySeen) {
515  throw sycl::exception(
517  "Queue cannot be constructed with different priorities.");
518  }
519  CreationFlags |= PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW;
520  PrioritySeen = true;
521  }
522  if (PropList.has_property<ext::oneapi::property::queue::priority_high>()) {
523  if (PrioritySeen) {
524  throw sycl::exception(
526  "Queue cannot be constructed with different priorities.");
527  }
529  }
530  // Track that submission modes do not conflict.
531  bool SubmissionSeen = false;
532  if (PropList.has_property<
533  ext::intel::property::queue::no_immediate_command_list>()) {
534  SubmissionSeen = true;
536  }
537  if (PropList.has_property<
538  ext::intel::property::queue::immediate_command_list>()) {
539  if (SubmissionSeen) {
540  throw sycl::exception(
542  "Queue cannot be constructed with different submission modes.");
543  }
544  SubmissionSeen = true;
546  }
547  return CreationFlags;
548  }
549 
556  sycl::detail::pi::PiContext Context = MContext->getHandleRef();
557  sycl::detail::pi::PiDevice Device = MDevice->getHandleRef();
558  const PluginPtr &Plugin = getPlugin();
559 
560  sycl::detail::pi::PiQueueProperties Properties[] = {
562  if (has_property<ext::intel::property::queue::compute_index>()) {
563  int Idx = get_property<ext::intel::property::queue::compute_index>()
564  .get_index();
565  Properties[2] = PI_QUEUE_COMPUTE_INDEX;
566  Properties[3] = static_cast<sycl::detail::pi::PiQueueProperties>(Idx);
567  }
569  Plugin->call_nocheck<PiApiKind::piextQueueCreate>(Context, Device,
570  Properties, &Queue);
571 
572  // If creating out-of-order queue failed and this property is not
573  // supported (for example, on FPGA), it will return
574  // PI_ERROR_INVALID_QUEUE_PROPERTIES and will try to create in-order queue.
575  if (!MEmulateOOO && Error == PI_ERROR_INVALID_QUEUE_PROPERTIES) {
576  MEmulateOOO = true;
578  } else {
579  Plugin->checkPiResult(Error);
580  }
581 
582  return Queue;
583  }
584 
588  sycl::detail::pi::PiQueue *PIQ = nullptr;
589  bool ReuseQueue = false;
590  {
591  std::lock_guard<std::mutex> Lock(MMutex);
592 
593  // To achieve parallelism for FPGA with in order execution model with
594  // possibility of two kernels to share data with each other we shall
595  // create a queue for every kernel enqueued.
596  if (MQueues.size() < MaxNumQueues) {
597  MQueues.push_back({});
598  PIQ = &MQueues.back();
599  } else {
600  // If the limit of OpenCL queues is going to be exceeded - take the
601  // earliest used queue, wait until it finished and then reuse it.
602  PIQ = &MQueues[MNextQueueIdx];
604  ReuseQueue = true;
605  }
606  }
607 
608  if (!ReuseQueue)
610  else
611  getPlugin()->call<PiApiKind::piQueueFinish>(*PIQ);
612 
613  return *PIQ;
614  }
615 
619  if (!MEmulateOOO)
620  return MQueues[0];
621 
623  }
624 
// \return true if the queue's property list (MPropList) contains a property
// of type propertyT; forwards to property_list::has_property.
627  template <typename propertyT> bool has_property() const noexcept {
628  return MPropList.has_property<propertyT>();
629  }
630 
// \return a copy of the property of type propertyT taken from the queue's
// property list (MPropList); forwards to property_list::get_property.
// NOTE(review): behavior when the property is absent is determined by
// property_list::get_property, which is not shown here.
634  template <typename propertyT> propertyT get_property() const {
635  return MPropList.get_property<propertyT>();
636  }
637 
648  event memset(const std::shared_ptr<queue_impl> &Self, void *Ptr, int Value,
649  size_t Count, const std::vector<event> &DepEvents,
650  bool CallerNeedsEvent);
662  event memcpy(const std::shared_ptr<queue_impl> &Self, void *Dest,
663  const void *Src, size_t Count,
664  const std::vector<event> &DepEvents, bool CallerNeedsEvent,
665  const code_location &CodeLoc);
677  event mem_advise(const std::shared_ptr<queue_impl> &Self, const void *Ptr,
678  size_t Length, pi_mem_advice Advice,
679  const std::vector<event> &DepEvents, bool CallerNeedsEvent);
680 
// Appends an exception to the queue's list of pending asynchronous
// exceptions (MExceptions).
//
// \param ExceptionPtr is the exception to record.
684  void reportAsyncException(const std::exception_ptr &ExceptionPtr) {
// MExceptions is also swapped out under MMutex elsewhere in this class, so
// the append takes the same lock.
685  std::lock_guard<std::mutex> Lock(MMutex);
686  MExceptions.PushBack(ExceptionPtr);
687  }
688 
691  }
692 
696  pi_native_handle getNative(int32_t &NativeHandleDesc) const;
697 
699  std::lock_guard<std::mutex> Lock(MStreamsServiceEventsMutex);
700  MStreamsServiceEvents.push_back(Event);
701  }
702 
703  bool ext_oneapi_empty() const;
704 
710  std::hash<typename std::shared_ptr<queue_impl>::element_type *>()(
711  this));
712  }
713 
714  event memcpyToDeviceGlobal(const std::shared_ptr<queue_impl> &Self,
715  void *DeviceGlobalPtr, const void *Src,
716  bool IsDeviceImageScope, size_t NumBytes,
717  size_t Offset, const std::vector<event> &DepEvents,
718  bool CallerNeedsEvent);
719  event memcpyFromDeviceGlobal(const std::shared_ptr<queue_impl> &Self,
720  void *Dest, const void *DeviceGlobalPtr,
721  bool IsDeviceImageScope, size_t NumBytes,
722  size_t Offset,
723  const std::vector<event> &DepEvents,
724  bool CallerNeedsEvent);
725 
727 
729  std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> Graph) {
730  std::lock_guard<std::mutex> Lock(MMutex);
731  MGraph = Graph;
733  }
734 
// \return a shared pointer to the command graph associated with this queue,
// obtained by locking the weak pointer MGraph; the result is empty when no
// graph is set or the graph no longer exists.
735  std::shared_ptr<ext::oneapi::experimental::detail::graph_impl>
736  getCommandGraph() const {
737  return MGraph.lock();
738  }
739 
// \return the numeric ID stored in MQueueID for this queue. Const-qualified
// like the other read-only accessors of this class, so it can also be called
// through a const queue_impl.
740  unsigned long long getQueueID() const { return MQueueID; }
741 
// Stores \p Event in MInOrderExternalEvent, replacing any previously stored
// value. The store happens under MInOrderExternalEventMtx.
//
// \param Event is the external event to remember.
742  void setExternalEvent(const event &Event) {
743  std::lock_guard<std::mutex> Lock(MInOrderExternalEventMtx);
744  MInOrderExternalEvent = Event;
745  }
746 
// Takes the stored external event out of the queue, under
// MInOrderExternalEventMtx.
//
// \return the previous content of MInOrderExternalEvent (std::nullopt if none
// was set); afterwards MInOrderExternalEvent holds std::nullopt.
747  std::optional<event> popExternalEvent() {
748  std::lock_guard<std::mutex> Lock(MInOrderExternalEventMtx);
749  std::optional<event> Result = std::nullopt;
// Swapping with an empty optional both fetches the stored event and clears
// the member in one step.
750  std::swap(Result, MInOrderExternalEvent);
751  return Result;
752  }
753 
754  const std::vector<event> &
755  getExtendDependencyList(const std::vector<event> &DepEvents,
756  std::vector<event> &MutableVec,
757  std::unique_lock<std::mutex> &QueueLock);
758 
759  // Helps to manage host tasks presence in scenario with barrier usage.
760  // Approach that tracks almost all tasks to provide barrier sync for both pi
761  // tasks and host tasks is applicable for out of order queues only. No-op
762  // for in order ones.
763  void tryToResetEnqueuedBarrierDep(const EventImplPtr &EnqueuedBarrierEvent);
764 
765  // Called on host task completion that could block some kernels from enqueue.
766  // Approach that tracks almost all tasks to provide barrier sync for both pi
767  // tasks and host tasks is applicable for out of order queues only. Not
768  // needed for in order ones.
769  void revisitUnenqueuedCommandsState(const EventImplPtr &CompletedHostTask);
770 
// Null-safe helper for fetching the context of a queue.
//
// \param Queue is the queue to query; may be null.
// \return Queue->getContextImplPtr() if \p Queue is non-null, nullptr
// otherwise.
771  static ContextImplPtr getContext(const QueueImplPtr &Queue) {
772  return Queue ? Queue->getContextImplPtr() : nullptr;
773  }
774 
775  // Must be called under MMutex protection
777  const std::shared_ptr<ext::oneapi::experimental::detail::graph_impl>
778  &Graph);
779 
// \return a reference to the property list stored in the queue (MPropList).
780  const property_list &getPropList() const { return MPropList; }
781 
782 protected:
783  event discard_or_return(const event &Event);
784  // Hook to the scheduler to clean up any fusion command held on destruction.
785  void cleanup_fusion_cmd();
786 
787  template <typename HandlerType = handler>
788  EventImplPtr insertHelperBarrier(const HandlerType &Handler) {
789  auto ResEvent = std::make_shared<detail::event_impl>(Handler.MQueue);
791  Handler.MQueue->getHandleRef(), 0, nullptr, &ResEvent->getHandleRef());
792  return ResEvent;
793  }
794 
795  // template is needed for proper unit testing
796  template <typename HandlerType = handler>
797  void finalizeHandler(HandlerType &Handler, event &EventRet) {
798  if (MIsInorder) {
799  // Accessing and changing of an event isn't atomic operation.
800  // Hence, here is the lock for thread-safety.
801  std::lock_guard<std::mutex> Lock{MMutex};
802 
803  auto &EventToBuildDeps = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr
805 
806  // This dependency is needed for the following purposes:
807  // - host tasks are handled by the runtime and cannot be implicitly
808  // synchronized by the backend.
809  // - to prevent the 2nd kernel enqueue when the 1st kernel is blocked
810  // by a host task. This dependency allows to build the enqueue order in
811  // the RT but will not be passed to the backend. See getPIEvents in
812  // Command.
813  if (EventToBuildDeps) {
814  // In the case where the last event was discarded and we are to run a
815  // host_task, we insert a barrier into the queue and use the resulting
816  // event as the dependency for the host_task.
817  // Note that host_task events can never be discarded, so this will not
818  // insert barriers between host_task enqueues.
819  if (EventToBuildDeps->isDiscarded() &&
820  getSyclObjImpl(Handler)->MCGType == CGType::CodeplayHostTask)
821  EventToBuildDeps = insertHelperBarrier(Handler);
822 
823  if (!EventToBuildDeps->isDiscarded())
824  Handler.depends_on(EventToBuildDeps);
825  }
826 
827  // If there is an external event set, add it as a dependency and clear it.
828  // We do not need to hold the lock as MLastEventMtx will ensure the last
829  // event reflects the corresponding external event dependence as well.
830  std::optional<event> ExternalEvent = popExternalEvent();
831  if (ExternalEvent)
832  Handler.depends_on(*ExternalEvent);
833 
834  EventRet = Handler.finalize();
835  EventToBuildDeps = getSyclObjImpl(EventRet);
836  } else {
837  const CGType Type = getSyclObjImpl(Handler)->MCGType;
838  std::lock_guard<std::mutex> Lock{MMutex};
839  // The following code supports barrier synchronization if host task is
840  // involved in the scenario. Native barriers cannot handle host task
841  // dependency so in the case where some commands were not enqueued
842  // (blocked), we track them to prevent barrier from being enqueued
843  // earlier.
844  {
845  std::lock_guard<std::mutex> RequestLock(MMissedCleanupRequestsMtx);
846  for (auto &UpdatedGraph : MMissedCleanupRequests)
847  doUnenqueuedCommandCleanup(UpdatedGraph);
848  MMissedCleanupRequests.clear();
849  }
850  auto &Deps = MGraph.expired() ? MDefaultGraphDeps : MExtGraphDeps;
851  if (Type == CGType::Barrier && !Deps.UnenqueuedCmdEvents.empty()) {
852  Handler.depends_on(Deps.UnenqueuedCmdEvents);
853  }
854  if (Deps.LastBarrier)
855  Handler.depends_on(Deps.LastBarrier);
856  EventRet = Handler.finalize();
857  EventImplPtr EventRetImpl = getSyclObjImpl(EventRet);
858  if (Type == CGType::CodeplayHostTask)
859  Deps.UnenqueuedCmdEvents.push_back(EventRetImpl);
860  else if (!EventRetImpl->isEnqueued()) {
861  if (Type == CGType::Barrier || Type == CGType::BarrierWaitlist) {
862  Deps.LastBarrier = EventRetImpl;
863  Deps.UnenqueuedCmdEvents.clear();
864  } else
865  Deps.UnenqueuedCmdEvents.push_back(EventRetImpl);
866  }
867  }
868  }
869 
882  event submit_impl(const std::function<void(handler &)> &CGF,
883  const std::shared_ptr<queue_impl> &Self,
884  const std::shared_ptr<queue_impl> &PrimaryQueue,
885  const std::shared_ptr<queue_impl> &SecondaryQueue,
886  bool CallerNeedsEvent, const detail::code_location &Loc,
887  const SubmitPostProcessF *PostProcess);
888 
894  template <typename HandlerFuncT>
895  event submitWithHandler(const std::shared_ptr<queue_impl> &Self,
896  const std::vector<event> &DepEvents,
897  HandlerFuncT HandlerFunc);
898 
914  template <typename HandlerFuncT, typename MemMngrFuncT,
915  typename... MemMngrArgTs>
916  event submitMemOpHelper(const std::shared_ptr<queue_impl> &Self,
917  const std::vector<event> &DepEvents,
918  bool CallerNeedsEvent, HandlerFuncT HandlerFunc,
919  MemMngrFuncT MemMngrFunc, MemMngrArgTs... MemOpArgs);
920 
921  // When instrumentation is enabled emits trace event for wait begin and
922  // returns the telemetry event generated for the wait
923  void *instrumentationProlog(const detail::code_location &CodeLoc,
924  std::string &Name, int32_t StreamID,
925  uint64_t &iid);
926  // Uses events generated by the Prolog and emits wait done event
927  void instrumentationEpilog(void *TelementryEvent, std::string &Name,
928  int32_t StreamID, uint64_t IId);
929 
935  void addSharedEvent(const event &Event);
936 
940  void addEvent(const event &Event);
941 
943  mutable std::mutex MMutex;
944 
947 
949  std::vector<std::weak_ptr<event_impl>> MEventsWeak;
950 
954  std::vector<event> MEventsShared;
958 
960  std::vector<sycl::detail::pi::PiQueue> MQueues;
962  size_t MNextQueueIdx = 0;
963 
966  bool MEmulateOOO = false;
967 
968  // Access should be guarded with MMutex
970  // This event is employed for enhanced dependency tracking with in-order
971  // queue
973  // The following two items are employed for proper out of order enqueue
974  // ordering
975  std::vector<EventImplPtr> UnenqueuedCmdEvents;
977 
978  void reset() {
979  LastEventPtr = nullptr;
980  UnenqueuedCmdEvents.clear();
981  LastBarrier = nullptr;
982  }
984 
985  const bool MIsInorder;
986 
987  std::vector<EventImplPtr> MStreamsServiceEvents;
989 
990  // All member variables defined here are needed for the SYCL instrumentation
991  // layer. Do not guard the variables below with XPTI_ENABLE_INSTRUMENTATION,
992  // so that the object layout stays the same even when the macro is defined
993  // differently in the library and in the SYCL app.
994  void *MTraceEvent = nullptr;
996  uint8_t MStreamID = 0;
998  uint64_t MInstanceID = 0;
999 
1000  // the fallback implementation of profiling info
1001  bool MFallbackProfiling = false;
1002 
1003  // This event can be optionally provided by users for in-order queues to add
1004  // an additional dependency for the subsequent submission into the queue.
1005  // Access to the event should be guarded with MInOrderExternalEventMtx.
1006  // NOTE: std::optional must not be exposed in the ABI.
1007  std::optional<event> MInOrderExternalEvent;
1008  mutable std::mutex MInOrderExternalEventMtx;
1009 
1010 public:
1011  // Queue constructed with the discard_events property
1012  const bool MDiscardEvents;
1014 
1015 protected:
1016  // Command graph which is associated with this queue for the purposes of
1017  // recording commands to it.
1018  std::weak_ptr<ext::oneapi::experimental::detail::graph_impl> MGraph{};
1019 
1020  unsigned long long MQueueID;
1021  static std::atomic<unsigned long long> MNextAvailableQueueID;
1022 
1023  std::deque<std::shared_ptr<ext::oneapi::experimental::detail::graph_impl>>
1026 
1028 };
1029 
1030 } // namespace detail
1031 } // namespace _V1
1032 } // namespace sycl
The context class represents a SYCL context on which kernel functions may be executed.
Definition: context.hpp:50
static GlobalHandler & instance()
bool isInFusionMode(QueueIdT Queue)
Definition: scheduler.cpp:639
static Scheduler & getInstance()
Definition: scheduler.cpp:248
event discard_or_return(const event &Event)
Definition: queue_impl.cpp:700
std::deque< std::shared_ptr< ext::oneapi::experimental::detail::graph_impl > > MMissedCleanupRequests
const property_list MPropList
Definition: queue_impl.hpp:957
bool hasDiscardEventsProperty() const
Definition: queue_impl.hpp:354
static ContextImplPtr getContext(const QueueImplPtr &Queue)
Definition: queue_impl.hpp:771
bool is_in_fusion_mode()
Check whether the queue is in fusion mode.
Definition: queue_impl.hpp:708
event submit(const std::function< void(handler &)> &CGF, const std::shared_ptr< queue_impl > &Self, const std::shared_ptr< queue_impl > &SecondQueue, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess=nullptr)
Submits a command group function object to the queue, in order to be scheduled for execution on the device.
Definition: queue_impl.hpp:401
void revisitUnenqueuedCommandsState(const EventImplPtr &CompletedHostTask)
Definition: queue_impl.cpp:706
uint64_t MInstanceID
The instance ID of the trace event for queue object.
Definition: queue_impl.hpp:998
std::vector< EventImplPtr > MStreamsServiceEvents
Definition: queue_impl.hpp:987
const property_list & getPropList() const
Definition: queue_impl.hpp:780
void wait_and_throw(const detail::code_location &Loc={})
Definition: queue_impl.hpp:454
event submitMemOpHelper(const std::shared_ptr< queue_impl > &Self, const std::vector< event > &DepEvents, bool CallerNeedsEvent, HandlerFuncT HandlerFunc, MemMngrFuncT MemMngrFunc, MemMngrArgTs... MemOpArgs)
Performs submission of a memory operation directly if scheduler can be bypassed, or with a handler otherwise.
Param::return_type get_info() const
Queries SYCL queue for information.
std::optional< event > MInOrderExternalEvent
std::optional< event > popExternalEvent()
Definition: queue_impl.hpp:747
sycl::detail::pi::PiQueue createQueue(QueueOrder Order)
Creates PI queue.
Definition: queue_impl.hpp:554
void registerStreamServiceEvent(const EventImplPtr &Event)
Definition: queue_impl.hpp:698
static std::atomic< unsigned long long > MNextAvailableQueueID
void addEvent(const event &Event)
Stores an event that should be associated with the queue.
Definition: queue_impl.cpp:298
std::vector< sycl::detail::pi::PiQueue > MQueues
List of queues created for FPGA device from a single SYCL queue.
Definition: queue_impl.hpp:960
pi_native_handle getNative(int32_t &NativeHandleDesc) const
Gets the native handle of the SYCL queue.
Definition: queue_impl.cpp:635
struct sycl::_V1::detail::queue_impl::DependencyTrackingItems MExtGraphDeps
sycl::detail::pi::PiQueue & getExclusiveQueueHandleRef()
Definition: queue_impl.hpp:587
void tryToResetEnqueuedBarrierDep(const EventImplPtr &EnqueuedBarrierEvent)
unsigned long long MQueueID
std::vector< std::weak_ptr< event_impl > > MEventsWeak
These events are tracked, but not owned, by the queue.
Definition: queue_impl.hpp:949
bool has_property() const noexcept
Definition: queue_impl.hpp:627
event submit(const std::function< void(handler &)> &CGF, const std::shared_ptr< queue_impl > &Self, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess=nullptr)
Submits a command group function object to the queue, in order to be scheduled for execution on the d...
Definition: queue_impl.hpp:426
std::vector< event > MEventsShared
Events without data dependencies (such as USM) need an owner, additionally, USM operations are not ad...
Definition: queue_impl.hpp:954
std::mutex MMutex
Protects all the fields that can be changed by the class's methods.
Definition: queue_impl.hpp:943
void submit_without_event(const std::function< void(handler &)> &CGF, const std::shared_ptr< queue_impl > &Self, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess=nullptr)
Definition: queue_impl.hpp:435
void setCommandGraph(std::shared_ptr< ext::oneapi::experimental::detail::graph_impl > Graph)
Definition: queue_impl.hpp:728
event memcpyToDeviceGlobal(const std::shared_ptr< queue_impl > &Self, void *DeviceGlobalPtr, const void *Src, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, const std::vector< event > &DepEvents, bool CallerNeedsEvent)
Definition: queue_impl.cpp:247
queue_impl(sycl::detail::pi::PiQueue PiQueue, const ContextImplPtr &Context, const async_handler &AsyncHandler)
Constructs a SYCL queue from plugin interoperability handle.
Definition: queue_impl.hpp:279
bool supportsDiscardingPiEvents() const
Definition: queue_impl.hpp:357
static ThreadPool & getThreadPool()
Definition: queue_impl.hpp:689
event submitWithHandler(const std::shared_ptr< queue_impl > &Self, const std::vector< event > &DepEvents, HandlerFuncT HandlerFunc)
Helper function for submitting a memory operation with a handler.
Definition: queue_impl.cpp:408
void doUnenqueuedCommandCleanup(const std::shared_ptr< ext::oneapi::experimental::detail::graph_impl > &Graph)
Definition: queue_impl.cpp:719
event mem_advise(const std::shared_ptr< queue_impl > &Self, const void *Ptr, size_t Length, pi_mem_advice Advice, const std::vector< event > &DepEvents, bool CallerNeedsEvent)
Provides additional information to the underlying runtime about how different allocations are used.
Definition: queue_impl.cpp:235
event submit_impl(const std::function< void(handler &)> &CGF, const std::shared_ptr< queue_impl > &Self, const std::shared_ptr< queue_impl > &PrimaryQueue, const std::shared_ptr< queue_impl > &SecondaryQueue, bool CallerNeedsEvent, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess)
Performs command group submission to the queue.
Definition: queue_impl.cpp:350
std::function< void(bool, bool, event &)> SubmitPostProcessF
Definition: queue_impl.hpp:386
event memcpyFromDeviceGlobal(const std::shared_ptr< queue_impl > &Self, void *Dest, const void *DeviceGlobalPtr, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, const std::vector< event > &DepEvents, bool CallerNeedsEvent)
Definition: queue_impl.cpp:263
void addSharedEvent(const event &Event)
queue_impl.addEvent tracks events with weak pointers but some events have no other owners.
Definition: queue_impl.cpp:321
const ContextImplPtr MContext
Definition: queue_impl.hpp:946
void finalizeHandler(HandlerType &Handler, event &EventRet)
Definition: queue_impl.hpp:797
void setExternalEvent(const event &Event)
Definition: queue_impl.hpp:742
sycl::detail::pi::PiQueue & getHandleRef()
Definition: queue_impl.hpp:618
void throw_asynchronous()
Passes any pending asynchronous exceptions recorded for this queue to its async_handler, if one was provided.
Definition: queue_impl.hpp:466
queue_impl(const DeviceImplPtr &Device, const ContextImplPtr &Context, const async_handler &AsyncHandler, const property_list &PropList)
Constructs a SYCL queue from a device and a context, using the provided async_handler and property_list.
Definition: queue_impl.hpp:107
void flush()
Provides a hint to the backend to execute previously issued commands on this queue.
Definition: queue_impl.hpp:375
const PluginPtr & getPlugin() const
Definition: queue_impl.hpp:344
const DeviceImplPtr & getDeviceImplPtr() const
Definition: queue_impl.hpp:348
const async_handler MAsyncHandler
Definition: queue_impl.hpp:956
exception_list getExceptionList() const
Definition: queue_impl.hpp:451
uint8_t MStreamID
The stream under which the traces are emitted from the queue object.
Definition: queue_impl.hpp:996
void reportAsyncException(const std::exception_ptr &ExceptionPtr)
Adds an exception to the list of asynchronous exceptions.
Definition: queue_impl.hpp:684
void wait(const detail::code_location &Loc={})
Performs a blocking wait for the completion of all enqueued tasks in the queue.
Definition: queue_impl.cpp:545
std::shared_ptr< ext::oneapi::experimental::detail::graph_impl > getCommandGraph() const
Definition: queue_impl.hpp:736
struct sycl::_V1::detail::queue_impl::DependencyTrackingItems MDefaultGraphDeps
EventImplPtr insertHelperBarrier(const HandlerType &Handler)
Definition: queue_impl.hpp:788
bool MEmulateOOO
Indicates that a native out-of-order queue could not be created and we need to emulate it with multip...
Definition: queue_impl.hpp:966
void instrumentationEpilog(void *TelementryEvent, std::string &Name, int32_t StreamID, uint64_t IId)
Definition: queue_impl.cpp:526
static sycl::detail::pi::PiQueueProperties createPiQueueProperties(const property_list &PropList, QueueOrder Order)
Creates PI properties array.
Definition: queue_impl.hpp:488
const ContextImplPtr & getContextImplPtr() const
Definition: queue_impl.hpp:346
void * instrumentationProlog(const detail::code_location &CodeLoc, std::string &Name, int32_t StreamID, uint64_t &iid)
Definition: queue_impl.cpp:469
size_t MNextQueueIdx
Iterator through MQueues.
Definition: queue_impl.hpp:962
std::weak_ptr< ext::oneapi::experimental::detail::graph_impl > MGraph
event memset(const std::shared_ptr< queue_impl > &Self, void *Ptr, int Value, size_t Count, const std::vector< event > &DepEvents, bool CallerNeedsEvent)
Fills the memory pointed by a USM pointer with the value specified.
Definition: queue_impl.cpp:148
event memcpy(const std::shared_ptr< queue_impl > &Self, void *Dest, const void *Src, size_t Count, const std::vector< event > &DepEvents, bool CallerNeedsEvent, const code_location &CodeLoc)
Copies data from one memory region to another, both pointed by USM pointers.
Definition: queue_impl.cpp:196
Param::return_type get_backend_info() const
Queries SYCL queue for SYCL backend-specific information.
const std::vector< event > & getExtendDependencyList(const std::vector< event > &DepEvents, std::vector< event > &MutableVec, std::unique_lock< std::mutex > &QueueLock)
Definition: queue_impl.cpp:126
queue_impl(const DeviceImplPtr &Device, const async_handler &AsyncHandler, const property_list &PropList)
Constructs a SYCL queue from a device using an async_handler and property_list provided.
Definition: queue_impl.hpp:94
static ContextImplPtr getDefaultOrNew(const DeviceImplPtr &Device)
Definition: queue_impl.hpp:75
unsigned long long getQueueID()
Definition: queue_impl.hpp:740
queue_impl(sycl::detail::pi::PiQueue PiQueue, const ContextImplPtr &Context, const async_handler &AsyncHandler, const property_list &PropList)
Constructs a SYCL queue from plugin interoperability handle.
Definition: queue_impl.hpp:298
propertyT get_property() const
Definition: queue_impl.hpp:634
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Definition: device.hpp:64
An event object can be used to synchronize memory transfers, enqueues of kernels and signaling barrie...
Definition: event.hpp:44
A list of asynchronous exceptions.
Implementation of node class from SYCL_EXT_ONEAPI_GRAPH.
Definition: graph_impl.hpp:80
Command group handler class.
Definition: handler.hpp:468
Objects of the property_list class are containers for the SYCL properties.
bool has_property() const noexcept
Encapsulates a single SYCL queue which schedules kernels on a SYCL device.
Definition: queue.hpp:110
#define __SYCL_PI_CUDA_USE_DEFAULT_STREAM
#define __SYCL_REPORT_EXCEPTION_TO_STREAM(str, e)
Definition: common.hpp:365
::pi_queue PiQueue
Definition: pi.hpp:113
::pi_queue_properties PiQueueProperties
Definition: pi.hpp:114
CUDAContextT
Possible CUDA context types supported by PI CUDA backend TODO: Implement this as a property once ther...
Definition: queue_impl.hpp:64
decltype(Obj::impl) const & getSyclObjImpl(const Obj &SyclObject)
Definition: impl_utils.hpp:31
constexpr const char * SYCL_STREAM_NAME
std::shared_ptr< sycl::detail::context_impl > ContextImplPtr
Definition: event_impl.hpp:32
std::shared_ptr< event_impl > EventImplPtr
Definition: handler.hpp:184
std::shared_ptr< plugin > PluginPtr
Definition: pi.hpp:47
std::shared_ptr< device_impl > DeviceImplPtr
static constexpr size_t MaxNumQueues
Sets max number of queues supported by FPGA RT.
Definition: queue_impl.hpp:60
CGType
Type of the command group.
Definition: cg_types.hpp:41
std::shared_ptr< sycl::detail::queue_impl > QueueImplPtr
Definition: helpers.hpp:46
constexpr CUDAContextT DefaultContextType
Default context type created for CUDA backend.
Definition: queue_impl.hpp:67
constexpr auto memory_order_relaxed
std::function< void(sycl::exception_list)> async_handler
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using e and sycl_category()
Definition: exception.cpp:64
Definition: access.hpp:18
constexpr pi_queue_properties PI_QUEUE_COMPUTE_INDEX
Definition: pi.h:883
pi_result piQueueFinish(pi_queue command_queue)
Definition: pi_cuda.cpp:186
uintptr_t pi_native_handle
Definition: pi.h:267
_pi_result
Definition: pi.h:274
constexpr pi_queue_properties PI_EXT_QUEUE_FLAG_SUBMISSION_IMMEDIATE
Definition: pi.h:893
@ PI_QUEUE_INFO_DEVICE
Definition: pi.h:577
_pi_mem_advice
Definition: pi.h:678
pi_result piEnqueueEventsWaitWithBarrier(pi_queue command_queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_cuda.cpp:689
constexpr pi_queue_properties PI_EXT_QUEUE_FLAG_SUBMISSION_NO_IMMEDIATE
Definition: pi.h:892
constexpr pi_queue_properties PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE
Definition: pi.h:885
pi_result piextQueueCreate(pi_context context, pi_device device, pi_queue_properties *properties, pi_queue *queue)
Definition: pi_cuda.cpp:167
pi_result piQueueRelease(pi_queue command_queue)
Definition: pi_cuda.cpp:182
constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW
Definition: pi.h:890
constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_FLAG_DISCARD_EVENTS
Definition: pi.h:889
constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH
Definition: pi.h:891
constexpr pi_queue_properties PI_QUEUE_FLAGS
Definition: pi.h:882
constexpr pi_queue_properties PI_QUEUE_FLAG_PROFILING_ENABLE
Definition: pi.h:886
pi_result piQueueFlush(pi_queue command_queue)
Definition: pi_cuda.cpp:188
pi_result piQueueGetInfo(pi_queue command_queue, pi_queue_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_cuda.cpp:172
pi_result piQueueRetain(pi_queue command_queue)
Definition: pi_cuda.cpp:180
_Abi const simd< _Tp, _Abi > & noexcept
Definition: simd.hpp:1324