DPC++ Runtime
Runtime libraries for oneAPI DPC++
queue_impl.hpp
Go to the documentation of this file.
1 //==------------------ queue_impl.hpp - SYCL queue -------------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #pragma once
10 
11 #include <detail/config.hpp>
12 #include <detail/context_impl.hpp>
13 #include <detail/device_impl.hpp>
14 #include <detail/device_info.hpp>
15 #include <detail/event_impl.hpp>
17 #include <detail/kernel_impl.hpp>
18 #include <detail/plugin.hpp>
20 #include <detail/thread_pool.hpp>
21 #include <sycl/context.hpp>
24 #include <sycl/device.hpp>
25 #include <sycl/event.hpp>
26 #include <sycl/exception.hpp>
27 #include <sycl/exception_list.hpp>
29 #include <sycl/handler.hpp>
32 #include <sycl/property_list.hpp>
33 #include <sycl/queue.hpp>
34 
35 #include "detail/graph_impl.hpp"
36 
37 #include <utility>
38 
39 #ifdef XPTI_ENABLE_INSTRUMENTATION
40 #include "xpti/xpti_trace_framework.hpp"
41 #include <detail/xpti_registry.hpp>
42 #endif
43 
44 namespace sycl {
45 inline namespace _V1 {
46 
47 // forward declaration
48 
49 namespace ext::oneapi::experimental::detail {
50 class graph_impl;
51 }
52 
53 namespace detail {
54 
55 using ContextImplPtr = std::shared_ptr<detail::context_impl>;
56 using DeviceImplPtr = std::shared_ptr<detail::device_impl>;
57 
// Upper bound on the number of PI queues kept in queue_impl::MQueues. While
// MQueues.size() is below this value a new slot is appended; once it is
// reached, an already existing queue is picked for reuse instead.
59 static constexpr size_t MaxNumQueues = 256;
60 
63 enum class CUDAContextT : char { primary, custom };
64 
67 
69 
70 class queue_impl {
71 public:
72  // \return a default context for the platform if it includes the device
73  // passed and default contexts are enabled, a new context otherwise.
77  context{createSyclObjFromImpl<device>(Device), {}, {}});
78 
79  ContextImplPtr DefaultContext = detail::getSyclObjImpl(
80  Device->get_platform().ext_oneapi_get_default_context());
81  if (DefaultContext->isDeviceValid(Device))
82  return DefaultContext;
84  context{createSyclObjFromImpl<device>(Device), {}, {}});
85  }
/// Constructs a queue for \p Device without an explicitly supplied context.
///
/// Delegates to the (Device, Context, AsyncHandler, PropList) constructor,
/// passing the result of getDefaultOrNew(Device) as the context.
///
/// \param Device device implementation the queue is created for.
/// \param AsyncHandler handler forwarded unchanged to the delegated
/// constructor.
/// \param PropList property list forwarded unchanged to the delegated
/// constructor.
93  queue_impl(const DeviceImplPtr &Device, const async_handler &AsyncHandler,
94  const property_list &PropList)
95  : queue_impl(Device, getDefaultOrNew(Device), AsyncHandler, PropList){};
96 
106  queue_impl(const DeviceImplPtr &Device, const ContextImplPtr &Context,
107  const async_handler &AsyncHandler, const property_list &PropList)
108  : MDevice(Device), MContext(Context), MAsyncHandler(AsyncHandler),
109  MPropList(PropList), MHostQueue(MDevice->is_host()),
110  MIsInorder(has_property<property::queue::in_order>()),
112  has_property<ext::oneapi::property::queue::discard_events>()),
113  MIsProfilingEnabled(has_property<property::queue::enable_profiling>()),
115  (MHostQueue ? true : MIsInorder)),
116  MQueueID{
118  if (has_property<property::queue::enable_profiling>()) {
119  if (has_property<ext::oneapi::property::queue::discard_events>())
121  "Queue cannot be constructed with both of "
122  "discard_events and enable_profiling.");
123  // fallback profiling support. See MFallbackProfiling
124  if (MDevice->has(aspect::queue_profiling)) {
125  // When piGetDeviceAndHostTimer is not supported, compute the
126  // profiling time OpenCL version < 2.1 case
127  if (!getDeviceImplPtr()->is_host() &&
128  !getDeviceImplPtr()->isGetDeviceAndHostTimerSupported())
129  MFallbackProfiling = true;
130  } else {
132  "Cannot enable profiling, the associated device "
133  "does not have the queue_profiling aspect");
134  }
135  }
136  if (has_property<ext::intel::property::queue::compute_index>()) {
137  int Idx = get_property<ext::intel::property::queue::compute_index>()
138  .get_index();
139  int NumIndices =
140  createSyclObjFromImpl<device>(Device)
141  .get_info<ext::intel::info::device::max_compute_queue_indices>();
142  if (Idx < 0 || Idx >= NumIndices)
143  throw sycl::exception(
145  "Queue compute index must be a non-negative number less than "
146  "device's number of available compute queue indices.");
147  }
148  if (has_property<
150  !MDevice->get_info<
151  ext::codeplay::experimental::info::device::supports_fusion>()) {
152  throw sycl::exception(
154  "Cannot enable fusion if device does not support fusion");
155  }
156  if (!Context->isDeviceValid(Device)) {
157  if (!Context->is_host() && Context->getBackend() == backend::opencl)
158  throw sycl::invalid_object_error(
159  "Queue cannot be constructed with the given context and device "
160  "since the device is not a member of the context (descendants of "
161  "devices from the context are not supported on OpenCL yet).",
162  PI_ERROR_INVALID_DEVICE);
163  throw sycl::invalid_object_error(
164  "Queue cannot be constructed with the given context and device "
165  "since the device is neither a member of the context nor a "
166  "descendant of its member.",
167  PI_ERROR_INVALID_DEVICE);
168  }
169  if (!MHostQueue) {
170  const QueueOrder QOrder =
172  MQueues.push_back(createQueue(QOrder));
173  // This section is the second part of the instrumentation that uses the
174  // tracepoint information and notifies
175  }
176 
177  // We enable XPTI tracing events using the TLS mechanism; if the code
178  // location data is available, then the tracing data will be rich.
179 #if XPTI_ENABLE_INSTRUMENTATION
180  constexpr uint16_t NotificationTraceType =
181  static_cast<uint16_t>(xpti::trace_point_type_t::queue_create);
182  // Using the instance override constructor for use with queues as queues
183  // maintain instance IDs in the object
184  XPTIScope PrepareNotify((void *)this, NotificationTraceType,
185  SYCL_STREAM_NAME, MQueueID, "queue_create");
186  // Cache the trace event, stream id and instance IDs for the destructor
187  if (xptiCheckTraceEnabled(PrepareNotify.streamID(),
188  NotificationTraceType)) {
189  MTraceEvent = (void *)PrepareNotify.traceEvent();
190  MStreamID = PrepareNotify.streamID();
191  MInstanceID = PrepareNotify.instanceID();
192  // Add the function to capture meta data for the XPTI trace event
193  PrepareNotify.addMetadata([&](auto TEvent) {
194  xpti::addMetadata(TEvent, "sycl_context",
195  reinterpret_cast<size_t>(MContext->getHandleRef()));
196  if (MDevice) {
197  xpti::addMetadata(TEvent, "sycl_device_name",
198  MDevice->getDeviceName());
199  xpti::addMetadata(
200  TEvent, "sycl_device",
201  reinterpret_cast<size_t>(
202  MDevice->is_host() ? 0 : MDevice->getHandleRef()));
203  }
204  xpti::addMetadata(TEvent, "is_inorder", MIsInorder);
205  xpti::addMetadata(TEvent, "queue_id", MQueueID);
206  if (!MHostQueue)
207  xpti::addMetadata(TEvent, "queue_handle",
208  reinterpret_cast<size_t>(getHandleRef()));
209  });
210  // Also publish to TLS
211  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, MQueueID);
212  PrepareNotify.notify();
213  }
214 #endif
215  }
216 
217  event getLastEvent();
218 
219 private:
220  void queue_impl_interop(sycl::detail::pi::PiQueue PiQueue) {
221  if (has_property<ext::oneapi::property::queue::discard_events>() &&
222  has_property<property::queue::enable_profiling>()) {
224  "Queue cannot be constructed with both of "
225  "discard_events and enable_profiling.");
226  }
227 
228  MQueues.push_back(pi::cast<sycl::detail::pi::PiQueue>(PiQueue));
229 
230  sycl::detail::pi::PiDevice DevicePI{};
231  const PluginPtr &Plugin = getPlugin();
232  // TODO catch an exception and put it to list of asynchronous exceptions
233  Plugin->call<PiApiKind::piQueueGetInfo>(
234  MQueues[0], PI_QUEUE_INFO_DEVICE, sizeof(DevicePI), &DevicePI, nullptr);
235  MDevice = MContext->findMatchingDeviceImpl(DevicePI);
236  if (MDevice == nullptr) {
237  throw sycl::exception(
239  "Device provided by native Queue not found in Context.");
240  }
241  // The following commented section provides a guideline on how to use the
242  // TLS enabled mechanism to create a tracepoint and notify using XPTI. This
243  // is the prolog section and the epilog section will initiate the
244  // notification.
245 #if XPTI_ENABLE_INSTRUMENTATION
246  constexpr uint16_t NotificationTraceType =
247  static_cast<uint16_t>(xpti::trace_point_type_t::queue_create);
248  XPTIScope PrepareNotify((void *)this, NotificationTraceType,
249  SYCL_STREAM_NAME, MQueueID, "queue_create");
250  if (xptiCheckTraceEnabled(PrepareNotify.streamID(),
251  NotificationTraceType)) {
252  // Cache the trace event, stream id and instance IDs for the destructor
253  MTraceEvent = (void *)PrepareNotify.traceEvent();
254  MStreamID = PrepareNotify.streamID();
255  MInstanceID = PrepareNotify.instanceID();
256 
257  // Add the function to capture meta data for the XPTI trace event
258  PrepareNotify.addMetadata([&](auto TEvent) {
259  xpti::addMetadata(TEvent, "sycl_context",
260  reinterpret_cast<size_t>(MContext->getHandleRef()));
261  if (MDevice) {
262  xpti::addMetadata(TEvent, "sycl_device_name",
263  MDevice->getDeviceName());
264  xpti::addMetadata(
265  TEvent, "sycl_device",
266  reinterpret_cast<size_t>(
267  MDevice->is_host() ? 0 : MDevice->getHandleRef()));
268  }
269  xpti::addMetadata(TEvent, "is_inorder", MIsInorder);
270  xpti::addMetadata(TEvent, "queue_id", MQueueID);
271  if (!MHostQueue)
272  xpti::addMetadata(TEvent, "queue_handle", getHandleRef());
273  });
274  // Also publish to TLS before notification
275  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, MQueueID);
276  PrepareNotify.notify();
277  }
278 #endif
279  }
280 
281 public:
289  const async_handler &AsyncHandler)
290  : MContext(Context), MAsyncHandler(AsyncHandler), MHostQueue(false),
291  MIsInorder(has_property<property::queue::in_order>()),
293  has_property<ext::oneapi::property::queue::discard_events>()),
294  MIsProfilingEnabled(has_property<property::queue::enable_profiling>()),
296  (MHostQueue ? true : MIsInorder)),
297  MQueueID{
299  queue_impl_interop(PiQueue);
300  }
301 
310  const async_handler &AsyncHandler, const property_list &PropList)
311  : MContext(Context), MAsyncHandler(AsyncHandler), MPropList(PropList),
312  MHostQueue(false),
313  MIsInorder(has_property<property::queue::in_order>()),
315  has_property<ext::oneapi::property::queue::discard_events>()),
316  MIsProfilingEnabled(has_property<property::queue::enable_profiling>()),
318  (MHostQueue ? true : MIsInorder)) {
319  queue_impl_interop(PiQueue);
320  }
321 
323  // The trace event created in the constructor should be active through the
324  // lifetime of the queue object as member variables when ABI breakage is
325  // allowed. This example shows MTraceEvent as a member variable.
326 #if XPTI_ENABLE_INSTRUMENTATION
327  constexpr uint16_t NotificationTraceType =
328  static_cast<uint16_t>(xpti::trace_point_type_t::queue_destroy);
329  if (xptiCheckTraceEnabled(MStreamID, NotificationTraceType)) {
330  // Use cached information in member variables
331  xptiNotifySubscribers(MStreamID, NotificationTraceType, nullptr,
332  (xpti::trace_event_data_t *)MTraceEvent,
333  MInstanceID,
334  static_cast<const void *>("queue_destroy"));
335  xptiReleaseEvent((xpti::trace_event_data_t *)MTraceEvent);
336  }
337 #endif
339  if (!MHostQueue) {
342  }
343  }
344 
346  cl_command_queue get() {
347  if (MHostQueue) {
348  throw invalid_object_error(
349  "This instance of queue doesn't support OpenCL interoperability",
350  PI_ERROR_INVALID_QUEUE);
351  }
353  return pi::cast<cl_command_queue>(MQueues[0]);
354  }
355 
358  return createSyclObjFromImpl<context>(MContext);
359  }
360 
/// \return the plugin reported by the context this queue holds (MContext).
361  const PluginPtr &getPlugin() const { return MContext->getPlugin(); }
362 
/// \return a reference to the stored context implementation pointer
/// (MContext).
363  const ContextImplPtr &getContextImplPtr() const { return MContext; }
364 
/// \return a reference to the stored device implementation pointer
/// (MDevice).
365  const DeviceImplPtr &getDeviceImplPtr() const { return MDevice; }
366 
/// \return a SYCL device object created from the stored device
/// implementation (MDevice).
368  device get_device() const { return createSyclObjFromImpl<device>(MDevice); }
369 
/// \return the value of MHostQueue, i.e. whether this queue is a host queue.
371  bool is_host() const { return MHostQueue; }
372 
376  }
377 
/// \return the value of MIsInorder, which the constructors initialize from
/// the presence of property::queue::in_order in the property list.
378  bool isInOrder() const { return MIsInorder; }
379 
383  template <typename Param> typename Param::return_type get_info() const;
384 
388  template <typename Param>
389  typename Param::return_type get_backend_info() const;
390 
394  void flush() {
395  if (MGraph.lock()) {
397  "flush cannot be called for a queue which is "
398  "recording to a command graph.");
399  }
400  for (const auto &queue : MQueues) {
402  }
403  }
404 
405  using SubmitPostProcessF = std::function<void(bool, bool, event &)>;
406 
/// Submits the command group function \p CGF to this queue and, if that
/// submission throws, retries the submission on \p SecondQueue.
///
/// \param CGF command group function object to submit.
/// \param Self shared pointer forwarded to submit_impl as both its Self and
/// PrimaryQueue arguments for the first attempt.
/// \param SecondQueue queue used for the retry; forwarded to submit_impl as
/// the SecondaryQueue argument in both attempts.
/// \param Loc code location forwarded to submit_impl.
/// \param PostProcess optional callback forwarded to submit_impl; defaults to
/// nullptr.
/// \return discard_or_return() applied to the event produced by whichever
/// submit_impl call completed. An exception thrown by the retry is not
/// caught here and propagates to the caller.
420  event submit(const std::function<void(handler &)> &CGF,
421  const std::shared_ptr<queue_impl> &Self,
422  const std::shared_ptr<queue_impl> &SecondQueue,
423  const detail::code_location &Loc,
424  const SubmitPostProcessF *PostProcess = nullptr) {
425  event ResEvent;
426  try {
427  ResEvent = submit_impl(CGF, Self, Self, SecondQueue, Loc, PostProcess);
428  } catch (...) {
// Any exception from the first attempt triggers the fallback submission.
// NOTE(review): SecondQueue is dereferenced without a null check here;
// presumably callers of this overload always pass a valid queue - confirm.
429  ResEvent = SecondQueue->submit_impl(CGF, SecondQueue, Self, SecondQueue,
430  Loc, PostProcess);
431  }
432  return discard_or_return(ResEvent);
433  }
434 
/// Submits the command group function \p CGF to this queue with no
/// secondary queue.
///
/// \param CGF command group function object to submit.
/// \param Self shared pointer forwarded to submit_impl as both its Self and
/// PrimaryQueue arguments.
/// \param Loc code location forwarded to submit_impl.
/// \param PostProcess optional callback forwarded to submit_impl; defaults to
/// nullptr.
/// \return discard_or_return() applied to the event returned by submit_impl.
443  event submit(const std::function<void(handler &)> &CGF,
444  const std::shared_ptr<queue_impl> &Self,
445  const detail::code_location &Loc,
446  const SubmitPostProcessF *PostProcess = nullptr) {
// nullptr is passed as the SecondaryQueue argument: there is no fallback.
447  auto ResEvent = submit_impl(CGF, Self, Self, nullptr, Loc, PostProcess);
448  return discard_or_return(ResEvent);
449  }
450 
456  void wait(const detail::code_location &Loc = {});
457 
460 
462  void wait_and_throw(const detail::code_location &Loc = {}) {
463  wait(Loc);
465  }
466 
475  if (!MAsyncHandler)
476  return;
477 
478  exception_list Exceptions;
479  {
480  std::lock_guard<std::mutex> Lock(MMutex);
481  std::swap(Exceptions, MExceptions);
482  }
483  // Unlock the mutex before calling user-provided handler to avoid
484  // potential deadlock if the same queue is somehow referenced in the
485  // handler.
486  if (Exceptions.size())
487  MAsyncHandler(std::move(Exceptions));
488  }
489 
497  sycl::detail::pi::PiQueueProperties CreationFlags = 0;
498 
499  if (Order == QueueOrder::OOO) {
501  }
502  if (PropList.has_property<property::queue::enable_profiling>()) {
503  CreationFlags |= PI_QUEUE_FLAG_PROFILING_ENABLE;
504  }
505  if (PropList.has_property<
506  ext::oneapi::cuda::property::queue::use_default_stream>()) {
507  CreationFlags |= __SYCL_PI_CUDA_USE_DEFAULT_STREAM;
508  }
509  if (PropList.has_property<ext::oneapi::property::queue::discard_events>()) {
510  // Pass this flag to the Level Zero plugin to be able to check it from
511  // queue property.
513  }
514  // Track that priority settings are not ambiguous.
515  bool PrioritySeen = false;
516  if (PropList
517  .has_property<ext::oneapi::property::queue::priority_normal>()) {
518  // Normal is the default priority, don't pass anything.
519  PrioritySeen = true;
520  }
521  if (PropList.has_property<ext::oneapi::property::queue::priority_low>()) {
522  if (PrioritySeen) {
523  throw sycl::exception(
525  "Queue cannot be constructed with different priorities.");
526  }
527  CreationFlags |= PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW;
528  PrioritySeen = true;
529  }
530  if (PropList.has_property<ext::oneapi::property::queue::priority_high>()) {
531  if (PrioritySeen) {
532  throw sycl::exception(
534  "Queue cannot be constructed with different priorities.");
535  }
537  }
538  // Track that submission modes do not conflict.
539  bool SubmissionSeen = false;
540  if (PropList.has_property<
541  ext::intel::property::queue::no_immediate_command_list>()) {
542  SubmissionSeen = true;
544  }
545  if (PropList.has_property<
546  ext::intel::property::queue::immediate_command_list>()) {
547  if (SubmissionSeen) {
548  throw sycl::exception(
550  "Queue cannot be constructed with different submission modes.");
551  }
552  SubmissionSeen = true;
554  }
555  return CreationFlags;
556  }
557 
564  sycl::detail::pi::PiContext Context = MContext->getHandleRef();
565  sycl::detail::pi::PiDevice Device = MDevice->getHandleRef();
566  const PluginPtr &Plugin = getPlugin();
567 
568  sycl::detail::pi::PiQueueProperties Properties[] = {
570  if (has_property<ext::intel::property::queue::compute_index>()) {
571  int Idx = get_property<ext::intel::property::queue::compute_index>()
572  .get_index();
573  Properties[2] = PI_QUEUE_COMPUTE_INDEX;
574  Properties[3] = static_cast<sycl::detail::pi::PiQueueProperties>(Idx);
575  }
577  Plugin->call_nocheck<PiApiKind::piextQueueCreate>(Context, Device,
578  Properties, &Queue);
579 
580  // If creating out-of-order queue failed and this property is not
581  // supported (for example, on FPGA), it will return
582  // PI_ERROR_INVALID_QUEUE_PROPERTIES and will try to create in-order queue.
583  if (!MEmulateOOO && Error == PI_ERROR_INVALID_QUEUE_PROPERTIES) {
584  MEmulateOOO = true;
586  } else {
587  Plugin->checkPiResult(Error);
588  }
589 
590  return Queue;
591  }
592 
596  sycl::detail::pi::PiQueue *PIQ = nullptr;
597  bool ReuseQueue = false;
598  {
599  std::lock_guard<std::mutex> Lock(MMutex);
600 
601  // To achieve parallelism for FPGA with in order execution model with
602  // possibility of two kernels to share data with each other we shall
603  // create a queue for every kernel enqueued.
604  if (MQueues.size() < MaxNumQueues) {
605  MQueues.push_back({});
606  PIQ = &MQueues.back();
607  } else {
608  // If the limit of OpenCL queues is going to be exceeded - take the
609  // earliest used queue, wait until it finished and then reuse it.
610  PIQ = &MQueues[MNextQueueIdx];
612  ReuseQueue = true;
613  }
614  }
615 
616  if (!ReuseQueue)
618  else
619  getPlugin()->call<PiApiKind::piQueueFinish>(*PIQ);
620 
621  return *PIQ;
622  }
623 
627  if (!MEmulateOOO)
628  return MQueues[0];
629 
631  }
632 
/// Queries the queue's stored property list (MPropList).
///
/// \tparam propertyT property type to look for.
/// \return the result of MPropList.has_property<propertyT>().
635  template <typename propertyT> bool has_property() const noexcept {
636  return MPropList.has_property<propertyT>();
637  }
638 
/// Retrieves a property from the queue's stored property list (MPropList).
///
/// \tparam propertyT property type to retrieve.
/// \return the result of MPropList.get_property<propertyT>(), by value.
642  template <typename propertyT> propertyT get_property() const {
643  return MPropList.get_property<propertyT>();
644  }
645 
/// Fills the memory pointed by a USM pointer with the value specified.
/// Declaration only; the definition lives in queue_impl.cpp.
655  event memset(const std::shared_ptr<queue_impl> &Self, void *Ptr, int Value,
656  size_t Count, const std::vector<event> &DepEvents);
667  event memcpy(const std::shared_ptr<queue_impl> &Self, void *Dest,
668  const void *Src, size_t Count,
669  const std::vector<event> &DepEvents,
670  const code_location &CodeLoc);
/// Provides additional information to the underlying runtime about how
/// different allocations are used.
/// Declaration only; the definition lives in queue_impl.cpp.
681  event mem_advise(const std::shared_ptr<queue_impl> &Self, const void *Ptr,
682  size_t Length, pi_mem_advice Advice,
683  const std::vector<event> &DepEvents);
684 
/// Records an asynchronous exception on this queue.
///
/// Appends \p ExceptionPtr to MExceptions while holding MMutex.
///
/// \param ExceptionPtr exception pointer to store.
688  void reportAsyncException(const std::exception_ptr &ExceptionPtr) {
689  std::lock_guard<std::mutex> Lock(MMutex);
690  MExceptions.PushBack(ExceptionPtr);
691  }
692 
695  }
696 
/// Gets the native handle of the SYCL queue.
/// NOTE(review): NativeHandleDesc is taken by non-const reference, so it is
/// presumably an output parameter - confirm against queue_impl.cpp.
700  pi_native_handle getNative(int32_t &NativeHandleDesc) const;
701 
703  std::lock_guard<std::mutex> Lock(MStreamsServiceEventsMutex);
704  MStreamsServiceEvents.push_back(Event);
705  }
706 
707  bool ext_oneapi_empty() const;
708 
714  std::hash<typename std::shared_ptr<queue_impl>::element_type *>()(
715  this));
716  }
717 
718  event memcpyToDeviceGlobal(const std::shared_ptr<queue_impl> &Self,
719  void *DeviceGlobalPtr, const void *Src,
720  bool IsDeviceImageScope, size_t NumBytes,
721  size_t Offset,
722  const std::vector<event> &DepEvents);
723  event memcpyFromDeviceGlobal(const std::shared_ptr<queue_impl> &Self,
724  void *Dest, const void *DeviceGlobalPtr,
725  bool IsDeviceImageScope, size_t NumBytes,
726  size_t Offset,
727  const std::vector<event> &DepEvents);
728 
730 
732  std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> Graph) {
733  std::lock_guard<std::mutex> Lock(MMutex);
734  MGraph = Graph;
735  MExtGraphDeps.LastEventPtr = nullptr;
736  }
737 
/// \return a shared_ptr obtained by locking the weak reference MGraph. The
/// result is empty when MGraph does not currently refer to a live graph.
738  std::shared_ptr<ext::oneapi::experimental::detail::graph_impl>
739  getCommandGraph() const {
740  return MGraph.lock();
741  }
742 
/// \return the ID stored in MQueueID.
743  unsigned long long getQueueID() { return MQueueID; }
744 
/// Stores \p Event in MInOrderExternalEvent, replacing any previously stored
/// value. The assignment is done while holding MInOrderExternalEventMtx.
///
/// \param Event event to store as the external event.
745  void setExternalEvent(const event &Event) {
746  std::lock_guard<std::mutex> Lock(MInOrderExternalEventMtx);
747  MInOrderExternalEvent = Event;
748  }
749 
/// Takes the stored external event out of the queue.
///
/// Under MInOrderExternalEventMtx, swaps MInOrderExternalEvent with an empty
/// optional, so the member is left as std::nullopt afterwards.
///
/// \return the previously stored event, or std::nullopt if none was stored.
750  std::optional<event> popExternalEvent() {
751  std::lock_guard<std::mutex> Lock(MInOrderExternalEventMtx);
752  std::optional<event> Result = std::nullopt;
753  std::swap(Result, MInOrderExternalEvent);
754  return Result;
755  }
756 
757  const std::vector<event> &
758  getExtendDependencyList(const std::vector<event> &DepEvents,
759  std::vector<event> &MutableVec,
760  std::unique_lock<std::mutex> &QueueLock);
761 
762  // Helps to manage host tasks presence in scenario with barrier usage.
763  // Approach that tracks almost all tasks to provide barrier sync for both pi
764  // tasks and host tasks is applicable for out of order queues only. No-op
765  // for in order ones.
766  void tryToResetEnqueuedBarrierDep(const EventImplPtr &EnqueuedBarrierEvent);
767 
768  // Called on host task completion that could block some kernels from enqueue.
769  // Approach that tracks almost all tasks to provide barrier sync for both pi
770  // tasks and host tasks is applicable for out of order queues only. Not needed
771  // for in order ones.
772  void revisitUnenqueuedCommandsState(const EventImplPtr &CompletedHostTask);
773 
774 protected:
775  event discard_or_return(const event &Event);
776  // Hook to the scheduler to clean up any fusion command held on destruction.
777  void cleanup_fusion_cmd();
778 
779  // template is needed for proper unit testing
780  template <typename HandlerType = handler>
781  void finalizeHandler(HandlerType &Handler, event &EventRet) {
782  if (MIsInorder) {
783  // Accessing and changing of an event isn't atomic operation.
784  // Hence, here is the lock for thread-safety.
785  std::lock_guard<std::mutex> Lock{MMutex};
786  // This dependency is needed for the following purposes:
787  // - host tasks are handled by the runtime and cannot be implicitly
788  // synchronized by the backend.
789  // - to prevent the 2nd kernel enqueue when the 1st kernel is blocked
790  // by a host task. This dependency allows to build the enqueue order in
791  // the RT but will not be passed to the backend. See getPIEvents in
792  // Command.
793 
794  auto &EventToBuildDeps = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr
796  if (EventToBuildDeps)
797  Handler.depends_on(EventToBuildDeps);
798 
799  // If there is an external event set, add it as a dependency and clear it.
800  // We do not need to hold the lock as MLastEventMtx will ensure the last
801  // event reflects the corresponding external event dependence as well.
802  std::optional<event> ExternalEvent = popExternalEvent();
803  if (ExternalEvent)
804  Handler.depends_on(*ExternalEvent);
805 
806  EventRet = Handler.finalize();
807  EventToBuildDeps = getSyclObjImpl(EventRet);
808  } else {
809  const CG::CGTYPE Type = Handler.getType();
810 
811  // The following code supports barrier synchronization if host task is
812  // involved in the scenario. Native barriers cannot handle host task
813  // dependency so in the case where some commands were not enqueued
814  // (blocked), we track them to prevent barrier from being enqueued
815  // earlier.
816  std::lock_guard<std::mutex> Lock{MMutex};
817  auto &Deps = MGraph.expired() ? MDefaultGraphDeps : MExtGraphDeps;
818  if (Type == CG::Barrier && !Deps.UnenqueuedCmdEvents.empty()) {
819  Handler.depends_on(Deps.UnenqueuedCmdEvents);
820  }
821  if (Deps.LastBarrier)
822  Handler.depends_on(Deps.LastBarrier);
823  EventRet = Handler.finalize();
824  EventImplPtr EventRetImpl = getSyclObjImpl(EventRet);
825  if (Type == CG::CodeplayHostTask)
826  Deps.UnenqueuedCmdEvents.push_back(EventRetImpl);
827  else if (!EventRetImpl->isEnqueued()) {
828  if (Type == CG::Barrier || Type == CG::BarrierWaitlist) {
829  Deps.LastBarrier = EventRetImpl;
830  Deps.UnenqueuedCmdEvents.clear();
831  } else
832  Deps.UnenqueuedCmdEvents.push_back(EventRetImpl);
833  }
834  }
835  }
836 
/// Performs command group submission to the queue.
/// Declaration only; the definition lives in queue_impl.cpp. Both submit()
/// overloads of this class forward their arguments to this function.
847  event submit_impl(const std::function<void(handler &)> &CGF,
848  const std::shared_ptr<queue_impl> &Self,
849  const std::shared_ptr<queue_impl> &PrimaryQueue,
850  const std::shared_ptr<queue_impl> &SecondaryQueue,
851  const detail::code_location &Loc,
852  const SubmitPostProcessF *PostProcess);
853 
859  template <typename HandlerFuncT>
860  event submitWithHandler(const std::shared_ptr<queue_impl> &Self,
861  const std::vector<event> &DepEvents,
862  HandlerFuncT HandlerFunc);
863 
877  template <typename HandlerFuncT, typename MemMngrFuncT,
878  typename... MemMngrArgTs>
879  event submitMemOpHelper(const std::shared_ptr<queue_impl> &Self,
880  const std::vector<event> &DepEvents,
881  HandlerFuncT HandlerFunc, MemMngrFuncT MemMngrFunc,
882  MemMngrArgTs... MemOpArgs);
883 
884  // When instrumentation is enabled emits trace event for wait begin and
885  // returns the telemetry event generated for the wait
886  void *instrumentationProlog(const detail::code_location &CodeLoc,
887  std::string &Name, int32_t StreamID,
888  uint64_t &iid);
889  // Uses events generated by the Prolog and emits wait done event
890  void instrumentationEpilog(void *TelementryEvent, std::string &Name,
891  int32_t StreamID, uint64_t IId);
892 
/// queue_impl.addEvent tracks events with weak pointers but some events have
/// no other owners.
/// NOTE(review): presumably this stores \p Event in MEventsShared so the
/// queue owns it - confirm against queue_impl.cpp.
898  void addSharedEvent(const event &Event);
899 
/// Stores an event that should be associated with the queue.
/// Declaration only; the definition lives in queue_impl.cpp.
903  void addEvent(const event &Event);
904 
906  mutable std::mutex MMutex;
907 
910 
912  std::vector<std::weak_ptr<event_impl>> MEventsWeak;
913 
917  std::vector<event> MEventsShared;
921 
923  std::vector<sycl::detail::pi::PiQueue> MQueues;
925  size_t MNextQueueIdx = 0;
926 
927  const bool MHostQueue = false;
930  bool MEmulateOOO = false;
931 
932  // Access should be guarded with MMutex
934  // This event is employed for enhanced dependency tracking with in-order
935  // queue
937  // The following two items are employed for proper out of order enqueue
938  // ordering
939  std::vector<EventImplPtr> UnenqueuedCmdEvents;
942 
943  const bool MIsInorder;
944 
945  std::vector<EventImplPtr> MStreamsServiceEvents;
947 
948  // All member variable defined here are needed for the SYCL instrumentation
949  // layer. Do not guard these variables below with XPTI_ENABLE_INSTRUMENTATION
950  // to ensure we have the same object layout when the macro in the library and
951  // SYCL app are not the same.
952  void *MTraceEvent = nullptr;
954  uint8_t MStreamID = 0;
956  uint64_t MInstanceID = 0;
957 
958  // the fallback implementation of profiling info
959  bool MFallbackProfiling = false;
960 
961  // This event can be optionally provided by users for in-order queues to add
962  // an additional dependency for the subsequent submission in to the queue.
963  // Access to the event should be guarded with MInOrderExternalEventMtx.
964  // NOTE: std::optional must not be exposed in the ABI.
965  std::optional<event> MInOrderExternalEvent;
966  mutable std::mutex MInOrderExternalEventMtx;
967 
968 public:
969  // Queue constructed with the discard_events property
970  const bool MDiscardEvents;
972 
973 protected:
974  // Indicates whether the queue supports discarding PI events for tasks
975  // submitted to it. This condition is necessary but not sufficient, PI events
976  // should be discarded only if they also don't represent potential implicit
977  // dependencies for future tasks in other queues.
979 
980  // Command graph which is associated with this queue for the purposes of
981  // recording commands to it.
982  std::weak_ptr<ext::oneapi::experimental::detail::graph_impl> MGraph{};
983 
984  unsigned long long MQueueID;
985  static std::atomic<unsigned long long> MNextAvailableQueueID;
986 
988 };
989 
990 } // namespace detail
991 } // namespace _V1
992 } // namespace sycl
The context class represents a SYCL context on which kernel functions may be executed.
Definition: context.hpp:50
CGTYPE
Type of the command group.
Definition: cg.hpp:56
static GlobalHandler & instance()
bool isInFusionMode(QueueIdT Queue)
Definition: scheduler.cpp:648
static Scheduler & getInstance()
Definition: scheduler.cpp:261
event discard_or_return(const event &Event)
Definition: queue_impl.cpp:681
const property_list MPropList
Definition: queue_impl.hpp:920
bool is_in_fusion_mode()
Check whether the queue is in fusion mode.
Definition: queue_impl.hpp:712
event submit(const std::function< void(handler &)> &CGF, const std::shared_ptr< queue_impl > &Self, const std::shared_ptr< queue_impl > &SecondQueue, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess=nullptr)
Submits a command group function object to the queue, in order to be scheduled for execution on the d...
Definition: queue_impl.hpp:420
void revisitUnenqueuedCommandsState(const EventImplPtr &CompletedHostTask)
Definition: queue_impl.cpp:687
uint64_t MInstanceID
The instance ID of the trace event for queue object.
Definition: queue_impl.hpp:956
std::vector< EventImplPtr > MStreamsServiceEvents
Definition: queue_impl.hpp:945
void wait_and_throw(const detail::code_location &Loc={})
Definition: queue_impl.hpp:462
Param::return_type get_info() const
Queries SYCL queue for information.
std::optional< event > MInOrderExternalEvent
Definition: queue_impl.hpp:965
std::optional< event > popExternalEvent()
Definition: queue_impl.hpp:750
sycl::detail::pi::PiQueue createQueue(QueueOrder Order)
Creates PI queue.
Definition: queue_impl.hpp:562
void registerStreamServiceEvent(const EventImplPtr &Event)
Definition: queue_impl.hpp:702
static std::atomic< unsigned long long > MNextAvailableQueueID
Definition: queue_impl.hpp:985
void addEvent(const event &Event)
Stores an event that should be associated with the queue.
Definition: queue_impl.cpp:295
std::vector< sycl::detail::pi::PiQueue > MQueues
List of queues created for FPGA device from a single SYCL queue.
Definition: queue_impl.hpp:923
pi_native_handle getNative(int32_t &NativeHandleDesc) const
Gets the native handle of the SYCL queue.
Definition: queue_impl.cpp:621
struct sycl::_V1::detail::queue_impl::DependencyTrackingItems MExtGraphDeps
sycl::detail::pi::PiQueue & getExclusiveQueueHandleRef()
Definition: queue_impl.hpp:595
void tryToResetEnqueuedBarrierDep(const EventImplPtr &EnqueuedBarrierEvent)
unsigned long long MQueueID
Definition: queue_impl.hpp:984
std::vector< std::weak_ptr< event_impl > > MEventsWeak
These events are tracked, but not owned, by the queue.
Definition: queue_impl.hpp:912
event submitMemOpHelper(const std::shared_ptr< queue_impl > &Self, const std::vector< event > &DepEvents, HandlerFuncT HandlerFunc, MemMngrFuncT MemMngrFunc, MemMngrArgTs... MemOpArgs)
Performs submission of a memory operation directly if scheduler can be bypassed, or with a handler ot...
bool has_property() const noexcept
Definition: queue_impl.hpp:635
event submit(const std::function< void(handler &)> &CGF, const std::shared_ptr< queue_impl > &Self, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess=nullptr)
Submits a command group function object to the queue, in order to be scheduled for execution on the d...
Definition: queue_impl.hpp:443
std::vector< event > MEventsShared
Events without data dependencies (such as USM) need an owner, additionally, USM operations are not ad...
Definition: queue_impl.hpp:917
std::mutex MMutex
Protects all the fields that can be changed by class' methods.
Definition: queue_impl.hpp:906
void setCommandGraph(std::shared_ptr< ext::oneapi::experimental::detail::graph_impl > Graph)
Definition: queue_impl.hpp:731
queue_impl(sycl::detail::pi::PiQueue PiQueue, const ContextImplPtr &Context, const async_handler &AsyncHandler)
Constructs a SYCL queue from plugin interoperability handle.
Definition: queue_impl.hpp:288
bool supportsDiscardingPiEvents() const
Definition: queue_impl.hpp:374
event submitWithHandler(const std::shared_ptr< queue_impl > &Self, const std::vector< event > &DepEvents, HandlerFuncT HandlerFunc)
Helper function for submitting a memory operation with a handler.
Definition: queue_impl.cpp:388
std::function< void(bool, bool, event &)> SubmitPostProcessF
Definition: queue_impl.hpp:405
void addSharedEvent(const event &Event)
queue_impl.addEvent tracks events with weak pointers but some events have no other owners.
Definition: queue_impl.cpp:318
const ContextImplPtr MContext
Definition: queue_impl.hpp:909
void finalizeHandler(HandlerType &Handler, event &EventRet)
Definition: queue_impl.hpp:781
void setExternalEvent(const event &Event)
Definition: queue_impl.hpp:745
sycl::detail::pi::PiQueue & getHandleRef()
Definition: queue_impl.hpp:626
void throw_asynchronous()
Passes any pending asynchronous exceptions collected by the queue to its async_handler.
Definition: queue_impl.hpp:474
event memset(const std::shared_ptr< queue_impl > &Self, void *Ptr, int Value, size_t Count, const std::vector< event > &DepEvents)
Fills the memory pointed by a USM pointer with the value specified.
Definition: queue_impl.cpp:147
queue_impl(const DeviceImplPtr &Device, const ContextImplPtr &Context, const async_handler &AsyncHandler, const property_list &PropList)
Constructs a SYCL queue with an async_handler and property_list provided from a device and a context.
Definition: queue_impl.hpp:106
void flush()
Provides a hint to the backend to execute previously issued commands on this queue.
Definition: queue_impl.hpp:394
const PluginPtr & getPlugin() const
Definition: queue_impl.hpp:361
event submit_impl(const std::function< void(handler &)> &CGF, const std::shared_ptr< queue_impl > &Self, const std::shared_ptr< queue_impl > &PrimaryQueue, const std::shared_ptr< queue_impl > &SecondaryQueue, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess)
Performs command group submission to the queue.
Definition: queue_impl.cpp:347
const DeviceImplPtr & getDeviceImplPtr() const
Definition: queue_impl.hpp:365
event mem_advise(const std::shared_ptr< queue_impl > &Self, const void *Ptr, size_t Length, pi_mem_advice Advice, const std::vector< event > &DepEvents)
Provides additional information to the underlying runtime about how different allocations are used.
Definition: queue_impl.cpp:233
const async_handler MAsyncHandler
Definition: queue_impl.hpp:919
event memcpyToDeviceGlobal(const std::shared_ptr< queue_impl > &Self, void *DeviceGlobalPtr, const void *Src, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, const std::vector< event > &DepEvents)
Definition: queue_impl.cpp:244
exception_list getExceptionList() const
Definition: queue_impl.hpp:459
event memcpyFromDeviceGlobal(const std::shared_ptr< queue_impl > &Self, void *Dest, const void *DeviceGlobalPtr, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, const std::vector< event > &DepEvents)
Definition: queue_impl.cpp:260
event memcpy(const std::shared_ptr< queue_impl > &Self, void *Dest, const void *Src, size_t Count, const std::vector< event > &DepEvents, const code_location &CodeLoc)
Copies data from one memory region to another, both pointed by USM pointers.
Definition: queue_impl.cpp:194
uint8_t MStreamID
The stream under which the traces are emitted from the queue object.
Definition: queue_impl.hpp:954
void reportAsyncException(const std::exception_ptr &ExceptionPtr)
Adds an exception to the list of asynchronous exceptions.
Definition: queue_impl.hpp:688
void wait(const detail::code_location &Loc={})
Performs a blocking wait for the completion of all enqueued tasks in the queue.
Definition: queue_impl.cpp:538
std::shared_ptr< ext::oneapi::experimental::detail::graph_impl > getCommandGraph() const
Definition: queue_impl.hpp:739
struct sycl::_V1::detail::queue_impl::DependencyTrackingItems MDefaultGraphDeps
bool MEmulateOOO
Indicates that a native out-of-order queue could not be created and we need to emulate it with multiple native in-order queues.
Definition: queue_impl.hpp:930
void instrumentationEpilog(void *TelementryEvent, std::string &Name, int32_t StreamID, uint64_t IId)
Definition: queue_impl.cpp:519
static sycl::detail::pi::PiQueueProperties createPiQueueProperties(const property_list &PropList, QueueOrder Order)
Creates PI properties array.
Definition: queue_impl.hpp:496
const ContextImplPtr & getContextImplPtr() const
Definition: queue_impl.hpp:363
void * instrumentationProlog(const detail::code_location &CodeLoc, std::string &Name, int32_t StreamID, uint64_t &iid)
Definition: queue_impl.cpp:450
size_t MNextQueueIdx
Iterator through MQueues.
Definition: queue_impl.hpp:925
std::weak_ptr< ext::oneapi::experimental::detail::graph_impl > MGraph
Definition: queue_impl.hpp:982
Param::return_type get_backend_info() const
Queries SYCL queue for SYCL backend-specific information.
const std::vector< event > & getExtendDependencyList(const std::vector< event > &DepEvents, std::vector< event > &MutableVec, std::unique_lock< std::mutex > &QueueLock)
Definition: queue_impl.cpp:125
queue_impl(const DeviceImplPtr &Device, const async_handler &AsyncHandler, const property_list &PropList)
Constructs a SYCL queue from a device using an async_handler and property_list provided.
Definition: queue_impl.hpp:93
static ContextImplPtr getDefaultOrNew(const DeviceImplPtr &Device)
Definition: queue_impl.hpp:74
unsigned long long getQueueID()
Definition: queue_impl.hpp:743
queue_impl(sycl::detail::pi::PiQueue PiQueue, const ContextImplPtr &Context, const async_handler &AsyncHandler, const property_list &PropList)
Constructs a SYCL queue from plugin interoperability handle.
Definition: queue_impl.hpp:309
propertyT get_property() const
Definition: queue_impl.hpp:642
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Definition: device.hpp:64
An event object can be used to synchronize memory transfers, enqueues of kernels and signaling barriers.
Definition: event.hpp:44
A list of asynchronous exceptions.
Implementation of node class from SYCL_EXT_ONEAPI_GRAPH.
Definition: graph_impl.hpp:80
Command group handler class.
Definition: handler.hpp:462
Objects of the property_list class are containers for the SYCL properties.
bool has_property() const noexcept
Encapsulates a single SYCL queue which schedules kernels on a SYCL device.
Definition: queue.hpp:111
#define __SYCL_PI_CUDA_USE_DEFAULT_STREAM
::pi_queue PiQueue
Definition: pi.hpp:122
::pi_queue_properties PiQueueProperties
Definition: pi.hpp:123
CUDAContextT
Possible CUDA context types supported by PI CUDA backend TODO: Implement this as a property once ther...
Definition: queue_impl.hpp:63
constexpr const char * SYCL_STREAM_NAME
std::shared_ptr< sycl::detail::context_impl > ContextImplPtr
Definition: event_impl.hpp:32
decltype(Obj::impl) getSyclObjImpl(const Obj &SyclObject)
Definition: impl_utils.hpp:30
std::shared_ptr< event_impl > EventImplPtr
Definition: cg.hpp:43
std::shared_ptr< plugin > PluginPtr
Definition: pi.hpp:47
std::shared_ptr< device_impl > DeviceImplPtr
static constexpr size_t MaxNumQueues
Sets max number of queues supported by FPGA RT.
Definition: queue_impl.hpp:59
constexpr CUDAContextT DefaultContextType
Default context type created for CUDA backend.
Definition: queue_impl.hpp:66
constexpr auto memory_order_relaxed
std::function< void(sycl::exception_list)> async_handler
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using E and sycl_category().
Definition: exception.cpp:93
Definition: access.hpp:18
constexpr pi_queue_properties PI_QUEUE_COMPUTE_INDEX
Definition: pi.h:822
pi_result piQueueFinish(pi_queue command_queue)
Definition: pi_cuda.cpp:186
uintptr_t pi_native_handle
Definition: pi.h:228
_pi_result
Definition: pi.h:235
constexpr pi_queue_properties PI_EXT_QUEUE_FLAG_SUBMISSION_IMMEDIATE
Definition: pi.h:832
@ PI_QUEUE_INFO_DEVICE
Definition: pi.h:525
_pi_mem_advice
Definition: pi.h:626
constexpr pi_queue_properties PI_EXT_QUEUE_FLAG_SUBMISSION_NO_IMMEDIATE
Definition: pi.h:831
constexpr pi_queue_properties PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE
Definition: pi.h:824
pi_result piextQueueCreate(pi_context context, pi_device device, pi_queue_properties *properties, pi_queue *queue)
Definition: pi_cuda.cpp:167
pi_result piQueueRelease(pi_queue command_queue)
Definition: pi_cuda.cpp:182
constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW
Definition: pi.h:829
constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_FLAG_DISCARD_EVENTS
Definition: pi.h:828
constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH
Definition: pi.h:830
constexpr pi_queue_properties PI_QUEUE_FLAGS
Definition: pi.h:821
constexpr pi_queue_properties PI_QUEUE_FLAG_PROFILING_ENABLE
Definition: pi.h:825
pi_result piQueueFlush(pi_queue command_queue)
Definition: pi_cuda.cpp:188
pi_result piQueueGetInfo(pi_queue command_queue, pi_queue_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_cuda.cpp:172
pi_result piQueueRetain(pi_queue command_queue)
Definition: pi_cuda.cpp:180
_Abi const simd< _Tp, _Abi > & noexcept
Definition: simd.hpp:1324