DPC++ Runtime
Runtime libraries for oneAPI DPC++
queue_impl.hpp
Go to the documentation of this file.
1 //==------------------ queue_impl.hpp - SYCL queue -------------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #pragma once
10 
11 #include <detail/config.hpp>
12 #include <detail/context_impl.hpp>
13 #include <detail/device_impl.hpp>
14 #include <detail/device_info.hpp>
15 #include <detail/event_impl.hpp>
17 #include <detail/kernel_impl.hpp>
18 #include <detail/plugin.hpp>
20 #include <detail/thread_pool.hpp>
21 #include <sycl/context.hpp>
24 #include <sycl/device.hpp>
25 #include <sycl/event.hpp>
26 #include <sycl/exception.hpp>
27 #include <sycl/exception_list.hpp>
29 #include <sycl/handler.hpp>
32 #include <sycl/property_list.hpp>
33 #include <sycl/queue.hpp>
34 
35 #include "detail/graph_impl.hpp"
36 
37 #include <utility>
38 
39 #ifdef XPTI_ENABLE_INSTRUMENTATION
40 #include "xpti/xpti_trace_framework.hpp"
41 #include <detail/xpti_registry.hpp>
42 #endif
43 
44 namespace sycl {
45 inline namespace _V1 {
46 
47 // forward declaration
48 
49 namespace ext::oneapi::experimental::detail {
50 class graph_impl;
51 }
52 
53 namespace detail {
54 
55 using ContextImplPtr = std::shared_ptr<detail::context_impl>;
56 using DeviceImplPtr = std::shared_ptr<detail::device_impl>;
57 
/// Upper bound on the number of native queues a single queue_impl keeps in
/// MQueues. Once this many exist, getExclusiveQueueHandleRef() stops creating
/// new ones and reuses the already created queues instead.
///
/// Declared `inline` so that every translation unit including this header
/// shares one definition instead of getting its own internal-linkage copy.
inline constexpr size_t MaxNumQueues = 256;
60 
/// Kind of CUDA context; stored in a single char.
enum class CUDAContextT : char {
  primary,
  custom,
};
64 
67 
69 
70 class queue_impl {
71 public:
72  // \return a default context for the platform if it includes the device
73  // passed and default contexts are enabled, a new context otherwise.
77  context{createSyclObjFromImpl<device>(Device), {}, {}});
78 
79  ContextImplPtr DefaultContext = detail::getSyclObjImpl(
80  Device->get_platform().ext_oneapi_get_default_context());
81  if (DefaultContext->isDeviceValid(Device))
82  return DefaultContext;
84  context{createSyclObjFromImpl<device>(Device), {}, {}});
85  }
93  queue_impl(const DeviceImplPtr &Device, const async_handler &AsyncHandler,
94  const property_list &PropList)
95  : queue_impl(Device, getDefaultOrNew(Device), AsyncHandler, PropList){};
96 
106  queue_impl(const DeviceImplPtr &Device, const ContextImplPtr &Context,
107  const async_handler &AsyncHandler, const property_list &PropList)
108  : MDevice(Device), MContext(Context), MAsyncHandler(AsyncHandler),
109  MPropList(PropList), MHostQueue(MDevice->is_host()),
110  MIsInorder(has_property<property::queue::in_order>()),
112  has_property<ext::oneapi::property::queue::discard_events>()),
113  MIsProfilingEnabled(has_property<property::queue::enable_profiling>()),
115  (MHostQueue ? true : MIsInorder)),
116  MQueueID{
118  if (has_property<property::queue::enable_profiling>()) {
119  if (has_property<ext::oneapi::property::queue::discard_events>())
121  "Queue cannot be constructed with both of "
122  "discard_events and enable_profiling.");
123  // fallback profiling support. See MFallbackProfiling
124  if (MDevice->has(aspect::queue_profiling)) {
125  // When piGetDeviceAndHostTimer is not supported, compute the
126  // profiling time OpenCL version < 2.1 case
127  if (!getDeviceImplPtr()->is_host() &&
128  !getDeviceImplPtr()->isGetDeviceAndHostTimerSupported())
129  MFallbackProfiling = true;
130  } else {
132  "Cannot enable profiling, the associated device "
133  "does not have the queue_profiling aspect");
134  }
135  }
136  if (has_property<ext::intel::property::queue::compute_index>()) {
137  int Idx = get_property<ext::intel::property::queue::compute_index>()
138  .get_index();
139  int NumIndices =
140  createSyclObjFromImpl<device>(Device)
141  .get_info<ext::intel::info::device::max_compute_queue_indices>();
142  if (Idx < 0 || Idx >= NumIndices)
143  throw sycl::exception(
145  "Queue compute index must be a non-negative number less than "
146  "device's number of available compute queue indices.");
147  }
148  if (has_property<
150  !MDevice->get_info<
151  ext::codeplay::experimental::info::device::supports_fusion>()) {
152  throw sycl::exception(
154  "Cannot enable fusion if device does not support fusion");
155  }
156  if (!Context->isDeviceValid(Device)) {
157  if (!Context->is_host() && Context->getBackend() == backend::opencl)
158  throw sycl::invalid_object_error(
159  "Queue cannot be constructed with the given context and device "
160  "since the device is not a member of the context (descendants of "
161  "devices from the context are not supported on OpenCL yet).",
162  PI_ERROR_INVALID_DEVICE);
163  throw sycl::invalid_object_error(
164  "Queue cannot be constructed with the given context and device "
165  "since the device is neither a member of the context nor a "
166  "descendant of its member.",
167  PI_ERROR_INVALID_DEVICE);
168  }
169  if (!MHostQueue) {
170  const QueueOrder QOrder =
172  MQueues.push_back(createQueue(QOrder));
173  // This section is the second part of the instrumentation that uses the
174  // tracepoint information and notifies
175  }
176 
177  // We enable XPTI tracing events using the TLS mechanism; if the code
178  // location data is available, then the tracing data will be rich.
179 #if XPTI_ENABLE_INSTRUMENTATION
180  constexpr uint16_t NotificationTraceType =
181  static_cast<uint16_t>(xpti::trace_point_type_t::queue_create);
182  // Using the instance override constructor for use with queues as queues
183  // maintain instance IDs in the object
184  XPTIScope PrepareNotify((void *)this, NotificationTraceType,
185  SYCL_STREAM_NAME, MQueueID, "queue_create");
186  // Cache the trace event, stream id and instance IDs for the destructor
187  if (xptiCheckTraceEnabled(PrepareNotify.streamID(),
188  NotificationTraceType)) {
189  MTraceEvent = (void *)PrepareNotify.traceEvent();
190  MStreamID = PrepareNotify.streamID();
191  MInstanceID = PrepareNotify.instanceID();
192  // Add the function to capture meta data for the XPTI trace event
193  PrepareNotify.addMetadata([&](auto TEvent) {
194  xpti::addMetadata(TEvent, "sycl_context",
195  reinterpret_cast<size_t>(MContext->getHandleRef()));
196  if (MDevice) {
197  xpti::addMetadata(TEvent, "sycl_device_name",
198  MDevice->getDeviceName());
199  xpti::addMetadata(
200  TEvent, "sycl_device",
201  reinterpret_cast<size_t>(
202  MDevice->is_host() ? 0 : MDevice->getHandleRef()));
203  }
204  xpti::addMetadata(TEvent, "is_inorder", MIsInorder);
205  xpti::addMetadata(TEvent, "queue_id", MQueueID);
206  if (!MHostQueue)
207  xpti::addMetadata(TEvent, "queue_handle",
208  reinterpret_cast<size_t>(getHandleRef()));
209  });
210  // Also publish to TLS
211  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, MQueueID);
212  PrepareNotify.notify();
213  }
214 #endif
215  }
216 
217  event getLastEvent();
218 
219 private:
220  void queue_impl_interop(sycl::detail::pi::PiQueue PiQueue) {
221  if (has_property<ext::oneapi::property::queue::discard_events>() &&
222  has_property<property::queue::enable_profiling>()) {
224  "Queue cannot be constructed with both of "
225  "discard_events and enable_profiling.");
226  }
227 
228  MQueues.push_back(pi::cast<sycl::detail::pi::PiQueue>(PiQueue));
229 
230  sycl::detail::pi::PiDevice DevicePI{};
231  const PluginPtr &Plugin = getPlugin();
232  // TODO catch an exception and put it to list of asynchronous exceptions
233  Plugin->call<PiApiKind::piQueueGetInfo>(
234  MQueues[0], PI_QUEUE_INFO_DEVICE, sizeof(DevicePI), &DevicePI, nullptr);
235  MDevice = MContext->findMatchingDeviceImpl(DevicePI);
236  if (MDevice == nullptr) {
237  throw sycl::exception(
239  "Device provided by native Queue not found in Context.");
240  }
241  // The following commented section provides a guideline on how to use the
242  // TLS enabled mechanism to create a tracepoint and notify using XPTI. This
243  // is the prolog section and the epilog section will initiate the
244  // notification.
245 #if XPTI_ENABLE_INSTRUMENTATION
246  constexpr uint16_t NotificationTraceType =
247  static_cast<uint16_t>(xpti::trace_point_type_t::queue_create);
248  XPTIScope PrepareNotify((void *)this, NotificationTraceType,
249  SYCL_STREAM_NAME, MQueueID, "queue_create");
250  if (xptiCheckTraceEnabled(PrepareNotify.streamID(),
251  NotificationTraceType)) {
252  // Cache the trace event, stream id and instance IDs for the destructor
253  MTraceEvent = (void *)PrepareNotify.traceEvent();
254  MStreamID = PrepareNotify.streamID();
255  MInstanceID = PrepareNotify.instanceID();
256 
257  // Add the function to capture meta data for the XPTI trace event
258  PrepareNotify.addMetadata([&](auto TEvent) {
259  xpti::addMetadata(TEvent, "sycl_context",
260  reinterpret_cast<size_t>(MContext->getHandleRef()));
261  if (MDevice) {
262  xpti::addMetadata(TEvent, "sycl_device_name",
263  MDevice->getDeviceName());
264  xpti::addMetadata(
265  TEvent, "sycl_device",
266  reinterpret_cast<size_t>(
267  MDevice->is_host() ? 0 : MDevice->getHandleRef()));
268  }
269  xpti::addMetadata(TEvent, "is_inorder", MIsInorder);
270  xpti::addMetadata(TEvent, "queue_id", MQueueID);
271  if (!MHostQueue)
272  xpti::addMetadata(TEvent, "queue_handle", getHandleRef());
273  });
274  // Also publish to TLS before notification
275  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, MQueueID);
276  PrepareNotify.notify();
277  }
278 #endif
279  }
280 
281 public:
289  const async_handler &AsyncHandler)
290  : MContext(Context), MAsyncHandler(AsyncHandler), MHostQueue(false),
291  MIsInorder(has_property<property::queue::in_order>()),
293  has_property<ext::oneapi::property::queue::discard_events>()),
294  MIsProfilingEnabled(has_property<property::queue::enable_profiling>()),
296  (MHostQueue ? true : MIsInorder)),
297  MQueueID{
299  queue_impl_interop(PiQueue);
300  }
301 
310  const async_handler &AsyncHandler, const property_list &PropList)
311  : MContext(Context), MAsyncHandler(AsyncHandler), MPropList(PropList),
312  MHostQueue(false),
313  MIsInorder(has_property<property::queue::in_order>()),
315  has_property<ext::oneapi::property::queue::discard_events>()),
316  MIsProfilingEnabled(has_property<property::queue::enable_profiling>()),
318  (MHostQueue ? true : MIsInorder)) {
319  queue_impl_interop(PiQueue);
320  }
321 
323  // The trace event created in the constructor should be active through the
324  // lifetime of the queue object as member variables when ABI breakage is
325  // allowed. This example shows MTraceEvent as a member variable.
326 #if XPTI_ENABLE_INSTRUMENTATION
327  constexpr uint16_t NotificationTraceType =
328  static_cast<uint16_t>(xpti::trace_point_type_t::queue_destroy);
329  if (xptiCheckTraceEnabled(MStreamID, NotificationTraceType)) {
330  // Used cached information in member variables
331  xptiNotifySubscribers(MStreamID, NotificationTraceType, nullptr,
332  (xpti::trace_event_data_t *)MTraceEvent,
333  MInstanceID,
334  static_cast<const void *>("queue_destroy"));
335  xptiReleaseEvent((xpti::trace_event_data_t *)MTraceEvent);
336  }
337 #endif
339  if (!MHostQueue) {
342  }
343  }
344 
346  cl_command_queue get() {
347  if (MHostQueue) {
348  throw invalid_object_error(
349  "This instance of queue doesn't support OpenCL interoperability",
350  PI_ERROR_INVALID_QUEUE);
351  }
353  return pi::cast<cl_command_queue>(MQueues[0]);
354  }
355 
358  return createSyclObjFromImpl<context>(MContext);
359  }
360 
361  const PluginPtr &getPlugin() const { return MContext->getPlugin(); }
362 
363  const ContextImplPtr &getContextImplPtr() const { return MContext; }
364 
365  const DeviceImplPtr &getDeviceImplPtr() const { return MDevice; }
366 
368  device get_device() const { return createSyclObjFromImpl<device>(MDevice); }
369 
371  bool is_host() const { return MHostQueue; }
372 
376  }
377 
378  bool isInOrder() const { return MIsInorder; }
379 
383  template <typename Param> typename Param::return_type get_info() const;
384 
388  template <typename Param>
389  typename Param::return_type get_backend_info() const;
390 
391  using SubmitPostProcessF = std::function<void(bool, bool, event &)>;
392 
406  event submit(const std::function<void(handler &)> &CGF,
407  const std::shared_ptr<queue_impl> &Self,
408  const std::shared_ptr<queue_impl> &SecondQueue,
409  const detail::code_location &Loc,
410  const SubmitPostProcessF *PostProcess = nullptr) {
411  event ResEvent;
412  try {
413  ResEvent = submit_impl(CGF, Self, Self, SecondQueue, Loc, PostProcess);
414  } catch (...) {
415  ResEvent = SecondQueue->submit_impl(CGF, SecondQueue, Self, SecondQueue,
416  Loc, PostProcess);
417  }
418  return discard_or_return(ResEvent);
419  }
420 
429  event submit(const std::function<void(handler &)> &CGF,
430  const std::shared_ptr<queue_impl> &Self,
431  const detail::code_location &Loc,
432  const SubmitPostProcessF *PostProcess = nullptr) {
433  auto ResEvent = submit_impl(CGF, Self, Self, nullptr, Loc, PostProcess);
434  return discard_or_return(ResEvent);
435  }
436 
442  void wait(const detail::code_location &Loc = {});
443 
446 
448  void wait_and_throw(const detail::code_location &Loc = {}) {
449  wait(Loc);
451  }
452 
461  if (!MAsyncHandler)
462  return;
463 
464  exception_list Exceptions;
465  {
466  std::lock_guard<std::mutex> Lock(MMutex);
467  std::swap(Exceptions, MExceptions);
468  }
469  // Unlock the mutex before calling user-provided handler to avoid
470  // potential deadlock if the same queue is somehow referenced in the
471  // handler.
472  if (Exceptions.size())
473  MAsyncHandler(std::move(Exceptions));
474  }
475 
483  sycl::detail::pi::PiQueueProperties CreationFlags = 0;
484 
485  if (Order == QueueOrder::OOO) {
487  }
488  if (PropList.has_property<property::queue::enable_profiling>()) {
489  CreationFlags |= PI_QUEUE_FLAG_PROFILING_ENABLE;
490  }
491  if (PropList.has_property<
492  ext::oneapi::cuda::property::queue::use_default_stream>()) {
493  CreationFlags |= __SYCL_PI_CUDA_USE_DEFAULT_STREAM;
494  }
495  if (PropList.has_property<ext::oneapi::property::queue::discard_events>()) {
496  // Pass this flag to the Level Zero plugin to be able to check it from
497  // queue property.
499  }
500  // Track that priority settings are not ambiguous.
501  bool PrioritySeen = false;
502  if (PropList
503  .has_property<ext::oneapi::property::queue::priority_normal>()) {
504  // Normal is the default priority, don't pass anything.
505  PrioritySeen = true;
506  }
507  if (PropList.has_property<ext::oneapi::property::queue::priority_low>()) {
508  if (PrioritySeen) {
509  throw sycl::exception(
511  "Queue cannot be constructed with different priorities.");
512  }
513  CreationFlags |= PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW;
514  PrioritySeen = true;
515  }
516  if (PropList.has_property<ext::oneapi::property::queue::priority_high>()) {
517  if (PrioritySeen) {
518  throw sycl::exception(
520  "Queue cannot be constructed with different priorities.");
521  }
523  }
524  // Track that submission modes do not conflict.
525  bool SubmissionSeen = false;
526  if (PropList.has_property<
527  ext::intel::property::queue::no_immediate_command_list>()) {
528  SubmissionSeen = true;
530  }
531  if (PropList.has_property<
532  ext::intel::property::queue::immediate_command_list>()) {
533  if (SubmissionSeen) {
534  throw sycl::exception(
536  "Queue cannot be constructed with different submission modes.");
537  }
538  SubmissionSeen = true;
540  }
541  return CreationFlags;
542  }
543 
550  sycl::detail::pi::PiContext Context = MContext->getHandleRef();
551  sycl::detail::pi::PiDevice Device = MDevice->getHandleRef();
552  const PluginPtr &Plugin = getPlugin();
553 
554  sycl::detail::pi::PiQueueProperties Properties[] = {
556  if (has_property<ext::intel::property::queue::compute_index>()) {
557  int Idx = get_property<ext::intel::property::queue::compute_index>()
558  .get_index();
559  Properties[2] = PI_QUEUE_COMPUTE_INDEX;
560  Properties[3] = static_cast<sycl::detail::pi::PiQueueProperties>(Idx);
561  }
563  Plugin->call_nocheck<PiApiKind::piextQueueCreate>(Context, Device,
564  Properties, &Queue);
565 
566  // If creating out-of-order queue failed and this property is not
567  // supported (for example, on FPGA), it will return
568  // PI_ERROR_INVALID_QUEUE_PROPERTIES and will try to create in-order queue.
569  if (!MEmulateOOO && Error == PI_ERROR_INVALID_QUEUE_PROPERTIES) {
570  MEmulateOOO = true;
572  } else {
573  Plugin->checkPiResult(Error);
574  }
575 
576  return Queue;
577  }
578 
582  sycl::detail::pi::PiQueue *PIQ = nullptr;
583  bool ReuseQueue = false;
584  {
585  std::lock_guard<std::mutex> Lock(MMutex);
586 
587  // To achieve parallelism for FPGA with in order execution model with
588  // possibility of two kernels to share data with each other we shall
589  // create a queue for every kernel enqueued.
590  if (MQueues.size() < MaxNumQueues) {
591  MQueues.push_back({});
592  PIQ = &MQueues.back();
593  } else {
594  // If the limit of OpenCL queues is going to be exceeded - take the
595  // earliest used queue, wait until it finished and then reuse it.
596  PIQ = &MQueues[MNextQueueIdx];
598  ReuseQueue = true;
599  }
600  }
601 
602  if (!ReuseQueue)
604  else
605  getPlugin()->call<PiApiKind::piQueueFinish>(*PIQ);
606 
607  return *PIQ;
608  }
609 
613  if (!MEmulateOOO)
614  return MQueues[0];
615 
617  }
618 
621  template <typename propertyT> bool has_property() const noexcept {
622  return MPropList.has_property<propertyT>();
623  }
624 
628  template <typename propertyT> propertyT get_property() const {
629  return MPropList.get_property<propertyT>();
630  }
631 
641  event memset(const std::shared_ptr<queue_impl> &Self, void *Ptr, int Value,
642  size_t Count, const std::vector<event> &DepEvents);
653  event memcpy(const std::shared_ptr<queue_impl> &Self, void *Dest,
654  const void *Src, size_t Count,
655  const std::vector<event> &DepEvents,
656  const code_location &CodeLoc);
667  event mem_advise(const std::shared_ptr<queue_impl> &Self, const void *Ptr,
668  size_t Length, pi_mem_advice Advice,
669  const std::vector<event> &DepEvents);
670 
674  void reportAsyncException(const std::exception_ptr &ExceptionPtr) {
675  std::lock_guard<std::mutex> Lock(MMutex);
676  MExceptions.PushBack(ExceptionPtr);
677  }
678 
681  }
682 
686  pi_native_handle getNative(int32_t &NativeHandleDesc) const;
687 
689  std::lock_guard<std::mutex> Lock(MStreamsServiceEventsMutex);
690  MStreamsServiceEvents.push_back(Event);
691  }
692 
693  bool ext_oneapi_empty() const;
694 
700  std::hash<typename std::shared_ptr<queue_impl>::element_type *>()(
701  this));
702  }
703 
704  event memcpyToDeviceGlobal(const std::shared_ptr<queue_impl> &Self,
705  void *DeviceGlobalPtr, const void *Src,
706  bool IsDeviceImageScope, size_t NumBytes,
707  size_t Offset,
708  const std::vector<event> &DepEvents);
709  event memcpyFromDeviceGlobal(const std::shared_ptr<queue_impl> &Self,
710  void *Dest, const void *DeviceGlobalPtr,
711  bool IsDeviceImageScope, size_t NumBytes,
712  size_t Offset,
713  const std::vector<event> &DepEvents);
714 
716 
718  std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> Graph) {
719  std::lock_guard<std::mutex> Lock(MMutex);
720  MGraph = Graph;
721  MGraphLastEventPtr = nullptr;
722  }
723 
724  std::shared_ptr<ext::oneapi::experimental::detail::graph_impl>
725  getCommandGraph() const {
726  return MGraph.lock();
727  }
728 
729  unsigned long long getQueueID() { return MQueueID; }
730 
731  void setExternalEvent(const event &Event) {
732  std::lock_guard<std::mutex> Lock(MInOrderExternalEventMtx);
733  MInOrderExternalEvent = Event;
734  }
735 
736  std::optional<event> popExternalEvent() {
737  std::lock_guard<std::mutex> Lock(MInOrderExternalEventMtx);
738  std::optional<event> Result = std::nullopt;
739  std::swap(Result, MInOrderExternalEvent);
740  return Result;
741  }
742 
743  const std::vector<event> &
744  getExtendDependencyList(const std::vector<event> &DepEvents,
745  std::vector<event> &MutableVec,
746  std::unique_lock<std::mutex> &QueueLock);
747 
748 protected:
749  event discard_or_return(const event &Event);
750  // Hook to the scheduler to clean up any fusion command held on destruction.
751  void cleanup_fusion_cmd();
752 
753  // template is needed for proper unit testing
754  template <typename HandlerType = handler>
755  void finalizeHandler(HandlerType &Handler, event &EventRet) {
756  if (MIsInorder) {
757  // Accessing and changing of an event isn't atomic operation.
758  // Hence, here is the lock for thread-safety.
759  std::lock_guard<std::mutex> Lock{MMutex};
760  // This dependency is needed for the following purposes:
761  // - host tasks are handled by the runtime and cannot be implicitly
762  // synchronized by the backend.
763  // - to prevent the 2nd kernel enqueue when the 1st kernel is blocked
764  // by a host task. This dependency allows to build the enqueue order in
765  // the RT but will not be passed to the backend. See getPIEvents in
766  // Command.
767  auto &EventToBuildDeps =
769  if (EventToBuildDeps)
770  Handler.depends_on(
771  createSyclObjFromImpl<sycl::event>(EventToBuildDeps));
772 
773  // If there is an external event set, add it as a dependency and clear it.
774  // We do not need to hold the lock as MLastEventMtx will ensure the last
775  // event reflects the corresponding external event dependence as well.
776  std::optional<event> ExternalEvent = popExternalEvent();
777  if (ExternalEvent)
778  Handler.depends_on(*ExternalEvent);
779 
780  EventRet = Handler.finalize();
781  EventToBuildDeps = getSyclObjImpl(EventRet);
782  } else
783  EventRet = Handler.finalize();
784  }
785 
796  event submit_impl(const std::function<void(handler &)> &CGF,
797  const std::shared_ptr<queue_impl> &Self,
798  const std::shared_ptr<queue_impl> &PrimaryQueue,
799  const std::shared_ptr<queue_impl> &SecondaryQueue,
800  const detail::code_location &Loc,
801  const SubmitPostProcessF *PostProcess) {
802  // Flag used to detect nested calls to submit and report an error.
803  thread_local static bool PreventSubmit = false;
804 
805  if (PreventSubmit) {
806  throw sycl::exception(
808  "Calls to sycl::queue::submit cannot be nested. Command group "
809  "function objects should use the sycl::handler API instead.");
810  }
811 
812  handler Handler(Self, PrimaryQueue, SecondaryQueue, MHostQueue);
813  Handler.saveCodeLoc(Loc);
814  PreventSubmit = true;
815  try {
816  CGF(Handler);
817  } catch (...) {
818  PreventSubmit = false;
819  throw;
820  }
821  PreventSubmit = false;
822 
823  // Scheduler will later omit events, that are not required to execute tasks.
824  // Host and interop tasks, however, are not submitted to low-level runtimes
825  // and require separate dependency management.
826  const CG::CGTYPE Type = Handler.getType();
827  event Event = detail::createSyclObjFromImpl<event>(
828  std::make_shared<detail::event_impl>());
829 
830  if (PostProcess) {
831  bool IsKernel = Type == CG::Kernel;
832  bool KernelUsesAssert = false;
833 
834  if (IsKernel)
835  // Kernel only uses assert if it's non interop one
836  KernelUsesAssert = !(Handler.MKernel && Handler.MKernel->isInterop()) &&
838  Handler.MKernelName.c_str());
839  finalizeHandler(Handler, Event);
840 
841  (*PostProcess)(IsKernel, KernelUsesAssert, Event);
842  } else
843  finalizeHandler(Handler, Event);
844 
845  addEvent(Event);
846  return Event;
847  }
848 
854  template <typename HandlerFuncT>
855  event submitWithHandler(const std::shared_ptr<queue_impl> &Self,
856  const std::vector<event> &DepEvents,
857  HandlerFuncT HandlerFunc);
858 
872  template <typename HandlerFuncT, typename MemMngrFuncT,
873  typename... MemMngrArgTs>
874  event submitMemOpHelper(const std::shared_ptr<queue_impl> &Self,
875  const std::vector<event> &DepEvents,
876  HandlerFuncT HandlerFunc, MemMngrFuncT MemMngrFunc,
877  MemMngrArgTs... MemOpArgs);
878 
879  // When instrumentation is enabled emits trace event for wait begin and
880  // returns the telemetry event generated for the wait
881  void *instrumentationProlog(const detail::code_location &CodeLoc,
882  std::string &Name, int32_t StreamID,
883  uint64_t &iid);
884  // Uses events generated by the Prolog and emits wait done event
885  void instrumentationEpilog(void *TelementryEvent, std::string &Name,
886  int32_t StreamID, uint64_t IId);
887 
893  void addSharedEvent(const event &Event);
894 
898  void addEvent(const event &Event);
899 
901  mutable std::mutex MMutex;
902 
905 
907  std::vector<std::weak_ptr<event_impl>> MEventsWeak;
908 
912  std::vector<event> MEventsShared;
916 
918  std::vector<sycl::detail::pi::PiQueue> MQueues;
920  size_t MNextQueueIdx = 0;
921 
922  const bool MHostQueue = false;
925  bool MEmulateOOO = false;
926 
927  // This event is employed for enhanced dependency tracking with in-order queue
928  // Access to the event should be guarded with MMutex
930  // Same as above but for graph begin-end recording cycle.
931  // Track deps within graph commands separately.
932  // Protected by common queue object mutex MMutex.
934 
935  const bool MIsInorder;
936 
937  std::vector<EventImplPtr> MStreamsServiceEvents;
939 
940  // All member variable defined here are needed for the SYCL instrumentation
941  // layer. Do not guard these variables below with XPTI_ENABLE_INSTRUMENTATION
942  // to ensure we have the same object layout when the macro in the library and
943  // SYCL app are not the same.
944  void *MTraceEvent = nullptr;
946  uint8_t MStreamID = 0;
948  uint64_t MInstanceID = 0;
949 
950  // the fallback implementation of profiling info
951  bool MFallbackProfiling = false;
952 
953  // This event can be optionally provided by users for in-order queues to add
954  // an additional dependency for the subsequent submission in to the queue.
955  // Access to the event should be guarded with MInOrderExternalEventMtx.
956  // NOTE: std::optional must not be exposed in the ABI.
957  std::optional<event> MInOrderExternalEvent;
958  mutable std::mutex MInOrderExternalEventMtx;
959 
960 public:
961  // Queue constructed with the discard_events property
962  const bool MDiscardEvents;
964 
965 protected:
966  // Indicates whether the queue supports discarding PI events for tasks
967  // submitted to it. This condition is necessary but not sufficient, PI events
968  // should be discarded only if they also don't represent potential implicit
969  // dependencies for future tasks in other queues.
971 
972  // Command graph which is associated with this queue for the purposes of
973  // recording commands to it.
974  std::weak_ptr<ext::oneapi::experimental::detail::graph_impl> MGraph{};
975 
976  unsigned long long MQueueID;
977  static std::atomic<unsigned long long> MNextAvailableQueueID;
978 
980 };
981 
982 } // namespace detail
983 } // namespace _V1
984 } // namespace sycl
The context class represents a SYCL context on which kernel functions may be executed.
Definition: context.hpp:51
CGTYPE
Type of the command group.
Definition: cg.hpp:56
static GlobalHandler & instance()
static ProgramManager & getInstance()
bool kernelUsesAssert(const std::string &KernelName) const
bool isInFusionMode(QueueIdT Queue)
Definition: scheduler.cpp:645
static Scheduler & getInstance()
Definition: scheduler.cpp:261
event discard_or_return(const event &Event)
Definition: queue_impl.cpp:599
const property_list MPropList
Definition: queue_impl.hpp:915
bool is_in_fusion_mode()
Check whether the queue is in fusion mode.
Definition: queue_impl.hpp:698
event submit(const std::function< void(handler &)> &CGF, const std::shared_ptr< queue_impl > &Self, const std::shared_ptr< queue_impl > &SecondQueue, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess=nullptr)
Submits a command group function object to the queue, in order to be scheduled for execution on the device.
Definition: queue_impl.hpp:406
uint64_t MInstanceID
The instance ID of the trace event for queue object.
Definition: queue_impl.hpp:948
std::vector< EventImplPtr > MStreamsServiceEvents
Definition: queue_impl.hpp:937
void wait_and_throw(const detail::code_location &Loc={})
Definition: queue_impl.hpp:448
Param::return_type get_info() const
Queries SYCL queue for information.
std::optional< event > MInOrderExternalEvent
Definition: queue_impl.hpp:957
std::optional< event > popExternalEvent()
Definition: queue_impl.hpp:736
sycl::detail::pi::PiQueue createQueue(QueueOrder Order)
Creates PI queue.
Definition: queue_impl.hpp:548
void registerStreamServiceEvent(const EventImplPtr &Event)
Definition: queue_impl.hpp:688
static std::atomic< unsigned long long > MNextAvailableQueueID
Definition: queue_impl.hpp:977
void addEvent(const event &Event)
Stores an event that should be associated with the queue.
Definition: queue_impl.cpp:273
std::vector< sycl::detail::pi::PiQueue > MQueues
List of queues created for FPGA device from a single SYCL queue.
Definition: queue_impl.hpp:918
pi_native_handle getNative(int32_t &NativeHandleDesc) const
Gets the native handle of the SYCL queue.
Definition: queue_impl.cpp:540
sycl::detail::pi::PiQueue & getExclusiveQueueHandleRef()
Definition: queue_impl.hpp:581
unsigned long long MQueueID
Definition: queue_impl.hpp:976
std::vector< std::weak_ptr< event_impl > > MEventsWeak
These events are tracked, but not owned, by the queue.
Definition: queue_impl.hpp:907
event submitMemOpHelper(const std::shared_ptr< queue_impl > &Self, const std::vector< event > &DepEvents, HandlerFuncT HandlerFunc, MemMngrFuncT MemMngrFunc, MemMngrArgTs... MemOpArgs)
Performs submission of a memory operation directly if scheduler can be bypassed, or with a handler otherwise.
bool has_property() const noexcept
Definition: queue_impl.hpp:621
event submit(const std::function< void(handler &)> &CGF, const std::shared_ptr< queue_impl > &Self, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess=nullptr)
Submits a command group function object to the queue, in order to be scheduled for execution on the device.
Definition: queue_impl.hpp:429
std::vector< event > MEventsShared
Events without data dependencies (such as USM) need an owner, additionally, USM operations are not ad...
Definition: queue_impl.hpp:912
std::mutex MMutex
Protects all the fields that can be changed by class' methods.
Definition: queue_impl.hpp:901
void setCommandGraph(std::shared_ptr< ext::oneapi::experimental::detail::graph_impl > Graph)
Definition: queue_impl.hpp:717
queue_impl(sycl::detail::pi::PiQueue PiQueue, const ContextImplPtr &Context, const async_handler &AsyncHandler)
Constructs a SYCL queue from plugin interoperability handle.
Definition: queue_impl.hpp:288
bool supportsDiscardingPiEvents() const
Definition: queue_impl.hpp:374
event submitWithHandler(const std::shared_ptr< queue_impl > &Self, const std::vector< event > &DepEvents, HandlerFuncT HandlerFunc)
Helper function for submitting a memory operation with a handler.
Definition: queue_impl.cpp:326
std::function< void(bool, bool, event &)> SubmitPostProcessF
Definition: queue_impl.hpp:391
void addSharedEvent(const event &Event)
queue_impl.addEvent tracks events with weak pointers but some events have no other owners.
Definition: queue_impl.cpp:296
const ContextImplPtr MContext
Definition: queue_impl.hpp:904
void finalizeHandler(HandlerType &Handler, event &EventRet)
Definition: queue_impl.hpp:755
void setExternalEvent(const event &Event)
Definition: queue_impl.hpp:731
sycl::detail::pi::PiQueue & getHandleRef()
Definition: queue_impl.hpp:612
void throw_asynchronous()
Performs a blocking wait for the completion of all enqueued tasks in the queue.
Definition: queue_impl.hpp:460
event memset(const std::shared_ptr< queue_impl > &Self, void *Ptr, int Value, size_t Count, const std::vector< event > &DepEvents)
Fills the memory pointed by a USM pointer with the value specified.
Definition: queue_impl.cpp:133
queue_impl(const DeviceImplPtr &Device, const ContextImplPtr &Context, const async_handler &AsyncHandler, const property_list &PropList)
Constructs a SYCL queue with an async_handler and property_list provided from a device and a context.
Definition: queue_impl.hpp:106
const PluginPtr & getPlugin() const
Definition: queue_impl.hpp:361
event submit_impl(const std::function< void(handler &)> &CGF, const std::shared_ptr< queue_impl > &Self, const std::shared_ptr< queue_impl > &PrimaryQueue, const std::shared_ptr< queue_impl > &SecondaryQueue, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess)
Performs command group submission to the queue.
Definition: queue_impl.hpp:796
const DeviceImplPtr & getDeviceImplPtr() const
Definition: queue_impl.hpp:365
event mem_advise(const std::shared_ptr< queue_impl > &Self, const void *Ptr, size_t Length, pi_mem_advice Advice, const std::vector< event > &DepEvents)
Provides additional information to the underlying runtime about how different allocations are used.
Definition: queue_impl.cpp:219
const async_handler MAsyncHandler
Definition: queue_impl.hpp:914
event memcpyToDeviceGlobal(const std::shared_ptr< queue_impl > &Self, void *DeviceGlobalPtr, const void *Src, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, const std::vector< event > &DepEvents)
Definition: queue_impl.cpp:230
exception_list getExceptionList() const
Definition: queue_impl.hpp:445
event memcpyFromDeviceGlobal(const std::shared_ptr< queue_impl > &Self, void *Dest, const void *DeviceGlobalPtr, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, const std::vector< event > &DepEvents)
Definition: queue_impl.cpp:246
event memcpy(const std::shared_ptr< queue_impl > &Self, void *Dest, const void *Src, size_t Count, const std::vector< event > &DepEvents, const code_location &CodeLoc)
Copies data from one memory region to another, both pointed by USM pointers.
Definition: queue_impl.cpp:180
uint8_t MStreamID
The stream under which the traces are emitted from the queue object.
Definition: queue_impl.hpp:946
void reportAsyncException(const std::exception_ptr &ExceptionPtr)
Puts an exception into the list of asynchronous exceptions.
Definition: queue_impl.hpp:674
void wait(const detail::code_location &Loc={})
Performs a blocking wait for the completion of all enqueued tasks in the queue.
Definition: queue_impl.cpp:472
std::shared_ptr< ext::oneapi::experimental::detail::graph_impl > getCommandGraph() const
Definition: queue_impl.hpp:725
bool MEmulateOOO
Indicates that a native out-of-order queue could not be created and we need to emulate it with multip...
Definition: queue_impl.hpp:925
void instrumentationEpilog(void *TelementryEvent, std::string &Name, int32_t StreamID, uint64_t IId)
Definition: queue_impl.cpp:453
static sycl::detail::pi::PiQueueProperties createPiQueueProperties(const property_list &PropList, QueueOrder Order)
Creates PI properties array.
Definition: queue_impl.hpp:482
const ContextImplPtr & getContextImplPtr() const
Definition: queue_impl.hpp:363
void * instrumentationProlog(const detail::code_location &CodeLoc, std::string &Name, int32_t StreamID, uint64_t &iid)
Definition: queue_impl.cpp:384
size_t MNextQueueIdx
Iterator through MQueues.
Definition: queue_impl.hpp:920
std::weak_ptr< ext::oneapi::experimental::detail::graph_impl > MGraph
Definition: queue_impl.hpp:974
Param::return_type get_backend_info() const
Queries SYCL queue for SYCL backend-specific information.
const std::vector< event > & getExtendDependencyList(const std::vector< event > &DepEvents, std::vector< event > &MutableVec, std::unique_lock< std::mutex > &QueueLock)
Definition: queue_impl.cpp:111
queue_impl(const DeviceImplPtr &Device, const async_handler &AsyncHandler, const property_list &PropList)
Constructs a SYCL queue from a device using an async_handler and property_list provided.
Definition: queue_impl.hpp:93
static ContextImplPtr getDefaultOrNew(const DeviceImplPtr &Device)
Definition: queue_impl.hpp:74
unsigned long long getQueueID()
Definition: queue_impl.hpp:729
queue_impl(sycl::detail::pi::PiQueue PiQueue, const ContextImplPtr &Context, const async_handler &AsyncHandler, const property_list &PropList)
Constructs a SYCL queue from plugin interoperability handle.
Definition: queue_impl.hpp:309
propertyT get_property() const
Definition: queue_impl.hpp:628
const char * c_str() const noexcept
Definition: string.hpp:60
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Definition: device.hpp:64
An event object can be used to synchronize memory transfers, enqueues of kernels and signaling barrie...
Definition: event.hpp:44
A list of asynchronous exceptions.
Implementation of node class from SYCL_EXT_ONEAPI_GRAPH.
Definition: graph_impl.hpp:77
Command group handler class.
Definition: handler.hpp:458
Objects of the property_list class are containers for the SYCL properties.
bool has_property() const noexcept
Encapsulates a single SYCL queue which schedules kernels on a SYCL device.
Definition: queue.hpp:111
#define __SYCL_PI_CUDA_USE_DEFAULT_STREAM
::pi_queue PiQueue
Definition: pi.hpp:139
::pi_queue_properties PiQueueProperties
Definition: pi.hpp:140
CUDAContextT
Possible CUDA context types supported by PI CUDA backend. TODO: Implement this as a property once ther...
Definition: queue_impl.hpp:63
constexpr const char * SYCL_STREAM_NAME
std::shared_ptr< sycl::detail::context_impl > ContextImplPtr
Definition: event_impl.hpp:32
decltype(Obj::impl) getSyclObjImpl(const Obj &SyclObject)
Definition: impl_utils.hpp:30
std::shared_ptr< event_impl > EventImplPtr
Definition: cg.hpp:43
std::shared_ptr< plugin > PluginPtr
Definition: pi.hpp:48
std::shared_ptr< device_impl > DeviceImplPtr
static constexpr size_t MaxNumQueues
Sets max number of queues supported by FPGA RT.
Definition: queue_impl.hpp:59
constexpr CUDAContextT DefaultContextType
Default context type created for CUDA backend.
Definition: queue_impl.hpp:66
constexpr auto memory_order_relaxed
std::function< void(sycl::exception_list)> async_handler
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using E and sycl_category().
Definition: exception.cpp:87
Definition: access.hpp:18
constexpr pi_queue_properties PI_QUEUE_COMPUTE_INDEX
Definition: pi.h:795
pi_result piQueueFinish(pi_queue command_queue)
Definition: pi_cuda.cpp:186
uintptr_t pi_native_handle
Definition: pi.h:217
_pi_result
Definition: pi.h:224
constexpr pi_queue_properties PI_EXT_QUEUE_FLAG_SUBMISSION_IMMEDIATE
Definition: pi.h:805
@ PI_QUEUE_INFO_DEVICE
Definition: pi.h:498
_pi_mem_advice
Definition: pi.h:599
constexpr pi_queue_properties PI_EXT_QUEUE_FLAG_SUBMISSION_NO_IMMEDIATE
Definition: pi.h:804
constexpr pi_queue_properties PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE
Definition: pi.h:797
pi_result piextQueueCreate(pi_context context, pi_device device, pi_queue_properties *properties, pi_queue *queue)
Definition: pi_cuda.cpp:167
pi_result piQueueRelease(pi_queue command_queue)
Definition: pi_cuda.cpp:182
constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW
Definition: pi.h:802
constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_FLAG_DISCARD_EVENTS
Definition: pi.h:801
constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH
Definition: pi.h:803
constexpr pi_queue_properties PI_QUEUE_FLAGS
Definition: pi.h:794
constexpr pi_queue_properties PI_QUEUE_FLAG_PROFILING_ENABLE
Definition: pi.h:798
pi_result piQueueGetInfo(pi_queue command_queue, pi_queue_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_cuda.cpp:172
pi_result piQueueRetain(pi_queue command_queue)
Definition: pi_cuda.cpp:180
_Abi const simd< _Tp, _Abi > & noexcept
Definition: simd.hpp:1324