DPC++ Runtime
Runtime libraries for oneAPI DPC++
queue_impl.hpp
Go to the documentation of this file.
1 //==------------------ queue_impl.hpp - SYCL queue -------------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #pragma once
10 
11 #include <detail/config.hpp>
12 #include <detail/context_impl.hpp>
13 #include <detail/device_impl.hpp>
14 #include <detail/device_info.hpp>
15 #include <detail/event_impl.hpp>
17 #include <detail/kernel_impl.hpp>
18 #include <detail/plugin.hpp>
20 #include <detail/thread_pool.hpp>
21 #include <sycl/context.hpp>
24 #include <sycl/device.hpp>
25 #include <sycl/event.hpp>
26 #include <sycl/exception.hpp>
27 #include <sycl/exception_list.hpp>
29 #include <sycl/handler.hpp>
32 #include <sycl/property_list.hpp>
33 #include <sycl/queue.hpp>
34 #include <sycl/stl.hpp>
35 
36 #include "detail/graph_impl.hpp"
37 
38 #include <utility>
39 
40 #ifdef XPTI_ENABLE_INSTRUMENTATION
41 #include "xpti/xpti_trace_framework.hpp"
42 #include <detail/xpti_registry.hpp>
43 #endif
44 
45 namespace sycl {
46 inline namespace _V1 {
47 
48 // forward declaration
49 
50 namespace ext::oneapi::experimental::detail {
51 class graph_impl;
52 }
53 
54 namespace detail {
55 
56 using ContextImplPtr = std::shared_ptr<detail::context_impl>;
57 using DeviceImplPtr = std::shared_ptr<detail::device_impl>;
58 
/// Upper bound on the number of native queues a single queue_impl keeps in
/// MQueues. Once that many exist, getExclusiveQueueHandleRef() stops appending
/// and hands out an already created queue instead.
///
/// Declared `inline` so that every translation unit including this header
/// refers to one and the same constant.
inline constexpr size_t MaxNumQueues = 256;
61 
/// Kinds of CUDA context: `primary` or `custom`, stored in a single char.
/// NOTE(review): not referenced in the visible part of this header; confirm
/// where it is consumed before changing the enumerators.
enum class CUDAContextT : char {
  primary = 0,
  custom = 1,
};
65 
68 
70 
71 class queue_impl {
72 public:
73  // \return a default context for the platform if it includes the device
74  // passed and default contexts are enabled, a new context otherwise.
78  context{createSyclObjFromImpl<device>(Device), {}, {}});
79 
80  ContextImplPtr DefaultContext = detail::getSyclObjImpl(
81  Device->get_platform().ext_oneapi_get_default_context());
82  if (DefaultContext->isDeviceValid(Device))
83  return DefaultContext;
85  context{createSyclObjFromImpl<device>(Device), {}, {}});
86  }
94  queue_impl(const DeviceImplPtr &Device, const async_handler &AsyncHandler,
95  const property_list &PropList)
96  : queue_impl(Device, getDefaultOrNew(Device), AsyncHandler, PropList){};
97 
107  queue_impl(const DeviceImplPtr &Device, const ContextImplPtr &Context,
108  const async_handler &AsyncHandler, const property_list &PropList)
109  : MDevice(Device), MContext(Context), MAsyncHandler(AsyncHandler),
110  MPropList(PropList), MHostQueue(MDevice->is_host()),
111 #ifndef __INTEL_PREVIEW_BREAKING_CHANGES
113 #endif
114  MIsInorder(has_property<property::queue::in_order>()),
116  has_property<ext::oneapi::property::queue::discard_events>()),
117  MIsProfilingEnabled(has_property<property::queue::enable_profiling>()),
119  (MHostQueue ? true : MIsInorder)),
120  MQueueID{
122  if (has_property<property::queue::enable_profiling>()) {
123  if (has_property<ext::oneapi::property::queue::discard_events>())
125  "Queue cannot be constructed with both of "
126  "discard_events and enable_profiling.");
127  // fallback profiling support. See MFallbackProfiling
128  if (MDevice->has(aspect::queue_profiling)) {
129  // When piGetDeviceAndHostTimer is not supported, compute the
130  // profiling time OpenCL version < 2.1 case
131  if (!getDeviceImplPtr()->is_host() &&
132  !getDeviceImplPtr()->isGetDeviceAndHostTimerSupported())
133  MFallbackProfiling = true;
134  } else {
136  "Cannot enable profiling, the associated device "
137  "does not have the queue_profiling aspect");
138  }
139  }
140  if (has_property<ext::intel::property::queue::compute_index>()) {
141  int Idx = get_property<ext::intel::property::queue::compute_index>()
142  .get_index();
143  int NumIndices =
144  createSyclObjFromImpl<device>(Device)
145  .get_info<ext::intel::info::device::max_compute_queue_indices>();
146  if (Idx < 0 || Idx >= NumIndices)
147  throw sycl::exception(
149  "Queue compute index must be a non-negative number less than "
150  "device's number of available compute queue indices.");
151  }
152  if (has_property<
153  ext::codeplay::experimental::property::queue::enable_fusion>() &&
154  !MDevice->get_info<
155  ext::codeplay::experimental::info::device::supports_fusion>()) {
156  throw sycl::exception(
158  "Cannot enable fusion if device does not support fusion");
159  }
160  if (!Context->isDeviceValid(Device)) {
161  if (!Context->is_host() && Context->getBackend() == backend::opencl)
162  throw sycl::invalid_object_error(
163  "Queue cannot be constructed with the given context and device "
164  "since the device is not a member of the context (descendants of "
165  "devices from the context are not supported on OpenCL yet).",
166  PI_ERROR_INVALID_DEVICE);
167  throw sycl::invalid_object_error(
168  "Queue cannot be constructed with the given context and device "
169  "since the device is neither a member of the context nor a "
170  "descendant of its member.",
171  PI_ERROR_INVALID_DEVICE);
172  }
173  if (!MHostQueue) {
174  const QueueOrder QOrder =
176  MQueues.push_back(createQueue(QOrder));
177  // This section is the second part of the instrumentation that uses the
178  // tracepoint information and notifies
179  }
180  // We enable XPTI tracing events using the TLS mechanism; if the code
181  // location data is available, then the tracing data will be rich.
182 #if XPTI_ENABLE_INSTRUMENTATION
183  constexpr uint16_t NotificationTraceType =
184  static_cast<uint16_t>(xpti::trace_point_type_t::queue_create);
185  XPTIScope PrepareNotify((void *)this, NotificationTraceType,
186  SYCL_STREAM_NAME, "queue_create");
187  // Cache the trace event, stream id and instance IDs for the destructor
188  if (xptiCheckTraceEnabled(PrepareNotify.streamID(),
189  NotificationTraceType)) {
190  MTraceEvent = (void *)PrepareNotify.traceEvent();
191  MStreamID = PrepareNotify.streamID();
192  MInstanceID = PrepareNotify.instanceID();
193  // Add the function to capture meta data for the XPTI trace event
194  PrepareNotify.addMetadata([&](auto TEvent) {
195  xpti::addMetadata(TEvent, "sycl_context",
196  reinterpret_cast<size_t>(MContext->getHandleRef()));
197  if (MDevice) {
198  xpti::addMetadata(TEvent, "sycl_device_name",
199  MDevice->getDeviceName());
200  xpti::addMetadata(
201  TEvent, "sycl_device",
202  reinterpret_cast<size_t>(
203  MDevice->is_host() ? 0 : MDevice->getHandleRef()));
204  }
205  xpti::addMetadata(TEvent, "is_inorder", MIsInorder);
206  xpti::addMetadata(TEvent, "queue_id", MQueueID);
207  if (!MHostQueue)
208  xpti::addMetadata(TEvent, "queue_handle",
209  reinterpret_cast<size_t>(getHandleRef()));
210  });
211  PrepareNotify.notify();
212  }
213 #endif
214  }
215 
216  event getLastEvent();
217 
218 private:
219  void queue_impl_interop(sycl::detail::pi::PiQueue PiQueue) {
220  if (has_property<ext::oneapi::property::queue::discard_events>() &&
221  has_property<property::queue::enable_profiling>()) {
223  "Queue cannot be constructed with both of "
224  "discard_events and enable_profiling.");
225  }
226 
227  MQueues.push_back(pi::cast<sycl::detail::pi::PiQueue>(PiQueue));
228 
229  sycl::detail::pi::PiDevice DevicePI{};
230  const PluginPtr &Plugin = getPlugin();
231  // TODO catch an exception and put it to list of asynchronous exceptions
232  Plugin->call<PiApiKind::piQueueGetInfo>(
233  MQueues[0], PI_QUEUE_INFO_DEVICE, sizeof(DevicePI), &DevicePI, nullptr);
234  MDevice = MContext->findMatchingDeviceImpl(DevicePI);
235  if (MDevice == nullptr) {
236  throw sycl::exception(
238  "Device provided by native Queue not found in Context.");
239  }
240  // The following commented section provides a guideline on how to use the
241  // TLS enabled mechanism to create a tracepoint and notify using XPTI. This
242  // is the prolog section and the epilog section will initiate the
243  // notification.
244 #if XPTI_ENABLE_INSTRUMENTATION
245  constexpr uint16_t NotificationTraceType =
246  static_cast<uint16_t>(xpti::trace_point_type_t::queue_create);
247  XPTIScope PrepareNotify((void *)this, NotificationTraceType,
248  SYCL_STREAM_NAME, "queue_create");
249  if (xptiCheckTraceEnabled(PrepareNotify.streamID(),
250  NotificationTraceType)) {
251  // Cache the trace event, stream id and instance IDs for the destructor
252  MTraceEvent = (void *)PrepareNotify.traceEvent();
253  MStreamID = PrepareNotify.streamID();
254  MInstanceID = PrepareNotify.instanceID();
255 
256  // Add the function to capture meta data for the XPTI trace event
257  PrepareNotify.addMetadata([&](auto TEvent) {
258  xpti::addMetadata(TEvent, "sycl_context",
259  reinterpret_cast<size_t>(MContext->getHandleRef()));
260  if (MDevice) {
261  xpti::addMetadata(TEvent, "sycl_device_name",
262  MDevice->getDeviceName());
263  xpti::addMetadata(
264  TEvent, "sycl_device",
265  reinterpret_cast<size_t>(
266  MDevice->is_host() ? 0 : MDevice->getHandleRef()));
267  }
268  xpti::addMetadata(TEvent, "is_inorder", MIsInorder);
269  xpti::addMetadata(TEvent, "queue_id", MQueueID);
270  if (!MHostQueue)
271  xpti::addMetadata(TEvent, "queue_handle", getHandleRef());
272  });
273  PrepareNotify.notify();
274  }
275 #endif
276  }
277 
278 public:
286  const async_handler &AsyncHandler)
287  : MContext(Context), MAsyncHandler(AsyncHandler), MHostQueue(false),
288 #ifndef __INTEL_PREVIEW_BREAKING_CHANGES
290 #endif
291  MIsInorder(has_property<property::queue::in_order>()),
293  has_property<ext::oneapi::property::queue::discard_events>()),
294  MIsProfilingEnabled(has_property<property::queue::enable_profiling>()),
296  (MHostQueue ? true : MIsInorder)),
297  MQueueID{
299  queue_impl_interop(PiQueue);
300  }
301 
310  const async_handler &AsyncHandler, const property_list &PropList)
311  : MContext(Context), MAsyncHandler(AsyncHandler), MPropList(PropList),
312  MHostQueue(false),
313 #ifndef __INTEL_PREVIEW_BREAKING_CHANGES
315 #endif
316  MIsInorder(has_property<property::queue::in_order>()),
318  has_property<ext::oneapi::property::queue::discard_events>()),
319  MIsProfilingEnabled(has_property<property::queue::enable_profiling>()),
321  (MHostQueue ? true : MIsInorder)) {
322  queue_impl_interop(PiQueue);
323  }
324 
326  // The trace event created in the constructor should be active through the
327  // lifetime of the queue object as member variables when ABI breakage is
328  // allowed. This example shows MTraceEvent as a member variable.
329 #if XPTI_ENABLE_INSTRUMENTATION
330  constexpr uint16_t NotificationTraceType =
331  static_cast<uint16_t>(xpti::trace_point_type_t::queue_destroy);
332  if (xptiCheckTraceEnabled(MStreamID, NotificationTraceType)) {
333  // Use cached information in member variables
334  xptiNotifySubscribers(MStreamID, NotificationTraceType, nullptr,
335  (xpti::trace_event_data_t *)MTraceEvent,
336  MInstanceID,
337  static_cast<const void *>("queue_destroy"));
338  xptiReleaseEvent((xpti::trace_event_data_t *)MTraceEvent);
339  }
340 #endif
342  if (!MHostQueue) {
345  }
346  }
347 
349  cl_command_queue get() {
350  if (MHostQueue) {
351  throw invalid_object_error(
352  "This instance of queue doesn't support OpenCL interoperability",
353  PI_ERROR_INVALID_QUEUE);
354  }
356  return pi::cast<cl_command_queue>(MQueues[0]);
357  }
358 
361  return createSyclObjFromImpl<context>(MContext);
362  }
363 
364  const PluginPtr &getPlugin() const { return MContext->getPlugin(); }
365 
366  const ContextImplPtr &getContextImplPtr() const { return MContext; }
367 
368  const DeviceImplPtr &getDeviceImplPtr() const { return MDevice; }
369 
371  device get_device() const { return createSyclObjFromImpl<device>(MDevice); }
372 
374  bool is_host() const { return MHostQueue; }
375 
379  }
380 
381  bool isInOrder() const { return MIsInorder; }
382 
386  template <typename Param> typename Param::return_type get_info() const;
387 
388  using SubmitPostProcessF = std::function<void(bool, bool, event &)>;
389 
403  event submit(const std::function<void(handler &)> &CGF,
404  const std::shared_ptr<queue_impl> &Self,
405  const std::shared_ptr<queue_impl> &SecondQueue,
406  const detail::code_location &Loc,
407  const SubmitPostProcessF *PostProcess = nullptr) {
408  event ResEvent;
409  try {
410  ResEvent = submit_impl(CGF, Self, Self, SecondQueue, Loc, PostProcess);
411  } catch (...) {
412  ResEvent = SecondQueue->submit_impl(CGF, SecondQueue, Self, SecondQueue,
413  Loc, PostProcess);
414  }
415  return discard_or_return(ResEvent);
416  }
417 
426  event submit(const std::function<void(handler &)> &CGF,
427  const std::shared_ptr<queue_impl> &Self,
428  const detail::code_location &Loc,
429  const SubmitPostProcessF *PostProcess = nullptr) {
430  auto ResEvent = submit_impl(CGF, Self, Self, nullptr, Loc, PostProcess);
431  return discard_or_return(ResEvent);
432  }
433 
439  void wait(const detail::code_location &Loc = {});
440 
443 
445  void wait_and_throw(const detail::code_location &Loc = {}) {
446  wait(Loc);
448  }
449 
458  if (!MAsyncHandler)
459  return;
460 
461  exception_list Exceptions;
462  {
463  std::lock_guard<std::mutex> Lock(MMutex);
464  std::swap(Exceptions, MExceptions);
465  }
466  // Unlock the mutex before calling user-provided handler to avoid
467  // potential deadlock if the same queue is somehow referenced in the
468  // handler.
469  if (Exceptions.size())
470  MAsyncHandler(std::move(Exceptions));
471  }
472 
480  sycl::detail::pi::PiQueueProperties CreationFlags = 0;
481 
482  if (Order == QueueOrder::OOO) {
484  }
485  if (PropList.has_property<property::queue::enable_profiling>()) {
486  CreationFlags |= PI_QUEUE_FLAG_PROFILING_ENABLE;
487  }
488  if (PropList.has_property<
489  ext::oneapi::cuda::property::queue::use_default_stream>()) {
490  CreationFlags |= __SYCL_PI_CUDA_USE_DEFAULT_STREAM;
491  }
492  if (PropList.has_property<ext::oneapi::property::queue::discard_events>()) {
493  // Pass this flag to the Level Zero plugin to be able to check it from
494  // queue property.
496  }
497  // Track that priority settings are not ambiguous.
498  bool PrioritySeen = false;
499  if (PropList
500  .has_property<ext::oneapi::property::queue::priority_normal>()) {
501  // Normal is the default priority, don't pass anything.
502  PrioritySeen = true;
503  }
504  if (PropList.has_property<ext::oneapi::property::queue::priority_low>()) {
505  if (PrioritySeen) {
506  throw sycl::exception(
508  "Queue cannot be constructed with different priorities.");
509  }
510  CreationFlags |= PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW;
511  PrioritySeen = true;
512  }
513  if (PropList.has_property<ext::oneapi::property::queue::priority_high>()) {
514  if (PrioritySeen) {
515  throw sycl::exception(
517  "Queue cannot be constructed with different priorities.");
518  }
520  }
521  // Track that submission modes do not conflict.
522  bool SubmissionSeen = false;
523  if (PropList.has_property<
524  ext::intel::property::queue::no_immediate_command_list>()) {
525  SubmissionSeen = true;
527  }
528  if (PropList.has_property<
529  ext::intel::property::queue::immediate_command_list>()) {
530  if (SubmissionSeen) {
531  throw sycl::exception(
533  "Queue cannot be constructed with different submission modes.");
534  }
535  SubmissionSeen = true;
537  }
538  return CreationFlags;
539  }
540 
547  sycl::detail::pi::PiContext Context = MContext->getHandleRef();
548  sycl::detail::pi::PiDevice Device = MDevice->getHandleRef();
549  const PluginPtr &Plugin = getPlugin();
550 
551  sycl::detail::pi::PiQueueProperties Properties[] = {
553  if (has_property<ext::intel::property::queue::compute_index>()) {
554  int Idx = get_property<ext::intel::property::queue::compute_index>()
555  .get_index();
556  Properties[2] = PI_QUEUE_COMPUTE_INDEX;
557  Properties[3] = static_cast<sycl::detail::pi::PiQueueProperties>(Idx);
558  }
560  Plugin->call_nocheck<PiApiKind::piextQueueCreate>(Context, Device,
561  Properties, &Queue);
562 
563  // If creating out-of-order queue failed and this property is not
564  // supported (for example, on FPGA), it will return
565  // PI_ERROR_INVALID_QUEUE_PROPERTIES and will try to create in-order queue.
566  if (!MEmulateOOO && Error == PI_ERROR_INVALID_QUEUE_PROPERTIES) {
567  MEmulateOOO = true;
569  } else {
570  Plugin->checkPiResult(Error);
571  }
572 
573  return Queue;
574  }
575 
579  sycl::detail::pi::PiQueue *PIQ = nullptr;
580  bool ReuseQueue = false;
581  {
582  std::lock_guard<std::mutex> Lock(MMutex);
583 
584  // To achieve parallelism for FPGA with in order execution model with
585  // possibility of two kernels to share data with each other we shall
586  // create a queue for every kernel enqueued.
587  if (MQueues.size() < MaxNumQueues) {
588  MQueues.push_back({});
589  PIQ = &MQueues.back();
590  } else {
591  // If the limit of OpenCL queues is going to be exceeded - take the
592  // earliest used queue, wait until it finished and then reuse it.
593  PIQ = &MQueues[MNextQueueIdx];
595  ReuseQueue = true;
596  }
597  }
598 
599  if (!ReuseQueue)
601  else
602  getPlugin()->call<PiApiKind::piQueueFinish>(*PIQ);
603 
604  return *PIQ;
605  }
606 
610  if (!MEmulateOOO)
611  return MQueues[0];
612 
614  }
615 
618  template <typename propertyT> bool has_property() const noexcept {
619  return MPropList.has_property<propertyT>();
620  }
621 
625  template <typename propertyT> propertyT get_property() const {
626  return MPropList.get_property<propertyT>();
627  }
628 
638  event memset(const std::shared_ptr<queue_impl> &Self, void *Ptr, int Value,
639  size_t Count, const std::vector<event> &DepEvents);
650  event memcpy(const std::shared_ptr<queue_impl> &Self, void *Dest,
651  const void *Src, size_t Count,
652  const std::vector<event> &DepEvents,
653  const code_location &CodeLoc);
664  event mem_advise(const std::shared_ptr<queue_impl> &Self, const void *Ptr,
665  size_t Length, pi_mem_advice Advice,
666  const std::vector<event> &DepEvents);
667 
671  void reportAsyncException(const std::exception_ptr &ExceptionPtr) {
672  std::lock_guard<std::mutex> Lock(MMutex);
673  MExceptions.PushBack(ExceptionPtr);
674  }
675 
678  }
679 
683  pi_native_handle getNative(int32_t &NativeHandleDesc) const;
684 
685 #ifndef __INTEL_PREVIEW_BREAKING_CHANGES
687  return MAssertHappenedBuffer;
688  }
689 #endif
690 
692  std::lock_guard<std::mutex> Lock(MStreamsServiceEventsMutex);
693  MStreamsServiceEvents.push_back(Event);
694  }
695 
696  bool ext_oneapi_empty() const;
697 
703  std::hash<typename std::shared_ptr<queue_impl>::element_type *>()(
704  this));
705  }
706 
707  event memcpyToDeviceGlobal(const std::shared_ptr<queue_impl> &Self,
708  void *DeviceGlobalPtr, const void *Src,
709  bool IsDeviceImageScope, size_t NumBytes,
710  size_t Offset,
711  const std::vector<event> &DepEvents);
712  event memcpyFromDeviceGlobal(const std::shared_ptr<queue_impl> &Self,
713  void *Dest, const void *DeviceGlobalPtr,
714  bool IsDeviceImageScope, size_t NumBytes,
715  size_t Offset,
716  const std::vector<event> &DepEvents);
717 
719 
721  std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> Graph) {
722  std::lock_guard<std::mutex> Lock(MMutex);
723  MGraph = Graph;
724  MGraphLastEventPtr = nullptr;
725  }
726 
727  std::shared_ptr<ext::oneapi::experimental::detail::graph_impl>
728  getCommandGraph() const {
729  return MGraph.lock();
730  }
731 
732  unsigned long long getQueueID() { return MQueueID; }
733 
734  void setExternalEvent(const event &Event) {
735  std::lock_guard<std::mutex> Lock(MInOrderExternalEventMtx);
736  MInOrderExternalEvent = Event;
737  }
738 
739  std::optional<event> popExternalEvent() {
740  std::lock_guard<std::mutex> Lock(MInOrderExternalEventMtx);
741  std::optional<event> Result = std::nullopt;
742  std::swap(Result, MInOrderExternalEvent);
743  return Result;
744  }
745 
746  const std::vector<event> &
747  getExtendDependencyList(const std::vector<event> &DepEvents,
748  std::vector<event> &MutableVec,
749  std::unique_lock<std::mutex> &QueueLock);
750 
751 protected:
752  event discard_or_return(const event &Event);
753  // Hook to the scheduler to clean up any fusion command held on destruction.
754  void cleanup_fusion_cmd();
755 
756  // template is needed for proper unit testing
757  template <typename HandlerType = handler>
758  void finalizeHandler(HandlerType &Handler, event &EventRet) {
759  if (MIsInorder) {
760  // Accessing and changing of an event isn't atomic operation.
761  // Hence, here is the lock for thread-safety.
762  std::lock_guard<std::mutex> Lock{MMutex};
763  // This dependency is needed for the following purposes:
764  // - host tasks are handled by the runtime and cannot be implicitly
765  // synchronized by the backend.
766  // - to prevent the 2nd kernel enqueue when the 1st kernel is blocked
767  // by a host task. This dependency allows to build the enqueue order in
768  // the RT but will not be passed to the backend. See getPIEvents in
769  // Command.
770  auto &EventToBuildDeps =
772  if (EventToBuildDeps)
773  Handler.depends_on(
774  createSyclObjFromImpl<sycl::event>(EventToBuildDeps));
775 
776  // If there is an external event set, add it as a dependency and clear it.
777  // We do not need to hold the lock as MLastEventMtx will ensure the last
778  // event reflects the corresponding external event dependence as well.
779  std::optional<event> ExternalEvent = popExternalEvent();
780  if (ExternalEvent)
781  Handler.depends_on(*ExternalEvent);
782 
783  EventRet = Handler.finalize();
784  EventToBuildDeps = getSyclObjImpl(EventRet);
785  } else
786  EventRet = Handler.finalize();
787  }
788 
789 protected:
800  event submit_impl(const std::function<void(handler &)> &CGF,
801  const std::shared_ptr<queue_impl> &Self,
802  const std::shared_ptr<queue_impl> &PrimaryQueue,
803  const std::shared_ptr<queue_impl> &SecondaryQueue,
804  const detail::code_location &Loc,
805  const SubmitPostProcessF *PostProcess) {
806  // Flag used to detect nested calls to submit and report an error.
807  thread_local static bool PreventSubmit = false;
808 
809  if (PreventSubmit) {
810  throw sycl::exception(
812  "Calls to sycl::queue::submit cannot be nested. Command group "
813  "function objects should use the sycl::handler API instead.");
814  }
815 
816  handler Handler(Self, PrimaryQueue, SecondaryQueue, MHostQueue);
817  Handler.saveCodeLoc(Loc);
818  PreventSubmit = true;
819  try {
820  CGF(Handler);
821  } catch (...) {
822  PreventSubmit = false;
823  throw;
824  }
825  PreventSubmit = false;
826 
827  // Scheduler will later omit events, that are not required to execute tasks.
828  // Host and interop tasks, however, are not submitted to low-level runtimes
829  // and require separate dependency management.
830  const CG::CGTYPE Type = Handler.getType();
831  event Event = detail::createSyclObjFromImpl<event>(
832  std::make_shared<detail::event_impl>());
833 
834  if (PostProcess) {
835  bool IsKernel = Type == CG::Kernel;
836  bool KernelUsesAssert = false;
837 
838  if (IsKernel)
839  // Kernel only uses assert if it's non interop one
840  KernelUsesAssert =
841  !(Handler.MKernel && Handler.MKernel->isInterop()) &&
842  ProgramManager::getInstance().kernelUsesAssert(Handler.MKernelName);
843 
844  finalizeHandler(Handler, Event);
845 
846  (*PostProcess)(IsKernel, KernelUsesAssert, Event);
847  } else
848  finalizeHandler(Handler, Event);
849 
850  addEvent(Event);
851  return Event;
852  }
853 
859  template <typename HandlerFuncT>
860  event submitWithHandler(const std::shared_ptr<queue_impl> &Self,
861  const std::vector<event> &DepEvents,
862  HandlerFuncT HandlerFunc);
863 
877  template <typename HandlerFuncT, typename MemMngrFuncT,
878  typename... MemMngrArgTs>
879  event submitMemOpHelper(const std::shared_ptr<queue_impl> &Self,
880  const std::vector<event> &DepEvents,
881  HandlerFuncT HandlerFunc, MemMngrFuncT MemMngrFunc,
882  MemMngrArgTs... MemOpArgs);
883 
884  // When instrumentation is enabled emits trace event for wait begin and
885  // returns the telemetry event generated for the wait
886  void *instrumentationProlog(const detail::code_location &CodeLoc,
887  std::string &Name, int32_t StreamID,
888  uint64_t &iid);
889  // Uses events generated by the Prolog and emits wait done event
890  void instrumentationEpilog(void *TelementryEvent, std::string &Name,
891  int32_t StreamID, uint64_t IId);
892 
898  void addSharedEvent(const event &Event);
899 
903  void addEvent(const event &Event);
904 
906  mutable std::mutex MMutex;
907 
910 
912  std::vector<std::weak_ptr<event_impl>> MEventsWeak;
913 
917  std::vector<event> MEventsShared;
921 
923  std::vector<sycl::detail::pi::PiQueue> MQueues;
925  size_t MNextQueueIdx = 0;
926 
927  const bool MHostQueue = false;
930  bool MEmulateOOO = false;
931 
932 #ifndef __INTEL_PREVIEW_BREAKING_CHANGES
933  // Buffer to store assert failure descriptor
935 #endif
936 
937  // This event is employed for enhanced dependency tracking with in-order queue
938  // Access to the event should be guarded with MMutex
940  // Same as above but for graph begin-end recording cycle.
941  // Track deps within graph commands separately.
942  // Protected by common queue object mutex MMutex.
944 
945  const bool MIsInorder;
946 
947  std::vector<EventImplPtr> MStreamsServiceEvents;
949 
950  // All member variables defined here are needed for the SYCL instrumentation
951  // layer. Do not guard these variables below with XPTI_ENABLE_INSTRUMENTATION
952  // to ensure we have the same object layout when the macro in the library and
953  // SYCL app are not the same.
954  void *MTraceEvent = nullptr;
956  uint8_t MStreamID = 0;
958  uint64_t MInstanceID = 0;
959 
960  // the fallback implementation of profiling info
961  bool MFallbackProfiling = false;
962 
963  // This event can be optionally provided by users for in-order queues to add
964  // an additional dependency for the subsequent submission into the queue.
965  // Access to the event should be guarded with MInOrderExternalEventMtx.
966  // NOTE: std::optional must not be exposed in the ABI.
967  std::optional<event> MInOrderExternalEvent;
968  mutable std::mutex MInOrderExternalEventMtx;
969 
970 public:
971  // Queue constructed with the discard_events property
972  const bool MDiscardEvents;
974 
975 protected:
976  // Indicates whether the queue supports discarding PI events for tasks
977  // submitted to it. This condition is necessary but not sufficient, PI events
978  // should be discarded only if they also don't represent potential implicit
979  // dependencies for future tasks in other queues.
981 
982  // Command graph which is associated with this queue for the purposes of
983  // recording commands to it.
984  std::weak_ptr<ext::oneapi::experimental::detail::graph_impl> MGraph{};
985 
986  unsigned long long MQueueID;
987  static std::atomic<unsigned long long> MNextAvailableQueueID;
988 
990 };
991 
992 } // namespace detail
993 } // namespace _V1
994 } // namespace sycl
Defines a shared array that can be used by kernels in queues.
Definition: buffer.hpp:170
The context class represents a SYCL context on which kernel functions may be executed.
Definition: context.hpp:51
CGTYPE
Type of the command group.
Definition: cg.hpp:56
static GlobalHandler & instance()
static ProgramManager & getInstance()
bool kernelUsesAssert(const std::string &KernelName) const
bool isInFusionMode(QueueIdT Queue)
Definition: scheduler.cpp:644
static Scheduler & getInstance()
Definition: scheduler.cpp:260
event discard_or_return(const event &Event)
Definition: queue_impl.cpp:588
const property_list MPropList
Definition: queue_impl.hpp:920
bool is_in_fusion_mode()
Check whether the queue is in fusion mode.
Definition: queue_impl.hpp:701
event submit(const std::function< void(handler &)> &CGF, const std::shared_ptr< queue_impl > &Self, const std::shared_ptr< queue_impl > &SecondQueue, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess=nullptr)
Submits a command group function object to the queue, in order to be scheduled for execution on the d...
Definition: queue_impl.hpp:403
uint64_t MInstanceID
The instance ID of the trace event for queue object.
Definition: queue_impl.hpp:958
std::vector< EventImplPtr > MStreamsServiceEvents
Definition: queue_impl.hpp:947
void wait_and_throw(const detail::code_location &Loc={})
Definition: queue_impl.hpp:445
Param::return_type get_info() const
Queries SYCL queue for information.
std::optional< event > MInOrderExternalEvent
Definition: queue_impl.hpp:967
buffer< AssertHappened, 1 > MAssertHappenedBuffer
Definition: queue_impl.hpp:934
std::optional< event > popExternalEvent()
Definition: queue_impl.hpp:739
sycl::detail::pi::PiQueue createQueue(QueueOrder Order)
Creates PI queue.
Definition: queue_impl.hpp:545
void registerStreamServiceEvent(const EventImplPtr &Event)
Definition: queue_impl.hpp:691
static std::atomic< unsigned long long > MNextAvailableQueueID
Definition: queue_impl.hpp:987
void addEvent(const event &Event)
Stores an event that should be associated with the queue.
Definition: queue_impl.cpp:233
std::vector< sycl::detail::pi::PiQueue > MQueues
List of queues created for FPGA device from a single SYCL queue.
Definition: queue_impl.hpp:923
pi_native_handle getNative(int32_t &NativeHandleDesc) const
Gets the native handle of the SYCL queue.
Definition: queue_impl.cpp:529
sycl::detail::pi::PiQueue & getExclusiveQueueHandleRef()
Definition: queue_impl.hpp:578
unsigned long long MQueueID
Definition: queue_impl.hpp:986
std::vector< std::weak_ptr< event_impl > > MEventsWeak
These events are tracked, but not owned, by the queue.
Definition: queue_impl.hpp:912
event submitMemOpHelper(const std::shared_ptr< queue_impl > &Self, const std::vector< event > &DepEvents, HandlerFuncT HandlerFunc, MemMngrFuncT MemMngrFunc, MemMngrArgTs... MemOpArgs)
Performs submission of a memory operation directly if scheduler can be bypassed, or with a handler ot...
bool has_property() const noexcept
Definition: queue_impl.hpp:618
event submit(const std::function< void(handler &)> &CGF, const std::shared_ptr< queue_impl > &Self, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess=nullptr)
Submits a command group function object to the queue, in order to be scheduled for execution on the d...
Definition: queue_impl.hpp:426
std::vector< event > MEventsShared
Events without data dependencies (such as USM) need an owner; additionally, USM operations are not ad...
Definition: queue_impl.hpp:917
std::mutex MMutex
Protects all the fields that can be changed by class' methods.
Definition: queue_impl.hpp:906
void setCommandGraph(std::shared_ptr< ext::oneapi::experimental::detail::graph_impl > Graph)
Definition: queue_impl.hpp:720
buffer< AssertHappened, 1 > & getAssertHappenedBuffer()
Definition: queue_impl.hpp:686
queue_impl(sycl::detail::pi::PiQueue PiQueue, const ContextImplPtr &Context, const async_handler &AsyncHandler)
Constructs a SYCL queue from plugin interoperability handle.
Definition: queue_impl.hpp:285
bool supportsDiscardingPiEvents() const
Definition: queue_impl.hpp:377
event submitWithHandler(const std::shared_ptr< queue_impl > &Self, const std::vector< event > &DepEvents, HandlerFuncT HandlerFunc)
Helper function for submitting a memory operation with a handler.
Definition: queue_impl.cpp:315
std::function< void(bool, bool, event &)> SubmitPostProcessF
Definition: queue_impl.hpp:388
void addSharedEvent(const event &Event)
queue_impl.addEvent tracks events with weak pointers but some events have no other owners.
Definition: queue_impl.cpp:256
const ContextImplPtr MContext
Definition: queue_impl.hpp:909
void finalizeHandler(HandlerType &Handler, event &EventRet)
Definition: queue_impl.hpp:758
void setExternalEvent(const event &Event)
Definition: queue_impl.hpp:734
sycl::detail::pi::PiQueue & getHandleRef()
Definition: queue_impl.hpp:609
void throw_asynchronous()
Performs a blocking wait for the completion of all enqueued tasks in the queue.
Definition: queue_impl.hpp:457
event memset(const std::shared_ptr< queue_impl > &Self, void *Ptr, int Value, size_t Count, const std::vector< event > &DepEvents)
Fills the memory pointed by a USM pointer with the value specified.
Definition: queue_impl.cpp:97
queue_impl(const DeviceImplPtr &Device, const ContextImplPtr &Context, const async_handler &AsyncHandler, const property_list &PropList)
Constructs a SYCL queue with an async_handler and property_list provided from a device and a context.
Definition: queue_impl.hpp:107
const PluginPtr & getPlugin() const
Definition: queue_impl.hpp:364
event submit_impl(const std::function< void(handler &)> &CGF, const std::shared_ptr< queue_impl > &Self, const std::shared_ptr< queue_impl > &PrimaryQueue, const std::shared_ptr< queue_impl > &SecondaryQueue, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess)
Performs command group submission to the queue.
Definition: queue_impl.hpp:800
const DeviceImplPtr & getDeviceImplPtr() const
Definition: queue_impl.hpp:368
event mem_advise(const std::shared_ptr< queue_impl > &Self, const void *Ptr, size_t Length, pi_mem_advice Advice, const std::vector< event > &DepEvents)
Provides additional information to the underlying runtime about how different allocations are used.
Definition: queue_impl.cpp:179
const async_handler MAsyncHandler
Definition: queue_impl.hpp:919
event memcpyToDeviceGlobal(const std::shared_ptr< queue_impl > &Self, void *DeviceGlobalPtr, const void *Src, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, const std::vector< event > &DepEvents)
Definition: queue_impl.cpp:190
exception_list getExceptionList() const
Definition: queue_impl.hpp:442
event memcpyFromDeviceGlobal(const std::shared_ptr< queue_impl > &Self, void *Dest, const void *DeviceGlobalPtr, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, const std::vector< event > &DepEvents)
Definition: queue_impl.cpp:206
event memcpy(const std::shared_ptr< queue_impl > &Self, void *Dest, const void *Src, size_t Count, const std::vector< event > &DepEvents, const code_location &CodeLoc)
Copies data from one memory region to another, both pointed by USM pointers.
Definition: queue_impl.cpp:141
uint8_t MStreamID
The stream under which the traces are emitted from the queue object.
Definition: queue_impl.hpp:956
void reportAsyncException(const std::exception_ptr &ExceptionPtr)
Puts exception to the list of asynchronous exceptions.
Definition: queue_impl.hpp:671
void wait(const detail::code_location &Loc={})
Performs a blocking wait for the completion of all enqueued tasks in the queue.
Definition: queue_impl.cpp:461
std::shared_ptr< ext::oneapi::experimental::detail::graph_impl > getCommandGraph() const
Definition: queue_impl.hpp:728
bool MEmulateOOO
Indicates that a native out-of-order queue could not be created and we need to emulate it with multip...
Definition: queue_impl.hpp:930
void instrumentationEpilog(void *TelementryEvent, std::string &Name, int32_t StreamID, uint64_t IId)
Definition: queue_impl.cpp:442
static sycl::detail::pi::PiQueueProperties createPiQueueProperties(const property_list &PropList, QueueOrder Order)
Creates PI properties array.
Definition: queue_impl.hpp:479
const ContextImplPtr & getContextImplPtr() const
Definition: queue_impl.hpp:366
void * instrumentationProlog(const detail::code_location &CodeLoc, std::string &Name, int32_t StreamID, uint64_t &iid)
Definition: queue_impl.cpp:373
size_t MNextQueueIdx
Iterator through MQueues.
Definition: queue_impl.hpp:925
std::weak_ptr< ext::oneapi::experimental::detail::graph_impl > MGraph
Definition: queue_impl.hpp:984
const std::vector< event > & getExtendDependencyList(const std::vector< event > &DepEvents, std::vector< event > &MutableVec, std::unique_lock< std::mutex > &QueueLock)
Definition: queue_impl.cpp:75
queue_impl(const DeviceImplPtr &Device, const async_handler &AsyncHandler, const property_list &PropList)
Constructs a SYCL queue from a device using an async_handler and property_list provided.
Definition: queue_impl.hpp:94
static ContextImplPtr getDefaultOrNew(const DeviceImplPtr &Device)
Definition: queue_impl.hpp:75
unsigned long long getQueueID()
Definition: queue_impl.hpp:732
queue_impl(sycl::detail::pi::PiQueue PiQueue, const ContextImplPtr &Context, const async_handler &AsyncHandler, const property_list &PropList)
Constructs a SYCL queue from plugin interoperability handle.
Definition: queue_impl.hpp:309
propertyT get_property() const
Definition: queue_impl.hpp:625
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Definition: device.hpp:59
An event object can be used to synchronize memory transfers, enqueues of kernels and signaling barriers.
Definition: event.hpp:44
A list of asynchronous exceptions.
Implementation of node class from SYCL_EXT_ONEAPI_GRAPH.
Definition: graph_impl.hpp:76
Command group handler class.
Definition: handler.hpp:454
Objects of the property_list class are containers for the SYCL properties.
bool has_property() const noexcept
Defines the iteration domain of either a single work-group in a parallel dispatch,...
Definition: range.hpp:26
#define __SYCL_PI_CUDA_USE_DEFAULT_STREAM
::pi_queue PiQueue
Definition: pi.hpp:139
::pi_queue_properties PiQueueProperties
Definition: pi.hpp:140
CUDAContextT
Possible CUDA context types supported by PI CUDA backend TODO: Implement this as a property once ther...
Definition: queue_impl.hpp:64
constexpr const char * SYCL_STREAM_NAME
std::shared_ptr< sycl::detail::context_impl > ContextImplPtr
Definition: event_impl.hpp:33
decltype(Obj::impl) getSyclObjImpl(const Obj &SyclObject)
Definition: impl_utils.hpp:30
std::shared_ptr< event_impl > EventImplPtr
Definition: cg.hpp:43
std::shared_ptr< plugin > PluginPtr
Definition: pi.hpp:48
std::shared_ptr< device_impl > DeviceImplPtr
static constexpr size_t MaxNumQueues
Sets max number of queues supported by FPGA RT.
Definition: queue_impl.hpp:60
constexpr CUDAContextT DefaultContextType
Default context type created for CUDA backend.
Definition: queue_impl.hpp:67
constexpr auto memory_order_relaxed
std::function< void(sycl::exception_list)> async_handler
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using e and sycl_category()
Definition: exception.cpp:94
std::conditional_t< std::is_same_v< ElementType, half >, sycl::detail::half_impl::BIsRepresentationT, ElementType > element_type
Definition: multi_ptr.hpp:752
Definition: access.hpp:18
constexpr pi_queue_properties PI_QUEUE_COMPUTE_INDEX
Definition: pi.h:768
pi_result piQueueFinish(pi_queue command_queue)
Definition: pi_cuda.cpp:186
uintptr_t pi_native_handle
Definition: pi.h:206
_pi_result
Definition: pi.h:213
constexpr pi_queue_properties PI_EXT_QUEUE_FLAG_SUBMISSION_IMMEDIATE
Definition: pi.h:778
@ PI_QUEUE_INFO_DEVICE
Definition: pi.h:479
_pi_mem_advice
Definition: pi.h:579
constexpr pi_queue_properties PI_EXT_QUEUE_FLAG_SUBMISSION_NO_IMMEDIATE
Definition: pi.h:777
constexpr pi_queue_properties PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE
Definition: pi.h:770
pi_result piextQueueCreate(pi_context context, pi_device device, pi_queue_properties *properties, pi_queue *queue)
Definition: pi_cuda.cpp:167
pi_result piQueueRelease(pi_queue command_queue)
Definition: pi_cuda.cpp:182
constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW
Definition: pi.h:775
constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_FLAG_DISCARD_EVENTS
Definition: pi.h:774
constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH
Definition: pi.h:776
constexpr pi_queue_properties PI_QUEUE_FLAGS
Definition: pi.h:767
constexpr pi_queue_properties PI_QUEUE_FLAG_PROFILING_ENABLE
Definition: pi.h:771
pi_result piQueueGetInfo(pi_queue command_queue, pi_queue_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_cuda.cpp:172
pi_result piQueueRetain(pi_queue command_queue)
Definition: pi_cuda.cpp:180
_Abi const simd< _Tp, _Abi > & noexcept
Definition: simd.hpp:1324