39 #ifdef XPTI_ENABLE_INSTRUMENTATION
40 #include "xpti/xpti_trace_framework.hpp"
45 inline namespace _V1 {
49 namespace ext::oneapi::experimental::detail {
77 context{createSyclObjFromImpl<device>(Device), {}, {}});
80 Device->get_platform().ext_oneapi_get_default_context());
81 if (DefaultContext->isDeviceValid(Device))
82 return DefaultContext;
84 context{createSyclObjFromImpl<device>(Device), {}, {}});
116 if (has_property<property::queue::enable_profiling>()) {
117 if (has_property<ext::oneapi::property::queue::discard_events>())
119 "Queue cannot be constructed with both of "
120 "discard_events and enable_profiling.");
122 if (
MDevice->has(aspect::queue_profiling)) {
129 "Cannot enable profiling, the associated device "
130 "does not have the queue_profiling aspect");
133 if (has_property<ext::intel::property::queue::compute_index>()) {
134 int Idx = get_property<ext::intel::property::queue::compute_index>()
137 createSyclObjFromImpl<device>(Device)
138 .get_info<ext::intel::info::device::max_compute_queue_indices>();
139 if (Idx < 0 || Idx >= NumIndices)
142 "Queue compute index must be a non-negative number less than "
143 "device's number of available compute queue indices.");
145 if (!Context->isDeviceValid(Device)) {
149 "Queue cannot be constructed with the given context and device "
150 "since the device is not a member of the context (descendants of "
151 "devices from the context are not supported on OpenCL yet).");
154 "Queue cannot be constructed with the given context and device "
155 "since the device is neither a member of the context nor a "
156 "descendant of its member.");
166 #if XPTI_ENABLE_INSTRUMENTATION
177 void queue_impl_interop(ur_queue_handle_t UrQueue) {
178 if (has_property<ext::oneapi::property::queue::discard_events>() &&
179 has_property<property::queue::enable_profiling>()) {
181 "Queue cannot be constructed with both of "
182 "discard_events and enable_profiling.");
187 ur_device_handle_t DeviceUr{};
190 Plugin->call<UrApiKind::urQueueGetInfo>(
191 MQueues[0], UR_QUEUE_INFO_DEVICE,
sizeof(DeviceUr), &DeviceUr,
nullptr);
196 "Device provided by native Queue not found in Context.");
202 #if XPTI_ENABLE_INSTRUMENTATION
226 queue_impl_interop(UrQueue);
245 queue_impl_interop(UrQueue);
250 #if XPTI_ENABLE_INSTRUMENTATION
258 }
catch (std::exception &e) {
267 ur_native_handle_t nativeHandle = 0;
270 return ur::cast<cl_command_queue>(nativeHandle);
275 return createSyclObjFromImpl<context>(
MContext);
298 template <
typename Param>
typename Param::return_type
get_info()
const;
303 template <
typename Param>
312 "flush cannot be called for a queue which is "
313 "recording to a command graph.");
336 const std::shared_ptr<queue_impl> &Self,
337 const std::shared_ptr<queue_impl> &SecondQueue,
342 ResEvent =
submit_impl(CGF, Self, Self, SecondQueue,
343 true, Loc, PostProcess);
346 SecondQueue->submit_impl(CGF, SecondQueue, Self, SecondQueue,
347 true, Loc, PostProcess);
361 const std::shared_ptr<queue_impl> &Self,
364 auto ResEvent =
submit_impl(CGF, Self, Self,
nullptr,
365 true, Loc, PostProcess);
370 const std::shared_ptr<queue_impl> &Self,
406 std::lock_guard<std::mutex> Lock(
MMutex);
412 if (Exceptions.
size())
423 ur_queue_flags_t CreationFlags = 0;
426 CreationFlags = UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE;
428 if (PropList.
has_property<property::queue::enable_profiling>()) {
429 CreationFlags |= UR_QUEUE_FLAG_PROFILING_ENABLE;
432 ext::oneapi::cuda::property::queue::use_default_stream>()) {
433 CreationFlags |= UR_QUEUE_FLAG_USE_DEFAULT_STREAM;
435 if (PropList.
has_property<ext::oneapi::property::queue::discard_events>()) {
438 CreationFlags |= UR_QUEUE_FLAG_DISCARD_EVENTS;
441 bool PrioritySeen =
false;
443 .has_property<ext::oneapi::property::queue::priority_normal>()) {
447 if (PropList.
has_property<ext::oneapi::property::queue::priority_low>()) {
451 "Queue cannot be constructed with different priorities.");
453 CreationFlags |= UR_QUEUE_FLAG_PRIORITY_LOW;
456 if (PropList.
has_property<ext::oneapi::property::queue::priority_high>()) {
460 "Queue cannot be constructed with different priorities.");
462 CreationFlags |= UR_QUEUE_FLAG_PRIORITY_HIGH;
465 bool SubmissionSeen =
false;
467 ext::intel::property::queue::no_immediate_command_list>()) {
468 SubmissionSeen =
true;
469 CreationFlags |= UR_QUEUE_FLAG_SUBMISSION_BATCHED;
472 ext::intel::property::queue::immediate_command_list>()) {
473 if (SubmissionSeen) {
476 "Queue cannot be constructed with different submission modes.");
478 SubmissionSeen =
true;
479 CreationFlags |= UR_QUEUE_FLAG_SUBMISSION_IMMEDIATE;
481 return CreationFlags;
489 ur_queue_handle_t Queue{};
490 ur_context_handle_t Context =
MContext->getHandleRef();
491 ur_device_handle_t Device =
MDevice->getHandleRef();
497 ur_queue_properties_t Properties = {UR_STRUCTURE_TYPE_QUEUE_PROPERTIES,
500 ur_queue_index_properties_t IndexProperties = {
501 UR_STRUCTURE_TYPE_QUEUE_INDEX_PROPERTIES,
nullptr, 0};
502 if (has_property<ext::intel::property::queue::compute_index>()) {
503 IndexProperties.computeIndex =
504 get_property<ext::intel::property::queue::compute_index>()
506 Properties.pNext = &IndexProperties;
508 ur_result_t Error = Plugin->call_nocheck<UrApiKind::urQueueCreate>(
509 Context, Device, &Properties, &Queue);
515 if (!
MEmulateOOO && Error == UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES) {
519 Plugin->checkUrResult(Error);
528 ur_queue_handle_t *PIQ =
nullptr;
529 bool ReuseQueue =
false;
531 std::lock_guard<std::mutex> Lock(
MMutex);
551 getPlugin()->call<UrApiKind::urQueueFinish>(*PIQ);
588 event memset(
const std::shared_ptr<queue_impl> &Self,
void *Ptr,
int Value,
589 size_t Count,
const std::vector<event> &DepEvents,
590 bool CallerNeedsEvent);
602 event memcpy(
const std::shared_ptr<queue_impl> &Self,
void *Dest,
603 const void *Src,
size_t Count,
604 const std::vector<event> &DepEvents,
bool CallerNeedsEvent,
617 event mem_advise(
const std::shared_ptr<queue_impl> &Self,
const void *Ptr,
618 size_t Length, ur_usm_advice_flags_t Advice,
619 const std::vector<event> &DepEvents,
bool CallerNeedsEvent);
625 std::lock_guard<std::mutex> Lock(
MMutex);
636 ur_native_handle_t
getNative(int32_t &NativeHandleDesc)
const;
646 void *DeviceGlobalPtr,
const void *Src,
647 bool IsDeviceImageScope,
size_t NumBytes,
648 size_t Offset,
const std::vector<event> &DepEvents,
649 bool CallerNeedsEvent);
651 void *Dest,
const void *DeviceGlobalPtr,
652 bool IsDeviceImageScope,
size_t NumBytes,
654 const std::vector<event> &DepEvents,
655 bool CallerNeedsEvent);
660 std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> Graph) {
661 std::lock_guard<std::mutex> Lock(
MMutex);
666 std::shared_ptr<ext::oneapi::experimental::detail::graph_impl>
682 std::optional<event> Result = std::nullopt;
687 const std::vector<event> &
689 std::vector<event> &MutableVec,
690 std::unique_lock<std::mutex> &QueueLock);
705 return Queue ? Queue->getContextImplPtr() :
nullptr;
710 const std::shared_ptr<ext::oneapi::experimental::detail::graph_impl>
718 template <
typename HandlerType = handler>
720 auto ResEvent = std::make_shared<detail::event_impl>(Handler.MQueue);
721 ur_event_handle_t UREvent =
nullptr;
722 getPlugin()->call<UrApiKind::urEnqueueEventsWaitWithBarrier>(
723 Handler.MQueue->getHandleRef(), 0,
nullptr, &UREvent);
724 ResEvent->setHandle(UREvent);
729 template <
typename HandlerType = handler>
734 std::lock_guard<std::mutex> Lock{
MMutex};
746 if (EventToBuildDeps) {
752 if (EventToBuildDeps->isDiscarded() &&
756 if (!EventToBuildDeps->isDiscarded())
757 Handler.depends_on(EventToBuildDeps);
765 Handler.depends_on(*ExternalEvent);
767 EventRet = Handler.finalize();
771 std::lock_guard<std::mutex> Lock{
MMutex};
785 Handler.depends_on(Deps.UnenqueuedCmdEvents);
788 (!Deps.LastBarrier->isEnqueued())))
789 Handler.depends_on(Deps.LastBarrier);
791 EventRet = Handler.finalize();
794 Deps.UnenqueuedCmdEvents.push_back(EventRetImpl);
796 Deps.LastBarrier = EventRetImpl;
797 Deps.UnenqueuedCmdEvents.clear();
798 }
else if (!EventRetImpl->isEnqueued()) {
799 Deps.UnenqueuedCmdEvents.push_back(EventRetImpl);
817 const std::shared_ptr<queue_impl> &Self,
818 const std::shared_ptr<queue_impl> &PrimaryQueue,
819 const std::shared_ptr<queue_impl> &SecondaryQueue,
828 template <
typename HandlerFuncT>
830 const std::vector<event> &DepEvents,
831 HandlerFuncT HandlerFunc);
848 template <
typename HandlerFuncT,
typename MemMngrFuncT,
849 typename... MemMngrArgTs>
851 const std::vector<event> &DepEvents,
852 bool CallerNeedsEvent, HandlerFuncT HandlerFunc,
853 MemMngrFuncT MemMngrFunc, MemMngrArgTs... MemOpArgs);
858 std::string &Name, int32_t StreamID,
862 int32_t StreamID, uint64_t IId);
959 std::weak_ptr<ext::oneapi::experimental::detail::graph_impl>
MGraph{};
964 std::deque<std::shared_ptr<ext::oneapi::experimental::detail::graph_impl>>
The context class represents a SYCL context on which kernel functions may be executed.
ThreadPool & getHostTaskThreadPool()
static GlobalHandler & instance()
event discard_or_return(const event &Event)
std::deque< std::shared_ptr< ext::oneapi::experimental::detail::graph_impl > > MMissedCleanupRequests
void constructorNotification()
const property_list MPropList
bool hasDiscardEventsProperty() const
static ContextImplPtr getContext(const QueueImplPtr &Queue)
queue_impl(ur_queue_handle_t UrQueue, const ContextImplPtr &Context, const async_handler &AsyncHandler)
Constructs a SYCL queue from plugin interoperability handle.
queue_impl(ur_queue_handle_t UrQueue, const ContextImplPtr &Context, const async_handler &AsyncHandler, const property_list &PropList)
Constructs a SYCL queue from plugin interoperability handle.
event submit(const std::function< void(handler &)> &CGF, const std::shared_ptr< queue_impl > &Self, const std::shared_ptr< queue_impl > &SecondQueue, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess=nullptr)
Submits a command group function object to the queue, in order to be scheduled for execution on the d...
void revisitUnenqueuedCommandsState(const EventImplPtr &CompletedHostTask)
uint64_t MInstanceID
The instance ID of the trace event for queue object.
std::vector< EventImplPtr > MStreamsServiceEvents
const property_list & getPropList() const
void wait_and_throw(const detail::code_location &Loc={})
event submitMemOpHelper(const std::shared_ptr< queue_impl > &Self, const std::vector< event > &DepEvents, bool CallerNeedsEvent, HandlerFuncT HandlerFunc, MemMngrFuncT MemMngrFunc, MemMngrArgTs... MemOpArgs)
Performs submission of a memory operation directly if scheduler can be bypassed, or with a handler ot...
Param::return_type get_info() const
Queries SYCL queue for information.
std::optional< event > MInOrderExternalEvent
std::optional< event > popExternalEvent()
void registerStreamServiceEvent(const EventImplPtr &Event)
static std::atomic< unsigned long long > MNextAvailableQueueID
void addEvent(const event &Event)
Stores an event that should be associated with the queue.
struct sycl::_V1::detail::queue_impl::DependencyTrackingItems MExtGraphDeps
void tryToResetEnqueuedBarrierDep(const EventImplPtr &EnqueuedBarrierEvent)
unsigned long long MQueueID
std::vector< std::weak_ptr< event_impl > > MEventsWeak
These events are tracked, but not owned, by the queue.
void destructorNotification()
bool has_property() const noexcept
event submit(const std::function< void(handler &)> &CGF, const std::shared_ptr< queue_impl > &Self, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess=nullptr)
Submits a command group function object to the queue, in order to be scheduled for execution on the d...
std::vector< event > MEventsShared
Events without data dependencies (such as USM) need an owner, additionally, USM operations are not ad...
std::mutex MMissedCleanupRequestsMtx
ur_native_handle_t getNative(int32_t &NativeHandleDesc) const
Gets the native handle of the SYCL queue.
std::mutex MMutex
Protects all the fields that can be changed by class' methods.
void submit_without_event(const std::function< void(handler &)> &CGF, const std::shared_ptr< queue_impl > &Self, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess=nullptr)
void setCommandGraph(std::shared_ptr< ext::oneapi::experimental::detail::graph_impl > Graph)
event memcpyToDeviceGlobal(const std::shared_ptr< queue_impl > &Self, void *DeviceGlobalPtr, const void *Src, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, const std::vector< event > &DepEvents, bool CallerNeedsEvent)
device get_device() const
bool supportsDiscardingPiEvents() const
static ThreadPool & getThreadPool()
event submitWithHandler(const std::shared_ptr< queue_impl > &Self, const std::vector< event > &DepEvents, HandlerFuncT HandlerFunc)
Helper function for submitting a memory operation with a handler.
void doUnenqueuedCommandCleanup(const std::shared_ptr< ext::oneapi::experimental::detail::graph_impl > &Graph)
event submit_impl(const std::function< void(handler &)> &CGF, const std::shared_ptr< queue_impl > &Self, const std::shared_ptr< queue_impl > &PrimaryQueue, const std::shared_ptr< queue_impl > &SecondaryQueue, bool CallerNeedsEvent, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess)
Performs command group submission to the queue.
std::function< void(bool, bool, event &)> SubmitPostProcessF
event mem_advise(const std::shared_ptr< queue_impl > &Self, const void *Ptr, size_t Length, ur_usm_advice_flags_t Advice, const std::vector< event > &DepEvents, bool CallerNeedsEvent)
Provides additional information to the underlying runtime about how different allocations are used.
event memcpyFromDeviceGlobal(const std::shared_ptr< queue_impl > &Self, void *Dest, const void *DeviceGlobalPtr, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, const std::vector< event > &DepEvents, bool CallerNeedsEvent)
void addSharedEvent(const event &Event)
queue_impl.addEvent tracks events with weak pointers but some events have no other owners.
const ContextImplPtr MContext
std::vector< ur_queue_handle_t > MQueues
List of queues created for FPGA device from a single SYCL queue.
void finalizeHandler(HandlerType &Handler, event &EventRet)
void setExternalEvent(const event &Event)
std::mutex MInOrderExternalEventMtx
void throw_asynchronous()
Performs a blocking wait for the completion of all enqueued tasks in the queue.
queue_impl(const DeviceImplPtr &Device, const ContextImplPtr &Context, const async_handler &AsyncHandler, const property_list &PropList)
Constructs a SYCL queue with an async_handler and property_list provided from a device and a context.
void flush()
Provides a hint to the backend to execute previously issued commands on this queue.
const PluginPtr & getPlugin() const
const DeviceImplPtr & getDeviceImplPtr() const
exception_list MExceptions
const async_handler MAsyncHandler
exception_list getExceptionList() const
bool ext_oneapi_empty() const
ur_queue_handle_t createQueue(QueueOrder Order)
Creates UR queue.
const bool MDiscardEvents
uint8_t MStreamID
The stream under which the traces are emitted from the queue object.
ur_queue_handle_t & getExclusiveUrQueueHandleRef()
void reportAsyncException(const std::exception_ptr &ExceptionPtr)
Puts an exception into the list of asynchronous exceptions.
context get_context() const
void wait(const detail::code_location &Loc={})
Performs a blocking wait for the completion of all enqueued tasks in the queue.
std::shared_ptr< ext::oneapi::experimental::detail::graph_impl > getCommandGraph() const
bool isProfilingFallback()
struct sycl::_V1::detail::queue_impl::DependencyTrackingItems MDefaultGraphDeps
EventImplPtr insertHelperBarrier(const HandlerType &Handler)
bool MEmulateOOO
Indicates that a native out-of-order queue could not be created and we need to emulate it with multip...
void instrumentationEpilog(void *TelementryEvent, std::string &Name, int32_t StreamID, uint64_t IId)
const ContextImplPtr & getContextImplPtr() const
void * instrumentationProlog(const detail::code_location &CodeLoc, std::string &Name, int32_t StreamID, uint64_t &iid)
ur_queue_handle_t & getHandleRef()
size_t MNextQueueIdx
Iterator through MQueues.
std::mutex MStreamsServiceEventsMutex
const bool MIsProfilingEnabled
std::weak_ptr< ext::oneapi::experimental::detail::graph_impl > MGraph
event memset(const std::shared_ptr< queue_impl > &Self, void *Ptr, int Value, size_t Count, const std::vector< event > &DepEvents, bool CallerNeedsEvent)
Fills the memory pointed to by a USM pointer with the value specified.
event memcpy(const std::shared_ptr< queue_impl > &Self, void *Dest, const void *Src, size_t Count, const std::vector< event > &DepEvents, bool CallerNeedsEvent, const code_location &CodeLoc)
Copies data from one memory region to another, both pointed to by USM pointers.
Param::return_type get_backend_info() const
Queries SYCL queue for SYCL backend-specific information.
const std::vector< event > & getExtendDependencyList(const std::vector< event > &DepEvents, std::vector< event > &MutableVec, std::unique_lock< std::mutex > &QueueLock)
queue_impl(const DeviceImplPtr &Device, const async_handler &AsyncHandler, const property_list &PropList)
Constructs a SYCL queue from a device using an async_handler and property_list provided.
static ContextImplPtr getDefaultOrNew(const DeviceImplPtr &Device)
unsigned long long getQueueID()
static ur_queue_flags_t createUrQueueFlags(const property_list &PropList, QueueOrder Order)
Creates UR properties array.
propertyT get_property() const
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
An event object can be used to synchronize memory transfers, enqueues of kernels and signaling barrie...
A list of asynchronous exceptions.
Implementation of node class from SYCL_EXT_ONEAPI_GRAPH.
Command group handler class.
Objects of the property_list class are containers for the SYCL properties.
bool has_property() const noexcept
PropT get_property() const
Encapsulates a single SYCL queue which schedules kernels on a SYCL device.
#define __SYCL_REPORT_EXCEPTION_TO_STREAM(str, e)
CUDAContextT
Possible CUDA context types supported by UR CUDA backend TODO: Implement this as a property once ther...
decltype(Obj::impl) const & getSyclObjImpl(const Obj &SyclObject)
std::shared_ptr< sycl::detail::context_impl > ContextImplPtr
std::shared_ptr< event_impl > EventImplPtr
std::shared_ptr< plugin > PluginPtr
std::shared_ptr< device_impl > DeviceImplPtr
static constexpr size_t MaxNumQueues
Sets max number of queues supported by FPGA RT.
CGType
Type of the command group.
std::shared_ptr< sycl::detail::queue_impl > QueueImplPtr
constexpr CUDAContextT DefaultContextType
Default context type created for CUDA backend.
constexpr auto memory_order_relaxed
std::function< void(sycl::exception_list)> async_handler
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using e and sycl_category()
_Abi const simd< _Tp, _Abi > & noexcept
std::vector< EventImplPtr > UnenqueuedCmdEvents
EventImplPtr LastEventPtr
C++ utilities for Unified Runtime integration.