20 #ifdef XPTI_ENABLE_INSTRUMENTATION
21 #include "xpti/xpti_trace_framework.hpp"
27 inline namespace _V1 {
39 "Calls to sycl::queue::submit cannot be nested. Command group "
40 "function objects should use the sycl::handler API instead.");
47 static std::vector<ur_event_handle_t>
49 std::vector<ur_event_handle_t> RetUrEvents;
52 if (EventImpl->getHandleRef() !=
nullptr)
53 RetUrEvents.push_back(EventImpl->getHandleRef());
59 uint32_t queue_impl::get_info<info::queue::reference_count>()
const {
60 ur_result_t result = UR_RESULT_SUCCESS;
61 getPlugin()->call(urQueueGetInfo, MQueues[0], UR_QUEUE_INFO_REFERENCE_COUNT,
62 sizeof(result), &result,
nullptr);
66 template <>
context queue_impl::get_info<info::queue::context>()
const {
70 template <>
device queue_impl::get_info<info::queue::device>()
const {
75 typename info::platform::version::return_type
76 queue_impl::get_backend_info<info::platform::version>()
const {
79 "the info::platform::version info descriptor can "
80 "only be queried with an OpenCL backend");
82 return get_device().get_platform().get_info<info::platform::version>();
86 typename info::device::version::return_type
87 queue_impl::get_backend_info<info::device::version>()
const {
90 "the info::device::version info descriptor can only "
91 "be queried with an OpenCL backend");
93 return get_device().get_info<info::device::version>();
97 typename info::device::backend_version::return_type
98 queue_impl::get_backend_info<info::device::backend_version>()
const {
101 "the info::device::backend_version info descriptor "
102 "can only be queried with a Level Zero backend");
111 const std::shared_ptr<detail::queue_impl> &QueueImpl) {
112 auto EventImpl = std::make_shared<detail::event_impl>(QueueImpl);
114 EventImpl->setStateIncomplete();
115 return detail::createSyclObjFromImpl<event>(EventImpl);
121 return createSyclObjFromImpl<event>(EventImpl);
124 const std::vector<event> &
126 std::vector<event> &MutableVec,
127 std::unique_lock<std::mutex> &QueueLock) {
136 if (!ExternalEvent && !ExtraEvent)
139 MutableVec = DepEvents;
141 MutableVec.push_back(*ExternalEvent);
143 MutableVec.push_back(detail::createSyclObjFromImpl<event>(ExtraEvent));
148 void *Ptr,
int Value,
size_t Count,
149 const std::vector<event> &DepEvents,
150 bool CallerNeedsEvent) {
151 #if XPTI_ENABLE_INSTRUMENTATION
155 XPTIScope PrepareNotify((
void *)
this,
156 (uint16_t)xpti::trace_point_type_t::node_create,
158 PrepareNotify.addMetadata([&](
auto TEvent) {
159 xpti::addMetadata(TEvent,
"sycl_device",
160 reinterpret_cast<size_t>(
MDevice->getHandleRef()));
161 xpti::addMetadata(TEvent,
"memory_ptr",
reinterpret_cast<size_t>(Ptr));
162 xpti::addMetadata(TEvent,
"value_set", Value);
163 xpti::addMetadata(TEvent,
"memory_size", Count);
164 xpti::addMetadata(TEvent,
"queue_id",
MQueueID);
168 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
MQueueID);
170 PrepareNotify.notify();
172 PrepareNotify.scopedNotify((uint16_t)xpti::trace_point_type_t::task_begin);
174 const std::vector<unsigned char> Pattern{
static_cast<unsigned char>(Value)};
176 Self, DepEvents, CallerNeedsEvent,
196 void *Dest,
const void *Src,
size_t Count,
197 const std::vector<event> &DepEvents,
199 #if XPTI_ENABLE_INSTRUMENTATION
203 XPTIScope PrepareNotify((
void *)
this,
204 (uint16_t)xpti::trace_point_type_t::node_create,
206 PrepareNotify.addMetadata([&](
auto TEvent) {
207 xpti::addMetadata(TEvent,
"sycl_device",
208 reinterpret_cast<size_t>(
MDevice->getHandleRef()));
209 xpti::addMetadata(TEvent,
"src_memory_ptr",
reinterpret_cast<size_t>(Src));
210 xpti::addMetadata(TEvent,
"dest_memory_ptr",
211 reinterpret_cast<size_t>(Dest));
212 xpti::addMetadata(TEvent,
"memory_size", Count);
213 xpti::addMetadata(TEvent,
"queue_id",
MQueueID);
215 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
MQueueID);
217 PrepareNotify.notify();
219 PrepareNotify.scopedNotify((uint16_t)xpti::trace_point_type_t::task_begin);
222 if ((!Src || !Dest) && Count != 0) {
225 "NULL pointer argument in memory copy operation.");
228 Self, DepEvents, CallerNeedsEvent,
235 const void *Ptr,
size_t Length,
236 ur_usm_advice_flags_t Advice,
237 const std::vector<event> &DepEvents,
238 bool CallerNeedsEvent) {
240 Self, DepEvents, CallerNeedsEvent,
243 Self, Length, Advice);
247 const std::shared_ptr<detail::queue_impl> &Self,
void *DeviceGlobalPtr,
248 const void *Src,
bool IsDeviceImageScope,
size_t NumBytes,
size_t Offset,
249 const std::vector<event> &DepEvents,
bool CallerNeedsEvent) {
251 Self, DepEvents, CallerNeedsEvent,
253 CGH.memcpyToDeviceGlobal(DeviceGlobalPtr, Src, IsDeviceImageScope,
256 [](
const auto &...Args) {
259 DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Src);
263 const std::shared_ptr<detail::queue_impl> &Self,
void *Dest,
264 const void *DeviceGlobalPtr,
bool IsDeviceImageScope,
size_t NumBytes,
265 size_t Offset,
const std::vector<event> &DepEvents,
bool CallerNeedsEvent) {
267 Self, DepEvents, CallerNeedsEvent,
269 CGH.memcpyFromDeviceGlobal(Dest, DeviceGlobalPtr, IsDeviceImageScope,
272 [](
const auto &...Args) {
275 DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Dest);
287 std::lock_guard<std::mutex> Lock{
MMutex};
299 assert(EImpl &&
"Event implementation is missing");
300 auto *Cmd =
static_cast<Command *
>(EImpl->getCommand());
310 else if (
MEmulateOOO || EImpl->getHandleRef() ==
nullptr) {
311 std::weak_ptr<event_impl> EventWeakPtr{EImpl};
312 std::lock_guard<std::mutex> Lock{
MMutex};
322 std::lock_guard<std::mutex> Lock(
MMutex);
328 const size_t EventThreshold = 128;
342 return E.get_info<info::event::command_execution_status>() !=
343 info::event_command_status::complete;
350 const std::shared_ptr<queue_impl> &Self,
351 const std::shared_ptr<queue_impl> &PrimaryQueue,
352 const std::shared_ptr<queue_impl> &SecondaryQueue,
353 bool CallerNeedsEvent,
356 handler Handler(Self, PrimaryQueue, SecondaryQueue, CallerNeedsEvent);
357 Handler.saveCodeLoc(Loc);
368 event Event = detail::createSyclObjFromImpl<event>(
369 std::make_shared<detail::event_impl>());
370 std::vector<StreamImplPtr> Streams;
372 Streams = std::move(Handler.MStreamStorage);
376 bool KernelUsesAssert =
false;
380 KernelUsesAssert = !(Handler.MKernel && Handler.MKernel->isInterop()) &&
382 Handler.MKernelName.
c_str());
385 (*PostProcess)(IsKernel, KernelUsesAssert, Event);
392 for (
auto &Stream : Streams) {
397 [&](
handler &ServiceCGH) { Stream->generateFlushCommand(ServiceCGH); },
398 Self, PrimaryQueue, SecondaryQueue,
true, Loc, {});
406 template <
typename HandlerFuncT>
408 const std::vector<event> &DepEvents,
409 HandlerFuncT HandlerFunc) {
418 template <
typename HandlerFuncT,
typename MemOpFuncT,
typename... MemOpArgTs>
420 const std::vector<event> &DepEvents,
421 bool CallerNeedsEvent,
422 HandlerFuncT HandlerFunc,
423 MemOpFuncT MemOpFunc,
424 MemOpArgTs... MemOpArgs) {
428 std::unique_lock<std::mutex> Lock(
MMutex, std::defer_lock);
430 std::vector<event> MutableDepEvents;
431 const std::vector<event> &ExpandedDepEvents =
441 MemOpFunc(MemOpArgs...,
getUrEvents(ExpandedDepEvents),
450 MemOpFunc(MemOpArgs...,
getUrEvents(ExpandedDepEvents),
451 &EventImpl->getHandleRef(), EventImpl);
457 EventToStoreIn = EventImpl;
469 std::string &Name, int32_t StreamID,
471 void *TraceEvent =
nullptr;
476 #ifdef XPTI_ENABLE_INSTRUMENTATION
477 constexpr uint16_t NotificationTraceType = xpti::trace_wait_begin;
478 if (!xptiCheckTraceEnabled(StreamID, NotificationTraceType))
481 xpti::payload_t Payload;
482 bool HasSourceInfo =
false;
485 xpti::utils::StringHelper NG;
486 Name = NG.nameWithAddress<
queue_impl *>(
"queue.wait",
this);
493 HasSourceInfo =
true;
496 Payload = xpti::payload_t(Name.c_str(), (
void *)
this);
501 uint64_t QWaitInstanceNo = 0;
502 xpti::trace_event_data_t *WaitEvent =
503 xptiMakeEvent(Name.c_str(), &Payload, xpti::trace_graph_event,
504 xpti_at::active, &QWaitInstanceNo);
505 IId = QWaitInstanceNo;
509 xpti::addMetadata(WaitEvent,
"sym_function_name", CodeLoc.
functionName());
510 xpti::addMetadata(WaitEvent,
"sym_source_file_name", CodeLoc.
fileName());
511 xpti::addMetadata(WaitEvent,
"sym_line_no",
512 static_cast<int32_t
>((CodeLoc.
lineNumber())));
513 xpti::addMetadata(WaitEvent,
"sym_column_no",
516 xptiNotifySubscribers(StreamID, xpti::trace_wait_begin,
nullptr, WaitEvent,
518 static_cast<const void *
>(Name.c_str()));
519 TraceEvent = (
void *)WaitEvent;
526 int32_t StreamID, uint64_t IId) {
527 (void)TelemetryEvent;
531 #ifdef XPTI_ENABLE_INSTRUMENTATION
532 constexpr uint16_t NotificationTraceType = xpti::trace_wait_end;
533 if (!(xptiCheckTraceEnabled(StreamID, NotificationTraceType) &&
537 xpti::trace_event_data_t *TraceEvent =
538 (xpti::trace_event_data_t *)TelemetryEvent;
539 xptiNotifySubscribers(StreamID, NotificationTraceType,
nullptr, TraceEvent,
540 IId,
static_cast<const void *
>(Name.c_str()));
546 #ifdef XPTI_ENABLE_INSTRUMENTATION
547 void *TelemetryEvent =
nullptr;
556 "wait cannot be called for a queue which is "
557 "recording to a command graph.");
565 ExternalEvent->wait();
570 std::lock_guard<std::mutex> Lock(
MMutex);
580 std::vector<std::weak_ptr<event_impl>> WeakEvents;
581 std::vector<event> SharedEvents;
583 std::lock_guard<std::mutex> Lock(
MMutex);
600 for (
auto EventImplWeakPtrIt = WeakEvents.rbegin();
601 EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) {
602 if (std::shared_ptr<event_impl> EventImplSharedPtr =
603 EventImplWeakPtrIt->lock()) {
606 if (!SupportsPiFinish ||
nullptr == EventImplSharedPtr->getHandleRef()) {
607 EventImplSharedPtr->wait(EventImplSharedPtr);
611 if (SupportsPiFinish) {
614 assert(SharedEvents.empty() &&
"Queues that support calling piQueueFinish "
615 "shouldn't have shared events");
617 for (
event &Event : SharedEvents)
621 std::vector<EventImplPtr> StreamsServiceEvents;
629 #ifdef XPTI_ENABLE_INSTRUMENTATION
637 Plugin->call(urQueueRetain,
MQueues[0]);
638 ur_native_handle_t Handle{};
639 ur_queue_native_desc_t UrNativeDesc{UR_STRUCTURE_TYPE_QUEUE_NATIVE_DESC,
641 UrNativeDesc.pNativeData = &NativeHandleDesc;
643 Plugin->call(urQueueGetNativeHandle,
MQueues[0], &UrNativeDesc, &Handle);
657 std::lock_guard<std::mutex> Lock(
MMutex);
667 ->get_info<info::event::command_execution_status>() ==
672 ur_bool_t IsReady =
false;
674 sizeof(IsReady), &IsReady,
nullptr);
680 std::lock_guard<std::mutex> Lock(
MMutex);
682 if (Event.get_info<info::event::command_execution_status>() !=
686 for (
auto EventImplWeakPtrIt =
MEventsWeak.begin();
687 EventImplWeakPtrIt !=
MEventsWeak.end(); ++EventImplWeakPtrIt)
688 if (std::shared_ptr<event_impl> EventImplSharedPtr =
689 EventImplWeakPtrIt->lock())
690 if (EventImplSharedPtr->isHost() &&
692 ->get_info<info::event::command_execution_status>() !=
711 std::unique_lock<std::mutex> Lock{
MMutex, std::try_to_lock};
712 if (Lock.owns_lock())
721 const std::shared_ptr<ext::oneapi::experimental::detail::graph_impl>
724 if (Deps.LastBarrier && Deps.LastBarrier->isEnqueued()) {
725 Deps.LastBarrier =
nullptr;
726 Deps.UnenqueuedCmdEvents.clear();
728 if (Deps.UnenqueuedCmdEvents.empty())
730 Deps.UnenqueuedCmdEvents.erase(
732 Deps.UnenqueuedCmdEvents.begin(), Deps.UnenqueuedCmdEvents.end(),
734 return (CommandEvent->isHost() ? CommandEvent->isCompleted()
735 : CommandEvent->isEnqueued());
737 Deps.UnenqueuedCmdEvents.end());
The context class represents a SYCL context on which kernel functions may be executed.
The Command class represents some action that needs to be performed on one or more memory objects.
static void copy_to_device_global(const void *DeviceGlobalPtr, bool IsDeviceImageScoped, QueueImplPtr Queue, size_t NumBytes, size_t Offset, const void *SrcMem, const std::vector< ur_event_handle_t > &DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void advise_usm(const void *Ptr, QueueImplPtr Queue, size_t Len, ur_usm_advice_flags_t Advice, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void copy_usm(const void *SrcMem, QueueImplPtr Queue, size_t Len, void *DstMem, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void fill_usm(void *DstMem, QueueImplPtr Queue, size_t Len, const std::vector< unsigned char > &Pattern, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void copy_from_device_global(const void *DeviceGlobalPtr, bool IsDeviceImageScoped, QueueImplPtr Queue, size_t NumBytes, size_t Offset, void *DstMem, const std::vector< ur_event_handle_t > &DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static ProgramManager & getInstance()
bool kernelUsesAssert(const std::string &KernelName) const
static Scheduler & getInstance()
void cleanUpCmdFusion(sycl::detail::queue_impl *Queue)
static bool isInstanceAlive()
static bool areEventsSafeForSchedulerBypass(const std::vector< sycl::event > &DepEvents, ContextImplPtr Context)
event discard_or_return(const event &Event)
std::deque< std::shared_ptr< ext::oneapi::experimental::detail::graph_impl > > MMissedCleanupRequests
event submit(const std::function< void(handler &)> &CGF, const std::shared_ptr< queue_impl > &Self, const std::shared_ptr< queue_impl > &SecondQueue, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess=nullptr)
Submits a command group function object to the queue, in order to be scheduled for execution on the d...
void revisitUnenqueuedCommandsState(const EventImplPtr &CompletedHostTask)
std::vector< EventImplPtr > MStreamsServiceEvents
event submitMemOpHelper(const std::shared_ptr< queue_impl > &Self, const std::vector< event > &DepEvents, bool CallerNeedsEvent, HandlerFuncT HandlerFunc, MemMngrFuncT MemMngrFunc, MemMngrArgTs... MemOpArgs)
Performs submission of a memory operation directly if scheduler can be bypassed, or with a handler ot...
std::optional< event > MInOrderExternalEvent
std::optional< event > popExternalEvent()
void registerStreamServiceEvent(const EventImplPtr &Event)
static std::atomic< unsigned long long > MNextAvailableQueueID
void addEvent(const event &Event)
Stores an event that should be associated with the queue.
struct sycl::_V1::detail::queue_impl::DependencyTrackingItems MExtGraphDeps
unsigned long long MQueueID
std::vector< std::weak_ptr< event_impl > > MEventsWeak
These events are tracked, but not owned, by the queue.
std::vector< event > MEventsShared
Events without data dependencies (such as USM) need an owner, additionally, USM operations are not ad...
std::mutex MMissedCleanupRequestsMtx
ur_native_handle_t getNative(int32_t &NativeHandleDesc) const
Gets the native handle of the SYCL queue.
std::mutex MMutex
Protects all the fields that can be changed by class' methods.
void cleanup_fusion_cmd()
event memcpyToDeviceGlobal(const std::shared_ptr< queue_impl > &Self, void *DeviceGlobalPtr, const void *Src, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, const std::vector< event > &DepEvents, bool CallerNeedsEvent)
bool supportsDiscardingPiEvents() const
event submitWithHandler(const std::shared_ptr< queue_impl > &Self, const std::vector< event > &DepEvents, HandlerFuncT HandlerFunc)
Helper function for submitting a memory operation with a handler.
void doUnenqueuedCommandCleanup(const std::shared_ptr< ext::oneapi::experimental::detail::graph_impl > &Graph)
event submit_impl(const std::function< void(handler &)> &CGF, const std::shared_ptr< queue_impl > &Self, const std::shared_ptr< queue_impl > &PrimaryQueue, const std::shared_ptr< queue_impl > &SecondaryQueue, bool CallerNeedsEvent, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess)
Performs command group submission to the queue.
std::function< void(bool, bool, event &)> SubmitPostProcessF
event mem_advise(const std::shared_ptr< queue_impl > &Self, const void *Ptr, size_t Length, ur_usm_advice_flags_t Advice, const std::vector< event > &DepEvents, bool CallerNeedsEvent)
Provides additional information to the underlying runtime about how different allocations are used.
event memcpyFromDeviceGlobal(const std::shared_ptr< queue_impl > &Self, void *Dest, const void *DeviceGlobalPtr, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, const std::vector< event > &DepEvents, bool CallerNeedsEvent)
void addSharedEvent(const event &Event)
queue_impl.addEvent tracks events with weak pointers but some events have no other owners.
const ContextImplPtr MContext
std::vector< ur_queue_handle_t > MQueues
List of queues created for FPGA device from a single SYCL queue.
void finalizeHandler(HandlerType &Handler, event &EventRet)
std::mutex MInOrderExternalEventMtx
const PluginPtr & getPlugin() const
bool ext_oneapi_empty() const
const bool MDiscardEvents
void wait(const detail::code_location &Loc={})
Performs a blocking wait for the completion of all enqueued tasks in the queue.
std::shared_ptr< ext::oneapi::experimental::detail::graph_impl > getCommandGraph() const
struct sycl::_V1::detail::queue_impl::DependencyTrackingItems MDefaultGraphDeps
bool MEmulateOOO
Indicates that a native out-of-order queue could not be created and we need to emulate it with multip...
void instrumentationEpilog(void *TelemetryEvent, std::string &Name, int32_t StreamID, uint64_t IId)
const ContextImplPtr & getContextImplPtr() const
void * instrumentationProlog(const detail::code_location &CodeLoc, std::string &Name, int32_t StreamID, uint64_t &iid)
ur_queue_handle_t & getHandleRef()
std::mutex MStreamsServiceEventsMutex
std::weak_ptr< ext::oneapi::experimental::detail::graph_impl > MGraph
event memset(const std::shared_ptr< queue_impl > &Self, void *Ptr, int Value, size_t Count, const std::vector< event > &DepEvents, bool CallerNeedsEvent)
Fills the memory pointed by a USM pointer with the value specified.
event memcpy(const std::shared_ptr< queue_impl > &Self, void *Dest, const void *Src, size_t Count, const std::vector< event > &DepEvents, bool CallerNeedsEvent, const code_location &CodeLoc)
Copies data from one memory region to another, both pointed by USM pointers.
const std::vector< event > & getExtendDependencyList(const std::vector< event > &DepEvents, std::vector< event > &MutableVec, std::unique_lock< std::mutex > &QueueLock)
const char * c_str() const noexcept
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
An event object can be used to synchronize memory transfers, enqueues of kernels and signaling barrie...
Command group handler class.
void depends_on(event Event)
Registers event dependencies on this command group.
void memcpy(void *Dest, const void *Src, size_t Count)
Copies data from one memory region to another, each is either a host pointer or a pointer within USM ...
void mem_advise(const void *Ptr, size_t Length, int Advice)
Provides additional information to the underlying runtime about how different allocations are used.
void memset(void *Dest, int Value, size_t Count)
Fills the memory pointed by a USM pointer with the value specified.
__SYCL_EXTERN_STREAM_ATTRS ostream cout
Linked to standard output.
thread_local bool NestedCallsDetector
decltype(Obj::impl) const & getSyclObjImpl(const Obj &SyclObject)
constexpr const char * SYCL_STREAM_NAME
static event createDiscardedEvent()
static const PluginPtr & getPlugin(backend Backend)
static std::vector< ur_event_handle_t > getUrEvents(const std::vector< sycl::event > &DepEvents)
std::shared_ptr< event_impl > EventImplPtr
std::shared_ptr< plugin > PluginPtr
CGType
Type of the command group.
void report(const code_location &CodeLoc)
static event prepareSYCLEventAssociatedWithQueue(const std::shared_ptr< detail::queue_impl > &QueueImpl)
std::string queueDeviceToString(const queue_impl *const &Queue)
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using e and sycl_category()
static device_ext & get_device(unsigned int id)
Util function to get a device by id.
constexpr unsigned long columnNumber() const noexcept
constexpr const char * fileName() const noexcept
constexpr const char * functionName() const noexcept
constexpr unsigned long lineNumber() const noexcept
EventImplPtr LastEventPtr
C++ utilities for Unified Runtime integration.