38 #if __has_include(<cxxabi.h>)
39 #define __SYCL_ENABLE_GNU_DEMANGLING
46 #ifdef XPTI_ENABLE_INSTRUMENTATION
47 #include "xpti/xpti_trace_framework.hpp"
52 inline namespace _V1 {
55 #ifdef XPTI_ENABLE_INSTRUMENTATION
57 extern xpti::trace_event_data_t *GSYCLGraphEvent;
59 bool CurrentCodeLocationValid() {
60 detail::tls_code_loc_t Tls;
61 auto CodeLoc = Tls.query();
62 auto FileName = CodeLoc.fileName();
63 auto FunctionName = CodeLoc.functionName();
64 return (FileName && FileName[0] !=
'\0') ||
65 (FunctionName && FunctionName[0] !=
'\0');
68 void emitInstrumentationGeneral(uint32_t StreamID, uint64_t InstanceID,
69 xpti_td *TraceEvent, uint16_t Type,
71 if (!(xptiCheckTraceEnabled(StreamID, Type) && TraceEvent))
74 xptiNotifySubscribers(StreamID, Type, detail::GSYCLGraphEvent,
75 static_cast<xpti_td *
>(TraceEvent), InstanceID, Addr);
79 #ifdef __SYCL_ENABLE_GNU_DEMANGLING
80 struct DemangleHandle {
82 DemangleHandle(
char *ptr) : p(ptr) {}
84 DemangleHandle(
const DemangleHandle &) =
delete;
85 DemangleHandle &
operator=(
const DemangleHandle &) =
delete;
91 DemangleHandle result(abi::__cxa_demangle(Name.c_str(), NULL, NULL, &Status));
92 return (Status == 0) ? result.p : Name;
106 return "ACCELERATOR";
112 const KernelArgMask *EliminatedArgMask, std::vector<ArgDesc> &Args,
114 if (!EliminatedArgMask) {
122 std::sort(Args.begin(), Args.end(), [](
const ArgDesc &A,
const ArgDesc &B) {
123 return A.MIndex < B.MIndex;
126 size_t NextTrueIndex = 0;
131 for (
int Idx = LastIndex + 1; Idx < Arg.
MIndex; ++Idx)
132 if (!(*EliminatedArgMask)[Idx])
136 if ((*EliminatedArgMask)[Arg.
MIndex])
139 Func(Arg, NextTrueIndex);
145 #ifdef XPTI_ENABLE_INSTRUMENTATION
146 static size_t deviceToID(
const device &Device) {
150 return reinterpret_cast<size_t>(
getSyclObjImpl(Device)->getHandleRef());
163 return "discard_write";
165 return "discard_read_write";
171 #ifdef XPTI_ENABLE_INSTRUMENTATION
176 case Command::CommandType::RUN_CG:
177 return "command_group_node";
178 case Command::CommandType::COPY_MEMORY:
179 return "memory_transfer_node";
180 case Command::CommandType::ALLOCA:
181 return "memory_allocation_node";
182 case Command::CommandType::ALLOCA_SUB_BUF:
183 return "sub_buffer_creation_node";
184 case Command::CommandType::RELEASE:
185 return "memory_deallocation_node";
186 case Command::CommandType::MAP_MEM_OBJ:
187 return "memory_transfer_node";
188 case Command::CommandType::UNMAP_MEM_OBJ:
189 return "memory_transfer_node";
190 case Command::CommandType::UPDATE_REQUIREMENT:
191 return "host_acc_create_buffer_lock_node";
192 case Command::CommandType::EMPTY_TASK:
193 return "host_acc_destroy_buffer_release_node";
194 case Command::CommandType::FUSION:
195 return "kernel_fusion_placeholder_node";
197 return "unknown_node";
206 case Command::CommandType::RUN_CG:
207 return "Command Group Action";
208 case Command::CommandType::COPY_MEMORY:
209 return "Memory Transfer (Copy)";
210 case Command::CommandType::ALLOCA:
211 return "Memory Allocation";
212 case Command::CommandType::ALLOCA_SUB_BUF:
213 return "Sub Buffer Creation";
214 case Command::CommandType::RELEASE:
215 return "Memory Deallocation";
216 case Command::CommandType::MAP_MEM_OBJ:
217 return "Memory Transfer (Map)";
218 case Command::CommandType::UNMAP_MEM_OBJ:
219 return "Memory Transfer (Unmap)";
220 case Command::CommandType::UPDATE_REQUIREMENT:
221 return "Host Accessor Creation/Buffer Lock";
222 case Command::CommandType::EMPTY_TASK:
223 return "Host Accessor Destruction/Buffer Lock Release";
224 case Command::CommandType::FUSION:
225 return "Kernel Fusion Placeholder";
227 return "Unknown Action";
232 std::vector<sycl::detail::pi::PiEvent>
234 std::vector<sycl::detail::pi::PiEvent> RetPiEvents;
235 for (
auto &EventImpl : EventImpls) {
236 if (EventImpl->getHandleRef() ==
nullptr)
246 if (EventImpl->getWorkerQueue() == WorkerQueue &&
250 RetPiEvents.push_back(EventImpl->getHandleRef());
261 const std::vector<EventImplPtr> &EventImpls)
const {
262 std::vector<sycl::detail::pi::PiEvent> RetPiEvents;
263 for (
auto &EventImpl : EventImpls) {
268 if (!EventImpl->isContextInitialized() || EventImpl->is_host() ||
273 if (EventImpl->getHandleRef() ==
nullptr) {
274 if (!EventImpl->getCommand() ||
277 std::vector<Command *> AuxCmds;
288 if (EventImpl->getWorkerQueue() == WorkerQueue &&
292 RetPiEvents.push_back(EventImpl->getHandleRef());
299 return (
MType == CommandType::RUN_CG) &&
300 ((
static_cast<const ExecCGCommand *
>(
this))->getCG().getType() ==
301 CG::CGTYPE::CodeplayHostTask);
305 if ((
MType != CommandType::RUN_CG)) {
309 return (
CG.
getType() == CG::CGTYPE::Kernel) &&
315 for (
auto &EventImpl : EventImpls) {
316 EventImpl->flushIfNeeded(Queue);
322 std::vector<interop_handle::ReqToMem> MReqToMem;
325 std::map<const PluginPtr, std::vector<EventImplPtr>>
326 RequiredEventsPerPlugin;
329 const PluginPtr &Plugin = Event->getPlugin();
330 RequiredEventsPerPlugin[Plugin].push_back(Event);
338 for (
auto &PluginWithEvents : RequiredEventsPerPlugin) {
339 std::vector<sycl::detail::pi::PiEvent> RawEvents =
346 HostTask.MQueue->reportAsyncException(std::current_exception());
350 HostTask.MQueue->reportAsyncException(std::current_exception());
351 return PI_ERROR_UNKNOWN;
358 Event->waitInternal();
366 std::vector<interop_handle::ReqToMem> ReqToMem)
367 : MThisCmd{ThisCmd}, MReqToMem(
std::move(ReqToMem)) {}
370 assert(MThisCmd->
getCG().
getType() == CG::CGTYPE::CodeplayHostTask);
374 #ifdef XPTI_ENABLE_INSTRUMENTATION
379 std::unique_ptr<detail::tls_code_loc_t> AsyncCodeLocationPtr;
380 if (xptiTraceEnabled() && !CurrentCodeLocationValid()) {
381 AsyncCodeLocationPtr.reset(
387 if (WaitResult != PI_SUCCESS) {
388 std::exception_ptr EPtr = std::make_exception_ptr(sycl::runtime_error(
389 std::string(
"Couldn't wait for host-task's dependencies"),
391 HostTask.MQueue->reportAsyncException(EPtr);
400 if (
HostTask.MHostTask->isInteropTask()) {
402 HostTask.MQueue->getDeviceImplPtr(),
403 HostTask.MQueue->getContextImplPtr()};
405 HostTask.MHostTask->call(MThisCmd->
MEvent->getHostProfilingInfo(), IH);
407 HostTask.MHostTask->call(MThisCmd->
MEvent->getHostProfilingInfo());
409 auto CurrentException = std::current_exception();
410 #ifdef XPTI_ENABLE_INSTRUMENTATION
414 if (xptiTraceEnabled()) {
416 rethrow_exception(CurrentException);
419 }
catch (
const std::exception &StdException) {
423 "Host task lambda thrown non standard exception");
427 HostTask.MQueue->reportAsyncException(CurrentException);
432 #ifdef XPTI_ENABLE_INSTRUMENTATION
435 AsyncCodeLocationPtr.reset();
443 auto CurrentException = std::current_exception();
444 HostTask.MQueue->reportAsyncException(CurrentException);
451 HostEvent->waitInternal();
455 std::vector<EventImplPtr> &EventImpls,
457 if (!EventImpls.empty()) {
458 if (Queue->is_host()) {
472 std::map<context_impl *, std::vector<EventImplPtr>>
473 RequiredEventsPerContext;
477 assert(Context.get() &&
478 "Only non-host events are expected to be waited for here");
479 RequiredEventsPerContext[Context.get()].push_back(Event);
482 for (
auto &CtxWithEvents : RequiredEventsPerContext) {
483 std::vector<sycl::detail::pi::PiEvent> RawEvents =
486 RawEvents.size(), RawEvents.data());
491 assert(Event->getContextImpl().get() &&
492 "Only non-host events are expected to be waited for here");
495 std::vector<sycl::detail::pi::PiEvent> RawEvents =
498 const PluginPtr &Plugin = Queue->getPlugin();
501 MEvent->setHostEnqueueTime();
503 Queue->getHandleRef(), RawEvents.size(), &RawEvents[0], &Event);
514 const std::vector<sycl::detail::pi::PiExtSyncPoint> &SyncPoints)
515 : MQueue(
std::move(Queue)),
517 MPreparedDepsEvents(MEvent->getPreparedDepsEvents()),
518 MPreparedHostDepsEvents(MEvent->getPreparedHostDepsEvents()), MType(Type),
519 MCommandBuffer(CommandBuffer), MSyncPointDeps(SyncPoints) {
525 MEvent->setStateIncomplete();
528 #ifdef XPTI_ENABLE_INSTRUMENTATION
529 if (!xptiTraceEnabled())
537 #ifdef XPTI_ENABLE_INSTRUMENTATION
553 Command *Cmd,
void *ObjAddr,
bool IsCommand,
554 std::optional<access::mode> AccMode) {
555 #ifdef XPTI_ENABLE_INSTRUMENTATION
558 constexpr uint16_t NotificationTraceType = xpti::trace_edge_create;
559 if (!(xptiCheckTraceEnabled(
MStreamID, NotificationTraceType) &&
565 xpti::utils::StringHelper SH;
566 std::string AddressStr = SH.addressAsString<
void *>(ObjAddr);
568 std::string TypeString = SH.nameWithAddressString(Prefix, AddressStr);
571 xpti::payload_t Payload(TypeString.c_str(),
MAddress);
572 uint64_t EdgeInstanceNo;
574 xptiMakeEvent(TypeString.c_str(), &Payload, xpti::trace_graph_event,
575 xpti_at::active, &EdgeInstanceNo);
577 xpti_td *SrcEvent =
static_cast<xpti_td *
>(Cmd->
MTraceEvent);
578 xpti_td *TgtEvent =
static_cast<xpti_td *
>(
MTraceEvent);
579 EdgeEvent->source_id = SrcEvent->unique_id;
580 EdgeEvent->target_id = TgtEvent->unique_id;
582 xpti::addMetadata(EdgeEvent,
"access_mode",
583 static_cast<int>(AccMode.value()));
584 xpti::addMetadata(EdgeEvent,
"memory_object",
585 reinterpret_cast<size_t>(ObjAddr));
587 xpti::addMetadata(EdgeEvent,
"event",
reinterpret_cast<size_t>(ObjAddr));
589 xptiNotifySubscribers(
MStreamID, NotificationTraceType,
590 detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
605 #ifdef XPTI_ENABLE_INSTRUMENTATION
618 xpti::utils::StringHelper SH;
619 std::string AddressStr =
625 std::string NodeName = SH.nameWithAddressString(
"virtual_node", AddressStr);
627 xpti::payload_t VNPayload(NodeName.c_str(),
MAddress);
628 uint64_t VNodeInstanceNo;
630 xptiMakeEvent(NodeName.c_str(), &VNPayload, xpti::trace_graph_event,
631 xpti_at::active, &VNodeInstanceNo);
633 xpti::addMetadata(NodeEvent,
"kernel_name", NodeName);
634 xptiNotifySubscribers(
MStreamID, xpti::trace_node_create,
635 detail::GSYCLGraphEvent, NodeEvent, VNodeInstanceNo,
638 std::string EdgeName = SH.nameWithAddressString(
"Event", AddressStr);
639 xpti::payload_t EdgePayload(EdgeName.c_str(),
MAddress);
640 uint64_t EdgeInstanceNo;
642 xptiMakeEvent(EdgeName.c_str(), &EdgePayload, xpti::trace_graph_event,
643 xpti_at::active, &EdgeInstanceNo);
644 if (EdgeEvent && NodeEvent) {
647 xpti_td *TgtEvent =
static_cast<xpti_td *
>(
MTraceEvent);
648 EdgeEvent->source_id = NodeEvent->unique_id;
649 EdgeEvent->target_id = TgtEvent->unique_id;
650 xpti::addMetadata(EdgeEvent,
"event",
651 reinterpret_cast<size_t>(PiEventAddr));
652 xptiNotifySubscribers(
MStreamID, xpti::trace_edge_create,
653 detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
662 uint64_t CommandInstanceNo = 0;
663 #ifdef XPTI_ENABLE_INSTRUMENTATION
665 return CommandInstanceNo;
671 xpti::utils::StringHelper SH;
673 std::string CommandString =
676 xpti::payload_t p(CommandString.c_str(),
MAddress);
677 xpti_td *CmdTraceEvent =
678 xptiMakeEvent(CommandString.c_str(), &p, xpti::trace_graph_event,
679 xpti_at::active, &CommandInstanceNo);
690 return CommandInstanceNo;
694 #ifdef XPTI_ENABLE_INSTRUMENTATION
695 constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
699 xptiNotifySubscribers(
MStreamID, NotificationTraceType,
700 detail::GSYCLGraphEvent,
707 std::vector<Command *> &ToCleanUp) {
709 const ContextImplPtr &WorkerContext = WorkerQueue->getContextImplPtr();
717 bool PiEventExpected = (!DepEvent->is_host() && DepEvent->isInitialized());
718 if (
auto *DepCmd =
static_cast<Command *
>(DepEvent->getCommand()))
719 PiEventExpected &= DepCmd->producesPiEvent();
721 if (!PiEventExpected) {
728 Command *ConnectionCmd =
nullptr;
732 if (DepEventContext != WorkerContext && !WorkerContext->is_host()) {
738 return ConnectionCmd;
742 return MQueue->getContextImplPtr();
746 assert(
MWorkerQueue &&
"MWorkerQueue must not be nullptr");
760 Command *ConnectionCmd =
nullptr;
769 if (!ConnectionCmd) {
770 MDeps.push_back(NewDep);
775 #ifdef XPTI_ENABLE_INSTRUMENTATION
781 return ConnectionCmd;
785 std::vector<Command *> &ToCleanUp) {
786 #ifdef XPTI_ENABLE_INSTRUMENTATION
800 #ifdef XPTI_ENABLE_INSTRUMENTATION
801 emitInstrumentationGeneral(
803 xpti::trace_signal,
static_cast<const void *
>(PiEventAddr));
805 std::ignore = PiEventAddr;
809 #ifdef XPTI_ENABLE_INSTRUMENTATION
812 static_cast<const void *
>(Txt));
820 std::vector<Command *> &ToCleanUp) {
821 #ifdef XPTI_ENABLE_INSTRUMENTATION
825 std::unique_ptr<detail::tls_code_loc_t> AsyncCodeLocationPtr;
826 if (xptiTraceEnabled() && !CurrentCodeLocationValid()) {
827 AsyncCodeLocationPtr.reset(
843 #ifdef XPTI_ENABLE_INSTRUMENTATION
847 std::string Info =
"enqueue.barrier[";
855 #ifdef XPTI_ENABLE_INSTRUMENTATION
866 #ifdef XPTI_ENABLE_INSTRUMENTATION
882 if (PI_SUCCESS != Res)
887 (
MEvent->is_host() ||
MEvent->getHandleRef() ==
nullptr))
898 ToCleanUp.push_back(
this);
904 #ifdef XPTI_ENABLE_INSTRUMENTATION
911 #ifdef XPTI_ENABLE_INSTRUMENTATION
912 assert(
MType == CommandType::RELEASE &&
"Expected release command");
918 xpti_td *TgtTraceEvent =
static_cast<xpti_td *
>(
MTraceEvent);
923 for (
auto &Item : DepList) {
924 if (Item->MTraceEvent && Item->MAddress ==
MAddress) {
925 xpti::utils::StringHelper SH;
926 std::string AddressStr = SH.addressAsString<
void *>(
MAddress);
927 std::string TypeString =
928 "Edge:" + SH.nameWithAddressString(commandToName(
MType), AddressStr);
932 xpti::payload_t p(TypeString.c_str(),
MAddress);
933 uint64_t EdgeInstanceNo;
935 xptiMakeEvent(TypeString.c_str(), &p, xpti::trace_graph_event,
936 xpti_at::active, &EdgeInstanceNo);
938 xpti_td *SrcTraceEvent =
static_cast<xpti_td *
>(Item->MTraceEvent);
939 EdgeEvent->target_id = TgtTraceEvent->unique_id;
940 EdgeEvent->source_id = SrcTraceEvent->unique_id;
941 xpti::addMetadata(EdgeEvent,
"memory_object",
942 reinterpret_cast<size_t>(
MAddress));
943 xptiNotifySubscribers(
MStreamID, xpti::trace_edge_create,
944 detail::GSYCLGraphEvent, EdgeEvent,
945 EdgeInstanceNo,
nullptr);
955 return "A Buffer is locked by the host accessor";
957 return "Blocked by host task";
960 return "Unknown block reason";
964 #ifdef XPTI_ENABLE_INSTRUMENTATION
965 if (!xptiTraceEnabled())
969 auto TData = Tls.
query();
970 if (TData.fileName())
972 if (TData.functionName())
977 (int)TData.lineNumber(), (int)TData.columnNumber()};
985 :
Command(Type, Queue), MLinkedAllocaCmd(LinkedAllocaCmd),
986 MIsLeaderAlloca(nullptr == LinkedAllocaCmd), MIsConst(IsConst),
987 MRequirement(
std::move(Req)), MReleaseCmd(Queue, this) {
993 #ifdef XPTI_ENABLE_INSTRUMENTATION
1002 xpti_td *TE =
static_cast<xpti_td *
>(
MTraceEvent);
1003 xpti::addMetadata(TE,
"sycl_device", deviceToID(
MQueue->get_device()));
1004 xpti::addMetadata(TE,
"sycl_device_type",
1006 xpti::addMetadata(TE,
"sycl_device_name",
1008 xpti::addMetadata(TE,
"memory_object",
reinterpret_cast<size_t>(
MAddress));
1011 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1024 bool InitFromUserData,
1027 LinkedAllocaCmd, IsConst),
1028 MInitFromUserData(InitFromUserData) {
1033 std::vector<Command *> ToCleanUp;
1036 assert(ConnectionCmd ==
nullptr);
1037 assert(ToCleanUp.empty());
1038 (void)ConnectionCmd;
1042 #ifdef XPTI_ENABLE_INSTRUMENTATION
1053 pi_int32 AllocaCommand::enqueueImp() {
1059 void *HostPtr =
nullptr;
1074 std::move(EventImpls), Event);
1080 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#FFD28A\", label=\"";
1082 Stream <<
"ID = " <<
this <<
"\\n";
1086 Stream <<
"\"];" << std::endl;
1088 for (
const auto &Dep :
MDeps) {
1089 if (Dep.MDepCommand ==
nullptr)
1091 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1092 <<
" [ label = \"Access mode: "
1094 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1101 std::vector<Command *> &ToEnqueue,
1102 std::vector<Command *> &ToCleanUp)
1106 MParentAlloca(ParentAlloca) {
1114 ToEnqueue.push_back(ConnectionCmd);
1118 #ifdef XPTI_ENABLE_INSTRUMENTATION
1125 xpti_td *TE =
static_cast<xpti_td *
>(
MTraceEvent);
1127 xpti::addMetadata(TE,
"access_range_start",
1129 xpti::addMetadata(TE,
"access_range_end",
1131 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1143 return static_cast<void *
>(
1150 pi_int32 AllocaSubBufCommand::enqueueImp() {
1166 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#FFD28A\", label=\"";
1168 Stream <<
"ID = " <<
this <<
"\\n";
1174 Stream <<
"\"];" << std::endl;
1176 for (
const auto &Dep :
MDeps) {
1177 if (Dep.MDepCommand ==
nullptr)
1179 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1180 <<
" [ label = \"Access mode: "
1182 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1193 #ifdef XPTI_ENABLE_INSTRUMENTATION
1202 xpti_td *TE =
static_cast<xpti_td *
>(
MTraceEvent);
1203 xpti::addMetadata(TE,
"sycl_device", deviceToID(
MQueue->get_device()));
1204 xpti::addMetadata(TE,
"sycl_device_type",
1206 xpti::addMetadata(TE,
"sycl_device_name",
1208 xpti::addMetadata(TE,
"allocation_type",
1209 commandToName(MAllocaCmd->
getType()));
1212 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1219 pi_int32 ReleaseCommand::enqueueImp() {
1222 std::vector<sycl::detail::pi::PiEvent> RawEvents =
getPiEvents(EventImpls);
1223 bool SkipRelease =
false;
1229 const bool CurAllocaIsHost = MAllocaCmd->
getQueue()->is_host();
1230 bool NeedUnmap =
false;
1244 NeedUnmap |= CurAllocaIsHost == MAllocaCmd->
MIsActive;
1253 UnmapEventImpl->setContextImpl(Queue->getContextImplPtr());
1254 UnmapEventImpl->setStateIncomplete();
1257 void *Src = CurAllocaIsHost
1261 void *Dst = !CurAllocaIsHost
1266 RawEvents, UnmapEvent);
1270 EventImpls.push_back(UnmapEventImpl);
1284 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#FF827A\", label=\"";
1286 Stream <<
"ID = " <<
this <<
" ; ";
1288 Stream <<
" Alloca : " << MAllocaCmd <<
"\\n";
1289 Stream <<
" MemObj : " << MAllocaCmd->
getSYCLMemObj() <<
"\\n";
1290 Stream <<
"\"];" << std::endl;
1292 for (
const auto &Dep :
MDeps) {
1293 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1294 <<
" [ label = \"Access mode: "
1296 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1311 MSrcAllocaCmd(SrcAllocaCmd), MSrcReq(
std::move(Req)), MDstPtr(DstPtr),
1317 #ifdef XPTI_ENABLE_INSTRUMENTATION
1326 xpti_td *TE =
static_cast<xpti_td *
>(
MTraceEvent);
1327 xpti::addMetadata(TE,
"sycl_device", deviceToID(
MQueue->get_device()));
1328 xpti::addMetadata(TE,
"sycl_device_type",
1330 xpti::addMetadata(TE,
"sycl_device_name",
1332 xpti::addMetadata(TE,
"memory_object",
reinterpret_cast<size_t>(
MAddress));
1335 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1342 pi_int32 MapMemObject::enqueueImp() {
1345 std::vector<sycl::detail::pi::PiEvent> RawEvents =
getPiEvents(EventImpls);
1358 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#77AFFF\", label=\"";
1360 Stream <<
"ID = " <<
this <<
" ; ";
1363 Stream <<
"\"];" << std::endl;
1365 for (
const auto &Dep :
MDeps) {
1366 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1367 <<
" [ label = \"Access mode: "
1369 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1377 MDstAllocaCmd(DstAllocaCmd), MDstReq(
std::move(Req)), MSrcPtr(SrcPtr) {
1382 #ifdef XPTI_ENABLE_INSTRUMENTATION
1391 xpti_td *TE =
static_cast<xpti_td *
>(
MTraceEvent);
1392 xpti::addMetadata(TE,
"sycl_device", deviceToID(
MQueue->get_device()));
1393 xpti::addMetadata(TE,
"sycl_device_type",
1395 xpti::addMetadata(TE,
"sycl_device_name",
1397 xpti::addMetadata(TE,
"memory_object",
reinterpret_cast<size_t>(
MAddress));
1400 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1423 return MQueue->getDeviceImplPtr()->getBackend() !=
1425 MEvent->getHandleRef() !=
nullptr;
1428 pi_int32 UnMapMemObject::enqueueImp() {
1431 std::vector<sycl::detail::pi::PiEvent> RawEvents =
getPiEvents(EventImpls);
1437 std::move(RawEvents), Event);
1443 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#EBC40F\", label=\"";
1445 Stream <<
"ID = " <<
this <<
" ; ";
1448 Stream <<
"\"];" << std::endl;
1450 for (
const auto &Dep :
MDeps) {
1451 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1452 <<
" [ label = \"Access mode: "
1454 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1465 MSrcQueue(SrcQueue), MSrcReq(
std::move(SrcReq)),
1466 MSrcAllocaCmd(SrcAllocaCmd), MDstReq(
std::move(DstReq)),
1467 MDstAllocaCmd(DstAllocaCmd) {
1468 if (!MSrcQueue->is_host()) {
1469 MEvent->setContextImpl(MSrcQueue->getContextImplPtr());
1479 #ifdef XPTI_ENABLE_INSTRUMENTATION
1488 xpti_td *CmdTraceEvent =
static_cast<xpti_td *
>(
MTraceEvent);
1489 xpti::addMetadata(CmdTraceEvent,
"sycl_device",
1490 deviceToID(
MQueue->get_device()));
1491 xpti::addMetadata(CmdTraceEvent,
"sycl_device_type",
1493 xpti::addMetadata(CmdTraceEvent,
"sycl_device_name",
1495 xpti::addMetadata(CmdTraceEvent,
"memory_object",
1496 reinterpret_cast<size_t>(
MAddress));
1497 xpti::addMetadata(CmdTraceEvent,
"copy_from",
1498 reinterpret_cast<size_t>(
1501 CmdTraceEvent,
"copy_to",
1505 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1532 return MQueue->is_host() ||
1533 MQueue->getDeviceImplPtr()->getBackend() !=
1535 MEvent->getHandleRef() !=
nullptr;
1538 pi_int32 MemCpyCommand::enqueueImp() {
1558 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#C7EB15\" label=\"";
1560 Stream <<
"ID = " <<
this <<
" ; ";
1562 Stream <<
"From: " << MSrcAllocaCmd <<
" is host: " << MSrcQueue->is_host()
1564 Stream <<
"To: " << MDstAllocaCmd <<
" is host: " <<
MQueue->is_host()
1567 Stream <<
"\"];" << std::endl;
1569 for (
const auto &Dep :
MDeps) {
1570 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1571 <<
" [ label = \"Access mode: "
1573 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1580 if (Dep.MDepRequirement == Req)
1581 return Dep.MAllocaCmd;
1589 std::vector<std::shared_ptr<const void>>
1592 return ((
CGExecKernel *)MCommandGroup.get())->getAuxiliaryResources();
1601 pi_int32 UpdateHostRequirementCommand::enqueueImp() {
1607 assert(MSrcAllocaCmd &&
"Expected valid alloca command");
1608 assert(MSrcAllocaCmd->
getMemAllocation() &&
"Expected valid source pointer");
1609 assert(MDstPtr &&
"Expected valid target pointer");
1616 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#f1337f\", label=\"";
1618 Stream <<
"ID = " <<
this <<
"\\n";
1620 bool IsReqOnBuffer =
1622 Stream <<
"TYPE: " << (IsReqOnBuffer ?
"Buffer" :
"Image") <<
"\\n";
1624 Stream <<
"Is sub buffer: " << std::boolalpha << MDstReq.
MIsSubBuffer
1627 Stream <<
"\"];" << std::endl;
1629 for (
const auto &Dep :
MDeps) {
1630 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1631 <<
" [ label = \"Access mode: "
1633 <<
"MemObj: " << Dep.MAllocaCmd->getSYCLMemObj() <<
" \" ]"
1644 MSrcQueue(SrcQueue), MSrcReq(
std::move(SrcReq)),
1645 MSrcAllocaCmd(SrcAllocaCmd), MDstReq(
std::move(DstReq)), MDstPtr(DstPtr) {
1646 if (!MSrcQueue->is_host()) {
1647 MEvent->setContextImpl(MSrcQueue->getContextImplPtr());
1657 #ifdef XPTI_ENABLE_INSTRUMENTATION
1666 xpti_td *CmdTraceEvent =
static_cast<xpti_td *
>(
MTraceEvent);
1667 xpti::addMetadata(CmdTraceEvent,
"sycl_device",
1668 deviceToID(
MQueue->get_device()));
1669 xpti::addMetadata(CmdTraceEvent,
"sycl_device_type",
1671 xpti::addMetadata(CmdTraceEvent,
"sycl_device_name",
1673 xpti::addMetadata(CmdTraceEvent,
"memory_object",
1674 reinterpret_cast<size_t>(
MAddress));
1675 xpti::addMetadata(CmdTraceEvent,
"copy_from",
1676 reinterpret_cast<size_t>(
1679 CmdTraceEvent,
"copy_to",
1683 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1694 pi_int32 MemCpyCommandHost::enqueueImp() {
1698 std::vector<sycl::detail::pi::PiEvent> RawEvents =
getPiEvents(EventImpls);
1727 pi_int32 EmptyCommand::enqueueImp() {
1737 MRequirements.emplace_back(ReqRef);
1738 const Requirement *
const StoredReq = &MRequirements.back();
1742 std::vector<Command *> ToCleanUp;
1744 assert(Cmd ==
nullptr &&
"Conection command should be null for EmptyCommand");
1745 assert(ToCleanUp.empty() &&
"addDep should add a command for cleanup only if "
1746 "there's a connection command");
1751 #ifdef XPTI_ENABLE_INSTRUMENTATION
1756 if (MRequirements.empty())
1765 xpti_td *CmdTraceEvent =
static_cast<xpti_td *
>(
MTraceEvent);
1766 xpti::addMetadata(CmdTraceEvent,
"sycl_device",
1767 deviceToID(
MQueue->get_device()));
1768 xpti::addMetadata(CmdTraceEvent,
"sycl_device_type",
1770 xpti::addMetadata(CmdTraceEvent,
"sycl_device_name",
1772 xpti::addMetadata(CmdTraceEvent,
"memory_object",
1773 reinterpret_cast<size_t>(
MAddress));
1776 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1784 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#8d8f29\", label=\"";
1786 Stream <<
"ID = " <<
this <<
"\\n";
1787 Stream <<
"EMPTY NODE"
1790 Stream <<
"\"];" << std::endl;
1792 for (
const auto &Dep :
MDeps) {
1793 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1794 <<
" [ label = \"Access mode: "
1796 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1804 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#B6A2EB\", label=\"";
1806 Stream <<
"ID = " <<
this <<
"\\n";
1809 Stream <<
"\"];" << std::endl;
1811 for (
const auto &Dep :
MDeps) {
1812 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1813 <<
" [ label = \"Access mode: "
1815 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1824 MSrcAllocaCmd(SrcAllocaCmd), MDstReq(
std::move(Req)), MDstPtr(DstPtr) {
1830 #ifdef XPTI_ENABLE_INSTRUMENTATION
1839 xpti_td *CmdTraceEvent =
static_cast<xpti_td *
>(
MTraceEvent);
1840 xpti::addMetadata(CmdTraceEvent,
"sycl_device",
1841 deviceToID(
MQueue->get_device()));
1842 xpti::addMetadata(CmdTraceEvent,
"sycl_device_type",
1844 xpti::addMetadata(CmdTraceEvent,
"sycl_device_name",
1846 xpti::addMetadata(CmdTraceEvent,
"memory_object",
1847 reinterpret_cast<size_t>(
MAddress));
1850 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1863 return "update_host";
1869 return "copy acc to acc";
1872 return "copy acc to ptr";
1875 return "copy ptr to acc";
1880 return "barrier waitlist";
1888 return "prefetch usm";
1894 return "copy 2d usm";
1897 return "fill 2d usm";
1900 return "advise usm";
1902 return "memset 2d usm";
1905 return "copy to device_global";
1908 return "copy from device_global";
1911 return "read_write host pipe";
1913 return "exec command buffer";
1915 return "copy image";
1917 return "semaphore wait";
1919 return "semaphore signal";
1927 std::unique_ptr<detail::CG> CommandGroup,
QueueImplPtr Queue,
1929 const std::vector<sycl::detail::pi::PiExtSyncPoint> &Dependencies)
1932 MCommandGroup(
std::move(CommandGroup)) {
1934 MEvent->setSubmittedQueue(
1941 #ifdef XPTI_ENABLE_INSTRUMENTATION
1942 std::string instrumentationGetKernelName(
1943 const std::shared_ptr<detail::kernel_impl> &SyclKernel,
1944 const std::string &FunctionName,
const std::string &SyclKernelName,
1945 void *&Address, std::optional<bool> &FromSource) {
1946 std::string KernelName;
1947 if (SyclKernel && SyclKernel->isCreatedFromSource()) {
1949 pi_kernel KernelHandle = SyclKernel->getHandleRef();
1950 Address = KernelHandle;
1951 KernelName = FunctionName;
1959 void instrumentationAddExtraKernelMetadata(
1960 xpti_td *&CmdTraceEvent,
const NDRDescT &NDRDesc,
1962 const std::string &KernelName,
1963 const std::shared_ptr<detail::kernel_impl> &SyclKernel,
1965 std::vector<ArgDesc> &CGArgs)
1968 std::vector<ArgDesc> Args;
1970 auto FilterArgs = [&Args](detail::ArgDesc &Arg,
int NextTrueIndex) {
1971 Args.push_back({Arg.MType, Arg.MPtr, Arg.MSize, NextTrueIndex});
1975 std::mutex *KernelMutex =
nullptr;
1978 std::shared_ptr<kernel_impl> SyclKernelImpl;
1979 std::shared_ptr<device_image_impl> DeviceImageImpl;
1987 kernel_id KernelID =
1991 std::shared_ptr<kernel_impl> KernelImpl =
1994 EliminatedArgMask = KernelImpl->getKernelArgMask();
1995 Program = KernelImpl->getDeviceImage()->get_program_ref();
1996 }
else if (
nullptr != SyclKernel) {
1997 Program = SyclKernel->getProgramRef();
1998 if (!SyclKernel->isCreatedFromSource())
1999 EliminatedArgMask = SyclKernel->getKernelArgMask();
2001 std::tie(Kernel, KernelMutex, EliminatedArgMask, Program) =
2003 Queue->getContextImplPtr(), Queue->getDeviceImplPtr(), KernelName);
2008 xpti::offload_kernel_enqueue_data_t KernelData{
2009 {NDRDesc.GlobalSize[0], NDRDesc.GlobalSize[1], NDRDesc.GlobalSize[2]},
2010 {NDRDesc.LocalSize[0], NDRDesc.LocalSize[1], NDRDesc.LocalSize[2]},
2011 {NDRDesc.GlobalOffset[0], NDRDesc.GlobalOffset[1],
2012 NDRDesc.GlobalOffset[2]},
2014 xpti::addMetadata(CmdTraceEvent,
"enqueue_kernel_data", KernelData);
2015 for (
size_t i = 0; i < Args.size(); i++) {
2016 std::string Prefix(
"arg");
2017 xpti::offload_kernel_arg_data_t
arg{(int)Args[i].MType, Args[i].MPtr,
2018 Args[i].MSize, Args[i].MIndex};
2019 xpti::addMetadata(CmdTraceEvent, Prefix + std::to_string(i),
arg);
2023 void instrumentationFillCommonData(
const std::string &KernelName,
2024 const std::string &FileName, uint64_t Line,
2025 uint64_t Column,
const void *
const Address,
2027 std::optional<bool> &FromSource,
2028 uint64_t &OutInstanceID,
2029 xpti_td *&OutTraceEvent) {
2036 bool HasSourceInfo =
false;
2037 xpti::payload_t Payload;
2038 if (!FileName.empty()) {
2040 Payload = xpti::payload_t(KernelName.c_str(), FileName.c_str(), Line,
2042 HasSourceInfo =
true;
2043 }
else if (Address) {
2045 Payload = xpti::payload_t(KernelName.c_str(), Address);
2049 Payload = xpti::payload_t(KernelName.c_str());
2051 uint64_t CGKernelInstanceNo;
2053 xpti_td *CmdTraceEvent =
2054 xptiMakeEvent(
"ExecCG", &Payload, xpti::trace_graph_event,
2055 xpti::trace_activity_type_t::active, &CGKernelInstanceNo);
2056 if (CmdTraceEvent) {
2057 OutInstanceID = CGKernelInstanceNo;
2058 OutTraceEvent = CmdTraceEvent;
2062 if (CGKernelInstanceNo > 1)
2065 xpti::addMetadata(CmdTraceEvent,
"sycl_device",
2066 deviceToID(Queue->get_device()));
2067 xpti::addMetadata(CmdTraceEvent,
"sycl_device_type",
2069 xpti::addMetadata(CmdTraceEvent,
"sycl_device_name",
2071 if (!KernelName.empty()) {
2072 xpti::addMetadata(CmdTraceEvent,
"kernel_name", KernelName);
2074 if (FromSource.has_value()) {
2075 xpti::addMetadata(CmdTraceEvent,
"from_source", FromSource.value());
2077 if (HasSourceInfo) {
2078 xpti::addMetadata(CmdTraceEvent,
"sym_function_name", KernelName);
2079 xpti::addMetadata(CmdTraceEvent,
"sym_source_file_name", FileName);
2080 xpti::addMetadata(CmdTraceEvent,
"sym_line_no",
static_cast<int>(Line));
2081 xpti::addMetadata(CmdTraceEvent,
"sym_column_no",
2082 static_cast<int>(Column));
2091 #ifdef XPTI_ENABLE_INSTRUMENTATION
2092 std::pair<xpti_td *, uint64_t> emitKernelInstrumentationData(
2093 int32_t StreamID,
const std::shared_ptr<detail::kernel_impl> &SyclKernel,
2094 const detail::code_location &CodeLoc,
const std::string &SyclKernelName,
2097 std::vector<ArgDesc> &CGArgs) {
2099 auto XptiObjects = std::make_pair<xpti_td *, uint64_t>(
nullptr, -1);
2100 constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
2101 if (!xptiCheckTraceEnabled(StreamID))
2104 void *Address =
nullptr;
2105 std::optional<bool> FromSource;
2106 std::string KernelName = instrumentationGetKernelName(
2107 SyclKernel, std::string(CodeLoc.functionName()), SyclKernelName, Address,
2110 auto &[CmdTraceEvent, InstanceID] = XptiObjects;
2112 std::string FileName =
2113 CodeLoc.fileName() ? CodeLoc.fileName() : std::string();
2114 instrumentationFillCommonData(KernelName, FileName, CodeLoc.lineNumber(),
2115 CodeLoc.columnNumber(), Address, Queue,
2116 FromSource, InstanceID, CmdTraceEvent);
2118 if (CmdTraceEvent) {
2120 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
2121 Queue->getQueueID());
2123 instrumentationAddExtraKernelMetadata(CmdTraceEvent, NDRDesc,
2125 SyclKernel, Queue, CGArgs);
2127 xptiNotifySubscribers(
2128 StreamID, NotificationTraceType, detail::GSYCLGraphEvent, CmdTraceEvent,
2130 static_cast<const void *
>(
2131 commandToNodeType(Command::CommandType::RUN_CG).c_str()));
2139 #ifdef XPTI_ENABLE_INSTRUMENTATION
2140 constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
2144 std::string KernelName;
2145 std::optional<bool> FromSource;
2146 switch (MCommandGroup->getType()) {
2150 KernelName = instrumentationGetKernelName(
2151 KernelCG->MSyclKernel, MCommandGroup->MFunctionName,
2152 KernelCG->getKernelName(),
MAddress, FromSource);
2159 xpti_td *CmdTraceEvent =
nullptr;
2160 instrumentationFillCommonData(KernelName, MCommandGroup->MFileName,
2161 MCommandGroup->MLine, MCommandGroup->MColumn,
2165 if (CmdTraceEvent) {
2166 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
2172 instrumentationAddExtraKernelMetadata(
2173 CmdTraceEvent, KernelCG->MNDRDesc, KernelCG->getKernelBundle(),
2174 KernelCG->MKernelName, KernelCG->MSyclKernel,
MQueue,
2178 xptiNotifySubscribers(
2179 MStreamID, NotificationTraceType, detail::GSYCLGraphEvent,
2181 static_cast<const void *
>(commandToNodeType(
MType).c_str()));
2187 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#AFFF82\", label=\"";
2189 Stream <<
"ID = " <<
this <<
"\\n";
2192 switch (MCommandGroup->getType()) {
2196 Stream <<
"Kernel name: ";
2197 if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource())
2198 Stream <<
"created from source";
2209 Stream <<
"\"];" << std::endl;
2211 for (
const auto &Dep :
MDeps) {
2212 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
2213 <<
" [ label = \"Access mode: "
2215 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
2244 sycl::info::kernel_device_specific::compile_work_group_size>(
2247 if (WGSize[0] == 0) {
2271 switch (AccessorMode) {
2284 const std::shared_ptr<device_image_impl> &DeviceImageImpl,
2285 const std::function<
void *(
Requirement *Req)> &getMemAllocationFunc,
2287 size_t NextTrueIndex) {
2288 switch (Arg.
MType) {
2299 getMemAllocationFunc
2317 &MemObjData, &MemArg);
2327 sampler *SamplerPtr = (sampler *)Arg.
MPtr;
2343 "SYCL2020 specialization constants are not yet supported on host "
2347 assert(DeviceImageImpl !=
nullptr);
2349 DeviceImageImpl->get_spec_const_buffer_ref();
2352 SpecConstsBuffer ? &SpecConstsBuffer :
nullptr;
2358 Kernel, NextTrueIndex, &MemObjData, SpecConstsBufferArg);
2363 "Invalid kernel param kind " +
2371 const std::shared_ptr<device_image_impl> &DeviceImageImpl,
2373 std::vector<sycl::detail::pi::PiEvent> &RawEvents,
2376 const std::function<
void *(
Requirement *Req)> &getMemAllocationFunc,
2377 bool IsCooperative) {
2378 const PluginPtr &Plugin = Queue->getPlugin();
2380 auto setFunc = [&Plugin, Kernel, &DeviceImageImpl, &getMemAllocationFunc,
2383 Queue->get_context(), Queue->is_host(), Arg,
2392 const bool HasLocalSize = (NDRDesc.
LocalSize[0] != 0);
2396 size_t RequiredWGSize[3] = {0, 0, 0};
2397 size_t *LocalSize =
nullptr;
2403 Kernel, Queue->getDeviceImplPtr()->getHandleRef(),
2405 RequiredWGSize,
nullptr);
2407 const bool EnforcedLocalSize =
2408 (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
2409 RequiredWGSize[2] != 0);
2410 if (EnforcedLocalSize)
2411 LocalSize = RequiredWGSize;
2413 if (OutEventImpl !=
nullptr)
2414 OutEventImpl->setHostEnqueueTime();
2417 if (IsCooperative) {
2424 &NDRDesc.
GlobalSize[0], LocalSize, RawEvents.size(),
2425 RawEvents.empty() ? nullptr : &RawEvents[0],
2426 OutEventImpl ? &OutEventImpl->getHandleRef() :
nullptr);
2432 void **CastedBlob = (
void **)Blob;
2434 std::vector<Requirement *> *Reqs =
2435 static_cast<std::vector<Requirement *> *
>(CastedBlob[0]);
2437 std::shared_ptr<HostKernelBase> *
HostKernel =
2438 static_cast<std::shared_ptr<HostKernelBase> *
>(CastedBlob[1]);
2443 void **NextArg = CastedBlob + 3;
2445 Req->
MData = *(NextArg++);
2447 (*HostKernel)->call(*NDRDesc,
nullptr);
2459 std::vector<sycl::detail::pi::PiExtSyncPoint> &SyncPoints,
2462 const std::function<
void *(
Requirement *Req)> &getMemAllocationFunc) {
2467 std::shared_ptr<kernel_impl> SyclKernelImpl =
nullptr;
2468 std::shared_ptr<device_image_impl> DeviceImageImpl =
nullptr;
2486 PiKernel = SyclKernelImpl->getHandleRef();
2487 DeviceImageImpl = SyclKernelImpl->getDeviceImage();
2488 PiProgram = DeviceImageImpl->get_program_ref();
2489 EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
2490 }
else if (Kernel !=
nullptr) {
2493 EliminatedArgMask = Kernel->getKernelArgMask();
2496 sycl::detail::ProgramManager::getInstance().getOrCreateKernel(
2497 ContextImpl, DeviceImpl, CommandGroup.
MKernelName);
2500 auto SetFunc = [&Plugin, &
PiKernel, &DeviceImageImpl, &Ctx,
2501 &getMemAllocationFunc](sycl::detail::ArgDesc &Arg,
2502 size_t NextTrueIndex) {
2504 getMemAllocationFunc, Ctx,
false, Arg,
2508 auto Args = CommandGroup.
MArgs;
2515 auto NDRDesc = CommandGroup.
MNDRDesc;
2519 size_t RequiredWGSize[3] = {0, 0, 0};
2520 size_t *LocalSize =
nullptr;
2526 PiKernel, DeviceImpl->getHandleRef(),
2531 const bool EnforcedLocalSize =
2532 (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
2533 RequiredWGSize[2] != 0);
2534 if (EnforcedLocalSize)
2535 LocalSize = RequiredWGSize;
2541 &NDRDesc.
GlobalSize[0], LocalSize, SyncPoints.size(),
2542 SyncPoints.size() ? SyncPoints.data() :
nullptr, OutSyncPoint,
2545 if (!SyclKernelImpl && !Kernel) {
2550 if (Res != pi_result::PI_SUCCESS) {
2562 const std::shared_ptr<detail::kernel_impl> &MSyclKernel,
2563 const std::string &KernelName,
2564 std::vector<sycl::detail::pi::PiEvent> &RawEvents,
2566 const std::function<
void *(
Requirement *Req)> &getMemAllocationFunc,
2568 const bool KernelIsCooperative) {
2571 auto ContextImpl = Queue->getContextImplPtr();
2572 auto DeviceImpl = Queue->getDeviceImplPtr();
2574 std::mutex *KernelMutex =
nullptr;
2578 std::shared_ptr<kernel_impl> SyclKernelImpl;
2579 std::shared_ptr<device_image_impl> DeviceImageImpl;
2594 Kernel = SyclKernelImpl->getHandleRef();
2595 DeviceImageImpl = SyclKernelImpl->getDeviceImage();
2597 Program = DeviceImageImpl->get_program_ref();
2599 EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
2600 KernelMutex = SyclKernelImpl->getCacheMutex();
2601 }
else if (
nullptr != MSyclKernel) {
2602 assert(MSyclKernel->get_info<info::kernel::context>() ==
2603 Queue->get_context());
2604 Kernel = MSyclKernel->getHandleRef();
2605 Program = MSyclKernel->getProgramRef();
2613 KernelMutex = &MSyclKernel->getNoncacheableEnqueueMutex();
2614 EliminatedArgMask = MSyclKernel->getKernelArgMask();
2616 std::tie(Kernel, KernelMutex, EliminatedArgMask, Program) =
2618 ContextImpl, DeviceImpl, KernelName, NDRDesc);
2622 std::vector<sycl::detail::pi::PiEvent> &EventsWaitList = RawEvents;
2625 std::vector<sycl::detail::pi::PiEvent> DeviceGlobalInitEvents =
2626 ContextImpl->initializeDeviceGlobals(Program, Queue);
2627 std::vector<sycl::detail::pi::PiEvent> EventsWithDeviceGlobalInits;
2628 if (!DeviceGlobalInitEvents.empty()) {
2629 EventsWithDeviceGlobalInits.reserve(RawEvents.size() +
2630 DeviceGlobalInitEvents.size());
2631 EventsWithDeviceGlobalInits.insert(EventsWithDeviceGlobalInits.end(),
2632 RawEvents.begin(), RawEvents.end());
2633 EventsWithDeviceGlobalInits.insert(EventsWithDeviceGlobalInits.end(),
2634 DeviceGlobalInitEvents.begin(),
2635 DeviceGlobalInitEvents.end());
2636 EventsWaitList = EventsWithDeviceGlobalInits;
2644 using LockT = std::unique_lock<std::mutex>;
2645 auto Lock = KernelMutex ? LockT(*KernelMutex) : LockT();
2651 const PluginPtr &Plugin = Queue->getPlugin();
2658 NDRDesc, EventsWaitList, OutEventImpl,
2659 EliminatedArgMask, getMemAllocationFunc,
2660 KernelIsCooperative);
2662 const PluginPtr &Plugin = Queue->getPlugin();
2663 if (!SyclKernelImpl && !MSyclKernel) {
2668 if (PI_SUCCESS != Error) {
2671 const device_impl &DeviceImpl = *(Queue->getDeviceImplPtr());
2681 bool blocking,
void *ptr,
size_t size,
2682 std::vector<sycl::detail::pi::PiEvent> &RawEvents,
2688 device Device = Queue->get_device();
2690 std::optional<sycl::detail::pi::PiProgram> CachedProgram =
2691 ContextImpl->getProgramForHostPipe(Device, hostPipeEntry);
2693 Program = *CachedProgram;
2699 Queue->get_device());
2704 assert(Program &&
"Program for this hostpipe is not compiled.");
2707 const PluginPtr &Plugin = Queue->getPlugin();
2709 pi_queue pi_q = Queue->getHandleRef();
2712 auto OutEvent = OutEventImpl ? &OutEventImpl->getHandleRef() :
nullptr;
2713 if (OutEventImpl !=
nullptr)
2714 OutEventImpl->setHostEnqueueTime();
2718 pi_q, Program, PipeName.c_str(), blocking, ptr, size,
2719 RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0],
2725 pi_q, Program, PipeName.c_str(), blocking, ptr, size,
2726 RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0],
2732 pi_int32 ExecCGCommand::enqueueImpCommandBuffer() {
2741 std::vector<sycl::detail::pi::PiEvent> RawEvents =
getPiEvents(EventImpls);
2742 if (!RawEvents.empty()) {
2748 (
MQueue->supportsDiscardingPiEvents() &&
2749 MCommandGroup->getRequirements().size() == 0)
2751 : &
MEvent->getHandleRef();
2754 switch (MCommandGroup->getType()) {
2755 case CG::CGTYPE::Kernel: {
2756 CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
2758 auto getMemAllocationFunc = [
this](
Requirement *Req) {
2759 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2760 return AllocaCmd->getMemAllocation();
2765 bool KernelUsesAssert =
2766 !(ExecKernel->MSyclKernel && ExecKernel->MSyclKernel->isInterop()) &&
2768 ExecKernel->MKernelName);
2769 if (KernelUsesAssert) {
2770 Event = &
MEvent->getHandleRef();
2776 getMemAllocationFunc);
2777 MEvent->setSyncPoint(OutSyncPoint);
2778 MEvent->setCommandBufferCommand(OutCommand);
2781 case CG::CGTYPE::CopyUSM: {
2782 CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
2785 Copy->getLength(), Copy->getDst(),
MSyncPointDeps, &OutSyncPoint);
2786 MEvent->setSyncPoint(OutSyncPoint);
2789 case CG::CGTYPE::CopyAccToAcc: {
2790 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2794 AllocaCommandBase *AllocaCmdSrc = getAllocaForReq(ReqSrc);
2795 AllocaCommandBase *AllocaCmdDst = getAllocaForReq(ReqDst);
2799 AllocaCmdSrc->getSYCLMemObj(), AllocaCmdSrc->getMemAllocation(),
2800 ReqSrc->MDims, ReqSrc->MMemoryRange, ReqSrc->MAccessRange,
2801 ReqSrc->MOffset, ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(),
2802 ReqDst->MDims, ReqDst->MMemoryRange, ReqDst->MAccessRange,
2805 MEvent->setSyncPoint(OutSyncPoint);
2808 case CG::CGTYPE::CopyAccToPtr: {
2809 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2811 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2815 AllocaCmd->getMemAllocation(), Req->MDims, Req->MMemoryRange,
2816 Req->MAccessRange, Req->MOffset, Req->MElemSize, (
char *)Copy->getDst(),
2817 Req->MDims, Req->MAccessRange,
2820 MEvent->setSyncPoint(OutSyncPoint);
2823 case CG::CGTYPE::CopyPtrToAcc: {
2824 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2826 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2830 (
char *)Copy->getSrc(), Req->MDims, Req->MAccessRange,
2831 {0, 0, 0}, Req->MElemSize, AllocaCmd->getMemAllocation(),
2832 Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2834 MEvent->setSyncPoint(OutSyncPoint);
2837 case CG::CGTYPE::Fill: {
2838 CGFill *Fill = (CGFill *)MCommandGroup.get();
2840 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2844 AllocaCmd->getMemAllocation(), Fill->MPattern.size(),
2845 Fill->MPattern.data(), Req->MDims, Req->MMemoryRange, Req->MAccessRange,
2846 Req->MOffset, Req->MElemSize, std::move(
MSyncPointDeps), &OutSyncPoint);
2847 MEvent->setSyncPoint(OutSyncPoint);
2850 case CG::CGTYPE::FillUSM: {
2851 CGFillUSM *Fill = (CGFillUSM *)MCommandGroup.get();
2856 MEvent->setSyncPoint(OutSyncPoint);
2859 case CG::CGTYPE::PrefetchUSM: {
2860 CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get();
2863 Prefetch->getLength(), std::move(
MSyncPointDeps), &OutSyncPoint);
2864 MEvent->setSyncPoint(OutSyncPoint);
2867 case CG::CGTYPE::AdviseUSM: {
2868 CGAdviseUSM *Advise = (CGAdviseUSM *)MCommandGroup.get();
2871 Advise->getLength(), Advise->getAdvice(), std::move(
MSyncPointDeps),
2873 MEvent->setSyncPoint(OutSyncPoint);
2878 throw runtime_error(
"CG type not implemented for command buffers.",
2879 PI_ERROR_INVALID_OPERATION);
2883 pi_int32 ExecCGCommand::enqueueImp() {
2885 return enqueueImpCommandBuffer();
2887 return enqueueImpQueue();
2891 pi_int32 ExecCGCommand::enqueueImpQueue() {
2898 bool DiscardPiEvent = (
MQueue->supportsDiscardingPiEvents() &&
2899 MCommandGroup->getRequirements().size() == 0);
2901 DiscardPiEvent ? nullptr : &
MEvent->getHandleRef();
2904 switch (MCommandGroup->getType()) {
2906 case CG::CGTYPE::UpdateHost: {
2908 "Update host should be handled by the Scheduler. " +
2911 case CG::CGTYPE::CopyAccToPtr: {
2912 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2914 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2917 AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(),
MQueue,
2918 Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2919 Req->MElemSize, Copy->getDst(),
2921 Req->MAccessRange, Req->MAccessRange, {0, 0, 0},
2922 Req->MElemSize, std::move(RawEvents),
MEvent->getHandleRef(),
MEvent);
2926 case CG::CGTYPE::CopyPtrToAcc: {
2927 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2929 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2934 AllocaCmd->getSYCLMemObj(), Copy->getSrc(),
2936 Req->MAccessRange, Req->MAccessRange,
2937 {0, 0, 0}, Req->MElemSize, AllocaCmd->getMemAllocation(),
2938 MQueue, Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2939 Req->MElemSize, std::move(RawEvents),
MEvent->getHandleRef(),
MEvent);
2943 case CG::CGTYPE::CopyAccToAcc: {
2944 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2948 AllocaCommandBase *AllocaCmdSrc = getAllocaForReq(ReqSrc);
2949 AllocaCommandBase *AllocaCmdDst = getAllocaForReq(ReqDst);
2952 AllocaCmdSrc->getSYCLMemObj(), AllocaCmdSrc->getMemAllocation(),
MQueue,
2953 ReqSrc->MDims, ReqSrc->MMemoryRange, ReqSrc->MAccessRange,
2954 ReqSrc->MOffset, ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(),
2955 MQueue, ReqDst->MDims, ReqDst->MMemoryRange, ReqDst->MAccessRange,
2956 ReqDst->MOffset, ReqDst->MElemSize, std::move(RawEvents),
2961 case CG::CGTYPE::Fill: {
2962 CGFill *Fill = (CGFill *)MCommandGroup.get();
2964 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2967 AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(),
MQueue,
2968 Fill->MPattern.size(), Fill->MPattern.data(), Req->MDims,
2969 Req->MMemoryRange, Req->MAccessRange, Req->MOffset, Req->MElemSize,
2974 case CG::CGTYPE::Kernel: {
2975 CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
2977 NDRDescT &NDRDesc = ExecKernel->MNDRDesc;
2978 std::vector<ArgDesc> &Args = ExecKernel->MArgs;
2980 if (
MQueue->is_host() || (
MQueue->getDeviceImplPtr()->getBackend() ==
2981 backend::ext_intel_esimd_emulator)) {
2982 for (ArgDesc &Arg : Args)
2985 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2986 Req->MData = AllocaCmd->getMemAllocation();
2988 if (!RawEvents.empty()) {
2990 const PluginPtr &Plugin = EventImpls[0]->getPlugin();
2995 ExecKernel->MHostKernel->call(NDRDesc,
2996 getEvent()->getHostProfilingInfo());
2998 assert(
MQueue->getDeviceImplPtr()->getBackend() ==
2999 backend::ext_intel_esimd_emulator);
3001 MEvent->setHostEnqueueTime();
3004 reinterpret_cast<pi_kernel>(ExecKernel->MHostKernel->getPtr()),
3005 NDRDesc.Dims, &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0],
3006 &NDRDesc.LocalSize[0], 0,
nullptr,
nullptr);
3011 auto getMemAllocationFunc = [
this](
Requirement *Req) {
3012 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
3015 return AllocaCmd ? AllocaCmd->getMemAllocation() :
nullptr;
3018 const std::shared_ptr<detail::kernel_impl> &SyclKernel =
3019 ExecKernel->MSyclKernel;
3020 const std::string &KernelName = ExecKernel->MKernelName;
3024 bool KernelUsesAssert =
3025 !(SyclKernel && SyclKernel->isInterop()) &&
3027 if (KernelUsesAssert) {
3033 MQueue, NDRDesc, Args, ExecKernel->getKernelBundle(), SyclKernel,
3034 KernelName, RawEvents, EventImpl, getMemAllocationFunc,
3035 ExecKernel->MKernelCacheConfig, ExecKernel->MKernelIsCooperative);
3037 case CG::CGTYPE::CopyUSM: {
3038 CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
3040 Copy->getDst(), std::move(RawEvents), Event,
3045 case CG::CGTYPE::FillUSM: {
3046 CGFillUSM *Fill = (CGFillUSM *)MCommandGroup.get();
3048 Fill->getFill(), std::move(RawEvents), Event,
3053 case CG::CGTYPE::PrefetchUSM: {
3054 CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get();
3056 Prefetch->getLength(), std::move(RawEvents),
3061 case CG::CGTYPE::AdviseUSM: {
3062 CGAdviseUSM *Advise = (CGAdviseUSM *)MCommandGroup.get();
3064 Advise->getAdvice(), std::move(RawEvents), Event,
3069 case CG::CGTYPE::Copy2DUSM: {
3070 CGCopy2DUSM *Copy = (CGCopy2DUSM *)MCommandGroup.get();
3072 Copy->getDst(), Copy->getDstPitch(),
3073 Copy->getWidth(), Copy->getHeight(),
3074 std::move(RawEvents), Event,
MEvent);
3077 case CG::CGTYPE::Fill2DUSM: {
3078 CGFill2DUSM *Fill = (CGFill2DUSM *)MCommandGroup.get();
3080 Fill->getWidth(), Fill->getHeight(),
3081 Fill->getPattern(), std::move(RawEvents), Event,
3085 case CG::CGTYPE::Memset2DUSM: {
3086 CGMemset2DUSM *Memset = (CGMemset2DUSM *)MCommandGroup.get();
3088 Memset->getWidth(), Memset->getHeight(),
3089 Memset->getValue(), std::move(RawEvents),
3093 case CG::CGTYPE::CodeplayHostTask: {
3094 CGHostTask *HostTask =
static_cast<CGHostTask *
>(MCommandGroup.get());
3096 for (ArgDesc &Arg : HostTask->MArgs) {
3097 switch (Arg.MType) {
3100 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
3103 Req->MData = AllocaCmd->getMemAllocation();
3108 "Unsupported arg type " +
3113 std::vector<interop_handle::ReqToMem> ReqToMem;
3115 if (HostTask->MHostTask->isInteropTask()) {
3118 const std::vector<Requirement *> &HandlerReq =
3119 HostTask->getRequirements();
3120 auto ReqToMemConv = [&ReqToMem, HostTask](
Requirement *Req) {
3121 const std::vector<AllocaCommandBase *> &AllocaCmds =
3122 Req->MSYCLMemObj->MRecord->MAllocaCommands;
3124 for (AllocaCommandBase *AllocaCmd : AllocaCmds)
3125 if (HostTask->MQueue->getContextImplPtr() ==
3126 AllocaCmd->getQueue()->getContextImplPtr()) {
3128 reinterpret_cast<pi_mem>(AllocaCmd->getMemAllocation());
3129 ReqToMem.emplace_back(std::make_pair(Req, MemArg));
3135 "Can't get memory object due to no allocation available");
3139 "Can't get memory object due to no allocation available " +
3142 std::for_each(std::begin(HandlerReq), std::end(HandlerReq), ReqToMemConv);
3143 std::sort(std::begin(ReqToMem), std::end(ReqToMem));
3157 case CG::CGTYPE::Barrier: {
3158 if (
MQueue->getDeviceImplPtr()->is_host()) {
3164 MEvent->setHostEnqueueTime();
3166 MQueue->getHandleRef(), 0,
nullptr, Event);
3170 case CG::CGTYPE::BarrierWaitlist: {
3171 CGBarrier *Barrier =
static_cast<CGBarrier *
>(MCommandGroup.get());
3172 std::vector<detail::EventImplPtr> Events = Barrier->MEventsWaitWithBarrier;
3173 std::vector<sycl::detail::pi::PiEvent> PiEvents =
3175 if (
MQueue->getDeviceImplPtr()->is_host() || PiEvents.empty()) {
3182 MEvent->setHostEnqueueTime();
3184 MQueue->getHandleRef(), PiEvents.size(), &PiEvents[0], Event);
3188 case CG::CGTYPE::CopyToDeviceGlobal: {
3189 CGCopyToDeviceGlobal *Copy = (CGCopyToDeviceGlobal *)MCommandGroup.get();
3191 Copy->getDeviceGlobalPtr(), Copy->isDeviceImageScoped(),
MQueue,
3192 Copy->getNumBytes(), Copy->getOffset(), Copy->getSrc(),
3193 std::move(RawEvents), Event,
MEvent);
3197 case CG::CGTYPE::CopyFromDeviceGlobal: {
3198 CGCopyFromDeviceGlobal *Copy =
3199 (CGCopyFromDeviceGlobal *)MCommandGroup.get();
3201 Copy->getDeviceGlobalPtr(), Copy->isDeviceImageScoped(),
MQueue,
3202 Copy->getNumBytes(), Copy->getOffset(), Copy->getDest(),
3203 std::move(RawEvents), Event,
MEvent);
3207 case CG::CGTYPE::ReadWriteHostPipe: {
3208 CGReadWriteHostPipe *ExecReadWriteHostPipe =
3209 (CGReadWriteHostPipe *)MCommandGroup.get();
3210 std::string pipeName = ExecReadWriteHostPipe->getPipeName();
3211 void *hostPtr = ExecReadWriteHostPipe->getHostPtr();
3212 size_t typeSize = ExecReadWriteHostPipe->getTypeSize();
3213 bool blocking = ExecReadWriteHostPipe->isBlocking();
3214 bool read = ExecReadWriteHostPipe->isReadHostPipe();
3220 typeSize, RawEvents, EventImpl, read);
3222 case CG::CGTYPE::ExecCommandBuffer: {
3223 CGExecCommandBuffer *CmdBufferCG =
3224 static_cast<CGExecCommandBuffer *
>(MCommandGroup.get());
3226 MEvent->setHostEnqueueTime();
3227 return MQueue->getPlugin()
3229 CmdBufferCG->MCommandBuffer,
MQueue->getHandleRef(),
3230 RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0],
3233 case CG::CGTYPE::CopyImage: {
3234 CGCopyImage *Copy = (CGCopyImage *)MCommandGroup.get();
3239 Copy->getSrc(),
MQueue, Copy->getDst(), Desc, Copy->getFormat(),
3240 Copy->getCopyFlags(), Copy->getSrcOffset(), Copy->getDstOffset(),
3241 Copy->getHostExtent(), Copy->getCopyExtent(), std::move(RawEvents),
3245 case CG::CGTYPE::SemaphoreWait: {
3246 CGSemaphoreWait *SemWait = (CGSemaphoreWait *)MCommandGroup.get();
3247 if (
MQueue->getDeviceImplPtr()->is_host()) {
3254 MQueue->getHandleRef(), SemWait->getInteropSemaphoreHandle(), 0,
3259 case CG::CGTYPE::SemaphoreSignal: {
3260 CGSemaphoreSignal *SemSignal = (CGSemaphoreSignal *)MCommandGroup.get();
3261 if (
MQueue->getDeviceImplPtr()->is_host()) {
3268 MQueue->getHandleRef(), SemSignal->getInteropSemaphoreHandle(), 0,
3273 case CG::CGTYPE::None:
3275 "CG type not implemented. " +
3278 return PI_ERROR_INVALID_OPERATION;
3283 MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask;
3289 (MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask);
3293 if (MCommandGroup->getType() == CG::CGTYPE::CodeplayHostTask)
3305 return MAuxiliaryCommands;
3309 MFusionList.push_back(Kernel);
3318 pi_int32 KernelFusionCommand::enqueueImp() {
3334 "Cannot release the queue attached to the KernelFusionCommand if it "
3341 #ifdef XPTI_ENABLE_INSTRUMENTATION
3342 constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
3343 if (!xptiCheckTraceEnabled(
MStreamID)) {
3351 static unsigned FusionNodeCount = 0;
3352 std::stringstream PayloadStr;
3353 PayloadStr <<
"Fusion command #" << FusionNodeCount++;
3354 xpti::payload_t Payload = xpti::payload_t(PayloadStr.str().c_str());
3356 uint64_t CommandInstanceNo = 0;
3357 xpti_td *CmdTraceEvent =
3358 xptiMakeEvent(
MCommandName.c_str(), &Payload, xpti::trace_graph_event,
3359 xpti_at::active, &CommandInstanceNo);
3362 if (CmdTraceEvent) {
3378 xpti::addMetadata(CmdTraceEvent,
"sycl_device",
3379 deviceToID(
MQueue->get_device()));
3380 xpti::addMetadata(CmdTraceEvent,
"sycl_device_type",
3382 xpti::addMetadata(CmdTraceEvent,
"sycl_device_name",
3388 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
3390 xptiNotifySubscribers(
MStreamID, NotificationTraceType,
3391 detail::GSYCLGraphEvent,
3400 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#AFFF82\", label=\"";
3402 Stream <<
"ID = " <<
this <<
"\\n";
3404 <<
"FUSION LIST: {";
3405 bool Initial =
true;
3406 for (
auto *Cmd : MFusionList) {
3412 if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource()) {
3413 Stream <<
"created from source";
3420 Stream <<
"\"];" << std::endl;
3422 for (
const auto &Dep :
MDeps) {
3423 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
3424 <<
" [ label = \"Access mode: "
3426 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
3434 std::vector<std::shared_ptr<ext::oneapi::experimental::detail::node_impl>>
3439 pi_int32 UpdateCommandBufferCommand::enqueueImp() {
3445 for (
auto &Node : MNodes) {
3446 auto CG =
static_cast<CGExecKernel *
>(Node->MCommandGroup.get());
3447 for (
auto &Arg :
CG->MArgs) {
3452 for (
const DepDesc &Dep :
MDeps) {
3453 Requirement *Req =
static_cast<AccessorImplHost *
>(Arg.MPtr);
3454 if (Dep.MDepRequirement == Req) {
3455 if (Dep.MAllocaCmd) {
3456 Req->
MData = Dep.MAllocaCmd->getMemAllocation();
3459 "No allocation available for accessor when "
3460 "updating command buffer!");
3472 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#8d8f29\", label=\"";
3474 Stream <<
"ID = " <<
this <<
"\\n";
3475 Stream <<
"CommandBuffer Command Update"
3478 Stream <<
"\"];" << std::endl;
3480 for (
const auto &Dep :
MDeps) {
3481 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
3482 <<
" [ label = \"Access mode: "
3484 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
The context class represents a SYCL context on which kernel functions may be executed.
backend get_backend() const noexcept
Returns the backend associated with this context.
detail::SYCLMemObjI * MSYCLMemObj
range< 3 > & MAccessRange
range< 3 > & MMemoryRange
Base class for memory allocation commands.
const Requirement * getRequirement() const final
AllocaCommandBase * MLinkedAllocaCmd
Alloca command linked with current command.
bool readyForCleanup() const final
Returns true iff this command is ready to be submitted for cleanup.
virtual void * getMemAllocation() const =0
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
bool MIsActive
Indicates that current alloca is active one.
bool MIsLeaderAlloca
Indicates that the command owns memory allocation in case of connected alloca command.
AllocaCommandBase(CommandType Type, QueueImplPtr Queue, Requirement Req, AllocaCommandBase *LinkedAllocaCmd, bool IsConst)
bool supportsPostEnqueueCleanup() const final
Returns true iff this command can be freed by post enqueue cleanup.
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
SYCLMemObjI * getSYCLMemObj() const
void printDot(std::ostream &Stream) const final
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
AllocaCommand(QueueImplPtr Queue, Requirement Req, bool InitFromUserData=true, AllocaCommandBase *LinkedAllocaCmd=nullptr, bool IsConst=false)
void * getMemAllocation() const final
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
void printDot(std::ostream &Stream) const final
AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req, AllocaCommandBase *ParentAlloca, std::vector< Command * > &ToEnqueue, std::vector< Command * > &ToCleanUp)
sycl::detail::kernel_param_kind_t MType
"Execute kernel" command group class.
std::shared_ptr< detail::kernel_bundle_impl > MKernelBundle
std::vector< ArgDesc > MArgs
bool MKernelIsCooperative
NDRDescT MNDRDesc
Stores ND-range description.
std::shared_ptr< detail::kernel_impl > MSyclKernel
std::shared_ptr< detail::queue_impl > MQueue
Base class for all types of command groups.
CGTYPE
Type of the command group.
The Command class represents some action that needs to be performed on one or more memory objects.
bool MShouldCompleteEventIfPossible
void copySubmissionCodeLocation()
Command * processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, std::vector< Command * > &ToCleanUp)
Perform glueing of events from different contexts.
void * MTraceEvent
The event for node_create and task_begin.
CommandType MType
The type of the command.
virtual bool producesPiEvent() const
Returns true iff the command produces a PI event on non-host devices.
void emitEnqueuedEventSignal(sycl::detail::pi::PiEvent &PiEventAddr)
Creates a signal event with the enqueued kernel event handle.
int32_t MStreamID
The stream under which the traces are emitted.
void emitInstrumentation(uint16_t Type, const char *Txt=nullptr)
Emits an event of Type.
virtual void emitInstrumentationData()=0
Instrumentation method which emits telemetry data.
void resolveReleaseDependencies(std::set< Command * > &list)
Looks at all the dependencies for the release command and enables instrumentation to report these dep...
std::string MCommandName
Buffer to build the command end-user understandable name.
virtual pi_int32 enqueueImp()=0
Private interface. Derived classes should implement this method.
bool MMarkedForCleanup
Indicates that the node will be freed by graph cleanup.
void emitEdgeEventForEventDependence(Command *Cmd, sycl::detail::pi::PiEvent &EventAddr)
Creates an edge event when the dependency is an event.
unsigned MLeafCounter
Counts the number of memory objects this command is a leaf for.
virtual bool enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, std::vector< Command * > &ToCleanUp)
Checks if the command is enqueued, and calls enqueueImp.
std::vector< sycl::detail::pi::PiEvent > getPiEvents(const std::vector< EventImplPtr > &EventImpls) const
Collect PI events from EventImpls and filter out some of them in case of in order queue.
void waitForPreparedHostEvents() const
std::string MSubmissionFileName
Introduces string to handle memory management since code_location struct works with raw char arrays.
std::mutex MEnqueueMtx
Mutex used to protect enqueueing from race conditions.
virtual const ContextImplPtr & getWorkerContext() const
Get the context of the queue this command will be submitted to.
void emitInstrumentationDataProxy()
Proxy method which calls emitInstrumentationData.
code_location MSubmissionCodeLocation
Represents code location of command submission to SYCL API, assigned with the valid value only if com...
void makeTraceEventEpilog()
If prolog has been run, run epilog; this must be guarded by a check for xptiTraceEnabled().
std::vector< sycl::detail::pi::PiEvent > getPiEventsBlocking(const std::vector< EventImplPtr > &EventImpls) const
Collect PI events from EventImpls and filter out some of them in case of in order queue.
std::vector< EventImplPtr > & MPreparedHostDepsEvents
std::vector< EventImplPtr > & MPreparedDepsEvents
Dependency events prepared for waiting by backend.
Command(CommandType Type, QueueImplPtr Queue, sycl::detail::pi::PiExtCommandBuffer CommandBuffer=nullptr, const std::vector< sycl::detail::pi::PiExtSyncPoint > &SyncPoints={})
It is safe to bind MPreparedDepsEvents and MPreparedHostDepsEvents references to event_impl class mem...
std::string MSubmissionFunctionName
uint64_t MInstanceID
Instance ID tracked for the command.
const QueueImplPtr & getWorkerQueue() const
Get the queue this command will be submitted to.
std::vector< DepDesc > MDeps
Contains list of dependencies(edges)
const char * getBlockReason() const
std::vector< sycl::detail::pi::PiExtSyncPoint > MSyncPointDeps
List of sync points for submissions to a command buffer.
virtual bool readyForCleanup() const
Returns true iff this command is ready to be submitted for cleanup.
void addUser(Command *NewUser)
std::atomic< EnqueueResultT::ResultT > MEnqueueStatus
Describes the status of the command.
const EventImplPtr & getEvent() const
uint64_t makeTraceEventProlog(void *MAddress)
Create a trace event of node_create type; this must be guarded by a check for xptiTraceEnabled().
friend class DispatchHostTask
CommandType getType() const
void * MAddress
Reserved for storing the object address such as SPIR-V or memory object address.
std::string MAddressString
Buffer to build the address string.
void waitForEvents(QueueImplPtr Queue, std::vector< EventImplPtr > &RawEvents, sycl::detail::pi::PiEvent &Event)
QueueImplPtr MWorkerQueue
bool MIsBlockable
Indicates whether the command can be blocked from enqueueing.
virtual bool supportsPostEnqueueCleanup() const
Returns true iff this command can be freed by post enqueue cleanup.
bool MTraceEventPrologComplete
Flag to indicate if makeTraceEventProlog() has been run.
sycl::detail::pi::PiExtCommandBuffer MCommandBuffer
CommandBuffer which will be used to submit to instead of the queue, if set.
std::string MCommandNodeType
Buffer to build the command node type.
Command * addDep(DepDesc NewDep, std::vector< Command * > &ToCleanUp)
void emitEdgeEventForCommandDependence(Command *Cmd, void *ObjAddr, bool IsCommand, std::optional< access::mode > AccMode=std::nullopt)
Creates an edge event when the dependency is a command.
const QueueImplPtr & getQueue() const
bool MFirstInstance
Flag to indicate if this is the first time we are seeing this payload.
DispatchHostTask(ExecCGCommand *ThisCmd, std::vector< interop_handle::ReqToMem > ReqToMem)
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
void printDot(std::ostream &Stream) const final
void addRequirement(Command *DepCmd, AllocaCommandBase *AllocaCmd, const Requirement *Req)
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
EmptyCommand(QueueImplPtr Queue)
The exec CG command enqueues execution of kernel or explicit memory operation.
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
bool supportsPostEnqueueCleanup() const final
Returns true iff this command can be freed by post enqueue cleanup.
bool readyForCleanup() const final
Returns true iff this command is ready to be submitted for cleanup.
std::string_view getTypeString() const
detail::CG & getCG() const
void printDot(std::ostream &Stream) const final
std::vector< std::shared_ptr< const void > > getAuxiliaryResources() const
void clearAuxiliaryResources()
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
ExecCGCommand(std::unique_ptr< detail::CG > CommandGroup, QueueImplPtr Queue, sycl::detail::pi::PiExtCommandBuffer CommandBuffer=nullptr, const std::vector< sycl::detail::pi::PiExtSyncPoint > &Dependencies={})
void TraceEventXPTI(const char *Message)
static GlobalHandler & instance()
void addToFusionList(ExecCGCommand *Kernel)
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
void setFusionStatus(FusionStatus Status)
Set the status of this fusion command to Status.
std::vector< ExecCGCommand * > & getFusionList()
void printDot(std::ostream &Stream) const final
KernelFusionCommand(QueueImplPtr Queue)
void resetQueue()
Reset the queue.
std::vector< Command * > & auxiliaryCommands()
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req, void **DstPtr, QueueImplPtr Queue, access::mode MapMode)
void printDot(std::ostream &Stream) const final
const ContextImplPtr & getWorkerContext() const final
Get the context of the queue this command will be submitted to.
MemCpyCommandHost(Requirement SrcReq, AllocaCommandBase *SrcAllocaCmd, Requirement DstReq, void **DstPtr, QueueImplPtr SrcQueue, QueueImplPtr DstQueue)
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
void printDot(std::ostream &Stream) const final
MemCpyCommand(Requirement SrcReq, AllocaCommandBase *SrcAllocaCmd, Requirement DstReq, AllocaCommandBase *DstAllocaCmd, QueueImplPtr SrcQueue, QueueImplPtr DstQueue)
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
void printDot(std::ostream &Stream) const final
const ContextImplPtr & getWorkerContext() const final
Get the context of the queue this command will be submitted to.
static void * allocateMemSubBuffer(ContextImplPtr TargetContext, void *ParentMemObj, size_t ElemSize, size_t Offset, range< 3 > Range, std::vector< EventImplPtr > DepEvents, sycl::detail::pi::PiEvent &OutEvent)
static void ext_oneapi_copyH2D_cmd_buffer(sycl::detail::ContextImplPtr Context, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, SYCLMemObjI *SYCLMemObj, char *SrcMem, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, void *DstMem, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::range< 3 > DstAccessRange, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< sycl::detail::pi::PiExtSyncPoint > Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint)
static void ext_oneapi_copyD2H_cmd_buffer(sycl::detail::ContextImplPtr Context, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, SYCLMemObjI *SYCLMemObj, void *SrcMem, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::range< 3 > SrcAccessRange, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, char *DstMem, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< sycl::detail::pi::PiExtSyncPoint > Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint)
static void prefetch_usm(void *Ptr, QueueImplPtr Queue, size_t Len, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void advise_usm(const void *Ptr, QueueImplPtr Queue, size_t Len, pi_mem_advice Advice, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void copy_2d_usm(const void *SrcMem, size_t SrcPitch, QueueImplPtr Queue, void *DstMem, size_t DstPitch, size_t Width, size_t Height, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void ext_oneapi_prefetch_usm_cmd_buffer(sycl::detail::ContextImplPtr Context, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, void *Mem, size_t Length, std::vector< sycl::detail::pi::PiExtSyncPoint > Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint)
static void fill(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue, size_t PatternSize, const char *Pattern, unsigned int Dim, sycl::range< 3 > Size, sycl::range< 3 > AccessRange, sycl::id< 3 > AccessOffset, unsigned int ElementSize, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent &OutEvent, const detail::EventImplPtr &OutEventImpl)
static void ext_oneapi_copyD2D_cmd_buffer(sycl::detail::ContextImplPtr Context, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, SYCLMemObjI *SYCLMemObj, void *SrcMem, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::range< 3 > SrcAccessRange, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, void *DstMem, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::range< 3 > DstAccessRange, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< sycl::detail::pi::PiExtSyncPoint > Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint)
static void * allocate(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, bool InitFromUserData, void *HostPtr, std::vector< EventImplPtr > DepEvents, sycl::detail::pi::PiEvent &OutEvent)
static void copy_image_bindless(void *Src, QueueImplPtr Queue, void *Dst, const sycl::detail::pi::PiMemImageDesc &Desc, const sycl::detail::pi::PiMemImageFormat &Format, const sycl::detail::pi::PiImageCopyFlags Flags, sycl::detail::pi::PiImageOffset SrcOffset, sycl::detail::pi::PiImageOffset DstOffset, sycl::detail::pi::PiImageRegion CopyExtent, sycl::detail::pi::PiImageRegion HostExtent, const std::vector< sycl::detail::pi::PiEvent > &DepEvents, sycl::detail::pi::PiEvent *OutEvent)
static void copy_usm(const void *SrcMem, QueueImplPtr Queue, size_t Len, void *DstMem, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void copy_to_device_global(const void *DeviceGlobalPtr, bool IsDeviceImageScoped, QueueImplPtr Queue, size_t NumBytes, size_t Offset, const void *SrcMem, const std::vector< sycl::detail::pi::PiEvent > &DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void release(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, void *MemAllocation, std::vector< EventImplPtr > DepEvents, sycl::detail::pi::PiEvent &OutEvent)
static void ext_oneapi_fill_usm_cmd_buffer(sycl::detail::ContextImplPtr Context, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, void *DstMem, size_t Len, int Pattern, std::vector< sycl::detail::pi::PiExtSyncPoint > Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint)
static void copy_from_device_global(const void *DeviceGlobalPtr, bool IsDeviceImageScoped, QueueImplPtr Queue, size_t NumBytes, size_t Offset, void *DstMem, const std::vector< sycl::detail::pi::PiEvent > &DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void ext_oneapi_copy_usm_cmd_buffer(ContextImplPtr Context, const void *SrcMem, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, size_t Len, void *DstMem, std::vector< sycl::detail::pi::PiExtSyncPoint > Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint)
static void memset_2d_usm(void *DstMem, QueueImplPtr Queue, size_t Pitch, size_t Width, size_t Height, char Value, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void fill_usm(void *DstMem, QueueImplPtr Queue, size_t Len, int Pattern, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void copy(SYCLMemObjI *SYCLMemObj, void *SrcMem, QueueImplPtr SrcQueue, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::range< 3 > SrcAccessRange, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, void *DstMem, QueueImplPtr TgtQueue, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::range< 3 > DstAccessRange, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent &OutEvent, const detail::EventImplPtr &OutEventImpl)
static void ext_oneapi_fill_cmd_buffer(sycl::detail::ContextImplPtr Context, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, SYCLMemObjI *SYCLMemObj, void *Mem, size_t PatternSize, const char *Pattern, unsigned int Dim, sycl::range< 3 > Size, sycl::range< 3 > AccessRange, sycl::id< 3 > AccessOffset, unsigned int ElementSize, std::vector< sycl::detail::pi::PiExtSyncPoint > Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint)
static void * map(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue, access::mode AccessMode, unsigned int Dim, sycl::range< 3 > Size, sycl::range< 3 > AccessRange, sycl::id< 3 > AccessOffset, unsigned int ElementSize, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent &OutEvent)
static void fill_2d_usm(void *DstMem, QueueImplPtr Queue, size_t Pitch, size_t Width, size_t Height, const std::vector< char > &Pattern, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void unmap(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue, void *MappedPtr, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent &OutEvent)
static void ext_oneapi_advise_usm_cmd_buffer(sycl::detail::ContextImplPtr Context, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, const void *Mem, size_t Length, pi_mem_advice Advice, std::vector< sycl::detail::pi::PiExtSyncPoint > Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint)
sycl::range< 3 > GlobalSize
sycl::range< 3 > NumWorkGroups
Number of workgroups, used to record the number of workgroups from the simplest form of parallel_for_...
sycl::id< 3 > GlobalOffset
sycl::range< 3 > LocalSize
void set(sycl::range< Dims_ > NumWorkItems)
static ProgramManager & getInstance()
kernel_id getSYCLKernelID(const std::string &KernelName)
device_image_plain getDeviceImageFromBinaryImage(RTDeviceBinaryImage *BinImage, const context &Ctx, const device &Dev)
bool kernelUsesAssert(const std::string &KernelName) const
HostPipeMapEntry * getHostPipeEntry(const std::string &UniqueId)
device_image_plain build(const device_image_plain &DeviceImage, const std::vector< device > &Devs, const property_list &PropList)
std::tuple< sycl::detail::pi::PiKernel, std::mutex *, const KernelArgMask *, sycl::detail::pi::PiProgram > getOrCreateKernel(const ContextImplPtr &ContextImpl, const DeviceImplPtr &DeviceImpl, const std::string &KernelName, const NDRDescT &NDRDesc={})
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
bool supportsPostEnqueueCleanup() const final
Returns true iff this command can be freed by post enqueue cleanup.
ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd)
void printDot(std::ostream &Stream) const final
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
bool readyForCleanup() const final
Returns true iff this command is ready to be submitted for cleanup.
virtual MemObjType getType() const =0
Command * connectDepEvent(Command *const Cmd, const EventImplPtr &DepEvent, const DepDesc &Dep, std::vector< Command * > &ToCleanUp)
Perform connection of events in multiple contexts.
QueueImplPtr getDefaultHostQueue()
void enqueueCommandForCG(EventImplPtr NewEvent, std::vector< Command * > &AuxilaryCmds, BlockingT Blocking=NON_BLOCKING)
GraphBuilder MGraphBuilder
static Scheduler & getInstance()
void NotifyHostTaskCompletion(Command *Cmd)
void printDot(std::ostream &Stream) const final
UnMapMemObject(AllocaCommandBase *DstAllocaCmd, Requirement Req, void **SrcPtr, QueueImplPtr Queue)
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
UpdateCommandBufferCommand(QueueImplPtr Queue, ext::oneapi::experimental::detail::exec_graph_impl *Graph, std::vector< std::shared_ptr< ext::oneapi::experimental::detail::node_impl >> Nodes)
void printDot(std::ostream &Stream) const final
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
UpdateHostRequirementCommand(QueueImplPtr Queue, Requirement Req, AllocaCommandBase *SrcAllocaCmd, void **DstPtr)
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
void printDot(std::ostream &Stream) const final
static void bufferAssociateNotification(const void *, const void *)
const PluginPtr & getPlugin() const
sycl::detail::pi::PiDevice & getHandleRef()
Get reference to PI device.
Data type that manages the code_location information in TLS.
const detail::code_location & query()
Query the information in the TLS slot.
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
bool is_accelerator() const
Check if device is an accelerator device.
bool is_gpu() const
Check if device is a GPU device.
bool is_cpu() const
Check if device is a CPU device.
cl_int get_cl_code() const
Class representing the implementation of command_graph<executable>.
void updateImpl(std::shared_ptr< node_impl > NodeImpl)
Objects of the class identify kernel is some kernel_bundle related APIs.
Provides an abstraction of a SYCL kernel.
Defines the iteration domain of both the work-groups and the overall dispatch.
void handleErrorOrWarning(pi_result Error, const device_impl &DeviceImpl, pi_kernel Kernel, const NDRDescT &NDRDesc)
Analyzes error code and arguments of piEnqueueKernelLaunch to emit user-friendly exception describing...
::pi_kernel_cache_config PiKernelCacheConfig
::pi_ext_sync_point PiExtSyncPoint
::pi_ext_command_buffer_command PiExtCommandBufferCommand
void free(void *Ptr, const context &Ctxt, const code_location &CL)
@ kind_specialization_constants_buffer
static void flushCrossQueueDeps(const std::vector< EventImplPtr > &EventImpls, const QueueImplPtr &Queue)
static std::string_view cgTypeToString(detail::CG::CGTYPE Type)
static std::string demangleKernelName(std::string Name)
std::vector< bool > KernelArgMask
constexpr const char * SYCL_STREAM_NAME
std::enable_if< !std::is_same< typename Param::return_type, sycl::range< 3 > >::value, typename Param::return_type >::type get_kernel_device_specific_info(sycl::detail::pi::PiKernel Kernel, sycl::detail::pi::PiDevice Device, const PluginPtr &Plugin)
std::string codeToString(pi_int32 code)
void ReverseRangeDimensionsForKernel(NDRDescT &NDR)
std::shared_ptr< sycl::detail::context_impl > ContextImplPtr
decltype(Obj::impl) getSyclObjImpl(const Obj &SyclObject)
pi_int32 enqueueReadWriteHostPipe(const QueueImplPtr &Queue, const std::string &PipeName, bool blocking, void *ptr, size_t size, std::vector< sycl::detail::pi::PiEvent > &RawEvents, const detail::EventImplPtr &OutEventImpl, bool read)
std::shared_ptr< event_impl > EventImplPtr
static void adjustNDRangePerKernel(NDRDescT &NDR, sycl::detail::pi::PiKernel Kernel, const device_impl &DeviceImpl)
pi_mem_obj_access AccessModeToPi(access::mode AccessorMode)
static pi_result SetKernelParamsAndLaunch(const QueueImplPtr &Queue, std::vector< ArgDesc > &Args, const std::shared_ptr< device_image_impl > &DeviceImageImpl, sycl::detail::pi::PiKernel Kernel, NDRDescT &NDRDesc, std::vector< sycl::detail::pi::PiEvent > &RawEvents, const detail::EventImplPtr &OutEventImpl, const KernelArgMask *EliminatedArgMask, const std::function< void *(Requirement *Req)> &getMemAllocationFunc, bool IsCooperative)
std::shared_ptr< plugin > PluginPtr
AccessorImplHost Requirement
std::shared_ptr< device_impl > DeviceImplPtr
static std::string deviceToString(device Device)
pi_int32 enqueueImpKernel(const QueueImplPtr &Queue, NDRDescT &NDRDesc, std::vector< ArgDesc > &Args, const std::shared_ptr< detail::kernel_bundle_impl > &KernelBundleImplPtr, const std::shared_ptr< detail::kernel_impl > &MSyclKernel, const std::string &KernelName, std::vector< sycl::detail::pi::PiEvent > &RawEvents, const detail::EventImplPtr &OutEventImpl, const std::function< void *(Requirement *Req)> &getMemAllocationFunc, sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig, const bool KernelIsCooperative)
std::shared_ptr< detail::kernel_bundle_impl > KernelBundleImplPtr
void applyFuncOnFilteredArgs(const KernelArgMask *EliminatedArgMask, std::vector< ArgDesc > &Args, std::function< void(detail::ArgDesc &Arg, int NextTrueIndex)> Func)
std::shared_ptr< sycl::detail::queue_impl > QueueImplPtr
void SetArgBasedOnType(const PluginPtr &Plugin, sycl::detail::pi::PiKernel Kernel, const std::shared_ptr< device_image_impl > &DeviceImageImpl, const std::function< void *(Requirement *Req)> &getMemAllocationFunc, const sycl::context &Context, bool IsHost, detail::ArgDesc &Arg, size_t NextTrueIndex)
void DispatchNativeKernel(void *Blob)
pi_int32 enqueueImpCommandBufferKernel(context Ctx, DeviceImplPtr DeviceImpl, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, const CGExecKernel &CommandGroup, std::vector< sycl::detail::pi::PiExtSyncPoint > &SyncPoints, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint, sycl::detail::pi::PiExtCommandBufferCommand *OutCommand, const std::function< void *(Requirement *Req)> &getMemAllocationFunc)
static std::string accessModeToString(access::mode Mode)
Function for_each(Group g, Ptr first, Ptr last, Function f)
__DPCPP_SYCL_EXTERNAL _SYCL_EXT_CPLX_INLINE_VISIBILITY std::enable_if_t< is_genfloat< _Tp >::value, _Tp > arg(const complex< _Tp > &__c)
PropertyListT int access::address_space multi_ptr & operator=(multi_ptr &&)=default
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using e and sycl_category()
@ PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA
@ PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM
pi_result piextEnqueueReadHostPipe(pi_queue queue, pi_program program, const char *pipe_symbol, pi_bool blocking, void *ptr, size_t size, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event)
Plugin.
pi_result piKernelRelease(pi_kernel kernel)
pi_result piextEnqueueWriteHostPipe(pi_queue queue, pi_program program, const char *pipe_symbol, pi_bool blocking, void *ptr, size_t size, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event)
Write to pipe of a given name.
pi_result piKernelSetArg(pi_kernel kernel, pi_uint32 arg_index, size_t arg_size, const void *arg_value)
pi_result piEnqueueEventsWaitWithBarrier(pi_queue command_queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
pi_result piEnqueueKernelLaunch(pi_queue queue, pi_kernel kernel, pi_uint32 work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
@ PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE
pi_result piextWaitExternalSemaphore(pi_queue command_queue, pi_interop_semaphore_handle sem_handle, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
API to instruct the queue with a non-blocking wait on an external semaphore.
@ PI_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG
provides the preferred cache configuration (large slm or large data)
pi_result piProgramRelease(pi_program program)
pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer, pi_queue queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
API to submit the command-buffer to queue for execution, returns an error if the command-buffer is no...
pi_result piextKernelSetArgSampler(pi_kernel kernel, pi_uint32 arg_index, const pi_sampler *arg_value)
pi_result piEnqueueEventsWait(pi_queue command_queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
pi_result piextCommandBufferNDRangeKernel(pi_ext_command_buffer command_buffer, pi_kernel kernel, pi_uint32 work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, pi_uint32 num_sync_points_in_wait_list, const pi_ext_sync_point *sync_point_wait_list, pi_ext_sync_point *sync_point, pi_ext_command_buffer_command *command)
API to append a kernel execution command to the command-buffer.
pi_result piKernelSetExecInfo(pi_kernel kernel, pi_kernel_exec_info value_name, size_t param_value_size, const void *param_value)
API to set attributes controlling kernel execution.
pi_result piEventsWait(pi_uint32 num_events, const pi_event *event_list)
pi_result piextEnqueueCooperativeKernelLaunch(pi_queue queue, pi_kernel kernel, pi_uint32 work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
pi_result piextSignalExternalSemaphore(pi_queue command_queue, pi_interop_semaphore_handle sem_handle, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
API to instruct the queue to signal the external semaphore handle once all previous commands have com...
pi_result piextKernelSetArgPointer(pi_kernel kernel, pi_uint32 arg_index, size_t arg_size, const void *arg_value)
Sets up pointer arguments for CL kernels.
pi_result piKernelGetGroupInfo(pi_kernel kernel, pi_device device, pi_kernel_group_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
@ PI_KERNEL_ARG_MEM_OBJ_ACCESS
pi_result piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, const pi_mem_obj_property *arg_properties, const pi_mem *arg_value)
pi_mem_access_flag mem_access
Dependency between two commands.
const Requirement * MDepRequirement
Requirement for the dependency.
Command * MDepCommand
The actual dependency command.
Result of command enqueueing.
RTDeviceBinaryImage * getDevBinImage()