38 #if __has_include(<cxxabi.h>)
39 #define __SYCL_ENABLE_GNU_DEMANGLING
46 #ifdef XPTI_ENABLE_INSTRUMENTATION
47 #include "xpti/xpti_trace_framework.hpp"
52 inline namespace _V1 {
55 #ifdef XPTI_ENABLE_INSTRUMENTATION
57 extern xpti::trace_event_data_t *GSYCLGraphEvent;
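// Summary of the helper below: it checks whether the thread-local
// tls_code_loc_t currently holds a non-empty file or function name, i.e.
// whether a user code location has been captured for instrumentation.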
59 bool CurrentCodeLocationValid() {
60 detail::tls_code_loc_t Tls;
61 auto CodeLoc = Tls.query();
62 auto FileName = CodeLoc.fileName();
63 auto FunctionName = CodeLoc.functionName();
64 return (FileName && FileName[0] != '\0') ||
65 (FunctionName && FunctionName[0] != '\0');
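// GNU demangling support (when <cxxabi.h> is available): DemangleHandle is a
// small RAII wrapper over the buffer returned by abi::__cxa_demangle; the
// demangled name is used only when the status is 0, otherwise the original
// (mangled) name is returned unchanged.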
69 #ifdef __SYCL_ENABLE_GNU_DEMANGLING
70 struct DemangleHandle {
72 DemangleHandle(char *ptr) : p(ptr) {}
74 DemangleHandle &operator=(const DemangleHandle &) = delete;
80 DemangleHandle result(abi::__cxa_demangle(Name.c_str(), NULL, NULL, &Status));
81 return (Status == 0) ? result.p : Name;
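// Argument filtering: when an eliminated-argument mask is present, arguments
// are sorted by MIndex and the callback runs only for arguments the mask
// keeps, with NextTrueIndex supplying the compacted index that is actually
// passed to the backend.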
101 const KernelArgMask *EliminatedArgMask, std::vector<ArgDesc> &Args,
103 if (!EliminatedArgMask) {
111 std::sort(Args.begin(), Args.end(), [](const ArgDesc &A, const ArgDesc &B) {
112 return A.MIndex < B.MIndex;
115 size_t NextTrueIndex = 0;
120 for (int Idx = LastIndex + 1; Idx < Arg.MIndex; ++Idx)
121 if (!(*EliminatedArgMask)[Idx])
125 if ((*EliminatedArgMask)[Arg.MIndex])
128 Func(Arg, NextTrueIndex);
134 #ifdef XPTI_ENABLE_INSTRUMENTATION
135 static size_t deviceToID(const device &Device) {
139 return reinterpret_cast<size_t>(getSyclObjImpl(Device)->getHandleRef());
152 return "discard_write";
154 return "discard_read_write";
160 #ifdef XPTI_ENABLE_INSTRUMENTATION
165 case Command::CommandType::RUN_CG:
166 return "command_group_node";
167 case Command::CommandType::COPY_MEMORY:
168 return "memory_transfer_node";
169 case Command::CommandType::ALLOCA:
170 return "memory_allocation_node";
171 case Command::CommandType::ALLOCA_SUB_BUF:
172 return "sub_buffer_creation_node";
173 case Command::CommandType::RELEASE:
174 return "memory_deallocation_node";
175 case Command::CommandType::MAP_MEM_OBJ:
176 return "memory_transfer_node";
177 case Command::CommandType::UNMAP_MEM_OBJ:
178 return "memory_transfer_node";
179 case Command::CommandType::UPDATE_REQUIREMENT:
180 return "host_acc_create_buffer_lock_node";
181 case Command::CommandType::EMPTY_TASK:
182 return "host_acc_destroy_buffer_release_node";
183 case Command::CommandType::FUSION:
184 return "kernel_fusion_placeholder_node";
186 return "unknown_node";
195 case Command::CommandType::RUN_CG:
196 return "Command Group Action";
197 case Command::CommandType::COPY_MEMORY:
198 return "Memory Transfer (Copy)";
199 case Command::CommandType::ALLOCA:
200 return "Memory Allocation";
201 case Command::CommandType::ALLOCA_SUB_BUF:
202 return "Sub Buffer Creation";
203 case Command::CommandType::RELEASE:
204 return "Memory Deallocation";
205 case Command::CommandType::MAP_MEM_OBJ:
206 return "Memory Transfer (Map)";
207 case Command::CommandType::UNMAP_MEM_OBJ:
208 return "Memory Transfer (Unmap)";
209 case Command::CommandType::UPDATE_REQUIREMENT:
210 return "Host Accessor Creation/Buffer Lock";
211 case Command::CommandType::EMPTY_TASK:
212 return "Host Accessor Destruction/Buffer Lock Release";
213 case Command::CommandType::FUSION:
214 return "Kernel Fusion Placeholder";
216 return "Unknown Action";
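// Event collection helpers: the functions below gather native PI event
// handles from event_impl objects, skipping events without a backend handle
// and, judging by the WorkerQueue comparison, events the backend already
// orders implicitly for the same worker queue.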
221 std::vector<sycl::detail::pi::PiEvent>
223 std::vector<sycl::detail::pi::PiEvent> RetPiEvents;
224 for (auto &EventImpl : EventImpls) {
225 if (EventImpl->getHandleRef() == nullptr)
235 if (EventImpl->getWorkerQueue() == WorkerQueue &&
239 RetPiEvents.push_back(EventImpl->getHandleRef());
250 const std::vector<EventImplPtr> &EventImpls) const {
251 std::vector<sycl::detail::pi::PiEvent> RetPiEvents;
252 for (auto &EventImpl : EventImpls) {
257 if (!EventImpl->isContextInitialized() || EventImpl->is_host())
261 if (EventImpl->getHandleRef() == nullptr) {
262 if (!EventImpl->getCommand() ||
265 std::vector<Command *> AuxCmds;
276 if (EventImpl->getWorkerQueue() == WorkerQueue &&
280 RetPiEvents.push_back(EventImpl->getHandleRef());
287 return (MType == CommandType::RUN_CG) &&
288 ((static_cast<const ExecCGCommand *>(this))->getCG().getType() ==
289 CG::CGTYPE::CodeplayHostTask);
294 for (auto &EventImpl : EventImpls) {
295 EventImpl->flushIfNeeded(Queue);
301 std::vector<interop_handle::ReqToMem> MReqToMem;
304 std::map<const PluginPtr, std::vector<EventImplPtr>>
305 RequiredEventsPerPlugin;
308 const PluginPtr &Plugin = Event->getPlugin();
309 RequiredEventsPerPlugin[Plugin].push_back(Event);
317 for (auto &PluginWithEvents : RequiredEventsPerPlugin) {
318 std::vector<sycl::detail::pi::PiEvent> RawEvents =
325 HostTask.MQueue->reportAsyncException(std::current_exception());
329 HostTask.MQueue->reportAsyncException(std::current_exception());
330 return PI_ERROR_UNKNOWN;
337 Event->waitInternal();
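// DispatchHostTask: dependency events are grouped per plugin and waited on;
// failures are reported as asynchronous exceptions on the host task's queue;
// the task body then runs (with an interop_handle when it is an interop
// task), and host dependencies are waited on via waitInternal().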
345 std::vector<interop_handle::ReqToMem> ReqToMem)
346 : MThisCmd{ThisCmd}, MReqToMem(std::move(ReqToMem)) {}
349 assert(MThisCmd->getCG().getType() == CG::CGTYPE::CodeplayHostTask);
353 #ifdef XPTI_ENABLE_INSTRUMENTATION
358 std::unique_ptr<detail::tls_code_loc_t> AsyncCodeLocationPtr;
359 if (xptiTraceEnabled() && !CurrentCodeLocationValid()) {
360 AsyncCodeLocationPtr.reset(
366 if (WaitResult != PI_SUCCESS) {
367 std::exception_ptr EPtr = std::make_exception_ptr(sycl::runtime_error(
368 std::string("Couldn't wait for host-task's dependencies"),
370 HostTask.MQueue->reportAsyncException(EPtr);
379 if (HostTask.MHostTask->isInteropTask()) {
381 HostTask.MQueue->getDeviceImplPtr(),
382 HostTask.MQueue->getContextImplPtr()};
388 auto CurrentException = std::current_exception();
389 #ifdef XPTI_ENABLE_INSTRUMENTATION
393 if (xptiTraceEnabled()) {
395 rethrow_exception(CurrentException);
398 } catch (const std::exception &StdException) {
402 "Host task lambda thrown non standard exception");
406 HostTask.MQueue->reportAsyncException(CurrentException);
411 #ifdef XPTI_ENABLE_INSTRUMENTATION
414 AsyncCodeLocationPtr.reset();
422 auto CurrentException = std::current_exception();
423 HostTask.MQueue->reportAsyncException(CurrentException);
430 HostEvent->waitInternal();
434 std::vector<EventImplPtr> &EventImpls,
436 if (!EventImpls.empty()) {
437 if (Queue->is_host()) {
451 std::map<context_impl *, std::vector<EventImplPtr>>
452 RequiredEventsPerContext;
456 assert(Context.get() &&
457 "Only non-host events are expected to be waited for here");
458 RequiredEventsPerContext[Context.get()].push_back(Event);
461 for (auto &CtxWithEvents : RequiredEventsPerContext) {
462 std::vector<sycl::detail::pi::PiEvent> RawEvents =
465 RawEvents.size(), RawEvents.data());
470 assert(Event->getContextImpl().get() &&
471 "Only non-host events are expected to be waited for here");
474 std::vector<sycl::detail::pi::PiEvent> RawEvents =
477 const PluginPtr &Plugin = Queue->getPlugin();
480 MEvent->setHostEnqueueTime();
482 Queue->getHandleRef(), RawEvents.size(), &RawEvents[0], &Event);
493 const std::vector<sycl::detail::pi::PiExtSyncPoint> &SyncPoints)
494 : MQueue(std::move(Queue)),
496 MPreparedDepsEvents(MEvent->getPreparedDepsEvents()),
497 MPreparedHostDepsEvents(MEvent->getPreparedHostDepsEvents()), MType(Type),
498 MCommandBuffer(CommandBuffer), MSyncPointDeps(SyncPoints) {
504 MEvent->setStateIncomplete();
507 #ifdef XPTI_ENABLE_INSTRUMENTATION
508 if (!xptiTraceEnabled())
516 #ifdef XPTI_ENABLE_INSTRUMENTATION
532 Command *Cmd, void *ObjAddr, bool IsCommand,
533 std::optional<access::mode> AccMode) {
534 #ifdef XPTI_ENABLE_INSTRUMENTATION
537 constexpr uint16_t NotificationTraceType = xpti::trace_edge_create;
538 if (!(xptiCheckTraceEnabled(MStreamID, NotificationTraceType) &&
544 xpti::utils::StringHelper SH;
545 std::string AddressStr = SH.addressAsString<void *>(ObjAddr);
547 std::string TypeString = SH.nameWithAddressString(Prefix, AddressStr);
550 xpti::payload_t Payload(TypeString.c_str(), MAddress);
551 uint64_t EdgeInstanceNo;
553 xptiMakeEvent(TypeString.c_str(), &Payload, xpti::trace_graph_event,
554 xpti_at::active, &EdgeInstanceNo);
556 xpti_td *SrcEvent = static_cast<xpti_td *>(Cmd->MTraceEvent);
557 xpti_td *TgtEvent = static_cast<xpti_td *>(MTraceEvent);
558 EdgeEvent->source_id = SrcEvent->unique_id;
559 EdgeEvent->target_id = TgtEvent->unique_id;
561 xpti::addMetadata(EdgeEvent, "access_mode",
562 static_cast<int>(AccMode.value()));
563 xpti::addMetadata(EdgeEvent, "memory_object",
564 reinterpret_cast<size_t>(ObjAddr));
566 xpti::addMetadata(EdgeEvent, "event", reinterpret_cast<size_t>(ObjAddr));
568 xptiNotifySubscribers(MStreamID, NotificationTraceType,
569 detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
584 #ifdef XPTI_ENABLE_INSTRUMENTATION
597 xpti::utils::StringHelper SH;
598 std::string AddressStr =
604 std::string NodeName = SH.nameWithAddressString("virtual_node", AddressStr);
606 xpti::payload_t VNPayload(NodeName.c_str(), MAddress);
607 uint64_t VNodeInstanceNo;
609 xptiMakeEvent(NodeName.c_str(), &VNPayload, xpti::trace_graph_event,
610 xpti_at::active, &VNodeInstanceNo);
612 xpti::addMetadata(NodeEvent, "kernel_name", NodeName);
613 xptiNotifySubscribers(MStreamID, xpti::trace_node_create,
614 detail::GSYCLGraphEvent, NodeEvent, VNodeInstanceNo,
617 std::string EdgeName = SH.nameWithAddressString("Event", AddressStr);
618 xpti::payload_t EdgePayload(EdgeName.c_str(), MAddress);
619 uint64_t EdgeInstanceNo;
621 xptiMakeEvent(EdgeName.c_str(), &EdgePayload, xpti::trace_graph_event,
622 xpti_at::active, &EdgeInstanceNo);
623 if (EdgeEvent && NodeEvent) {
626 xpti_td *TgtEvent = static_cast<xpti_td *>(MTraceEvent);
627 EdgeEvent->source_id = NodeEvent->unique_id;
628 EdgeEvent->target_id = TgtEvent->unique_id;
629 xpti::addMetadata(EdgeEvent, "event",
630 reinterpret_cast<size_t>(PiEventAddr));
631 xptiNotifySubscribers(MStreamID, xpti::trace_edge_create,
632 detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
641 uint64_t CommandInstanceNo = 0;
642 #ifdef XPTI_ENABLE_INSTRUMENTATION
643 if (!xptiCheckTraceEnabled(MStreamID, xpti::trace_node_create))
644 return CommandInstanceNo;
650 xpti::utils::StringHelper SH;
652 std::string CommandString =
655 xpti::payload_t p(CommandString.c_str(), MAddress);
656 xpti_td *CmdTraceEvent =
657 xptiMakeEvent(CommandString.c_str(), &p, xpti::trace_graph_event,
658 xpti_at::active, &CommandInstanceNo);
669 return CommandInstanceNo;
673 #ifdef XPTI_ENABLE_INSTRUMENTATION
674 constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
678 xptiNotifySubscribers(MStreamID, NotificationTraceType,
679 detail::GSYCLGraphEvent,
686 std::vector<Command *> &ToCleanUp) {
688 const ContextImplPtr &WorkerContext = WorkerQueue->getContextImplPtr();
696 bool PiEventExpected = (!DepEvent->is_host() && DepEvent->isInitialized());
697 if (auto *DepCmd = static_cast<Command *>(DepEvent->getCommand()))
698 PiEventExpected &= DepCmd->producesPiEvent();
700 if (!PiEventExpected) {
707 Command *ConnectionCmd = nullptr;
711 if (DepEventContext != WorkerContext && !WorkerContext->is_host()) {
717 return ConnectionCmd;
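// Dependency processing (logic above): a PI event is only expected from a
// dependency that is non-host, initialized, and whose producing command
// reports producesPiEvent(); when the dependency's context differs from the
// worker queue's non-host context, a connection command is created to bridge
// the two contexts and returned to the caller.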
721 return MQueue->getContextImplPtr();
725 assert(MWorkerQueue && "MWorkerQueue must not be nullptr");
739 Command *ConnectionCmd = nullptr;
748 if (!ConnectionCmd) {
749 MDeps.push_back(NewDep);
754 #ifdef XPTI_ENABLE_INSTRUMENTATION
760 return ConnectionCmd;
764 std::vector<Command *> &ToCleanUp) {
765 #ifdef XPTI_ENABLE_INSTRUMENTATION
779 #ifdef XPTI_ENABLE_INSTRUMENTATION
780 constexpr uint16_t NotificationTraceType = xpti::trace_signal;
781 if (!(xptiCheckTraceEnabled(MStreamID, NotificationTraceType) &&
786 xptiNotifySubscribers(
787 MStreamID, NotificationTraceType, detail::GSYCLGraphEvent,
793 #ifdef XPTI_ENABLE_INSTRUMENTATION
797 xptiNotifySubscribers(MStreamID, Type, detail::GSYCLGraphEvent,
799 static_cast<const void *>(Txt));
804 std::vector<Command *> &ToCleanUp) {
805 #ifdef XPTI_ENABLE_INSTRUMENTATION
809 std::unique_ptr<detail::tls_code_loc_t> AsyncCodeLocationPtr;
810 if (xptiTraceEnabled() && !CurrentCodeLocationValid()) {
811 AsyncCodeLocationPtr.reset(
827 #ifdef XPTI_ENABLE_INSTRUMENTATION
831 std::string Info = "enqueue.barrier[";
839 #ifdef XPTI_ENABLE_INSTRUMENTATION
850 #ifdef XPTI_ENABLE_INSTRUMENTATION
866 if (PI_SUCCESS != Res)
871 (
MEvent->is_host() || MEvent->getHandleRef() == nullptr))
882 ToCleanUp.push_back(this);
888 #ifdef XPTI_ENABLE_INSTRUMENTATION
895 #ifdef XPTI_ENABLE_INSTRUMENTATION
896 assert(MType == CommandType::RELEASE && "Expected release command");
902 xpti_td *TgtTraceEvent = static_cast<xpti_td *>(MTraceEvent);
907 for (auto &Item : DepList) {
908 if (Item->MTraceEvent && Item->MAddress == MAddress) {
909 xpti::utils::StringHelper SH;
910 std::string AddressStr = SH.addressAsString<void *>(MAddress);
911 std::string TypeString =
912 "Edge:" + SH.nameWithAddressString(commandToName(MType), AddressStr);
916 xpti::payload_t p(TypeString.c_str(), MAddress);
917 uint64_t EdgeInstanceNo;
919 xptiMakeEvent(TypeString.c_str(), &p, xpti::trace_graph_event,
920 xpti_at::active, &EdgeInstanceNo);
922 xpti_td *SrcTraceEvent = static_cast<xpti_td *>(Item->MTraceEvent);
923 EdgeEvent->target_id = TgtTraceEvent->unique_id;
924 EdgeEvent->source_id = SrcTraceEvent->unique_id;
925 xpti::addMetadata(EdgeEvent, "memory_object",
926 reinterpret_cast<size_t>(MAddress));
927 xptiNotifySubscribers(MStreamID, xpti::trace_edge_create,
928 detail::GSYCLGraphEvent, EdgeEvent,
929 EdgeInstanceNo, nullptr);
939 return "A Buffer is locked by the host accessor";
941 return "Blocked by host task";
944 return "Unknown block reason";
948 #ifdef XPTI_ENABLE_INSTRUMENTATION
949 if (!xptiTraceEnabled())
953 auto TData = Tls.query();
954 if (TData.fileName())
956 if (TData.functionName())
961 (int)TData.lineNumber(), (int)TData.columnNumber()};
969 :
Command(Type, Queue), MLinkedAllocaCmd(LinkedAllocaCmd),
970 MIsLeaderAlloca(nullptr == LinkedAllocaCmd), MIsConst(IsConst),
971 MRequirement(std::move(Req)), MReleaseCmd(Queue, this) {
977 #ifdef XPTI_ENABLE_INSTRUMENTATION
987 xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
988 xpti::addMetadata(TE, "sycl_device_type",
990 xpti::addMetadata(TE, "sycl_device_name",
992 xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
1004 bool InitFromUserData,
1007 LinkedAllocaCmd, IsConst),
1008 MInitFromUserData(InitFromUserData) {
1013 std::vector<Command *> ToCleanUp;
1016 assert(ConnectionCmd == nullptr);
1017 assert(ToCleanUp.empty());
1018 (void)ConnectionCmd;
1022 #ifdef XPTI_ENABLE_INSTRUMENTATION
1033 pi_int32 AllocaCommand::enqueueImp() {
1039 void *HostPtr = nullptr;
1054 std::move(EventImpls), Event);
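// printDot convention used by the command classes below: each command emits
// one filled DOT node labelled with its ID and type, then one edge per entry
// in MDeps annotated with the access mode and the memory object of the
// dependent requirement.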
1060 Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\"";
1062 Stream << "ID = " << this << "\\n";
1066 Stream << "\"];" << std::endl;
1068 for (const auto &Dep : MDeps) {
1069 if (Dep.MDepCommand == nullptr)
1071 Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1072 << " [ label = \"Access mode: "
1074 << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1081 std::vector<Command *> &ToEnqueue,
1082 std::vector<Command *> &ToCleanUp)
1086 MParentAlloca(ParentAlloca) {
1094 ToEnqueue.push_back(ConnectionCmd);
1098 #ifdef XPTI_ENABLE_INSTRUMENTATION
1105 xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
1107 xpti::addMetadata(TE, "access_range_start",
1109 xpti::addMetadata(TE, "access_range_end",
1121 return static_cast<void *>(
1128 pi_int32 AllocaSubBufCommand::enqueueImp() {
1144 Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\"";
1146 Stream << "ID = " << this << "\\n";
1152 Stream << "\"];" << std::endl;
1154 for (const auto &Dep : MDeps) {
1155 if (Dep.MDepCommand == nullptr)
1157 Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1158 << " [ label = \"Access mode: "
1160 << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1171 #ifdef XPTI_ENABLE_INSTRUMENTATION
1180 xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
1181 xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
1182 xpti::addMetadata(TE, "sycl_device_type",
1184 xpti::addMetadata(TE, "sycl_device_name",
1186 xpti::addMetadata(TE, "allocation_type",
1187 commandToName(MAllocaCmd->getType()));
1193 pi_int32 ReleaseCommand::enqueueImp() {
1196 std::vector<sycl::detail::pi::PiEvent> RawEvents = getPiEvents(EventImpls);
1197 bool SkipRelease = false;
1203 const bool CurAllocaIsHost = MAllocaCmd->getQueue()->is_host();
1204 bool NeedUnmap = false;
1218 NeedUnmap |= CurAllocaIsHost == MAllocaCmd->MIsActive;
1227 UnmapEventImpl->setContextImpl(Queue->getContextImplPtr());
1228 UnmapEventImpl->setStateIncomplete();
1231 void *Src = CurAllocaIsHost
1235 void *Dst = !CurAllocaIsHost
1240 RawEvents, UnmapEvent);
1244 EventImpls.push_back(UnmapEventImpl);
1258 Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FF827A\", label=\"";
1260 Stream << "ID = " << this << " ; ";
1262 Stream << " Alloca : " << MAllocaCmd << "\\n";
1263 Stream << " MemObj : " << MAllocaCmd->getSYCLMemObj() << "\\n";
1264 Stream << "\"];" << std::endl;
1266 for (const auto &Dep : MDeps) {
1267 Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1268 << " [ label = \"Access mode: "
1270 << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1285 MSrcAllocaCmd(SrcAllocaCmd), MSrcReq(std::move(Req)), MDstPtr(DstPtr),
1291 #ifdef XPTI_ENABLE_INSTRUMENTATION
1300 xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
1301 xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
1302 xpti::addMetadata(TE, "sycl_device_type",
1304 xpti::addMetadata(TE, "sycl_device_name",
1306 xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
1312 pi_int32 MapMemObject::enqueueImp() {
1315 std::vector<sycl::detail::pi::PiEvent> RawEvents = getPiEvents(EventImpls);
1328 Stream << "\"" << this << "\" [style=filled, fillcolor=\"#77AFFF\", label=\"";
1330 Stream << "ID = " << this << " ; ";
1333 Stream << "\"];" << std::endl;
1335 for (const auto &Dep : MDeps) {
1336 Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1337 << " [ label = \"Access mode: "
1339 << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1347 MDstAllocaCmd(DstAllocaCmd), MDstReq(std::move(Req)), MSrcPtr(SrcPtr) {
1352 #ifdef XPTI_ENABLE_INSTRUMENTATION
1361 xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
1362 xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
1363 xpti::addMetadata(TE, "sycl_device_type",
1365 xpti::addMetadata(TE, "sycl_device_name",
1367 xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
1389 return MQueue->getDeviceImplPtr()->getBackend() !=
1391 MEvent->getHandleRef() != nullptr;
1394 pi_int32 UnMapMemObject::enqueueImp() {
1397 std::vector<sycl::detail::pi::PiEvent> RawEvents = getPiEvents(EventImpls);
1403 std::move(RawEvents), Event);
1409 Stream << "\"" << this << "\" [style=filled, fillcolor=\"#EBC40F\", label=\"";
1411 Stream << "ID = " << this << " ; ";
1414 Stream << "\"];" << std::endl;
1416 for (const auto &Dep : MDeps) {
1417 Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1418 << " [ label = \"Access mode: "
1420 << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1431 MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)),
1432 MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)),
1433 MDstAllocaCmd(DstAllocaCmd) {
1434 if (!MSrcQueue->is_host()) {
1435 MEvent->setContextImpl(MSrcQueue->getContextImplPtr());
1445 #ifdef XPTI_ENABLE_INSTRUMENTATION
1454 xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1455 xpti::addMetadata(CmdTraceEvent, "sycl_device",
1456 deviceToID(MQueue->get_device()));
1457 xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
1459 xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
1461 xpti::addMetadata(CmdTraceEvent, "memory_object",
1462 reinterpret_cast<size_t>(MAddress));
1463 xpti::addMetadata(CmdTraceEvent, "copy_from",
1464 reinterpret_cast<size_t>(
1467 CmdTraceEvent, "copy_to",
1494 return MQueue->is_host() ||
1495 MQueue->getDeviceImplPtr()->getBackend() !=
1497 MEvent->getHandleRef() != nullptr;
1500 pi_int32 MemCpyCommand::enqueueImp() {
1520 Stream << "\"" << this << "\" [style=filled, fillcolor=\"#C7EB15\" label=\"";
1522 Stream << "ID = " << this << " ; ";
1524 Stream << "From: " << MSrcAllocaCmd << " is host: " << MSrcQueue->is_host()
1526 Stream << "To: " << MDstAllocaCmd << " is host: " << MQueue->is_host()
1529 Stream << "\"];" << std::endl;
1531 for (const auto &Dep : MDeps) {
1532 Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1533 << " [ label = \"Access mode: "
1535 << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1542 if (Dep.MDepRequirement == Req)
1543 return Dep.MAllocaCmd;
1551 std::vector<std::shared_ptr<const void>>
1554 return ((CGExecKernel *)MCommandGroup.get())->getAuxiliaryResources();
1563 pi_int32 UpdateHostRequirementCommand::enqueueImp() {
1569 assert(MSrcAllocaCmd && "Expected valid alloca command");
1570 assert(MSrcAllocaCmd->getMemAllocation() && "Expected valid source pointer");
1571 assert(MDstPtr && "Expected valid target pointer");
1578 Stream << "\"" << this << "\" [style=filled, fillcolor=\"#f1337f\", label=\"";
1580 Stream << "ID = " << this << "\\n";
1582 bool IsReqOnBuffer =
1584 Stream << "TYPE: " << (IsReqOnBuffer ? "Buffer" : "Image") << "\\n";
1586 Stream << "Is sub buffer: " << std::boolalpha << MDstReq.MIsSubBuffer
1589 Stream << "\"];" << std::endl;
1591 for (const auto &Dep : MDeps) {
1592 Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1593 << " [ label = \"Access mode: "
1595 << "MemObj: " << Dep.MAllocaCmd->getSYCLMemObj() << " \" ]"
1606 MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)),
1607 MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)), MDstPtr(DstPtr) {
1608 if (!MSrcQueue->is_host()) {
1609 MEvent->setContextImpl(MSrcQueue->getContextImplPtr());
1619 #ifdef XPTI_ENABLE_INSTRUMENTATION
1628 xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1629 xpti::addMetadata(CmdTraceEvent, "sycl_device",
1630 deviceToID(MQueue->get_device()));
1631 xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
1633 xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
1635 xpti::addMetadata(CmdTraceEvent, "memory_object",
1636 reinterpret_cast<size_t>(MAddress));
1637 xpti::addMetadata(CmdTraceEvent, "copy_from",
1638 reinterpret_cast<size_t>(
1641 CmdTraceEvent, "copy_to",
1652 pi_int32 MemCpyCommandHost::enqueueImp() {
1656 std::vector<sycl::detail::pi::PiEvent> RawEvents = getPiEvents(EventImpls);
1685 pi_int32 EmptyCommand::enqueueImp() {
1695 MRequirements.emplace_back(ReqRef);
1696 const Requirement *const StoredReq = &MRequirements.back();
1700 std::vector<Command *> ToCleanUp;
1702 assert(Cmd == nullptr && "Connection command should be null for EmptyCommand");
1703 assert(ToCleanUp.empty() && "addDep should add a command for cleanup only if "
1704 "there's a connection command");
1709 #ifdef XPTI_ENABLE_INSTRUMENTATION
1714 if (MRequirements.empty())
1723 xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1724 xpti::addMetadata(CmdTraceEvent, "sycl_device",
1725 deviceToID(MQueue->get_device()));
1726 xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
1728 xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
1730 xpti::addMetadata(CmdTraceEvent, "memory_object",
1731 reinterpret_cast<size_t>(MAddress));
1738 Stream << "\"" << this << "\" [style=filled, fillcolor=\"#8d8f29\", label=\"";
1740 Stream << "ID = " << this << "\\n";
1741 Stream << "EMPTY NODE"
1744 Stream << "\"];" << std::endl;
1746 for (const auto &Dep : MDeps) {
1747 Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1748 << " [ label = \"Access mode: "
1750 << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1758 Stream << "\"" << this << "\" [style=filled, fillcolor=\"#B6A2EB\", label=\"";
1760 Stream << "ID = " << this << "\\n";
1763 Stream << "\"];" << std::endl;
1765 for (const auto &Dep : MDeps) {
1766 Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1767 << " [ label = \"Access mode: "
1769 << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1778 MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(Req)), MDstPtr(DstPtr) {
1784 #ifdef XPTI_ENABLE_INSTRUMENTATION
1793 xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1794 xpti::addMetadata(CmdTraceEvent, "sycl_device",
1795 deviceToID(MQueue->get_device()));
1796 xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
1798 xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
1800 xpti::addMetadata(CmdTraceEvent, "memory_object",
1801 reinterpret_cast<size_t>(MAddress));
1813 return "update_host";
1819 return "copy acc to acc";
1822 return "copy acc to ptr";
1825 return "copy ptr to acc";
1834 return "prefetch usm";
1840 return "copy 2d usm";
1843 return "fill 2d usm";
1846 return "memset 2d usm";
1849 return "copy to device_global";
1852 return "copy from device_global";
1861 std::unique_ptr<detail::CG> CommandGroup, QueueImplPtr Queue,
1863 const std::vector<sycl::detail::pi::PiExtSyncPoint> &Dependencies)
1866 MCommandGroup(std::move(CommandGroup)) {
1868 MEvent->setSubmittedQueue(
1875 #ifdef XPTI_ENABLE_INSTRUMENTATION
1876 std::string instrumentationGetKernelName(
1877 const std::shared_ptr<detail::kernel_impl> &SyclKernel,
1878 const std::string &FunctionName, const std::string &SyclKernelName,
1879 void *&Address, std::optional<bool> &FromSource) {
1880 std::string KernelName;
1881 if (SyclKernel && SyclKernel->isCreatedFromSource()) {
1883 pi_kernel KernelHandle = SyclKernel->getHandleRef();
1884 Address = KernelHandle;
1885 KernelName = FunctionName;
1893 void instrumentationAddExtraKernelMetadata(
1894 xpti_td *&CmdTraceEvent, const NDRDescT &NDRDesc,
1896 const std::string &KernelName,
1897 const std::shared_ptr<detail::kernel_impl> &SyclKernel,
1899 std::vector<ArgDesc> &CGArgs)
1902 std::vector<ArgDesc> Args;
1904 auto FilterArgs = [&Args](detail::ArgDesc &Arg, int NextTrueIndex) {
1905 Args.push_back({Arg.MType, Arg.MPtr, Arg.MSize, NextTrueIndex});
1909 std::mutex *KernelMutex = nullptr;
1912 std::shared_ptr<kernel_impl> SyclKernelImpl;
1913 std::shared_ptr<device_image_impl> DeviceImageImpl;
1921 kernel_id KernelID =
1925 std::shared_ptr<kernel_impl> KernelImpl =
1928 EliminatedArgMask = KernelImpl->getKernelArgMask();
1929 Program = KernelImpl->getDeviceImage()->get_program_ref();
1930 } else if (nullptr != SyclKernel) {
1931 auto SyclProg = SyclKernel->getProgramImpl();
1932 Program = SyclProg->getHandleRef();
1933 if (!SyclKernel->isCreatedFromSource())
1934 EliminatedArgMask = SyclKernel->getKernelArgMask();
1936 std::tie(Kernel, KernelMutex, EliminatedArgMask, Program) =
1938 Queue->getContextImplPtr(), Queue->getDeviceImplPtr(), KernelName,
1944 xpti::offload_kernel_enqueue_data_t KernelData{
1945 {NDRDesc.GlobalSize[0], NDRDesc.GlobalSize[1], NDRDesc.GlobalSize[2]},
1946 {NDRDesc.LocalSize[0], NDRDesc.LocalSize[1], NDRDesc.LocalSize[2]},
1947 {NDRDesc.GlobalOffset[0], NDRDesc.GlobalOffset[1],
1948 NDRDesc.GlobalOffset[2]},
1950 xpti::addMetadata(CmdTraceEvent, "enqueue_kernel_data", KernelData);
1951 for (size_t i = 0; i < Args.size(); i++) {
1952 std::string Prefix("arg");
1953 xpti::offload_kernel_arg_data_t arg{(int)Args[i].MType, Args[i].MPtr,
1954 Args[i].MSize, Args[i].MIndex};
1955 xpti::addMetadata(CmdTraceEvent, Prefix + std::to_string(i), arg);
1959 void instrumentationFillCommonData(const std::string &KernelName,
1960 const std::string &FileName, uint64_t Line,
1961 uint64_t Column, const void *const Address,
1963 std::optional<bool> &FromSource,
1964 uint64_t &OutInstanceID,
1965 xpti_td *&OutTraceEvent) {
1972 bool HasSourceInfo = false;
1973 xpti::payload_t Payload;
1974 if (!FileName.empty()) {
1976 Payload = xpti::payload_t(KernelName.c_str(), FileName.c_str(), Line,
1978 HasSourceInfo = true;
1979 } else if (Address) {
1981 Payload = xpti::payload_t(KernelName.c_str(), Address);
1985 Payload = xpti::payload_t(KernelName.c_str());
1988 uint64_t CGKernelInstanceNo;
1990 xpti_td *CmdTraceEvent =
1991 xptiMakeEvent("ExecCG", &Payload, xpti::trace_graph_event,
1992 xpti::trace_activity_type_t::active, &CGKernelInstanceNo);
1994 if (CmdTraceEvent) {
1995 OutInstanceID = CGKernelInstanceNo;
1996 OutTraceEvent = CmdTraceEvent;
2000 if (CGKernelInstanceNo > 1)
2003 xpti::addMetadata(CmdTraceEvent, "sycl_device",
2004 deviceToID(Queue->get_device()));
2005 xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
2007 xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
2009 if (!KernelName.empty()) {
2010 xpti::addMetadata(CmdTraceEvent, "kernel_name", KernelName);
2012 if (FromSource.has_value()) {
2013 xpti::addMetadata(CmdTraceEvent, "from_source", FromSource.value());
2015 if (HasSourceInfo) {
2016 xpti::addMetadata(CmdTraceEvent, "sym_function_name", KernelName);
2017 xpti::addMetadata(CmdTraceEvent, "sym_source_file_name", FileName);
2018 xpti::addMetadata(CmdTraceEvent, "sym_line_no", static_cast<int>(Line));
2019 xpti::addMetadata(CmdTraceEvent, "sym_column_no",
2020 static_cast<int>(Column));
2027 const std::shared_ptr<detail::kernel_impl> &SyclKernel,
2031 std::vector<ArgDesc> &CGArgs) {
2032 #ifdef XPTI_ENABLE_INSTRUMENTATION
2033 constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
2035 if (!xptiCheckTraceEnabled(StreamID, NotificationTraceType))
2038 void *Address = nullptr;
2039 std::optional<bool> FromSource;
2040 std::string KernelName = instrumentationGetKernelName(
2041 SyclKernel, std::string(CodeLoc.functionName()), SyclKernelName, Address,
2044 xpti_td *CmdTraceEvent = nullptr;
2045 uint64_t InstanceID = -1;
2046 std::string FileName =
2048 instrumentationFillCommonData(KernelName, FileName, CodeLoc.lineNumber(),
2050 FromSource, InstanceID, CmdTraceEvent);
2052 if (CmdTraceEvent) {
2053 instrumentationAddExtraKernelMetadata(CmdTraceEvent, NDRDesc,
2055 SyclKernel, Queue, CGArgs);
2057 xptiNotifySubscribers(
2058 StreamID, NotificationTraceType, detail::GSYCLGraphEvent, CmdTraceEvent,
2060 static_cast<const void *>(
2061 commandToNodeType(Command::CommandType::RUN_CG).c_str()));
2064 std::ignore = SyclKernel;
2065 std::ignore = CodeLoc;
2066 std::ignore = SyclKernelName;
2067 std::ignore = Queue;
2068 std::ignore = NDRDesc;
2070 std::ignore = CGArgs;
2075 #ifdef XPTI_ENABLE_INSTRUMENTATION
2076 constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
2077 if (!xptiCheckTraceEnabled(MStreamID, NotificationTraceType))
2080 std::string KernelName;
2081 std::optional<bool> FromSource;
2082 switch (MCommandGroup->getType()) {
2086 KernelName = instrumentationGetKernelName(
2087 KernelCG->MSyclKernel, MCommandGroup->MFunctionName,
2088 KernelCG->getKernelName(), MAddress, FromSource);
2095 xpti_td *CmdTraceEvent = nullptr;
2096 instrumentationFillCommonData(KernelName, MCommandGroup->MFileName,
2097 MCommandGroup->MLine, MCommandGroup->MColumn,
2101 if (CmdTraceEvent) {
2106 instrumentationAddExtraKernelMetadata(
2107 CmdTraceEvent, KernelCG->MNDRDesc, KernelCG->getKernelBundle(),
2108 KernelCG->MKernelName, KernelCG->MSyclKernel, MQueue,
2112 xptiNotifySubscribers(
2113 MStreamID, NotificationTraceType, detail::GSYCLGraphEvent,
2115 static_cast<const void *>(commandToNodeType(MType).c_str()));
2121 Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\"";
2123 Stream << "ID = " << this << "\\n";
2126 switch (MCommandGroup->getType()) {
2130 Stream << "Kernel name: ";
2131 if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource())
2132 Stream << "created from source";
2139 Stream << "CG type: " << cgTypeToString(MCommandGroup->getType()) << "\\n";
2143 Stream << "\"];" << std::endl;
2145 for (const auto &Dep : MDeps) {
2146 Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
2147 << " [ label = \"Access mode: "
2149 << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
2174 sycl::info::kernel_device_specific::compile_work_group_size>(
2177 if (WGSize[0] == 0) {
2201 switch (AccessorMode) {
2214 const std::shared_ptr<device_image_impl> &DeviceImageImpl,
2215 const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
2217 size_t NextTrueIndex) {
2218 switch (Arg.MType) {
2229 getMemAllocationFunc
2247 &MemObjData, &MemArg);
2257 sampler *SamplerPtr = (sampler *)Arg.MPtr;
2273 "SYCL2020 specialization constants are not yet supported on host "
2277 assert(DeviceImageImpl != nullptr);
2279 DeviceImageImpl->get_spec_const_buffer_ref();
2282 SpecConstsBuffer ? &SpecConstsBuffer : nullptr;
2288 Kernel, NextTrueIndex, &MemObjData, SpecConstsBufferArg);
2293 "Invalid kernel param kind " +
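// Kernel launch path (sketch): each argument is bound according to its
// ArgDesc kind by the per-argument callback, the local size defaults to the
// ND-range's LocalSize unless the device reports a required work-group size
// for the kernel (RequiredWGSize below), and the kernel is enqueued with the
// collected raw events as its wait list.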
2301 const std::shared_ptr<device_image_impl> &DeviceImageImpl,
2303 std::vector<sycl::detail::pi::PiEvent> &RawEvents,
2306 const std::function<void *(Requirement *Req)> &getMemAllocationFunc) {
2307 const PluginPtr &Plugin = Queue->getPlugin();
2309 auto setFunc = [&Plugin, Kernel, &DeviceImageImpl, &getMemAllocationFunc,
2312 Queue->get_context(), Queue->is_host(), Arg,
2321 const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0);
2325 size_t RequiredWGSize[3] = {0, 0, 0};
2326 size_t *LocalSize = nullptr;
2332 Kernel, Queue->getDeviceImplPtr()->getHandleRef(),
2334 RequiredWGSize, nullptr);
2336 const bool EnforcedLocalSize =
2337 (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
2338 RequiredWGSize[2] != 0);
2339 if (EnforcedLocalSize)
2340 LocalSize = RequiredWGSize;
2342 if (OutEventImpl != nullptr)
2343 OutEventImpl->setHostEnqueueTime();
2346 &NDRDesc.GlobalSize[0], LocalSize, RawEvents.size(),
2347 RawEvents.empty() ? nullptr : &RawEvents[0],
2348 OutEventImpl ? &OutEventImpl->getHandleRef() : nullptr);
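// Native host-kernel dispatch (sketch of the code below): the opaque Blob is
// treated as an array of pointers carrying the requirement list, the
// HostKernelBase instance and the NDR descriptor; each requirement's MData is
// patched to its allocation pointer before the host kernel is invoked
// directly.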
2354 void **CastedBlob = (void **)Blob;
2356 std::vector<Requirement *> *Reqs =
2357 static_cast<std::vector<Requirement *> *>(CastedBlob[0]);
2359 std::shared_ptr<HostKernelBase> *HostKernel =
2360 static_cast<std::shared_ptr<HostKernelBase> *>(CastedBlob[1]);
2365 void **NextArg = CastedBlob + 3;
2367 Req->MData = *(NextArg++);
2369 (*HostKernel)->call(*NDRDesc, nullptr);
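// Command-buffer kernel enqueue (below): the PI kernel is obtained through
// the ProgramManager cache when no prebuilt kernel is supplied, arguments are
// set with the same eliminated-argument filtering as the regular path, and
// the kernel is appended to the PI command buffer with the accumulated sync
// points as dependencies. The regular enqueueImpKernel path further down
// resolves the kernel from a kernel bundle, an interop kernel object, or the
// cache, together with its eliminated-argument mask and cache mutex.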
2381 std::vector<sycl::detail::pi::PiExtSyncPoint> &SyncPoints,
2383 const std::function<void *(Requirement *Req)> &getMemAllocationFunc) {
2387 std::mutex *KernelMutex = nullptr;
2392 if (Kernel != nullptr) {
2396 sycl::detail::ProgramManager::getInstance().getOrCreateKernel(
2397 ContextImpl, DeviceImpl, CommandGroup.MKernelName, nullptr);
2400 auto SetFunc = [&Plugin, &PiKernel, &Ctx, &getMemAllocationFunc](
2401 sycl::detail::ArgDesc &Arg, size_t NextTrueIndex) {
2406 getMemAllocationFunc, Ctx, false, Arg, NextTrueIndex);
2409 auto Args = CommandGroup.MArgs;
2416 auto NDRDesc = CommandGroup.MNDRDesc;
2420 size_t RequiredWGSize[3] = {0, 0, 0};
2421 size_t *LocalSize = nullptr;
2427 PiKernel, DeviceImpl->getHandleRef(),
2432 const bool EnforcedLocalSize =
2433 (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
2434 RequiredWGSize[2] != 0);
2435 if (EnforcedLocalSize)
2436 LocalSize = RequiredWGSize;
2442 &NDRDesc.GlobalSize[0], LocalSize, SyncPoints.size(),
2443 SyncPoints.size() ? SyncPoints.data() : nullptr, OutSyncPoint);
2445 if (Res != pi_result::PI_SUCCESS) {
2447 "Failed to add kernel to PI command-buffer");
2456 const std::shared_ptr<detail::kernel_impl> &MSyclKernel,
2457 const std::string &KernelName,
2458 std::vector<sycl::detail::pi::PiEvent> &RawEvents,
2460 const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
2464 auto ContextImpl = Queue->getContextImplPtr();
2465 auto DeviceImpl = Queue->getDeviceImplPtr();
2467 std::mutex *KernelMutex = nullptr;
2471 std::shared_ptr<kernel_impl> SyclKernelImpl;
2472 std::shared_ptr<device_image_impl> DeviceImageImpl;
2487 Kernel = SyclKernelImpl->getHandleRef();
2488 DeviceImageImpl = SyclKernelImpl->getDeviceImage();
2490 Program = DeviceImageImpl->get_program_ref();
2492 std::tie(Kernel, KernelMutex, EliminatedArgMask) =
2496 } else if (nullptr != MSyclKernel) {
2497 assert(MSyclKernel->get_info<info::kernel::context>() ==
2498 Queue->get_context());
2499 Kernel = MSyclKernel->getHandleRef();
2500 auto SyclProg = MSyclKernel->getProgramImpl();
2501 Program = SyclProg->getHandleRef();
2502 if (SyclProg->is_cacheable()) {
2504 std::tie(FoundKernel, KernelMutex, EliminatedArgMask, std::ignore) =
2506 ContextImpl, DeviceImpl, KernelName, SyclProg.get());
2507 assert(FoundKernel == Kernel);
2515 KernelMutex = &MSyclKernel->getNoncacheableEnqueueMutex();
2516 EliminatedArgMask = MSyclKernel->getKernelArgMask();
2519 std::tie(Kernel, KernelMutex, EliminatedArgMask, Program) =
2521 ContextImpl, DeviceImpl, KernelName, nullptr);
2525 std::vector<sycl::detail::pi::PiEvent> &EventsWaitList = RawEvents;
2528 std::vector<sycl::detail::pi::PiEvent> DeviceGlobalInitEvents =
2529 ContextImpl->initializeDeviceGlobals(Program, Queue);
2530 std::vector<sycl::detail::pi::PiEvent> EventsWithDeviceGlobalInits;
2531 if (!DeviceGlobalInitEvents.empty()) {
2532 EventsWithDeviceGlobalInits.reserve(RawEvents.size() +
2533 DeviceGlobalInitEvents.size());
2534 EventsWithDeviceGlobalInits.insert(EventsWithDeviceGlobalInits.end(),
2535 RawEvents.begin(), RawEvents.end());
2536 EventsWithDeviceGlobalInits.insert(EventsWithDeviceGlobalInits.end(),
2537 DeviceGlobalInitEvents.begin(),
2538 DeviceGlobalInitEvents.end());
2539 EventsWaitList = EventsWithDeviceGlobalInits;
2544 assert(KernelMutex);
2545 std::lock_guard<std::mutex> Lock(*KernelMutex);
2551 const PluginPtr &Plugin = Queue->getPlugin();
2558 NDRDesc, EventsWaitList, OutEventImpl,
2559 EliminatedArgMask, getMemAllocationFunc);
2561 if (PI_SUCCESS != Error) {
2564 const device_impl &DeviceImpl = *(Queue->getDeviceImplPtr());
2574 bool blocking, void *ptr, size_t size,
2575 std::vector<sycl::detail::pi::PiEvent> &RawEvents,
2581 device Device = Queue->get_device();
2583 std::optional<sycl::detail::pi::PiProgram> CachedProgram =
2584 ContextImpl->getProgramForHostPipe(Device, hostPipeEntry);
2586 Program = *CachedProgram;
2592 Queue->get_device());
2597 assert(Program && "Program for this hostpipe is not compiled.");
2600 const PluginPtr &Plugin = Queue->getPlugin();
2602 pi_queue pi_q = Queue->getHandleRef();
2605 auto OutEvent = OutEventImpl ? &OutEventImpl->getHandleRef() : nullptr;
2606 if (OutEventImpl != nullptr)
2607 OutEventImpl->setHostEnqueueTime();
2611 pi_q, Program, PipeName.c_str(), blocking, ptr, size,
2612 RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0],
2618 pi_q, Program, PipeName.c_str(), blocking, ptr, size,
2619 RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0],
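// ExecCGCommand: the two enqueue paths below translate the stored command
// group (MCommandGroup) into backend work. The command-buffer variant only
// handles CG types that can be recorded into a PI command buffer and throws
// "CG type not implemented for command buffers." for everything else.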
2625 pi_int32 ExecCGCommand::enqueueImpCommandBuffer() {
2631 (
MQueue->has_discard_events_support() &&
2632 MCommandGroup->getRequirements().size() == 0)
2634 : &MEvent->getHandleRef();
2636 switch (MCommandGroup->getType()) {
2637 case CG::CGTYPE::Kernel: {
2638 CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
2640 auto getMemAllocationFunc = [this](Requirement *Req) {
2641 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2642 return AllocaCmd->getMemAllocation();
2647 bool KernelUsesAssert =
2648 !(ExecKernel->MSyclKernel && ExecKernel->MSyclKernel->isInterop()) &&
2650 ExecKernel->MKernelName);
2651 if (KernelUsesAssert) {
2652 Event = &MEvent->getHandleRef();
2657 *ExecKernel, MSyncPointDeps, &OutSyncPoint, getMemAllocationFunc);
2658 MEvent->setSyncPoint(OutSyncPoint);
2661 case CG::CGTYPE::CopyUSM: {
2662 CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
2665 Copy->getLength(), Copy->getDst(), MSyncPointDeps, &OutSyncPoint);
2666 MEvent->setSyncPoint(OutSyncPoint);
2669 case CG::CGTYPE::CopyAccToAcc: {
2670 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2674 AllocaCommandBase *AllocaCmdSrc = getAllocaForReq(ReqSrc);
2675 AllocaCommandBase *AllocaCmdDst = getAllocaForReq(ReqDst);
2679 AllocaCmdSrc->getSYCLMemObj(), AllocaCmdSrc->getMemAllocation(),
2680 ReqSrc->MDims, ReqSrc->MMemoryRange, ReqSrc->MAccessRange,
2681 ReqSrc->MOffset, ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(),
2682 ReqDst->MDims, ReqDst->MMemoryRange, ReqDst->MAccessRange,
2685 MEvent->setSyncPoint(OutSyncPoint);
2688 case CG::CGTYPE::CopyAccToPtr: {
2689 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2691 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2695 AllocaCmd->getMemAllocation(), Req->MDims, Req->MMemoryRange,
2696 Req->MAccessRange, Req->MOffset, Req->MElemSize, (char *)Copy->getDst(),
2697 Req->MDims, Req->MAccessRange,
2703 case CG::CGTYPE::CopyPtrToAcc: {
2704 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2706 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2710 (
char *)Copy->getSrc(), Req->MDims, Req->MAccessRange,
2711 {0, 0, 0}, Req->MElemSize, AllocaCmd->getMemAllocation(),
2712 Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2718 throw runtime_error("CG type not implemented for command buffers.",
2719 PI_ERROR_INVALID_OPERATION);
2723 pi_int32 ExecCGCommand::enqueueImp() {
2725 return enqueueImpCommandBuffer();
2727 return enqueueImpQueue();
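// enqueueImpQueue: a large dispatch over CG::CGTYPE that lowers each command
// group onto the queue through the corresponding MemoryManager or plugin call
// (copies, fills, USM operations, kernels, host tasks, barriers, host pipes,
// command-buffer execution, image copies, semaphores).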
2731 pi_int32 ExecCGCommand::enqueueImpQueue() {
2738 bool DiscardEvent = (MQueue->has_discard_events_support() &&
2739 MCommandGroup->getRequirements().size() == 0);
2741 DiscardEvent ? nullptr : &MEvent->getHandleRef();
2744 switch (MCommandGroup->getType()) {
2746 case CG::CGTYPE::UpdateHost: {
2748 "Update host should be handled by the Scheduler. " +
2751 case CG::CGTYPE::CopyAccToPtr: {
2752 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2754 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2757 AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(), MQueue,
2758 Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2759 Req->MElemSize, Copy->getDst(),
2761 Req->MAccessRange, Req->MAccessRange, {0, 0, 0},
2762 Req->MElemSize, std::move(RawEvents), MEvent->getHandleRef(), MEvent);
2766 case CG::CGTYPE::CopyPtrToAcc: {
2767 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2769 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2774 AllocaCmd->getSYCLMemObj(), Copy->getSrc(),
2776 Req->MAccessRange, Req->MAccessRange,
2777 {0, 0, 0}, Req->MElemSize, AllocaCmd->getMemAllocation(),
2778 MQueue, Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2779 Req->MElemSize, std::move(RawEvents), MEvent->getHandleRef(), MEvent);
2783 case CG::CGTYPE::CopyAccToAcc: {
2784 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2788 AllocaCommandBase *AllocaCmdSrc = getAllocaForReq(ReqSrc);
2789 AllocaCommandBase *AllocaCmdDst = getAllocaForReq(ReqDst);
2792 AllocaCmdSrc->getSYCLMemObj(), AllocaCmdSrc->getMemAllocation(), MQueue,
2793 ReqSrc->MDims, ReqSrc->MMemoryRange, ReqSrc->MAccessRange,
2794 ReqSrc->MOffset, ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(),
2795 MQueue, ReqDst->MDims, ReqDst->MMemoryRange, ReqDst->MAccessRange,
2796 ReqDst->MOffset, ReqDst->MElemSize, std::move(RawEvents),
2801 case CG::CGTYPE::Fill: {
2802 CGFill *Fill = (CGFill *)MCommandGroup.get();
2804 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2807 AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(), MQueue,
2808 Fill->MPattern.size(), Fill->MPattern.data(), Req->MDims,
2809 Req->MMemoryRange, Req->MAccessRange, Req->MOffset, Req->MElemSize,
2814 case CG::CGTYPE::Kernel: {
2815 CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
2817 NDRDescT &NDRDesc = ExecKernel->MNDRDesc;
2818 std::vector<ArgDesc> &Args = ExecKernel->MArgs;
2820 if (MQueue->is_host() || (MQueue->getDeviceImplPtr()->getBackend() ==
2821 backend::ext_intel_esimd_emulator)) {
2822 for (ArgDesc &Arg : Args)
2825 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2826 Req->MData = AllocaCmd->getMemAllocation();
2828 if (!RawEvents.empty()) {
2830 const PluginPtr &Plugin = EventImpls[0]->getPlugin();
2835 ExecKernel->MHostKernel->call(NDRDesc,
2836 getEvent()->getHostProfilingInfo());
2838 assert(MQueue->getDeviceImplPtr()->getBackend() ==
2839 backend::ext_intel_esimd_emulator);
2841 MEvent->setHostEnqueueTime();
2844 reinterpret_cast<pi_kernel>(ExecKernel->MHostKernel->getPtr()),
2845 NDRDesc.Dims, &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0],
2846 &NDRDesc.LocalSize[0], 0, nullptr, nullptr);
2851 auto getMemAllocationFunc = [this](Requirement *Req) {
2852 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2855 return AllocaCmd ? AllocaCmd->getMemAllocation() : nullptr;
2858 const std::shared_ptr<detail::kernel_impl> &SyclKernel =
2859 ExecKernel->MSyclKernel;
2860 const std::string &KernelName = ExecKernel->MKernelName;
2864 bool KernelUsesAssert =
2865 !(SyclKernel && SyclKernel->isInterop()) &&
2867 if (KernelUsesAssert) {
2873 MQueue, NDRDesc, Args, ExecKernel->getKernelBundle(), SyclKernel,
2874 KernelName, RawEvents, EventImpl, getMemAllocationFunc,
2875 ExecKernel->MKernelCacheConfig);
2877 case CG::CGTYPE::CopyUSM: {
2878 CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
2880 Copy->getDst(), std::move(RawEvents), Event,
2885 case CG::CGTYPE::FillUSM: {
2886 CGFillUSM *Fill = (CGFillUSM *)MCommandGroup.get();
2888 Fill->getFill(), std::move(RawEvents), Event,
2893 case CG::CGTYPE::PrefetchUSM: {
2894 CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get();
2896 Prefetch->getLength(), std::move(RawEvents),
2901 case CG::CGTYPE::AdviseUSM: {
2902 CGAdviseUSM *Advise = (CGAdviseUSM *)MCommandGroup.get();
2904 Advise->getAdvice(), std::move(RawEvents), Event,
2909 case CG::CGTYPE::Copy2DUSM: {
2910 CGCopy2DUSM *Copy = (CGCopy2DUSM *)MCommandGroup.get();
2912 Copy->getDst(), Copy->getDstPitch(),
2913 Copy->getWidth(), Copy->getHeight(),
2914 std::move(RawEvents), Event, MEvent);
2917 case CG::CGTYPE::Fill2DUSM: {
2918 CGFill2DUSM *Fill = (CGFill2DUSM *)MCommandGroup.get();
2920 Fill->getWidth(), Fill->getHeight(),
2921 Fill->getPattern(), std::move(RawEvents), Event,
2925 case CG::CGTYPE::Memset2DUSM: {
2926 CGMemset2DUSM *Memset = (CGMemset2DUSM *)MCommandGroup.get();
2928 Memset->getWidth(), Memset->getHeight(),
2929 Memset->getValue(), std::move(RawEvents),
2933 case CG::CGTYPE::CodeplayHostTask: {
2934 CGHostTask *HostTask = static_cast<CGHostTask *>(MCommandGroup.get());
2936 for (ArgDesc &Arg : HostTask->MArgs) {
2937 switch (Arg.MType) {
2940 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2943 Req->MData = AllocaCmd->getMemAllocation();
2948 "Unsupported arg type " +
2953 std::vector<interop_handle::ReqToMem> ReqToMem;
2955 if (HostTask->MHostTask->isInteropTask()) {
2958 const std::vector<Requirement *> &HandlerReq =
2959 HostTask->getRequirements();
2960 auto ReqToMemConv = [&ReqToMem, HostTask](Requirement *Req) {
2961 const std::vector<AllocaCommandBase *> &AllocaCmds =
2962 Req->MSYCLMemObj->MRecord->MAllocaCommands;
2964 for (AllocaCommandBase *AllocaCmd : AllocaCmds)
2965 if (HostTask->MQueue->getContextImplPtr() ==
2966 AllocaCmd->getQueue()->getContextImplPtr()) {
2968 reinterpret_cast<pi_mem>(AllocaCmd->getMemAllocation());
2969 ReqToMem.emplace_back(std::make_pair(Req, MemArg));
2975 "Can't get memory object due to no allocation available");
2979 "Can't get memory object due to no allocation available " +
2982 std::for_each(std::begin(HandlerReq), std::end(HandlerReq), ReqToMemConv);
2983 std::sort(std::begin(ReqToMem), std::end(ReqToMem));
2997 case CG::CGTYPE::Barrier: {
2998 if (MQueue->getDeviceImplPtr()->is_host()) {
3004 MEvent->setHostEnqueueTime();
3006 MQueue->getHandleRef(), 0, nullptr, Event);
3010 case CG::CGTYPE::BarrierWaitlist: {
3011 CGBarrier *Barrier = static_cast<CGBarrier *>(MCommandGroup.get());
3012 std::vector<detail::EventImplPtr> Events = Barrier->MEventsWaitWithBarrier;
3013 std::vector<sycl::detail::pi::PiEvent> PiEvents =
3015 if (MQueue->getDeviceImplPtr()->is_host() || PiEvents.empty()) {
3022 MEvent->setHostEnqueueTime();
3024 MQueue->getHandleRef(), PiEvents.size(), &PiEvents[0], Event);
3028 case CG::CGTYPE::CopyToDeviceGlobal: {
3029 CGCopyToDeviceGlobal *Copy = (CGCopyToDeviceGlobal *)MCommandGroup.get();
3031 Copy->getDeviceGlobalPtr(), Copy->isDeviceImageScoped(), MQueue,
3032 Copy->getNumBytes(), Copy->getOffset(), Copy->getSrc(),
3033 std::move(RawEvents), Event, MEvent);
3037 case CG::CGTYPE::CopyFromDeviceGlobal: {
3038 CGCopyFromDeviceGlobal *Copy =
3039 (CGCopyFromDeviceGlobal *)MCommandGroup.get();
3041 Copy->getDeviceGlobalPtr(), Copy->isDeviceImageScoped(), MQueue,
3042 Copy->getNumBytes(), Copy->getOffset(), Copy->getDest(),
3043 std::move(RawEvents), Event, MEvent);
3047 case CG::CGTYPE::ReadWriteHostPipe: {
3048 CGReadWriteHostPipe *ExecReadWriteHostPipe =
3049 (CGReadWriteHostPipe *)MCommandGroup.get();
3050 std::string pipeName = ExecReadWriteHostPipe->getPipeName();
3051 void *hostPtr = ExecReadWriteHostPipe->getHostPtr();
3052 size_t typeSize = ExecReadWriteHostPipe->getTypeSize();
3053 bool blocking = ExecReadWriteHostPipe->isBlocking();
3054 bool read = ExecReadWriteHostPipe->isReadHostPipe();
3060 typeSize, RawEvents, EventImpl, read);
3062 case CG::CGTYPE::ExecCommandBuffer: {
3063 CGExecCommandBuffer *CmdBufferCG =
3064 static_cast<CGExecCommandBuffer *
>(MCommandGroup.get());
3066 MEvent->setHostEnqueueTime();
3067 return MQueue->getPlugin()
3069 CmdBufferCG->MCommandBuffer,
MQueue->getHandleRef(),
3070 RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0],
3073 case CG::CGTYPE::CopyImage: {
3074 CGCopyImage *Copy = (CGCopyImage *)MCommandGroup.get();
3079 Copy->getSrc(), MQueue, Copy->getDst(), Desc, Copy->getFormat(),
3080 Copy->getCopyFlags(), Copy->getSrcOffset(), Copy->getDstOffset(),
3081 Copy->getHostExtent(), Copy->getCopyExtent(), std::move(RawEvents),
3085 case CG::CGTYPE::SemaphoreWait: {
3086 CGSemaphoreWait *SemWait = (CGSemaphoreWait *)MCommandGroup.get();
3087 if (MQueue->getDeviceImplPtr()->is_host()) {
3094 MQueue->getHandleRef(), SemWait->getInteropSemaphoreHandle(), 0,
3099 case CG::CGTYPE::SemaphoreSignal: {
3100 CGSemaphoreSignal *SemSignal = (CGSemaphoreSignal *)MCommandGroup.get();
3101 if (MQueue->getDeviceImplPtr()->is_host()) {
3108 MQueue->getHandleRef(), SemSignal->getInteropSemaphoreHandle(), 0,
3113 case CG::CGTYPE::None:
3115 "CG type not implemented. " +
3118 return PI_ERROR_INVALID_OPERATION;
3123 MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask;
3129 (MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask);
3133 if (MCommandGroup->getType() == CG::CGTYPE::CodeplayHostTask)
3145 return MAuxiliaryCommands;
3149 MFusionList.push_back(Kernel);
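// KernelFusionCommand: kernels to be fused are collected in MFusionList; when
// XPTI tracing is enabled a "Fusion command #N" graph node is created with
// the usual device metadata and subscribers are notified.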
3158 pi_int32 KernelFusionCommand::enqueueImp() {
3174 "Cannot release the queue attached to the KernelFusionCommand if it "
3181 #ifdef XPTI_ENABLE_INSTRUMENTATION
3182 constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
3183 if (!xptiCheckTraceEnabled(MStreamID, NotificationTraceType)) {
3191 static unsigned FusionNodeCount = 0;
3192 std::stringstream PayloadStr;
3193 PayloadStr << "Fusion command #" << FusionNodeCount++;
3194 xpti::payload_t Payload = xpti::payload_t(PayloadStr.str().c_str());
3196 uint64_t CommandInstanceNo = 0;
3197 xpti_td *CmdTraceEvent =
3198 xptiMakeEvent(MCommandName.c_str(), &Payload, xpti::trace_graph_event,
3199 xpti_at::active, &CommandInstanceNo);
3202 if (CmdTraceEvent) {
3218 xpti::addMetadata(CmdTraceEvent, "sycl_device",
3219 deviceToID(MQueue->get_device()));
3220 xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
3222 xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
3227 xptiNotifySubscribers(MStreamID, NotificationTraceType,
3228 detail::GSYCLGraphEvent,
3237 Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\"";
3239 Stream << "ID = " << this << "\\n";
3241 << "FUSION LIST: {";
3242 bool Initial = true;
3243 for (auto *Cmd : MFusionList) {
3249 if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource()) {
3250 Stream << "created from source";
3257 Stream << "\"];" << std::endl;
3259 for (const auto &Dep : MDeps) {
3260 Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
3261 << " [ label = \"Access mode: "
3263 << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"