37 #if __has_include(<cxxabi.h>)
38 #define __SYCL_ENABLE_GNU_DEMANGLING
45 #ifdef XPTI_ENABLE_INSTRUMENTATION
46 #include "xpti/xpti_trace_framework.hpp"
54 #ifdef XPTI_ENABLE_INSTRUMENTATION
56 extern xpti::trace_event_data_t *GSYCLGraphEvent;
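// Returns true when the thread-local code location record still carries a
// non-empty file or function name, i.e. instrumentation can attribute the
// current work to a user source location.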
58 bool CurrentCodeLocationValid() {
59 detail::tls_code_loc_t Tls;
60 auto CodeLoc = Tls.query();
61 auto FileName = CodeLoc.fileName();
62 auto FunctionName = CodeLoc.functionName();
63 return (FileName && FileName[0] != '\0') ||
64 (FunctionName && FunctionName[0] != '\0');
68 #ifdef __SYCL_ENABLE_GNU_DEMANGLING
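// Small RAII wrapper around the buffer returned by abi::__cxa_demangle;
// presumably it frees the string in its destructor (not shown in this excerpt).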
69 struct DemangleHandle {
71 DemangleHandle(char *ptr) : p(ptr) {}
76 DemangleHandle result(abi::__cxa_demangle(Name.c_str(), NULL, NULL, &Status));
77 return (Status == 0) ? result.p : Name;
86 else if (Device.is_cpu())
88 else if (Device.is_gpu())
90 else if (Device.is_accelerator())
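// Applies Func to every kernel argument, skipping the ones marked as unused in
// EliminatedArgMask and renumbering the remaining arguments with a contiguous
// NextTrueIndex.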
98 std::vector<ArgDesc> &Args,
100 if (EliminatedArgMask.empty()) {
108 std::sort(Args.begin(), Args.end(), [](const ArgDesc &A, const ArgDesc &B) {
109 return A.MIndex < B.MIndex;
112 size_t NextTrueIndex = 0;
117 for (int Idx = LastIndex + 1; Idx < Arg.MIndex; ++Idx)
118 if (!EliminatedArgMask[Idx])
122 if (EliminatedArgMask[Arg.MIndex])
125 Func(Arg, NextTrueIndex);
131 #ifdef XPTI_ENABLE_INSTRUMENTATION
132 static size_t deviceToID(const device &Device) {
136 return reinterpret_cast<size_t>(getSyclObjImpl(Device)->getHandleRef());
142 case access::mode::read:
148 case access::mode::discard_write:
149 return "discard_write";
150 case access::mode::discard_read_write:
151 return "discard_read_write";
157 #ifdef XPTI_ENABLE_INSTRUMENTATION
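// Maps a command type to the node-type string reported to XPTI graph
// consumers; commandToName below provides the matching human-readable label.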
160 static std::string commandToNodeType(Command::CommandType Type) {
162 case Command::CommandType::RUN_CG:
163 return "command_group_node";
164 case Command::CommandType::COPY_MEMORY:
165 return "memory_transfer_node";
166 case Command::CommandType::ALLOCA:
167 return "memory_allocation_node";
168 case Command::CommandType::ALLOCA_SUB_BUF:
169 return "sub_buffer_creation_node";
170 case Command::CommandType::RELEASE:
171 return "memory_deallocation_node";
172 case Command::CommandType::MAP_MEM_OBJ:
173 return "memory_transfer_node";
174 case Command::CommandType::UNMAP_MEM_OBJ:
175 return "memory_transfer_node";
176 case Command::CommandType::UPDATE_REQUIREMENT:
177 return "host_acc_create_buffer_lock_node";
178 case Command::CommandType::EMPTY_TASK:
179 return "host_acc_destroy_buffer_release_node";
180 case Command::CommandType::FUSION:
181 return "kernel_fusion_placeholder_node";
183 return "unknown_node";
190 static std::string commandToName(Command::CommandType Type) {
192 case Command::CommandType::RUN_CG:
193 return "Command Group Action";
194 case Command::CommandType::COPY_MEMORY:
195 return "Memory Transfer (Copy)";
196 case Command::CommandType::ALLOCA:
197 return "Memory Allocation";
198 case Command::CommandType::ALLOCA_SUB_BUF:
199 return "Sub Buffer Creation";
200 case Command::CommandType::RELEASE:
201 return "Memory Deallocation";
202 case Command::CommandType::MAP_MEM_OBJ:
203 return "Memory Transfer (Map)";
204 case Command::CommandType::UNMAP_MEM_OBJ:
205 return "Memory Transfer (Unmap)";
206 case Command::CommandType::UPDATE_REQUIREMENT:
207 return "Host Accessor Creation/Buffer Lock";
208 case Command::CommandType::EMPTY_TASK:
209 return "Host Accessor Destruction/Buffer Lock Release";
210 case Command::CommandType::FUSION:
211 return "Kernel Fusion Placeholder";
213 return "Unknown Action";
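// Collects the native PI event handles for a list of event_impls, skipping
// events that have no handle yet as well as events from this command's own
// in-order worker queue, which the queue already orders implicitly.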
218 std::vector<RT::PiEvent>
219 Command::getPiEvents(const std::vector<EventImplPtr> &EventImpls) const {
220 std::vector<RT::PiEvent> RetPiEvents;
221 for (auto &EventImpl : EventImpls) {
222 if (EventImpl->getHandleRef() == nullptr)
232 if (EventImpl->getWorkerQueue() == WorkerQueue &&
233 WorkerQueue->isInOrder() && !isHostTask())
236 RetPiEvents.push_back(EventImpl->getHandleRef());
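// Blocking variant of getPiEvents: dependencies that do not yet have a native
// handle are first enqueued through the scheduler so a PI event exists to wait
// on; host and uninitialized events are skipped.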
246 std::vector<RT::PiEvent> Command::getPiEventsBlocking(
247 const std::vector<EventImplPtr> &EventImpls) const {
248 std::vector<RT::PiEvent> RetPiEvents;
249 for (auto &EventImpl : EventImpls) {
254 if (!EventImpl->isContextInitialized() || EventImpl->is_host())
258 if (EventImpl->getHandleRef() == nullptr) {
259 if (!EventImpl->getCommand() ||
262 std::vector<Command *> AuxCmds;
263 Scheduler::getInstance().enqueueCommandForCG(EventImpl, AuxCmds,
273 if (EventImpl->getWorkerQueue() == WorkerQueue &&
274 WorkerQueue->isInOrder() && !isHostTask())
277 RetPiEvents.push_back(EventImpl->getHandleRef());
283 bool Command::isHostTask() const {
284 return (MType == CommandType::RUN_CG) &&
285 ((static_cast<const ExecCGCommand *>(this))->getCG().getType() ==
286 CG::CGTYPE::CodeplayHostTask);
291 for (auto &EventImpl : EventImpls) {
292 EventImpl->flushIfNeeded(Queue);
298 std::vector<interop_handle::ReqToMem> MReqToMem;
301 std::map<const detail::plugin *, std::vector<EventImplPtr>>
302 RequiredEventsPerPlugin;
306 RequiredEventsPerPlugin[&Plugin].push_back(Event);
314 for (auto &PluginWithEvents : RequiredEventsPerPlugin) {
315 std::vector<RT::PiEvent> RawEvents =
320 } catch (const sycl::exception &E) {
322 HostTask.MQueue->reportAsyncException(std::current_exception());
326 HostTask.MQueue->reportAsyncException(std::current_exception());
327 return PI_ERROR_UNKNOWN;
334 Event->waitInternal();
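// DispatchHostTask runs a host task on a worker thread: it waits for the
// task's dependencies, invokes the user callable (through an interop_handle
// for interop tasks), reports any exception to the owning queue, and finally
// notifies the scheduler that the host task has completed.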
342 std::vector<interop_handle::ReqToMem> ReqToMem)
343 : MThisCmd{ThisCmd}, MReqToMem(std::move(ReqToMem)) {}
346 assert(MThisCmd->getCG().getType() == CG::CGTYPE::CodeplayHostTask);
350 #ifdef XPTI_ENABLE_INSTRUMENTATION
355 std::unique_ptr<detail::tls_code_loc_t> AsyncCodeLocationPtr;
356 if (xptiTraceEnabled() && !CurrentCodeLocationValid()) {
357 AsyncCodeLocationPtr.reset(
363 if (WaitResult != PI_SUCCESS) {
364 std::exception_ptr EPtr = std::make_exception_ptr(sycl::runtime_error(
365 std::string("Couldn't wait for host-task's dependencies"),
367 HostTask.MQueue->reportAsyncException(EPtr);
370 Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd);
376 if (HostTask.MHostTask->isInteropTask()) {
378 HostTask.MQueue->getDeviceImplPtr(),
379 HostTask.MQueue->getContextImplPtr()};
385 auto CurrentException = std::current_exception();
386 #ifdef XPTI_ENABLE_INSTRUMENTATION
390 if (xptiTraceEnabled()) {
392 rethrow_exception(CurrentException);
393 } catch (const sycl::exception &) {
395 } catch (const std::exception &StdException) {
396 GlobalHandler::instance().TraceEventXPTI(StdException.what());
398 GlobalHandler::instance().TraceEventXPTI(
399 "Host task lambda thrown non standard exception");
403 HostTask.MQueue->reportAsyncException(CurrentException);
408 #ifdef XPTI_ENABLE_INSTRUMENTATION
411 AsyncCodeLocationPtr.reset();
417 Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd);
419 auto CurrentException = std::current_exception();
420 HostTask.MQueue->reportAsyncException(CurrentException);
425 void Command::waitForPreparedHostEvents() const {
426 for (const EventImplPtr &HostEvent : MPreparedHostDepsEvents)
427 HostEvent->waitInternal();
431 std::vector<EventImplPtr> &EventImpls,
434 if (!EventImpls.empty()) {
435 if (Queue->is_host()) {
449 std::map<context_impl *, std::vector<EventImplPtr>>
450 RequiredEventsPerContext;
454 assert(Context.get() &&
455 "Only non-host events are expected to be waited for here");
456 RequiredEventsPerContext[Context.get()].push_back(Event);
459 for (auto &CtxWithEvents : RequiredEventsPerContext) {
460 std::vector<RT::PiEvent> RawEvents = getPiEvents(CtxWithEvents.second);
462 RawEvents.size(), RawEvents.data());
467 assert(Event->getContextImpl().get() &&
468 "Only non-host events are expected to be waited for here");
471 std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);
475 Queue->getHandleRef(), RawEvents.size(), &RawEvents[0], &Event);
484 : MQueue(std::move(Queue)),
486 MPreparedDepsEvents(MEvent->getPreparedDepsEvents()),
487 MPreparedHostDepsEvents(MEvent->getPreparedHostDepsEvents()),
494 MEvent->setStateIncomplete();
497 #ifdef XPTI_ENABLE_INSTRUMENTATION
498 if (!xptiTraceEnabled())
506 #ifdef XPTI_ENABLE_INSTRUMENTATION
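// emitEdgeEventForCommandDependence creates an XPTI edge event between this
// command's trace node and the dependent command's node, attaching the access
// mode and memory object (or the PI event) as metadata.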
522 Command *Cmd, void *ObjAddr, bool IsCommand,
523 std::optional<access::mode> AccMode) {
524 #ifdef XPTI_ENABLE_INSTRUMENTATION
532 xpti::utils::StringHelper SH;
533 std::string AddressStr = SH.addressAsString<void *>(ObjAddr);
535 std::string TypeString = SH.nameWithAddressString(Prefix, AddressStr);
538 xpti::payload_t Payload(TypeString.c_str(), MAddress);
539 uint64_t EdgeInstanceNo;
541 xptiMakeEvent(TypeString.c_str(), &Payload, xpti::trace_graph_event,
542 xpti_at::active, &EdgeInstanceNo);
544 xpti_td *SrcEvent = static_cast<xpti_td *>(Cmd->MTraceEvent);
545 xpti_td *TgtEvent = static_cast<xpti_td *>(MTraceEvent);
546 EdgeEvent->source_id = SrcEvent->unique_id;
547 EdgeEvent->target_id = TgtEvent->unique_id;
549 xpti::addMetadata(EdgeEvent, "access_mode",
550 static_cast<int>(AccMode.value()));
551 xpti::addMetadata(EdgeEvent, "memory_object",
552 reinterpret_cast<size_t>(ObjAddr));
554 xpti::addMetadata(EdgeEvent, "event", reinterpret_cast<size_t>(ObjAddr));
556 xptiNotifySubscribers(MStreamID, xpti::trace_edge_create,
557 detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
572 #ifdef XPTI_ENABLE_INSTRUMENTATION
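// When the dependency is a bare PI event with no producing command, a
// "virtual_node" is created so the resulting edge still has a source node in
// the trace graph.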
585 xpti::utils::StringHelper SH;
586 std::string AddressStr = SH.addressAsString<RT::PiEvent>(PiEventAddr);
591 std::string NodeName = SH.nameWithAddressString("virtual_node", AddressStr);
593 xpti::payload_t VNPayload(NodeName.c_str(), MAddress);
594 uint64_t VNodeInstanceNo;
596 xptiMakeEvent(NodeName.c_str(), &VNPayload, xpti::trace_graph_event,
597 xpti_at::active, &VNodeInstanceNo);
599 xpti::addMetadata(NodeEvent, "kernel_name", NodeName);
600 xptiNotifySubscribers(MStreamID, xpti::trace_node_create,
601 detail::GSYCLGraphEvent, NodeEvent, VNodeInstanceNo,
604 std::string EdgeName = SH.nameWithAddressString("Event", AddressStr);
605 xpti::payload_t EdgePayload(EdgeName.c_str(), MAddress);
606 uint64_t EdgeInstanceNo;
608 xptiMakeEvent(EdgeName.c_str(), &EdgePayload, xpti::trace_graph_event,
609 xpti_at::active, &EdgeInstanceNo);
610 if (EdgeEvent && NodeEvent) {
613 xpti_td *TgtEvent = static_cast<xpti_td *>(MTraceEvent);
614 EdgeEvent->source_id = NodeEvent->unique_id;
615 EdgeEvent->target_id = TgtEvent->unique_id;
616 xpti::addMetadata(EdgeEvent, "event",
617 reinterpret_cast<size_t>(PiEventAddr));
618 xptiNotifySubscribers(MStreamID, xpti::trace_edge_create,
619 detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
628 uint64_t CommandInstanceNo = 0;
629 #ifdef XPTI_ENABLE_INSTRUMENTATION
630 if (!xptiTraceEnabled())
631 return CommandInstanceNo;
637 xpti::utils::StringHelper SH;
639 std::string CommandString =
642 xpti::payload_t p(CommandString.c_str(), MAddress);
643 xpti_td *CmdTraceEvent =
644 xptiMakeEvent(CommandString.c_str(), &p, xpti::trace_graph_event,
645 xpti_at::active, &CommandInstanceNo);
656 return CommandInstanceNo;
660 #ifdef XPTI_ENABLE_INSTRUMENTATION
664 xptiNotifySubscribers(MStreamID, xpti::trace_node_create,
665 detail::GSYCLGraphEvent,
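// processDepEvent decides how a dependency is handled: if the dependency
// cannot produce a PI event, or it lives in a different context than the
// worker queue, a connection command is created; otherwise the event is simply
// added to the list waited on at enqueue time.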
672 std::vector<Command *> &ToCleanUp) {
674 const ContextImplPtr &WorkerContext = WorkerQueue->getContextImplPtr();
682 bool PiEventExpected = (!DepEvent->is_host() && DepEvent->isInitialized());
683 if (auto *DepCmd = static_cast<Command *>(DepEvent->getCommand()))
684 PiEventExpected &= DepCmd->producesPiEvent();
686 if (!PiEventExpected) {
693 Command *ConnectionCmd = nullptr;
697 if (DepEventContext != WorkerContext && !WorkerContext->is_host()) {
703 return ConnectionCmd;
707 return MQueue->getContextImplPtr();
711 assert(MWorkerQueue && "MWorkerQueue must not be nullptr");
725 Command *ConnectionCmd = nullptr;
734 if (!ConnectionCmd) {
735 MDeps.push_back(NewDep);
740 #ifdef XPTI_ENABLE_INSTRUMENTATION
746 return ConnectionCmd;
750 std::vector<Command *> &ToCleanUp) {
751 #ifdef XPTI_ENABLE_INSTRUMENTATION
765 #ifdef XPTI_ENABLE_INSTRUMENTATION
766 if (!(xptiTraceEnabled() && MTraceEvent && PiEventAddr))
770 xptiNotifySubscribers(MStreamID, xpti::trace_signal, detail::GSYCLGraphEvent,
772 (void *)PiEventAddr);
777 #ifdef XPTI_ENABLE_INSTRUMENTATION
781 xptiNotifySubscribers(MStreamID, Type, detail::GSYCLGraphEvent,
783 static_cast<const void *>(Txt));
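// Command::enqueue is the common driver for every command type: it captures
// the submission code location for tracing, runs the type-specific enqueueImp,
// emits the corresponding instrumentation signals, and pushes the command onto
// ToCleanUp when post-enqueue cleanup is allowed.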
788 std::vector<Command *> &ToCleanUp) {
789 #ifdef XPTI_ENABLE_INSTRUMENTATION
793 std::unique_ptr<detail::tls_code_loc_t> AsyncCodeLocationPtr;
794 if (xptiTraceEnabled() && !CurrentCodeLocationValid()) {
795 AsyncCodeLocationPtr.reset(
811 #ifdef XPTI_ENABLE_INSTRUMENTATION
815 std::string Info = "enqueue.barrier[";
823 #ifdef XPTI_ENABLE_INSTRUMENTATION
834 #ifdef XPTI_ENABLE_INSTRUMENTATION
850 if (PI_SUCCESS != Res)
855 (MEvent->is_host() || MEvent->getHandleRef() == nullptr))
866 ToCleanUp.push_back(this);
872 #ifdef XPTI_ENABLE_INSTRUMENTATION
879 #ifdef XPTI_ENABLE_INSTRUMENTATION
880 assert(MType == CommandType::RELEASE && "Expected release command");
886 xpti_td *TgtTraceEvent = static_cast<xpti_td *>(MTraceEvent);
891 for (auto &Item : DepList) {
892 if (Item->MTraceEvent && Item->MAddress == MAddress) {
893 xpti::utils::StringHelper SH;
894 std::string AddressStr = SH.addressAsString<void *>(MAddress);
895 std::string TypeString =
896 "Edge:" + SH.nameWithAddressString(commandToName(MType), AddressStr);
900 xpti::payload_t p(TypeString.c_str(), MAddress);
901 uint64_t EdgeInstanceNo;
903 xptiMakeEvent(TypeString.c_str(), &p, xpti::trace_graph_event,
904 xpti_at::active, &EdgeInstanceNo);
906 xpti_td *SrcTraceEvent = static_cast<xpti_td *>(Item->MTraceEvent);
907 EdgeEvent->target_id = TgtTraceEvent->unique_id;
908 EdgeEvent->source_id = SrcTraceEvent->unique_id;
909 xpti::addMetadata(EdgeEvent, "memory_object",
910 reinterpret_cast<size_t>(MAddress));
911 xptiNotifySubscribers(MStreamID, xpti::trace_edge_create,
912 detail::GSYCLGraphEvent, EdgeEvent,
913 EdgeInstanceNo, nullptr);
923 return "A Buffer is locked by the host accessor";
925 return "Blocked by host task";
928 return "Unknown block reason";
932 #ifdef XPTI_ENABLE_INSTRUMENTATION
933 if (!xptiTraceEnabled())
937 auto TData = Tls.query();
938 if (TData.fileName())
940 if (TData.functionName())
945 (int)TData.lineNumber(), (int)TData.columnNumber()};
953 : Command(Type, Queue), MLinkedAllocaCmd(LinkedAllocaCmd),
954 MIsLeaderAlloca(nullptr == LinkedAllocaCmd), MIsConst(IsConst),
955 MRequirement(std::move(Req)), MReleaseCmd(Queue, this) {
961 #ifdef XPTI_ENABLE_INSTRUMENTATION
962 if (!xptiTraceEnabled())
971 xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
972 xpti::addMetadata(TE, "sycl_device_type",
974 xpti::addMetadata(TE, "sycl_device_name",
976 xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
988 bool InitFromUserData,
991 LinkedAllocaCmd, IsConst),
992 MInitFromUserData(InitFromUserData) {
997 std::vector<Command *> ToCleanUp;
1000 assert(ConnectionCmd == nullptr);
1001 assert(ToCleanUp.empty());
1002 (void)ConnectionCmd;
1006 #ifdef XPTI_ENABLE_INSTRUMENTATION
1007 if (!xptiTraceEnabled())
1017 pi_int32 AllocaCommand::enqueueImp() {
1023 void *HostPtr = nullptr;
1038 std::move(EventImpls), Event);
1044 Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\"";
1046 Stream << "ID = " << this << "\\n";
1050 Stream << "\"];" << std::endl;
1052 for (const auto &Dep : MDeps) {
1053 if (Dep.MDepCommand == nullptr)
1055 Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1056 << " [ label = \"Access mode: "
1058 << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1065 std::vector<Command *> &ToEnqueue,
1066 std::vector<Command *> &ToCleanUp)
1070 MParentAlloca(ParentAlloca) {
1078 ToEnqueue.push_back(ConnectionCmd);
1082 #ifdef XPTI_ENABLE_INSTRUMENTATION
1083 if (!xptiTraceEnabled())
1089 xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
1091 xpti::addMetadata(TE, "access_range_start",
1093 xpti::addMetadata(TE, "access_range_end",
1105 return static_cast<void *>(
1112 pi_int32 AllocaSubBufCommand::enqueueImp() {
1128 Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\"";
1130 Stream << "ID = " << this << "\\n";
1136 Stream << "\"];" << std::endl;
1138 for (const auto &Dep : MDeps) {
1139 if (Dep.MDepCommand == nullptr)
1141 Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1142 << " [ label = \"Access mode: "
1144 << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1155 #ifdef XPTI_ENABLE_INSTRUMENTATION
1156 if (!xptiTraceEnabled())
1164 xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
1165 xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
1166 xpti::addMetadata(TE, "sycl_device_type",
1168 xpti::addMetadata(TE, "sycl_device_name",
1170 xpti::addMetadata(TE, "allocation_type",
1171 commandToName(MAllocaCmd->getType()));
1177 pi_int32 ReleaseCommand::enqueueImp() {
1180 std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);
1181 bool SkipRelease = false;
1187 const bool CurAllocaIsHost = MAllocaCmd->getQueue()->is_host();
1188 bool NeedUnmap = false;
1202 NeedUnmap |= CurAllocaIsHost == MAllocaCmd->MIsActive;
1211 UnmapEventImpl->setContextImpl(Queue->getContextImplPtr());
1212 UnmapEventImpl->setStateIncomplete();
1213 RT::PiEvent &UnmapEvent = UnmapEventImpl->getHandleRef();
1215 void *Src = CurAllocaIsHost
1219 void *Dst = !CurAllocaIsHost
1224 RawEvents, UnmapEvent);
1228 EventImpls.push_back(UnmapEventImpl);
1242 Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FF827A\", label=\"";
1244 Stream << "ID = " << this << " ; ";
1246 Stream << " Alloca : " << MAllocaCmd << "\\n";
1247 Stream << " MemObj : " << MAllocaCmd->getSYCLMemObj() << "\\n";
1248 Stream << "\"];" << std::endl;
1250 for (const auto &Dep : MDeps) {
1251 Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1252 << " [ label = \"Access mode: "
1254 << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1269 MSrcAllocaCmd(SrcAllocaCmd), MSrcReq(std::move(Req)), MDstPtr(DstPtr),
1275 #ifdef XPTI_ENABLE_INSTRUMENTATION
1276 if (!xptiTraceEnabled())
1284 xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
1285 xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
1286 xpti::addMetadata(TE, "sycl_device_type",
1288 xpti::addMetadata(TE, "sycl_device_name",
1290 xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
1296 pi_int32 MapMemObject::enqueueImp() {
1299 std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);
1312 Stream << "\"" << this << "\" [style=filled, fillcolor=\"#77AFFF\", label=\"";
1314 Stream << "ID = " << this << " ; ";
1317 Stream << "\"];" << std::endl;
1319 for (const auto &Dep : MDeps) {
1320 Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1321 << " [ label = \"Access mode: "
1323 << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1331 MDstAllocaCmd(DstAllocaCmd), MDstReq(std::move(Req)), MSrcPtr(SrcPtr) {
1336 #ifdef XPTI_ENABLE_INSTRUMENTATION
1337 if (!xptiTraceEnabled())
1345 xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
1346 xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
1347 xpti::addMetadata(TE, "sycl_device_type",
1349 xpti::addMetadata(TE, "sycl_device_name",
1351 xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
1374 MEvent->getHandleRef() != nullptr;
1377 pi_int32 UnMapMemObject::enqueueImp() {
1380 std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);
1386 std::move(RawEvents), Event);
1392 Stream << "\"" << this << "\" [style=filled, fillcolor=\"#EBC40F\", label=\"";
1394 Stream << "ID = " << this << " ; ";
1397 Stream << "\"];" << std::endl;
1399 for (const auto &Dep : MDeps) {
1400 Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1401 << " [ label = \"Access mode: "
1403 << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1414 MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)),
1415 MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)),
1416 MDstAllocaCmd(DstAllocaCmd) {
1417 if (!MSrcQueue->is_host()) {
1418 MEvent->setContextImpl(MSrcQueue->getContextImplPtr());
1428 #ifdef XPTI_ENABLE_INSTRUMENTATION
1429 if (!xptiTraceEnabled())
1437 xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1438 xpti::addMetadata(CmdTraceEvent, "sycl_device",
1439 deviceToID(MQueue->get_device()));
1440 xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
1442 xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
1444 xpti::addMetadata(CmdTraceEvent, "memory_object",
1445 reinterpret_cast<size_t>(MAddress));
1446 xpti::addMetadata(CmdTraceEvent, "copy_from",
1447 reinterpret_cast<size_t>(
1450 CmdTraceEvent, "copy_to",
1477 return MQueue->is_host() ||
1479 MEvent->getHandleRef() != nullptr;
1482 pi_int32 MemCpyCommand::enqueueImp() {
1502 Stream << "\"" << this << "\" [style=filled, fillcolor=\"#C7EB15\" label=\"";
1504 Stream << "ID = " << this << " ; ";
1506 Stream << "From: " << MSrcAllocaCmd << " is host: " << MSrcQueue->is_host()
1508 Stream << "To: " << MDstAllocaCmd << " is host: " << MQueue->is_host()
1511 Stream << "\"];" << std::endl;
1513 for (const auto &Dep : MDeps) {
1514 Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1515 << " [ label = \"Access mode: "
1517 << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1524 if (Dep.MDepRequirement == Req)
1525 return Dep.MAllocaCmd;
1527 throw runtime_error("Alloca for command not found",
1528 PI_ERROR_INVALID_OPERATION);
1531 std::vector<std::shared_ptr<const void>>
1534 return ((CGExecKernel *)MCommandGroup.get())->getAuxiliaryResources();
1543 pi_int32 UpdateHostRequirementCommand::enqueueImp() {
1549 assert(MSrcAllocaCmd && "Expected valid alloca command");
1550 assert(MSrcAllocaCmd->getMemAllocation() && "Expected valid source pointer");
1551 assert(MDstPtr && "Expected valid target pointer");
1558 Stream << "\"" << this << "\" [style=filled, fillcolor=\"#f1337f\", label=\"";
1560 Stream << "ID = " << this << "\\n";
1562 bool IsReqOnBuffer =
1564 Stream << "TYPE: " << (IsReqOnBuffer ? "Buffer" : "Image") << "\\n";
1566 Stream << "Is sub buffer: " << std::boolalpha << MDstReq.MIsSubBuffer
1569 Stream << "\"];" << std::endl;
1571 for (const auto &Dep : MDeps) {
1572 Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1573 << " [ label = \"Access mode: "
1575 << "MemObj: " << Dep.MAllocaCmd->getSYCLMemObj() << " \" ]"
1586 MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)),
1587 MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)), MDstPtr(DstPtr) {
1588 if (!MSrcQueue->is_host()) {
1589 MEvent->setContextImpl(MSrcQueue->getContextImplPtr());
1599 #ifdef XPTI_ENABLE_INSTRUMENTATION
1600 if (!xptiTraceEnabled())
1608 xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1609 xpti::addMetadata(CmdTraceEvent, "sycl_device",
1610 deviceToID(MQueue->get_device()));
1611 xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
1613 xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
1615 xpti::addMetadata(CmdTraceEvent, "memory_object",
1616 reinterpret_cast<size_t>(MAddress));
1617 xpti::addMetadata(CmdTraceEvent, "copy_from",
1618 reinterpret_cast<size_t>(
1621 CmdTraceEvent, "copy_to",
1632 pi_int32 MemCpyCommandHost::enqueueImp() {
1636 std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);
1655 MDstReq.MElemSize, std::move(RawEvents), Event);
1665 pi_int32 EmptyCommand::enqueueImp() {
1675 MRequirements.emplace_back(ReqRef);
1676 const Requirement *const StoredReq = &MRequirements.back();
1680 std::vector<Command *> ToCleanUp;
1682 assert(Cmd == nullptr && "Connection command should be null for EmptyCommand");
1683 assert(ToCleanUp.empty() && "addDep should add a command for cleanup only if "
1684 "there's a connection command");
1689 #ifdef XPTI_ENABLE_INSTRUMENTATION
1690 if (!xptiTraceEnabled())
1694 if (MRequirements.empty())
1703 xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1704 xpti::addMetadata(CmdTraceEvent, "sycl_device",
1705 deviceToID(MQueue->get_device()));
1706 xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
1708 xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
1710 xpti::addMetadata(CmdTraceEvent, "memory_object",
1711 reinterpret_cast<size_t>(MAddress));
1718 Stream << "\"" << this << "\" [style=filled, fillcolor=\"#8d8f29\", label=\"";
1720 Stream << "ID = " << this << "\\n";
1721 Stream << "EMPTY NODE"
1724 Stream << "\"];" << std::endl;
1726 for (const auto &Dep : MDeps) {
1727 Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1728 << " [ label = \"Access mode: "
1730 << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1738 Stream << "\"" << this << "\" [style=filled, fillcolor=\"#B6A2EB\", label=\"";
1740 Stream << "ID = " << this << "\\n";
1743 Stream << "\"];" << std::endl;
1745 for (const auto &Dep : MDeps) {
1746 Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1747 << " [ label = \"Access mode: "
1749 << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1758 MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(Req)), MDstPtr(DstPtr) {
1764 #ifdef XPTI_ENABLE_INSTRUMENTATION
1765 if (!xptiTraceEnabled())
1773 xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1774 xpti::addMetadata(CmdTraceEvent, "sycl_device",
1775 deviceToID(MQueue->get_device()));
1776 xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
1778 xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
1780 xpti::addMetadata(CmdTraceEvent, "memory_object",
1781 reinterpret_cast<size_t>(MAddress));
1793 return "update_host";
1799 return "copy acc to acc";
1802 return "copy acc to ptr";
1805 return "copy ptr to acc";
1814 return "prefetch usm";
1820 return "copy 2d usm";
1823 return "fill 2d usm";
1826 return "memset 2d usm";
1829 return "copy to device_global";
1832 return "copy from device_global";
1843 MCommandGroup(std::move(CommandGroup)) {
1845 MEvent->setSubmittedQueue(
1853 #ifdef XPTI_ENABLE_INSTRUMENTATION
1854 if (!xptiTraceEnabled())
1858 bool HasSourceInfo = false;
1859 std::string KernelName;
1860 std::optional<bool> FromSource;
1861 switch (MCommandGroup->getType()) {
1866 if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource()) {
1868 pi_kernel KernelHandle = KernelCG->MSyclKernel->getHandleRef();
1870 KernelName = MCommandGroup->MFunctionName;
1887 xpti::payload_t Payload;
1888 if (!MCommandGroup->MFileName.empty()) {
1891 xpti::payload_t(KernelName.c_str(), MCommandGroup->MFileName.c_str(),
1892 MCommandGroup->MLine, MCommandGroup->MColumn, MAddress);
1893 HasSourceInfo = true;
1896 Payload = xpti::payload_t(KernelName.c_str(), MAddress);
1900 Payload = xpti::payload_t(KernelName.c_str());
1903 uint64_t CGKernelInstanceNo;
1905 xpti_td *CmdTraceEvent =
1906 xptiMakeEvent("ExecCG", &Payload, xpti::trace_graph_event,
1907 xpti::trace_activity_type_t::active, &CGKernelInstanceNo);
1909 if (CmdTraceEvent) {
1915 if (CGKernelInstanceNo > 1)
1918 xpti::addMetadata(CmdTraceEvent, "sycl_device",
1919 deviceToID(MQueue->get_device()));
1920 xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
1922 xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
1924 if (!KernelName.empty()) {
1925 xpti::addMetadata(CmdTraceEvent, "kernel_name", KernelName);
1927 if (FromSource.has_value()) {
1928 xpti::addMetadata(CmdTraceEvent, "from_source", FromSource.value());
1930 if (HasSourceInfo) {
1931 xpti::addMetadata(CmdTraceEvent, "sym_function_name", KernelName);
1932 xpti::addMetadata(CmdTraceEvent, "sym_source_file_name",
1933 MCommandGroup->MFileName);
1934 xpti::addMetadata(CmdTraceEvent, "sym_line_no", MCommandGroup->MLine);
1935 xpti::addMetadata(CmdTraceEvent, "sym_column_no", MCommandGroup->MColumn);
1941 auto &NDRDesc = KernelCG->MNDRDesc;
1942 std::vector<ArgDesc> Args;
1949 std::mutex *KernelMutex = nullptr;
1951 std::shared_ptr<kernel_impl> SyclKernelImpl;
1952 std::shared_ptr<device_image_impl> DeviceImageImpl;
1963 KernelCG->MKernelName);
1968 ->get_program_ref();
1969 } else if (nullptr != KernelCG->MSyclKernel) {
1970 auto SyclProg = KernelCG->MSyclKernel->getProgramImpl();
1971 Program = SyclProg->getHandleRef();
1973 std::tie(Kernel, KernelMutex, Program) =
1975 KernelCG->MOSModuleHandle, MQueue->getContextImplPtr(),
1976 MQueue->getDeviceImplPtr(), KernelCG->MKernelName, nullptr);
1980 if (nullptr == KernelCG->MSyclKernel ||
1981 !KernelCG->MSyclKernel->isCreatedFromSource()) {
1984 KernelCG->MOSModuleHandle, Program, KernelCG->MKernelName);
1989 xpti::offload_kernel_enqueue_data_t KernelData{
1990 {NDRDesc.GlobalSize[0], NDRDesc.GlobalSize[1], NDRDesc.GlobalSize[2]},
1991 {NDRDesc.LocalSize[0], NDRDesc.LocalSize[1], NDRDesc.LocalSize[2]},
1992 {NDRDesc.GlobalOffset[0], NDRDesc.GlobalOffset[1],
1993 NDRDesc.GlobalOffset[2]},
1995 xpti::addMetadata(CmdTraceEvent, "enqueue_kernel_data", KernelData);
1996 for (size_t i = 0; i < Args.size(); i++) {
1997 std::string Prefix("arg");
1998 xpti::offload_kernel_arg_data_t arg{(int)Args[i].MType, Args[i].MPtr,
1999 Args[i].MSize, Args[i].MIndex};
2000 xpti::addMetadata(CmdTraceEvent, Prefix + std::to_string(i), arg);
2004 xptiNotifySubscribers(MStreamID, xpti::trace_node_create,
2005 detail::GSYCLGraphEvent, CmdTraceEvent,
2013 Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\"";
2015 Stream << "ID = " << this << "\\n";
2018 switch (MCommandGroup->getType()) {
2022 Stream << "Kernel name: ";
2023 if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource())
2024 Stream << "created from source";
2031 Stream << "CG type: " << cgTypeToString(MCommandGroup->getType()) << "\\n";
2035 Stream << "\"];" << std::endl;
2037 for (const auto &Dep : MDeps) {
2038 Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
2039 << " [ label = \"Access mode: "
2041 << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
2065 sycl::info::kernel_device_specific::compile_work_group_size>(
2068 if (WGSize[0] == 0) {
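// The argument-setting helper below translates each ArgDesc into the matching
// PI kernel-argument call: accessors are resolved through getMemAllocationFunc,
// samplers through getOrCreateSampler, and SYCL 2020 specialization constants
// through the device image's spec-constant buffer.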
2093 const std::shared_ptr<device_image_impl> &DeviceImageImpl,
2097 const std::function<void *(Requirement *Req)> &getMemAllocationFunc) {
2100 auto setFunc = [&Plugin, Kernel, &DeviceImageImpl, &getMemAllocationFunc,
2102 switch (Arg.MType) {
2109 assert(getMemAllocationFunc != nullptr &&
2110 "We should have caught this earlier.");
2128 sampler *SamplerPtr = (sampler *)Arg.MPtr;
2130 ->getOrCreateSampler(Queue->get_context());
2137 Arg.MSize, Arg.MPtr);
2141 if (Queue->is_host()) {
2142 throw sycl::feature_not_supported(
2143 "SYCL2020 specialization constants are not yet supported on host "
2145 PI_ERROR_INVALID_OPERATION);
2147 assert(DeviceImageImpl != nullptr);
2148 RT::PiMem SpecConstsBuffer = DeviceImageImpl->get_spec_const_buffer_ref();
2151 SpecConstsBuffer ? &SpecConstsBuffer : nullptr;
2153 SpecConstsBufferArg);
2157 throw runtime_error("Invalid kernel param kind", PI_ERROR_INVALID_VALUE);
2167 const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0);
2171 size_t RequiredWGSize[3] = {0, 0, 0};
2172 size_t *LocalSize = nullptr;
2178 Kernel, Queue->getDeviceImplPtr()->getHandleRef(),
2180 RequiredWGSize, nullptr);
2182 const bool EnforcedLocalSize =
2183 (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
2184 RequiredWGSize[2] != 0);
2185 if (EnforcedLocalSize)
2186 LocalSize = RequiredWGSize;
2191 &NDRDesc.GlobalSize[0], LocalSize, RawEvents.size(),
2192 RawEvents.empty() ? nullptr : &RawEvents[0], OutEvent);
2200 void **CastedBlob = (void **)Blob;
2202 std::vector<Requirement *> *Reqs =
2203 static_cast<std::vector<Requirement *> *>(CastedBlob[0]);
2205 std::unique_ptr<HostKernelBase> *HostKernel =
2206 static_cast<std::unique_ptr<HostKernelBase> *>(CastedBlob[1]);
2211 void **NextArg = CastedBlob + 3;
2213 Req->MData = *(NextArg++);
2215 (*HostKernel)->call(*NDRDesc, nullptr);
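// Kernel enqueue path for device queues: the PI kernel is resolved either from
// a kernel_bundle, from an interop kernel object, or from the program
// manager's cache; device_global initialization events are appended to the
// wait list before the kernel is launched with the effective local size.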
2226 const std::shared_ptr<detail::kernel_impl> &MSyclKernel,
2228 std::vector<RT::PiEvent> &RawEvents, RT::PiEvent *OutEvent,
2229 const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
2233 auto ContextImpl = Queue->getContextImplPtr();
2234 auto DeviceImpl = Queue->getDeviceImplPtr();
2236 std::mutex *KernelMutex = nullptr;
2239 std::shared_ptr<kernel_impl> SyclKernelImpl;
2240 std::shared_ptr<device_image_impl> DeviceImageImpl;
2255 Kernel = SyclKernelImpl->getHandleRef();
2256 DeviceImageImpl = SyclKernelImpl->getDeviceImage();
2258 Program = DeviceImageImpl->get_program_ref();
2264 } else if (nullptr != MSyclKernel) {
2265 assert(MSyclKernel->get_info<info::kernel::context>() ==
2266 Queue->get_context());
2267 Kernel = MSyclKernel->getHandleRef();
2268 auto SyclProg = MSyclKernel->getProgramImpl();
2269 Program = SyclProg->getHandleRef();
2270 if (SyclProg->is_cacheable()) {
2272 std::tie(FoundKernel, KernelMutex, std::ignore) =
2276 assert(FoundKernel == Kernel);
2284 KernelMutex = &MSyclKernel->getNoncacheableEnqueueMutex();
2287 std::tie(Kernel, KernelMutex, Program) =
2293 std::vector<RT::PiEvent> &EventsWaitList = RawEvents;
2296 std::vector<RT::PiEvent> DeviceGlobalInitEvents =
2297 ContextImpl->initializeDeviceGlobals(Program, Queue);
2298 std::vector<RT::PiEvent> EventsWithDeviceGlobalInits;
2299 if (!DeviceGlobalInitEvents.empty()) {
2300 EventsWithDeviceGlobalInits.reserve(RawEvents.size() +
2301 DeviceGlobalInitEvents.size());
2302 EventsWithDeviceGlobalInits.insert(EventsWithDeviceGlobalInits.end(),
2303 RawEvents.begin(), RawEvents.end());
2304 EventsWithDeviceGlobalInits.insert(EventsWithDeviceGlobalInits.end(),
2305 DeviceGlobalInitEvents.begin(),
2306 DeviceGlobalInitEvents.end());
2307 EventsWaitList = EventsWithDeviceGlobalInits;
2312 if (nullptr == MSyclKernel || !MSyclKernel->isCreatedFromSource()) {
2318 assert(KernelMutex);
2319 std::lock_guard<std::mutex> Lock(*KernelMutex);
2332 NDRDesc, EventsWaitList, OutEvent,
2333 EliminatedArgMask, getMemAllocationFunc);
2335 if (PI_SUCCESS != Error) {
2338 const device_impl &DeviceImpl = *(Queue->getDeviceImplPtr());
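// ExecCGCommand::enqueueImp dispatches on the command group type: buffer
// copies and fills go through MemoryManager, kernels go through the kernel
// enqueue path above, host tasks are handed to the queue's thread pool, and
// barriers, USM operations and device_global copies map to their dedicated
// MemoryManager or PI entry points.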
2346 pi_int32 ExecCGCommand::enqueueImp() {
2347 if (getCG().getType() != CG::CGTYPE::CodeplayHostTask)
2348 waitForPreparedHostEvents();
2349 std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
2350 auto RawEvents = getPiEvents(EventImpls);
2353 RT::PiEvent *Event = (MQueue->has_discard_events_support() &&
2354 MCommandGroup->MRequirements.size() == 0)
2356 : &MEvent->getHandleRef();
2357 switch (MCommandGroup->getType()) {
2359 case CG::CGTYPE::UpdateHost: {
2360 throw runtime_error("Update host should be handled by the Scheduler.",
2361 PI_ERROR_INVALID_OPERATION);
2363 case CG::CGTYPE::CopyAccToPtr: {
2364 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2366 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2368 MemoryManager::copy(
2369 AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(), MQueue,
2370 Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2371 Req->MElemSize, Copy->getDst(),
2372 Scheduler::getInstance().getDefaultHostQueue(), Req->MDims,
2373 Req->MAccessRange, Req->MAccessRange, {0, 0, 0},
2374 Req->MElemSize, std::move(RawEvents), MEvent->getHandleRef());
2378 case CG::CGTYPE::CopyPtrToAcc: {
2379 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2381 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2383 Scheduler::getInstance().getDefaultHostQueue();
2385 MemoryManager::copy(
2386 AllocaCmd->getSYCLMemObj(), Copy->getSrc(),
2387 Scheduler::getInstance().getDefaultHostQueue(), Req->MDims,
2388 Req->MAccessRange, Req->MAccessRange,
2389 {0, 0, 0}, Req->MElemSize, AllocaCmd->getMemAllocation(),
2390 MQueue, Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2391 Req->MElemSize, std::move(RawEvents), MEvent->getHandleRef());
2395 case CG::CGTYPE::CopyAccToAcc: {
2396 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2400 AllocaCommandBase *AllocaCmdSrc = getAllocaForReq(ReqSrc);
2401 AllocaCommandBase *AllocaCmdDst = getAllocaForReq(ReqDst);
2403 MemoryManager::copy(
2404 AllocaCmdSrc->getSYCLMemObj(), AllocaCmdSrc->getMemAllocation(), MQueue,
2405 ReqSrc->MDims, ReqSrc->MMemoryRange, ReqSrc->MAccessRange,
2406 ReqSrc->MOffset, ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(),
2407 MQueue, ReqDst->MDims, ReqDst->MMemoryRange, ReqDst->MAccessRange,
2408 ReqDst->MOffset, ReqDst->MElemSize, std::move(RawEvents),
2409 MEvent->getHandleRef());
2413 case CG::CGTYPE::Fill: {
2414 CGFill *Fill = (CGFill *)MCommandGroup.get();
2416 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2418 MemoryManager::fill(
2419 AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(), MQueue,
2420 Fill->MPattern.size(), Fill->MPattern.data(), Req->MDims,
2421 Req->MMemoryRange, Req->MAccessRange, Req->MOffset, Req->MElemSize,
2422 std::move(RawEvents), MEvent->getHandleRef());
2426 case CG::CGTYPE::RunOnHostIntel: {
2427 CGExecKernel *HostTask = (CGExecKernel *)MCommandGroup.get();
2434 std::vector<void *> ArgsBlob(HostTask->MArgs.size() + 3);
2436 std::vector<Requirement *> *CopyReqs =
2437 new std::vector<Requirement *>(HostTask->MRequirements);
2441 std::unique_ptr<HostKernelBase> *CopyHostKernel =
2442 new std::unique_ptr<HostKernelBase>(std::move(HostTask->MHostKernel));
2444 NDRDescT *CopyNDRDesc = new NDRDescT(HostTask->MNDRDesc);
2446 ArgsBlob[0] = (void *)CopyReqs;
2447 ArgsBlob[1] = (void *)CopyHostKernel;
2448 ArgsBlob[2] = (void *)CopyNDRDesc;
2450 void **NextArg = ArgsBlob.data() + 3;
2452 if (MQueue->is_host()) {
2453 for (ArgDesc &Arg : HostTask->MArgs) {
2454 assert(Arg.MType == kernel_param_kind_t::kind_accessor);
2457 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2459 *NextArg = AllocaCmd->getMemAllocation();
2463 if (!RawEvents.empty()) {
2465 const detail::plugin &Plugin = EventImpls[0]->getPlugin();
2473 std::vector<pi_mem> Buffers;
2477 std::vector<void *> MemLocs;
2479 for (ArgDesc &Arg : HostTask->MArgs) {
2480 assert(Arg.MType == kernel_param_kind_t::kind_accessor);
2483 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2486 Buffers.push_back(MemArg);
2487 MemLocs.push_back(NextArg);
2490 const detail::plugin &Plugin = MQueue->getPlugin();
2493 ArgsBlob.size() * sizeof(ArgsBlob[0]), Buffers.size(), Buffers.data(),
2494 const_cast<const void **>(MemLocs.data()), RawEvents.size(),
2495 RawEvents.empty() ? nullptr : RawEvents.data(), Event);
2498 case PI_ERROR_INVALID_OPERATION:
2499 throw sycl::runtime_error(
2500 "Device doesn't support run_on_host_intel tasks.", Error);
2504 throw sycl::runtime_error("Enqueueing run_on_host_intel task has failed.",
2508 case CG::CGTYPE::Kernel: {
2509 CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
2511 NDRDescT &NDRDesc = ExecKernel->MNDRDesc;
2512 std::vector<ArgDesc> &Args = ExecKernel->MArgs;
2514 if (MQueue->is_host() || (MQueue->getPlugin().getBackend() ==
2516 for (ArgDesc &Arg : Args)
2517 if (kernel_param_kind_t::kind_accessor == Arg.MType) {
2519 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2520 Req->MData = AllocaCmd->getMemAllocation();
2522 if (!RawEvents.empty()) {
2524 const detail::plugin &Plugin = EventImpls[0]->getPlugin();
2528 if (MQueue->is_host()) {
2529 ExecKernel->MHostKernel->call(NDRDesc,
2530 getEvent()->getHostProfilingInfo());
2532 assert(MQueue->getPlugin().getBackend() ==
2537 reinterpret_cast<pi_kernel>(ExecKernel->MHostKernel->getPtr()),
2538 NDRDesc.Dims, &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0],
2539 &NDRDesc.LocalSize[0], 0, nullptr, nullptr);
2545 auto getMemAllocationFunc = [this](Requirement *Req) {
2546 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2547 return AllocaCmd->getMemAllocation();
2550 const std::shared_ptr<detail::kernel_impl> &SyclKernel =
2551 ExecKernel->MSyclKernel;
2552 const std::string &KernelName = ExecKernel->MKernelName;
2557 bool KernelUsesAssert = !(SyclKernel && SyclKernel->isInterop()) &&
2558 ProgramManager::getInstance().kernelUsesAssert(
2560 if (KernelUsesAssert) {
2561 Event = &MEvent->getHandleRef();
2566 MQueue, NDRDesc, Args, ExecKernel->getKernelBundle(), SyclKernel,
2567 KernelName, OSModuleHandle, RawEvents, Event, getMemAllocationFunc,
2568 ExecKernel->MKernelCacheConfig);
2570 case CG::CGTYPE::CopyUSM: {
2571 CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
2572 MemoryManager::copy_usm(Copy->getSrc(), MQueue, Copy->getLength(),
2573 Copy->getDst(), std::move(RawEvents), Event);
2577 case CG::CGTYPE::FillUSM: {
2578 CGFillUSM *Fill = (CGFillUSM *)MCommandGroup.get();
2579 MemoryManager::fill_usm(Fill->getDst(), MQueue, Fill->getLength(),
2580 Fill->getFill(), std::move(RawEvents), Event);
2584 case CG::CGTYPE::PrefetchUSM: {
2585 CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get();
2586 MemoryManager::prefetch_usm(Prefetch->getDst(), MQueue,
2587 Prefetch->getLength(), std::move(RawEvents),
2592 case CG::CGTYPE::AdviseUSM: {
2593 CGAdviseUSM *Advise = (CGAdviseUSM *)MCommandGroup.get();
2594 MemoryManager::advise_usm(Advise->getDst(), MQueue, Advise->getLength(),
2595 Advise->getAdvice(), std::move(RawEvents), Event);
2599 case CG::CGTYPE::Copy2DUSM: {
2600 CGCopy2DUSM *Copy = (CGCopy2DUSM *)MCommandGroup.get();
2601 MemoryManager::copy_2d_usm(Copy->getSrc(), Copy->getSrcPitch(), MQueue,
2602 Copy->getDst(), Copy->getDstPitch(),
2603 Copy->getWidth(), Copy->getHeight(),
2604 std::move(RawEvents), Event);
2607 case CG::CGTYPE::Fill2DUSM: {
2608 CGFill2DUSM *Fill = (CGFill2DUSM *)MCommandGroup.get();
2609 MemoryManager::fill_2d_usm(Fill->getDst(), MQueue, Fill->getPitch(),
2610 Fill->getWidth(), Fill->getHeight(),
2611 Fill->getPattern(), std::move(RawEvents), Event);
2614 case CG::CGTYPE::Memset2DUSM: {
2615 CGMemset2DUSM *Memset = (CGMemset2DUSM *)MCommandGroup.get();
2616 MemoryManager::memset_2d_usm(
2617 Memset->getDst(), MQueue, Memset->getPitch(), Memset->getWidth(),
2618 Memset->getHeight(), Memset->getValue(), std::move(RawEvents), Event);
2621 case CG::CGTYPE::CodeplayInteropTask: {
2622 const detail::plugin &Plugin = MQueue->getPlugin();
2623 CGInteropTask *ExecInterop = (CGInteropTask *)MCommandGroup.get();
2628 if (!RawEvents.empty()) {
2631 std::vector<interop_handler::ReqToMem> ReqMemObjs;
2634 const auto &HandlerReq = ExecInterop->MRequirements;
2636 std::begin(HandlerReq), std::end(HandlerReq), [&](Requirement *Req) {
2637 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2638 auto MemArg = reinterpret_cast<pi_mem>(AllocaCmd->getMemAllocation());
2639 interop_handler::ReqToMem ReqToMem = std::make_pair(Req, MemArg);
2640 ReqMemObjs.emplace_back(ReqToMem);
2643 std::sort(std::begin(ReqMemObjs), std::end(ReqMemObjs));
2644 interop_handler InteropHandler(std::move(ReqMemObjs), MQueue);
2645 ExecInterop->MInteropTask->call(InteropHandler);
2651 case CG::CGTYPE::CodeplayHostTask: {
2652 CGHostTask *HostTask = static_cast<CGHostTask *>(MCommandGroup.get());
2654 for (ArgDesc &Arg : HostTask->MArgs) {
2655 switch (Arg.MType) {
2656 case kernel_param_kind_t::kind_accessor: {
2658 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2660 Req->MData = AllocaCmd->getMemAllocation();
2664 throw runtime_error("Unsupported arg type", PI_ERROR_INVALID_VALUE);
2668 std::vector<interop_handle::ReqToMem> ReqToMem;
2670 if (HostTask->MHostTask->isInteropTask()) {
2673 const std::vector<Requirement *> &HandlerReq = HostTask->MRequirements;
2674 auto ReqToMemConv = [&ReqToMem, HostTask](Requirement *Req) {
2675 const std::vector<AllocaCommandBase *> &AllocaCmds =
2676 Req->MSYCLMemObj->MRecord->MAllocaCommands;
2678 for (AllocaCommandBase *AllocaCmd : AllocaCmds)
2679 if (HostTask->MQueue->getContextImplPtr() ==
2680 AllocaCmd->getQueue()->getContextImplPtr()) {
2682 reinterpret_cast<pi_mem>(AllocaCmd->getMemAllocation());
2683 ReqToMem.emplace_back(std::make_pair(Req, MemArg));
2689 "Can't get memory object due to no allocation available");
2691 throw runtime_error(
2692 "Can't get memory object due to no allocation available",
2693 PI_ERROR_INVALID_MEM_OBJECT);
2695 std::for_each(std::begin(HandlerReq), std::end(HandlerReq), ReqToMemConv);
2696 std::sort(std::begin(ReqToMem), std::end(ReqToMem));
2701 copySubmissionCodeLocation();
2703 MQueue->getThreadPool().submit<DispatchHostTask>(
2704 DispatchHostTask(this, std::move(ReqToMem)));
2706 MShouldCompleteEventIfPossible = false;
2710 case CG::CGTYPE::Barrier: {
2711 if (MQueue->getDeviceImplPtr()->is_host()) {
2715 const detail::plugin &Plugin = MQueue->getPlugin();
2717 MQueue->getHandleRef(), 0, nullptr, Event);
2721 case CG::CGTYPE::BarrierWaitlist: {
2722 CGBarrier *Barrier = static_cast<CGBarrier *>(MCommandGroup.get());
2723 std::vector<detail::EventImplPtr> Events = Barrier->MEventsWaitWithBarrier;
2724 std::vector<RT::PiEvent> PiEvents = getPiEventsBlocking(Events);
2725 if (MQueue->getDeviceImplPtr()->is_host() || PiEvents.empty()) {
2730 const detail::plugin &Plugin = MQueue->getPlugin();
2732 MQueue->getHandleRef(), PiEvents.size(), &PiEvents[0], Event);
2736 case CG::CGTYPE::CopyToDeviceGlobal: {
2737 CGCopyToDeviceGlobal *Copy = (CGCopyToDeviceGlobal *)MCommandGroup.get();
2738 MemoryManager::copy_to_device_global(
2739 Copy->getDeviceGlobalPtr(), Copy->isDeviceImageScoped(), MQueue,
2740 Copy->getNumBytes(), Copy->getOffset(), Copy->getSrc(),
2741 Copy->getOSModuleHandle(), std::move(RawEvents), Event);
2745 case CG::CGTYPE::CopyFromDeviceGlobal: {
2746 CGCopyFromDeviceGlobal *Copy =
2747 (CGCopyFromDeviceGlobal *)MCommandGroup.get();
2748 MemoryManager::copy_from_device_global(
2749 Copy->getDeviceGlobalPtr(), Copy->isDeviceImageScoped(), MQueue,
2750 Copy->getNumBytes(), Copy->getOffset(), Copy->getDest(),
2751 Copy->getOSModuleHandle(), std::move(RawEvents), Event);
2755 case CG::CGTYPE::None:
2756 throw runtime_error("CG type not implemented.", PI_ERROR_INVALID_OPERATION);
2758 return PI_ERROR_INVALID_OPERATION;
2761 bool ExecCGCommand::producesPiEvent() const {
2762 return MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask;
2765 bool ExecCGCommand::supportsPostEnqueueCleanup() const {
2767 return Command::supportsPostEnqueueCleanup() &&
2768 (MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask);
2771 bool ExecCGCommand::readyForCleanup() const {
2772 if (MCommandGroup->getType() == CG::CGTYPE::CodeplayHostTask)
2773 return MLeafCounter == 0 && MEvent->isCompleted();
2774 return Command::readyForCleanup();
2784 return MAuxiliaryCommands;
2788 MFusionList.push_back(Kernel);
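// KernelFusionCommand acts as a placeholder node for kernels that are
// candidates for fusion; the commands collected in MFusionList are presumably
// fused and enqueued later, when fusion is completed or cancelled.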
2797 pi_int32 KernelFusionCommand::enqueueImp() {
2809 #ifdef XPTI_ENABLE_INSTRUMENTATION
2810 if (!xptiTraceEnabled()) {
2818 static unsigned FusionNodeCount = 0;
2819 std::stringstream PayloadStr;
2820 PayloadStr << "Fusion command #" << FusionNodeCount++;
2821 xpti::payload_t Payload = xpti::payload_t(PayloadStr.str().c_str());
2823 uint64_t CommandInstanceNo = 0;
2824 xpti_td *CmdTraceEvent =
2825 xptiMakeEvent(MCommandName.c_str(), &Payload, xpti::trace_graph_event,
2826 xpti_at::active, &CommandInstanceNo);
2829 if (CmdTraceEvent) {
2845 xpti::addMetadata(CmdTraceEvent, "sycl_device",
2846 deviceToID(MQueue->get_device()));
2847 xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
2849 xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
2854 xptiNotifySubscribers(MStreamID, xpti::trace_node_create,
2855 detail::GSYCLGraphEvent,
2864 Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\"";
2866 Stream << "ID = " << this << "\\n";
2868 << "FUSION LIST: {";
2869 bool Initial = true;
2870 for (auto *Cmd : MFusionList) {
2876 if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource()) {
2877 Stream << "created from source";
2884 Stream << "\"];" << std::endl;
2886 for (const auto &Dep : MDeps) {
2887 Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
2888 << " [ label = \"Access mode: "
2890 << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"