38 #if __has_include(<cxxabi.h>)
39 #define __SYCL_ENABLE_GNU_DEMANGLING
46 #ifdef XPTI_ENABLE_INSTRUMENTATION
47 #include "xpti/xpti_trace_framework.hpp"
52 inline namespace _V1 {
55 #ifdef XPTI_ENABLE_INSTRUMENTATION
57 extern xpti::trace_event_data_t *GSYCLGraphEvent;
59 bool CurrentCodeLocationValid() {
60 detail::tls_code_loc_t Tls;
61 auto CodeLoc = Tls.query();
62 auto FileName = CodeLoc.fileName();
63 auto FunctionName = CodeLoc.functionName();
64 return (FileName && FileName[0] !=
'\0') ||
65 (FunctionName && FunctionName[0] !=
'\0');
68 void emitInstrumentationGeneral(uint32_t StreamID, uint64_t InstanceID,
69 xpti_td *TraceEvent, uint16_t Type,
71 if (!(xptiCheckTraceEnabled(StreamID, Type) && TraceEvent))
74 xptiNotifySubscribers(StreamID, Type, detail::GSYCLGraphEvent,
75 static_cast<xpti_td *
>(TraceEvent), InstanceID, Addr);
79 #ifdef __SYCL_ENABLE_GNU_DEMANGLING
80 struct DemangleHandle {
82 DemangleHandle(
char *ptr) : p(ptr) {}
84 DemangleHandle &
operator=(
const DemangleHandle &) =
delete;
90 DemangleHandle result(abi::__cxa_demangle(Name.c_str(), NULL, NULL, &Status));
91 return (Status == 0) ? result.p : Name;
105 return "ACCELERATOR";
111 const KernelArgMask *EliminatedArgMask, std::vector<ArgDesc> &Args,
113 if (!EliminatedArgMask) {
121 std::sort(Args.begin(), Args.end(), [](
const ArgDesc &A,
const ArgDesc &B) {
122 return A.MIndex < B.MIndex;
125 size_t NextTrueIndex = 0;
130 for (
int Idx = LastIndex + 1; Idx < Arg.
MIndex; ++Idx)
131 if (!(*EliminatedArgMask)[Idx])
135 if ((*EliminatedArgMask)[Arg.
MIndex])
138 Func(Arg, NextTrueIndex);
144 #ifdef XPTI_ENABLE_INSTRUMENTATION
145 static size_t deviceToID(
const device &Device) {
149 return reinterpret_cast<size_t>(
getSyclObjImpl(Device)->getHandleRef());
162 return "discard_write";
164 return "discard_read_write";
170 #ifdef XPTI_ENABLE_INSTRUMENTATION
175 case Command::CommandType::RUN_CG:
176 return "command_group_node";
177 case Command::CommandType::COPY_MEMORY:
178 return "memory_transfer_node";
179 case Command::CommandType::ALLOCA:
180 return "memory_allocation_node";
181 case Command::CommandType::ALLOCA_SUB_BUF:
182 return "sub_buffer_creation_node";
183 case Command::CommandType::RELEASE:
184 return "memory_deallocation_node";
185 case Command::CommandType::MAP_MEM_OBJ:
186 return "memory_transfer_node";
187 case Command::CommandType::UNMAP_MEM_OBJ:
188 return "memory_transfer_node";
189 case Command::CommandType::UPDATE_REQUIREMENT:
190 return "host_acc_create_buffer_lock_node";
191 case Command::CommandType::EMPTY_TASK:
192 return "host_acc_destroy_buffer_release_node";
193 case Command::CommandType::FUSION:
194 return "kernel_fusion_placeholder_node";
196 return "unknown_node";
205 case Command::CommandType::RUN_CG:
206 return "Command Group Action";
207 case Command::CommandType::COPY_MEMORY:
208 return "Memory Transfer (Copy)";
209 case Command::CommandType::ALLOCA:
210 return "Memory Allocation";
211 case Command::CommandType::ALLOCA_SUB_BUF:
212 return "Sub Buffer Creation";
213 case Command::CommandType::RELEASE:
214 return "Memory Deallocation";
215 case Command::CommandType::MAP_MEM_OBJ:
216 return "Memory Transfer (Map)";
217 case Command::CommandType::UNMAP_MEM_OBJ:
218 return "Memory Transfer (Unmap)";
219 case Command::CommandType::UPDATE_REQUIREMENT:
220 return "Host Accessor Creation/Buffer Lock";
221 case Command::CommandType::EMPTY_TASK:
222 return "Host Accessor Destruction/Buffer Lock Release";
223 case Command::CommandType::FUSION:
224 return "Kernel Fusion Placeholder";
226 return "Unknown Action";
231 std::vector<sycl::detail::pi::PiEvent>
233 std::vector<sycl::detail::pi::PiEvent> RetPiEvents;
234 for (
auto &EventImpl : EventImpls) {
235 if (EventImpl->getHandleRef() ==
nullptr)
245 if (EventImpl->getWorkerQueue() == WorkerQueue &&
249 RetPiEvents.push_back(EventImpl->getHandleRef());
260 const std::vector<EventImplPtr> &EventImpls)
const {
261 std::vector<sycl::detail::pi::PiEvent> RetPiEvents;
262 for (
auto &EventImpl : EventImpls) {
267 if (!EventImpl->isContextInitialized() || EventImpl->is_host() ||
272 if (EventImpl->getHandleRef() ==
nullptr) {
273 if (!EventImpl->getCommand() ||
276 std::vector<Command *> AuxCmds;
287 if (EventImpl->getWorkerQueue() == WorkerQueue &&
291 RetPiEvents.push_back(EventImpl->getHandleRef());
298 return (
MType == CommandType::RUN_CG) &&
299 ((
static_cast<const ExecCGCommand *
>(
this))->getCG().getType() ==
300 CG::CGTYPE::CodeplayHostTask);
304 if ((
MType != CommandType::RUN_CG)) {
308 return (
CG.
getType() == CG::CGTYPE::Kernel) &&
314 for (
auto &EventImpl : EventImpls) {
315 EventImpl->flushIfNeeded(Queue);
321 std::vector<interop_handle::ReqToMem> MReqToMem;
324 std::map<const PluginPtr, std::vector<EventImplPtr>>
325 RequiredEventsPerPlugin;
328 const PluginPtr &Plugin = Event->getPlugin();
329 RequiredEventsPerPlugin[Plugin].push_back(Event);
337 for (
auto &PluginWithEvents : RequiredEventsPerPlugin) {
338 std::vector<sycl::detail::pi::PiEvent> RawEvents =
345 HostTask.MQueue->reportAsyncException(std::current_exception());
349 HostTask.MQueue->reportAsyncException(std::current_exception());
350 return PI_ERROR_UNKNOWN;
357 Event->waitInternal();
365 std::vector<interop_handle::ReqToMem> ReqToMem)
366 : MThisCmd{ThisCmd}, MReqToMem(
std::move(ReqToMem)) {}
369 assert(MThisCmd->
getCG().
getType() == CG::CGTYPE::CodeplayHostTask);
373 #ifdef XPTI_ENABLE_INSTRUMENTATION
378 std::unique_ptr<detail::tls_code_loc_t> AsyncCodeLocationPtr;
379 if (xptiTraceEnabled() && !CurrentCodeLocationValid()) {
380 AsyncCodeLocationPtr.reset(
386 if (WaitResult != PI_SUCCESS) {
387 std::exception_ptr EPtr = std::make_exception_ptr(sycl::runtime_error(
388 std::string(
"Couldn't wait for host-task's dependencies"),
390 HostTask.MQueue->reportAsyncException(EPtr);
399 if (
HostTask.MHostTask->isInteropTask()) {
401 HostTask.MQueue->getDeviceImplPtr(),
402 HostTask.MQueue->getContextImplPtr()};
404 HostTask.MHostTask->call(MThisCmd->
MEvent->getHostProfilingInfo(), IH);
406 HostTask.MHostTask->call(MThisCmd->
MEvent->getHostProfilingInfo());
408 auto CurrentException = std::current_exception();
409 #ifdef XPTI_ENABLE_INSTRUMENTATION
413 if (xptiTraceEnabled()) {
415 rethrow_exception(CurrentException);
418 }
catch (
const std::exception &StdException) {
422 "Host task lambda thrown non standard exception");
426 HostTask.MQueue->reportAsyncException(CurrentException);
431 #ifdef XPTI_ENABLE_INSTRUMENTATION
434 AsyncCodeLocationPtr.reset();
442 auto CurrentException = std::current_exception();
443 HostTask.MQueue->reportAsyncException(CurrentException);
450 HostEvent->waitInternal();
454 std::vector<EventImplPtr> &EventImpls,
456 if (!EventImpls.empty()) {
457 if (Queue->is_host()) {
471 std::map<context_impl *, std::vector<EventImplPtr>>
472 RequiredEventsPerContext;
476 assert(Context.get() &&
477 "Only non-host events are expected to be waited for here");
478 RequiredEventsPerContext[Context.get()].push_back(Event);
481 for (
auto &CtxWithEvents : RequiredEventsPerContext) {
482 std::vector<sycl::detail::pi::PiEvent> RawEvents =
485 RawEvents.size(), RawEvents.data());
490 assert(Event->getContextImpl().get() &&
491 "Only non-host events are expected to be waited for here");
494 std::vector<sycl::detail::pi::PiEvent> RawEvents =
497 const PluginPtr &Plugin = Queue->getPlugin();
500 MEvent->setHostEnqueueTime();
502 Queue->getHandleRef(), RawEvents.size(), &RawEvents[0], &Event);
513 const std::vector<sycl::detail::pi::PiExtSyncPoint> &SyncPoints)
514 : MQueue(
std::move(Queue)),
516 MPreparedDepsEvents(MEvent->getPreparedDepsEvents()),
517 MPreparedHostDepsEvents(MEvent->getPreparedHostDepsEvents()), MType(Type),
518 MCommandBuffer(CommandBuffer), MSyncPointDeps(SyncPoints) {
524 MEvent->setStateIncomplete();
527 #ifdef XPTI_ENABLE_INSTRUMENTATION
528 if (!xptiTraceEnabled())
536 #ifdef XPTI_ENABLE_INSTRUMENTATION
552 Command *Cmd,
void *ObjAddr,
bool IsCommand,
553 std::optional<access::mode> AccMode) {
554 #ifdef XPTI_ENABLE_INSTRUMENTATION
557 constexpr uint16_t NotificationTraceType = xpti::trace_edge_create;
558 if (!(xptiCheckTraceEnabled(
MStreamID, NotificationTraceType) &&
564 xpti::utils::StringHelper SH;
565 std::string AddressStr = SH.addressAsString<
void *>(ObjAddr);
567 std::string TypeString = SH.nameWithAddressString(Prefix, AddressStr);
570 xpti::payload_t Payload(TypeString.c_str(),
MAddress);
571 uint64_t EdgeInstanceNo;
573 xptiMakeEvent(TypeString.c_str(), &Payload, xpti::trace_graph_event,
574 xpti_at::active, &EdgeInstanceNo);
576 xpti_td *SrcEvent =
static_cast<xpti_td *
>(Cmd->
MTraceEvent);
577 xpti_td *TgtEvent =
static_cast<xpti_td *
>(
MTraceEvent);
578 EdgeEvent->source_id = SrcEvent->unique_id;
579 EdgeEvent->target_id = TgtEvent->unique_id;
581 xpti::addMetadata(EdgeEvent,
"access_mode",
582 static_cast<int>(AccMode.value()));
583 xpti::addMetadata(EdgeEvent,
"memory_object",
584 reinterpret_cast<size_t>(ObjAddr));
586 xpti::addMetadata(EdgeEvent,
"event",
reinterpret_cast<size_t>(ObjAddr));
588 xptiNotifySubscribers(
MStreamID, NotificationTraceType,
589 detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
604 #ifdef XPTI_ENABLE_INSTRUMENTATION
617 xpti::utils::StringHelper SH;
624 std::string NodeName = SH.nameWithAddressString(
"virtual_node", AddressStr);
626 xpti::payload_t VNPayload(NodeName.c_str(),
MAddress);
627 uint64_t VNodeInstanceNo;
629 xptiMakeEvent(NodeName.c_str(), &VNPayload, xpti::trace_graph_event,
630 xpti_at::active, &VNodeInstanceNo);
632 xpti::addMetadata(NodeEvent,
"kernel_name", NodeName);
633 xptiNotifySubscribers(
MStreamID, xpti::trace_node_create,
634 detail::GSYCLGraphEvent, NodeEvent, VNodeInstanceNo,
637 std::string EdgeName = SH.nameWithAddressString(
"Event", AddressStr);
638 xpti::payload_t EdgePayload(EdgeName.c_str(),
MAddress);
639 uint64_t EdgeInstanceNo;
641 xptiMakeEvent(EdgeName.c_str(), &EdgePayload, xpti::trace_graph_event,
642 xpti_at::active, &EdgeInstanceNo);
643 if (EdgeEvent && NodeEvent) {
646 xpti_td *TgtEvent =
static_cast<xpti_td *
>(
MTraceEvent);
647 EdgeEvent->source_id = NodeEvent->unique_id;
648 EdgeEvent->target_id = TgtEvent->unique_id;
649 xpti::addMetadata(EdgeEvent,
"event",
650 reinterpret_cast<size_t>(PiEventAddr));
651 xptiNotifySubscribers(
MStreamID, xpti::trace_edge_create,
652 detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
661 uint64_t CommandInstanceNo = 0;
662 #ifdef XPTI_ENABLE_INSTRUMENTATION
664 return CommandInstanceNo;
670 xpti::utils::StringHelper SH;
675 xpti::payload_t p(CommandString.c_str(),
MAddress);
676 xpti_td *CmdTraceEvent =
677 xptiMakeEvent(CommandString.c_str(), &p, xpti::trace_graph_event,
678 xpti_at::active, &CommandInstanceNo);
689 return CommandInstanceNo;
693 #ifdef XPTI_ENABLE_INSTRUMENTATION
694 constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
698 xptiNotifySubscribers(
MStreamID, NotificationTraceType,
699 detail::GSYCLGraphEvent,
706 std::vector<Command *> &ToCleanUp) {
708 const ContextImplPtr &WorkerContext = WorkerQueue->getContextImplPtr();
716 bool PiEventExpected = (!DepEvent->is_host() && DepEvent->isInitialized());
717 if (
auto *DepCmd =
static_cast<Command *
>(DepEvent->getCommand()))
718 PiEventExpected &= DepCmd->producesPiEvent();
720 if (!PiEventExpected) {
727 Command *ConnectionCmd =
nullptr;
731 if (DepEventContext != WorkerContext && !WorkerContext->is_host()) {
737 return ConnectionCmd;
741 return MQueue->getContextImplPtr();
745 assert(
MWorkerQueue &&
"MWorkerQueue must not be nullptr");
759 Command *ConnectionCmd =
nullptr;
768 if (!ConnectionCmd) {
769 MDeps.push_back(NewDep);
774 #ifdef XPTI_ENABLE_INSTRUMENTATION
780 return ConnectionCmd;
784 std::vector<Command *> &ToCleanUp) {
785 #ifdef XPTI_ENABLE_INSTRUMENTATION
799 #ifdef XPTI_ENABLE_INSTRUMENTATION
800 emitInstrumentationGeneral(
802 xpti::trace_signal,
static_cast<const void *
>(PiEventAddr));
804 std::ignore = PiEventAddr;
808 #ifdef XPTI_ENABLE_INSTRUMENTATION
811 static_cast<const void *
>(Txt));
819 std::vector<Command *> &ToCleanUp) {
820 #ifdef XPTI_ENABLE_INSTRUMENTATION
824 std::unique_ptr<detail::tls_code_loc_t> AsyncCodeLocationPtr;
825 if (xptiTraceEnabled() && !CurrentCodeLocationValid()) {
826 AsyncCodeLocationPtr.reset(
842 #ifdef XPTI_ENABLE_INSTRUMENTATION
854 #ifdef XPTI_ENABLE_INSTRUMENTATION
865 #ifdef XPTI_ENABLE_INSTRUMENTATION
881 if (PI_SUCCESS != Res)
886 (
MEvent->is_host() ||
MEvent->getHandleRef() ==
nullptr))
897 ToCleanUp.push_back(
this);
903 #ifdef XPTI_ENABLE_INSTRUMENTATION
910 #ifdef XPTI_ENABLE_INSTRUMENTATION
911 assert(
MType == CommandType::RELEASE &&
"Expected release command");
917 xpti_td *TgtTraceEvent =
static_cast<xpti_td *
>(
MTraceEvent);
922 for (
auto &Item : DepList) {
923 if (Item->MTraceEvent && Item->MAddress ==
MAddress) {
924 xpti::utils::StringHelper SH;
927 "Edge:" + SH.nameWithAddressString(commandToName(
MType), AddressStr);
931 xpti::payload_t p(TypeString.c_str(),
MAddress);
932 uint64_t EdgeInstanceNo;
934 xptiMakeEvent(TypeString.c_str(), &p, xpti::trace_graph_event,
935 xpti_at::active, &EdgeInstanceNo);
937 xpti_td *SrcTraceEvent =
static_cast<xpti_td *
>(Item->MTraceEvent);
938 EdgeEvent->target_id = TgtTraceEvent->unique_id;
939 EdgeEvent->source_id = SrcTraceEvent->unique_id;
940 xpti::addMetadata(EdgeEvent,
"memory_object",
941 reinterpret_cast<size_t>(
MAddress));
942 xptiNotifySubscribers(
MStreamID, xpti::trace_edge_create,
943 detail::GSYCLGraphEvent, EdgeEvent,
944 EdgeInstanceNo,
nullptr);
954 return "A Buffer is locked by the host accessor";
956 return "Blocked by host task";
959 return "Unknown block reason";
963 #ifdef XPTI_ENABLE_INSTRUMENTATION
964 if (!xptiTraceEnabled())
968 auto TData = Tls.
query();
969 if (TData.fileName())
971 if (TData.functionName())
976 (int)TData.lineNumber(), (int)TData.columnNumber()};
984 :
Command(Type, Queue), MLinkedAllocaCmd(LinkedAllocaCmd),
985 MIsLeaderAlloca(nullptr == LinkedAllocaCmd), MIsConst(IsConst),
986 MRequirement(
std::move(Req)), MReleaseCmd(Queue, this) {
992 #ifdef XPTI_ENABLE_INSTRUMENTATION
1001 xpti_td *TE =
static_cast<xpti_td *
>(
MTraceEvent);
1002 xpti::addMetadata(TE,
"sycl_device", deviceToID(
MQueue->get_device()));
1003 xpti::addMetadata(TE,
"sycl_device_type",
1005 xpti::addMetadata(TE,
"sycl_device_name",
1007 xpti::addMetadata(TE,
"memory_object",
reinterpret_cast<size_t>(
MAddress));
1008 xpti::addMetadata(TE,
"queue_id",
MQueue->getQueueID());
1020 bool InitFromUserData,
1023 LinkedAllocaCmd, IsConst),
1024 MInitFromUserData(InitFromUserData) {
1029 std::vector<Command *> ToCleanUp;
1032 assert(ConnectionCmd ==
nullptr);
1033 assert(ToCleanUp.empty());
1034 (void)ConnectionCmd;
1038 #ifdef XPTI_ENABLE_INSTRUMENTATION
1049 pi_int32 AllocaCommand::enqueueImp() {
1055 void *HostPtr =
nullptr;
1070 std::move(EventImpls), Event);
1076 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#FFD28A\", label=\"";
1078 Stream <<
"ID = " <<
this <<
"\\n";
1082 Stream <<
"\"];" << std::endl;
1084 for (
const auto &Dep :
MDeps) {
1085 if (Dep.MDepCommand ==
nullptr)
1087 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1088 <<
" [ label = \"Access mode: "
1090 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1097 std::vector<Command *> &ToEnqueue,
1098 std::vector<Command *> &ToCleanUp)
1102 MParentAlloca(ParentAlloca) {
1110 ToEnqueue.push_back(ConnectionCmd);
1114 #ifdef XPTI_ENABLE_INSTRUMENTATION
1121 xpti_td *TE =
static_cast<xpti_td *
>(
MTraceEvent);
1123 xpti::addMetadata(TE,
"access_range_start",
1125 xpti::addMetadata(TE,
"access_range_end",
1127 xpti::addMetadata(TE,
"queue_id",
MQueue->getQueueID());
1138 return static_cast<void *
>(
1145 pi_int32 AllocaSubBufCommand::enqueueImp() {
1161 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#FFD28A\", label=\"";
1163 Stream <<
"ID = " <<
this <<
"\\n";
1169 Stream <<
"\"];" << std::endl;
1171 for (
const auto &Dep :
MDeps) {
1172 if (Dep.MDepCommand ==
nullptr)
1174 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1175 <<
" [ label = \"Access mode: "
1177 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1188 #ifdef XPTI_ENABLE_INSTRUMENTATION
1197 xpti_td *TE =
static_cast<xpti_td *
>(
MTraceEvent);
1198 xpti::addMetadata(TE,
"sycl_device", deviceToID(
MQueue->get_device()));
1199 xpti::addMetadata(TE,
"sycl_device_type",
1201 xpti::addMetadata(TE,
"sycl_device_name",
1203 xpti::addMetadata(TE,
"allocation_type",
1204 commandToName(MAllocaCmd->
getType()));
1205 xpti::addMetadata(TE,
"queue_id",
MQueue->getQueueID());
1212 pi_int32 ReleaseCommand::enqueueImp() {
1215 std::vector<sycl::detail::pi::PiEvent> RawEvents =
getPiEvents(EventImpls);
1216 bool SkipRelease =
false;
1222 const bool CurAllocaIsHost = MAllocaCmd->
getQueue()->is_host();
1223 bool NeedUnmap =
false;
1237 NeedUnmap |= CurAllocaIsHost == MAllocaCmd->
MIsActive;
1246 UnmapEventImpl->setContextImpl(Queue->getContextImplPtr());
1247 UnmapEventImpl->setStateIncomplete();
1250 void *Src = CurAllocaIsHost
1254 void *Dst = !CurAllocaIsHost
1259 RawEvents, UnmapEvent);
1263 EventImpls.push_back(UnmapEventImpl);
1277 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#FF827A\", label=\"";
1279 Stream <<
"ID = " <<
this <<
" ; ";
1281 Stream <<
" Alloca : " << MAllocaCmd <<
"\\n";
1282 Stream <<
" MemObj : " << MAllocaCmd->
getSYCLMemObj() <<
"\\n";
1283 Stream <<
"\"];" << std::endl;
1285 for (
const auto &Dep :
MDeps) {
1286 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1287 <<
" [ label = \"Access mode: "
1289 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1304 MSrcAllocaCmd(SrcAllocaCmd), MSrcReq(
std::move(Req)), MDstPtr(DstPtr),
1310 #ifdef XPTI_ENABLE_INSTRUMENTATION
1319 xpti_td *TE =
static_cast<xpti_td *
>(
MTraceEvent);
1320 xpti::addMetadata(TE,
"sycl_device", deviceToID(
MQueue->get_device()));
1321 xpti::addMetadata(TE,
"sycl_device_type",
1323 xpti::addMetadata(TE,
"sycl_device_name",
1325 xpti::addMetadata(TE,
"memory_object",
reinterpret_cast<size_t>(
MAddress));
1326 xpti::addMetadata(TE,
"queue_id",
MQueue->getQueueID());
1333 pi_int32 MapMemObject::enqueueImp() {
1336 std::vector<sycl::detail::pi::PiEvent> RawEvents =
getPiEvents(EventImpls);
1349 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#77AFFF\", label=\"";
1351 Stream <<
"ID = " <<
this <<
" ; ";
1354 Stream <<
"\"];" << std::endl;
1356 for (
const auto &Dep :
MDeps) {
1357 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1358 <<
" [ label = \"Access mode: "
1360 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1368 MDstAllocaCmd(DstAllocaCmd), MDstReq(
std::move(Req)), MSrcPtr(SrcPtr) {
1373 #ifdef XPTI_ENABLE_INSTRUMENTATION
1382 xpti_td *TE =
static_cast<xpti_td *
>(
MTraceEvent);
1383 xpti::addMetadata(TE,
"sycl_device", deviceToID(
MQueue->get_device()));
1384 xpti::addMetadata(TE,
"sycl_device_type",
1386 xpti::addMetadata(TE,
"sycl_device_name",
1388 xpti::addMetadata(TE,
"memory_object",
reinterpret_cast<size_t>(
MAddress));
1389 xpti::addMetadata(TE,
"queue_id",
MQueue->getQueueID());
1412 return MQueue->getDeviceImplPtr()->getBackend() !=
1414 MEvent->getHandleRef() !=
nullptr;
1417 pi_int32 UnMapMemObject::enqueueImp() {
1420 std::vector<sycl::detail::pi::PiEvent> RawEvents =
getPiEvents(EventImpls);
1426 std::move(RawEvents), Event);
1432 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#EBC40F\", label=\"";
1434 Stream <<
"ID = " <<
this <<
" ; ";
1437 Stream <<
"\"];" << std::endl;
1439 for (
const auto &Dep :
MDeps) {
1440 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1441 <<
" [ label = \"Access mode: "
1443 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1454 MSrcQueue(SrcQueue), MSrcReq(
std::move(SrcReq)),
1455 MSrcAllocaCmd(SrcAllocaCmd), MDstReq(
std::move(DstReq)),
1456 MDstAllocaCmd(DstAllocaCmd) {
1457 if (!MSrcQueue->is_host()) {
1458 MEvent->setContextImpl(MSrcQueue->getContextImplPtr());
1468 #ifdef XPTI_ENABLE_INSTRUMENTATION
1477 xpti_td *CmdTraceEvent =
static_cast<xpti_td *
>(
MTraceEvent);
1478 xpti::addMetadata(CmdTraceEvent,
"sycl_device",
1479 deviceToID(
MQueue->get_device()));
1480 xpti::addMetadata(CmdTraceEvent,
"sycl_device_type",
1482 xpti::addMetadata(CmdTraceEvent,
"sycl_device_name",
1484 xpti::addMetadata(CmdTraceEvent,
"memory_object",
1485 reinterpret_cast<size_t>(
MAddress));
1486 xpti::addMetadata(CmdTraceEvent,
"copy_from",
1487 reinterpret_cast<size_t>(
1490 CmdTraceEvent,
"copy_to",
1492 xpti::addMetadata(CmdTraceEvent,
"queue_id",
MQueue->getQueueID());
1519 return MQueue->is_host() ||
1520 MQueue->getDeviceImplPtr()->getBackend() !=
1522 MEvent->getHandleRef() !=
nullptr;
1525 pi_int32 MemCpyCommand::enqueueImp() {
1545 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#C7EB15\" label=\"";
1547 Stream <<
"ID = " <<
this <<
" ; ";
1549 Stream <<
"From: " << MSrcAllocaCmd <<
" is host: " << MSrcQueue->is_host()
1551 Stream <<
"To: " << MDstAllocaCmd <<
" is host: " <<
MQueue->is_host()
1554 Stream <<
"\"];" << std::endl;
1556 for (
const auto &Dep :
MDeps) {
1557 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1558 <<
" [ label = \"Access mode: "
1560 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1567 if (Dep.MDepRequirement == Req)
1568 return Dep.MAllocaCmd;
1576 std::vector<std::shared_ptr<const void>>
1579 return ((
CGExecKernel *)MCommandGroup.get())->getAuxiliaryResources();
1588 pi_int32 UpdateHostRequirementCommand::enqueueImp() {
1594 assert(MSrcAllocaCmd &&
"Expected valid alloca command");
1595 assert(MSrcAllocaCmd->
getMemAllocation() &&
"Expected valid source pointer");
1596 assert(MDstPtr &&
"Expected valid target pointer");
1603 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#f1337f\", label=\"";
1605 Stream <<
"ID = " <<
this <<
"\\n";
1607 bool IsReqOnBuffer =
1609 Stream <<
"TYPE: " << (IsReqOnBuffer ?
"Buffer" :
"Image") <<
"\\n";
1611 Stream <<
"Is sub buffer: " << std::boolalpha << MDstReq.
MIsSubBuffer
1614 Stream <<
"\"];" << std::endl;
1616 for (
const auto &Dep :
MDeps) {
1617 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1618 <<
" [ label = \"Access mode: "
1620 <<
"MemObj: " << Dep.MAllocaCmd->getSYCLMemObj() <<
" \" ]"
1631 MSrcQueue(SrcQueue), MSrcReq(
std::move(SrcReq)),
1632 MSrcAllocaCmd(SrcAllocaCmd), MDstReq(
std::move(DstReq)), MDstPtr(DstPtr) {
1633 if (!MSrcQueue->is_host()) {
1634 MEvent->setContextImpl(MSrcQueue->getContextImplPtr());
1644 #ifdef XPTI_ENABLE_INSTRUMENTATION
1653 xpti_td *CmdTraceEvent =
static_cast<xpti_td *
>(
MTraceEvent);
1654 xpti::addMetadata(CmdTraceEvent,
"sycl_device",
1655 deviceToID(
MQueue->get_device()));
1656 xpti::addMetadata(CmdTraceEvent,
"sycl_device_type",
1658 xpti::addMetadata(CmdTraceEvent,
"sycl_device_name",
1660 xpti::addMetadata(CmdTraceEvent,
"memory_object",
1661 reinterpret_cast<size_t>(
MAddress));
1662 xpti::addMetadata(CmdTraceEvent,
"copy_from",
1663 reinterpret_cast<size_t>(
1666 CmdTraceEvent,
"copy_to",
1668 xpti::addMetadata(CmdTraceEvent,
"queue_id",
MQueue->getQueueID());
1679 pi_int32 MemCpyCommandHost::enqueueImp() {
1683 std::vector<sycl::detail::pi::PiEvent> RawEvents =
getPiEvents(EventImpls);
1712 pi_int32 EmptyCommand::enqueueImp() {
1722 MRequirements.emplace_back(ReqRef);
1723 const Requirement *
const StoredReq = &MRequirements.back();
1727 std::vector<Command *> ToCleanUp;
1729 assert(Cmd ==
nullptr &&
"Conection command should be null for EmptyCommand");
1730 assert(ToCleanUp.empty() &&
"addDep should add a command for cleanup only if "
1731 "there's a connection command");
1736 #ifdef XPTI_ENABLE_INSTRUMENTATION
1741 if (MRequirements.empty())
1750 xpti_td *CmdTraceEvent =
static_cast<xpti_td *
>(
MTraceEvent);
1751 xpti::addMetadata(CmdTraceEvent,
"sycl_device",
1752 deviceToID(
MQueue->get_device()));
1753 xpti::addMetadata(CmdTraceEvent,
"sycl_device_type",
1755 xpti::addMetadata(CmdTraceEvent,
"sycl_device_name",
1757 xpti::addMetadata(CmdTraceEvent,
"memory_object",
1758 reinterpret_cast<size_t>(
MAddress));
1759 xpti::addMetadata(CmdTraceEvent,
"queue_id",
MQueue->getQueueID());
1767 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#8d8f29\", label=\"";
1769 Stream <<
"ID = " <<
this <<
"\\n";
1770 Stream <<
"EMPTY NODE"
1773 Stream <<
"\"];" << std::endl;
1775 for (
const auto &Dep :
MDeps) {
1776 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1777 <<
" [ label = \"Access mode: "
1779 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1787 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#B6A2EB\", label=\"";
1789 Stream <<
"ID = " <<
this <<
"\\n";
1792 Stream <<
"\"];" << std::endl;
1794 for (
const auto &Dep :
MDeps) {
1795 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1796 <<
" [ label = \"Access mode: "
1798 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1807 MSrcAllocaCmd(SrcAllocaCmd), MDstReq(
std::move(Req)), MDstPtr(DstPtr) {
1813 #ifdef XPTI_ENABLE_INSTRUMENTATION
1822 xpti_td *CmdTraceEvent =
static_cast<xpti_td *
>(
MTraceEvent);
1823 xpti::addMetadata(CmdTraceEvent,
"sycl_device",
1824 deviceToID(
MQueue->get_device()));
1825 xpti::addMetadata(CmdTraceEvent,
"sycl_device_type",
1827 xpti::addMetadata(CmdTraceEvent,
"sycl_device_name",
1829 xpti::addMetadata(CmdTraceEvent,
"memory_object",
1830 reinterpret_cast<size_t>(
MAddress));
1831 xpti::addMetadata(CmdTraceEvent,
"queue_id",
MQueue->getQueueID());
1844 return "update_host";
1850 return "copy acc to acc";
1853 return "copy acc to ptr";
1856 return "copy ptr to acc";
1861 return "barrier waitlist";
1869 return "prefetch usm";
1875 return "copy 2d usm";
1878 return "fill 2d usm";
1881 return "advise usm";
1883 return "memset 2d usm";
1886 return "copy to device_global";
1889 return "copy from device_global";
1892 return "read_write host pipe";
1894 return "exec command buffer";
1896 return "copy image";
1898 return "semaphore wait";
1900 return "semaphore signal";
1908 std::unique_ptr<detail::CG> CommandGroup,
QueueImplPtr Queue,
1910 const std::vector<sycl::detail::pi::PiExtSyncPoint> &Dependencies)
1913 MCommandGroup(
std::move(CommandGroup)) {
1915 MEvent->setSubmittedQueue(
1922 #ifdef XPTI_ENABLE_INSTRUMENTATION
1924 const std::shared_ptr<detail::kernel_impl> &SyclKernel,
1926 void *&Address, std::optional<bool> &FromSource) {
1928 if (SyclKernel && SyclKernel->isCreatedFromSource()) {
1930 pi_kernel KernelHandle = SyclKernel->getHandleRef();
1931 Address = KernelHandle;
1932 KernelName = FunctionName;
1940 void instrumentationAddExtraKernelMetadata(
1941 xpti_td *&CmdTraceEvent,
const NDRDescT &NDRDesc,
1944 const std::shared_ptr<detail::kernel_impl> &SyclKernel,
1946 std::vector<ArgDesc> &CGArgs)
1949 std::vector<ArgDesc> Args;
1951 auto FilterArgs = [&Args](detail::ArgDesc &Arg,
int NextTrueIndex) {
1952 Args.push_back({Arg.MType, Arg.MPtr, Arg.MSize, NextTrueIndex});
1956 std::mutex *KernelMutex =
nullptr;
1959 std::shared_ptr<kernel_impl> SyclKernelImpl;
1960 std::shared_ptr<device_image_impl> DeviceImageImpl;
1968 kernel_id KernelID =
1972 std::shared_ptr<kernel_impl> KernelImpl =
1975 EliminatedArgMask = KernelImpl->getKernelArgMask();
1976 Program = KernelImpl->getDeviceImage()->get_program_ref();
1977 }
else if (
nullptr != SyclKernel) {
1978 auto SyclProg = SyclKernel->getProgramImpl();
1979 Program = SyclProg->getHandleRef();
1980 if (!SyclKernel->isCreatedFromSource())
1981 EliminatedArgMask = SyclKernel->getKernelArgMask();
1983 std::tie(Kernel, KernelMutex, EliminatedArgMask, Program) =
1985 Queue->getContextImplPtr(), Queue->getDeviceImplPtr(), KernelName);
1990 xpti::offload_kernel_enqueue_data_t KernelData{
1991 {NDRDesc.GlobalSize[0], NDRDesc.GlobalSize[1], NDRDesc.GlobalSize[2]},
1992 {NDRDesc.LocalSize[0], NDRDesc.LocalSize[1], NDRDesc.LocalSize[2]},
1993 {NDRDesc.GlobalOffset[0], NDRDesc.GlobalOffset[1],
1994 NDRDesc.GlobalOffset[2]},
1996 xpti::addMetadata(CmdTraceEvent,
"enqueue_kernel_data", KernelData);
1997 for (
size_t i = 0; i < Args.size(); i++) {
1999 xpti::offload_kernel_arg_data_t
arg{(int)Args[i].MType, Args[i].MPtr,
2000 Args[i].MSize, Args[i].MIndex};
2001 xpti::addMetadata(CmdTraceEvent, Prefix + std::to_string(i),
arg);
2005 void instrumentationFillCommonData(
const std::string &KernelName,
2007 uint64_t Column,
const void *
const Address,
2009 std::optional<bool> &FromSource,
2010 uint64_t &OutInstanceID,
2011 xpti_td *&OutTraceEvent) {
2018 bool HasSourceInfo =
false;
2019 xpti::payload_t Payload;
2020 if (!FileName.empty()) {
2022 Payload = xpti::payload_t(KernelName.c_str(), FileName.c_str(), Line,
2024 HasSourceInfo =
true;
2025 }
else if (Address) {
2027 Payload = xpti::payload_t(KernelName.c_str(), Address);
2031 Payload = xpti::payload_t(KernelName.c_str());
2033 uint64_t CGKernelInstanceNo;
2035 xpti_td *CmdTraceEvent =
2036 xptiMakeEvent(
"ExecCG", &Payload, xpti::trace_graph_event,
2037 xpti::trace_activity_type_t::active, &CGKernelInstanceNo);
2038 if (CmdTraceEvent) {
2039 OutInstanceID = CGKernelInstanceNo;
2040 OutTraceEvent = CmdTraceEvent;
2044 if (CGKernelInstanceNo > 1)
2047 xpti::addMetadata(CmdTraceEvent,
"sycl_device",
2048 deviceToID(Queue->get_device()));
2049 xpti::addMetadata(CmdTraceEvent,
"sycl_device_type",
2051 xpti::addMetadata(CmdTraceEvent,
"sycl_device_name",
2053 if (!KernelName.empty()) {
2054 xpti::addMetadata(CmdTraceEvent,
"kernel_name", KernelName);
2056 if (FromSource.has_value()) {
2057 xpti::addMetadata(CmdTraceEvent,
"from_source", FromSource.value());
2059 if (HasSourceInfo) {
2060 xpti::addMetadata(CmdTraceEvent,
"sym_function_name", KernelName);
2061 xpti::addMetadata(CmdTraceEvent,
"sym_source_file_name", FileName);
2062 xpti::addMetadata(CmdTraceEvent,
"sym_line_no",
static_cast<int>(Line));
2063 xpti::addMetadata(CmdTraceEvent,
"sym_column_no",
2064 static_cast<int>(Column));
2066 xpti::addMetadata(CmdTraceEvent,
"queue_id", Queue->getQueueID());
2071 #ifdef XPTI_ENABLE_INSTRUMENTATION
2072 std::pair<xpti_td *, uint64_t> emitKernelInstrumentationData(
2073 int32_t StreamID,
const std::shared_ptr<detail::kernel_impl> &SyclKernel,
2074 const detail::code_location &CodeLoc,
const std::string &SyclKernelName,
2077 std::vector<ArgDesc> &CGArgs) {
2079 auto XptiObjects = std::make_pair<xpti_td *, uint64_t>(
nullptr, -1);
2080 constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
2081 if (!xptiCheckTraceEnabled(StreamID))
2084 void *Address =
nullptr;
2085 std::optional<bool> FromSource;
2086 std::string KernelName = instrumentationGetKernelName(
2087 SyclKernel,
std::string(CodeLoc.functionName()), SyclKernelName, Address,
2090 auto &[CmdTraceEvent, InstanceID] = XptiObjects;
2093 CodeLoc.fileName() ? CodeLoc.fileName() :
std::string();
2094 instrumentationFillCommonData(KernelName, FileName, CodeLoc.lineNumber(),
2095 CodeLoc.columnNumber(), Address, Queue,
2096 FromSource, InstanceID, CmdTraceEvent);
2098 if (CmdTraceEvent) {
2099 instrumentationAddExtraKernelMetadata(CmdTraceEvent, NDRDesc,
2101 SyclKernel, Queue, CGArgs);
2103 xptiNotifySubscribers(
2104 StreamID, NotificationTraceType, detail::GSYCLGraphEvent, CmdTraceEvent,
2106 static_cast<const void *
>(
2107 commandToNodeType(Command::CommandType::RUN_CG).c_str()));
2115 #ifdef XPTI_ENABLE_INSTRUMENTATION
2116 constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
2121 std::optional<bool> FromSource;
2122 switch (MCommandGroup->getType()) {
2126 KernelName = instrumentationGetKernelName(
2127 KernelCG->MSyclKernel, MCommandGroup->MFunctionName,
2128 KernelCG->getKernelName(),
MAddress, FromSource);
2135 xpti_td *CmdTraceEvent =
nullptr;
2136 instrumentationFillCommonData(KernelName, MCommandGroup->MFileName,
2137 MCommandGroup->MLine, MCommandGroup->MColumn,
2141 if (CmdTraceEvent) {
2146 instrumentationAddExtraKernelMetadata(
2147 CmdTraceEvent, KernelCG->MNDRDesc, KernelCG->getKernelBundle(),
2148 KernelCG->MKernelName, KernelCG->MSyclKernel,
MQueue,
2152 xptiNotifySubscribers(
2153 MStreamID, NotificationTraceType, detail::GSYCLGraphEvent,
2155 static_cast<const void *
>(commandToNodeType(
MType).c_str()));
2161 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#AFFF82\", label=\"";
2163 Stream <<
"ID = " <<
this <<
"\\n";
2166 switch (MCommandGroup->getType()) {
2170 Stream <<
"Kernel name: ";
2171 if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource())
2172 Stream <<
"created from source";
2183 Stream <<
"\"];" << std::endl;
2185 for (
const auto &Dep :
MDeps) {
2186 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
2187 <<
" [ label = \"Access mode: "
2189 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
2218 sycl::info::kernel_device_specific::compile_work_group_size>(
2221 if (WGSize[0] == 0) {
2245 switch (AccessorMode) {
2258 const std::shared_ptr<device_image_impl> &DeviceImageImpl,
2259 const std::function<
void *(
Requirement *Req)> &getMemAllocationFunc,
2261 size_t NextTrueIndex) {
2262 switch (Arg.
MType) {
2273 getMemAllocationFunc
2291 &MemObjData, &MemArg);
2301 sampler *SamplerPtr = (sampler *)Arg.
MPtr;
2317 "SYCL2020 specialization constants are not yet supported on host "
2321 assert(DeviceImageImpl !=
nullptr);
2323 DeviceImageImpl->get_spec_const_buffer_ref();
2326 SpecConstsBuffer ? &SpecConstsBuffer :
nullptr;
2332 Kernel, NextTrueIndex, &MemObjData, SpecConstsBufferArg);
2337 "Invalid kernel param kind " +
2345 const std::shared_ptr<device_image_impl> &DeviceImageImpl,
2347 std::vector<sycl::detail::pi::PiEvent> &RawEvents,
2350 const std::function<
void *(
Requirement *Req)> &getMemAllocationFunc,
2351 bool IsCooperative) {
2352 const PluginPtr &Plugin = Queue->getPlugin();
2354 auto setFunc = [&Plugin, Kernel, &DeviceImageImpl, &getMemAllocationFunc,
2357 Queue->get_context(), Queue->is_host(), Arg,
2366 const bool HasLocalSize = (NDRDesc.
LocalSize[0] != 0);
2370 size_t RequiredWGSize[3] = {0, 0, 0};
2371 size_t *LocalSize =
nullptr;
2377 Kernel, Queue->getDeviceImplPtr()->getHandleRef(),
2379 RequiredWGSize,
nullptr);
2381 const bool EnforcedLocalSize =
2382 (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
2383 RequiredWGSize[2] != 0);
2384 if (EnforcedLocalSize)
2385 LocalSize = RequiredWGSize;
2387 if (OutEventImpl !=
nullptr)
2388 OutEventImpl->setHostEnqueueTime();
2391 if (IsCooperative) {
2398 &NDRDesc.
GlobalSize[0], LocalSize, RawEvents.size(),
2399 RawEvents.empty() ? nullptr : &RawEvents[0],
2400 OutEventImpl ? &OutEventImpl->getHandleRef() :
nullptr);
2406 void **CastedBlob = (
void **)Blob;
2408 std::vector<Requirement *> *Reqs =
2409 static_cast<std::vector<Requirement *> *
>(CastedBlob[0]);
2411 std::shared_ptr<HostKernelBase> *
HostKernel =
2412 static_cast<std::shared_ptr<HostKernelBase> *
>(CastedBlob[1]);
2417 void **NextArg = CastedBlob + 3;
2419 Req->
MData = *(NextArg++);
2421 (*HostKernel)->call(*NDRDesc,
nullptr);
2433 std::vector<sycl::detail::pi::PiExtSyncPoint> &SyncPoints,
2435 const std::function<
void *(
Requirement *Req)> &getMemAllocationFunc) {
2440 std::shared_ptr<kernel_impl> SyclKernelImpl =
nullptr;
2441 std::shared_ptr<device_image_impl> DeviceImageImpl =
nullptr;
2459 PiKernel = SyclKernelImpl->getHandleRef();
2460 DeviceImageImpl = SyclKernelImpl->getDeviceImage();
2461 PiProgram = DeviceImageImpl->get_program_ref();
2462 EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
2463 }
else if (Kernel !=
nullptr) {
2465 auto SyclProg = Kernel->getProgramImpl();
2467 EliminatedArgMask = Kernel->getKernelArgMask();
2470 sycl::detail::ProgramManager::getInstance().getOrCreateKernel(
2471 ContextImpl, DeviceImpl, CommandGroup.
MKernelName);
2474 auto SetFunc = [&Plugin, &
PiKernel, &DeviceImageImpl, &Ctx,
2475 &getMemAllocationFunc](sycl::detail::ArgDesc &Arg,
2476 size_t NextTrueIndex) {
2478 getMemAllocationFunc, Ctx,
false, Arg,
2482 auto Args = CommandGroup.
MArgs;
2489 auto NDRDesc = CommandGroup.
MNDRDesc;
2493 size_t RequiredWGSize[3] = {0, 0, 0};
2494 size_t *LocalSize =
nullptr;
2500 PiKernel, DeviceImpl->getHandleRef(),
2505 const bool EnforcedLocalSize =
2506 (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
2507 RequiredWGSize[2] != 0);
2508 if (EnforcedLocalSize)
2509 LocalSize = RequiredWGSize;
2515 &NDRDesc.
GlobalSize[0], LocalSize, SyncPoints.size(),
2516 SyncPoints.size() ? SyncPoints.data() :
nullptr, OutSyncPoint);
2518 if (!SyclKernelImpl && !Kernel) {
2523 if (Res != pi_result::PI_SUCCESS) {
2535 const std::shared_ptr<detail::kernel_impl> &MSyclKernel,
2537 std::vector<sycl::detail::pi::PiEvent> &RawEvents,
2539 const std::function<
void *(
Requirement *Req)> &getMemAllocationFunc,
2541 const bool KernelIsCooperative) {
2544 auto ContextImpl = Queue->getContextImplPtr();
2545 auto DeviceImpl = Queue->getDeviceImplPtr();
2547 std::mutex *KernelMutex =
nullptr;
2551 std::shared_ptr<kernel_impl> SyclKernelImpl;
2552 std::shared_ptr<device_image_impl> DeviceImageImpl;
2567 Kernel = SyclKernelImpl->getHandleRef();
2568 DeviceImageImpl = SyclKernelImpl->getDeviceImage();
2570 Program = DeviceImageImpl->get_program_ref();
2572 EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
2573 KernelMutex = SyclKernelImpl->getCacheMutex();
2574 }
else if (
nullptr != MSyclKernel) {
2575 assert(MSyclKernel->get_info<info::kernel::context>() ==
2576 Queue->get_context());
2577 Kernel = MSyclKernel->getHandleRef();
2578 auto SyclProg = MSyclKernel->getProgramImpl();
2579 Program = SyclProg->getHandleRef();
2586 KernelMutex = &MSyclKernel->getNoncacheableEnqueueMutex();
2587 EliminatedArgMask = MSyclKernel->getKernelArgMask();
2589 std::tie(Kernel, KernelMutex, EliminatedArgMask, Program) =
2591 ContextImpl, DeviceImpl, KernelName, NDRDesc);
2595 std::vector<sycl::detail::pi::PiEvent> &EventsWaitList = RawEvents;
2598 std::vector<sycl::detail::pi::PiEvent> DeviceGlobalInitEvents =
2599 ContextImpl->initializeDeviceGlobals(Program, Queue);
2600 std::vector<sycl::detail::pi::PiEvent> EventsWithDeviceGlobalInits;
2601 if (!DeviceGlobalInitEvents.empty()) {
2602 EventsWithDeviceGlobalInits.reserve(RawEvents.size() +
2603 DeviceGlobalInitEvents.size());
2604 EventsWithDeviceGlobalInits.insert(EventsWithDeviceGlobalInits.end(),
2605 RawEvents.begin(), RawEvents.end());
2606 EventsWithDeviceGlobalInits.insert(EventsWithDeviceGlobalInits.end(),
2607 DeviceGlobalInitEvents.begin(),
2608 DeviceGlobalInitEvents.end());
2609 EventsWaitList = EventsWithDeviceGlobalInits;
2617 using LockT = std::unique_lock<std::mutex>;
2618 auto Lock = KernelMutex ? LockT(*KernelMutex) : LockT();
2624 const PluginPtr &Plugin = Queue->getPlugin();
2631 NDRDesc, EventsWaitList, OutEventImpl,
2632 EliminatedArgMask, getMemAllocationFunc,
2633 KernelIsCooperative);
2635 const PluginPtr &Plugin = Queue->getPlugin();
2636 if (!SyclKernelImpl && !MSyclKernel) {
2641 if (PI_SUCCESS != Error) {
2644 const device_impl &DeviceImpl = *(Queue->getDeviceImplPtr());
2654 bool blocking,
void *ptr,
size_t size,
2655 std::vector<sycl::detail::pi::PiEvent> &RawEvents,
2661 device Device = Queue->get_device();
2663 std::optional<sycl::detail::pi::PiProgram> CachedProgram =
2664 ContextImpl->getProgramForHostPipe(Device, hostPipeEntry);
2666 Program = *CachedProgram;
2672 Queue->get_device());
2677 assert(Program &&
"Program for this hostpipe is not compiled.");
2680 const PluginPtr &Plugin = Queue->getPlugin();
2682 pi_queue pi_q = Queue->getHandleRef();
2685 auto OutEvent = OutEventImpl ? &OutEventImpl->getHandleRef() :
nullptr;
2686 if (OutEventImpl !=
nullptr)
2687 OutEventImpl->setHostEnqueueTime();
2691 pi_q, Program, PipeName.c_str(), blocking, ptr, size,
2692 RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0],
2698 pi_q, Program, PipeName.c_str(), blocking, ptr, size,
2699 RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0],
2705 pi_int32 ExecCGCommand::enqueueImpCommandBuffer() {
2714 std::vector<sycl::detail::pi::PiEvent> RawEvents =
getPiEvents(EventImpls);
2715 if (!RawEvents.empty()) {
2721 (
MQueue->supportsDiscardingPiEvents() &&
2722 MCommandGroup->getRequirements().size() == 0)
2724 : &
MEvent->getHandleRef();
2726 switch (MCommandGroup->getType()) {
2727 case CG::CGTYPE::Kernel: {
2728 CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
2730 auto getMemAllocationFunc = [
this](
Requirement *Req) {
2731 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2732 return AllocaCmd->getMemAllocation();
2737 bool KernelUsesAssert =
2738 !(ExecKernel->MSyclKernel && ExecKernel->MSyclKernel->isInterop()) &&
2740 ExecKernel->MKernelName);
2741 if (KernelUsesAssert) {
2742 Event = &
MEvent->getHandleRef();
2747 *ExecKernel,
MSyncPointDeps, &OutSyncPoint, getMemAllocationFunc);
2748 MEvent->setSyncPoint(OutSyncPoint);
2751 case CG::CGTYPE::CopyUSM: {
2752 CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
2755 Copy->getLength(), Copy->getDst(),
MSyncPointDeps, &OutSyncPoint);
2756 MEvent->setSyncPoint(OutSyncPoint);
2759 case CG::CGTYPE::CopyAccToAcc: {
2760 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2764 AllocaCommandBase *AllocaCmdSrc = getAllocaForReq(ReqSrc);
2765 AllocaCommandBase *AllocaCmdDst = getAllocaForReq(ReqDst);
2769 AllocaCmdSrc->getSYCLMemObj(), AllocaCmdSrc->getMemAllocation(),
2770 ReqSrc->MDims, ReqSrc->MMemoryRange, ReqSrc->MAccessRange,
2771 ReqSrc->MOffset, ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(),
2772 ReqDst->MDims, ReqDst->MMemoryRange, ReqDst->MAccessRange,
2775 MEvent->setSyncPoint(OutSyncPoint);
2778 case CG::CGTYPE::CopyAccToPtr: {
2779 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2781 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2785 AllocaCmd->getMemAllocation(), Req->MDims, Req->MMemoryRange,
2786 Req->MAccessRange, Req->MOffset, Req->MElemSize, (
char *)Copy->getDst(),
2787 Req->MDims, Req->MAccessRange,
2790 MEvent->setSyncPoint(OutSyncPoint);
2793 case CG::CGTYPE::CopyPtrToAcc: {
2794 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2796 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2800 (
char *)Copy->getSrc(), Req->MDims, Req->MAccessRange,
2801 {0, 0, 0}, Req->MElemSize, AllocaCmd->getMemAllocation(),
2802 Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2804 MEvent->setSyncPoint(OutSyncPoint);
2807 case CG::CGTYPE::Fill: {
2808 CGFill *Fill = (CGFill *)MCommandGroup.get();
2810 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2814 AllocaCmd->getMemAllocation(), Fill->MPattern.size(),
2815 Fill->MPattern.data(), Req->MDims, Req->MMemoryRange, Req->MAccessRange,
2816 Req->MOffset, Req->MElemSize, std::move(
MSyncPointDeps), &OutSyncPoint);
2817 MEvent->setSyncPoint(OutSyncPoint);
2820 case CG::CGTYPE::FillUSM: {
2821 CGFillUSM *Fill = (CGFillUSM *)MCommandGroup.get();
2826 MEvent->setSyncPoint(OutSyncPoint);
2829 case CG::CGTYPE::PrefetchUSM: {
2830 CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get();
2833 Prefetch->getLength(), std::move(
MSyncPointDeps), &OutSyncPoint);
2834 MEvent->setSyncPoint(OutSyncPoint);
2837 case CG::CGTYPE::AdviseUSM: {
2838 CGAdviseUSM *Advise = (CGAdviseUSM *)MCommandGroup.get();
2841 Advise->getLength(), Advise->getAdvice(), std::move(
MSyncPointDeps),
2843 MEvent->setSyncPoint(OutSyncPoint);
2848 throw runtime_error(
"CG type not implemented for command buffers.",
2849 PI_ERROR_INVALID_OPERATION);
2853 pi_int32 ExecCGCommand::enqueueImp() {
2855 return enqueueImpCommandBuffer();
2857 return enqueueImpQueue();
2861 pi_int32 ExecCGCommand::enqueueImpQueue() {
2868 bool DiscardPiEvent = (
MQueue->supportsDiscardingPiEvents() &&
2869 MCommandGroup->getRequirements().size() == 0);
2871 DiscardPiEvent ? nullptr : &
MEvent->getHandleRef();
2874 switch (MCommandGroup->getType()) {
2876 case CG::CGTYPE::UpdateHost: {
2878 "Update host should be handled by the Scheduler. " +
2881 case CG::CGTYPE::CopyAccToPtr: {
2882 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2884 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2887 AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(),
MQueue,
2888 Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2889 Req->MElemSize, Copy->getDst(),
2891 Req->MAccessRange, Req->MAccessRange, {0, 0, 0},
2892 Req->MElemSize, std::move(RawEvents),
MEvent->getHandleRef(),
MEvent);
2896 case CG::CGTYPE::CopyPtrToAcc: {
2897 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2899 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2904 AllocaCmd->getSYCLMemObj(), Copy->getSrc(),
2906 Req->MAccessRange, Req->MAccessRange,
2907 {0, 0, 0}, Req->MElemSize, AllocaCmd->getMemAllocation(),
2908 MQueue, Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2909 Req->MElemSize, std::move(RawEvents),
MEvent->getHandleRef(),
MEvent);
2913 case CG::CGTYPE::CopyAccToAcc: {
2914 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2918 AllocaCommandBase *AllocaCmdSrc = getAllocaForReq(ReqSrc);
2919 AllocaCommandBase *AllocaCmdDst = getAllocaForReq(ReqDst);
2922 AllocaCmdSrc->getSYCLMemObj(), AllocaCmdSrc->getMemAllocation(),
MQueue,
2923 ReqSrc->MDims, ReqSrc->MMemoryRange, ReqSrc->MAccessRange,
2924 ReqSrc->MOffset, ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(),
2925 MQueue, ReqDst->MDims, ReqDst->MMemoryRange, ReqDst->MAccessRange,
2926 ReqDst->MOffset, ReqDst->MElemSize, std::move(RawEvents),
2931 case CG::CGTYPE::Fill: {
2932 CGFill *Fill = (CGFill *)MCommandGroup.get();
2934 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2937 AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(),
MQueue,
2938 Fill->MPattern.size(), Fill->MPattern.data(), Req->MDims,
2939 Req->MMemoryRange, Req->MAccessRange, Req->MOffset, Req->MElemSize,
2944 case CG::CGTYPE::Kernel: {
2945 CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
2947 NDRDescT &NDRDesc = ExecKernel->MNDRDesc;
2948 std::vector<ArgDesc> &Args = ExecKernel->MArgs;
2950 if (
MQueue->is_host() || (
MQueue->getDeviceImplPtr()->getBackend() ==
2951 backend::ext_intel_esimd_emulator)) {
2952 for (ArgDesc &Arg : Args)
2955 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2956 Req->MData = AllocaCmd->getMemAllocation();
2958 if (!RawEvents.empty()) {
2960 const PluginPtr &Plugin = EventImpls[0]->getPlugin();
2965 ExecKernel->MHostKernel->call(NDRDesc,
2966 getEvent()->getHostProfilingInfo());
2968 assert(
MQueue->getDeviceImplPtr()->getBackend() ==
2969 backend::ext_intel_esimd_emulator);
2971 MEvent->setHostEnqueueTime();
2974 reinterpret_cast<pi_kernel>(ExecKernel->MHostKernel->getPtr()),
2975 NDRDesc.Dims, &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0],
2976 &NDRDesc.LocalSize[0], 0,
nullptr,
nullptr);
2981 auto getMemAllocationFunc = [
this](
Requirement *Req) {
2982 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2985 return AllocaCmd ? AllocaCmd->getMemAllocation() :
nullptr;
2988 const std::shared_ptr<detail::kernel_impl> &SyclKernel =
2989 ExecKernel->MSyclKernel;
2990 const std::string &KernelName = ExecKernel->MKernelName;
2994 bool KernelUsesAssert =
2995 !(SyclKernel && SyclKernel->isInterop()) &&
2997 if (KernelUsesAssert) {
3003 MQueue, NDRDesc, Args, ExecKernel->getKernelBundle(), SyclKernel,
3004 KernelName, RawEvents, EventImpl, getMemAllocationFunc,
3005 ExecKernel->MKernelCacheConfig, ExecKernel->MKernelIsCooperative);
3007 case CG::CGTYPE::CopyUSM: {
3008 CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
3010 Copy->getDst(), std::move(RawEvents), Event,
3015 case CG::CGTYPE::FillUSM: {
3016 CGFillUSM *Fill = (CGFillUSM *)MCommandGroup.get();
3018 Fill->getFill(), std::move(RawEvents), Event,
3023 case CG::CGTYPE::PrefetchUSM: {
3024 CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get();
3026 Prefetch->getLength(), std::move(RawEvents),
3031 case CG::CGTYPE::AdviseUSM: {
3032 CGAdviseUSM *Advise = (CGAdviseUSM *)MCommandGroup.get();
3034 Advise->getAdvice(), std::move(RawEvents), Event,
3039 case CG::CGTYPE::Copy2DUSM: {
3040 CGCopy2DUSM *Copy = (CGCopy2DUSM *)MCommandGroup.get();
3042 Copy->getDst(), Copy->getDstPitch(),
3043 Copy->getWidth(), Copy->getHeight(),
3044 std::move(RawEvents), Event,
MEvent);
3047 case CG::CGTYPE::Fill2DUSM: {
3048 CGFill2DUSM *Fill = (CGFill2DUSM *)MCommandGroup.get();
3050 Fill->getWidth(), Fill->getHeight(),
3051 Fill->getPattern(), std::move(RawEvents), Event,
3055 case CG::CGTYPE::Memset2DUSM: {
3056 CGMemset2DUSM *Memset = (CGMemset2DUSM *)MCommandGroup.get();
3058 Memset->getWidth(), Memset->getHeight(),
3059 Memset->getValue(), std::move(RawEvents),
3063 case CG::CGTYPE::CodeplayHostTask: {
3064 CGHostTask *HostTask =
static_cast<CGHostTask *
>(MCommandGroup.get());
3066 for (ArgDesc &Arg : HostTask->MArgs) {
3067 switch (Arg.MType) {
3070 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
3073 Req->MData = AllocaCmd->getMemAllocation();
3078 "Unsupported arg type " +
3083 std::vector<interop_handle::ReqToMem> ReqToMem;
3085 if (HostTask->MHostTask->isInteropTask()) {
3088 const std::vector<Requirement *> &HandlerReq =
3089 HostTask->getRequirements();
3090 auto ReqToMemConv = [&ReqToMem, HostTask](
Requirement *Req) {
3091 const std::vector<AllocaCommandBase *> &AllocaCmds =
3092 Req->MSYCLMemObj->MRecord->MAllocaCommands;
3094 for (AllocaCommandBase *AllocaCmd : AllocaCmds)
3095 if (HostTask->MQueue->getContextImplPtr() ==
3096 AllocaCmd->getQueue()->getContextImplPtr()) {
3098 reinterpret_cast<pi_mem>(AllocaCmd->getMemAllocation());
3099 ReqToMem.emplace_back(std::make_pair(Req, MemArg));
3105 "Can't get memory object due to no allocation available");
3109 "Can't get memory object due to no allocation available " +
3112 std::for_each(std::begin(HandlerReq), std::end(HandlerReq), ReqToMemConv);
3113 std::sort(std::begin(ReqToMem), std::end(ReqToMem));
3127 case CG::CGTYPE::Barrier: {
3128 if (
MQueue->getDeviceImplPtr()->is_host()) {
3134 MEvent->setHostEnqueueTime();
3136 MQueue->getHandleRef(), 0,
nullptr, Event);
3140 case CG::CGTYPE::BarrierWaitlist: {
3141 CGBarrier *Barrier =
static_cast<CGBarrier *
>(MCommandGroup.get());
3142 std::vector<detail::EventImplPtr> Events = Barrier->MEventsWaitWithBarrier;
3143 std::vector<sycl::detail::pi::PiEvent> PiEvents =
3145 if (
MQueue->getDeviceImplPtr()->is_host() || PiEvents.empty()) {
3152 MEvent->setHostEnqueueTime();
3154 MQueue->getHandleRef(), PiEvents.size(), &PiEvents[0], Event);
3158 case CG::CGTYPE::CopyToDeviceGlobal: {
3159 CGCopyToDeviceGlobal *Copy = (CGCopyToDeviceGlobal *)MCommandGroup.get();
3161 Copy->getDeviceGlobalPtr(), Copy->isDeviceImageScoped(),
MQueue,
3162 Copy->getNumBytes(), Copy->getOffset(), Copy->getSrc(),
3163 std::move(RawEvents), Event,
MEvent);
3167 case CG::CGTYPE::CopyFromDeviceGlobal: {
3168 CGCopyFromDeviceGlobal *Copy =
3169 (CGCopyFromDeviceGlobal *)MCommandGroup.get();
3171 Copy->getDeviceGlobalPtr(), Copy->isDeviceImageScoped(),
MQueue,
3172 Copy->getNumBytes(), Copy->getOffset(), Copy->getDest(),
3173 std::move(RawEvents), Event,
MEvent);
3177 case CG::CGTYPE::ReadWriteHostPipe: {
3178 CGReadWriteHostPipe *ExecReadWriteHostPipe =
3179 (CGReadWriteHostPipe *)MCommandGroup.get();
3180 std::string pipeName = ExecReadWriteHostPipe->getPipeName();
3181 void *hostPtr = ExecReadWriteHostPipe->getHostPtr();
3182 size_t typeSize = ExecReadWriteHostPipe->getTypeSize();
3183 bool blocking = ExecReadWriteHostPipe->isBlocking();
3184 bool read = ExecReadWriteHostPipe->isReadHostPipe();
3190 typeSize, RawEvents, EventImpl, read);
3192 case CG::CGTYPE::ExecCommandBuffer: {
3193 CGExecCommandBuffer *CmdBufferCG =
3194 static_cast<CGExecCommandBuffer *
>(MCommandGroup.get());
3196 MEvent->setHostEnqueueTime();
3197 return MQueue->getPlugin()
3199 CmdBufferCG->MCommandBuffer,
MQueue->getHandleRef(),
3200 RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0],
3203 case CG::CGTYPE::CopyImage: {
3204 CGCopyImage *Copy = (CGCopyImage *)MCommandGroup.get();
3209 Copy->getSrc(),
MQueue, Copy->getDst(), Desc, Copy->getFormat(),
3210 Copy->getCopyFlags(), Copy->getSrcOffset(), Copy->getDstOffset(),
3211 Copy->getHostExtent(), Copy->getCopyExtent(), std::move(RawEvents),
3215 case CG::CGTYPE::SemaphoreWait: {
3216 CGSemaphoreWait *SemWait = (CGSemaphoreWait *)MCommandGroup.get();
3217 if (
MQueue->getDeviceImplPtr()->is_host()) {
3224 MQueue->getHandleRef(), SemWait->getInteropSemaphoreHandle(), 0,
3229 case CG::CGTYPE::SemaphoreSignal: {
3230 CGSemaphoreSignal *SemSignal = (CGSemaphoreSignal *)MCommandGroup.get();
3231 if (
MQueue->getDeviceImplPtr()->is_host()) {
3238 MQueue->getHandleRef(), SemSignal->getInteropSemaphoreHandle(), 0,
3243 case CG::CGTYPE::None:
3245 "CG type not implemented. " +
3248 return PI_ERROR_INVALID_OPERATION;
3253 MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask;
3259 (MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask);
3263 if (MCommandGroup->getType() == CG::CGTYPE::CodeplayHostTask)
3275 return MAuxiliaryCommands;
3279 MFusionList.push_back(Kernel);
3288 pi_int32 KernelFusionCommand::enqueueImp() {
3304 "Cannot release the queue attached to the KernelFusionCommand if it "
3311 #ifdef XPTI_ENABLE_INSTRUMENTATION
3312 constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
3313 if (!xptiCheckTraceEnabled(
MStreamID)) {
3321 static unsigned FusionNodeCount = 0;
3322 std::stringstream PayloadStr;
3323 PayloadStr <<
"Fusion command #" << FusionNodeCount++;
3324 xpti::payload_t Payload = xpti::payload_t(PayloadStr.str().c_str());
3326 uint64_t CommandInstanceNo = 0;
3327 xpti_td *CmdTraceEvent =
3328 xptiMakeEvent(
MCommandName.c_str(), &Payload, xpti::trace_graph_event,
3329 xpti_at::active, &CommandInstanceNo);
3332 if (CmdTraceEvent) {
3348 xpti::addMetadata(CmdTraceEvent,
"sycl_device",
3349 deviceToID(
MQueue->get_device()));
3350 xpti::addMetadata(CmdTraceEvent,
"sycl_device_type",
3352 xpti::addMetadata(CmdTraceEvent,
"sycl_device_name",
3354 xpti::addMetadata(CmdTraceEvent,
"queue_id",
MQueue->getQueueID());
3358 xptiNotifySubscribers(
MStreamID, NotificationTraceType,
3359 detail::GSYCLGraphEvent,
3368 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#AFFF82\", label=\"";
3370 Stream <<
"ID = " <<
this <<
"\\n";
3372 <<
"FUSION LIST: {";
3373 bool Initial =
true;
3374 for (
auto *Cmd : MFusionList) {
3380 if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource()) {
3381 Stream <<
"created from source";
3388 Stream <<
"\"];" << std::endl;
3390 for (
const auto &Dep :
MDeps) {
3391 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
3392 <<
" [ label = \"Access mode: "
3394 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
The context class represents a SYCL context on which kernel functions may be executed.
backend get_backend() const noexcept
Returns the backend associated with this context.
detail::SYCLMemObjI * MSYCLMemObj
range< 3 > & MAccessRange
range< 3 > & MMemoryRange
Base class for memory allocation commands.
const Requirement * getRequirement() const final
AllocaCommandBase * MLinkedAllocaCmd
Alloca command linked with current command.
bool readyForCleanup() const final
Returns true iff this command is ready to be submitted for cleanup.
virtual void * getMemAllocation() const =0
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
bool MIsActive
Indicates that current alloca is active one.
bool MIsLeaderAlloca
Indicates that the command owns memory allocation in case of connected alloca command.
AllocaCommandBase(CommandType Type, QueueImplPtr Queue, Requirement Req, AllocaCommandBase *LinkedAllocaCmd, bool IsConst)
bool supportsPostEnqueueCleanup() const final
Returns true iff this command can be freed by post enqueue cleanup.
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
SYCLMemObjI * getSYCLMemObj() const
void printDot(std::ostream &Stream) const final
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
AllocaCommand(QueueImplPtr Queue, Requirement Req, bool InitFromUserData=true, AllocaCommandBase *LinkedAllocaCmd=nullptr, bool IsConst=false)
void * getMemAllocation() const final
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
void printDot(std::ostream &Stream) const final
AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req, AllocaCommandBase *ParentAlloca, std::vector< Command * > &ToEnqueue, std::vector< Command * > &ToCleanUp)
sycl::detail::kernel_param_kind_t MType
"Execute kernel" command group class.
std::shared_ptr< detail::kernel_bundle_impl > MKernelBundle
std::vector< ArgDesc > MArgs
bool MKernelIsCooperative
NDRDescT MNDRDesc
Stores ND-range description.
std::shared_ptr< detail::kernel_impl > MSyclKernel
std::shared_ptr< detail::queue_impl > MQueue
Base class for all types of command groups.
CGTYPE
Type of the command group.
The Command class represents some action that needs to be performed on one or more memory objects.
bool MShouldCompleteEventIfPossible
void copySubmissionCodeLocation()
Command * processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, std::vector< Command * > &ToCleanUp)
Perform glueing of events from different contexts.
void * MTraceEvent
The event for node_create and task_begin.
CommandType MType
The type of the command.
virtual bool producesPiEvent() const
Returns true iff the command produces a PI event on non-host devices.
void emitEnqueuedEventSignal(sycl::detail::pi::PiEvent &PiEventAddr)
Creates a signal event with the enqueued kernel event handle.
int32_t MStreamID
The stream under which the traces are emitted.
void emitInstrumentation(uint16_t Type, const char *Txt=nullptr)
Emits an event of Type.
virtual void emitInstrumentationData()=0
Instrumentation method which emits telemetry data.
void resolveReleaseDependencies(std::set< Command * > &list)
Looks at all the dependencies for the release command and enables instrumentation to report these dep...
std::string MCommandName
Buffer to build the command end-user understandable name.
virtual pi_int32 enqueueImp()=0
Private interface. Derived classes should implement this method.
bool MMarkedForCleanup
Indicates that the node will be freed by graph cleanup.
void emitEdgeEventForEventDependence(Command *Cmd, sycl::detail::pi::PiEvent &EventAddr)
Creates an edge event when the dependency is an event.
unsigned MLeafCounter
Counts the number of memory objects this command is a leaf for.
virtual bool enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, std::vector< Command * > &ToCleanUp)
Checks if the command is enqueued, and calls enqueueImp.
std::vector< sycl::detail::pi::PiEvent > getPiEvents(const std::vector< EventImplPtr > &EventImpls) const
Collect PI events from EventImpls and filter out some of them in case of in order queue.
void waitForPreparedHostEvents() const
std::string MSubmissionFileName
Introduces string to handle memory management since code_location struct works with raw char arrays.
std::mutex MEnqueueMtx
Mutex used to protect enqueueing from race conditions.
virtual const ContextImplPtr & getWorkerContext() const
Get the context of the queue this command will be submitted to.
void emitInstrumentationDataProxy()
Proxy method which calls emitInstrumentationData.
code_location MSubmissionCodeLocation
Represents code location of command submission to SYCL API, assigned with the valid value only if com...
void makeTraceEventEpilog()
If prolog has been run, run epilog; this must be guarded by a check for xptiTraceEnabled().
std::vector< sycl::detail::pi::PiEvent > getPiEventsBlocking(const std::vector< EventImplPtr > &EventImpls) const
Collect PI events from EventImpls and filter out some of them in case of in order queue.
std::vector< EventImplPtr > & MPreparedHostDepsEvents
std::vector< EventImplPtr > & MPreparedDepsEvents
Dependency events prepared for waiting by backend.
Command(CommandType Type, QueueImplPtr Queue, sycl::detail::pi::PiExtCommandBuffer CommandBuffer=nullptr, const std::vector< sycl::detail::pi::PiExtSyncPoint > &SyncPoints={})
It is safe to bind MPreparedDepsEvents and MPreparedHostDepsEvents references to event_impl class mem...
std::string MSubmissionFunctionName
uint64_t MInstanceID
Instance ID tracked for the command.
const QueueImplPtr & getWorkerQueue() const
Get the queue this command will be submitted to.
std::vector< DepDesc > MDeps
Contains list of dependencies(edges)
const char * getBlockReason() const
std::vector< sycl::detail::pi::PiExtSyncPoint > MSyncPointDeps
List of sync points for submissions to a command buffer.
virtual bool readyForCleanup() const
Returns true iff this command is ready to be submitted for cleanup.
void addUser(Command *NewUser)
std::atomic< EnqueueResultT::ResultT > MEnqueueStatus
Describes the status of the command.
const EventImplPtr & getEvent() const
uint64_t makeTraceEventProlog(void *MAddress)
Create a trace event of node_create type; this must be guarded by a check for xptiTraceEnabled().
friend class DispatchHostTask
CommandType getType() const
void * MAddress
Reserved for storing the object address such as SPIR-V or memory object address.
std::string MAddressString
Buffer to build the address string.
void waitForEvents(QueueImplPtr Queue, std::vector< EventImplPtr > &RawEvents, sycl::detail::pi::PiEvent &Event)
QueueImplPtr MWorkerQueue
bool MIsBlockable
Indicates whether the command can be blocked from enqueueing.
virtual bool supportsPostEnqueueCleanup() const
Returns true iff this command can be freed by post enqueue cleanup.
bool MTraceEventPrologComplete
Flag to indicate if makeTraceEventProlog() has been run.
sycl::detail::pi::PiExtCommandBuffer MCommandBuffer
CommandBuffer which will be used to submit to instead of the queue, if set.
std::string MCommandNodeType
Buffer to build the command node type.
Command * addDep(DepDesc NewDep, std::vector< Command * > &ToCleanUp)
void emitEdgeEventForCommandDependence(Command *Cmd, void *ObjAddr, bool IsCommand, std::optional< access::mode > AccMode=std::nullopt)
Creates an edge event when the dependency is a command.
const QueueImplPtr & getQueue() const
bool MFirstInstance
Flag to indicate if this is the first time we are seeing this payload.
DispatchHostTask(ExecCGCommand *ThisCmd, std::vector< interop_handle::ReqToMem > ReqToMem)
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
void printDot(std::ostream &Stream) const final
void addRequirement(Command *DepCmd, AllocaCommandBase *AllocaCmd, const Requirement *Req)
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
EmptyCommand(QueueImplPtr Queue)
The exec CG command enqueues execution of kernel or explicit memory operation.
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
bool supportsPostEnqueueCleanup() const final
Returns true iff this command can be freed by post enqueue cleanup.
bool readyForCleanup() const final
Returns true iff this command is ready to be submitted for cleanup.
std::string_view getTypeString() const
detail::CG & getCG() const
void printDot(std::ostream &Stream) const final
std::vector< std::shared_ptr< const void > > getAuxiliaryResources() const
void clearAuxiliaryResources()
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
ExecCGCommand(std::unique_ptr< detail::CG > CommandGroup, QueueImplPtr Queue, sycl::detail::pi::PiExtCommandBuffer CommandBuffer=nullptr, const std::vector< sycl::detail::pi::PiExtSyncPoint > &Dependencies={})
void TraceEventXPTI(const char *Message)
static GlobalHandler & instance()
void addToFusionList(ExecCGCommand *Kernel)
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
void setFusionStatus(FusionStatus Status)
Set the status of this fusion command to Status.
std::vector< ExecCGCommand * > & getFusionList()
void printDot(std::ostream &Stream) const final
KernelFusionCommand(QueueImplPtr Queue)
void resetQueue()
Reset the queue.
std::vector< Command * > & auxiliaryCommands()
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req, void **DstPtr, QueueImplPtr Queue, access::mode MapMode)
void printDot(std::ostream &Stream) const final
const ContextImplPtr & getWorkerContext() const final
Get the context of the queue this command will be submitted to.
MemCpyCommandHost(Requirement SrcReq, AllocaCommandBase *SrcAllocaCmd, Requirement DstReq, void **DstPtr, QueueImplPtr SrcQueue, QueueImplPtr DstQueue)
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
void printDot(std::ostream &Stream) const final
MemCpyCommand(Requirement SrcReq, AllocaCommandBase *SrcAllocaCmd, Requirement DstReq, AllocaCommandBase *DstAllocaCmd, QueueImplPtr SrcQueue, QueueImplPtr DstQueue)
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
void printDot(std::ostream &Stream) const final
const ContextImplPtr & getWorkerContext() const final
Get the context of the queue this command will be submitted to.
static void * allocateMemSubBuffer(ContextImplPtr TargetContext, void *ParentMemObj, size_t ElemSize, size_t Offset, range< 3 > Range, std::vector< EventImplPtr > DepEvents, sycl::detail::pi::PiEvent &OutEvent)
static void ext_oneapi_copyH2D_cmd_buffer(sycl::detail::ContextImplPtr Context, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, SYCLMemObjI *SYCLMemObj, char *SrcMem, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, void *DstMem, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::range< 3 > DstAccessRange, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< sycl::detail::pi::PiExtSyncPoint > Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint)
static void ext_oneapi_copyD2H_cmd_buffer(sycl::detail::ContextImplPtr Context, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, SYCLMemObjI *SYCLMemObj, void *SrcMem, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::range< 3 > SrcAccessRange, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, char *DstMem, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< sycl::detail::pi::PiExtSyncPoint > Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint)
static void prefetch_usm(void *Ptr, QueueImplPtr Queue, size_t Len, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void advise_usm(const void *Ptr, QueueImplPtr Queue, size_t Len, pi_mem_advice Advice, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void copy_2d_usm(const void *SrcMem, size_t SrcPitch, QueueImplPtr Queue, void *DstMem, size_t DstPitch, size_t Width, size_t Height, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void ext_oneapi_prefetch_usm_cmd_buffer(sycl::detail::ContextImplPtr Context, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, void *Mem, size_t Length, std::vector< sycl::detail::pi::PiExtSyncPoint > Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint)
static void fill(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue, size_t PatternSize, const char *Pattern, unsigned int Dim, sycl::range< 3 > Size, sycl::range< 3 > AccessRange, sycl::id< 3 > AccessOffset, unsigned int ElementSize, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent &OutEvent, const detail::EventImplPtr &OutEventImpl)
static void ext_oneapi_copyD2D_cmd_buffer(sycl::detail::ContextImplPtr Context, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, SYCLMemObjI *SYCLMemObj, void *SrcMem, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::range< 3 > SrcAccessRange, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, void *DstMem, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::range< 3 > DstAccessRange, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< sycl::detail::pi::PiExtSyncPoint > Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint)
static void * allocate(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, bool InitFromUserData, void *HostPtr, std::vector< EventImplPtr > DepEvents, sycl::detail::pi::PiEvent &OutEvent)
static void copy_image_bindless(void *Src, QueueImplPtr Queue, void *Dst, const sycl::detail::pi::PiMemImageDesc &Desc, const sycl::detail::pi::PiMemImageFormat &Format, const sycl::detail::pi::PiImageCopyFlags Flags, sycl::detail::pi::PiImageOffset SrcOffset, sycl::detail::pi::PiImageOffset DstOffset, sycl::detail::pi::PiImageRegion CopyExtent, sycl::detail::pi::PiImageRegion HostExtent, const std::vector< sycl::detail::pi::PiEvent > &DepEvents, sycl::detail::pi::PiEvent *OutEvent)
static void copy_usm(const void *SrcMem, QueueImplPtr Queue, size_t Len, void *DstMem, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void copy_to_device_global(const void *DeviceGlobalPtr, bool IsDeviceImageScoped, QueueImplPtr Queue, size_t NumBytes, size_t Offset, const void *SrcMem, const std::vector< sycl::detail::pi::PiEvent > &DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void release(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, void *MemAllocation, std::vector< EventImplPtr > DepEvents, sycl::detail::pi::PiEvent &OutEvent)
static void ext_oneapi_fill_usm_cmd_buffer(sycl::detail::ContextImplPtr Context, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, void *DstMem, size_t Len, int Pattern, std::vector< sycl::detail::pi::PiExtSyncPoint > Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint)
static void copy_from_device_global(const void *DeviceGlobalPtr, bool IsDeviceImageScoped, QueueImplPtr Queue, size_t NumBytes, size_t Offset, void *DstMem, const std::vector< sycl::detail::pi::PiEvent > &DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void ext_oneapi_copy_usm_cmd_buffer(ContextImplPtr Context, const void *SrcMem, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, size_t Len, void *DstMem, std::vector< sycl::detail::pi::PiExtSyncPoint > Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint)
static void memset_2d_usm(void *DstMem, QueueImplPtr Queue, size_t Pitch, size_t Width, size_t Height, char Value, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void fill_usm(void *DstMem, QueueImplPtr Queue, size_t Len, int Pattern, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void copy(SYCLMemObjI *SYCLMemObj, void *SrcMem, QueueImplPtr SrcQueue, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::range< 3 > SrcAccessRange, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, void *DstMem, QueueImplPtr TgtQueue, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::range< 3 > DstAccessRange, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent &OutEvent, const detail::EventImplPtr &OutEventImpl)
static void ext_oneapi_fill_cmd_buffer(sycl::detail::ContextImplPtr Context, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, SYCLMemObjI *SYCLMemObj, void *Mem, size_t PatternSize, const char *Pattern, unsigned int Dim, sycl::range< 3 > Size, sycl::range< 3 > AccessRange, sycl::id< 3 > AccessOffset, unsigned int ElementSize, std::vector< sycl::detail::pi::PiExtSyncPoint > Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint)
static void * map(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue, access::mode AccessMode, unsigned int Dim, sycl::range< 3 > Size, sycl::range< 3 > AccessRange, sycl::id< 3 > AccessOffset, unsigned int ElementSize, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent &OutEvent)
static void fill_2d_usm(void *DstMem, QueueImplPtr Queue, size_t Pitch, size_t Width, size_t Height, const std::vector< char > &Pattern, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void unmap(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue, void *MappedPtr, std::vector< sycl::detail::pi::PiEvent > DepEvents, sycl::detail::pi::PiEvent &OutEvent)
static void ext_oneapi_advise_usm_cmd_buffer(sycl::detail::ContextImplPtr Context, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, const void *Mem, size_t Length, pi_mem_advice Advice, std::vector< sycl::detail::pi::PiExtSyncPoint > Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint)
sycl::range< 3 > GlobalSize
sycl::range< 3 > NumWorkGroups
Number of workgroups, used to record the number of workgroups from the simplest form of parallel_for_...
sycl::id< 3 > GlobalOffset
sycl::range< 3 > LocalSize
void set(sycl::range< Dims_ > NumWorkItems)
static ProgramManager & getInstance()
kernel_id getSYCLKernelID(const std::string &KernelName)
device_image_plain getDeviceImageFromBinaryImage(RTDeviceBinaryImage *BinImage, const context &Ctx, const device &Dev)
bool kernelUsesAssert(const std::string &KernelName) const
HostPipeMapEntry * getHostPipeEntry(const std::string &UniqueId)
device_image_plain build(const device_image_plain &DeviceImage, const std::vector< device > &Devs, const property_list &PropList)
std::tuple< sycl::detail::pi::PiKernel, std::mutex *, const KernelArgMask *, sycl::detail::pi::PiProgram > getOrCreateKernel(const ContextImplPtr &ContextImpl, const DeviceImplPtr &DeviceImpl, const std::string &KernelName, const NDRDescT &NDRDesc={})
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
bool supportsPostEnqueueCleanup() const final
Returns true iff this command can be freed by post enqueue cleanup.
ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd)
void printDot(std::ostream &Stream) const final
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
bool readyForCleanup() const final
Returns true iff this command is ready to be submitted for cleanup.
virtual MemObjType getType() const =0
Command * connectDepEvent(Command *const Cmd, const EventImplPtr &DepEvent, const DepDesc &Dep, std::vector< Command * > &ToCleanUp)
Perform connection of events in multiple contexts.
QueueImplPtr getDefaultHostQueue()
void enqueueCommandForCG(EventImplPtr NewEvent, std::vector< Command * > &AuxilaryCmds, BlockingT Blocking=NON_BLOCKING)
GraphBuilder MGraphBuilder
static Scheduler & getInstance()
void NotifyHostTaskCompletion(Command *Cmd)
void printDot(std::ostream &Stream) const final
UnMapMemObject(AllocaCommandBase *DstAllocaCmd, Requirement Req, void **SrcPtr, QueueImplPtr Queue)
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
UpdateHostRequirementCommand(QueueImplPtr Queue, Requirement Req, AllocaCommandBase *SrcAllocaCmd, void **DstPtr)
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
void printDot(std::ostream &Stream) const final
static void bufferAssociateNotification(const void *, const void *)
const PluginPtr & getPlugin() const
sycl::detail::pi::PiDevice & getHandleRef()
Get reference to PI device.
Data type that manages the code_location information in TLS.
const detail::code_location & query()
Query the information in the TLS slot.
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
bool is_accelerator() const
Check if device is an accelerator device.
bool is_gpu() const
Check if device is a GPU device.
bool is_cpu() const
Check if device is a CPU device.
cl_int get_cl_code() const
Objects of the class identify kernel is some kernel_bundle related APIs.
Provides an abstraction of a SYCL kernel.
Defines the iteration domain of both the work-groups and the overall dispatch.
void handleErrorOrWarning(pi_result Error, const device_impl &DeviceImpl, pi_kernel Kernel, const NDRDescT &NDRDesc)
Analyzes error code and arguments of piEnqueueKernelLaunch to emit user-friendly exception describing...
::pi_kernel_cache_config PiKernelCacheConfig
::pi_ext_sync_point PiExtSyncPoint
void free(void *Ptr, const context &Ctxt, const code_location &CL)
@ kind_specialization_constants_buffer
static void flushCrossQueueDeps(const std::vector< EventImplPtr > &EventImpls, const QueueImplPtr &Queue)
static std::string_view cgTypeToString(detail::CG::CGTYPE Type)
static std::string demangleKernelName(std::string Name)
std::vector< bool > KernelArgMask
constexpr const char * SYCL_STREAM_NAME
std::enable_if< !std::is_same< typename Param::return_type, sycl::range< 3 > >::value, typename Param::return_type >::type get_kernel_device_specific_info(sycl::detail::pi::PiKernel Kernel, sycl::detail::pi::PiDevice Device, const PluginPtr &Plugin)
std::string codeToString(pi_int32 code)
void ReverseRangeDimensionsForKernel(NDRDescT &NDR)
std::shared_ptr< sycl::detail::context_impl > ContextImplPtr
decltype(Obj::impl) getSyclObjImpl(const Obj &SyclObject)
pi_int32 enqueueReadWriteHostPipe(const QueueImplPtr &Queue, const std::string &PipeName, bool blocking, void *ptr, size_t size, std::vector< sycl::detail::pi::PiEvent > &RawEvents, const detail::EventImplPtr &OutEventImpl, bool read)
std::shared_ptr< event_impl > EventImplPtr
static void adjustNDRangePerKernel(NDRDescT &NDR, sycl::detail::pi::PiKernel Kernel, const device_impl &DeviceImpl)
pi_int32 enqueueImpCommandBufferKernel(context Ctx, DeviceImplPtr DeviceImpl, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, const CGExecKernel &CommandGroup, std::vector< sycl::detail::pi::PiExtSyncPoint > &SyncPoints, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint, const std::function< void *(Requirement *Req)> &getMemAllocationFunc)
pi_mem_obj_access AccessModeToPi(access::mode AccessorMode)
static pi_result SetKernelParamsAndLaunch(const QueueImplPtr &Queue, std::vector< ArgDesc > &Args, const std::shared_ptr< device_image_impl > &DeviceImageImpl, sycl::detail::pi::PiKernel Kernel, NDRDescT &NDRDesc, std::vector< sycl::detail::pi::PiEvent > &RawEvents, const detail::EventImplPtr &OutEventImpl, const KernelArgMask *EliminatedArgMask, const std::function< void *(Requirement *Req)> &getMemAllocationFunc, bool IsCooperative)
std::shared_ptr< plugin > PluginPtr
AccessorImplHost Requirement
std::shared_ptr< device_impl > DeviceImplPtr
static std::string deviceToString(device Device)
pi_int32 enqueueImpKernel(const QueueImplPtr &Queue, NDRDescT &NDRDesc, std::vector< ArgDesc > &Args, const std::shared_ptr< detail::kernel_bundle_impl > &KernelBundleImplPtr, const std::shared_ptr< detail::kernel_impl > &MSyclKernel, const std::string &KernelName, std::vector< sycl::detail::pi::PiEvent > &RawEvents, const detail::EventImplPtr &OutEventImpl, const std::function< void *(Requirement *Req)> &getMemAllocationFunc, sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig, const bool KernelIsCooperative)
std::shared_ptr< detail::kernel_bundle_impl > KernelBundleImplPtr
void applyFuncOnFilteredArgs(const KernelArgMask *EliminatedArgMask, std::vector< ArgDesc > &Args, std::function< void(detail::ArgDesc &Arg, int NextTrueIndex)> Func)
std::shared_ptr< sycl::detail::queue_impl > QueueImplPtr
void SetArgBasedOnType(const PluginPtr &Plugin, sycl::detail::pi::PiKernel Kernel, const std::shared_ptr< device_image_impl > &DeviceImageImpl, const std::function< void *(Requirement *Req)> &getMemAllocationFunc, const sycl::context &Context, bool IsHost, detail::ArgDesc &Arg, size_t NextTrueIndex)
void DispatchNativeKernel(void *Blob)
static std::string accessModeToString(access::mode Mode)
Function for_each(Group g, Ptr first, Ptr last, Function f)
__DPCPP_SYCL_EXTERNAL _SYCL_EXT_CPLX_INLINE_VISIBILITY std::enable_if_t< is_genfloat< _Tp >::value, _Tp > arg(const complex< _Tp > &__c)
PropertyListT int access::address_space multi_ptr & operator=(multi_ptr &&)=default
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using e and sycl_category()
@ PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA
@ PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM
pi_result piextEnqueueReadHostPipe(pi_queue queue, pi_program program, const char *pipe_symbol, pi_bool blocking, void *ptr, size_t size, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event)
Plugin.
pi_result piKernelRelease(pi_kernel kernel)
pi_result piextEnqueueWriteHostPipe(pi_queue queue, pi_program program, const char *pipe_symbol, pi_bool blocking, void *ptr, size_t size, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event)
Write to pipe of a given name.
pi_result piKernelSetArg(pi_kernel kernel, pi_uint32 arg_index, size_t arg_size, const void *arg_value)
pi_result piEnqueueEventsWaitWithBarrier(pi_queue command_queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
pi_result piEnqueueKernelLaunch(pi_queue queue, pi_kernel kernel, pi_uint32 work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
@ PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE
pi_result piextWaitExternalSemaphore(pi_queue command_queue, pi_interop_semaphore_handle sem_handle, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
API to instruct the queue with a non-blocking wait on an external semaphore.
@ PI_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG
provides the preferred cache configuration (large slm or large data)
pi_result piProgramRelease(pi_program program)
pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer, pi_queue queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
API to submit the command-buffer to queue for execution, returns an error if the command-buffer is no...
pi_result piextKernelSetArgSampler(pi_kernel kernel, pi_uint32 arg_index, const pi_sampler *arg_value)
pi_result piextCommandBufferNDRangeKernel(pi_ext_command_buffer command_buffer, pi_kernel kernel, pi_uint32 work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, pi_uint32 num_sync_points_in_wait_list, const pi_ext_sync_point *sync_point_wait_list, pi_ext_sync_point *sync_point)
API to append a kernel execution command to the command-buffer.
pi_result piEnqueueEventsWait(pi_queue command_queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
pi_result piKernelSetExecInfo(pi_kernel kernel, pi_kernel_exec_info value_name, size_t param_value_size, const void *param_value)
API to set attributes controlling kernel execution.
pi_result piEventsWait(pi_uint32 num_events, const pi_event *event_list)
pi_result piextEnqueueCooperativeKernelLaunch(pi_queue queue, pi_kernel kernel, pi_uint32 work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
pi_result piextSignalExternalSemaphore(pi_queue command_queue, pi_interop_semaphore_handle sem_handle, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
API to instruct the queue to signal the external semaphore handle once all previous commands have com...
pi_result piextKernelSetArgPointer(pi_kernel kernel, pi_uint32 arg_index, size_t arg_size, const void *arg_value)
Sets up pointer arguments for CL kernels.
pi_result piKernelGetGroupInfo(pi_kernel kernel, pi_device device, pi_kernel_group_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
@ PI_KERNEL_ARG_MEM_OBJ_ACCESS
pi_result piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, const pi_mem_obj_property *arg_properties, const pi_mem *arg_value)
pi_mem_access_flag mem_access
Dependency between two commands.
const Requirement * MDepRequirement
Requirement for the dependency.
Command * MDepCommand
The actual dependency command.
Result of command enqueueing.
RTDeviceBinaryImage * getDevBinImage()