38 #if __has_include(<cxxabi.h>)
39 #define __SYCL_ENABLE_GNU_DEMANGLING
46 #ifdef XPTI_ENABLE_INSTRUMENTATION
47 #include "xpti/xpti_trace_framework.hpp"
52 inline namespace _V1 {
55 #ifdef XPTI_ENABLE_INSTRUMENTATION
57 extern xpti::trace_event_data_t *GSYCLGraphEvent;
// Returns true when the calling thread's thread-local code location
// (captured via detail::tls_code_loc_t) holds a usable location, i.e.
// either a non-empty file name or a non-empty function name. Used by the
// XPTI instrumentation paths to decide whether an async code location
// still needs to be stashed for the current call.
59 static bool CurrentCodeLocationValid() {
60 detail::tls_code_loc_t Tls;
61 auto CodeLoc = Tls.query();
62 auto FileName = CodeLoc.fileName();
63 auto FunctionName = CodeLoc.functionName();
// Either field may be null; guard the dereference before testing for an
// empty string.
64 return (FileName && FileName[0] !=
'\0') ||
65 (FunctionName && FunctionName[0] !=
'\0');
68 void emitInstrumentationGeneral(uint32_t StreamID, uint64_t InstanceID,
69 xpti_td *TraceEvent, uint16_t Type,
71 if (!(xptiCheckTraceEnabled(StreamID, Type) && TraceEvent))
74 xptiNotifySubscribers(StreamID, Type, detail::GSYCLGraphEvent,
75 static_cast<xpti_td *
>(TraceEvent), InstanceID, Addr);
// Produces a numeric identifier for a SYCL device by reinterpreting the
// underlying native device handle as an integer. The value is only used
// as an opaque ID in XPTI trace metadata (see addDeviceMetadata); it is
// never dereferenced here.
78 static size_t deviceToID(
const device &Device) {
79 return reinterpret_cast<size_t>(
getSyclObjImpl(Device)->getHandleRef());
82 static void addDeviceMetadata(xpti_td *TraceEvent,
const QueueImplPtr &Queue) {
83 xpti::addMetadata(TraceEvent,
"sycl_device_type",
86 xpti::addMetadata(TraceEvent,
"sycl_device",
87 deviceToID(Queue->get_device()));
88 xpti::addMetadata(TraceEvent,
"sycl_device_name",
// Returns the instrumentation ID of the given queue, or 0 when no queue
// is associated (e.g. a null QueueImplPtr). The result is stashed under
// XPTI_QUEUE_INSTANCE_ID_KEY by the various emitInstrumentationData
// paths in this file.
93 static unsigned long long getQueueID(
const QueueImplPtr &Queue) {
94 return Queue ? Queue->getQueueID() : 0;
100 return Queue->getContextImplPtr();
104 #ifdef __SYCL_ENABLE_GNU_DEMANGLING
105 struct DemangleHandle {
107 DemangleHandle(
char *ptr) : p(ptr) {}
109 DemangleHandle(
const DemangleHandle &) =
delete;
110 DemangleHandle &
operator=(
const DemangleHandle &) =
delete;
116 DemangleHandle result(abi::__cxa_demangle(Name.c_str(), NULL, NULL, &Status));
117 return (Status == 0) ? result.p : Name;
124 const KernelArgMask *EliminatedArgMask, std::vector<ArgDesc> &Args,
126 if (!EliminatedArgMask || EliminatedArgMask->size() == 0) {
134 std::sort(Args.begin(), Args.end(), [](
const ArgDesc &A,
const ArgDesc &B) {
135 return A.MIndex < B.MIndex;
138 size_t NextTrueIndex = 0;
143 for (
int Idx = LastIndex + 1; Idx < Arg.
MIndex; ++Idx)
144 if (!(*EliminatedArgMask)[Idx])
148 if ((*EliminatedArgMask)[Arg.
MIndex])
151 Func(Arg, NextTrueIndex);
166 return "discard_write";
168 return "discard_read_write";
174 #ifdef XPTI_ENABLE_INSTRUMENTATION
179 case Command::CommandType::RUN_CG:
180 return "command_group_node";
181 case Command::CommandType::COPY_MEMORY:
182 return "memory_transfer_node";
183 case Command::CommandType::ALLOCA:
184 return "memory_allocation_node";
185 case Command::CommandType::ALLOCA_SUB_BUF:
186 return "sub_buffer_creation_node";
187 case Command::CommandType::RELEASE:
188 return "memory_deallocation_node";
189 case Command::CommandType::MAP_MEM_OBJ:
190 return "memory_transfer_node";
191 case Command::CommandType::UNMAP_MEM_OBJ:
192 return "memory_transfer_node";
193 case Command::CommandType::UPDATE_REQUIREMENT:
194 return "host_acc_create_buffer_lock_node";
195 case Command::CommandType::EMPTY_TASK:
196 return "host_acc_destroy_buffer_release_node";
198 return "unknown_node";
207 case Command::CommandType::RUN_CG:
208 return "Command Group Action";
209 case Command::CommandType::COPY_MEMORY:
210 return "Memory Transfer (Copy)";
211 case Command::CommandType::ALLOCA:
212 return "Memory Allocation";
213 case Command::CommandType::ALLOCA_SUB_BUF:
214 return "Sub Buffer Creation";
215 case Command::CommandType::RELEASE:
216 return "Memory Deallocation";
217 case Command::CommandType::MAP_MEM_OBJ:
218 return "Memory Transfer (Map)";
219 case Command::CommandType::UNMAP_MEM_OBJ:
220 return "Memory Transfer (Unmap)";
221 case Command::CommandType::UPDATE_REQUIREMENT:
222 return "Host Accessor Creation/Buffer Lock";
223 case Command::CommandType::EMPTY_TASK:
224 return "Host Accessor Destruction/Buffer Lock Release";
226 return "Unknown Action";
231 std::vector<ur_event_handle_t>
233 std::vector<ur_event_handle_t> RetUrEvents;
234 for (
auto &EventImpl : EventImpls) {
235 auto Handle = EventImpl->getHandle();
236 if (Handle ==
nullptr)
247 RetUrEvents.push_back(Handle);
258 const std::vector<EventImplPtr> &EventImpls)
const {
259 std::vector<ur_event_handle_t> RetUrEvents;
260 for (
auto &EventImpl : EventImpls) {
265 if (EventImpl->isDefaultConstructed() || EventImpl->isHost() ||
273 if (!EventImpl->isInterop() && !EventImpl->isEnqueued()) {
274 if (!EventImpl->getCommand() ||
277 std::vector<Command *> AuxCmds;
289 RetUrEvents.push_back(EventImpl->getHandle());
296 return (
MType == CommandType::RUN_CG) &&
297 ((
static_cast<const ExecCGCommand *
>(
this))->getCG().getType() ==
302 if ((
MType != CommandType::RUN_CG)) {
313 for (
auto &EventImpl : EventImpls) {
314 EventImpl->flushIfNeeded(Queue);
320 struct EnqueueNativeCommandData {
322 std::function<void(interop_handle)>
func;
// C-style trampoline handed to urEnqueueNativeCommandExp together with a
// pointer to an EnqueueNativeCommandData instance: it unpacks that
// payload and invokes the stored host function with its interop_handle.
// The ur_queue_handle_t parameter is unused here.
325 void InteropFreeFunc(ur_queue_handle_t,
void *InteropData) {
// InteropData is the &CustomOpData passed at enqueue time; the cast must
// match that type exactly.
326 auto *Data =
reinterpret_cast<EnqueueNativeCommandData *
>(InteropData);
327 return Data->func(Data->ih);
333 std::vector<interop_handle::ReqToMem> MReqToMem;
334 std::vector<ur_mem_handle_t> MReqUrMem;
336 bool waitForEvents()
const {
337 std::map<const PluginPtr, std::vector<EventImplPtr>>
338 RequiredEventsPerPlugin;
341 const PluginPtr &Plugin = Event->getPlugin();
342 RequiredEventsPerPlugin[Plugin].push_back(Event);
350 for (
auto &PluginWithEvents : RequiredEventsPerPlugin) {
351 std::vector<ur_event_handle_t> RawEvents =
353 if (RawEvents.size() == 0)
356 PluginWithEvents.first->call<UrApiKind::urEventWait>(RawEvents.size(),
359 MThisCmd->
MEvent->getSubmittedQueue()->reportAsyncException(
360 std::current_exception());
363 MThisCmd->
MEvent->getSubmittedQueue()->reportAsyncException(
364 std::current_exception());
372 Event->waitInternal();
380 std::vector<interop_handle::ReqToMem> ReqToMem,
381 std::vector<ur_mem_handle_t> ReqUrMem)
382 : MThisCmd{ThisCmd}, MReqToMem(
std::move(ReqToMem)),
383 MReqUrMem(
std::move(ReqUrMem)) {}
390 #ifdef XPTI_ENABLE_INSTRUMENTATION
395 std::unique_ptr<detail::tls_code_loc_t> AsyncCodeLocationPtr;
396 if (xptiTraceEnabled() && !CurrentCodeLocationValid()) {
397 AsyncCodeLocationPtr.reset(
402 if (!waitForEvents()) {
405 std::string(
"Couldn't wait for host-task's dependencies")));
407 MThisCmd->
MEvent->getSubmittedQueue()->reportAsyncException(EPtr);
416 if (
HostTask.MHostTask->isInteropTask()) {
418 "Host task submissions should have an associated queue");
420 HostTask.MQueue->getDeviceImplPtr(),
421 HostTask.MQueue->getContextImplPtr()};
425 bool NativeCommandSupport =
false;
426 Queue->getPlugin()->
call<UrApiKind::urDeviceGetInfo>(
428 UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP,
429 sizeof(NativeCommandSupport), &NativeCommandSupport,
nullptr);
430 if (NativeCommandSupport) {
431 EnqueueNativeCommandData CustomOpData{
432 IH,
HostTask.MHostTask->MInteropTask};
441 Queue->getPlugin()->call<UrApiKind::urEnqueueNativeCommandExp>(
442 HostTask.MQueue->getHandleRef(), InteropFreeFunc, &CustomOpData,
443 MReqUrMem.size(), MReqUrMem.data(),
nullptr, 0,
nullptr,
nullptr);
445 HostTask.MHostTask->call(MThisCmd->
MEvent->getHostProfilingInfo(),
449 HostTask.MHostTask->call(MThisCmd->
MEvent->getHostProfilingInfo());
451 auto CurrentException = std::current_exception();
452 #ifdef XPTI_ENABLE_INSTRUMENTATION
456 if (xptiTraceEnabled()) {
458 rethrow_exception(CurrentException);
461 }
catch (
const std::exception &StdException) {
465 "Host task lambda thrown non standard exception");
469 MThisCmd->
MEvent->getSubmittedQueue()->reportAsyncException(
475 #ifdef XPTI_ENABLE_INSTRUMENTATION
478 AsyncCodeLocationPtr.reset();
486 auto CurrentException = std::current_exception();
487 MThisCmd->
MEvent->getSubmittedQueue()->reportAsyncException(
495 HostEvent->waitInternal();
499 std::vector<EventImplPtr> &EventImpls,
500 ur_event_handle_t &Event) {
503 assert(!Event->isHost() &&
504 "Only non-host events are expected to be waited for here");
506 if (!EventImpls.empty()) {
521 std::map<context_impl *, std::vector<EventImplPtr>>
522 RequiredEventsPerContext;
526 assert(Context.get() &&
527 "Only non-host events are expected to be waited for here");
528 RequiredEventsPerContext[Context.get()].push_back(Event);
531 for (
auto &CtxWithEvents : RequiredEventsPerContext) {
532 std::vector<ur_event_handle_t> RawEvents =
534 if (!RawEvents.empty()) {
535 CtxWithEvents.first->getPlugin()->call<UrApiKind::urEventWait>(
536 RawEvents.size(), RawEvents.data());
540 std::vector<ur_event_handle_t> RawEvents =
getUrEvents(EventImpls);
542 const PluginPtr &Plugin = Queue->getPlugin();
545 MEvent->setHostEnqueueTime();
546 Plugin->call<UrApiKind::urEnqueueEventsWait>(
547 Queue->getHandleRef(), RawEvents.size(), &RawEvents[0], &Event);
557 ur_exp_command_buffer_handle_t CommandBuffer,
558 const std::vector<ur_exp_command_buffer_sync_point_t> &SyncPoints)
559 : MQueue(
std::move(Queue)),
561 MPreparedDepsEvents(MEvent->getPreparedDepsEvents()),
562 MPreparedHostDepsEvents(MEvent->getPreparedHostDepsEvents()), MType(Type),
563 MCommandBuffer(CommandBuffer), MSyncPointDeps(SyncPoints) {
570 MEvent->setStateIncomplete();
573 #ifdef XPTI_ENABLE_INSTRUMENTATION
574 if (!xptiTraceEnabled())
582 #ifdef XPTI_ENABLE_INSTRUMENTATION
598 Command *Cmd,
void *ObjAddr,
bool IsCommand,
599 std::optional<access::mode> AccMode) {
600 #ifdef XPTI_ENABLE_INSTRUMENTATION
603 constexpr uint16_t NotificationTraceType = xpti::trace_edge_create;
604 if (!(xptiCheckTraceEnabled(
MStreamID, NotificationTraceType) &&
610 xpti::utils::StringHelper SH;
611 std::string AddressStr = SH.addressAsString<
void *>(ObjAddr);
613 std::string TypeString = SH.nameWithAddressString(Prefix, AddressStr);
616 xpti::payload_t Payload(TypeString.c_str(),
MAddress);
617 uint64_t EdgeInstanceNo;
619 xptiMakeEvent(TypeString.c_str(), &Payload, xpti::trace_graph_event,
620 xpti_at::active, &EdgeInstanceNo);
622 xpti_td *SrcEvent =
static_cast<xpti_td *
>(Cmd->
MTraceEvent);
623 xpti_td *TgtEvent =
static_cast<xpti_td *
>(
MTraceEvent);
624 EdgeEvent->source_id = SrcEvent->unique_id;
625 EdgeEvent->target_id = TgtEvent->unique_id;
627 xpti::addMetadata(EdgeEvent,
"access_mode",
628 static_cast<int>(AccMode.value()));
629 xpti::addMetadata(EdgeEvent,
"memory_object",
630 reinterpret_cast<size_t>(ObjAddr));
632 xpti::addMetadata(EdgeEvent,
"event",
reinterpret_cast<size_t>(ObjAddr));
634 xptiNotifySubscribers(
MStreamID, NotificationTraceType,
635 detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
649 ur_event_handle_t &UrEventAddr) {
650 #ifdef XPTI_ENABLE_INSTRUMENTATION
663 xpti::utils::StringHelper SH;
664 std::string AddressStr = SH.addressAsString<ur_event_handle_t>(UrEventAddr);
669 std::string NodeName = SH.nameWithAddressString(
"virtual_node", AddressStr);
672 xpti::payload_t VNPayload(NodeName.c_str(),
MAddress);
673 uint64_t VNodeInstanceNo;
675 xptiMakeEvent(NodeName.c_str(), &VNPayload, xpti::trace_graph_event,
676 xpti_at::active, &VNodeInstanceNo);
678 xpti::addMetadata(NodeEvent,
"kernel_name", NodeName);
679 xptiNotifySubscribers(
MStreamID, xpti::trace_node_create,
680 detail::GSYCLGraphEvent, NodeEvent, VNodeInstanceNo,
683 std::string EdgeName = SH.nameWithAddressString(
"Event", AddressStr);
684 xpti::payload_t EdgePayload(EdgeName.c_str(),
MAddress);
685 uint64_t EdgeInstanceNo;
687 xptiMakeEvent(EdgeName.c_str(), &EdgePayload, xpti::trace_graph_event,
688 xpti_at::active, &EdgeInstanceNo);
689 if (EdgeEvent && NodeEvent) {
692 xpti_td *TgtEvent =
static_cast<xpti_td *
>(
MTraceEvent);
693 EdgeEvent->source_id = NodeEvent->unique_id;
694 EdgeEvent->target_id = TgtEvent->unique_id;
695 xpti::addMetadata(EdgeEvent,
"event",
696 reinterpret_cast<size_t>(UrEventAddr));
697 xptiNotifySubscribers(
MStreamID, xpti::trace_edge_create,
698 detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
707 uint64_t CommandInstanceNo = 0;
708 #ifdef XPTI_ENABLE_INSTRUMENTATION
710 return CommandInstanceNo;
716 xpti::utils::StringHelper SH;
718 std::string CommandString =
721 xpti::payload_t p(CommandString.c_str(),
MAddress);
722 xpti_td *CmdTraceEvent =
723 xptiMakeEvent(CommandString.c_str(), &p, xpti::trace_graph_event,
724 xpti_at::active, &CommandInstanceNo);
736 return CommandInstanceNo;
740 #ifdef XPTI_ENABLE_INSTRUMENTATION
741 constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
745 xptiNotifySubscribers(
MStreamID, NotificationTraceType,
746 detail::GSYCLGraphEvent,
753 std::vector<Command *> &ToCleanUp) {
760 bool PiEventExpected =
761 (!DepEvent->isHost() && !DepEvent->isDefaultConstructed());
762 if (
auto *DepCmd =
static_cast<Command *
>(DepEvent->getCommand()))
763 PiEventExpected &= DepCmd->producesPiEvent();
765 if (!PiEventExpected) {
772 Command *ConnectionCmd =
nullptr;
776 if (DepEventContext != WorkerContext && WorkerContext) {
782 return ConnectionCmd;
788 return MQueue->getContextImplPtr();
801 Command *ConnectionCmd =
nullptr;
810 if (!ConnectionCmd) {
811 MDeps.push_back(NewDep);
816 #ifdef XPTI_ENABLE_INSTRUMENTATION
822 return ConnectionCmd;
826 std::vector<Command *> &ToCleanUp) {
827 #ifdef XPTI_ENABLE_INSTRUMENTATION
831 ur_event_handle_t UrEventAddr = Event->getHandle();
841 #ifdef XPTI_ENABLE_INSTRUMENTATION
842 emitInstrumentationGeneral(
844 xpti::trace_signal,
static_cast<const void *
>(UrEventAddr));
846 std::ignore = UrEventAddr;
850 #ifdef XPTI_ENABLE_INSTRUMENTATION
853 static_cast<const void *
>(Txt));
861 std::vector<Command *> &ToCleanUp) {
862 #ifdef XPTI_ENABLE_INSTRUMENTATION
866 std::unique_ptr<detail::tls_code_loc_t> AsyncCodeLocationPtr;
867 if (xptiTraceEnabled() && !CurrentCodeLocationValid()) {
868 AsyncCodeLocationPtr.reset(
884 #ifdef XPTI_ENABLE_INSTRUMENTATION
888 std::string Info =
"enqueue.barrier[";
896 #ifdef XPTI_ENABLE_INSTRUMENTATION
907 #ifdef XPTI_ENABLE_INSTRUMENTATION
923 if (UR_RESULT_SUCCESS != Res)
940 ToCleanUp.push_back(
this);
946 #ifdef XPTI_ENABLE_INSTRUMENTATION
953 #ifdef XPTI_ENABLE_INSTRUMENTATION
954 assert(
MType == CommandType::RELEASE &&
"Expected release command");
960 xpti_td *TgtTraceEvent =
static_cast<xpti_td *
>(
MTraceEvent);
965 for (
auto &Item : DepList) {
966 if (Item->MTraceEvent && Item->MAddress ==
MAddress) {
967 xpti::utils::StringHelper SH;
968 std::string AddressStr = SH.addressAsString<
void *>(
MAddress);
969 std::string TypeString =
970 "Edge:" + SH.nameWithAddressString(commandToName(
MType), AddressStr);
974 xpti::payload_t p(TypeString.c_str(),
MAddress);
975 uint64_t EdgeInstanceNo;
977 xptiMakeEvent(TypeString.c_str(), &p, xpti::trace_graph_event,
978 xpti_at::active, &EdgeInstanceNo);
980 xpti_td *SrcTraceEvent =
static_cast<xpti_td *
>(Item->MTraceEvent);
981 EdgeEvent->target_id = TgtTraceEvent->unique_id;
982 EdgeEvent->source_id = SrcTraceEvent->unique_id;
983 xpti::addMetadata(EdgeEvent,
"memory_object",
984 reinterpret_cast<size_t>(
MAddress));
985 xptiNotifySubscribers(
MStreamID, xpti::trace_edge_create,
986 detail::GSYCLGraphEvent, EdgeEvent,
987 EdgeInstanceNo,
nullptr);
997 return "A Buffer is locked by the host accessor";
999 return "Blocked by host task";
1001 return "Unknown block reason";
1006 #ifdef XPTI_ENABLE_INSTRUMENTATION
1007 if (!xptiTraceEnabled())
1011 auto TData = Tls.
query();
1012 if (TData.fileName())
1014 if (TData.functionName())
1019 (int)TData.lineNumber(), (int)TData.columnNumber()};
1027 :
Command(Type, Queue), MLinkedAllocaCmd(LinkedAllocaCmd),
1028 MIsLeaderAlloca(nullptr == LinkedAllocaCmd), MIsConst(IsConst),
1029 MRequirement(
std::move(Req)), MReleaseCmd(Queue, this) {
1035 #ifdef XPTI_ENABLE_INSTRUMENTATION
1049 xpti_td *TE =
static_cast<xpti_td *
>(
MTraceEvent);
1050 addDeviceMetadata(TE,
MQueue);
1051 xpti::addMetadata(TE,
"memory_object",
reinterpret_cast<size_t>(
MAddress));
1054 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1067 bool InitFromUserData,
1070 LinkedAllocaCmd, IsConst),
1071 MInitFromUserData(InitFromUserData) {
1076 std::vector<Command *> ToCleanUp;
1079 assert(ConnectionCmd ==
nullptr);
1080 assert(ToCleanUp.empty());
1081 (void)ConnectionCmd;
1085 #ifdef XPTI_ENABLE_INSTRUMENTATION
1093 ur_result_t AllocaCommand::enqueueImp() {
1097 ur_event_handle_t UREvent =
nullptr;
1099 void *HostPtr =
nullptr;
1105 MEvent->setHandle(UREvent);
1107 return UR_RESULT_SUCCESS;
1114 MInitFromUserData, HostPtr,
1115 std::move(EventImpls), UREvent);
1116 MEvent->setHandle(UREvent);
1117 return UR_RESULT_SUCCESS;
1121 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#FFD28A\", label=\"";
1123 Stream <<
"ID = " <<
this <<
"\\n";
1127 Stream <<
"\"];" << std::endl;
1129 for (
const auto &Dep :
MDeps) {
1130 if (Dep.MDepCommand ==
nullptr)
1132 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1133 <<
" [ label = \"Access mode: "
1135 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1142 std::vector<Command *> &ToEnqueue,
1143 std::vector<Command *> &ToCleanUp)
1147 MParentAlloca(ParentAlloca) {
1155 ToEnqueue.push_back(ConnectionCmd);
1159 #ifdef XPTI_ENABLE_INSTRUMENTATION
1163 xpti_td *TE =
static_cast<xpti_td *
>(
MTraceEvent);
1165 xpti::addMetadata(TE,
"access_range_start",
1168 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(
MQueue));
1178 return static_cast<void *
>(
1185 ur_result_t AllocaSubBufCommand::enqueueImp() {
1188 ur_event_handle_t UREvent =
nullptr;
1194 MEvent->setHandle(UREvent);
1198 return UR_RESULT_SUCCESS;
1202 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#FFD28A\", label=\"";
1204 Stream <<
"ID = " <<
this <<
"\\n";
1209 Stream <<
"\"];" << std::endl;
1211 for (
const auto &Dep :
MDeps) {
1212 if (Dep.MDepCommand ==
nullptr)
1214 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1215 <<
" [ label = \"Access mode: "
1217 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1228 #ifdef XPTI_ENABLE_INSTRUMENTATION
1236 xpti_td *TE =
static_cast<xpti_td *
>(
MTraceEvent);
1237 addDeviceMetadata(TE,
MQueue);
1238 xpti::addMetadata(TE,
"allocation_type",
1239 commandToName(MAllocaCmd->
getType()));
1242 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(
MQueue));
1247 ur_result_t ReleaseCommand::enqueueImp() {
1250 std::vector<ur_event_handle_t> RawEvents =
getUrEvents(EventImpls);
1251 bool SkipRelease =
false;
1257 const bool CurAllocaIsHost = !MAllocaCmd->
getQueue();
1258 bool NeedUnmap =
false;
1272 NeedUnmap |= CurAllocaIsHost == MAllocaCmd->
MIsActive;
1281 UnmapEventImpl->setContextImpl(
getContext(Queue));
1282 UnmapEventImpl->setStateIncomplete();
1283 ur_event_handle_t UREvent =
nullptr;
1285 void *Src = CurAllocaIsHost
1289 void *Dst = !CurAllocaIsHost
1294 RawEvents, UREvent);
1295 UnmapEventImpl->setHandle(UREvent);
1298 EventImpls.push_back(UnmapEventImpl);
1300 ur_event_handle_t UREvent =
nullptr;
1306 std::move(EventImpls), UREvent);
1308 MEvent->setHandle(UREvent);
1309 return UR_RESULT_SUCCESS;
1313 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#FF827A\", label=\"";
1315 Stream <<
"ID = " <<
this <<
" ; ";
1317 Stream <<
" Alloca : " << MAllocaCmd <<
"\\n";
1318 Stream <<
" MemObj : " << MAllocaCmd->
getSYCLMemObj() <<
"\\n";
1319 Stream <<
"\"];" << std::endl;
1321 for (
const auto &Dep :
MDeps) {
1322 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1323 <<
" [ label = \"Access mode: "
1325 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1340 MSrcAllocaCmd(SrcAllocaCmd), MSrcReq(
std::move(Req)), MDstPtr(DstPtr),
1346 #ifdef XPTI_ENABLE_INSTRUMENTATION
1354 xpti_td *TE =
static_cast<xpti_td *
>(
MTraceEvent);
1355 addDeviceMetadata(TE,
MQueue);
1356 xpti::addMetadata(TE,
"memory_object",
reinterpret_cast<size_t>(
MAddress));
1359 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(
MQueue));
1364 ur_result_t MapMemObject::enqueueImp() {
1367 std::vector<ur_event_handle_t> RawEvents =
getUrEvents(EventImpls);
1370 ur_event_handle_t UREvent =
nullptr;
1375 MEvent->setHandle(UREvent);
1376 return UR_RESULT_SUCCESS;
1380 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#77AFFF\", label=\"";
1382 Stream <<
"ID = " <<
this <<
" ; ";
1385 Stream <<
"\"];" << std::endl;
1387 for (
const auto &Dep :
MDeps) {
1388 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1389 <<
" [ label = \"Access mode: "
1391 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1399 MDstAllocaCmd(DstAllocaCmd), MDstReq(
std::move(Req)), MSrcPtr(SrcPtr) {
1404 #ifdef XPTI_ENABLE_INSTRUMENTATION
1412 xpti_td *TE =
static_cast<xpti_td *
>(
MTraceEvent);
1413 addDeviceMetadata(TE,
MQueue);
1414 xpti::addMetadata(TE,
"memory_object",
reinterpret_cast<size_t>(
MAddress));
1417 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(
MQueue));
1438 return MQueue && (
MQueue->getDeviceImplPtr()->getBackend() !=
1440 MEvent->getHandle() !=
nullptr);
1443 ur_result_t UnMapMemObject::enqueueImp() {
1446 std::vector<ur_event_handle_t> RawEvents =
getUrEvents(EventImpls);
1449 ur_event_handle_t UREvent =
nullptr;
1452 std::move(RawEvents), UREvent);
1453 MEvent->setHandle(UREvent);
1455 return UR_RESULT_SUCCESS;
1459 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#EBC40F\", label=\"";
1461 Stream <<
"ID = " <<
this <<
" ; ";
1464 Stream <<
"\"];" << std::endl;
1466 for (
const auto &Dep :
MDeps) {
1467 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1468 <<
" [ label = \"Access mode: "
1470 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1481 MSrcQueue(SrcQueue), MSrcReq(
std::move(SrcReq)),
1482 MSrcAllocaCmd(SrcAllocaCmd), MDstReq(
std::move(DstReq)),
1483 MDstAllocaCmd(DstAllocaCmd) {
1485 MEvent->setContextImpl(MSrcQueue->getContextImplPtr());
1495 #ifdef XPTI_ENABLE_INSTRUMENTATION
1503 xpti_td *CmdTraceEvent =
static_cast<xpti_td *
>(
MTraceEvent);
1504 addDeviceMetadata(CmdTraceEvent,
MQueue);
1505 xpti::addMetadata(CmdTraceEvent,
"memory_object",
1506 reinterpret_cast<size_t>(
MAddress));
1507 xpti::addMetadata(CmdTraceEvent,
"copy_from",
1508 MSrcQueue ? deviceToID(MSrcQueue->get_device()) : 0);
1509 xpti::addMetadata(CmdTraceEvent,
"copy_to",
1513 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(
MQueue));
1541 MQueue->getDeviceImplPtr()->getBackend() !=
1543 MEvent->getHandle() !=
nullptr;
1546 ur_result_t MemCpyCommand::enqueueImp() {
1550 ur_event_handle_t UREvent =
nullptr;
1562 MEvent->setHandle(UREvent);
1563 return UR_RESULT_SUCCESS;
1567 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#C7EB15\" label=\"";
1569 Stream <<
"ID = " <<
this <<
" ; ";
1571 Stream <<
"From: " << MSrcAllocaCmd <<
" is host: " << !MSrcQueue <<
"\\n";
1572 Stream <<
"To: " << MDstAllocaCmd <<
" is host: " << !
MQueue <<
"\\n";
1574 Stream <<
"\"];" << std::endl;
1576 for (
const auto &Dep :
MDeps) {
1577 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1578 <<
" [ label = \"Access mode: "
1580 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1587 if (Dep.MDepRequirement == Req)
1588 return Dep.MAllocaCmd;
1596 std::vector<std::shared_ptr<const void>>
1599 return ((
CGExecKernel *)MCommandGroup.get())->getAuxiliaryResources();
1608 ur_result_t UpdateHostRequirementCommand::enqueueImp() {
1611 ur_event_handle_t UREvent =
nullptr;
1613 MEvent->setHandle(UREvent);
1615 assert(MSrcAllocaCmd &&
"Expected valid alloca command");
1616 assert(MSrcAllocaCmd->
getMemAllocation() &&
"Expected valid source pointer");
1617 assert(MDstPtr &&
"Expected valid target pointer");
1620 return UR_RESULT_SUCCESS;
1624 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#f1337f\", label=\"";
1626 Stream <<
"ID = " <<
this <<
"\\n";
1628 bool IsReqOnBuffer =
1630 Stream <<
"TYPE: " << (IsReqOnBuffer ?
"Buffer" :
"Image") <<
"\\n";
1632 Stream <<
"Is sub buffer: " << std::boolalpha << MDstReq.
MIsSubBuffer
1635 Stream <<
"\"];" << std::endl;
1637 for (
const auto &Dep :
MDeps) {
1638 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1639 <<
" [ label = \"Access mode: "
1641 <<
"MemObj: " << Dep.MAllocaCmd->getSYCLMemObj() <<
" \" ]"
1652 MSrcQueue(SrcQueue), MSrcReq(
std::move(SrcReq)),
1653 MSrcAllocaCmd(SrcAllocaCmd), MDstReq(
std::move(DstReq)), MDstPtr(DstPtr) {
1655 MEvent->setContextImpl(MSrcQueue->getContextImplPtr());
1665 #ifdef XPTI_ENABLE_INSTRUMENTATION
1673 xpti_td *CmdTraceEvent =
static_cast<xpti_td *
>(
MTraceEvent);
1674 addDeviceMetadata(CmdTraceEvent,
MQueue);
1675 xpti::addMetadata(CmdTraceEvent,
"memory_object",
1676 reinterpret_cast<size_t>(
MAddress));
1677 xpti::addMetadata(CmdTraceEvent,
"copy_from",
1678 MSrcQueue ? deviceToID(MSrcQueue->get_device()) : 0);
1679 xpti::addMetadata(CmdTraceEvent,
"copy_to",
1683 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(
MQueue));
1694 ur_result_t MemCpyCommandHost::enqueueImp() {
1698 std::vector<ur_event_handle_t> RawEvents =
getUrEvents(EventImpls);
1700 ur_event_handle_t UREvent =
nullptr;
1708 return UR_RESULT_SUCCESS;
1720 MEvent->setHandle(UREvent);
1725 return UR_RESULT_SUCCESS;
1732 ur_result_t EmptyCommand::enqueueImp() {
1734 ur_event_handle_t UREvent =
nullptr;
1736 MEvent->setHandle(UREvent);
1737 return UR_RESULT_SUCCESS;
1743 MRequirements.emplace_back(ReqRef);
1744 const Requirement *
const StoredReq = &MRequirements.back();
1748 std::vector<Command *> ToCleanUp;
1750 assert(Cmd ==
nullptr &&
"Conection command should be null for EmptyCommand");
1751 assert(ToCleanUp.empty() &&
"addDep should add a command for cleanup only if "
1752 "there's a connection command");
1757 #ifdef XPTI_ENABLE_INSTRUMENTATION
1762 if (MRequirements.empty())
1770 xpti_td *CmdTraceEvent =
static_cast<xpti_td *
>(
MTraceEvent);
1771 addDeviceMetadata(CmdTraceEvent,
MQueue);
1772 xpti::addMetadata(CmdTraceEvent,
"memory_object",
1773 reinterpret_cast<size_t>(
MAddress));
1776 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(
MQueue));
1782 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#8d8f29\", label=\"";
1784 Stream <<
"ID = " <<
this <<
"\\n";
1785 Stream <<
"EMPTY NODE"
1788 Stream <<
"\"];" << std::endl;
1790 for (
const auto &Dep :
MDeps) {
1791 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1792 <<
" [ label = \"Access mode: "
1794 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1802 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#B6A2EB\", label=\"";
1804 Stream <<
"ID = " <<
this <<
"\\n";
1807 Stream <<
"\"];" << std::endl;
1809 for (
const auto &Dep :
MDeps) {
1810 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1811 <<
" [ label = \"Access mode: "
1813 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1822 MSrcAllocaCmd(SrcAllocaCmd), MDstReq(
std::move(Req)), MDstPtr(DstPtr) {
1828 #ifdef XPTI_ENABLE_INSTRUMENTATION
1836 xpti_td *CmdTraceEvent =
static_cast<xpti_td *
>(
MTraceEvent);
1837 addDeviceMetadata(CmdTraceEvent,
MQueue);
1838 xpti::addMetadata(CmdTraceEvent,
"memory_object",
1839 reinterpret_cast<size_t>(
MAddress));
1842 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(
MQueue));
1853 return "update_host";
1859 return "copy acc to acc";
1862 return "copy acc to ptr";
1865 return "copy ptr to acc";
1870 return "barrier waitlist";
1878 return "prefetch usm";
1884 return "copy 2d usm";
1887 return "fill 2d usm";
1890 return "advise usm";
1892 return "memset 2d usm";
1895 return "copy to device_global";
1898 return "copy from device_global";
1901 return "read_write host pipe";
1903 return "exec command buffer";
1905 return "copy image";
1907 return "semaphore wait";
1909 return "semaphore signal";
1917 std::unique_ptr<detail::CG> CommandGroup,
QueueImplPtr Queue,
1918 bool EventNeeded, ur_exp_command_buffer_handle_t CommandBuffer,
1919 const std::vector<ur_exp_command_buffer_sync_point_t> &Dependencies)
1922 MEventNeeded(EventNeeded), MCommandGroup(
std::move(CommandGroup)) {
1924 MEvent->setSubmittedQueue(
1928 MEvent->markAsProfilingTagEvent();
1933 #ifdef XPTI_ENABLE_INSTRUMENTATION
1934 std::string instrumentationGetKernelName(
1935 const std::shared_ptr<detail::kernel_impl> &SyclKernel,
1936 const std::string &FunctionName,
const std::string &SyclKernelName,
1937 void *&Address, std::optional<bool> &FromSource) {
1938 std::string KernelName;
1939 if (SyclKernel && SyclKernel->isCreatedFromSource()) {
1941 ur_kernel_handle_t KernelHandle = SyclKernel->getHandleRef();
1942 Address = KernelHandle;
1943 KernelName = FunctionName;
1951 void instrumentationAddExtraKernelMetadata(
1952 xpti_td *&CmdTraceEvent,
const NDRDescT &NDRDesc,
1954 const std::string &KernelName,
1955 const std::shared_ptr<detail::kernel_impl> &SyclKernel,
1957 std::vector<ArgDesc> &CGArgs)
1960 std::vector<ArgDesc> Args;
1962 auto FilterArgs = [&Args](detail::ArgDesc &Arg,
int NextTrueIndex) {
1963 Args.push_back({Arg.MType, Arg.MPtr, Arg.MSize, NextTrueIndex});
1965 ur_program_handle_t Program =
nullptr;
1966 ur_kernel_handle_t
Kernel =
nullptr;
1967 std::mutex *KernelMutex =
nullptr;
1970 std::shared_ptr<kernel_impl> SyclKernelImpl;
1971 std::shared_ptr<device_image_impl> DeviceImageImpl;
1979 kernel_id KernelID =
1983 std::shared_ptr<kernel_impl> KernelImpl =
1986 EliminatedArgMask = KernelImpl->getKernelArgMask();
1987 Program = KernelImpl->getDeviceImage()->get_ur_program_ref();
1988 }
else if (
nullptr != SyclKernel) {
1989 Program = SyclKernel->getProgramRef();
1990 if (!SyclKernel->isCreatedFromSource())
1991 EliminatedArgMask = SyclKernel->getKernelArgMask();
1993 assert(Queue &&
"Kernel submissions should have an associated queue");
1994 std::tie(Kernel, KernelMutex, EliminatedArgMask, Program) =
1996 Queue->getContextImplPtr(), Queue->getDeviceImplPtr(), KernelName);
2001 xpti::offload_kernel_enqueue_data_t KernelData{
2002 {NDRDesc.GlobalSize[0], NDRDesc.GlobalSize[1], NDRDesc.GlobalSize[2]},
2003 {NDRDesc.LocalSize[0], NDRDesc.LocalSize[1], NDRDesc.LocalSize[2]},
2004 {NDRDesc.GlobalOffset[0], NDRDesc.GlobalOffset[1],
2005 NDRDesc.GlobalOffset[2]},
2007 xpti::addMetadata(CmdTraceEvent,
"enqueue_kernel_data", KernelData);
2008 for (
size_t i = 0; i < Args.size(); i++) {
2009 std::string Prefix(
"arg");
2010 xpti::offload_kernel_arg_data_t
arg{(int)Args[i].MType, Args[i].MPtr,
2011 Args[i].MSize, Args[i].MIndex};
2012 xpti::addMetadata(CmdTraceEvent, Prefix + std::to_string(i),
arg);
2016 void instrumentationFillCommonData(
const std::string &KernelName,
2017 const std::string &FileName, uint64_t Line,
2018 uint64_t Column,
const void *
const Address,
2020 std::optional<bool> &FromSource,
2021 uint64_t &OutInstanceID,
2022 xpti_td *&OutTraceEvent) {
2029 bool HasSourceInfo =
false;
2030 xpti::payload_t Payload;
2031 if (!FileName.empty()) {
2033 Payload = xpti::payload_t(KernelName.c_str(), FileName.c_str(), Line,
2035 HasSourceInfo =
true;
2036 }
else if (Address) {
2038 Payload = xpti::payload_t(KernelName.c_str(), Address);
2042 Payload = xpti::payload_t(KernelName.c_str());
2045 uint64_t CGKernelInstanceNo;
2047 xpti_td *CmdTraceEvent =
2048 xptiMakeEvent(
"ExecCG", &Payload, xpti::trace_graph_event,
2049 xpti::trace_activity_type_t::active, &CGKernelInstanceNo);
2050 if (CmdTraceEvent) {
2051 OutInstanceID = CGKernelInstanceNo;
2052 OutTraceEvent = CmdTraceEvent;
2054 addDeviceMetadata(CmdTraceEvent, Queue);
2055 if (!KernelName.empty()) {
2056 xpti::addMetadata(CmdTraceEvent,
"kernel_name", KernelName);
2058 if (FromSource.has_value()) {
2059 xpti::addMetadata(CmdTraceEvent,
"from_source", FromSource.value());
2061 if (HasSourceInfo) {
2062 xpti::addMetadata(CmdTraceEvent,
"sym_function_name", KernelName);
2063 xpti::addMetadata(CmdTraceEvent,
"sym_source_file_name", FileName);
2064 xpti::addMetadata(CmdTraceEvent,
"sym_line_no",
static_cast<int>(Line));
2065 xpti::addMetadata(CmdTraceEvent,
"sym_column_no",
2066 static_cast<int>(Column));
2075 #ifdef XPTI_ENABLE_INSTRUMENTATION
2076 std::pair<xpti_td *, uint64_t> emitKernelInstrumentationData(
2077 int32_t StreamID,
const std::shared_ptr<detail::kernel_impl> &SyclKernel,
2078 const detail::code_location &CodeLoc,
const std::string &SyclKernelName,
2081 std::vector<ArgDesc> &CGArgs) {
2083 auto XptiObjects = std::make_pair<xpti_td *, uint64_t>(
nullptr, -1);
2084 constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
2085 if (!xptiCheckTraceEnabled(StreamID))
2088 void *Address =
nullptr;
2089 std::optional<bool> FromSource;
2090 std::string KernelName = instrumentationGetKernelName(
2091 SyclKernel, std::string(CodeLoc.functionName()), SyclKernelName, Address,
2094 auto &[CmdTraceEvent, InstanceID] = XptiObjects;
2096 std::string FileName =
2097 CodeLoc.fileName() ? CodeLoc.fileName() : std::string();
2098 instrumentationFillCommonData(KernelName, FileName, CodeLoc.lineNumber(),
2099 CodeLoc.columnNumber(), Address, Queue,
2100 FromSource, InstanceID, CmdTraceEvent);
2102 if (CmdTraceEvent) {
2104 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(Queue));
2105 instrumentationAddExtraKernelMetadata(CmdTraceEvent, NDRDesc,
2107 SyclKernel, Queue, CGArgs);
2109 xptiNotifySubscribers(
2110 StreamID, NotificationTraceType, detail::GSYCLGraphEvent, CmdTraceEvent,
2112 static_cast<const void *
>(
2113 commandToNodeType(Command::CommandType::RUN_CG).c_str()));
2121 #ifdef XPTI_ENABLE_INSTRUMENTATION
2122 constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
2126 std::string KernelName;
2127 std::optional<bool> FromSource;
2128 switch (MCommandGroup->getType()) {
2132 KernelName = instrumentationGetKernelName(
2133 KernelCG->MSyclKernel, MCommandGroup->MFunctionName,
2134 KernelCG->getKernelName(),
MAddress, FromSource);
2141 xpti_td *CmdTraceEvent =
nullptr;
2142 instrumentationFillCommonData(KernelName, MCommandGroup->MFileName,
2143 MCommandGroup->MLine, MCommandGroup->MColumn,
2147 if (CmdTraceEvent) {
2148 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
2154 instrumentationAddExtraKernelMetadata(
2155 CmdTraceEvent, KernelCG->MNDRDesc, KernelCG->getKernelBundle(),
2156 KernelCG->MKernelName, KernelCG->MSyclKernel,
MQueue,
2160 xptiNotifySubscribers(
2161 MStreamID, NotificationTraceType, detail::GSYCLGraphEvent,
2163 static_cast<const void *
>(commandToNodeType(
MType).c_str()));
2169 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#AFFF82\", label=\"";
2171 Stream <<
"ID = " <<
this <<
"\\n";
2174 switch (MCommandGroup->getType()) {
2178 Stream <<
"Kernel name: ";
2179 if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource())
2180 Stream <<
"created from source";
2191 Stream <<
"\"];" << std::endl;
2193 for (
const auto &Dep :
MDeps) {
2194 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
2195 <<
" [ label = \"Access mode: "
2197 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
2225 sycl::info::kernel_device_specific::compile_work_group_size>(
2228 if (WGSize[0] == 0) {
2232 static_cast<int>(NDR.
Dims)};
2253 switch (AccessorMode) {
2255 return UR_MEM_FLAG_READ_ONLY;
2258 return UR_MEM_FLAG_WRITE_ONLY;
2260 return UR_MEM_FLAG_READ_WRITE;
2265 const PluginPtr &Plugin, ur_kernel_handle_t Kernel,
2266 const std::shared_ptr<device_image_impl> &DeviceImageImpl,
2267 const std::function<
void *(
Requirement *Req)> &getMemAllocationFunc,
2269 switch (Arg.
MType) {
2279 ur_mem_handle_t MemArg =
2280 getMemAllocationFunc
2281 ?
reinterpret_cast<ur_mem_handle_t
>(getMemAllocationFunc(Req))
2283 ur_kernel_arg_mem_obj_properties_t MemObjData{};
2284 MemObjData.stype = UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES;
2286 Plugin->call<UrApiKind::urKernelSetArgMemObj>(
Kernel, NextTrueIndex,
2287 &MemObjData, MemArg);
2292 Plugin->call<UrApiKind::urKernelSetArgValue>(
2295 Plugin->call<UrApiKind::urKernelSetArgLocal>(
Kernel, NextTrueIndex,
2296 Arg.
MSize,
nullptr);
2302 sampler *SamplerPtr = (sampler *)Arg.
MPtr;
2303 ur_sampler_handle_t Sampler =
2305 ->getOrCreateSampler(Context);
2306 Plugin->call<UrApiKind::urKernelSetArgSampler>(
Kernel, NextTrueIndex,
2313 const void *Ptr = *
static_cast<const void *
const *
>(Arg.
MPtr);
2314 Plugin->call<UrApiKind::urKernelSetArgPointer>(
Kernel, NextTrueIndex,
2319 assert(DeviceImageImpl !=
nullptr);
2320 ur_mem_handle_t SpecConstsBuffer =
2321 DeviceImageImpl->get_spec_const_buffer_ref();
2323 ur_kernel_arg_mem_obj_properties_t MemObjProps{};
2324 MemObjProps.pNext =
nullptr;
2325 MemObjProps.stype = UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES;
2326 MemObjProps.memoryAccess = UR_MEM_FLAG_READ_ONLY;
2327 Plugin->call<UrApiKind::urKernelSetArgMemObj>(
2328 Kernel, NextTrueIndex, &MemObjProps, SpecConstsBuffer);
2333 "Invalid kernel param kind " +
2341 const std::shared_ptr<device_image_impl> &DeviceImageImpl,
2342 ur_kernel_handle_t Kernel,
NDRDescT &NDRDesc,
2343 std::vector<ur_event_handle_t> &RawEvents,
2346 const std::function<
void *(
Requirement *Req)> &getMemAllocationFunc,
2347 bool IsCooperative,
bool KernelUsesClusterLaunch,
2349 assert(Queue &&
"Kernel submissions should have an associated queue");
2350 const PluginPtr &Plugin = Queue->getPlugin();
2353 std::vector<unsigned char> Empty;
2355 Queue, BinImage, KernelName,
2356 DeviceImageImpl.
get() ? DeviceImageImpl->get_spec_const_blob_ref()
2360 auto setFunc = [&Plugin,
Kernel, &DeviceImageImpl, &getMemAllocationFunc,
2363 Queue->get_context(), Arg, NextTrueIndex);
2371 const bool HasLocalSize = (NDRDesc.
LocalSize[0] != 0);
2375 size_t RequiredWGSize[3] = {0, 0, 0};
2376 size_t *LocalSize =
nullptr;
2381 Plugin->call<UrApiKind::urKernelGetGroupInfo>(
2382 Kernel, Queue->getDeviceImplPtr()->getHandleRef(),
2383 UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE,
sizeof(RequiredWGSize),
2387 const bool EnforcedLocalSize =
2388 (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
2389 RequiredWGSize[2] != 0);
2390 if (EnforcedLocalSize)
2391 LocalSize = RequiredWGSize;
2393 if (OutEventImpl !=
nullptr)
2394 OutEventImpl->setHostEnqueueTime();
2395 if (KernelUsesClusterLaunch) {
2398 ur_exp_launch_property_value_t launch_property_value_cluster_range;
2399 launch_property_value_cluster_range.clusterDim[0] =
2401 launch_property_value_cluster_range.clusterDim[1] =
2403 launch_property_value_cluster_range.clusterDim[2] =
2406 property_list.push_back({UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION,
2407 launch_property_value_cluster_range});
2409 if (IsCooperative) {
2410 ur_exp_launch_property_value_t launch_property_value_cooperative;
2411 launch_property_value_cooperative.cooperative = 1;
2412 property_list.push_back({UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE,
2413 launch_property_value_cooperative});
2416 ur_event_handle_t UREvent =
nullptr;
2418 Plugin->call_nocheck<UrApiKind::urEnqueueKernelLaunchCustomExp>(
2421 RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0],
2422 OutEventImpl ? &UREvent :
nullptr);
2424 OutEventImpl->setHandle(UREvent);
2428 ur_event_handle_t UREvent =
nullptr;
2431 if (IsCooperative) {
2433 ->call_nocheck<UrApiKind::urEnqueueCooperativeKernelLaunchExp>(
2436 return Plugin->call_nocheck<UrApiKind::urEnqueueKernelLaunch>(Args...);
2438 &NDRDesc.
GlobalSize[0], LocalSize, RawEvents.size(),
2439 RawEvents.empty() ? nullptr : &RawEvents[0],
2440 OutEventImpl ? &UREvent :
nullptr);
2441 if (Error == UR_RESULT_SUCCESS && OutEventImpl) {
2442 OutEventImpl->setHandle(UREvent);
2450 ur_exp_command_buffer_handle_t CommandBuffer,
2452 std::vector<ur_exp_command_buffer_sync_point_t> &SyncPoints,
2453 ur_exp_command_buffer_sync_point_t *OutSyncPoint,
2454 ur_exp_command_buffer_command_handle_t *OutCommand,
2455 const std::function<
void *(
Requirement *Req)> &getMemAllocationFunc) {
2458 ur_kernel_handle_t UrKernel =
nullptr;
2459 ur_program_handle_t UrProgram =
nullptr;
2460 std::shared_ptr<kernel_impl> SyclKernelImpl =
nullptr;
2461 std::shared_ptr<device_image_impl> DeviceImageImpl =
nullptr;
2479 UrKernel = SyclKernelImpl->getHandleRef();
2480 DeviceImageImpl = SyclKernelImpl->getDeviceImage();
2481 UrProgram = DeviceImageImpl->get_ur_program_ref();
2482 EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
2483 }
else if (
Kernel !=
nullptr) {
2484 UrKernel =
Kernel->getHandleRef();
2485 UrProgram =
Kernel->getProgramRef();
2486 EliminatedArgMask =
Kernel->getKernelArgMask();
2488 std::tie(UrKernel, std::ignore, EliminatedArgMask, UrProgram) =
2489 sycl::detail::ProgramManager::getInstance().getOrCreateKernel(
2490 ContextImpl, DeviceImpl, CommandGroup.
MKernelName);
2493 auto SetFunc = [&Plugin, &UrKernel, &DeviceImageImpl, &Ctx,
2494 &getMemAllocationFunc](sycl::detail::ArgDesc &Arg,
2495 size_t NextTrueIndex) {
2497 getMemAllocationFunc, Ctx, Arg,
2501 auto Args = CommandGroup.
MArgs;
2508 auto NDRDesc = CommandGroup.
MNDRDesc;
2512 size_t RequiredWGSize[3] = {0, 0, 0};
2513 size_t *LocalSize =
nullptr;
2518 Plugin->call<UrApiKind::urKernelGetGroupInfo>(
2519 UrKernel, DeviceImpl->getHandleRef(),
2520 UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE,
sizeof(RequiredWGSize),
2524 const bool EnforcedLocalSize =
2525 (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
2526 RequiredWGSize[2] != 0);
2527 if (EnforcedLocalSize)
2528 LocalSize = RequiredWGSize;
2532 Plugin->call_nocheck<UrApiKind::urCommandBufferAppendKernelLaunchExp>(
2534 &NDRDesc.
GlobalSize[0], LocalSize, SyncPoints.size(),
2535 SyncPoints.size() ? SyncPoints.data() :
nullptr, OutSyncPoint,
2538 if (!SyclKernelImpl && !
Kernel) {
2539 Plugin->call<UrApiKind::urKernelRelease>(UrKernel);
2540 Plugin->call<UrApiKind::urProgramRelease>(UrProgram);
2543 if (Res != UR_RESULT_SUCCESS) {
2555 const std::shared_ptr<detail::kernel_impl> &MSyclKernel,
2556 const std::string &KernelName, std::vector<ur_event_handle_t> &RawEvents,
2558 const std::function<
void *(
Requirement *Req)> &getMemAllocationFunc,
2559 ur_kernel_cache_config_t KernelCacheConfig,
const bool KernelIsCooperative,
2561 assert(Queue &&
"Kernel submissions should have an associated queue");
2563 auto ContextImpl = Queue->getContextImplPtr();
2564 auto DeviceImpl = Queue->getDeviceImplPtr();
2565 ur_kernel_handle_t
Kernel =
nullptr;
2566 std::mutex *KernelMutex =
nullptr;
2567 ur_program_handle_t Program =
nullptr;
2570 std::shared_ptr<kernel_impl> SyclKernelImpl;
2571 std::shared_ptr<device_image_impl> DeviceImageImpl;
2586 Kernel = SyclKernelImpl->getHandleRef();
2587 DeviceImageImpl = SyclKernelImpl->getDeviceImage();
2589 Program = DeviceImageImpl->get_ur_program_ref();
2591 EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
2592 KernelMutex = SyclKernelImpl->getCacheMutex();
2593 }
else if (
nullptr != MSyclKernel) {
2594 assert(MSyclKernel->get_info<info::kernel::context>() ==
2595 Queue->get_context());
2596 Kernel = MSyclKernel->getHandleRef();
2597 Program = MSyclKernel->getProgramRef();
2605 KernelMutex = &MSyclKernel->getNoncacheableEnqueueMutex();
2606 EliminatedArgMask = MSyclKernel->getKernelArgMask();
2610 ContextImpl, DeviceImpl, KernelName, NDRDesc);
2614 std::vector<ur_event_handle_t> &EventsWaitList = RawEvents;
2617 std::vector<ur_event_handle_t> DeviceGlobalInitEvents =
2618 ContextImpl->initializeDeviceGlobals(Program, Queue);
2619 std::vector<ur_event_handle_t> EventsWithDeviceGlobalInits;
2620 if (!DeviceGlobalInitEvents.empty()) {
2621 EventsWithDeviceGlobalInits.reserve(RawEvents.size() +
2622 DeviceGlobalInitEvents.size());
2623 EventsWithDeviceGlobalInits.insert(EventsWithDeviceGlobalInits.end(),
2624 RawEvents.begin(), RawEvents.end());
2625 EventsWithDeviceGlobalInits.insert(EventsWithDeviceGlobalInits.end(),
2626 DeviceGlobalInitEvents.begin(),
2627 DeviceGlobalInitEvents.end());
2628 EventsWaitList = EventsWithDeviceGlobalInits;
2631 ur_result_t Error = UR_RESULT_SUCCESS;
2636 using LockT = std::unique_lock<std::mutex>;
2637 auto Lock = KernelMutex ? LockT(*KernelMutex) : LockT();
2641 if (KernelCacheConfig == UR_KERNEL_CACHE_CONFIG_LARGE_SLM ||
2642 KernelCacheConfig == UR_KERNEL_CACHE_CONFIG_LARGE_DATA) {
2643 const PluginPtr &Plugin = Queue->getPlugin();
2644 Plugin->call<UrApiKind::urKernelSetExecInfo>(
2645 Kernel, UR_KERNEL_EXEC_INFO_CACHE_CONFIG,
2646 sizeof(ur_kernel_cache_config_t),
nullptr, &KernelCacheConfig);
2650 Queue, Args, DeviceImageImpl,
Kernel, NDRDesc, EventsWaitList,
2651 OutEventImpl, EliminatedArgMask, getMemAllocationFunc,
2652 KernelIsCooperative, KernelUsesClusterLaunch, BinImage, KernelName);
2654 const PluginPtr &Plugin = Queue->getPlugin();
2655 if (!SyclKernelImpl && !MSyclKernel) {
2656 Plugin->call<UrApiKind::urKernelRelease>(
Kernel);
2657 Plugin->call<UrApiKind::urProgramRelease>(Program);
2660 if (UR_RESULT_SUCCESS != Error) {
2663 const device_impl &DeviceImpl = *(Queue->getDeviceImplPtr());
2670 const std::string &PipeName,
bool blocking,
2671 void *ptr,
size_t size,
2672 std::vector<ur_event_handle_t> &RawEvents,
2676 "ReadWrite host pipe submissions should have an associated queue");
2680 ur_program_handle_t Program =
nullptr;
2681 device Device = Queue->get_device();
2683 std::optional<ur_program_handle_t> CachedProgram =
2684 ContextImpl->getProgramForHostPipe(Device, hostPipeEntry);
2686 Program = *CachedProgram;
2692 Queue->get_device());
2697 assert(Program &&
"Program for this hostpipe is not compiled.");
2699 const PluginPtr &Plugin = Queue->getPlugin();
2701 ur_queue_handle_t ur_q = Queue->getHandleRef();
2704 ur_event_handle_t UREvent =
nullptr;
2705 auto OutEvent = OutEventImpl ? &UREvent :
nullptr;
2706 if (OutEventImpl !=
nullptr)
2707 OutEventImpl->setHostEnqueueTime();
2709 Error = Plugin->call_nocheck<UrApiKind::urEnqueueReadHostPipe>(
2710 ur_q, Program, PipeName.c_str(), blocking, ptr, size, RawEvents.size(),
2711 RawEvents.empty() ? nullptr : &RawEvents[0], OutEvent);
2713 Error = Plugin->call_nocheck<UrApiKind::urEnqueueWriteHostPipe>(
2714 ur_q, Program, PipeName.c_str(), blocking, ptr, size, RawEvents.size(),
2715 RawEvents.empty() ? nullptr : &RawEvents[0], OutEvent);
2717 if (Error == UR_RESULT_SUCCESS && OutEventImpl) {
2718 OutEventImpl->setHandle(UREvent);
2723 ur_result_t ExecCGCommand::enqueueImpCommandBuffer() {
2724 assert(
MQueue &&
"Command buffer enqueue should have an associated queue");
2733 std::vector<ur_event_handle_t> RawEvents =
getUrEvents(EventImpls);
2734 if (!RawEvents.empty()) {
2735 MQueue->getPlugin()->call<UrApiKind::urEventWait>(RawEvents.size(),
2739 ur_exp_command_buffer_sync_point_t OutSyncPoint;
2740 ur_exp_command_buffer_command_handle_t OutCommand =
nullptr;
2741 switch (MCommandGroup->getType()) {
2743 CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
2745 auto getMemAllocationFunc = [
this](
Requirement *Req) {
2746 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2747 return AllocaCmd->getMemAllocation();
2753 getMemAllocationFunc);
2754 MEvent->setSyncPoint(OutSyncPoint);
2755 MEvent->setCommandBufferCommand(OutCommand);
2759 CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
2762 Copy->getLength(), Copy->getDst(),
MSyncPointDeps, &OutSyncPoint);
2763 MEvent->setSyncPoint(OutSyncPoint);
2764 return UR_RESULT_SUCCESS;
2767 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2771 AllocaCommandBase *AllocaCmdSrc = getAllocaForReq(ReqSrc);
2772 AllocaCommandBase *AllocaCmdDst = getAllocaForReq(ReqDst);
2776 AllocaCmdSrc->getSYCLMemObj(), AllocaCmdSrc->getMemAllocation(),
2777 ReqSrc->MDims, ReqSrc->MMemoryRange, ReqSrc->MAccessRange,
2778 ReqSrc->MOffset, ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(),
2779 ReqDst->MDims, ReqDst->MMemoryRange, ReqDst->MAccessRange,
2782 MEvent->setSyncPoint(OutSyncPoint);
2783 return UR_RESULT_SUCCESS;
2786 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2788 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2792 AllocaCmd->getMemAllocation(), Req->MDims, Req->MMemoryRange,
2793 Req->MAccessRange, Req->MOffset, Req->MElemSize, (
char *)Copy->getDst(),
2794 Req->MDims, Req->MAccessRange,
2797 MEvent->setSyncPoint(OutSyncPoint);
2798 return UR_RESULT_SUCCESS;
2801 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2803 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2807 (
char *)Copy->getSrc(), Req->MDims, Req->MAccessRange,
2808 {0, 0, 0}, Req->MElemSize, AllocaCmd->getMemAllocation(),
2809 Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2811 MEvent->setSyncPoint(OutSyncPoint);
2812 return UR_RESULT_SUCCESS;
2815 CGFill *
Fill = (CGFill *)MCommandGroup.get();
2817 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2821 AllocaCmd->getMemAllocation(),
Fill->MPattern.size(),
2822 Fill->MPattern.data(), Req->MDims, Req->MMemoryRange, Req->MAccessRange,
2823 Req->MOffset, Req->MElemSize, std::move(
MSyncPointDeps), &OutSyncPoint);
2824 MEvent->setSyncPoint(OutSyncPoint);
2825 return UR_RESULT_SUCCESS;
2828 CGFillUSM *
Fill = (CGFillUSM *)MCommandGroup.get();
2833 MEvent->setSyncPoint(OutSyncPoint);
2834 return UR_RESULT_SUCCESS;
2837 CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get();
2840 Prefetch->getLength(), std::move(
MSyncPointDeps), &OutSyncPoint);
2841 MEvent->setSyncPoint(OutSyncPoint);
2842 return UR_RESULT_SUCCESS;
2845 CGAdviseUSM *Advise = (CGAdviseUSM *)MCommandGroup.get();
2848 Advise->getLength(), Advise->getAdvice(), std::move(
MSyncPointDeps),
2850 MEvent->setSyncPoint(OutSyncPoint);
2851 return UR_RESULT_SUCCESS;
2856 "CG type not implemented for command buffers.");
2860 ur_result_t ExecCGCommand::enqueueImp() {
2862 return enqueueImpCommandBuffer();
2864 return enqueueImpQueue();
2868 ur_result_t ExecCGCommand::enqueueImpQueue() {
2880 MQueue->supportsDiscardingPiEvents() &&
2881 MCommandGroup->getRequirements().size() == 0;
2883 ur_event_handle_t UREvent =
nullptr;
2884 ur_event_handle_t *Event = DiscardUrEvent ? nullptr : &UREvent;
2887 switch (MCommandGroup->getType()) {
2891 "Update host should be handled by the Scheduler. " +
2895 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2897 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2900 AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(),
MQueue,
2901 Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2902 Req->MElemSize, Copy->getDst(),
nullptr, Req->MDims, Req->MAccessRange,
2903 Req->MAccessRange, {0, 0, 0}, Req->MElemSize,
2904 std::move(RawEvents), UREvent,
MEvent);
2905 MEvent->setHandle(UREvent);
2907 return UR_RESULT_SUCCESS;
2910 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2912 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2915 Req->MDims, Req->MAccessRange, Req->MAccessRange,
2916 {0, 0, 0}, Req->MElemSize,
2917 AllocaCmd->getMemAllocation(),
MQueue, Req->MDims,
2918 Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2919 Req->MElemSize, std::move(RawEvents), UREvent,
MEvent);
2920 MEvent->setHandle(UREvent);
2921 return UR_RESULT_SUCCESS;
2924 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2928 AllocaCommandBase *AllocaCmdSrc = getAllocaForReq(ReqSrc);
2929 AllocaCommandBase *AllocaCmdDst = getAllocaForReq(ReqDst);
2932 AllocaCmdSrc->getSYCLMemObj(), AllocaCmdSrc->getMemAllocation(),
MQueue,
2933 ReqSrc->MDims, ReqSrc->MMemoryRange, ReqSrc->MAccessRange,
2934 ReqSrc->MOffset, ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(),
2935 MQueue, ReqDst->MDims, ReqDst->MMemoryRange, ReqDst->MAccessRange,
2936 ReqDst->MOffset, ReqDst->MElemSize, std::move(RawEvents), UREvent,
2938 MEvent->setHandle(UREvent);
2939 return UR_RESULT_SUCCESS;
2942 CGFill *
Fill = (CGFill *)MCommandGroup.get();
2944 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2947 AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(),
MQueue,
2948 Fill->MPattern.size(),
Fill->MPattern.data(), Req->MDims,
2949 Req->MMemoryRange, Req->MAccessRange, Req->MOffset, Req->MElemSize,
2950 std::move(RawEvents), UREvent,
MEvent);
2951 MEvent->setHandle(UREvent);
2952 return UR_RESULT_SUCCESS;
2955 assert(
MQueue &&
"Kernel submissions should have an associated queue");
2956 CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
2958 NDRDescT &NDRDesc = ExecKernel->MNDRDesc;
2959 std::vector<ArgDesc> &Args = ExecKernel->MArgs;
2961 auto getMemAllocationFunc = [
this](
Requirement *Req) {
2962 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2965 return AllocaCmd ? AllocaCmd->getMemAllocation() :
nullptr;
2968 const std::shared_ptr<detail::kernel_impl> &SyclKernel =
2969 ExecKernel->MSyclKernel;
2970 const std::string &KernelName = ExecKernel->MKernelName;
2974 bool KernelUsesAssert =
2975 !(SyclKernel && SyclKernel->isInterop()) &&
2977 if (KernelUsesAssert) {
2982 const RTDeviceBinaryImage *BinImage =
nullptr;
2986 assert(BinImage &&
"Failed to obtain a binary image.");
2989 SyclKernel, KernelName, RawEvents, EventImpl,
2990 getMemAllocationFunc, ExecKernel->MKernelCacheConfig,
2991 ExecKernel->MKernelIsCooperative,
2992 ExecKernel->MKernelUsesClusterLaunch, BinImage);
2994 return UR_RESULT_SUCCESS;
2997 CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
2999 Copy->getDst(), std::move(RawEvents), Event,
3002 MEvent->setHandle(*Event);
3003 return UR_RESULT_SUCCESS;
3006 CGFillUSM *
Fill = (CGFillUSM *)MCommandGroup.get();
3008 Fill->getPattern(), std::move(RawEvents), Event,
3011 MEvent->setHandle(*Event);
3012 return UR_RESULT_SUCCESS;
3015 CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get();
3017 Prefetch->getLength(), std::move(RawEvents),
3020 MEvent->setHandle(*Event);
3021 return UR_RESULT_SUCCESS;
3024 CGAdviseUSM *Advise = (CGAdviseUSM *)MCommandGroup.get();
3026 Advise->getAdvice(), std::move(RawEvents), Event,
3029 MEvent->setHandle(*Event);
3030 return UR_RESULT_SUCCESS;
3033 CGCopy2DUSM *Copy = (CGCopy2DUSM *)MCommandGroup.get();
3035 Copy->getDst(), Copy->getDstPitch(),
3036 Copy->getWidth(), Copy->getHeight(),
3037 std::move(RawEvents), Event,
MEvent);
3039 MEvent->setHandle(*Event);
3040 return UR_RESULT_SUCCESS;
3043 CGFill2DUSM *
Fill = (CGFill2DUSM *)MCommandGroup.get();
3045 Fill->getWidth(),
Fill->getHeight(),
3046 Fill->getPattern(), std::move(RawEvents), Event,
3049 MEvent->setHandle(*Event);
3050 return UR_RESULT_SUCCESS;
3053 CGMemset2DUSM *Memset = (CGMemset2DUSM *)MCommandGroup.get();
3055 Memset->getWidth(), Memset->getHeight(),
3056 Memset->getValue(), std::move(RawEvents),
3059 MEvent->setHandle(*Event);
3060 return UR_RESULT_SUCCESS;
3063 CGHostTask *HostTask =
static_cast<CGHostTask *
>(MCommandGroup.get());
3065 for (ArgDesc &Arg : HostTask->MArgs) {
3066 switch (Arg.MType) {
3069 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
3072 Req->MData = AllocaCmd->getMemAllocation();
3077 "Unsupported arg type " +
3082 std::vector<interop_handle::ReqToMem> ReqToMem;
3083 std::vector<ur_mem_handle_t> ReqUrMem;
3085 if (HostTask->MHostTask->isInteropTask()) {
3088 const std::vector<Requirement *> &HandlerReq =
3089 HostTask->getRequirements();
3090 auto ReqToMemConv = [&ReqToMem, &ReqUrMem, HostTask](
Requirement *Req) {
3091 const std::vector<AllocaCommandBase *> &AllocaCmds =
3092 Req->MSYCLMemObj->MRecord->MAllocaCommands;
3094 for (AllocaCommandBase *AllocaCmd : AllocaCmds)
3097 auto MemArg =
reinterpret_cast<ur_mem_handle_t
>(
3098 AllocaCmd->getMemAllocation());
3099 ReqToMem.emplace_back(std::make_pair(Req, MemArg));
3100 ReqUrMem.emplace_back(MemArg);
3106 "Can't get memory object due to no allocation available");
3110 "Can't get memory object due to no allocation available " +
3113 std::for_each(std::begin(HandlerReq), std::end(HandlerReq), ReqToMemConv);
3114 std::sort(std::begin(ReqToMem), std::end(ReqToMem));
3126 return UR_RESULT_SUCCESS;
3129 CGHostTask *HostTask =
static_cast<CGHostTask *
>(MCommandGroup.get());
3131 for (ArgDesc &Arg : HostTask->MArgs) {
3132 switch (Arg.MType) {
3135 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
3138 Req->MData = AllocaCmd->getMemAllocation();
3143 "Unsupported arg type ");
3147 std::vector<interop_handle::ReqToMem> ReqToMem;
3148 std::vector<ur_mem_handle_t> ReqMems;
3150 if (HostTask->MHostTask->isInteropTask()) {
3153 const std::vector<Requirement *> &HandlerReq =
3154 HostTask->getRequirements();
3155 auto ReqToMemConv = [&ReqToMem, &ReqMems, HostTask](
Requirement *Req) {
3156 const std::vector<AllocaCommandBase *> &AllocaCmds =
3157 Req->MSYCLMemObj->MRecord->MAllocaCommands;
3159 for (AllocaCommandBase *AllocaCmd : AllocaCmds)
3162 auto MemArg =
reinterpret_cast<ur_mem_handle_t
>(
3163 AllocaCmd->getMemAllocation());
3164 ReqToMem.emplace_back(std::make_pair(Req, MemArg));
3165 ReqMems.emplace_back(MemArg);
3171 "Can't get memory object due to no allocation available");
3175 "Can't get memory object due to no allocation available " +
3178 std::for_each(std::begin(HandlerReq), std::end(HandlerReq), ReqToMemConv);
3179 std::sort(std::begin(ReqToMem), std::end(ReqToMem));
3182 EnqueueNativeCommandData CustomOpData{
3183 interop_handle{ReqToMem, HostTask->MQueue,
3184 HostTask->MQueue->getDeviceImplPtr(),
3185 HostTask->MQueue->getContextImplPtr()},
3186 HostTask->MHostTask->MInteropTask};
3188 ur_bool_t NativeCommandSupport =
false;
3189 MQueue->getPlugin()->call<UrApiKind::urDeviceGetInfo>(
3191 UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP,
3192 sizeof(NativeCommandSupport), &NativeCommandSupport,
nullptr);
3193 assert(NativeCommandSupport &&
"ext_codeplay_enqueue_native_command is not "
3194 "supported on this device");
3195 MQueue->getPlugin()->call<UrApiKind::urEnqueueNativeCommandExp>(
3196 MQueue->getHandleRef(), InteropFreeFunc, &CustomOpData, ReqMems.size(),
3197 ReqMems.data(),
nullptr, RawEvents.size(), RawEvents.data(), Event);
3199 MEvent->setHandle(*Event);
3200 return UR_RESULT_SUCCESS;
3203 assert(
MQueue &&
"Barrier submission should have an associated queue");
3206 MEvent->setHostEnqueueTime();
3207 Plugin->call<UrApiKind::urEnqueueEventsWaitWithBarrier>(
3208 MQueue->getHandleRef(), 0,
nullptr, Event);
3210 MEvent->setHandle(*Event);
3211 return UR_RESULT_SUCCESS;
3214 assert(
MQueue &&
"Barrier submission should have an associated queue");
3215 CGBarrier *
Barrier =
static_cast<CGBarrier *
>(MCommandGroup.get());
3216 std::vector<detail::EventImplPtr> Events =
Barrier->MEventsWaitWithBarrier;
3218 if (UrEvents.empty()) {
3220 return UR_RESULT_SUCCESS;
3224 MEvent->setHostEnqueueTime();
3225 Plugin->call<UrApiKind::urEnqueueEventsWaitWithBarrier>(
3226 MQueue->getHandleRef(), UrEvents.size(), &UrEvents[0], Event);
3228 MEvent->setHandle(*Event);
3229 return UR_RESULT_SUCCESS;
3232 assert(
MQueue &&
"Profiling tag requires a valid queue");
3233 const auto &Plugin =
MQueue->getPlugin();
3237 if (!
MQueue->isInOrder()) {
3241 ur_event_handle_t PreTimestampBarrierEvent{};
3242 Plugin->call<UrApiKind::urEnqueueEventsWaitWithBarrier>(
3245 nullptr, &PreTimestampBarrierEvent);
3246 Plugin->call<UrApiKind::urEventRelease>(PreTimestampBarrierEvent);
3249 Plugin->call<UrApiKind::urEnqueueTimestampRecordingExp>(
3254 MEvent->setHandle(*Event);
3255 return UR_RESULT_SUCCESS;
3258 CGCopyToDeviceGlobal *Copy = (CGCopyToDeviceGlobal *)MCommandGroup.get();
3260 Copy->getDeviceGlobalPtr(), Copy->isDeviceImageScoped(),
MQueue,
3261 Copy->getNumBytes(), Copy->getOffset(), Copy->getSrc(),
3262 std::move(RawEvents), Event,
MEvent);
3264 MEvent->setHandle(*Event);
3265 return UR_RESULT_SUCCESS;
3268 CGCopyFromDeviceGlobal *Copy =
3269 (CGCopyFromDeviceGlobal *)MCommandGroup.get();
3271 Copy->getDeviceGlobalPtr(), Copy->isDeviceImageScoped(),
MQueue,
3272 Copy->getNumBytes(), Copy->getOffset(), Copy->getDest(),
3273 std::move(RawEvents), Event,
MEvent);
3275 MEvent->setHandle(*Event);
3276 return UR_RESULT_SUCCESS;
3279 CGReadWriteHostPipe *ExecReadWriteHostPipe =
3280 (CGReadWriteHostPipe *)MCommandGroup.get();
3281 std::string pipeName = ExecReadWriteHostPipe->getPipeName();
3282 void *hostPtr = ExecReadWriteHostPipe->getHostPtr();
3283 size_t typeSize = ExecReadWriteHostPipe->getTypeSize();
3284 bool blocking = ExecReadWriteHostPipe->isBlocking();
3285 bool read = ExecReadWriteHostPipe->isReadHostPipe();
3291 typeSize, RawEvents, EventImpl, read);
3295 "Command buffer submissions should have an associated queue");
3296 CGExecCommandBuffer *CmdBufferCG =
3297 static_cast<CGExecCommandBuffer *
>(MCommandGroup.get());
3299 MEvent->setHostEnqueueTime();
3301 MQueue->getPlugin()->call_nocheck<UrApiKind::urCommandBufferEnqueueExp>(
3302 CmdBufferCG->MCommandBuffer,
MQueue->getHandleRef(),
3303 RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0],
3306 MEvent->setHandle(*Event);
3311 CGCopyImage *Copy = (CGCopyImage *)MCommandGroup.get();
3314 MQueue, Copy->getSrc(), Copy->getDst(), Copy->getSrcDesc(),
3315 Copy->getDstDesc(), Copy->getSrcFormat(), Copy->getDstFormat(),
3316 Copy->getCopyFlags(), Copy->getSrcOffset(), Copy->getDstOffset(),
3317 Copy->getCopyExtent(), std::move(RawEvents), Event);
3319 MEvent->setHandle(*Event);
3321 return UR_RESULT_SUCCESS;
3325 "Semaphore wait submissions should have an associated queue");
3326 CGSemaphoreWait *SemWait = (CGSemaphoreWait *)MCommandGroup.get();
3328 auto OptWaitValue = SemWait->getWaitValue();
3329 uint64_t WaitValue = OptWaitValue.has_value() ? OptWaitValue.value() : 0;
3330 Plugin->call<UrApiKind::urBindlessImagesWaitExternalSemaphoreExp>(
3331 MQueue->getHandleRef(), SemWait->getExternalSemaphore(),
3332 OptWaitValue.has_value(), WaitValue, 0,
nullptr,
nullptr);
3334 return UR_RESULT_SUCCESS;
3338 "Semaphore signal submissions should have an associated queue");
3339 CGSemaphoreSignal *SemSignal = (CGSemaphoreSignal *)MCommandGroup.get();
3341 auto OptSignalValue = SemSignal->getSignalValue();
3342 uint64_t SignalValue =
3343 OptSignalValue.has_value() ? OptSignalValue.value() : 0;
3344 Plugin->call<UrApiKind::urBindlessImagesSignalExternalSemaphoreExp>(
3345 MQueue->getHandleRef(), SemSignal->getExternalSemaphore(),
3346 OptSignalValue.has_value(), SignalValue, 0,
nullptr,
nullptr);
3348 return UR_RESULT_SUCCESS;
3352 "CG type not implemented. " +
3355 return UR_RESULT_ERROR_INVALID_OPERATION;
3378 std::vector<std::shared_ptr<ext::oneapi::experimental::detail::node_impl>>
3383 ur_result_t UpdateCommandBufferCommand::enqueueImp() {
3386 ur_event_handle_t UREvent =
nullptr;
3388 MEvent->setHandle(UREvent);
3390 for (
auto &Node : MNodes) {
3391 auto CG =
static_cast<CGExecKernel *
>(Node->MCommandGroup.get());
3392 for (
auto &Arg :
CG->MArgs) {
3397 for (
const DepDesc &Dep :
MDeps) {
3398 Requirement *Req =
static_cast<AccessorImplHost *
>(Arg.MPtr);
3399 if (Dep.MDepRequirement == Req) {
3400 if (Dep.MAllocaCmd) {
3401 Req->
MData = Dep.MAllocaCmd->getMemAllocation();
3404 "No allocation available for accessor when "
3405 "updating command buffer!");
3413 return UR_RESULT_SUCCESS;
3417 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#8d8f29\", label=\"";
3419 Stream <<
"ID = " <<
this <<
"\\n";
3420 Stream <<
"CommandBuffer Command Update"
3423 Stream <<
"\"];" << std::endl;
3425 for (
const auto &Dep :
MDeps) {
3426 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
3427 <<
" [ label = \"Access mode: "
3429 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
The context class represents a SYCL context on which kernel functions may be executed.
detail::SYCLMemObjI * MSYCLMemObj
range< 3 > & MAccessRange
range< 3 > & MMemoryRange
Base class for memory allocation commands.
const Requirement * getRequirement() const final
AllocaCommandBase * MLinkedAllocaCmd
Alloca command linked with current command.
bool readyForCleanup() const final
Returns true iff this command is ready to be submitted for cleanup.
virtual void * getMemAllocation() const =0
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
bool MIsActive
Indicates that current alloca is active one.
bool MIsLeaderAlloca
Indicates that the command owns memory allocation in case of connected alloca command.
AllocaCommandBase(CommandType Type, QueueImplPtr Queue, Requirement Req, AllocaCommandBase *LinkedAllocaCmd, bool IsConst)
bool supportsPostEnqueueCleanup() const final
Returns true iff this command can be freed by post enqueue cleanup.
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
SYCLMemObjI * getSYCLMemObj() const
void printDot(std::ostream &Stream) const final
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
AllocaCommand(QueueImplPtr Queue, Requirement Req, bool InitFromUserData=true, AllocaCommandBase *LinkedAllocaCmd=nullptr, bool IsConst=false)
void * getMemAllocation() const final
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
void printDot(std::ostream &Stream) const final
AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req, AllocaCommandBase *ParentAlloca, std::vector< Command * > &ToEnqueue, std::vector< Command * > &ToCleanUp)
sycl::detail::kernel_param_kind_t MType
"Execute kernel" command group class.
std::shared_ptr< detail::kernel_bundle_impl > MKernelBundle
std::vector< ArgDesc > MArgs
bool MKernelIsCooperative
NDRDescT MNDRDesc
Stores ND-range description.
std::shared_ptr< detail::kernel_impl > MSyclKernel
bool MKernelUsesClusterLaunch
std::shared_ptr< detail::queue_impl > MQueue
Base class for all types of command groups.
The Command class represents some action that needs to be performed on one or more memory objects.
bool MShouldCompleteEventIfPossible
void copySubmissionCodeLocation()
virtual ur_result_t enqueueImp()=0
Private interface. Derived classes should implement this method.
Command * processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, std::vector< Command * > &ToCleanUp)
Perform glueing of events from different contexts.
void * MTraceEvent
The event for node_create and task_begin.
CommandType MType
The type of the command.
virtual bool producesPiEvent() const
Returns true iff the command produces a UR event on non-host devices.
int32_t MStreamID
The stream under which the traces are emitted.
void emitInstrumentation(uint16_t Type, const char *Txt=nullptr)
Emits an event of Type.
virtual void emitInstrumentationData()=0
Instrumentation method which emits telemetry data.
void resolveReleaseDependencies(std::set< Command * > &list)
Looks at all the dependencies for the release command and enables instrumentation to report these dep...
std::string MCommandName
Buffer to build the command end-user understandable name.
bool MMarkedForCleanup
Indicates that the node will be freed by graph cleanup.
unsigned MLeafCounter
Counts the number of memory objects this command is a leaf for.
std::vector< ur_exp_command_buffer_sync_point_t > MSyncPointDeps
List of sync points for submissions to a command buffer.
virtual bool enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, std::vector< Command * > &ToCleanUp)
Checks if the command is enqueued, and calls enqueueImp.
void waitForEvents(QueueImplPtr Queue, std::vector< EventImplPtr > &RawEvents, ur_event_handle_t &Event)
void waitForPreparedHostEvents() const
std::string MSubmissionFileName
Introduces string to handle memory management since code_location struct works with raw char arrays.
std::vector< ur_event_handle_t > getUrEvents(const std::vector< EventImplPtr > &EventImpls) const
Collect UR events from EventImpls and filter out some of them in case of in order queue.
std::mutex MEnqueueMtx
Mutex used to protect enqueueing from race conditions.
void emitInstrumentationDataProxy()
Proxy method which calls emitInstrumentationData.
code_location MSubmissionCodeLocation
Represents code location of command submission to SYCL API, assigned with the valid value only if com...
void makeTraceEventEpilog()
If prolog has been run, run epilog; this must be guarded by a check for xptiTraceEnabled().
std::vector< EventImplPtr > & MPreparedHostDepsEvents
void emitEnqueuedEventSignal(const ur_event_handle_t UrEventAddr)
Creates a signal event with the enqueued kernel event handle.
std::vector< EventImplPtr > & MPreparedDepsEvents
Dependency events prepared for waiting by backend.
std::string MSubmissionFunctionName
uint64_t MInstanceID
Instance ID tracked for the command.
std::vector< DepDesc > MDeps
Contains list of dependencies(edges)
const char * getBlockReason() const
std::vector< ur_event_handle_t > getUrEventsBlocking(const std::vector< EventImplPtr > &EventImpls) const
Collect UR events from EventImpls and filter out some of them in case of in order queue.
virtual bool readyForCleanup() const
Returns true iff this command is ready to be submitted for cleanup.
void addUser(Command *NewUser)
virtual ContextImplPtr getWorkerContext() const
Get the context of the queue this command will be submitted to.
std::atomic< EnqueueResultT::ResultT > MEnqueueStatus
Describes the status of the command.
const EventImplPtr & getEvent() const
ur_exp_command_buffer_handle_t MCommandBuffer
CommandBuffer which will be used to submit to instead of the queue, if set.
uint64_t makeTraceEventProlog(void *MAddress)
Create a trace event of node_create type; this must be guarded by a check for xptiTraceEnabled().
friend class DispatchHostTask
CommandType getType() const
void * MAddress
Reserved for storing the object address such as SPIR-V or memory object address.
std::string MAddressString
Buffer to build the address string.
QueueImplPtr MWorkerQueue
bool MIsBlockable
Indicates whether the command can be blocked from enqueueing.
virtual bool supportsPostEnqueueCleanup() const
Returns true iff this command can be freed by post enqueue cleanup.
bool MTraceEventPrologComplete
Flag to indicate if makeTraceEventProlog() has been run.
std::string MCommandNodeType
Buffer to build the command node type.
void emitEdgeEventForEventDependence(Command *Cmd, ur_event_handle_t &EventAddr)
Creates an edge event when the dependency is an event.
Command * addDep(DepDesc NewDep, std::vector< Command * > &ToCleanUp)
void emitEdgeEventForCommandDependence(Command *Cmd, void *ObjAddr, bool IsCommand, std::optional< access::mode > AccMode=std::nullopt)
Creates an edge event when the dependency is a command.
const QueueImplPtr & getQueue() const
Command(CommandType Type, QueueImplPtr Queue, ur_exp_command_buffer_handle_t CommandBuffer=nullptr, const std::vector< ur_exp_command_buffer_sync_point_t > &SyncPoints={})
It is safe to bind MPreparedDepsEvents and MPreparedHostDepsEvents references to event_impl class mem...
DispatchHostTask(ExecCGCommand *ThisCmd, std::vector< interop_handle::ReqToMem > ReqToMem, std::vector< ur_mem_handle_t > ReqUrMem)
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
void printDot(std::ostream &Stream) const final
void addRequirement(Command *DepCmd, AllocaCommandBase *AllocaCmd, const Requirement *Req)
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
The exec CG command enqueues execution of kernel or explicit memory operation.
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
ExecCGCommand(std::unique_ptr< detail::CG > CommandGroup, QueueImplPtr Queue, bool EventNeeded, ur_exp_command_buffer_handle_t CommandBuffer=nullptr, const std::vector< ur_exp_command_buffer_sync_point_t > &Dependencies={})
bool supportsPostEnqueueCleanup() const final
Returns true iff this command can be freed by post enqueue cleanup.
bool readyForCleanup() const final
Returns true iff this command is ready to be submitted for cleanup.
std::string_view getTypeString() const
detail::CG & getCG() const
void printDot(std::ostream &Stream) const final
std::vector< std::shared_ptr< const void > > getAuxiliaryResources() const
void clearAuxiliaryResources()
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
void TraceEventXPTI(const char *Message)
static GlobalHandler & instance()
void call(HostProfilingInfo *HPI)
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req, void **DstPtr, QueueImplPtr Queue, access::mode MapMode)
void printDot(std::ostream &Stream) const final
ContextImplPtr getWorkerContext() const final
Get the context of the queue this command will be submitted to.
MemCpyCommandHost(Requirement SrcReq, AllocaCommandBase *SrcAllocaCmd, Requirement DstReq, void **DstPtr, QueueImplPtr SrcQueue, QueueImplPtr DstQueue)
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
void printDot(std::ostream &Stream) const final
ContextImplPtr getWorkerContext() const final
Get the context of the queue this command will be submitted to.
MemCpyCommand(Requirement SrcReq, AllocaCommandBase *SrcAllocaCmd, Requirement DstReq, AllocaCommandBase *DstAllocaCmd, QueueImplPtr SrcQueue, QueueImplPtr DstQueue)
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
void printDot(std::ostream &Stream) const final
static void ext_oneapi_fill_usm_cmd_buffer(sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, void *DstMem, size_t Len, const std::vector< unsigned char > &Pattern, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void ext_oneapi_copyH2D_cmd_buffer(sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, SYCLMemObjI *SYCLMemObj, char *SrcMem, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, void *DstMem, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::range< 3 > DstAccessRange, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void copy_image_bindless(QueueImplPtr Queue, const void *Src, void *Dst, const ur_image_desc_t &SrcDesc, const ur_image_desc_t &DstDesc, const ur_image_format_t &SrcFormat, const ur_image_format_t &DstFormat, const ur_exp_image_copy_flags_t Flags, ur_rect_offset_t SrcOffset, ur_rect_offset_t DstOffset, ur_rect_region_t CopyExtent, const std::vector< ur_event_handle_t > &DepEvents, ur_event_handle_t *OutEvent)
static void fill_2d_usm(void *DstMem, QueueImplPtr Queue, size_t Pitch, size_t Width, size_t Height, const std::vector< unsigned char > &Pattern, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void ext_oneapi_advise_usm_cmd_buffer(sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, const void *Mem, size_t Length, ur_usm_advice_flags_t Advice, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void fill(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue, size_t PatternSize, const unsigned char *Pattern, unsigned int Dim, sycl::range< 3 > Size, sycl::range< 3 > AccessRange, sycl::id< 3 > AccessOffset, unsigned int ElementSize, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t &OutEvent, const detail::EventImplPtr &OutEventImpl)
static void * allocateMemSubBuffer(ContextImplPtr TargetContext, void *ParentMemObj, size_t ElemSize, size_t Offset, range< 3 > Range, std::vector< EventImplPtr > DepEvents, ur_event_handle_t &OutEvent)
static void ext_oneapi_fill_cmd_buffer(sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, SYCLMemObjI *SYCLMemObj, void *Mem, size_t PatternSize, const unsigned char *Pattern, unsigned int Dim, sycl::range< 3 > Size, sycl::range< 3 > AccessRange, sycl::id< 3 > AccessOffset, unsigned int ElementSize, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void memset_2d_usm(void *DstMem, QueueImplPtr Queue, size_t Pitch, size_t Width, size_t Height, char Value, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void ext_oneapi_copyD2H_cmd_buffer(sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, SYCLMemObjI *SYCLMemObj, void *SrcMem, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::range< 3 > SrcAccessRange, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, char *DstMem, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void copy_to_device_global(const void *DeviceGlobalPtr, bool IsDeviceImageScoped, QueueImplPtr Queue, size_t NumBytes, size_t Offset, const void *SrcMem, const std::vector< ur_event_handle_t > &DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void ext_oneapi_copyD2D_cmd_buffer(sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, SYCLMemObjI *SYCLMemObj, void *SrcMem, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::range< 3 > SrcAccessRange, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, void *DstMem, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::range< 3 > DstAccessRange, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void copy_2d_usm(const void *SrcMem, size_t SrcPitch, QueueImplPtr Queue, void *DstMem, size_t DstPitch, size_t Width, size_t Height, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void advise_usm(const void *Ptr, QueueImplPtr Queue, size_t Len, ur_usm_advice_flags_t Advice, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void * map(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue, access::mode AccessMode, unsigned int Dim, sycl::range< 3 > Size, sycl::range< 3 > AccessRange, sycl::id< 3 > AccessOffset, unsigned int ElementSize, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t &OutEvent)
static void copy(SYCLMemObjI *SYCLMemObj, void *SrcMem, QueueImplPtr SrcQueue, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::range< 3 > SrcAccessRange, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, void *DstMem, QueueImplPtr TgtQueue, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::range< 3 > DstAccessRange, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t &OutEvent, const detail::EventImplPtr &OutEventImpl)
static void copy_usm(const void *SrcMem, QueueImplPtr Queue, size_t Len, void *DstMem, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void fill_usm(void *DstMem, QueueImplPtr Queue, size_t Len, const std::vector< unsigned char > &Pattern, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void copy_from_device_global(const void *DeviceGlobalPtr, bool IsDeviceImageScoped, QueueImplPtr Queue, size_t NumBytes, size_t Offset, void *DstMem, const std::vector< ur_event_handle_t > &DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void * allocate(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, bool InitFromUserData, void *HostPtr, std::vector< EventImplPtr > DepEvents, ur_event_handle_t &OutEvent)
static void ext_oneapi_copy_usm_cmd_buffer(ContextImplPtr Context, const void *SrcMem, ur_exp_command_buffer_handle_t CommandBuffer, size_t Len, void *DstMem, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void unmap(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue, void *MappedPtr, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t &OutEvent)
static void release(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, void *MemAllocation, std::vector< EventImplPtr > DepEvents, ur_event_handle_t &OutEvent)
static void ext_oneapi_prefetch_usm_cmd_buffer(sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, void *Mem, size_t Length, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void prefetch_usm(void *Ptr, QueueImplPtr Queue, size_t Len, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
sycl::range< 3 > GlobalSize
sycl::range< 3 > NumWorkGroups
Number of workgroups, used to record the number of workgroups from the simplest form of parallel_for_...
sycl::id< 3 > GlobalOffset
sycl::range< 3 > LocalSize
sycl::range< 3 > ClusterDimensions
std::tuple< ur_kernel_handle_t, std::mutex *, const KernelArgMask *, ur_program_handle_t > getOrCreateKernel(const ContextImplPtr &ContextImpl, const DeviceImplPtr &DeviceImpl, const std::string &KernelName, const NDRDescT &NDRDesc={})
static ProgramManager & getInstance()
kernel_id getSYCLKernelID(const std::string &KernelName)
device_image_plain getDeviceImageFromBinaryImage(RTDeviceBinaryImage *BinImage, const context &Ctx, const device &Dev)
bool kernelUsesAssert(const std::string &KernelName) const
HostPipeMapEntry * getHostPipeEntry(const std::string &UniqueId)
device_image_plain build(const device_image_plain &DeviceImage, const std::vector< device > &Devs, const property_list &PropList)
sycl_device_binary get() const
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
bool supportsPostEnqueueCleanup() const final
Returns true iff this command can be freed by post enqueue cleanup.
ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd)
void printDot(std::ostream &Stream) const final
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
bool readyForCleanup() const final
Returns true iff this command is ready to be submitted for cleanup.
static const char * get()
virtual MemObjType getType() const =0
Command * connectDepEvent(Command *const Cmd, const EventImplPtr &DepEvent, const DepDesc &Dep, std::vector< Command * > &ToCleanUp)
Perform connection of events in multiple contexts.
void enqueueCommandForCG(EventImplPtr NewEvent, std::vector< Command * > &AuxilaryCmds, BlockingT Blocking=NON_BLOCKING)
GraphBuilder MGraphBuilder
ur_kernel_handle_t completeSpecConstMaterialization(QueueImplPtr Queue, const RTDeviceBinaryImage *BinImage, const std::string &KernelName, std::vector< unsigned char > &SpecConstBlob)
static Scheduler & getInstance()
void NotifyHostTaskCompletion(Command *Cmd)
void printDot(std::ostream &Stream) const final
UnMapMemObject(AllocaCommandBase *DstAllocaCmd, Requirement Req, void **SrcPtr, QueueImplPtr Queue)
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
UpdateCommandBufferCommand(QueueImplPtr Queue, ext::oneapi::experimental::detail::exec_graph_impl *Graph, std::vector< std::shared_ptr< ext::oneapi::experimental::detail::node_impl >> Nodes)
void printDot(std::ostream &Stream) const final
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
UpdateHostRequirementCommand(QueueImplPtr Queue, Requirement Req, AllocaCommandBase *SrcAllocaCmd, void **DstPtr)
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
void printDot(std::ostream &Stream) const final
static void bufferAssociateNotification(const void *, const void *)
const PluginPtr & getPlugin() const
ur_device_handle_t & getHandleRef()
Get reference to UR device.
static ThreadPool & getThreadPool()
Data type that manages the code_location information in TLS.
const detail::code_location & query()
Query the information in the TLS slot.
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Class representing the implementation of command_graph<executable>.
void updateImpl(std::shared_ptr< node_impl > NodeImpl)
Objects of the class identify kernel is some kernel_bundle related APIs.
Provides an abstraction of a SYCL kernel.
Defines the iteration domain of both the work-groups and the overall dispatch.
Objects of the property_list class are containers for the SYCL properties.
std::function< void(interop_handle)> func
void handleErrorOrWarning(ur_result_t Error, const device_impl &DeviceImpl, ur_kernel_handle_t Kernel, const NDRDescT &NDRDesc)
Analyzes error code and arguments of urEnqueueKernelLaunch to emit user-friendly exception describing...
void free(void *Ptr, const context &Ctxt, const code_location &CL)
@ kind_specialization_constants_buffer
decltype(Obj::impl) const & getSyclObjImpl(const Obj &SyclObject)
static void flushCrossQueueDeps(const std::vector< EventImplPtr > &EventImpls, const QueueImplPtr &Queue)
static std::string demangleKernelName(std::string Name)
static ur_result_t SetKernelParamsAndLaunch(const QueueImplPtr &Queue, std::vector< ArgDesc > &Args, const std::shared_ptr< device_image_impl > &DeviceImageImpl, ur_kernel_handle_t Kernel, NDRDescT &NDRDesc, std::vector< ur_event_handle_t > &RawEvents, const detail::EventImplPtr &OutEventImpl, const KernelArgMask *EliminatedArgMask, const std::function< void *(Requirement *Req)> &getMemAllocationFunc, bool IsCooperative, bool KernelUsesClusterLaunch, const RTDeviceBinaryImage *BinImage, const std::string &KernelName)
std::vector< bool > KernelArgMask
std::string codeToString(int32_t code)
constexpr const char * SYCL_STREAM_NAME
ur_mem_flags_t AccessModeToUr(access::mode AccessorMode)
static std::string_view cgTypeToString(detail::CGType Type)
void ReverseRangeDimensionsForKernel(NDRDescT &NDR)
void enqueueImpKernel(const QueueImplPtr &Queue, NDRDescT &NDRDesc, std::vector< ArgDesc > &Args, const std::shared_ptr< detail::kernel_bundle_impl > &KernelBundleImplPtr, const std::shared_ptr< detail::kernel_impl > &MSyclKernel, const std::string &KernelName, std::vector< ur_event_handle_t > &RawEvents, const detail::EventImplPtr &OutEventImpl, const std::function< void *(Requirement *Req)> &getMemAllocationFunc, ur_kernel_cache_config_t KernelCacheConfig, const bool KernelIsCooperative, const bool KernelUsesClusterLaunch, const RTDeviceBinaryImage *BinImage)
std::shared_ptr< sycl::detail::context_impl > ContextImplPtr
void SetArgBasedOnType(const PluginPtr &Plugin, ur_kernel_handle_t Kernel, const std::shared_ptr< device_image_impl > &DeviceImageImpl, const std::function< void *(Requirement *Req)> &getMemAllocationFunc, const sycl::context &Context, detail::ArgDesc &Arg, size_t NextTrueIndex)
std::shared_ptr< event_impl > EventImplPtr
std::shared_ptr< plugin > PluginPtr
AccessorImplHost Requirement
ur_result_t enqueueImpCommandBufferKernel(context Ctx, DeviceImplPtr DeviceImpl, ur_exp_command_buffer_handle_t CommandBuffer, const CGExecKernel &CommandGroup, std::vector< ur_exp_command_buffer_sync_point_t > &SyncPoints, ur_exp_command_buffer_sync_point_t *OutSyncPoint, ur_exp_command_buffer_command_handle_t *OutCommand, const std::function< void *(Requirement *Req)> &getMemAllocationFunc)
std::shared_ptr< device_impl > DeviceImplPtr
CGType
Type of the command group.
std::shared_ptr< detail::kernel_bundle_impl > KernelBundleImplPtr
static void adjustNDRangePerKernel(NDRDescT &NDR, ur_kernel_handle_t Kernel, const device_impl &DeviceImpl)
void applyFuncOnFilteredArgs(const KernelArgMask *EliminatedArgMask, std::vector< ArgDesc > &Args, std::function< void(detail::ArgDesc &Arg, int NextTrueIndex)> Func)
ur_result_t enqueueReadWriteHostPipe(const QueueImplPtr &Queue, const std::string &PipeName, bool blocking, void *ptr, size_t size, std::vector< ur_event_handle_t > &RawEvents, const detail::EventImplPtr &OutEventImpl, bool read)
std::shared_ptr< sycl::detail::queue_impl > QueueImplPtr
int32_t get_ur_error(const exception &e)
static ContextImplPtr getContext(const QueueImplPtr &Queue)
std::enable_if< !std::is_same< typename Param::return_type, sycl::range< 3 > >::value, typename Param::return_type >::type get_kernel_device_specific_info(ur_kernel_handle_t Kernel, ur_device_handle_t Device, const PluginPtr &Plugin)
static std::string accessModeToString(access::mode Mode)
std::string queueDeviceToString(const queue_impl *const &Queue)
std::tuple< const RTDeviceBinaryImage *, ur_program_handle_t > retrieveKernelBinary(const QueueImplPtr &, const char *KernelName, CGExecKernel *CGKernel=nullptr)
Function for_each(Group g, Ptr first, Ptr last, Function f)
__DPCPP_SYCL_EXTERNAL _SYCL_EXT_CPLX_INLINE_VISIBILITY std::enable_if_t< is_genfloat< _Tp >::value, _Tp > arg(const complex< _Tp > &__c)
PropertyListT int access::address_space multi_ptr & operator=(multi_ptr &&)=default
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using e and sycl_category()
Dependency between two commands.
const Requirement * MDepRequirement
Requirement for the dependency.
Command * MDepCommand
The actual dependency command.
Result of command enqueueing.
RTDeviceBinaryImage * getDevBinImage()