38 #if __has_include(<cxxabi.h>)
39 #define __SYCL_ENABLE_GNU_DEMANGLING
46 #ifdef XPTI_ENABLE_INSTRUMENTATION
47 #include "xpti/xpti_trace_framework.hpp"
52 inline namespace _V1 {
55 #ifdef XPTI_ENABLE_INSTRUMENTATION
57 extern xpti::trace_event_data_t *GSYCLGraphEvent;
59 static bool CurrentCodeLocationValid() {
60 detail::tls_code_loc_t Tls;
61 auto CodeLoc = Tls.query();
62 auto FileName = CodeLoc.fileName();
63 auto FunctionName = CodeLoc.functionName();
64 return (FileName && FileName[0] !=
'\0') ||
65 (FunctionName && FunctionName[0] !=
'\0');
68 void emitInstrumentationGeneral(uint32_t StreamID, uint64_t InstanceID,
69 xpti_td *TraceEvent, uint16_t Type,
71 if (!(xptiCheckTraceEnabled(StreamID, Type) && TraceEvent))
74 xptiNotifySubscribers(StreamID, Type, detail::GSYCLGraphEvent,
75 static_cast<xpti_td *
>(TraceEvent), InstanceID, Addr);
78 static size_t deviceToID(
const device &Device) {
79 return reinterpret_cast<size_t>(
getSyclObjImpl(Device)->getHandleRef());
82 static void addDeviceMetadata(xpti_td *TraceEvent,
const QueueImplPtr &Queue) {
83 xpti::addMetadata(TraceEvent,
"sycl_device_type",
86 xpti::addMetadata(TraceEvent,
"sycl_device",
87 deviceToID(Queue->get_device()));
88 xpti::addMetadata(TraceEvent,
"sycl_device_name",
93 static unsigned long long getQueueID(
const QueueImplPtr &Queue) {
94 return Queue ? Queue->getQueueID() : 0;
100 return Queue->getContextImplPtr();
104 #ifdef __SYCL_ENABLE_GNU_DEMANGLING
105 struct DemangleHandle {
107 DemangleHandle(
char *ptr) : p(ptr) {}
109 DemangleHandle(
const DemangleHandle &) =
delete;
110 DemangleHandle &
operator=(
const DemangleHandle &) =
delete;
116 DemangleHandle result(abi::__cxa_demangle(Name.c_str(), NULL, NULL, &Status));
117 return (Status == 0) ? result.p : Name;
124 const KernelArgMask *EliminatedArgMask, std::vector<ArgDesc> &Args,
126 if (!EliminatedArgMask || EliminatedArgMask->size() == 0) {
134 std::sort(Args.begin(), Args.end(), [](
const ArgDesc &A,
const ArgDesc &B) {
135 return A.MIndex < B.MIndex;
138 size_t NextTrueIndex = 0;
143 for (
int Idx = LastIndex + 1; Idx < Arg.
MIndex; ++Idx)
144 if (!(*EliminatedArgMask)[Idx])
148 if ((*EliminatedArgMask)[Arg.
MIndex])
151 Func(Arg, NextTrueIndex);
166 return "discard_write";
168 return "discard_read_write";
174 #ifdef XPTI_ENABLE_INSTRUMENTATION
179 case Command::CommandType::RUN_CG:
180 return "command_group_node";
181 case Command::CommandType::COPY_MEMORY:
182 return "memory_transfer_node";
183 case Command::CommandType::ALLOCA:
184 return "memory_allocation_node";
185 case Command::CommandType::ALLOCA_SUB_BUF:
186 return "sub_buffer_creation_node";
187 case Command::CommandType::RELEASE:
188 return "memory_deallocation_node";
189 case Command::CommandType::MAP_MEM_OBJ:
190 return "memory_transfer_node";
191 case Command::CommandType::UNMAP_MEM_OBJ:
192 return "memory_transfer_node";
193 case Command::CommandType::UPDATE_REQUIREMENT:
194 return "host_acc_create_buffer_lock_node";
195 case Command::CommandType::EMPTY_TASK:
196 return "host_acc_destroy_buffer_release_node";
197 case Command::CommandType::FUSION:
198 return "kernel_fusion_placeholder_node";
200 return "unknown_node";
209 case Command::CommandType::RUN_CG:
210 return "Command Group Action";
211 case Command::CommandType::COPY_MEMORY:
212 return "Memory Transfer (Copy)";
213 case Command::CommandType::ALLOCA:
214 return "Memory Allocation";
215 case Command::CommandType::ALLOCA_SUB_BUF:
216 return "Sub Buffer Creation";
217 case Command::CommandType::RELEASE:
218 return "Memory Deallocation";
219 case Command::CommandType::MAP_MEM_OBJ:
220 return "Memory Transfer (Map)";
221 case Command::CommandType::UNMAP_MEM_OBJ:
222 return "Memory Transfer (Unmap)";
223 case Command::CommandType::UPDATE_REQUIREMENT:
224 return "Host Accessor Creation/Buffer Lock";
225 case Command::CommandType::EMPTY_TASK:
226 return "Host Accessor Destruction/Buffer Lock Release";
227 case Command::CommandType::FUSION:
228 return "Kernel Fusion Placeholder";
230 return "Unknown Action";
235 std::vector<ur_event_handle_t>
237 std::vector<ur_event_handle_t> RetUrEvents;
238 for (
auto &EventImpl : EventImpls) {
239 if (EventImpl->getHandleRef() ==
nullptr)
250 RetUrEvents.push_back(EventImpl->getHandleRef());
261 const std::vector<EventImplPtr> &EventImpls)
const {
262 std::vector<ur_event_handle_t> RetUrEvents;
263 for (
auto &EventImpl : EventImpls) {
268 if (EventImpl->isDefaultConstructed() || EventImpl->isHost() ||
273 if (!EventImpl->isEnqueued()) {
274 if (!EventImpl->getCommand() ||
277 std::vector<Command *> AuxCmds;
289 RetUrEvents.push_back(EventImpl->getHandleRef());
296 return (
MType == CommandType::RUN_CG) &&
297 ((
static_cast<const ExecCGCommand *
>(
this))->getCG().getType() ==
302 if ((
MType != CommandType::RUN_CG)) {
313 for (
auto &EventImpl : EventImpls) {
314 EventImpl->flushIfNeeded(Queue);
320 struct EnqueueNativeCommandData {
322 std::function<void(interop_handle)>
func;
325 void InteropFreeFunc(ur_queue_handle_t,
void *InteropData) {
326 auto *Data =
reinterpret_cast<EnqueueNativeCommandData *
>(InteropData);
327 return Data->func(Data->ih);
333 std::vector<interop_handle::ReqToMem> MReqToMem;
334 std::vector<ur_mem_handle_t> MReqUrMem;
336 bool waitForEvents()
const {
337 std::map<const PluginPtr, std::vector<EventImplPtr>>
338 RequiredEventsPerPlugin;
341 const PluginPtr &Plugin = Event->getPlugin();
342 RequiredEventsPerPlugin[Plugin].push_back(Event);
350 for (
auto &PluginWithEvents : RequiredEventsPerPlugin) {
351 std::vector<ur_event_handle_t> RawEvents =
353 if (RawEvents.size() == 0)
356 PluginWithEvents.first->call(urEventWait, RawEvents.size(),
359 MThisCmd->
MEvent->getSubmittedQueue()->reportAsyncException(
360 std::current_exception());
363 MThisCmd->
MEvent->getSubmittedQueue()->reportAsyncException(
364 std::current_exception());
372 Event->waitInternal();
380 std::vector<interop_handle::ReqToMem> ReqToMem,
381 std::vector<ur_mem_handle_t> ReqUrMem)
382 : MThisCmd{ThisCmd}, MReqToMem(
std::move(ReqToMem)),
383 MReqUrMem(
std::move(ReqUrMem)) {}
390 #ifdef XPTI_ENABLE_INSTRUMENTATION
395 std::unique_ptr<detail::tls_code_loc_t> AsyncCodeLocationPtr;
396 if (xptiTraceEnabled() && !CurrentCodeLocationValid()) {
397 AsyncCodeLocationPtr.reset(
402 if (!waitForEvents()) {
405 std::string(
"Couldn't wait for host-task's dependencies")));
407 MThisCmd->
MEvent->getSubmittedQueue()->reportAsyncException(EPtr);
416 if (
HostTask.MHostTask->isInteropTask()) {
418 "Host task submissions should have an associated queue");
420 HostTask.MQueue->getDeviceImplPtr(),
421 HostTask.MQueue->getContextImplPtr()};
425 bool NativeCommandSupport =
false;
426 Queue->getPlugin()->
call(
429 UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP,
430 sizeof(NativeCommandSupport), &NativeCommandSupport,
nullptr);
431 if (NativeCommandSupport) {
432 EnqueueNativeCommandData CustomOpData{
433 IH,
HostTask.MHostTask->MInteropTask};
442 Queue->getPlugin()->call(
443 urEnqueueNativeCommandExp,
HostTask.MQueue->getHandleRef(),
444 InteropFreeFunc, &CustomOpData, MReqUrMem.size(),
445 MReqUrMem.data(),
nullptr, 0,
nullptr,
nullptr);
447 HostTask.MHostTask->call(MThisCmd->
MEvent->getHostProfilingInfo(),
451 HostTask.MHostTask->call(MThisCmd->
MEvent->getHostProfilingInfo());
453 auto CurrentException = std::current_exception();
454 #ifdef XPTI_ENABLE_INSTRUMENTATION
458 if (xptiTraceEnabled()) {
460 rethrow_exception(CurrentException);
463 }
catch (
const std::exception &StdException) {
467 "Host task lambda thrown non standard exception");
471 MThisCmd->
MEvent->getSubmittedQueue()->reportAsyncException(
477 #ifdef XPTI_ENABLE_INSTRUMENTATION
480 AsyncCodeLocationPtr.reset();
488 auto CurrentException = std::current_exception();
489 MThisCmd->
MEvent->getSubmittedQueue()->reportAsyncException(
497 HostEvent->waitInternal();
501 std::vector<EventImplPtr> &EventImpls,
502 ur_event_handle_t &Event) {
505 assert(!Event->isHost() &&
506 "Only non-host events are expected to be waited for here");
508 if (!EventImpls.empty()) {
523 std::map<context_impl *, std::vector<EventImplPtr>>
524 RequiredEventsPerContext;
528 assert(Context.get() &&
529 "Only non-host events are expected to be waited for here");
530 RequiredEventsPerContext[Context.get()].push_back(Event);
533 for (
auto &CtxWithEvents : RequiredEventsPerContext) {
534 std::vector<ur_event_handle_t> RawEvents =
536 if (!RawEvents.empty()) {
537 CtxWithEvents.first->getPlugin()->call(urEventWait, RawEvents.size(),
542 std::vector<ur_event_handle_t> RawEvents =
getUrEvents(EventImpls);
544 const PluginPtr &Plugin = Queue->getPlugin();
547 MEvent->setHostEnqueueTime();
548 Plugin->call(urEnqueueEventsWait, Queue->getHandleRef(), RawEvents.size(),
549 &RawEvents[0], &Event);
559 ur_exp_command_buffer_handle_t CommandBuffer,
560 const std::vector<ur_exp_command_buffer_sync_point_t> &SyncPoints)
561 : MQueue(
std::move(Queue)),
563 MPreparedDepsEvents(MEvent->getPreparedDepsEvents()),
564 MPreparedHostDepsEvents(MEvent->getPreparedHostDepsEvents()), MType(Type),
565 MCommandBuffer(CommandBuffer), MSyncPointDeps(SyncPoints) {
572 MEvent->setStateIncomplete();
575 #ifdef XPTI_ENABLE_INSTRUMENTATION
576 if (!xptiTraceEnabled())
584 #ifdef XPTI_ENABLE_INSTRUMENTATION
600 Command *Cmd,
void *ObjAddr,
bool IsCommand,
601 std::optional<access::mode> AccMode) {
602 #ifdef XPTI_ENABLE_INSTRUMENTATION
605 constexpr uint16_t NotificationTraceType = xpti::trace_edge_create;
606 if (!(xptiCheckTraceEnabled(
MStreamID, NotificationTraceType) &&
612 xpti::utils::StringHelper SH;
613 std::string AddressStr = SH.addressAsString<
void *>(ObjAddr);
615 std::string TypeString = SH.nameWithAddressString(Prefix, AddressStr);
618 xpti::payload_t Payload(TypeString.c_str(),
MAddress);
619 uint64_t EdgeInstanceNo;
621 xptiMakeEvent(TypeString.c_str(), &Payload, xpti::trace_graph_event,
622 xpti_at::active, &EdgeInstanceNo);
624 xpti_td *SrcEvent =
static_cast<xpti_td *
>(Cmd->
MTraceEvent);
625 xpti_td *TgtEvent =
static_cast<xpti_td *
>(
MTraceEvent);
626 EdgeEvent->source_id = SrcEvent->unique_id;
627 EdgeEvent->target_id = TgtEvent->unique_id;
629 xpti::addMetadata(EdgeEvent,
"access_mode",
630 static_cast<int>(AccMode.value()));
631 xpti::addMetadata(EdgeEvent,
"memory_object",
632 reinterpret_cast<size_t>(ObjAddr));
634 xpti::addMetadata(EdgeEvent,
"event",
reinterpret_cast<size_t>(ObjAddr));
636 xptiNotifySubscribers(
MStreamID, NotificationTraceType,
637 detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
651 ur_event_handle_t &UrEventAddr) {
652 #ifdef XPTI_ENABLE_INSTRUMENTATION
665 xpti::utils::StringHelper SH;
666 std::string AddressStr = SH.addressAsString<ur_event_handle_t>(UrEventAddr);
671 std::string NodeName = SH.nameWithAddressString(
"virtual_node", AddressStr);
673 xpti::payload_t VNPayload(NodeName.c_str(),
MAddress);
674 uint64_t VNodeInstanceNo;
676 xptiMakeEvent(NodeName.c_str(), &VNPayload, xpti::trace_graph_event,
677 xpti_at::active, &VNodeInstanceNo);
679 xpti::addMetadata(NodeEvent,
"kernel_name", NodeName);
680 xptiNotifySubscribers(
MStreamID, xpti::trace_node_create,
681 detail::GSYCLGraphEvent, NodeEvent, VNodeInstanceNo,
684 std::string EdgeName = SH.nameWithAddressString(
"Event", AddressStr);
685 xpti::payload_t EdgePayload(EdgeName.c_str(),
MAddress);
686 uint64_t EdgeInstanceNo;
688 xptiMakeEvent(EdgeName.c_str(), &EdgePayload, xpti::trace_graph_event,
689 xpti_at::active, &EdgeInstanceNo);
690 if (EdgeEvent && NodeEvent) {
693 xpti_td *TgtEvent =
static_cast<xpti_td *
>(
MTraceEvent);
694 EdgeEvent->source_id = NodeEvent->unique_id;
695 EdgeEvent->target_id = TgtEvent->unique_id;
696 xpti::addMetadata(EdgeEvent,
"event",
697 reinterpret_cast<size_t>(UrEventAddr));
698 xptiNotifySubscribers(
MStreamID, xpti::trace_edge_create,
699 detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
708 uint64_t CommandInstanceNo = 0;
709 #ifdef XPTI_ENABLE_INSTRUMENTATION
711 return CommandInstanceNo;
717 xpti::utils::StringHelper SH;
719 std::string CommandString =
722 xpti::payload_t p(CommandString.c_str(),
MAddress);
723 xpti_td *CmdTraceEvent =
724 xptiMakeEvent(CommandString.c_str(), &p, xpti::trace_graph_event,
725 xpti_at::active, &CommandInstanceNo);
736 return CommandInstanceNo;
740 #ifdef XPTI_ENABLE_INSTRUMENTATION
741 constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
745 xptiNotifySubscribers(
MStreamID, NotificationTraceType,
746 detail::GSYCLGraphEvent,
753 std::vector<Command *> &ToCleanUp) {
760 bool PiEventExpected =
761 (!DepEvent->isHost() && !DepEvent->isDefaultConstructed());
762 if (
auto *DepCmd =
static_cast<Command *
>(DepEvent->getCommand()))
763 PiEventExpected &= DepCmd->producesPiEvent();
765 if (!PiEventExpected) {
772 Command *ConnectionCmd =
nullptr;
776 if (DepEventContext != WorkerContext && WorkerContext) {
782 return ConnectionCmd;
788 return MQueue->getContextImplPtr();
801 Command *ConnectionCmd =
nullptr;
810 if (!ConnectionCmd) {
811 MDeps.push_back(NewDep);
816 #ifdef XPTI_ENABLE_INSTRUMENTATION
822 return ConnectionCmd;
826 std::vector<Command *> &ToCleanUp) {
827 #ifdef XPTI_ENABLE_INSTRUMENTATION
831 ur_event_handle_t &UrEventAddr = Event->getHandleRef();
841 #ifdef XPTI_ENABLE_INSTRUMENTATION
842 emitInstrumentationGeneral(
844 xpti::trace_signal,
static_cast<const void *
>(UrEventAddr));
846 std::ignore = UrEventAddr;
850 #ifdef XPTI_ENABLE_INSTRUMENTATION
853 static_cast<const void *
>(Txt));
861 std::vector<Command *> &ToCleanUp) {
862 #ifdef XPTI_ENABLE_INSTRUMENTATION
866 std::unique_ptr<detail::tls_code_loc_t> AsyncCodeLocationPtr;
867 if (xptiTraceEnabled() && !CurrentCodeLocationValid()) {
868 AsyncCodeLocationPtr.reset(
884 #ifdef XPTI_ENABLE_INSTRUMENTATION
888 std::string Info =
"enqueue.barrier[";
896 #ifdef XPTI_ENABLE_INSTRUMENTATION
907 #ifdef XPTI_ENABLE_INSTRUMENTATION
923 if (UR_RESULT_SUCCESS != Res)
929 (
MEvent->isHost() ||
MEvent->getHandleRef() ==
nullptr))
940 ToCleanUp.push_back(
this);
946 #ifdef XPTI_ENABLE_INSTRUMENTATION
953 #ifdef XPTI_ENABLE_INSTRUMENTATION
954 assert(
MType == CommandType::RELEASE &&
"Expected release command");
960 xpti_td *TgtTraceEvent =
static_cast<xpti_td *
>(
MTraceEvent);
965 for (
auto &Item : DepList) {
966 if (Item->MTraceEvent && Item->MAddress ==
MAddress) {
967 xpti::utils::StringHelper SH;
968 std::string AddressStr = SH.addressAsString<
void *>(
MAddress);
969 std::string TypeString =
970 "Edge:" + SH.nameWithAddressString(commandToName(
MType), AddressStr);
974 xpti::payload_t p(TypeString.c_str(),
MAddress);
975 uint64_t EdgeInstanceNo;
977 xptiMakeEvent(TypeString.c_str(), &p, xpti::trace_graph_event,
978 xpti_at::active, &EdgeInstanceNo);
980 xpti_td *SrcTraceEvent =
static_cast<xpti_td *
>(Item->MTraceEvent);
981 EdgeEvent->target_id = TgtTraceEvent->unique_id;
982 EdgeEvent->source_id = SrcTraceEvent->unique_id;
983 xpti::addMetadata(EdgeEvent,
"memory_object",
984 reinterpret_cast<size_t>(
MAddress));
985 xptiNotifySubscribers(
MStreamID, xpti::trace_edge_create,
986 detail::GSYCLGraphEvent, EdgeEvent,
987 EdgeInstanceNo,
nullptr);
997 return "A Buffer is locked by the host accessor";
999 return "Blocked by host task";
1002 return "Unknown block reason";
1006 #ifdef XPTI_ENABLE_INSTRUMENTATION
1007 if (!xptiTraceEnabled())
1011 auto TData = Tls.
query();
1012 if (TData.fileName())
1014 if (TData.functionName())
1019 (int)TData.lineNumber(), (int)TData.columnNumber()};
1027 :
Command(Type, Queue), MLinkedAllocaCmd(LinkedAllocaCmd),
1028 MIsLeaderAlloca(nullptr == LinkedAllocaCmd), MIsConst(IsConst),
1029 MRequirement(
std::move(Req)), MReleaseCmd(Queue, this) {
1035 #ifdef XPTI_ENABLE_INSTRUMENTATION
1044 xpti_td *TE =
static_cast<xpti_td *
>(
MTraceEvent);
1045 addDeviceMetadata(TE,
MQueue);
1046 xpti::addMetadata(TE,
"memory_object",
reinterpret_cast<size_t>(
MAddress));
1049 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1062 bool InitFromUserData,
1065 LinkedAllocaCmd, IsConst),
1066 MInitFromUserData(InitFromUserData) {
1071 std::vector<Command *> ToCleanUp;
1074 assert(ConnectionCmd ==
nullptr);
1075 assert(ToCleanUp.empty());
1076 (void)ConnectionCmd;
1080 #ifdef XPTI_ENABLE_INSTRUMENTATION
1091 ur_result_t AllocaCommand::enqueueImp() {
1095 ur_event_handle_t &Event =
MEvent->getHandleRef();
1097 void *HostPtr =
nullptr;
1104 return UR_RESULT_SUCCESS;
1111 MInitFromUserData, HostPtr,
1112 std::move(EventImpls), Event);
1114 return UR_RESULT_SUCCESS;
1118 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#FFD28A\", label=\"";
1120 Stream <<
"ID = " <<
this <<
"\\n";
1124 Stream <<
"\"];" << std::endl;
1126 for (
const auto &Dep :
MDeps) {
1127 if (Dep.MDepCommand ==
nullptr)
1129 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1130 <<
" [ label = \"Access mode: "
1132 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1139 std::vector<Command *> &ToEnqueue,
1140 std::vector<Command *> &ToCleanUp)
1144 MParentAlloca(ParentAlloca) {
1152 ToEnqueue.push_back(ConnectionCmd);
1156 #ifdef XPTI_ENABLE_INSTRUMENTATION
1163 xpti_td *TE =
static_cast<xpti_td *
>(
MTraceEvent);
1165 xpti::addMetadata(TE,
"access_range_start",
1167 xpti::addMetadata(TE,
"access_range_end",
1169 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1181 return static_cast<void *
>(
1188 ur_result_t AllocaSubBufCommand::enqueueImp() {
1191 ur_event_handle_t &Event =
MEvent->getHandleRef();
1200 return UR_RESULT_SUCCESS;
1204 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#FFD28A\", label=\"";
1206 Stream <<
"ID = " <<
this <<
"\\n";
1211 Stream <<
"\"];" << std::endl;
1213 for (
const auto &Dep :
MDeps) {
1214 if (Dep.MDepCommand ==
nullptr)
1216 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1217 <<
" [ label = \"Access mode: "
1219 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1230 #ifdef XPTI_ENABLE_INSTRUMENTATION
1239 xpti_td *TE =
static_cast<xpti_td *
>(
MTraceEvent);
1240 addDeviceMetadata(TE,
MQueue);
1241 xpti::addMetadata(TE,
"allocation_type",
1242 commandToName(MAllocaCmd->
getType()));
1245 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1252 ur_result_t ReleaseCommand::enqueueImp() {
1255 std::vector<ur_event_handle_t> RawEvents =
getUrEvents(EventImpls);
1256 bool SkipRelease =
false;
1262 const bool CurAllocaIsHost = !MAllocaCmd->
getQueue();
1263 bool NeedUnmap =
false;
1277 NeedUnmap |= CurAllocaIsHost == MAllocaCmd->
MIsActive;
1286 UnmapEventImpl->setContextImpl(
getContext(Queue));
1287 UnmapEventImpl->setStateIncomplete();
1288 ur_event_handle_t &UnmapEvent = UnmapEventImpl->getHandleRef();
1290 void *Src = CurAllocaIsHost
1294 void *Dst = !CurAllocaIsHost
1299 RawEvents, UnmapEvent);
1303 EventImpls.push_back(UnmapEventImpl);
1305 ur_event_handle_t &Event =
MEvent->getHandleRef();
1311 std::move(EventImpls), Event);
1313 return UR_RESULT_SUCCESS;
1317 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#FF827A\", label=\"";
1319 Stream <<
"ID = " <<
this <<
" ; ";
1321 Stream <<
" Alloca : " << MAllocaCmd <<
"\\n";
1322 Stream <<
" MemObj : " << MAllocaCmd->
getSYCLMemObj() <<
"\\n";
1323 Stream <<
"\"];" << std::endl;
1325 for (
const auto &Dep :
MDeps) {
1326 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1327 <<
" [ label = \"Access mode: "
1329 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1344 MSrcAllocaCmd(SrcAllocaCmd), MSrcReq(
std::move(Req)), MDstPtr(DstPtr),
1350 #ifdef XPTI_ENABLE_INSTRUMENTATION
1359 xpti_td *TE =
static_cast<xpti_td *
>(
MTraceEvent);
1360 addDeviceMetadata(TE,
MQueue);
1361 xpti::addMetadata(TE,
"memory_object",
reinterpret_cast<size_t>(
MAddress));
1364 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1371 ur_result_t MapMemObject::enqueueImp() {
1374 std::vector<ur_event_handle_t> RawEvents =
getUrEvents(EventImpls);
1377 ur_event_handle_t &Event =
MEvent->getHandleRef();
1383 return UR_RESULT_SUCCESS;
1387 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#77AFFF\", label=\"";
1389 Stream <<
"ID = " <<
this <<
" ; ";
1392 Stream <<
"\"];" << std::endl;
1394 for (
const auto &Dep :
MDeps) {
1395 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1396 <<
" [ label = \"Access mode: "
1398 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1406 MDstAllocaCmd(DstAllocaCmd), MDstReq(
std::move(Req)), MSrcPtr(SrcPtr) {
1411 #ifdef XPTI_ENABLE_INSTRUMENTATION
1420 xpti_td *TE =
static_cast<xpti_td *
>(
MTraceEvent);
1421 addDeviceMetadata(TE,
MQueue);
1422 xpti::addMetadata(TE,
"memory_object",
reinterpret_cast<size_t>(
MAddress));
1425 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1448 return MQueue && (
MQueue->getDeviceImplPtr()->getBackend() !=
1450 MEvent->getHandleRef() !=
nullptr);
1453 ur_result_t UnMapMemObject::enqueueImp() {
1456 std::vector<ur_event_handle_t> RawEvents =
getUrEvents(EventImpls);
1459 ur_event_handle_t &Event =
MEvent->getHandleRef();
1462 std::move(RawEvents), Event);
1464 return UR_RESULT_SUCCESS;
1468 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#EBC40F\", label=\"";
1470 Stream <<
"ID = " <<
this <<
" ; ";
1473 Stream <<
"\"];" << std::endl;
1475 for (
const auto &Dep :
MDeps) {
1476 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1477 <<
" [ label = \"Access mode: "
1479 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1490 MSrcQueue(SrcQueue), MSrcReq(
std::move(SrcReq)),
1491 MSrcAllocaCmd(SrcAllocaCmd), MDstReq(
std::move(DstReq)),
1492 MDstAllocaCmd(DstAllocaCmd) {
1494 MEvent->setContextImpl(MSrcQueue->getContextImplPtr());
1504 #ifdef XPTI_ENABLE_INSTRUMENTATION
1513 xpti_td *CmdTraceEvent =
static_cast<xpti_td *
>(
MTraceEvent);
1514 addDeviceMetadata(CmdTraceEvent,
MQueue);
1515 xpti::addMetadata(CmdTraceEvent,
"memory_object",
1516 reinterpret_cast<size_t>(
MAddress));
1517 xpti::addMetadata(CmdTraceEvent,
"copy_from",
1518 MSrcQueue ? deviceToID(MSrcQueue->get_device()) : 0);
1519 xpti::addMetadata(CmdTraceEvent,
"copy_to",
1523 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1553 MQueue->getDeviceImplPtr()->getBackend() !=
1555 MEvent->getHandleRef() !=
nullptr;
1558 ur_result_t MemCpyCommand::enqueueImp() {
1562 ur_event_handle_t &Event =
MEvent->getHandleRef();
1574 return UR_RESULT_SUCCESS;
1578 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#C7EB15\" label=\"";
1580 Stream <<
"ID = " <<
this <<
" ; ";
1582 Stream <<
"From: " << MSrcAllocaCmd <<
" is host: " << !MSrcQueue <<
"\\n";
1583 Stream <<
"To: " << MDstAllocaCmd <<
" is host: " << !
MQueue <<
"\\n";
1585 Stream <<
"\"];" << std::endl;
1587 for (
const auto &Dep :
MDeps) {
1588 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1589 <<
" [ label = \"Access mode: "
1591 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1598 if (Dep.MDepRequirement == Req)
1599 return Dep.MAllocaCmd;
1607 std::vector<std::shared_ptr<const void>>
1610 return ((
CGExecKernel *)MCommandGroup.get())->getAuxiliaryResources();
1619 ur_result_t UpdateHostRequirementCommand::enqueueImp() {
1622 ur_event_handle_t &Event =
MEvent->getHandleRef();
1625 assert(MSrcAllocaCmd &&
"Expected valid alloca command");
1626 assert(MSrcAllocaCmd->
getMemAllocation() &&
"Expected valid source pointer");
1627 assert(MDstPtr &&
"Expected valid target pointer");
1630 return UR_RESULT_SUCCESS;
1634 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#f1337f\", label=\"";
1636 Stream <<
"ID = " <<
this <<
"\\n";
1638 bool IsReqOnBuffer =
1640 Stream <<
"TYPE: " << (IsReqOnBuffer ?
"Buffer" :
"Image") <<
"\\n";
1642 Stream <<
"Is sub buffer: " << std::boolalpha << MDstReq.
MIsSubBuffer
1645 Stream <<
"\"];" << std::endl;
1647 for (
const auto &Dep :
MDeps) {
1648 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1649 <<
" [ label = \"Access mode: "
1651 <<
"MemObj: " << Dep.MAllocaCmd->getSYCLMemObj() <<
" \" ]"
1662 MSrcQueue(SrcQueue), MSrcReq(
std::move(SrcReq)),
1663 MSrcAllocaCmd(SrcAllocaCmd), MDstReq(
std::move(DstReq)), MDstPtr(DstPtr) {
1665 MEvent->setContextImpl(MSrcQueue->getContextImplPtr());
1675 #ifdef XPTI_ENABLE_INSTRUMENTATION
1684 xpti_td *CmdTraceEvent =
static_cast<xpti_td *
>(
MTraceEvent);
1685 addDeviceMetadata(CmdTraceEvent,
MQueue);
1686 xpti::addMetadata(CmdTraceEvent,
"memory_object",
1687 reinterpret_cast<size_t>(
MAddress));
1688 xpti::addMetadata(CmdTraceEvent,
"copy_from",
1689 MSrcQueue ? deviceToID(MSrcQueue->get_device()) : 0);
1690 xpti::addMetadata(CmdTraceEvent,
"copy_to",
1694 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1707 ur_result_t MemCpyCommandHost::enqueueImp() {
1711 std::vector<ur_event_handle_t> RawEvents =
getUrEvents(EventImpls);
1713 ur_event_handle_t &Event =
MEvent->getHandleRef();
1721 return UR_RESULT_SUCCESS;
1731 return UR_RESULT_SUCCESS;
1738 ur_result_t EmptyCommand::enqueueImp() {
1742 return UR_RESULT_SUCCESS;
1748 MRequirements.emplace_back(ReqRef);
1749 const Requirement *
const StoredReq = &MRequirements.back();
1753 std::vector<Command *> ToCleanUp;
1755 assert(Cmd ==
nullptr &&
"Conection command should be null for EmptyCommand");
1756 assert(ToCleanUp.empty() &&
"addDep should add a command for cleanup only if "
1757 "there's a connection command");
1762 #ifdef XPTI_ENABLE_INSTRUMENTATION
1767 if (MRequirements.empty())
1776 xpti_td *CmdTraceEvent =
static_cast<xpti_td *
>(
MTraceEvent);
1777 addDeviceMetadata(CmdTraceEvent,
MQueue);
1778 xpti::addMetadata(CmdTraceEvent,
"memory_object",
1779 reinterpret_cast<size_t>(
MAddress));
1782 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1790 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#8d8f29\", label=\"";
1792 Stream <<
"ID = " <<
this <<
"\\n";
1793 Stream <<
"EMPTY NODE"
1796 Stream <<
"\"];" << std::endl;
1798 for (
const auto &Dep :
MDeps) {
1799 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1800 <<
" [ label = \"Access mode: "
1802 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1810 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#B6A2EB\", label=\"";
1812 Stream <<
"ID = " <<
this <<
"\\n";
1815 Stream <<
"\"];" << std::endl;
1817 for (
const auto &Dep :
MDeps) {
1818 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
1819 <<
" [ label = \"Access mode: "
1821 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
1830 MSrcAllocaCmd(SrcAllocaCmd), MDstReq(
std::move(Req)), MDstPtr(DstPtr) {
1836 #ifdef XPTI_ENABLE_INSTRUMENTATION
1845 xpti_td *CmdTraceEvent =
static_cast<xpti_td *
>(
MTraceEvent);
1846 addDeviceMetadata(CmdTraceEvent,
MQueue);
1847 xpti::addMetadata(CmdTraceEvent,
"memory_object",
1848 reinterpret_cast<size_t>(
MAddress));
1851 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1864 return "update_host";
1870 return "copy acc to acc";
1873 return "copy acc to ptr";
1876 return "copy ptr to acc";
1881 return "barrier waitlist";
1889 return "prefetch usm";
1895 return "copy 2d usm";
1898 return "fill 2d usm";
1901 return "advise usm";
1903 return "memset 2d usm";
1906 return "copy to device_global";
1909 return "copy from device_global";
1912 return "read_write host pipe";
1914 return "exec command buffer";
1916 return "copy image";
1918 return "semaphore wait";
1920 return "semaphore signal";
1928 std::unique_ptr<detail::CG> CommandGroup,
QueueImplPtr Queue,
1929 bool EventNeeded, ur_exp_command_buffer_handle_t CommandBuffer,
1930 const std::vector<ur_exp_command_buffer_sync_point_t> &Dependencies)
1933 MEventNeeded(EventNeeded), MCommandGroup(
std::move(CommandGroup)) {
1935 MEvent->setSubmittedQueue(
1939 MEvent->markAsProfilingTagEvent();
1944 #ifdef XPTI_ENABLE_INSTRUMENTATION
1945 std::string instrumentationGetKernelName(
1946 const std::shared_ptr<detail::kernel_impl> &SyclKernel,
1947 const std::string &FunctionName,
const std::string &SyclKernelName,
1948 void *&Address, std::optional<bool> &FromSource) {
1949 std::string KernelName;
1950 if (SyclKernel && SyclKernel->isCreatedFromSource()) {
1952 ur_kernel_handle_t KernelHandle = SyclKernel->getHandleRef();
1953 Address = KernelHandle;
1954 KernelName = FunctionName;
1962 void instrumentationAddExtraKernelMetadata(
1963 xpti_td *&CmdTraceEvent,
const NDRDescT &NDRDesc,
1965 const std::string &KernelName,
1966 const std::shared_ptr<detail::kernel_impl> &SyclKernel,
1968 std::vector<ArgDesc> &CGArgs)
1971 std::vector<ArgDesc> Args;
1973 auto FilterArgs = [&Args](detail::ArgDesc &Arg,
int NextTrueIndex) {
1974 Args.push_back({Arg.MType, Arg.MPtr, Arg.MSize, NextTrueIndex});
1976 ur_program_handle_t Program =
nullptr;
1977 ur_kernel_handle_t
Kernel =
nullptr;
1978 std::mutex *KernelMutex =
nullptr;
1981 std::shared_ptr<kernel_impl> SyclKernelImpl;
1982 std::shared_ptr<device_image_impl> DeviceImageImpl;
1990 kernel_id KernelID =
1994 std::shared_ptr<kernel_impl> KernelImpl =
1997 EliminatedArgMask = KernelImpl->getKernelArgMask();
1998 Program = KernelImpl->getDeviceImage()->get_ur_program_ref();
1999 }
else if (
nullptr != SyclKernel) {
2000 Program = SyclKernel->getProgramRef();
2001 if (!SyclKernel->isCreatedFromSource())
2002 EliminatedArgMask = SyclKernel->getKernelArgMask();
2004 assert(Queue &&
"Kernel submissions should have an associated queue");
2005 std::tie(Kernel, KernelMutex, EliminatedArgMask, Program) =
2007 Queue->getContextImplPtr(), Queue->getDeviceImplPtr(), KernelName);
2012 xpti::offload_kernel_enqueue_data_t KernelData{
2013 {NDRDesc.GlobalSize[0], NDRDesc.GlobalSize[1], NDRDesc.GlobalSize[2]},
2014 {NDRDesc.LocalSize[0], NDRDesc.LocalSize[1], NDRDesc.LocalSize[2]},
2015 {NDRDesc.GlobalOffset[0], NDRDesc.GlobalOffset[1],
2016 NDRDesc.GlobalOffset[2]},
2018 xpti::addMetadata(CmdTraceEvent,
"enqueue_kernel_data", KernelData);
2019 for (
size_t i = 0; i < Args.size(); i++) {
2020 std::string Prefix(
"arg");
2021 xpti::offload_kernel_arg_data_t
arg{(int)Args[i].MType, Args[i].MPtr,
2022 Args[i].MSize, Args[i].MIndex};
2023 xpti::addMetadata(CmdTraceEvent, Prefix + std::to_string(i),
arg);
2027 void instrumentationFillCommonData(
const std::string &KernelName,
2028 const std::string &FileName, uint64_t Line,
2029 uint64_t Column,
const void *
const Address,
2031 std::optional<bool> &FromSource,
2032 uint64_t &OutInstanceID,
2033 xpti_td *&OutTraceEvent) {
2040 bool HasSourceInfo =
false;
2041 xpti::payload_t Payload;
2042 if (!FileName.empty()) {
2044 Payload = xpti::payload_t(KernelName.c_str(), FileName.c_str(), Line,
2046 HasSourceInfo =
true;
2047 }
else if (Address) {
2049 Payload = xpti::payload_t(KernelName.c_str(), Address);
2053 Payload = xpti::payload_t(KernelName.c_str());
2055 uint64_t CGKernelInstanceNo;
2057 xpti_td *CmdTraceEvent =
2058 xptiMakeEvent(
"ExecCG", &Payload, xpti::trace_graph_event,
2059 xpti::trace_activity_type_t::active, &CGKernelInstanceNo);
2060 if (CmdTraceEvent) {
2061 OutInstanceID = CGKernelInstanceNo;
2062 OutTraceEvent = CmdTraceEvent;
2066 if (CGKernelInstanceNo > 1)
2069 addDeviceMetadata(CmdTraceEvent, Queue);
2070 if (!KernelName.empty()) {
2071 xpti::addMetadata(CmdTraceEvent,
"kernel_name", KernelName);
2073 if (FromSource.has_value()) {
2074 xpti::addMetadata(CmdTraceEvent,
"from_source", FromSource.value());
2076 if (HasSourceInfo) {
2077 xpti::addMetadata(CmdTraceEvent,
"sym_function_name", KernelName);
2078 xpti::addMetadata(CmdTraceEvent,
"sym_source_file_name", FileName);
2079 xpti::addMetadata(CmdTraceEvent,
"sym_line_no",
static_cast<int>(Line));
2080 xpti::addMetadata(CmdTraceEvent,
"sym_column_no",
2081 static_cast<int>(Column));
2090 #ifdef XPTI_ENABLE_INSTRUMENTATION
2091 std::pair<xpti_td *, uint64_t> emitKernelInstrumentationData(
2092 int32_t StreamID,
const std::shared_ptr<detail::kernel_impl> &SyclKernel,
2093 const detail::code_location &CodeLoc,
const std::string &SyclKernelName,
2096 std::vector<ArgDesc> &CGArgs) {
2098 auto XptiObjects = std::make_pair<xpti_td *, uint64_t>(
nullptr, -1);
2099 constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
2100 if (!xptiCheckTraceEnabled(StreamID))
2103 void *Address =
nullptr;
2104 std::optional<bool> FromSource;
2105 std::string KernelName = instrumentationGetKernelName(
2106 SyclKernel, std::string(CodeLoc.functionName()), SyclKernelName, Address,
2109 auto &[CmdTraceEvent, InstanceID] = XptiObjects;
2111 std::string FileName =
2112 CodeLoc.fileName() ? CodeLoc.fileName() : std::string();
2113 instrumentationFillCommonData(KernelName, FileName, CodeLoc.lineNumber(),
2114 CodeLoc.columnNumber(), Address, Queue,
2115 FromSource, InstanceID, CmdTraceEvent);
2117 if (CmdTraceEvent) {
2119 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(Queue));
2120 instrumentationAddExtraKernelMetadata(CmdTraceEvent, NDRDesc,
2122 SyclKernel, Queue, CGArgs);
2124 xptiNotifySubscribers(
2125 StreamID, NotificationTraceType, detail::GSYCLGraphEvent, CmdTraceEvent,
2127 static_cast<const void *
>(
2128 commandToNodeType(Command::CommandType::RUN_CG).c_str()));
2136 #ifdef XPTI_ENABLE_INSTRUMENTATION
2137 constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
2141 std::string KernelName;
2142 std::optional<bool> FromSource;
2143 switch (MCommandGroup->getType()) {
2147 KernelName = instrumentationGetKernelName(
2148 KernelCG->MSyclKernel, MCommandGroup->MFunctionName,
2149 KernelCG->getKernelName(),
MAddress, FromSource);
2156 xpti_td *CmdTraceEvent =
nullptr;
2157 instrumentationFillCommonData(KernelName, MCommandGroup->MFileName,
2158 MCommandGroup->MLine, MCommandGroup->MColumn,
2162 if (CmdTraceEvent) {
2163 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
2169 instrumentationAddExtraKernelMetadata(
2170 CmdTraceEvent, KernelCG->MNDRDesc, KernelCG->getKernelBundle(),
2171 KernelCG->MKernelName, KernelCG->MSyclKernel,
MQueue,
2175 xptiNotifySubscribers(
2176 MStreamID, NotificationTraceType, detail::GSYCLGraphEvent,
2178 static_cast<const void *
>(commandToNodeType(
MType).c_str()));
2184 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#AFFF82\", label=\"";
2186 Stream <<
"ID = " <<
this <<
"\\n";
2189 switch (MCommandGroup->getType()) {
2193 Stream <<
"Kernel name: ";
2194 if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource())
2195 Stream <<
"created from source";
2206 Stream <<
"\"];" << std::endl;
2208 for (
const auto &Dep :
MDeps) {
2209 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
2210 <<
" [ label = \"Access mode: "
2212 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
2240 sycl::info::kernel_device_specific::compile_work_group_size>(
2243 if (WGSize[0] == 0) {
2247 static_cast<int>(NDR.
Dims)};
2268 switch (AccessorMode) {
2270 return UR_MEM_FLAG_READ_ONLY;
2273 return UR_MEM_FLAG_WRITE_ONLY;
2275 return UR_MEM_FLAG_READ_WRITE;
2280 const PluginPtr &Plugin, ur_kernel_handle_t Kernel,
2281 const std::shared_ptr<device_image_impl> &DeviceImageImpl,
2282 const std::function<
void *(
Requirement *Req)> &getMemAllocationFunc,
2284 switch (Arg.
MType) {
2294 ur_mem_handle_t MemArg =
2295 getMemAllocationFunc
2296 ?
reinterpret_cast<ur_mem_handle_t
>(getMemAllocationFunc(Req))
2298 ur_kernel_arg_mem_obj_properties_t MemObjData{};
2299 MemObjData.stype = UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES;
2301 Plugin->call(urKernelSetArgMemObj,
Kernel, NextTrueIndex, &MemObjData,
2307 Plugin->call(urKernelSetArgValue,
Kernel, NextTrueIndex, Arg.
MSize,
2310 Plugin->call(urKernelSetArgLocal,
Kernel, NextTrueIndex, Arg.
MSize,
2317 sampler *SamplerPtr = (sampler *)Arg.
MPtr;
2318 ur_sampler_handle_t Sampler =
2320 ->getOrCreateSampler(Context);
2321 Plugin->call(urKernelSetArgSampler,
Kernel, NextTrueIndex,
nullptr,
2328 const void *Ptr = *
static_cast<const void *
const *
>(Arg.
MPtr);
2329 Plugin->call(urKernelSetArgPointer,
Kernel, NextTrueIndex,
nullptr, Ptr);
2333 assert(DeviceImageImpl !=
nullptr);
2334 ur_mem_handle_t SpecConstsBuffer =
2335 DeviceImageImpl->get_spec_const_buffer_ref();
2337 ur_kernel_arg_mem_obj_properties_t MemObjProps{};
2338 MemObjProps.pNext =
nullptr;
2339 MemObjProps.stype = UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES;
2340 MemObjProps.memoryAccess = UR_MEM_FLAG_READ_ONLY;
2341 Plugin->call(urKernelSetArgMemObj,
Kernel, NextTrueIndex, &MemObjProps,
2347 "Invalid kernel param kind " +
2355 const std::shared_ptr<device_image_impl> &DeviceImageImpl,
2356 ur_kernel_handle_t Kernel,
NDRDescT &NDRDesc,
2357 std::vector<ur_event_handle_t> &RawEvents,
2360 const std::function<
void *(
Requirement *Req)> &getMemAllocationFunc,
2361 bool IsCooperative,
bool KernelUsesClusterLaunch,
2363 assert(Queue &&
"Kernel submissions should have an associated queue");
2364 const PluginPtr &Plugin = Queue->getPlugin();
2367 std::vector<unsigned char> Empty;
2369 Queue, BinImage, KernelName,
2370 DeviceImageImpl.
get() ? DeviceImageImpl->get_spec_const_blob_ref()
2374 auto setFunc = [&Plugin,
Kernel, &DeviceImageImpl, &getMemAllocationFunc,
2377 Queue->get_context(), Arg, NextTrueIndex);
2385 const bool HasLocalSize = (NDRDesc.
LocalSize[0] != 0);
2389 size_t RequiredWGSize[3] = {0, 0, 0};
2390 size_t *LocalSize =
nullptr;
2395 Plugin->call(urKernelGetGroupInfo,
Kernel,
2396 Queue->getDeviceImplPtr()->getHandleRef(),
2397 UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE,
2398 sizeof(RequiredWGSize), RequiredWGSize,
2401 const bool EnforcedLocalSize =
2402 (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
2403 RequiredWGSize[2] != 0);
2404 if (EnforcedLocalSize)
2405 LocalSize = RequiredWGSize;
2407 if (OutEventImpl !=
nullptr)
2408 OutEventImpl->setHostEnqueueTime();
2409 if (KernelUsesClusterLaunch) {
2412 ur_exp_launch_property_value_t launch_property_value_cluster_range;
2413 launch_property_value_cluster_range.clusterDim[0] =
2415 launch_property_value_cluster_range.clusterDim[1] =
2417 launch_property_value_cluster_range.clusterDim[2] =
2420 property_list.push_back({UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION,
2421 launch_property_value_cluster_range});
2423 if (IsCooperative) {
2424 ur_exp_launch_property_value_t launch_property_value_cooperative;
2425 launch_property_value_cooperative.cooperative = 1;
2426 property_list.push_back({UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE,
2427 launch_property_value_cooperative});
2430 return Plugin->call_nocheck(
2431 urEnqueueKernelLaunchCustomExp, Queue->getHandleRef(),
Kernel,
2434 RawEvents.empty() ? nullptr : &RawEvents[0],
2435 OutEventImpl ? &OutEventImpl->getHandleRef() :
nullptr);
2439 if (IsCooperative) {
2440 return Plugin->call_nocheck(urEnqueueCooperativeKernelLaunchExp,
2443 return Plugin->call_nocheck(urEnqueueKernelLaunch, Args...);
2445 &NDRDesc.
GlobalSize[0], LocalSize, RawEvents.size(),
2446 RawEvents.empty() ? nullptr : &RawEvents[0],
2447 OutEventImpl ? &OutEventImpl->getHandleRef() :
nullptr);
2453 ur_exp_command_buffer_handle_t CommandBuffer,
2455 std::vector<ur_exp_command_buffer_sync_point_t> &SyncPoints,
2456 ur_exp_command_buffer_sync_point_t *OutSyncPoint,
2457 ur_exp_command_buffer_command_handle_t *OutCommand,
2458 const std::function<
void *(
Requirement *Req)> &getMemAllocationFunc) {
2461 ur_kernel_handle_t UrKernel =
nullptr;
2462 ur_program_handle_t UrProgram =
nullptr;
2463 std::shared_ptr<kernel_impl> SyclKernelImpl =
nullptr;
2464 std::shared_ptr<device_image_impl> DeviceImageImpl =
nullptr;
2482 UrKernel = SyclKernelImpl->getHandleRef();
2483 DeviceImageImpl = SyclKernelImpl->getDeviceImage();
2484 UrProgram = DeviceImageImpl->get_ur_program_ref();
2485 EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
2486 }
else if (
Kernel !=
nullptr) {
2487 UrKernel =
Kernel->getHandleRef();
2488 UrProgram =
Kernel->getProgramRef();
2489 EliminatedArgMask =
Kernel->getKernelArgMask();
2491 std::tie(UrKernel, std::ignore, EliminatedArgMask, UrProgram) =
2492 sycl::detail::ProgramManager::getInstance().getOrCreateKernel(
2493 ContextImpl, DeviceImpl, CommandGroup.
MKernelName);
2496 auto SetFunc = [&Plugin, &UrKernel, &DeviceImageImpl, &Ctx,
2497 &getMemAllocationFunc](sycl::detail::ArgDesc &Arg,
2498 size_t NextTrueIndex) {
2500 getMemAllocationFunc, Ctx, Arg,
2504 auto Args = CommandGroup.
MArgs;
2511 auto NDRDesc = CommandGroup.
MNDRDesc;
2515 size_t RequiredWGSize[3] = {0, 0, 0};
2516 size_t *LocalSize =
nullptr;
2521 Plugin->call(urKernelGetGroupInfo, UrKernel, DeviceImpl->getHandleRef(),
2522 UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE,
2523 sizeof(RequiredWGSize), RequiredWGSize,
2526 const bool EnforcedLocalSize =
2527 (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
2528 RequiredWGSize[2] != 0);
2529 if (EnforcedLocalSize)
2530 LocalSize = RequiredWGSize;
2533 ur_result_t Res = Plugin->call_nocheck(
2534 urCommandBufferAppendKernelLaunchExp, CommandBuffer, UrKernel,
2536 SyncPoints.size(), SyncPoints.size() ? SyncPoints.data() :
nullptr,
2537 OutSyncPoint, OutCommand);
2539 if (!SyclKernelImpl && !
Kernel) {
2540 Plugin->call(urKernelRelease, UrKernel);
2541 Plugin->call(urProgramRelease, UrProgram);
2544 if (Res != UR_RESULT_SUCCESS) {
2556 const std::shared_ptr<detail::kernel_impl> &MSyclKernel,
2557 const std::string &KernelName, std::vector<ur_event_handle_t> &RawEvents,
2559 const std::function<
void *(
Requirement *Req)> &getMemAllocationFunc,
2560 ur_kernel_cache_config_t KernelCacheConfig,
const bool KernelIsCooperative,
2562 assert(Queue &&
"Kernel submissions should have an associated queue");
2564 auto ContextImpl = Queue->getContextImplPtr();
2565 auto DeviceImpl = Queue->getDeviceImplPtr();
2566 ur_kernel_handle_t
Kernel =
nullptr;
2567 std::mutex *KernelMutex =
nullptr;
2568 ur_program_handle_t Program =
nullptr;
2571 std::shared_ptr<kernel_impl> SyclKernelImpl;
2572 std::shared_ptr<device_image_impl> DeviceImageImpl;
2587 Kernel = SyclKernelImpl->getHandleRef();
2588 DeviceImageImpl = SyclKernelImpl->getDeviceImage();
2590 Program = DeviceImageImpl->get_ur_program_ref();
2592 EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
2593 KernelMutex = SyclKernelImpl->getCacheMutex();
2594 }
else if (
nullptr != MSyclKernel) {
2595 assert(MSyclKernel->get_info<info::kernel::context>() ==
2596 Queue->get_context());
2597 Kernel = MSyclKernel->getHandleRef();
2598 Program = MSyclKernel->getProgramRef();
2606 KernelMutex = &MSyclKernel->getNoncacheableEnqueueMutex();
2607 EliminatedArgMask = MSyclKernel->getKernelArgMask();
2611 ContextImpl, DeviceImpl, KernelName, NDRDesc);
2615 std::vector<ur_event_handle_t> &EventsWaitList = RawEvents;
2618 std::vector<ur_event_handle_t> DeviceGlobalInitEvents =
2619 ContextImpl->initializeDeviceGlobals(Program, Queue);
2620 std::vector<ur_event_handle_t> EventsWithDeviceGlobalInits;
2621 if (!DeviceGlobalInitEvents.empty()) {
2622 EventsWithDeviceGlobalInits.reserve(RawEvents.size() +
2623 DeviceGlobalInitEvents.size());
2624 EventsWithDeviceGlobalInits.insert(EventsWithDeviceGlobalInits.end(),
2625 RawEvents.begin(), RawEvents.end());
2626 EventsWithDeviceGlobalInits.insert(EventsWithDeviceGlobalInits.end(),
2627 DeviceGlobalInitEvents.begin(),
2628 DeviceGlobalInitEvents.end());
2629 EventsWaitList = EventsWithDeviceGlobalInits;
2632 ur_result_t Error = UR_RESULT_SUCCESS;
2637 using LockT = std::unique_lock<std::mutex>;
2638 auto Lock = KernelMutex ? LockT(*KernelMutex) : LockT();
2642 if (KernelCacheConfig == UR_KERNEL_CACHE_CONFIG_LARGE_SLM ||
2643 KernelCacheConfig == UR_KERNEL_CACHE_CONFIG_LARGE_DATA) {
2644 const PluginPtr &Plugin = Queue->getPlugin();
2646 urKernelSetExecInfo,
Kernel, UR_KERNEL_EXEC_INFO_CACHE_CONFIG,
2647 sizeof(ur_kernel_cache_config_t),
nullptr, &KernelCacheConfig);
2651 Queue, Args, DeviceImageImpl,
Kernel, NDRDesc, EventsWaitList,
2652 OutEventImpl, EliminatedArgMask, getMemAllocationFunc,
2653 KernelIsCooperative, KernelUsesClusterLaunch, BinImage, KernelName);
2655 const PluginPtr &Plugin = Queue->getPlugin();
2656 if (!SyclKernelImpl && !MSyclKernel) {
2657 Plugin->call(urKernelRelease,
Kernel);
2658 Plugin->call(urProgramRelease, Program);
2661 if (UR_RESULT_SUCCESS != Error) {
2664 const device_impl &DeviceImpl = *(Queue->getDeviceImplPtr());
2671 const std::string &PipeName,
bool blocking,
2672 void *ptr,
size_t size,
2673 std::vector<ur_event_handle_t> &RawEvents,
2677 "ReadWrite host pipe submissions should have an associated queue");
2681 ur_program_handle_t Program =
nullptr;
2682 device Device = Queue->get_device();
2684 std::optional<ur_program_handle_t> CachedProgram =
2685 ContextImpl->getProgramForHostPipe(Device, hostPipeEntry);
2687 Program = *CachedProgram;
2693 Queue->get_device());
2698 assert(Program &&
"Program for this hostpipe is not compiled.");
2700 const PluginPtr &Plugin = Queue->getPlugin();
2702 ur_queue_handle_t ur_q = Queue->getHandleRef();
2705 auto OutEvent = OutEventImpl ? &OutEventImpl->getHandleRef() :
nullptr;
2706 if (OutEventImpl !=
nullptr)
2707 OutEventImpl->setHostEnqueueTime();
2709 Error = Plugin->call_nocheck(
2710 urEnqueueReadHostPipe, ur_q, Program, PipeName.c_str(), blocking, ptr,
2711 size, RawEvents.size(), RawEvents.empty() ?
nullptr : &RawEvents[0],
2714 Error = Plugin->call_nocheck(
2715 urEnqueueWriteHostPipe, ur_q, Program, PipeName.c_str(), blocking, ptr,
2716 size, RawEvents.size(), RawEvents.empty() ?
nullptr : &RawEvents[0],
2723 ur_result_t ExecCGCommand::enqueueImpCommandBuffer() {
2724 assert(
MQueue &&
"Command buffer enqueue should have an associated queue");
2733 std::vector<ur_event_handle_t> RawEvents =
getUrEvents(EventImpls);
2734 if (!RawEvents.empty()) {
2735 MQueue->getPlugin()->call(urEventWait, RawEvents.size(), &RawEvents[0]);
2743 MQueue->supportsDiscardingPiEvents() &&
2744 MCommandGroup->getRequirements().size() == 0;
2745 ur_event_handle_t *Event = DiscardUrEvent ? nullptr : &
MEvent->getHandleRef();
2746 ur_exp_command_buffer_sync_point_t OutSyncPoint;
2747 ur_exp_command_buffer_command_handle_t OutCommand =
nullptr;
2748 switch (MCommandGroup->getType()) {
2750 CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
2752 auto getMemAllocationFunc = [
this](
Requirement *Req) {
2753 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2754 return AllocaCmd->getMemAllocation();
2759 bool KernelUsesAssert =
2760 !(ExecKernel->MSyclKernel && ExecKernel->MSyclKernel->isInterop()) &&
2762 ExecKernel->MKernelName);
2763 if (KernelUsesAssert) {
2764 Event = &
MEvent->getHandleRef();
2770 getMemAllocationFunc);
2771 MEvent->setSyncPoint(OutSyncPoint);
2772 MEvent->setCommandBufferCommand(OutCommand);
2776 CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
2779 Copy->getLength(), Copy->getDst(),
MSyncPointDeps, &OutSyncPoint);
2780 MEvent->setSyncPoint(OutSyncPoint);
2781 return UR_RESULT_SUCCESS;
2784 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2788 AllocaCommandBase *AllocaCmdSrc = getAllocaForReq(ReqSrc);
2789 AllocaCommandBase *AllocaCmdDst = getAllocaForReq(ReqDst);
2793 AllocaCmdSrc->getSYCLMemObj(), AllocaCmdSrc->getMemAllocation(),
2794 ReqSrc->MDims, ReqSrc->MMemoryRange, ReqSrc->MAccessRange,
2795 ReqSrc->MOffset, ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(),
2796 ReqDst->MDims, ReqDst->MMemoryRange, ReqDst->MAccessRange,
2799 MEvent->setSyncPoint(OutSyncPoint);
2800 return UR_RESULT_SUCCESS;
2803 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2805 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2809 AllocaCmd->getMemAllocation(), Req->MDims, Req->MMemoryRange,
2810 Req->MAccessRange, Req->MOffset, Req->MElemSize, (
char *)Copy->getDst(),
2811 Req->MDims, Req->MAccessRange,
2814 MEvent->setSyncPoint(OutSyncPoint);
2815 return UR_RESULT_SUCCESS;
2818 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2820 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2824 (
char *)Copy->getSrc(), Req->MDims, Req->MAccessRange,
2825 {0, 0, 0}, Req->MElemSize, AllocaCmd->getMemAllocation(),
2826 Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2828 MEvent->setSyncPoint(OutSyncPoint);
2829 return UR_RESULT_SUCCESS;
2832 CGFill *
Fill = (CGFill *)MCommandGroup.get();
2834 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2838 AllocaCmd->getMemAllocation(),
Fill->MPattern.size(),
2839 Fill->MPattern.data(), Req->MDims, Req->MMemoryRange, Req->MAccessRange,
2840 Req->MOffset, Req->MElemSize, std::move(
MSyncPointDeps), &OutSyncPoint);
2841 MEvent->setSyncPoint(OutSyncPoint);
2842 return UR_RESULT_SUCCESS;
2845 CGFillUSM *
Fill = (CGFillUSM *)MCommandGroup.get();
2850 MEvent->setSyncPoint(OutSyncPoint);
2851 return UR_RESULT_SUCCESS;
2854 CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get();
2857 Prefetch->getLength(), std::move(
MSyncPointDeps), &OutSyncPoint);
2858 MEvent->setSyncPoint(OutSyncPoint);
2859 return UR_RESULT_SUCCESS;
2862 CGAdviseUSM *Advise = (CGAdviseUSM *)MCommandGroup.get();
2865 Advise->getLength(), Advise->getAdvice(), std::move(
MSyncPointDeps),
2867 MEvent->setSyncPoint(OutSyncPoint);
2868 return UR_RESULT_SUCCESS;
2873 "CG type not implemented for command buffers.");
2877 ur_result_t ExecCGCommand::enqueueImp() {
2879 return enqueueImpCommandBuffer();
2881 return enqueueImpQueue();
2885 ur_result_t ExecCGCommand::enqueueImpQueue() {
2897 MQueue->supportsDiscardingPiEvents() &&
2898 MCommandGroup->getRequirements().size() == 0;
2900 ur_event_handle_t *Event = DiscardUrEvent ? nullptr : &
MEvent->getHandleRef();
2903 switch (MCommandGroup->getType()) {
2907 "Update host should be handled by the Scheduler. " +
2911 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2913 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2916 AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(),
MQueue,
2917 Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2918 Req->MElemSize, Copy->getDst(),
nullptr, Req->MDims, Req->MAccessRange,
2919 Req->MAccessRange, {0, 0, 0}, Req->MElemSize,
2922 return UR_RESULT_SUCCESS;
2925 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2927 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2930 AllocaCmd->getSYCLMemObj(), Copy->getSrc(),
nullptr, Req->MDims,
2931 Req->MAccessRange, Req->MAccessRange,
2932 {0, 0, 0}, Req->MElemSize, AllocaCmd->getMemAllocation(),
2933 MQueue, Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2934 Req->MElemSize, std::move(RawEvents),
MEvent->getHandleRef(),
MEvent);
2936 return UR_RESULT_SUCCESS;
2939 CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2943 AllocaCommandBase *AllocaCmdSrc = getAllocaForReq(ReqSrc);
2944 AllocaCommandBase *AllocaCmdDst = getAllocaForReq(ReqDst);
2947 AllocaCmdSrc->getSYCLMemObj(), AllocaCmdSrc->getMemAllocation(),
MQueue,
2948 ReqSrc->MDims, ReqSrc->MMemoryRange, ReqSrc->MAccessRange,
2949 ReqSrc->MOffset, ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(),
2950 MQueue, ReqDst->MDims, ReqDst->MMemoryRange, ReqDst->MAccessRange,
2951 ReqDst->MOffset, ReqDst->MElemSize, std::move(RawEvents),
2954 return UR_RESULT_SUCCESS;
2957 CGFill *
Fill = (CGFill *)MCommandGroup.get();
2959 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2962 AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(),
MQueue,
2963 Fill->MPattern.size(),
Fill->MPattern.data(), Req->MDims,
2964 Req->MMemoryRange, Req->MAccessRange, Req->MOffset, Req->MElemSize,
2967 return UR_RESULT_SUCCESS;
2970 assert(
MQueue &&
"Kernel submissions should have an associated queue");
2971 CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
2973 NDRDescT &NDRDesc = ExecKernel->MNDRDesc;
2974 std::vector<ArgDesc> &Args = ExecKernel->MArgs;
2976 auto getMemAllocationFunc = [
this](
Requirement *Req) {
2977 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2980 return AllocaCmd ? AllocaCmd->getMemAllocation() :
nullptr;
2983 const std::shared_ptr<detail::kernel_impl> &SyclKernel =
2984 ExecKernel->MSyclKernel;
2985 const std::string &KernelName = ExecKernel->MKernelName;
2989 bool KernelUsesAssert =
2990 !(SyclKernel && SyclKernel->isInterop()) &&
2992 if (KernelUsesAssert) {
2997 const RTDeviceBinaryImage *BinImage =
nullptr;
3001 assert(BinImage &&
"Failed to obtain a binary image.");
3004 SyclKernel, KernelName, RawEvents, EventImpl,
3005 getMemAllocationFunc, ExecKernel->MKernelCacheConfig,
3006 ExecKernel->MKernelIsCooperative,
3007 ExecKernel->MKernelUsesClusterLaunch, BinImage);
3009 return UR_RESULT_SUCCESS;
3012 CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
3014 Copy->getDst(), std::move(RawEvents), Event,
3017 return UR_RESULT_SUCCESS;
3020 CGFillUSM *
Fill = (CGFillUSM *)MCommandGroup.get();
3022 Fill->getPattern(), std::move(RawEvents), Event,
3025 return UR_RESULT_SUCCESS;
3028 CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get();
3030 Prefetch->getLength(), std::move(RawEvents),
3033 return UR_RESULT_SUCCESS;
3036 CGAdviseUSM *Advise = (CGAdviseUSM *)MCommandGroup.get();
3038 Advise->getAdvice(), std::move(RawEvents), Event,
3041 return UR_RESULT_SUCCESS;
3044 CGCopy2DUSM *Copy = (CGCopy2DUSM *)MCommandGroup.get();
3046 Copy->getDst(), Copy->getDstPitch(),
3047 Copy->getWidth(), Copy->getHeight(),
3048 std::move(RawEvents), Event,
MEvent);
3049 return UR_RESULT_SUCCESS;
3052 CGFill2DUSM *
Fill = (CGFill2DUSM *)MCommandGroup.get();
3054 Fill->getWidth(),
Fill->getHeight(),
3055 Fill->getPattern(), std::move(RawEvents), Event,
3057 return UR_RESULT_SUCCESS;
3060 CGMemset2DUSM *Memset = (CGMemset2DUSM *)MCommandGroup.get();
3062 Memset->getWidth(), Memset->getHeight(),
3063 Memset->getValue(), std::move(RawEvents),
3065 return UR_RESULT_SUCCESS;
3068 CGHostTask *HostTask =
static_cast<CGHostTask *
>(MCommandGroup.get());
3070 for (ArgDesc &Arg : HostTask->MArgs) {
3071 switch (Arg.MType) {
3074 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
3077 Req->MData = AllocaCmd->getMemAllocation();
3082 "Unsupported arg type " +
3087 std::vector<interop_handle::ReqToMem> ReqToMem;
3088 std::vector<ur_mem_handle_t> ReqUrMem;
3090 if (HostTask->MHostTask->isInteropTask()) {
3093 const std::vector<Requirement *> &HandlerReq =
3094 HostTask->getRequirements();
3095 auto ReqToMemConv = [&ReqToMem, &ReqUrMem, HostTask](
Requirement *Req) {
3096 const std::vector<AllocaCommandBase *> &AllocaCmds =
3097 Req->MSYCLMemObj->MRecord->MAllocaCommands;
3099 for (AllocaCommandBase *AllocaCmd : AllocaCmds)
3102 auto MemArg =
reinterpret_cast<ur_mem_handle_t
>(
3103 AllocaCmd->getMemAllocation());
3104 ReqToMem.emplace_back(std::make_pair(Req, MemArg));
3105 ReqUrMem.emplace_back(MemArg);
3111 "Can't get memory object due to no allocation available");
3115 "Can't get memory object due to no allocation available " +
3118 std::for_each(std::begin(HandlerReq), std::end(HandlerReq), ReqToMemConv);
3119 std::sort(std::begin(ReqToMem), std::end(ReqToMem));
3131 return UR_RESULT_SUCCESS;
3134 CGHostTask *HostTask =
static_cast<CGHostTask *
>(MCommandGroup.get());
3136 for (ArgDesc &Arg : HostTask->MArgs) {
3137 switch (Arg.MType) {
3140 AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
3143 Req->MData = AllocaCmd->getMemAllocation();
3148 "Unsupported arg type ");
3152 std::vector<interop_handle::ReqToMem> ReqToMem;
3153 std::vector<ur_mem_handle_t> ReqMems;
3155 if (HostTask->MHostTask->isInteropTask()) {
3158 const std::vector<Requirement *> &HandlerReq =
3159 HostTask->getRequirements();
3160 auto ReqToMemConv = [&ReqToMem, &ReqMems, HostTask](
Requirement *Req) {
3161 const std::vector<AllocaCommandBase *> &AllocaCmds =
3162 Req->MSYCLMemObj->MRecord->MAllocaCommands;
3164 for (AllocaCommandBase *AllocaCmd : AllocaCmds)
3165 if (HostTask->MQueue->getContextImplPtr() ==
3166 AllocaCmd->getQueue()->getContextImplPtr()) {
3167 auto MemArg =
reinterpret_cast<ur_mem_handle_t
>(
3168 AllocaCmd->getMemAllocation());
3169 ReqToMem.emplace_back(std::make_pair(Req, MemArg));
3170 ReqMems.emplace_back(MemArg);
3176 "Can't get memory object due to no allocation available");
3180 "Can't get memory object due to no allocation available ");
3182 std::for_each(std::begin(HandlerReq), std::end(HandlerReq), ReqToMemConv);
3183 std::sort(std::begin(ReqToMem), std::end(ReqToMem));
3186 EnqueueNativeCommandData CustomOpData{
3187 interop_handle{ReqToMem, HostTask->MQueue,
3188 HostTask->MQueue->getDeviceImplPtr(),
3189 HostTask->MQueue->getContextImplPtr()},
3190 HostTask->MHostTask->MInteropTask};
3192 ur_bool_t NativeCommandSupport =
false;
3193 MQueue->getPlugin()->call(
3196 UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP,
3197 sizeof(NativeCommandSupport), &NativeCommandSupport,
nullptr);
3198 assert(NativeCommandSupport &&
"ext_codeplay_enqueue_native_command is not "
3199 "supported on this device");
3200 MQueue->getPlugin()->call(urEnqueueNativeCommandExp,
MQueue->getHandleRef(),
3201 InteropFreeFunc, &CustomOpData, ReqMems.size(),
3202 ReqMems.data(),
nullptr, RawEvents.size(),
3203 RawEvents.data(), Event);
3205 return UR_RESULT_SUCCESS;
3208 assert(
MQueue &&
"Barrier submission should have an associated queue");
3211 MEvent->setHostEnqueueTime();
3212 Plugin->call(urEnqueueEventsWaitWithBarrier,
MQueue->getHandleRef(), 0,
3215 return UR_RESULT_SUCCESS;
3218 assert(
MQueue &&
"Barrier submission should have an associated queue");
3219 CGBarrier *
Barrier =
static_cast<CGBarrier *
>(MCommandGroup.get());
3220 std::vector<detail::EventImplPtr> Events =
Barrier->MEventsWaitWithBarrier;
3222 if (UrEvents.empty()) {
3224 return UR_RESULT_SUCCESS;
3228 MEvent->setHostEnqueueTime();
3229 Plugin->call(urEnqueueEventsWaitWithBarrier,
MQueue->getHandleRef(),
3230 UrEvents.size(), &UrEvents[0], Event);
3232 return UR_RESULT_SUCCESS;
3235 const auto &Plugin =
MQueue->getPlugin();
3239 if (!
MQueue->isInOrder())
3240 Plugin->call(urEnqueueEventsWaitWithBarrier,
MQueue->getHandleRef(),
3244 Plugin->call(urEnqueueTimestampRecordingExp,
MQueue->getHandleRef(),
3249 return UR_RESULT_SUCCESS;
3252 CGCopyToDeviceGlobal *Copy = (CGCopyToDeviceGlobal *)MCommandGroup.get();
3254 Copy->getDeviceGlobalPtr(), Copy->isDeviceImageScoped(),
MQueue,
3255 Copy->getNumBytes(), Copy->getOffset(), Copy->getSrc(),
3256 std::move(RawEvents), Event,
MEvent);
3258 return UR_RESULT_SUCCESS;
3261 CGCopyFromDeviceGlobal *Copy =
3262 (CGCopyFromDeviceGlobal *)MCommandGroup.get();
3264 Copy->getDeviceGlobalPtr(), Copy->isDeviceImageScoped(),
MQueue,
3265 Copy->getNumBytes(), Copy->getOffset(), Copy->getDest(),
3266 std::move(RawEvents), Event,
MEvent);
3268 return UR_RESULT_SUCCESS;
3271 CGReadWriteHostPipe *ExecReadWriteHostPipe =
3272 (CGReadWriteHostPipe *)MCommandGroup.get();
3273 std::string pipeName = ExecReadWriteHostPipe->getPipeName();
3274 void *hostPtr = ExecReadWriteHostPipe->getHostPtr();
3275 size_t typeSize = ExecReadWriteHostPipe->getTypeSize();
3276 bool blocking = ExecReadWriteHostPipe->isBlocking();
3277 bool read = ExecReadWriteHostPipe->isReadHostPipe();
3283 typeSize, RawEvents, EventImpl, read);
3287 "Command buffer submissions should have an associated queue");
3288 CGExecCommandBuffer *CmdBufferCG =
3289 static_cast<CGExecCommandBuffer *
>(MCommandGroup.get());
3291 MEvent->setHostEnqueueTime();
3292 return MQueue->getPlugin()->call_nocheck(
3293 urCommandBufferEnqueueExp, CmdBufferCG->MCommandBuffer,
3294 MQueue->getHandleRef(), RawEvents.size(),
3295 RawEvents.empty() ? nullptr : &RawEvents[0], Event);
3298 CGCopyImage *Copy = (CGCopyImage *)MCommandGroup.get();
3301 MQueue, Copy->getSrc(), Copy->getDst(), Copy->getSrcDesc(),
3302 Copy->getDstDesc(), Copy->getSrcFormat(), Copy->getDstFormat(),
3303 Copy->getCopyFlags(), Copy->getSrcOffset(), Copy->getDstOffset(),
3304 Copy->getCopyExtent(), std::move(RawEvents), Event);
3305 return UR_RESULT_SUCCESS;
3309 "Semaphore wait submissions should have an associated queue");
3310 CGSemaphoreWait *SemWait = (CGSemaphoreWait *)MCommandGroup.get();
3312 auto OptWaitValue = SemWait->getWaitValue();
3313 uint64_t WaitValue = OptWaitValue.has_value() ? OptWaitValue.value() : 0;
3314 Plugin->call(urBindlessImagesWaitExternalSemaphoreExp,
3315 MQueue->getHandleRef(), SemWait->getInteropSemaphoreHandle(),
3316 OptWaitValue.has_value(), WaitValue, 0,
nullptr,
nullptr);
3318 return UR_RESULT_SUCCESS;
3322 "Semaphore signal submissions should have an associated queue");
3323 CGSemaphoreSignal *SemSignal = (CGSemaphoreSignal *)MCommandGroup.get();
3325 auto OptSignalValue = SemSignal->getSignalValue();
3326 uint64_t SignalValue =
3327 OptSignalValue.has_value() ? OptSignalValue.value() : 0;
3328 Plugin->call(urBindlessImagesSignalExternalSemaphoreExp,
3329 MQueue->getHandleRef(), SemSignal->getInteropSemaphoreHandle(),
3330 OptSignalValue.has_value(), SignalValue, 0,
nullptr,
nullptr);
3332 return UR_RESULT_SUCCESS;
3336 "CG type not implemented. " +
3339 return UR_RESULT_ERROR_INVALID_OPERATION;
3366 return MAuxiliaryCommands;
3370 MFusionList.push_back(
Kernel);
3379 ur_result_t KernelFusionCommand::enqueueImp() {
3386 return UR_RESULT_SUCCESS;
3395 "Cannot release the queue attached to the KernelFusionCommand if it "
3402 #ifdef XPTI_ENABLE_INSTRUMENTATION
3403 constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
3404 if (!xptiCheckTraceEnabled(
MStreamID)) {
3412 static unsigned FusionNodeCount = 0;
3413 std::stringstream PayloadStr;
3414 PayloadStr <<
"Fusion command #" << FusionNodeCount++;
3415 xpti::payload_t Payload = xpti::payload_t(PayloadStr.str().c_str());
3417 uint64_t CommandInstanceNo = 0;
3418 xpti_td *CmdTraceEvent =
3419 xptiMakeEvent(
MCommandName.c_str(), &Payload, xpti::trace_graph_event,
3420 xpti_at::active, &CommandInstanceNo);
3423 if (CmdTraceEvent) {
3439 addDeviceMetadata(CmdTraceEvent,
MQueue);
3444 xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
3446 xptiNotifySubscribers(
MStreamID, NotificationTraceType,
3447 detail::GSYCLGraphEvent,
3456 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#AFFF82\", label=\"";
3458 Stream <<
"ID = " <<
this <<
"\\n";
3460 <<
"FUSION LIST: {";
3461 bool Initial =
true;
3462 for (
auto *Cmd : MFusionList) {
3468 if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource()) {
3469 Stream <<
"created from source";
3476 Stream <<
"\"];" << std::endl;
3478 for (
const auto &Dep :
MDeps) {
3479 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
3480 <<
" [ label = \"Access mode: "
3482 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
3490 std::vector<std::shared_ptr<ext::oneapi::experimental::detail::node_impl>>
3495 ur_result_t UpdateCommandBufferCommand::enqueueImp() {
3498 ur_event_handle_t &Event =
MEvent->getHandleRef();
3501 for (
auto &Node : MNodes) {
3502 auto CG =
static_cast<CGExecKernel *
>(Node->MCommandGroup.get());
3503 for (
auto &Arg :
CG->MArgs) {
3508 for (
const DepDesc &Dep :
MDeps) {
3509 Requirement *Req =
static_cast<AccessorImplHost *
>(Arg.MPtr);
3510 if (Dep.MDepRequirement == Req) {
3511 if (Dep.MAllocaCmd) {
3512 Req->
MData = Dep.MAllocaCmd->getMemAllocation();
3515 "No allocation available for accessor when "
3516 "updating command buffer!");
3524 return UR_RESULT_SUCCESS;
3528 Stream <<
"\"" <<
this <<
"\" [style=filled, fillcolor=\"#8d8f29\", label=\"";
3530 Stream <<
"ID = " <<
this <<
"\\n";
3531 Stream <<
"CommandBuffer Command Update"
3534 Stream <<
"\"];" << std::endl;
3536 for (
const auto &Dep :
MDeps) {
3537 Stream <<
" \"" <<
this <<
"\" -> \"" << Dep.MDepCommand <<
"\""
3538 <<
" [ label = \"Access mode: "
3540 <<
"MemObj: " << Dep.MDepRequirement->MSYCLMemObj <<
" \" ]"
The context class represents a SYCL context on which kernel functions may be executed.
detail::SYCLMemObjI * MSYCLMemObj
range< 3 > & MAccessRange
range< 3 > & MMemoryRange
Base class for memory allocation commands.
const Requirement * getRequirement() const final
AllocaCommandBase * MLinkedAllocaCmd
Alloca command linked with current command.
bool readyForCleanup() const final
Returns true iff this command is ready to be submitted for cleanup.
virtual void * getMemAllocation() const =0
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
bool MIsActive
Indicates that current alloca is active one.
bool MIsLeaderAlloca
Indicates that the command owns memory allocation in case of connected alloca command.
AllocaCommandBase(CommandType Type, QueueImplPtr Queue, Requirement Req, AllocaCommandBase *LinkedAllocaCmd, bool IsConst)
bool supportsPostEnqueueCleanup() const final
Returns true iff this command can be freed by post enqueue cleanup.
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
SYCLMemObjI * getSYCLMemObj() const
void printDot(std::ostream &Stream) const final
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
AllocaCommand(QueueImplPtr Queue, Requirement Req, bool InitFromUserData=true, AllocaCommandBase *LinkedAllocaCmd=nullptr, bool IsConst=false)
void * getMemAllocation() const final
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
void printDot(std::ostream &Stream) const final
AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req, AllocaCommandBase *ParentAlloca, std::vector< Command * > &ToEnqueue, std::vector< Command * > &ToCleanUp)
sycl::detail::kernel_param_kind_t MType
"Execute kernel" command group class.
std::shared_ptr< detail::kernel_bundle_impl > MKernelBundle
std::vector< ArgDesc > MArgs
bool MKernelIsCooperative
NDRDescT MNDRDesc
Stores ND-range description.
std::shared_ptr< detail::kernel_impl > MSyclKernel
bool MKernelUsesClusterLaunch
std::shared_ptr< detail::queue_impl > MQueue
Base class for all types of command groups.
The Command class represents some action that needs to be performed on one or more memory objects.
bool MShouldCompleteEventIfPossible
void copySubmissionCodeLocation()
virtual ur_result_t enqueueImp()=0
Private interface. Derived classes should implement this method.
Command * processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, std::vector< Command * > &ToCleanUp)
Perform glueing of events from different contexts.
void * MTraceEvent
The event for node_create and task_begin.
CommandType MType
The type of the command.
virtual bool producesPiEvent() const
Returns true iff the command produces a UR event on non-host devices.
int32_t MStreamID
The stream under which the traces are emitted.
void emitInstrumentation(uint16_t Type, const char *Txt=nullptr)
Emits an event of Type.
virtual void emitInstrumentationData()=0
Instrumentation method which emits telemetry data.
void resolveReleaseDependencies(std::set< Command * > &list)
Looks at all the dependencies for the release command and enables instrumentation to report these dep...
std::string MCommandName
Buffer to build the command end-user understandable name.
bool MMarkedForCleanup
Indicates that the node will be freed by graph cleanup.
unsigned MLeafCounter
Counts the number of memory objects this command is a leaf for.
std::vector< ur_exp_command_buffer_sync_point_t > MSyncPointDeps
List of sync points for submissions to a command buffer.
virtual bool enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, std::vector< Command * > &ToCleanUp)
Checks if the command is enqueued, and calls enqueueImp.
void waitForEvents(QueueImplPtr Queue, std::vector< EventImplPtr > &RawEvents, ur_event_handle_t &Event)
void waitForPreparedHostEvents() const
std::string MSubmissionFileName
Introduces string to handle memory management since code_location struct works with raw char arrays.
std::vector< ur_event_handle_t > getUrEvents(const std::vector< EventImplPtr > &EventImpls) const
Collect UR events from EventImpls and filter out some of them in case of in order queue.
std::mutex MEnqueueMtx
Mutex used to protect enqueueing from race conditions.
void emitInstrumentationDataProxy()
Proxy method which calls emitInstrumentationData.
code_location MSubmissionCodeLocation
Represents code location of command submission to SYCL API, assigned with the valid value only if com...
void emitEnqueuedEventSignal(ur_event_handle_t &UrEventAddr)
Creates a signal event with the enqueued kernel event handle.
void makeTraceEventEpilog()
If prolog has been run, run epilog; this must be guarded by a check for xptiTraceEnabled().
std::vector< EventImplPtr > & MPreparedHostDepsEvents
std::vector< EventImplPtr > & MPreparedDepsEvents
Dependency events prepared for waiting by backend.
std::string MSubmissionFunctionName
uint64_t MInstanceID
Instance ID tracked for the command.
std::vector< DepDesc > MDeps
Contains list of dependencies(edges)
const char * getBlockReason() const
std::vector< ur_event_handle_t > getUrEventsBlocking(const std::vector< EventImplPtr > &EventImpls) const
Collect UR events from EventImpls and filter out some of them in case of in order queue.
virtual bool readyForCleanup() const
Returns true iff this command is ready to be submitted for cleanup.
void addUser(Command *NewUser)
virtual ContextImplPtr getWorkerContext() const
Get the context of the queue this command will be submitted to.
std::atomic< EnqueueResultT::ResultT > MEnqueueStatus
Describes the status of the command.
const EventImplPtr & getEvent() const
ur_exp_command_buffer_handle_t MCommandBuffer
CommandBuffer which will be used to submit to instead of the queue, if set.
uint64_t makeTraceEventProlog(void *MAddress)
Create a trace event of node_create type; this must be guarded by a check for xptiTraceEnabled().
friend class DispatchHostTask
CommandType getType() const
void * MAddress
Reserved for storing the object address such as SPIR-V or memory object address.
std::string MAddressString
Buffer to build the address string.
QueueImplPtr MWorkerQueue
bool MIsBlockable
Indicates whether the command can be blocked from enqueueing.
virtual bool supportsPostEnqueueCleanup() const
Returns true iff this command can be freed by post enqueue cleanup.
bool MTraceEventPrologComplete
Flag to indicate if makeTraceEventProlog() has been run.
std::string MCommandNodeType
Buffer to build the command node type.
void emitEdgeEventForEventDependence(Command *Cmd, ur_event_handle_t &EventAddr)
Creates an edge event when the dependency is an event.
Command * addDep(DepDesc NewDep, std::vector< Command * > &ToCleanUp)
void emitEdgeEventForCommandDependence(Command *Cmd, void *ObjAddr, bool IsCommand, std::optional< access::mode > AccMode=std::nullopt)
Creates an edge event when the dependency is a command.
const QueueImplPtr & getQueue() const
Command(CommandType Type, QueueImplPtr Queue, ur_exp_command_buffer_handle_t CommandBuffer=nullptr, const std::vector< ur_exp_command_buffer_sync_point_t > &SyncPoints={})
It is safe to bind MPreparedDepsEvents and MPreparedHostDepsEvents references to event_impl class mem...
bool MFirstInstance
Flag to indicate if this is the first time we are seeing this payload.
DispatchHostTask(ExecCGCommand *ThisCmd, std::vector< interop_handle::ReqToMem > ReqToMem, std::vector< ur_mem_handle_t > ReqUrMem)
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
void printDot(std::ostream &Stream) const final
void addRequirement(Command *DepCmd, AllocaCommandBase *AllocaCmd, const Requirement *Req)
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
The exec CG command enqueues execution of kernel or explicit memory operation.
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
ExecCGCommand(std::unique_ptr< detail::CG > CommandGroup, QueueImplPtr Queue, bool EventNeeded, ur_exp_command_buffer_handle_t CommandBuffer=nullptr, const std::vector< ur_exp_command_buffer_sync_point_t > &Dependencies={})
bool supportsPostEnqueueCleanup() const final
Returns true iff this command can be freed by post enqueue cleanup.
bool readyForCleanup() const final
Returns true iff this command is ready to be submitted for cleanup.
std::string_view getTypeString() const
detail::CG & getCG() const
void printDot(std::ostream &Stream) const final
std::vector< std::shared_ptr< const void > > getAuxiliaryResources() const
void clearAuxiliaryResources()
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
void TraceEventXPTI(const char *Message)
static GlobalHandler & instance()
void call(HostProfilingInfo *HPI)
void addToFusionList(ExecCGCommand *Kernel)
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
void setFusionStatus(FusionStatus Status)
Set the status of this fusion command to Status.
std::vector< ExecCGCommand * > & getFusionList()
void printDot(std::ostream &Stream) const final
KernelFusionCommand(QueueImplPtr Queue)
void resetQueue()
Reset the queue.
std::vector< Command * > & auxiliaryCommands()
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req, void **DstPtr, QueueImplPtr Queue, access::mode MapMode)
void printDot(std::ostream &Stream) const final
ContextImplPtr getWorkerContext() const final
Get the context of the queue this command will be submitted to.
MemCpyCommandHost(Requirement SrcReq, AllocaCommandBase *SrcAllocaCmd, Requirement DstReq, void **DstPtr, QueueImplPtr SrcQueue, QueueImplPtr DstQueue)
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
void printDot(std::ostream &Stream) const final
ContextImplPtr getWorkerContext() const final
Get the context of the queue this command will be submitted to.
MemCpyCommand(Requirement SrcReq, AllocaCommandBase *SrcAllocaCmd, Requirement DstReq, AllocaCommandBase *DstAllocaCmd, QueueImplPtr SrcQueue, QueueImplPtr DstQueue)
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
void printDot(std::ostream &Stream) const final
static void ext_oneapi_fill_usm_cmd_buffer(sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, void *DstMem, size_t Len, const std::vector< unsigned char > &Pattern, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void ext_oneapi_copyH2D_cmd_buffer(sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, SYCLMemObjI *SYCLMemObj, char *SrcMem, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, void *DstMem, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::range< 3 > DstAccessRange, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void copy_image_bindless(QueueImplPtr Queue, const void *Src, void *Dst, const ur_image_desc_t &SrcDesc, const ur_image_desc_t &DstDesc, const ur_image_format_t &SrcFormat, const ur_image_format_t &DstFormat, const ur_exp_image_copy_flags_t Flags, ur_rect_offset_t SrcOffset, ur_rect_offset_t DstOffset, ur_rect_region_t CopyExtent, const std::vector< ur_event_handle_t > &DepEvents, ur_event_handle_t *OutEvent)
static void fill_2d_usm(void *DstMem, QueueImplPtr Queue, size_t Pitch, size_t Width, size_t Height, const std::vector< unsigned char > &Pattern, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void ext_oneapi_advise_usm_cmd_buffer(sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, const void *Mem, size_t Length, ur_usm_advice_flags_t Advice, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void fill(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue, size_t PatternSize, const unsigned char *Pattern, unsigned int Dim, sycl::range< 3 > Size, sycl::range< 3 > AccessRange, sycl::id< 3 > AccessOffset, unsigned int ElementSize, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t &OutEvent, const detail::EventImplPtr &OutEventImpl)
static void * allocateMemSubBuffer(ContextImplPtr TargetContext, void *ParentMemObj, size_t ElemSize, size_t Offset, range< 3 > Range, std::vector< EventImplPtr > DepEvents, ur_event_handle_t &OutEvent)
static void ext_oneapi_fill_cmd_buffer(sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, SYCLMemObjI *SYCLMemObj, void *Mem, size_t PatternSize, const unsigned char *Pattern, unsigned int Dim, sycl::range< 3 > Size, sycl::range< 3 > AccessRange, sycl::id< 3 > AccessOffset, unsigned int ElementSize, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void memset_2d_usm(void *DstMem, QueueImplPtr Queue, size_t Pitch, size_t Width, size_t Height, char Value, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void ext_oneapi_copyD2H_cmd_buffer(sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, SYCLMemObjI *SYCLMemObj, void *SrcMem, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::range< 3 > SrcAccessRange, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, char *DstMem, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void copy_to_device_global(const void *DeviceGlobalPtr, bool IsDeviceImageScoped, QueueImplPtr Queue, size_t NumBytes, size_t Offset, const void *SrcMem, const std::vector< ur_event_handle_t > &DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void ext_oneapi_copyD2D_cmd_buffer(sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, SYCLMemObjI *SYCLMemObj, void *SrcMem, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::range< 3 > SrcAccessRange, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, void *DstMem, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::range< 3 > DstAccessRange, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void copy_2d_usm(const void *SrcMem, size_t SrcPitch, QueueImplPtr Queue, void *DstMem, size_t DstPitch, size_t Width, size_t Height, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void advise_usm(const void *Ptr, QueueImplPtr Queue, size_t Len, ur_usm_advice_flags_t Advice, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void * map(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue, access::mode AccessMode, unsigned int Dim, sycl::range< 3 > Size, sycl::range< 3 > AccessRange, sycl::id< 3 > AccessOffset, unsigned int ElementSize, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t &OutEvent)
static void copy(SYCLMemObjI *SYCLMemObj, void *SrcMem, QueueImplPtr SrcQueue, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::range< 3 > SrcAccessRange, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, void *DstMem, QueueImplPtr TgtQueue, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::range< 3 > DstAccessRange, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t &OutEvent, const detail::EventImplPtr &OutEventImpl)
static void copy_usm(const void *SrcMem, QueueImplPtr Queue, size_t Len, void *DstMem, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void fill_usm(void *DstMem, QueueImplPtr Queue, size_t Len, const std::vector< unsigned char > &Pattern, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void copy_from_device_global(const void *DeviceGlobalPtr, bool IsDeviceImageScoped, QueueImplPtr Queue, size_t NumBytes, size_t Offset, void *DstMem, const std::vector< ur_event_handle_t > &DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
static void * allocate(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, bool InitFromUserData, void *HostPtr, std::vector< EventImplPtr > DepEvents, ur_event_handle_t &OutEvent)
static void ext_oneapi_copy_usm_cmd_buffer(ContextImplPtr Context, const void *SrcMem, ur_exp_command_buffer_handle_t CommandBuffer, size_t Len, void *DstMem, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void unmap(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue, void *MappedPtr, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t &OutEvent)
static void release(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, void *MemAllocation, std::vector< EventImplPtr > DepEvents, ur_event_handle_t &OutEvent)
static void ext_oneapi_prefetch_usm_cmd_buffer(sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, void *Mem, size_t Length, std::vector< ur_exp_command_buffer_sync_point_t > Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint)
static void prefetch_usm(void *Ptr, QueueImplPtr Queue, size_t Len, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
sycl::range< 3 > GlobalSize
sycl::range< 3 > NumWorkGroups
Number of workgroups, used to record the number of workgroups from the simplest form of parallel_for_...
sycl::id< 3 > GlobalOffset
sycl::range< 3 > LocalSize
sycl::range< 3 > ClusterDimensions
std::tuple< ur_kernel_handle_t, std::mutex *, const KernelArgMask *, ur_program_handle_t > getOrCreateKernel(const ContextImplPtr &ContextImpl, const DeviceImplPtr &DeviceImpl, const std::string &KernelName, const NDRDescT &NDRDesc={})
static ProgramManager & getInstance()
kernel_id getSYCLKernelID(const std::string &KernelName)
device_image_plain getDeviceImageFromBinaryImage(RTDeviceBinaryImage *BinImage, const context &Ctx, const device &Dev)
bool kernelUsesAssert(const std::string &KernelName) const
HostPipeMapEntry * getHostPipeEntry(const std::string &UniqueId)
device_image_plain build(const device_image_plain &DeviceImage, const std::vector< device > &Devs, const property_list &PropList)
sycl_device_binary get() const
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
bool supportsPostEnqueueCleanup() const final
Returns true iff this command can be freed by post enqueue cleanup.
ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd)
void printDot(std::ostream &Stream) const final
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
bool readyForCleanup() const final
Returns true iff this command is ready to be submitted for cleanup.
static const char * get()
virtual MemObjType getType() const =0
Command * connectDepEvent(Command *const Cmd, const EventImplPtr &DepEvent, const DepDesc &Dep, std::vector< Command * > &ToCleanUp)
Perform connection of events in multiple contexts.
void enqueueCommandForCG(EventImplPtr NewEvent, std::vector< Command * > &AuxilaryCmds, BlockingT Blocking=NON_BLOCKING)
GraphBuilder MGraphBuilder
ur_kernel_handle_t completeSpecConstMaterialization(QueueImplPtr Queue, const RTDeviceBinaryImage *BinImage, const std::string &KernelName, std::vector< unsigned char > &SpecConstBlob)
static Scheduler & getInstance()
void NotifyHostTaskCompletion(Command *Cmd)
void printDot(std::ostream &Stream) const final
UnMapMemObject(AllocaCommandBase *DstAllocaCmd, Requirement Req, void **SrcPtr, QueueImplPtr Queue)
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
UpdateCommandBufferCommand(QueueImplPtr Queue, ext::oneapi::experimental::detail::exec_graph_impl *Graph, std::vector< std::shared_ptr< ext::oneapi::experimental::detail::node_impl >> Nodes)
void printDot(std::ostream &Stream) const final
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
bool producesPiEvent() const final
Returns true iff the command produces a UR event on non-host devices.
UpdateHostRequirementCommand(QueueImplPtr Queue, Requirement Req, AllocaCommandBase *SrcAllocaCmd, void **DstPtr)
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
void printDot(std::ostream &Stream) const final
static void bufferAssociateNotification(const void *, const void *)
const PluginPtr & getPlugin() const
ur_device_handle_t & getHandleRef()
Get reference to UR device.
static ThreadPool & getThreadPool()
Data type that manages the code_location information in TLS.
const detail::code_location & query()
Query the information in the TLS slot.
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Class representing the implementation of command_graph<executable>.
void updateImpl(std::shared_ptr< node_impl > NodeImpl)
Objects of the class identify kernel is some kernel_bundle related APIs.
Provides an abstraction of a SYCL kernel.
Defines the iteration domain of both the work-groups and the overall dispatch.
Objects of the property_list class are containers for the SYCL properties.
std::function< void(interop_handle)> func
void handleErrorOrWarning(ur_result_t Error, const device_impl &DeviceImpl, ur_kernel_handle_t Kernel, const NDRDescT &NDRDesc)
Analyzes error code and arguments of urEnqueueKernelLaunch to emit user-friendly exception describing...
void free(void *Ptr, const context &Ctxt, const code_location &CL)
@ kind_specialization_constants_buffer
decltype(Obj::impl) const & getSyclObjImpl(const Obj &SyclObject)
static void flushCrossQueueDeps(const std::vector< EventImplPtr > &EventImpls, const QueueImplPtr &Queue)
static std::string demangleKernelName(std::string Name)
static ur_result_t SetKernelParamsAndLaunch(const QueueImplPtr &Queue, std::vector< ArgDesc > &Args, const std::shared_ptr< device_image_impl > &DeviceImageImpl, ur_kernel_handle_t Kernel, NDRDescT &NDRDesc, std::vector< ur_event_handle_t > &RawEvents, const detail::EventImplPtr &OutEventImpl, const KernelArgMask *EliminatedArgMask, const std::function< void *(Requirement *Req)> &getMemAllocationFunc, bool IsCooperative, bool KernelUsesClusterLaunch, const RTDeviceBinaryImage *BinImage, const std::string &KernelName)
std::vector< bool > KernelArgMask
std::string codeToString(int32_t code)
constexpr const char * SYCL_STREAM_NAME
ur_mem_flags_t AccessModeToUr(access::mode AccessorMode)
static std::string_view cgTypeToString(detail::CGType Type)
void ReverseRangeDimensionsForKernel(NDRDescT &NDR)
void enqueueImpKernel(const QueueImplPtr &Queue, NDRDescT &NDRDesc, std::vector< ArgDesc > &Args, const std::shared_ptr< detail::kernel_bundle_impl > &KernelBundleImplPtr, const std::shared_ptr< detail::kernel_impl > &MSyclKernel, const std::string &KernelName, std::vector< ur_event_handle_t > &RawEvents, const detail::EventImplPtr &OutEventImpl, const std::function< void *(Requirement *Req)> &getMemAllocationFunc, ur_kernel_cache_config_t KernelCacheConfig, const bool KernelIsCooperative, const bool KernelUsesClusterLaunch, const RTDeviceBinaryImage *BinImage)
std::shared_ptr< sycl::detail::context_impl > ContextImplPtr
void SetArgBasedOnType(const PluginPtr &Plugin, ur_kernel_handle_t Kernel, const std::shared_ptr< device_image_impl > &DeviceImageImpl, const std::function< void *(Requirement *Req)> &getMemAllocationFunc, const sycl::context &Context, detail::ArgDesc &Arg, size_t NextTrueIndex)
std::shared_ptr< event_impl > EventImplPtr
std::shared_ptr< plugin > PluginPtr
AccessorImplHost Requirement
ur_result_t enqueueImpCommandBufferKernel(context Ctx, DeviceImplPtr DeviceImpl, ur_exp_command_buffer_handle_t CommandBuffer, const CGExecKernel &CommandGroup, std::vector< ur_exp_command_buffer_sync_point_t > &SyncPoints, ur_exp_command_buffer_sync_point_t *OutSyncPoint, ur_exp_command_buffer_command_handle_t *OutCommand, const std::function< void *(Requirement *Req)> &getMemAllocationFunc)
std::shared_ptr< device_impl > DeviceImplPtr
CGType
Type of the command group.
std::shared_ptr< detail::kernel_bundle_impl > KernelBundleImplPtr
static void adjustNDRangePerKernel(NDRDescT &NDR, ur_kernel_handle_t Kernel, const device_impl &DeviceImpl)
void applyFuncOnFilteredArgs(const KernelArgMask *EliminatedArgMask, std::vector< ArgDesc > &Args, std::function< void(detail::ArgDesc &Arg, int NextTrueIndex)> Func)
ur_result_t enqueueReadWriteHostPipe(const QueueImplPtr &Queue, const std::string &PipeName, bool blocking, void *ptr, size_t size, std::vector< ur_event_handle_t > &RawEvents, const detail::EventImplPtr &OutEventImpl, bool read)
std::shared_ptr< sycl::detail::queue_impl > QueueImplPtr
static ContextImplPtr getContext(const QueueImplPtr &Queue)
std::enable_if< !std::is_same< typename Param::return_type, sycl::range< 3 > >::value, typename Param::return_type >::type get_kernel_device_specific_info(ur_kernel_handle_t Kernel, ur_device_handle_t Device, const PluginPtr &Plugin)
static std::string accessModeToString(access::mode Mode)
std::string queueDeviceToString(const queue_impl *const &Queue)
std::tuple< const RTDeviceBinaryImage *, ur_program_handle_t > retrieveKernelBinary(const QueueImplPtr &, const char *KernelName, CGExecKernel *CGKernel=nullptr)
Function for_each(Group g, Ptr first, Ptr last, Function f)
__DPCPP_SYCL_EXTERNAL _SYCL_EXT_CPLX_INLINE_VISIBILITY std::enable_if_t< is_genfloat< _Tp >::value, _Tp > arg(const complex< _Tp > &__c)
PropertyListT int access::address_space multi_ptr & operator=(multi_ptr &&)=default
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using e and sycl_category()
Dependency between two commands.
const Requirement * MDepRequirement
Requirement for the dependency.
Command * MDepCommand
The actual dependency command.
Result of command enqueueing.
RTDeviceBinaryImage * getDevBinImage()