12 #include <sycl/feature_test.hpp>
13 #if SYCL_EXT_CODEPLAY_KERNEL_FUSION
// NOTE(review): this region is a garbled extraction — the leading integers
// are original source line numbers fused into the text, and interior lines
// are missing. Code left byte-identical; comments only.
// Fragment of a context-equality helper: contexts match when they are the
// same object or both represent the host context.
53 return LHS == RHS || (LHS->is_host() && RHS->is_host());
// Fragment of an access-mode compatibility check: a read requirement is
// only satisfied by an identical current mode; a bare write mode is
// asserted to have been widened to read_write earlier in the pipeline.
64 case access::mode::read:
65 return (Required == Current);
67 assert(
false &&
"Write only access is expected to be mapped as read_write");
68 return (Required == Current || Required == access::mode::discard_write);
70 case access::mode::atomic:
71 case access::mode::discard_write:
72 case access::mode::discard_read_write:
// Fragment of an access-mode combiner: discard_write paired with another
// mode is widened (the resulting mode lines are missing from this view —
// TODO confirm against the full file).
84 if (A == access::mode::discard_write &&
88 if (B == access::mode::discard_write &&
95 Scheduler::GraphBuilder::GraphBuilder() {
// Parses a graph-printing options string (EnvVarCStr is read from an
// environment variable on a line missing from this view) and latches one
// boolean per dump point into MPrintOptionsArray. The "always" token
// enables every dump point at once.
97 std::string GraphPrintOpts(EnvVarCStr);
98 bool EnableAlways = GraphPrintOpts.find(
"always") != std::string::npos;
100 if (GraphPrintOpts.find(
"before_addCG") != std::string::npos ||
102 MPrintOptionsArray[BeforeAddCG] =
true;
103 if (GraphPrintOpts.find(
"after_addCG") != std::string::npos || EnableAlways)
104 MPrintOptionsArray[AfterAddCG] =
true;
105 if (GraphPrintOpts.find(
"before_addCopyBack") != std::string::npos ||
107 MPrintOptionsArray[BeforeAddCopyBack] =
true;
108 if (GraphPrintOpts.find(
"after_addCopyBack") != std::string::npos ||
110 MPrintOptionsArray[AfterAddCopyBack] =
true;
111 if (GraphPrintOpts.find(
"before_addHostAcc") != std::string::npos ||
113 MPrintOptionsArray[BeforeAddHostAcc] =
true;
114 if (GraphPrintOpts.find(
"after_addHostAcc") != std::string::npos ||
116 MPrintOptionsArray[AfterAddHostAcc] =
true;
117 if (GraphPrintOpts.find(
"after_fusionComplete") != std::string::npos ||
119 MPrintOptionsArray[AfterFusionComplete] =
true;
120 if (GraphPrintOpts.find(
"after_fusionCancel") != std::string::npos ||
122 MPrintOptionsArray[AfterFusionCancel] =
true;
// Fragment of a graph-traversal "mark visited" helper: asserts the node is
// non-null and records it in Visited (the mark-setting lines are missing
// from this view).
127 assert(Cmd &&
"Cmd can't be nullptr");
131 Visited.push_back(Cmd);
137 Cmd->MMarks.MVisited =
false;
// Fragment of post-traversal disposal: nodes flagged MToBeDeleted are
// detached — FUSION commands get extra handling, and the command pointer is
// cleared from the event so it does not dangle — then visited marks reset.
142 if (Cmd->MMarks.MToBeDeleted) {
143 if (Cmd->getType() == Command::FUSION &&
151 Cmd->getEvent()->setCommand(
nullptr);
154 Cmd->MMarks.MVisited =
false;
// Signature fragment of another traversal helper taking the visited list
// and a root command.
159 std::vector<Command *> &Visited,
Command *Cmd) {
169 void Scheduler::GraphBuilder::printGraphAsDot(
const char *ModeName) {
// Dumps the current command graph in Graphviz "dot" format to a file named
// graph_<counter><mode>.dot. Counter is static so successive dumps produce
// distinct files. (Interior lines are missing from this garbled view.)
170 static size_t Counter = 0;
171 std::string ModeNameStr(ModeName);
172 std::string FileName =
173 "graph_" + std::to_string(Counter) + ModeNameStr +
".dot";
177 std::fstream Stream(FileName, std::ios::out);
178 Stream <<
"strict digraph {" << std::endl;
180 MVisitedCmds.clear();
// Walk every alloca command of every tracked memory object to emit the
// reachable sub-graph, then close the digraph.
182 for (SYCLMemObjI *MemObject : MMemObjs)
183 for (Command *AllocaCmd : MemObject->MRecord->MAllocaCommands)
186 Stream <<
"}" << std::endl;
// Fragment: accessor returning the MemObjRecord owned by the memory object.
192 return MemObject->
MRecord.get();
// getOrInsertMemObjRecord (fragment): returns the existing record for the
// memory object or creates one, installing a dependency-allocation hook.
197 std::vector<Command *> &ToEnqueue) {
201 if (
nullptr != Record)
204 const size_t LeafLimit = 8;
// Hook body (presumably AllocateDependency — confirm against full file):
// wires a dependant onto the record's dependency, collects any synthesized
// connection command, and decrements the leaf counter, queueing commands
// that became eligible for cleanup.
210 DepDesc Dep = findDepForRecord(Dependant, Record);
212 std::vector<Command *> ToCleanUp;
215 ToEnqueue.push_back(ConnectionCmd);
217 --(Dependency->MLeafCounter);
218 if (Dependency->readyForCleanup())
219 ToCleanUp.push_back(Dependency);
// Interop path: the record is created against the interop context; the
// context is asserted to expose at least one device.
230 std::vector<sycl::device> Devices =
231 InteropCtxPtr->get_info<info::context::devices>();
232 assert(Devices.size() != 0);
239 Dev, InteropCtxPtr, {}, {}}};
242 new MemObjRecord{InteropCtxPtr, LeafLimit, AllocateDependency});
243 getOrCreateAllocaForReq(MemObject->
MRecord.get(), Req, InteropQueuePtr,
247 LeafLimit, AllocateDependency});
249 MMemObjs.push_back(MemObject);
250 return MemObject->
MRecord.get();
253 void Scheduler::GraphBuilder::updateLeaves(
const std::set<Command *> &Cmds,
256 std::vector<Command *> &ToCleanUp) {
// Removes Cmds from the record's leaf lists; a read-only requirement only
// needs to touch the read leaves. Commands that stop being leaves and
// report readyForCleanup() are returned via ToCleanUp.
258 const bool ReadOnlyReq =
AccessMode == access::mode::read;
263 bool WasLeaf = Cmd->MLeafCounter > 0;
266 if (WasLeaf && Cmd->readyForCleanup()) {
267 ToCleanUp.push_back(Cmd);
// addNodeToLeaves (fragment): inserts Cmd into the appropriate leaf list;
// push_back may displace an old leaf, reported through ToEnqueue.
272 void Scheduler::GraphBuilder::addNodeToLeaves(
274 std::vector<Command *> &ToEnqueue) {
278 if (Leaves.push_back(Cmd, ToEnqueue))
// insertUpdateHostReqCmd (fragment): builds an update-host-requirement
// command for Req against the record's existing allocation, depending on
// every command that currently touches the requirement.
284 std::vector<Command *> &ToEnqueue) {
286 findAllocaForReq(Record, Req, Queue->getContextImplPtr());
287 assert(AllocaCmd &&
"There must be alloca for requirement!");
294 std::set<Command *> Deps =
295 findDepsForReq(Record, Req, Queue->getContextImplPtr());
296 std::vector<Command *> ToCleanUp;
299 UpdateCommand->
addDep(
DepDesc{Dep, StoredReq, AllocaCmd}, ToCleanUp);
// addDep may synthesize a connection command for a cross-context edge; it
// must be enqueued too.
301 ToEnqueue.push_back(ConnCmd);
303 updateLeaves(Deps, Record, Req->
MAccessMode, ToCleanUp);
304 addNodeToLeaves(Record, UpdateCommand, Req->
MAccessMode, ToEnqueue);
305 for (Command *Cmd : ToCleanUp)
307 return UpdateCommand;
// Fragment of the map/unmap path for linked allocations: the two alloca
// commands must be linked and the source active; behavior then branches on
// whether the source allocation lives on the host queue.
316 "Expected linked alloca commands");
318 "Expected source alloca command to be active");
320 if (AllocaCmdSrc->
getQueue()->is_host()) {
339 Command *Scheduler::GraphBuilder::insertMemoryMove(
341 std::vector<Command *> &ToEnqueue) {
// Moves the memory-object contents required by Req into Queue's context.
// (Interior lines are missing from this garbled view; leading integers are
// original source line numbers.)
343 AllocaCommandBase *AllocaCmdDst =
344 getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue);
346 throw runtime_error(
"Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
348 std::set<Command *> Deps =
349 findDepsForReq(Record, Req, Queue->getContextImplPtr());
350 Deps.insert(AllocaCmdDst);
// A sub-buffer destination is resolved to its parent allocation so the
// copy is wired against the real storage.
353 if (AllocaCmdDst->getType() == Command::CommandType::ALLOCA_SUB_BUF)
355 static_cast<AllocaSubBufCommand *
>(AllocaCmdDst)->getParentAlloca();
358 AllocaCommandBase *AllocaCmdSrc =
359 findAllocaForReq(Record, Req, Record->MCurContext);
// Fallback source search: any full (non-sub-buffer) allocation in the
// record's current context is acceptable.
364 auto IsSuitableAlloca = [Record](AllocaCommandBase *AllocaCmd) {
366 Record->MCurContext) &&
368 AllocaCmd->
getType() == Command::CommandType::ALLOCA;
372 std::find_if(Record->MAllocaCommands.begin(),
373 Record->MAllocaCommands.end(), IsSuitableAlloca);
374 AllocaCmdSrc = (Record->MAllocaCommands.end() != It) ? *It :
nullptr;
377 throw runtime_error(
"Cannot find buffer allocation",
378 PI_ERROR_INVALID_VALUE);
381 if (AllocaCmdSrc->getType() == Command::CommandType::ALLOCA_SUB_BUF)
383 static_cast<AllocaSubBufCommand *
>(AllocaCmdSrc)->getParentAlloca();
384 else if (AllocaCmdSrc->getSYCLMemObj() != Req->MSYCLMemObj)
385 assert(
false &&
"Inappropriate alloca command.");
388 Command *NewCmd =
nullptr;
// Linked allocations share storage: no data copy needed, only a host
// access-state update on the record.
390 if (AllocaCmdSrc->MLinkedAllocaCmd == AllocaCmdDst) {
396 Record->MHostAccess = MapMode;
// Discard access modes make the old contents dead, so the record can adopt
// the destination context without copying.
399 if ((Req->MAccessMode == access::mode::discard_write) ||
400 (Req->MAccessMode == access::mode::discard_read_write)) {
401 Record->MCurContext = Queue->getContextImplPtr();
407 new MemCpyCommand(*AllocaCmdSrc->getRequirement(), AllocaCmdSrc,
408 *AllocaCmdDst->getRequirement(), AllocaCmdDst,
409 AllocaCmdSrc->getQueue(), AllocaCmdDst->getQueue());
// The copy depends on everything currently touching the requirement; any
// synthesized connection commands also go to ToEnqueue.
412 std::vector<Command *> ToCleanUp;
413 for (Command *Dep : Deps) {
414 Command *ConnCmd = NewCmd->
addDep(
415 DepDesc{Dep, NewCmd->getRequirement(), AllocaCmdDst}, ToCleanUp);
417 ToEnqueue.push_back(ConnCmd);
421 for (Command *Cmd : ToCleanUp)
423 Record->MCurContext = Queue->getContextImplPtr();
427 Command *Scheduler::GraphBuilder::remapMemoryObject(
428 MemObjRecord *Record,
Requirement *Req, AllocaCommandBase *HostAllocaCmd,
429 std::vector<Command *> &ToEnqueue) {
// Re-maps a host allocation that is linked to a device allocation: unmap
// first, then map again with the access mode required by Req.
430 assert(HostAllocaCmd->getQueue()->is_host() &&
431 "Host alloca command expected");
432 assert(HostAllocaCmd->MIsActive &&
"Active alloca command expected");
434 AllocaCommandBase *LinkedAllocaCmd = HostAllocaCmd->MLinkedAllocaCmd;
435 assert(LinkedAllocaCmd &&
"Linked alloca command expected");
437 std::set<Command *> Deps = findDepsForReq(Record, Req, Record->MCurContext);
439 UnMapMemObject *UnMapCmd =
new UnMapMemObject(
440 LinkedAllocaCmd, *LinkedAllocaCmd->getRequirement(),
441 &HostAllocaCmd->MMemAllocation, LinkedAllocaCmd->getQueue());
447 MapMemObject *MapCmd =
new MapMemObject(
448 LinkedAllocaCmd, *LinkedAllocaCmd->getRequirement(),
449 &HostAllocaCmd->MMemAllocation, LinkedAllocaCmd->getQueue(), MapMode);
// The unmap depends on every command currently touching the requirement;
// the map then depends on the unmap. Connection commands are enqueued.
451 std::vector<Command *> ToCleanUp;
452 for (Command *Dep : Deps) {
453 Command *ConnCmd = UnMapCmd->addDep(
454 DepDesc{Dep, UnMapCmd->getRequirement(), LinkedAllocaCmd}, ToCleanUp);
456 ToEnqueue.push_back(ConnCmd);
459 Command *ConnCmd = MapCmd->addDep(
460 DepDesc{UnMapCmd, MapCmd->getRequirement(), HostAllocaCmd}, ToCleanUp);
462 ToEnqueue.push_back(ConnCmd);
466 for (Command *Cmd : ToCleanUp)
468 Record->MHostAccess = MapMode;
// addCopyBack (fragment): copies the latest contents of the requirement's
// memory object back to the host (used e.g. at buffer destruction). Early
// no-op checks are on lines missing from this view.
476 std::vector<Command *> &ToEnqueue) {
477 QueueImplPtr HostQueue = Scheduler::getInstance().getDefaultHostQueue();
480 if (Record && MPrintOptionsArray[BeforeAddCopyBack])
481 printGraphAsDot(
"before_addCopyBack");
487 std::set<Command *> Deps =
488 findDepsForReq(Record, Req, HostQueue->getContextImplPtr());
490 findAllocaForReq(Record, Req, Record->
MCurContext);
492 auto MemCpyCmdUniquePtr = std::make_unique<MemCpyCommandHost>(
494 SrcAllocaCmd->
getQueue(), std::move(HostQueue));
496 if (!MemCpyCmdUniquePtr)
497 throw runtime_error(
"Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
501 std::vector<Command *> ToCleanUp;
506 ToEnqueue.push_back(ConnCmd);
// The copy-back replaces the dependency set in the leaf lists.
509 updateLeaves(Deps, Record, Req->
MAccessMode, ToCleanUp);
510 addNodeToLeaves(Record, MemCpyCmd, Req->
MAccessMode, ToEnqueue);
513 if (MPrintOptionsArray[AfterAddCopyBack])
514 printGraphAsDot(
"after_addCopyBack");
// addHostAccessor (fragment): makes the memory object's current contents
// visible to a host accessor and blocks subsequent commands until the host
// access finishes.
522 std::vector<Command *> &ToEnqueue) {
524 const QueueImplPtr &HostQueue = getInstance().getDefaultHostQueue();
526 MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req, ToEnqueue);
527 if (MPrintOptionsArray[BeforeAddHostAcc])
528 printGraphAsDot(
"before_addHostAccessor");
529 markModifiedIfWrite(Record, Req);
532 getOrCreateAllocaForReq(Record, Req, HostQueue, ToEnqueue);
// Either remap an existing linked host allocation or move the data into
// the host context, then refresh the stored host requirement.
537 remapMemoryObject(Record, Req, HostAllocaCmd, ToEnqueue);
539 insertMemoryMove(Record, Req, HostQueue, ToEnqueue);
542 insertUpdateHostReqCmd(Record, Req, HostQueue, ToEnqueue);
// A blocked empty command prevents later commands from overtaking the
// host accessor.
546 addEmptyCmd(UpdateHostAccCmd, {Req}, HostQueue,
547 Command::BlockReason::HostAccessor, ToEnqueue);
551 if (MPrintOptionsArray[AfterAddHostAcc])
552 printGraphAsDot(
"after_addHostAccessor");
554 return UpdateHostAccCmd;
557 Command *Scheduler::GraphBuilder::addCGUpdateHost(
558 std::unique_ptr<detail::CG> CommandGroup,
const QueueImplPtr &HostQueue,
559 std::vector<Command *> &ToEnqueue) {
// Handles an update-host command group by moving the requirement's data
// into the host context; the resulting memory-move command is returned.
561 auto UpdateHost =
static_cast<CGUpdateHost *
>(CommandGroup.get());
564 MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req, ToEnqueue);
565 return insertMemoryMove(Record, Req, HostQueue, ToEnqueue);
// findDepsForReq (fragment): traverses the graph from the record's leaves,
// collecting the set of commands the new requirement must depend on.
577 Scheduler::GraphBuilder::findDepsForReq(
MemObjRecord *Record,
580 std::set<Command *> RetDeps;
581 std::vector<Command *> Visited;
582 const bool ReadOnlyReq = Req->
MAccessMode == access::mode::read;
589 ToAnalyze.insert(ToAnalyze.begin(), V.begin(), V.end());
592 while (!ToAnalyze.empty()) {
593 Command *DepCmd = ToAnalyze.back();
594 ToAnalyze.pop_back();
596 std::vector<Command *> NewAnalyze;
598 for (
const DepDesc &Dep : DepCmd->MDeps) {
// Only dependencies on the same memory object are relevant.
599 if (Dep.MDepRequirement->MSYCLMemObj != Req->
MSYCLMemObj)
// An edge can be bypassed when both sides only read, or when the accessed
// sub-ranges do not overlap.
602 bool CanBypassDep =
false;
605 Dep.MDepRequirement->MAccessMode == access::mode::read && ReadOnlyReq;
608 CanBypassDep |= !
doOverlap(Dep.MDepRequirement, Req);
613 sameCtx(Context, Dep.MDepCommand->getQueue()->getContextImplPtr());
616 RetDeps.insert(DepCmd);
624 NewAnalyze.push_back(Dep.MDepCommand);
626 ToAnalyze.insert(ToAnalyze.end(), NewAnalyze.begin(), NewAnalyze.end());
// findDepForRecord (fragment): a leaf with no dependency on its record
// violates a graph invariant; unreachable in a correct graph.
641 assert(
false &&
"No dependency found for a leaf of the record");
642 return {
nullptr,
nullptr,
nullptr};
// findAllocaForReq (fragment): predicate matching an existing alloca
// command by context, sub-buffer kind/size, and const-ness compatibility.
650 auto IsSuitableAlloca = [&Context, Req,
652 bool Res =
sameCtx(AllocaCmd->
getQueue()->getContextImplPtr(), Context);
655 Res &= AllocaCmd->
getType() == Command::CommandType::ALLOCA_SUB_BUF;
658 Req->MSYCLMemObj->getSizeInBytes();
// A const allocation only matches when the caller allows it.
659 Res &= AllowConst || !AllocaCmd->
MIsConst;
// Host-unified-memory probe (fragment): an env override ("0"/"1") forces
// the answer; otherwise every device in the context must report
// host_unified_memory.
670 if (std::strcmp(HUMConfig,
"0") == 0)
671 return Ctx->is_host();
672 if (std::strcmp(HUMConfig,
"1") == 0)
675 for (
const device &Device : Ctx->getDevices()) {
676 if (!Device.get_info<info::device::host_unified_memory>())
686 AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
688 std::vector<Command *> &ToEnqueue) {
// Returns an allocation usable for Req in Queue's context, creating one if
// needed — possibly linked with an allocation in another context so both
// share storage. (Interior lines are missing from this garbled view.)
690 AllocaCommandBase *AllocaCmd = findAllocaForReq(
691 Record, Req, Queue->getContextImplPtr(),
false);
694 std::vector<Command *> ToCleanUp;
// Sub-buffer case: allocate the parent range first, then carve the
// sub-buffer allocation out of it.
698 range<3> ParentRange{Req->MSYCLMemObj->getSizeInBytes(), 1, 1};
699 Requirement ParentRequirement( {0, 0, 0}, ParentRange,
705 getOrCreateAllocaForReq(Record, &ParentRequirement, Queue, ToEnqueue);
706 AllocaCmd =
new AllocaSubBufCommand(Queue, *Req, ParentAlloca, ToEnqueue,
// Full-buffer case: the allocation always covers the whole memory range.
710 const Requirement FullReq( {0, 0, 0}, Req->MMemoryRange,
712 Req->MSYCLMemObj, Req->MDims, Req->MElemSize,
721 const bool HostUnifiedMemory =
725 auto *MemObj =
static_cast<SYCLMemObjT *
>(Req->MSYCLMemObj);
726 const bool InitFromUserData = Record->MAllocaCommands.empty() &&
727 (HostUnifiedMemory || MemObj->isInterop());
728 AllocaCommandBase *LinkedAllocaCmd =
nullptr;
// First allocation for this record on a non-HUM device while existing data
// must be preserved: pre-create a host allocation around the user pointer
// so the subsequent copy has a source.
733 if (Record->MAllocaCommands.empty()) {
734 if (!HostUnifiedMemory &&
735 Req->MAccessMode != access::mode::discard_write &&
736 Req->MAccessMode != access::mode::discard_read_write) {
739 if (MemObj->hasUserDataPtr()) {
741 Scheduler::getInstance().getDefaultHostQueue();
742 AllocaCommand *HostAllocaCmd =
new AllocaCommand(
743 DefaultHostQueue, FullReq,
true ,
745 MemObj->isHostPointerReadOnly() );
746 Record->MAllocaCommands.push_back(HostAllocaCmd);
747 Record->MWriteLeaves.push_back(HostAllocaCmd, ToEnqueue);
748 ++(HostAllocaCmd->MLeafCounter);
749 Record->MCurContext = DefaultHostQueue->getContextImplPtr();
756 if (Req->MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer)
// Decide whether to link the new allocation with one in the record's
// current context (shared storage): pinned host memory or host-unified
// memory on a non-host device allows it.
761 if (Queue->is_host() != Record->MCurContext->is_host()) {
770 bool PinnedHostMemory = MemObj->has_property<
771 sycl::ext::oneapi::property::buffer::use_pinned_host_memory>();
773 bool HostUnifiedMemoryOnNonHostDevice =
776 if (PinnedHostMemory || HostUnifiedMemoryOnNonHostDevice) {
777 AllocaCommandBase *LinkedAllocaCmdCand = findAllocaForReq(
778 Record, Req, Record->MCurContext,
false);
// A candidate already linked elsewhere cannot be linked again.
781 if (LinkedAllocaCmdCand &&
782 !LinkedAllocaCmdCand->MLinkedAllocaCmd) {
783 LinkedAllocaCmd = LinkedAllocaCmdCand;
790 new AllocaCommand(Queue, FullReq, InitFromUserData, LinkedAllocaCmd);
// Wire the link both ways and order the release commands so the shared
// storage is not freed while the other side still uses it.
793 if (LinkedAllocaCmd) {
794 Command *ConnCmd = AllocaCmd->addDep(
795 DepDesc{LinkedAllocaCmd, AllocaCmd->getRequirement(),
799 ToEnqueue.push_back(ConnCmd);
800 LinkedAllocaCmd->MLinkedAllocaCmd = AllocaCmd;
803 ConnCmd = AllocaCmd->getReleaseCmd()->addDep(
804 DepDesc(LinkedAllocaCmd->getReleaseCmd(),
805 AllocaCmd->getRequirement(), LinkedAllocaCmd),
808 ToEnqueue.push_back(ConnCmd);
// Exactly one of a linked pair is active at a time; when allocating from a
// host queue the new host side starts inactive, otherwise the old side is
// deactivated and the record adopts the new context.
814 if (Queue->is_host()) {
815 AllocaCmd->MIsActive =
false;
817 LinkedAllocaCmd->MIsActive =
false;
818 Record->MCurContext = Queue->getContextImplPtr();
820 std::set<Command *> Deps =
821 findDepsForReq(Record, Req, Queue->getContextImplPtr());
822 for (Command *Dep : Deps) {
823 Command *ConnCmd = AllocaCmd->addDep(
824 DepDesc{Dep, Req, LinkedAllocaCmd}, ToCleanUp);
826 ToEnqueue.push_back(ConnCmd);
828 updateLeaves(Deps, Record, Req->MAccessMode, ToCleanUp);
829 addNodeToLeaves(Record, AllocaCmd, Req->MAccessMode, ToEnqueue);
// Register the new allocation as a write leaf of the record.
834 Record->MAllocaCommands.push_back(AllocaCmd);
835 Record->MWriteLeaves.push_back(AllocaCmd, ToEnqueue);
836 ++(AllocaCmd->MLeafCounter);
837 for (Command *Cmd : ToCleanUp)
844 void Scheduler::GraphBuilder::markModifiedIfWrite(MemObjRecord *Record,
// Flags the record as modified for any writing access mode; a pure read
// leaves MMemModified untouched.
846 switch (Req->MAccessMode) {
849 case access::mode::discard_write:
850 case access::mode::discard_read_write:
851 case access::mode::atomic:
852 Record->MMemModified =
true;
854 case access::mode::read:
859 EmptyCommand *Scheduler::GraphBuilder::addEmptyCmd(
860 Command *Cmd,
const std::vector<Requirement *> &Reqs,
862 std::vector<Command *> &ToEnqueue,
const bool AddDepsToLeaves) {
// Creates a blocked EmptyCommand attached to Cmd: it adopts the given
// requirements and (optionally) replaces Cmd in the leaf lists so later
// commands must wait behind the blocker.
863 EmptyCommand *EmptyCmd =
864 new EmptyCommand(Scheduler::getInstance().getDefaultHostQueue());
867 throw runtime_error(
"Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
869 EmptyCmd->MIsBlockable =
true;
870 EmptyCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueBlocked;
871 EmptyCmd->MBlockReason = Reason;
874 MemObjRecord *Record = getOrInsertMemObjRecord(Queue, Req, ToEnqueue);
875 AllocaCommandBase *AllocaCmd =
876 getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue);
877 EmptyCmd->addRequirement(Cmd, AllocaCmd, Req);
882 Cmd->addUser(EmptyCmd);
884 if (AddDepsToLeaves) {
885 const std::vector<DepDesc> &Deps = Cmd->MDeps;
886 std::vector<Command *> ToCleanUp;
887 for (
const DepDesc &Dep : Deps) {
889 MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj);
// Swap Cmd out and the blocker in on each record's leaf lists.
891 updateLeaves({Cmd}, Record, Req->MAccessMode, ToCleanUp);
892 addNodeToLeaves(Record, EmptyCmd, Req->MAccessMode, ToEnqueue);
894 for (Command *Cmd : ToCleanUp)
// Fragment of a host-task check on the command's command-group type.
902 if (Cmd->
getCG().
getType() != CG::CGTYPE::CodeplayHostTask)
// Access-mode combining (fragment): when several requirements of one
// command group target the same memory object, their modes are folded so a
// single combined mode is applied per object.
912 std::unordered_map<SYCLMemObjI *, access::mode> CombinedModes;
913 bool HasDuplicateMemObjects =
false;
915 auto Result = CombinedModes.insert(
916 std::make_pair(Req->MSYCLMemObj, Req->MAccessMode));
917 if (!Result.second) {
918 Result.first->second =
920 HasDuplicateMemObjects =
true;
// Fast path: nothing to rewrite when every object appears only once.
924 if (!HasDuplicateMemObjects)
927 Req->MAccessMode = CombinedModes[Req->MSYCLMemObj];
931 Scheduler::GraphBuildResult
932 Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
934 std::vector<Command *> &ToEnqueue) {
// Adds a command group to the graph. When the queue is in fusion mode (and
// the command is not a host task) the new command joins the fusion list
// instead of being enqueued immediately.
935 std::vector<Requirement *> &Reqs = CommandGroup->MRequirements;
936 std::vector<detail::EventImplPtr> &Events = CommandGroup->MEvents;
938 auto NewCmd = std::make_unique<ExecCGCommand>(std::move(CommandGroup), Queue);
940 throw runtime_error(
"Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
946 auto QUniqueID = std::hash<QueueImplPtr>()(Queue);
947 if (isInFusionMode(QUniqueID) && !NewCmd->isHostTask()) {
948 auto *FusionCmd = findFusionList(QUniqueID)->second.get();
950 bool dependsOnFusion =
false;
951 for (
auto Ev = Events.begin(); Ev != Events.end();) {
952 auto *EvDepCmd =
static_cast<Command *
>((*Ev)->getCommand());
// A dependency on another queue's active fusion forces that fusion to be
// cancelled to avoid a cross-fusion cycle.
958 if (EvDepCmd->getQueue() != Queue && isPartOfActiveFusion(EvDepCmd)) {
959 printFusionWarning(
"Aborting fusion because of event dependency from a "
961 cancelFusion(EvDepCmd->getQueue(), ToEnqueue);
// A dependency on this queue's own placeholder is replaced below by
// dependencies on the individual fused commands.
965 if (EvDepCmd == FusionCmd) {
966 Ev = Events.erase(Ev);
967 dependsOnFusion =
true;
977 if (dependsOnFusion) {
978 for (
auto *Cmd : FusionCmd->getFusionList()) {
979 Events.push_back(Cmd->getEvent());
986 createGraphForCommand(NewCmd.get(), NewCmd->getCG(),
988 FusionCmd->auxiliaryCommands());
992 FusionCmd->addToFusionList(NewCmd.get());
993 NewCmd->MFusionCmd = FusionCmd;
994 std::vector<Command *> ToCleanUp;
// The placeholder depends on each fused command, so waiting on the fusion
// event waits for all of them.
997 auto ConnectionCmd = FusionCmd->addDep(NewCmd->getEvent(), ToCleanUp);
999 FusionCmd->auxiliaryCommands().push_back(ConnectionCmd);
1001 return {NewCmd.release(), FusionCmd->getEvent(),
false};
// Non-fusion path: build the graph and hand back the command's own event.
1003 createGraphForCommand(NewCmd.get(), NewCmd->getCG(),
1006 auto Event = NewCmd->getEvent();
1007 return {NewCmd.release(), Event,
true};
1010 void Scheduler::GraphBuilder::createGraphForCommand(
1012 std::vector<Requirement *> &Reqs,
1013 const std::vector<detail::EventImplPtr> &Events,
QueueImplPtr Queue,
1014 std::vector<Command *> &ToEnqueue) {
// Builds NewCmd's dependency edges: per-requirement allocation and data
// movement, leaf bookkeeping, then explicit event dependencies.
1016 if (MPrintOptionsArray[BeforeAddCG])
1017 printGraphAsDot(
"before_addCG");
1023 std::vector<Command *> ToCleanUp;
1028 bool isSameCtx =
false;
1034 Record = getOrInsertMemObjRecord(QueueForAlloca, Req, ToEnqueue);
1035 markModifiedIfWrite(Record, Req);
1038 getOrCreateAllocaForReq(Record, Req, QueueForAlloca, ToEnqueue);
1041 sameCtx(QueueForAlloca->getContextImplPtr(), Record->MCurContext);
// Same context: a host access-mode change may still require remapping the
// linked host allocation.
1051 remapMemoryObject(Record, Req, AllocaCmd, ToEnqueue);
// Different context: data must move. Host tasks target the host-task
// queue; a device-to-device transition is routed through the host first.
1055 bool NeedMemMoveToHost =
false;
1056 auto MemMoveTargetQueue = Queue;
1058 if (isInteropTask) {
1062 NeedMemMoveToHost =
true;
1063 MemMoveTargetQueue = HT.
MQueue;
1065 }
else if (!Queue->is_host() && !Record->
MCurContext->is_host())
1066 NeedMemMoveToHost =
true;
1068 if (NeedMemMoveToHost)
1069 insertMemoryMove(Record, Req,
1070 Scheduler::getInstance().getDefaultHostQueue(),
1072 insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue);
1074 std::set<Command *> Deps =
1075 findDepsForReq(Record, Req, Queue->getContextImplPtr());
1077 for (Command *Dep : Deps) {
1078 if (Dep != NewCmd) {
1080 NewCmd->
addDep(DepDesc{Dep, Req, AllocaCmd}, ToCleanUp);
1082 ToEnqueue.push_back(ConnCmd);
// Leaf update iterates a snapshot of MDeps: addDep above may grow the live
// dependency list while we iterate.
1091 std::vector<DepDesc> Deps = NewCmd->
MDeps;
1092 for (DepDesc &Dep : Deps) {
1094 MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj);
1095 updateLeaves({Dep.MDepCommand}, Record, Req->MAccessMode, ToCleanUp);
1096 addNodeToLeaves(Record, NewCmd, Req->MAccessMode, ToEnqueue);
// Explicit event dependencies; a self-reference is skipped.
1101 if (e->getCommand() && e->getCommand() == NewCmd) {
1104 if (Command *ConnCmd = NewCmd->
addDep(e, ToCleanUp))
1105 ToEnqueue.push_back(ConnCmd);
1108 if (MPrintOptionsArray[AfterAddCG])
1109 printGraphAsDot(
"after_addCG");
1111 for (Command *Cmd : ToCleanUp) {
1112 cleanupCommand(Cmd);
// decrementLeafCountersForRecord (fragment): drops leaf counters for the
// record's leaves and cleans up commands that become removable.
1116 void Scheduler::GraphBuilder::decrementLeafCountersForRecord(
1121 cleanupCommand(Cmd);
1126 cleanupCommand(Cmd);
1130 void Scheduler::GraphBuilder::cleanupCommandsForRecord(
MemObjRecord *Record) {
// Detaches and marks for deletion every command transitively reachable
// from the record's alloca commands. (Interior lines are missing from this
// garbled view.)
1131 std::vector<AllocaCommandBase *> &AllocaCommands = Record->
MAllocaCommands;
1132 if (AllocaCommands.empty())
1135 assert(MCmdsToVisit.empty());
1136 MVisitedCmds.clear();
// Seed the worklist with the users of each alloca command; alloca commands
// themselves are handled separately from the generic traversal.
1140 for (
Command *AllocaCmd : AllocaCommands) {
1146 if (UserCmd->
getType() != Command::CommandType::ALLOCA)
1147 MCmdsToVisit.push(UserCmd);
1154 AllocaCmd->
MUsers.clear();
// BFS over users: prune dependency edges that point at the dying allocas
// and fix up the user sets of the surviving dependencies.
1172 while (!MCmdsToVisit.empty()) {
1173 Command *Cmd = MCmdsToVisit.front();
1180 if (UserCmd->
getType() != Command::CommandType::ALLOCA)
1181 MCmdsToVisit.push(UserCmd);
1185 std::map<Command *, bool> ShouldBeUpdated;
1186 auto NewEnd = std::remove_if(
1188 if (std::find(AllocaCommands.begin(), AllocaCommands.end(),
1189 Dep.MAllocaCmd) != AllocaCommands.end()) {
1190 ShouldBeUpdated.insert({Dep.MDepCommand, true});
1196 Cmd->
MDeps.erase(NewEnd, Cmd->MDeps.end());
1199 for (
auto DepCmdIt : ShouldBeUpdated) {
1200 if (!DepCmdIt.second)
1202 DepCmdIt.first->MUsers.erase(Cmd);
// A command left with no dependencies is fully detached and can be
// deleted, unless already scheduled for post-enqueue cleanup.
1207 if (Cmd->MDeps.empty()) {
1208 Cmd->MUsers.clear();
1211 if (!Cmd->MMarkedForCleanup)
1212 Cmd->MMarks.MToBeDeleted =
true;
1219 void Scheduler::GraphBuilder::cleanupCommand(
Command *Cmd,
1220 bool AllowUnsubmitted) {
// One-time deprecation warning for the legacy cleanup environment variable.
// NOTE(review): "enviroment" is a typo in the runtime warning text — fix
// upstream in a code change (doc-only edit cannot touch the literal).
1222 static bool DeprWarningPrinted =
false;
1223 if (!DeprWarningPrinted) {
1224 std::cerr <<
"WARNING: The enviroment variable "
1225 "SYCL_DISABLE_POST_ENQUEUE_CLEANUP is deprecated. Please "
1226 "use SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP instead.\n";
1227 DeprWarningPrinted =
true;
// Alloca/release commands are owned by their record and must never reach
// this generic cleanup path.
1238 assert(CmdT != Command::ALLOCA && CmdT != Command::ALLOCA_SUB_BUF);
1239 assert(CmdT != Command::RELEASE);
1259 DepCmd->
MUsers.erase(Cmd);
// Fusion placeholders keep their event alive: clear the event's command
// back-pointer so it does not dangle after deletion.
1262 if (Cmd->
getType() == Command::FUSION &&
1270 Cmd->
getEvent()->setCommand(
nullptr);
1274 void Scheduler::GraphBuilder::removeRecordForMemObj(
SYCLMemObjI *MemObject) {
// Erases the memory object from the scheduler's tracked list; the record
// disposal itself is on lines missing from this view.
1275 const auto It = std::find_if(
1276 MMemObjs.begin(), MMemObjs.end(),
1277 [MemObject](
const SYCLMemObjI *Obj) { return Obj == MemObject; });
1278 if (It != MMemObjs.end())
1296 Command *Scheduler::GraphBuilder::connectDepEvent(
1298 std::vector<Command *> &ToCleanUp) {
// Bridges a dependency on an event from a foreign context by inserting a
// host-task "connect" command group that can wait for the event natively.
1307 std::move(HT), {}, {}, {},
1310 {DepEvent}, CG::CodeplayHostTask,
1313 std::move(ConnectCG), Scheduler::getInstance().getDefaultHostQueue());
1314 }
catch (
const std::bad_alloc &) {
1315 throw runtime_error(
"Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
// Wiring: ConnectCmd depends on DepEvent, and Cmd depends on ConnectCmd.
1322 (void)ConnectCmd->
addDep(Dep, ToCleanUp);
1323 assert(
reinterpret_cast<Command *
>(DepEvent->getCommand()) ==
1332 std::ignore = Cmd->
addDep(DepOnConnect, ToCleanUp);
1337 if (
Command *DepCmd =
reinterpret_cast<Command *
>(DepEvent->getCommand()))
1340 std::ignore = ConnectCmd->
addDep(DepEvent, ToCleanUp);
// startFusion (fragment): puts the queue into fusion mode; a previous
// (completed/cancelled) placeholder for the same queue is retired first.
1351 auto QUniqueID = std::hash<QueueImplPtr>()(Queue);
1352 if (isInFusionMode(QUniqueID)) {
1354 "Queue already in fusion mode"};
1356 auto OldFusionCmd = findFusionList(QUniqueID);
1357 if (OldFusionCmd != MFusionMap.end()) {
// Release the old placeholder from the map before cleanup so cleanup does
// not observe a still-owned map entry.
1362 OldFusionCmd->second->setFusionStatus(
1363 KernelFusionCommand::FusionStatus::DELETED);
1364 cleanupCommand(OldFusionCmd->second.release());
1365 MFusionMap.erase(OldFusionCmd);
1367 MFusionMap.emplace(QUniqueID, std::make_unique<KernelFusionCommand>(Queue));
1370 void Scheduler::GraphBuilder::removeNodeFromGraph(
1371 Command *Node, std::vector<Command *> &ToEnqueue) {
// Detaches Node from leaf lists and dependency edges; direct predecessors
// that Node displaced are restored as leaves.
1374 for (
auto &Dep : Node->
MDeps) {
1375 auto AccessMode = Dep.MDepRequirement->MAccessMode;
1376 auto *Record = getMemObjRecord(Dep.MDepRequirement->MSYCLMemObj);
1378 Node->
MLeafCounter -= Record->MReadLeaves.remove(Node);
1379 Node->
MLeafCounter -= Record->MWriteLeaves.remove(Node);
// Re-insert the predecessor as a leaf for the same record so the graph
// invariant (every record chain ends in leaves) still holds.
1383 for (
auto PrevDep : Dep.MDepCommand->MDeps) {
1384 auto *DepReq = PrevDep.MDepRequirement;
1385 auto *DepRecord = getMemObjRecord(DepReq->MSYCLMemObj);
1386 if (DepRecord == Record) {
1389 assert(Dep.MDepCommand);
1390 addNodeToLeaves(Record, Dep.MDepCommand, DepReq->MAccessMode,
1395 Dep.MDepCommand->MUsers.erase(Node);
1398 Node->
MDeps.clear();
// cancelFusion (fragment): leaves fusion mode and enqueues the individual
// fused commands, the auxiliary commands, and finally the placeholder.
1402 std::vector<Command *> &ToEnqueue) {
1403 auto QUniqueID = std::hash<QueueImplPtr>()(Queue);
1404 if (!isInFusionMode(QUniqueID)) {
1409 auto *PlaceholderCmd = (*FusionList).second.get();
1412 auto FusedCmdList = PlaceholderCmd->getFusionList();
1413 ToEnqueue.insert(ToEnqueue.end(), FusedCmdList.begin(), FusedCmdList.end());
1417 ToEnqueue.insert(ToEnqueue.end(), PlaceholderCmd->auxiliaryCommands().begin(),
1418 PlaceholderCmd->auxiliaryCommands().end());
1420 ToEnqueue.push_back(PlaceholderCmd);
1422 if (MPrintOptionsArray[AfterFusionCancel]) {
1423 printGraphAsDot(
"after_fusionCancel");
// Mark the placeholder cancelled so waiters and later calls observe it.
1427 PlaceholderCmd->setFusionStatus(KernelFusionCommand::FusionStatus::CANCELLED);
// Fusion membership predicate (fragment): a RUN_CG command belongs to a
// fusion when its MFusionCmd pointer matches the given placeholder.
1431 if (Cmd->
getType() == Command::RUN_CG) {
1432 return static_cast<ExecCGCommand *
>(Cmd)->MFusionCmd == Fusion;
1449 return !PredPartOfFusion;
// checkForCircularDependency (fragment): walks both the explicit
// dependencies and the event dependencies of Cmd, looking for a path back
// into the fusion that would create a cycle.
1457 for (
auto &Dep : Cmd->
MDeps) {
1458 auto *DepCmd = Dep.MDepCommand;
1467 auto *EvDepCmd =
static_cast<Command *
>(Ev->getCommand());
1476 auto *EvDepCmd =
static_cast<Command *
>(Ev->getCommand());
// completeFusion (fragment): JIT-fuses the kernels collected during fusion
// mode into one command and rewires the graph around the placeholder.
1489 std::vector<Command *> &ToEnqueue,
1491 auto QUniqueID = std::hash<QueueImplPtr>()(Queue);
1492 #if SYCL_EXT_CODEPLAY_KERNEL_FUSION
// Not in fusion mode: completing is only an error if the queue never had a
// fusion; otherwise return the (already finished) placeholder's event.
1493 if (!isInFusionMode(QUniqueID)) {
1494 auto InactiveFusionList = findFusionList(QUniqueID);
1495 if (InactiveFusionList == MFusionMap.end()) {
1496 throw sycl::exception{
1498 "Calling complete_fusion on a queue not in fusion mode"};
1500 return InactiveFusionList->second->getEvent();
1504 auto *PlaceholderCmd =
FusionList->second.get();
1505 auto &CmdList = PlaceholderCmd->getFusionList();
// Fusing would deadlock if any fused command transitively depends on the
// fused kernel itself — fall back to cancelling the fusion.
1513 bool CreatesCircularDep =
1515 return checkForCircularDependency(Cmd, true, PlaceholderCmd);
1517 if (CreatesCircularDep) {
1520 "Aborting fusion because it would create a circular dependency");
1521 auto LastEvent = PlaceholderCmd->getEvent();
1522 this->cancelFusion(Queue, ToEnqueue);
1527 auto FusedCG = detail::jit_compiler::get_instance().fuseKernels(
1528 Queue, CmdList, PropList);
// JIT failure: cancel and run the commands unfused.
1534 auto LastEvent = PlaceholderCmd->getEvent();
1535 this->cancelFusion(Queue, ToEnqueue);
// Collect the union of the fused commands' event dependencies, dropping
// those internal to the fusion itself (placeholder or member commands).
1540 std::vector<EventImplPtr> FusedEventDeps;
1541 for (
auto *Cmd : CmdList) {
1542 FusedEventDeps.insert(FusedEventDeps.end(),
1545 FusedEventDeps.insert(FusedEventDeps.end(),
1552 FusedEventDeps.erase(
1553 std::remove_if(FusedEventDeps.begin(), FusedEventDeps.end(),
1555 if (E->getCommand() == PlaceholderCmd) {
1558 if (E->getCommand() &&
1562 static_cast<ExecCGCommand *>(E->getCommand());
1563 if (RunCGCmd->MFusionCmd == PlaceholderCmd) {
1569 FusedEventDeps.end());
1571 auto FusedKernelCmd =
1572 std::make_unique<ExecCGCommand>(std::move(FusedCG), Queue);
1574 assert(PlaceholderCmd->MDeps.empty());
// Remove the original kernels from the graph (in reverse insertion order)
// before inserting the fused kernel command.
1579 for (
auto OldCmd = CmdList.rbegin(); OldCmd != CmdList.rend(); ++OldCmd) {
1580 removeNodeFromGraph(*OldCmd, ToEnqueue);
1581 cleanupCommand(*OldCmd,
true);
1584 createGraphForCommand(FusedKernelCmd.get(), FusedKernelCmd->getCG(),
false,
1585 FusedKernelCmd->getCG().MRequirements, FusedEventDeps,
1588 ToEnqueue.push_back(FusedKernelCmd.get());
1590 std::vector<Command *> ToCleanUp;
// The placeholder is made to depend on the fused kernel so existing
// waiters on the fusion event resolve correctly.
1592 auto *ConnectToPlaceholder =
1593 PlaceholderCmd->addDep(FusedKernelCmd->getEvent(), ToCleanUp);
1594 if (ConnectToPlaceholder) {
1595 ToEnqueue.push_back(ConnectToPlaceholder);
1597 for (Command *Cmd : ToCleanUp) {
1598 cleanupCommand(Cmd);
1600 ToEnqueue.push_back(PlaceholderCmd);
1602 if (MPrintOptionsArray[AfterFusionComplete]) {
1603 printGraphAsDot(
"after_fusionComplete");
1607 PlaceholderCmd->setFusionStatus(KernelFusionCommand::FusionStatus::COMPLETE);
1609 return FusedKernelCmd.release()->getEvent();
1610 #else // SYCL_EXT_CODEPLAY_KERNEL_FUSION
// Builds without kernel-fusion support warn and cancel instead.
1611 printFusionWarning(
"Kernel fusion not supported by this build");
1614 auto *PlaceholderCmd =
FusionList->second.get();
1615 auto LastEvent = PlaceholderCmd->getEvent();
1616 this->cancelFusion(Queue, ToEnqueue);
1618 #endif // SYCL_EXT_CODEPLAY_KERNEL_FUSION
1621 bool Scheduler::GraphBuilder::isInFusionMode(
QueueIdT Id) {