12 #include <sycl/feature_test.hpp>
13 #if SYCL_EXT_CODEPLAY_KERNEL_FUSION
34 inline namespace _V1 {
53 return LHS == RHS || (LHS->is_host() && RHS->is_host());
65 return (Required == Current);
67 assert(
false &&
"Write only access is expected to be mapped as read_write");
// NOTE(review): sampled excerpt — the enclosing function header (presumably the
// GraphBuilder constructor reading a graph-printing env var into EnvVarCStr) and
// several interior lines are elided; code below kept byte-identical.
// Parses a comma/substring option list: each recognized substring enables the
// corresponding entry of MPrintOptionsArray; "always" appears to enable the
// add-CG dump points unconditionally (only visible for after_addCG here).
97 std::string GraphPrintOpts(EnvVarCStr);
98 bool EnableAlways = GraphPrintOpts.find(
"always") != std::string::npos;
// Dump graph before a command group is added.
100 if (GraphPrintOpts.find(
"before_addCG") != std::string::npos ||
102 MPrintOptionsArray[BeforeAddCG] =
true;
// Dump graph after a command group is added ("always" also enables this).
103 if (GraphPrintOpts.find(
"after_addCG") != std::string::npos || EnableAlways)
104 MPrintOptionsArray[AfterAddCG] =
true;
// Dump points around copy-back insertion.
105 if (GraphPrintOpts.find(
"before_addCopyBack") != std::string::npos ||
107 MPrintOptionsArray[BeforeAddCopyBack] =
true;
108 if (GraphPrintOpts.find(
"after_addCopyBack") != std::string::npos ||
110 MPrintOptionsArray[AfterAddCopyBack] =
true;
// Dump points around host-accessor insertion.
111 if (GraphPrintOpts.find(
"before_addHostAcc") != std::string::npos ||
113 MPrintOptionsArray[BeforeAddHostAcc] =
true;
114 if (GraphPrintOpts.find(
"after_addHostAcc") != std::string::npos ||
116 MPrintOptionsArray[AfterAddHostAcc] =
true;
// Dump points after kernel-fusion completion/cancellation.
117 if (GraphPrintOpts.find(
"after_fusionComplete") != std::string::npos ||
119 MPrintOptionsArray[AfterFusionComplete] =
true;
120 if (GraphPrintOpts.find(
"after_fusionCancel") != std::string::npos ||
122 MPrintOptionsArray[AfterFusionCancel] =
true;
127 assert(Cmd &&
"Cmd can't be nullptr");
131 Visited.push_back(Cmd);
137 Cmd->MMarks.MVisited =
false;
142 if (Cmd->MMarks.MToBeDeleted) {
151 Cmd->getEvent()->setCommand(
nullptr);
154 Cmd->MMarks.MVisited =
false;
159 std::vector<Command *> &Visited,
Command *Cmd) {
// Dumps the current execution graph in Graphviz DOT format to a file named
// "graph_<counter><ModeName>.dot" (ModeName identifies the dump point, e.g.
// "before_addCG"). NOTE(review): interior lines are elided in this excerpt
// (e.g. the Counter increment and the per-command print call inside the loops
// are not visible); code kept byte-identical.
169 void Scheduler::GraphBuilder::printGraphAsDot(
const char *ModeName) {
// Monotonic counter so successive dumps don't overwrite each other.
170 static size_t Counter = 0;
171 std::string ModeNameStr(ModeName);
172 std::string FileName =
173 "graph_" + std::to_string(Counter) + ModeNameStr +
".dot";
177 std::fstream Stream(FileName, std::ios::out);
178 Stream <<
"strict digraph {" << std::endl;
// Reset the visited set before walking the graph from the alloca roots.
180 MVisitedCmds.clear();
// Walk every memory object's alloca commands; presumably each reachable
// command prints itself into Stream — the call is elided here (TODO confirm).
182 for (SYCLMemObjI *MemObject : MMemObjs)
183 for (
Command *AllocaCmd : MemObject->MRecord->MAllocaCommands)
186 Stream <<
"}" << std::endl;
192 return MemObject->
MRecord.get();
197 std::vector<Command *> &ToEnqueue) {
201 if (
nullptr != Record)
204 const size_t LeafLimit = 8;
210 DepDesc Dep = findDepForRecord(Dependant, Record);
212 std::vector<Command *> ToCleanUp;
215 ToEnqueue.push_back(ConnectionCmd);
217 --(Dependency->MLeafCounter);
218 if (Dependency->readyForCleanup())
219 ToCleanUp.push_back(Dependency);
230 std::vector<sycl::device> Devices =
231 InteropCtxPtr->get_info<info::context::devices>();
232 assert(Devices.size() != 0);
239 Dev, InteropCtxPtr, {}, {}}};
242 new MemObjRecord{InteropCtxPtr, LeafLimit, AllocateDependency});
243 getOrCreateAllocaForReq(MemObject->
MRecord.get(), Req, InteropQueuePtr,
247 LeafLimit, AllocateDependency});
249 MMemObjs.push_back(MemObject);
250 return MemObject->
MRecord.get();
256 std::vector<Command *> &ToCleanUp) {
263 bool WasLeaf = Cmd->MLeafCounter > 0;
266 if (WasLeaf && Cmd->readyForCleanup()) {
267 ToCleanUp.push_back(Cmd);
274 std::vector<Command *> &ToEnqueue) {
278 if (Leaves.push_back(Cmd, ToEnqueue))
284 std::vector<Command *> &ToEnqueue) {
286 findAllocaForReq(Record, Req, Queue->getContextImplPtr());
287 assert(AllocaCmd &&
"There must be alloca for requirement!");
294 std::set<Command *> Deps =
295 findDepsForReq(Record, Req, Queue->getContextImplPtr());
296 std::vector<Command *> ToCleanUp;
299 UpdateCommand->
addDep(
DepDesc{Dep, StoredReq, AllocaCmd}, ToCleanUp);
301 ToEnqueue.push_back(ConnCmd);
303 updateLeaves(Deps, Record, Req->
MAccessMode, ToCleanUp);
304 addNodeToLeaves(Record, UpdateCommand, Req->
MAccessMode, ToEnqueue);
307 return UpdateCommand;
316 "Expected linked alloca commands");
318 "Expected source alloca command to be active");
320 if (AllocaCmdSrc->
getQueue()->is_host()) {
339 Command *Scheduler::GraphBuilder::insertMemoryMove(
341 std::vector<Command *> &ToEnqueue) {
343 AllocaCommandBase *AllocaCmdDst =
344 getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue);
346 throw runtime_error(
"Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
348 std::set<Command *> Deps =
349 findDepsForReq(Record, Req, Queue->getContextImplPtr());
350 Deps.insert(AllocaCmdDst);
353 if (AllocaCmdDst->getType() == Command::CommandType::ALLOCA_SUB_BUF)
355 static_cast<AllocaSubBufCommand *
>(AllocaCmdDst)->getParentAlloca();
358 AllocaCommandBase *AllocaCmdSrc =
359 findAllocaForReq(Record, Req, Record->MCurContext);
364 auto IsSuitableAlloca = [Record](AllocaCommandBase *AllocaCmd) {
366 Record->MCurContext) &&
368 AllocaCmd->
getType() == Command::CommandType::ALLOCA;
372 std::find_if(Record->MAllocaCommands.begin(),
373 Record->MAllocaCommands.end(), IsSuitableAlloca);
374 AllocaCmdSrc = (Record->MAllocaCommands.end() != It) ? *It :
nullptr;
377 throw runtime_error(
"Cannot find buffer allocation",
378 PI_ERROR_INVALID_VALUE);
381 if (AllocaCmdSrc->getType() == Command::CommandType::ALLOCA_SUB_BUF)
383 static_cast<AllocaSubBufCommand *
>(AllocaCmdSrc)->getParentAlloca();
384 else if (AllocaCmdSrc->getSYCLMemObj() != Req->MSYCLMemObj)
385 assert(
false &&
"Inappropriate alloca command.");
390 if (AllocaCmdSrc->MLinkedAllocaCmd == AllocaCmdDst) {
396 Record->MHostAccess = MapMode;
401 Record->MCurContext = Queue->getContextImplPtr();
407 new MemCpyCommand(*AllocaCmdSrc->getRequirement(), AllocaCmdSrc,
408 *AllocaCmdDst->getRequirement(), AllocaCmdDst,
409 AllocaCmdSrc->getQueue(), AllocaCmdDst->getQueue());
412 std::vector<Command *> ToCleanUp;
415 DepDesc{Dep, NewCmd->getRequirement(), AllocaCmdDst}, ToCleanUp);
417 ToEnqueue.push_back(ConnCmd);
423 Record->MCurContext = Queue->getContextImplPtr();
// Remaps a host-linked allocation by inserting an unmap command followed by a
// map command (with the new MapMode) for the linked device allocation, wiring
// both into the dependency graph. NOTE(review): sampled excerpt — several
// interior lines (loop headers over Deps, MapMode computation, leaf updates,
// return value) are elided; code kept byte-identical.
427 Command *Scheduler::GraphBuilder::remapMemoryObject(
428 MemObjRecord *Record,
Requirement *Req, AllocaCommandBase *HostAllocaCmd,
429 std::vector<Command *> &ToEnqueue) {
// Preconditions: the alloca being remapped is the active host-side one and
// it is linked to a device-side allocation.
430 assert(HostAllocaCmd->getQueue()->is_host() &&
431 "Host alloca command expected");
432 assert(HostAllocaCmd->MIsActive &&
"Active alloca command expected");
434 AllocaCommandBase *LinkedAllocaCmd = HostAllocaCmd->MLinkedAllocaCmd;
435 assert(LinkedAllocaCmd &&
"Linked alloca command expected");
// All commands that currently touch this requirement in the record's context
// must complete before the unmap.
437 std::set<Command *> Deps = findDepsForReq(Record, Req, Record->MCurContext);
439 UnMapMemObject *UnMapCmd =
new UnMapMemObject(
440 LinkedAllocaCmd, *LinkedAllocaCmd->getRequirement(),
441 &HostAllocaCmd->MMemAllocation, LinkedAllocaCmd->getQueue());
// Re-map with the (elided) MapMode — presumably derived from Req's access
// mode; TODO confirm against the elided lines 442-446.
447 MapMemObject *MapCmd =
new MapMemObject(
448 LinkedAllocaCmd, *LinkedAllocaCmd->getRequirement(),
449 &HostAllocaCmd->MMemAllocation, LinkedAllocaCmd->getQueue(), MapMode);
451 std::vector<Command *> ToCleanUp;
// Unmap depends on each prior user (Dep loop header elided); any connection
// command produced for a cross-context edge must itself be enqueued.
453 Command *ConnCmd = UnMapCmd->addDep(
454 DepDesc{Dep, UnMapCmd->getRequirement(), LinkedAllocaCmd}, ToCleanUp);
456 ToEnqueue.push_back(ConnCmd);
// Map runs strictly after the unmap.
459 Command *ConnCmd = MapCmd->addDep(
460 DepDesc{UnMapCmd, MapCmd->getRequirement(), HostAllocaCmd}, ToCleanUp);
462 ToEnqueue.push_back(ConnCmd);
// Record the access mode the object is now mapped with.
468 Record->MHostAccess = MapMode;
476 std::vector<Command *> &ToEnqueue) {
480 if (Record && MPrintOptionsArray[BeforeAddCopyBack])
481 printGraphAsDot(
"before_addCopyBack");
487 std::set<Command *> Deps =
488 findDepsForReq(Record, Req, HostQueue->getContextImplPtr());
490 findAllocaForReq(Record, Req, Record->
MCurContext);
492 auto MemCpyCmdUniquePtr = std::make_unique<MemCpyCommandHost>(
494 SrcAllocaCmd->
getQueue(), std::move(HostQueue));
496 if (!MemCpyCmdUniquePtr)
497 throw runtime_error(
"Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
501 std::vector<Command *> ToCleanUp;
506 ToEnqueue.push_back(ConnCmd);
509 updateLeaves(Deps, Record, Req->
MAccessMode, ToCleanUp);
510 addNodeToLeaves(Record, MemCpyCmd, Req->
MAccessMode, ToEnqueue);
513 if (MPrintOptionsArray[AfterAddCopyBack])
514 printGraphAsDot(
"after_addCopyBack");
522 std::vector<Command *> &ToEnqueue) {
526 MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req, ToEnqueue);
527 if (MPrintOptionsArray[BeforeAddHostAcc])
528 printGraphAsDot(
"before_addHostAccessor");
529 markModifiedIfWrite(Record, Req);
532 getOrCreateAllocaForReq(Record, Req, HostQueue, ToEnqueue);
537 remapMemoryObject(Record, Req,
545 insertMemoryMove(Record, Req, HostQueue, ToEnqueue);
548 insertUpdateHostReqCmd(Record, Req, HostQueue, ToEnqueue);
552 addEmptyCmd(UpdateHostAccCmd, {Req}, HostQueue,
557 if (MPrintOptionsArray[AfterAddHostAcc])
558 printGraphAsDot(
"after_addHostAccessor");
560 return UpdateHostAccCmd;
564 std::unique_ptr<detail::CG> CommandGroup,
const QueueImplPtr &HostQueue,
565 std::vector<Command *> &ToEnqueue) {
567 auto UpdateHost =
static_cast<CGUpdateHost *
>(CommandGroup.get());
570 MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req, ToEnqueue);
571 return insertMemoryMove(Record, Req, HostQueue, ToEnqueue);
// Depth-first search over the record's leaves collecting the set of commands
// the given requirement must depend on; commands whose requirement does not
// overlap Req (or that satisfy other elided bypass conditions) are skipped and
// their own dependencies analyzed instead. NOTE(review): sampled excerpt —
// the leaf-gathering prologue, the visited-check, and several branch lines are
// elided; code kept byte-identical.
583 Scheduler::GraphBuilder::findDepsForReq(
MemObjRecord *Record,
586 std::set<Command *> RetDeps;
587 std::vector<Command *> Visited;
// Seed the worklist from the record's leaves (gathering loop elided).
595 ToAnalyze.insert(ToAnalyze.begin(), V.begin(), V.end());
598 while (!ToAnalyze.empty()) {
599 Command *DepCmd = ToAnalyze.back();
600 ToAnalyze.pop_back();
602 std::vector<Command *> NewAnalyze;
604 for (
const DepDesc &Dep : DepCmd->MDeps) {
// Only edges on the same memory object are relevant.
605 if (Dep.MDepRequirement->MSYCLMemObj != Req->
MSYCLMemObj)
608 bool CanBypassDep =
false;
// Non-overlapping sub-requirements do not conflict, so the dependency
// can be bypassed (additional bypass conditions elided at 609-613).
614 CanBypassDep |= !
doOverlap(Dep.MDepRequirement, Req);
619 sameCtx(Context, Dep.MDepCommand->getQueue()->getContextImplPtr());
622 RetDeps.insert(DepCmd);
630 NewAnalyze.push_back(Dep.MDepCommand);
632 ToAnalyze.insert(ToAnalyze.end(), NewAnalyze.begin(), NewAnalyze.end());
647 assert(
false &&
"No dependency found for a leaf of the record");
648 return {
nullptr,
nullptr,
nullptr};
656 auto IsSuitableAlloca = [&Context, Req,
658 bool Res =
sameCtx(AllocaCmd->
getQueue()->getContextImplPtr(), Context);
661 Res &= AllocaCmd->
getType() == Command::CommandType::ALLOCA_SUB_BUF;
664 Res &= AllowConst || !AllocaCmd->
MIsConst;
675 if (std::strcmp(HUMConfig,
"0") == 0)
676 return Ctx->is_host();
677 if (std::strcmp(HUMConfig,
"1") == 0)
680 for (
const device &Device : Ctx->getDevices()) {
681 if (!Device.get_info<info::device::host_unified_memory>())
691 AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
693 std::vector<Command *> &ToEnqueue) {
695 AllocaCommandBase *AllocaCmd = findAllocaForReq(
696 Record, Req, Queue->getContextImplPtr(),
false);
699 std::vector<Command *> ToCleanUp;
703 range<3> ParentRange{Req->MSYCLMemObj->getSizeInBytes(), 1, 1};
705 {0, 0, 0}, ParentRange, ParentRange,
707 sizeof(char),
size_t(0));
710 getOrCreateAllocaForReq(Record, &ParentRequirement, Queue, ToEnqueue);
711 AllocaCmd =
new AllocaSubBufCommand(Queue, *Req, ParentAlloca, ToEnqueue,
715 const Requirement FullReq( {0, 0, 0}, Req->MMemoryRange,
717 Req->MSYCLMemObj, Req->MDims, Req->MElemSize,
726 const bool HostUnifiedMemory =
728 SYCLMemObjI *MemObj = Req->MSYCLMemObj;
729 const bool InitFromUserData = Record->MAllocaCommands.empty() &&
730 (HostUnifiedMemory || MemObj->isInterop());
731 AllocaCommandBase *LinkedAllocaCmd =
nullptr;
736 if (Record->MAllocaCommands.empty()) {
737 if (!HostUnifiedMemory &&
742 if (MemObj->hasUserDataPtr()) {
745 AllocaCommand *HostAllocaCmd =
new AllocaCommand(
748 MemObj->isHostPointerReadOnly() );
749 Record->MAllocaCommands.push_back(HostAllocaCmd);
750 Record->MWriteLeaves.push_back(HostAllocaCmd, ToEnqueue);
751 ++(HostAllocaCmd->MLeafCounter);
759 if (Req->MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer)
764 if (Queue->is_host() != Record->MCurContext->is_host()) {
773 bool PinnedHostMemory = MemObj->usesPinnedHostMemory();
775 bool HostUnifiedMemoryOnNonHostDevice =
778 if (PinnedHostMemory || HostUnifiedMemoryOnNonHostDevice) {
779 AllocaCommandBase *LinkedAllocaCmdCand = findAllocaForReq(
780 Record, Req, Record->MCurContext,
false);
783 if (LinkedAllocaCmdCand &&
784 !LinkedAllocaCmdCand->MLinkedAllocaCmd) {
785 LinkedAllocaCmd = LinkedAllocaCmdCand;
792 new AllocaCommand(Queue, FullReq, InitFromUserData, LinkedAllocaCmd);
795 if (LinkedAllocaCmd) {
796 Command *ConnCmd = AllocaCmd->addDep(
797 DepDesc{LinkedAllocaCmd, AllocaCmd->getRequirement(),
801 ToEnqueue.push_back(ConnCmd);
802 LinkedAllocaCmd->MLinkedAllocaCmd = AllocaCmd;
805 ConnCmd = AllocaCmd->getReleaseCmd()->addDep(
806 DepDesc(LinkedAllocaCmd->getReleaseCmd(),
807 AllocaCmd->getRequirement(), LinkedAllocaCmd),
810 ToEnqueue.push_back(ConnCmd);
816 if (Queue->is_host()) {
817 AllocaCmd->MIsActive =
false;
819 LinkedAllocaCmd->MIsActive =
false;
820 Record->MCurContext = Queue->getContextImplPtr();
822 std::set<Command *> Deps =
823 findDepsForReq(Record, Req, Queue->getContextImplPtr());
825 Command *ConnCmd = AllocaCmd->addDep(
826 DepDesc{Dep, Req, LinkedAllocaCmd}, ToCleanUp);
828 ToEnqueue.push_back(ConnCmd);
830 updateLeaves(Deps, Record, Req->MAccessMode, ToCleanUp);
831 addNodeToLeaves(Record, AllocaCmd, Req->MAccessMode, ToEnqueue);
836 Record->MAllocaCommands.push_back(AllocaCmd);
837 Record->MWriteLeaves.push_back(AllocaCmd, ToEnqueue);
838 ++(AllocaCmd->MLeafCounter);
// Marks the memory-object record as modified when the requirement's access
// mode implies a write (the write-mode case labels at elided lines 849-853 are
// not visible in this excerpt). NOTE(review): code kept byte-identical.
846 void Scheduler::GraphBuilder::markModifiedIfWrite(MemObjRecord *Record,
848 switch (Req->MAccessMode) {
854 Record->MMemModified =
true;
861 EmptyCommand *Scheduler::GraphBuilder::addEmptyCmd(
862 Command *Cmd,
const std::vector<Requirement *> &Reqs,
864 std::vector<Command *> &ToEnqueue,
const bool AddDepsToLeaves) {
865 EmptyCommand *EmptyCmd =
869 throw runtime_error(
"Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
871 EmptyCmd->MIsBlockable =
true;
873 EmptyCmd->MBlockReason = Reason;
876 MemObjRecord *Record = getOrInsertMemObjRecord(Queue, Req, ToEnqueue);
877 AllocaCommandBase *AllocaCmd =
878 getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue);
879 EmptyCmd->addRequirement(Cmd, AllocaCmd, Req);
884 Cmd->addUser(EmptyCmd);
886 if (AddDepsToLeaves) {
887 const std::vector<DepDesc> &Deps = Cmd->MDeps;
888 std::vector<Command *> ToCleanUp;
889 for (
const DepDesc &Dep : Deps) {
893 updateLeaves({Cmd}, Record, Req->MAccessMode, ToCleanUp);
894 addNodeToLeaves(Record, EmptyCmd, Req->MAccessMode, ToEnqueue);
904 if (Cmd->
getCG().
getType() != CG::CGTYPE::CodeplayHostTask)
914 std::unordered_map<SYCLMemObjI *, access::mode> CombinedModes;
915 bool HasDuplicateMemObjects =
false;
917 auto Result = CombinedModes.insert(
918 std::make_pair(Req->MSYCLMemObj, Req->MAccessMode));
919 if (!Result.second) {
920 Result.first->second =
922 HasDuplicateMemObjects =
true;
926 if (!HasDuplicateMemObjects)
929 Req->MAccessMode = CombinedModes[Req->MSYCLMemObj];
934 std::unique_ptr<detail::CG> CommandGroup,
const QueueImplPtr &Queue,
935 std::vector<Command *> &ToEnqueue,
937 const std::vector<sycl::detail::pi::PiExtSyncPoint> &Dependencies) {
938 std::vector<Requirement *> &Reqs = CommandGroup->getRequirements();
939 std::vector<detail::EventImplPtr> &Events = CommandGroup->getEvents();
941 auto NewCmd = std::make_unique<ExecCGCommand>(
942 std::move(CommandGroup), Queue, CommandBuffer, std::move(Dependencies));
945 throw runtime_error(
"Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
951 auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
953 auto *FusionCmd = findFusionList(QUniqueID)->second.get();
955 bool dependsOnFusion =
false;
956 for (
auto Ev = Events.begin(); Ev != Events.end();) {
957 auto *EvDepCmd =
static_cast<Command *
>((*Ev)->getCommand());
963 if (EvDepCmd->getQueue() != Queue && isPartOfActiveFusion(EvDepCmd)) {
964 printFusionWarning(
"Aborting fusion because of event dependency from a "
970 if (EvDepCmd == FusionCmd) {
971 Ev = Events.erase(Ev);
972 dependsOnFusion =
true;
982 if (dependsOnFusion) {
983 for (
auto *Cmd : FusionCmd->getFusionList()) {
984 Events.push_back(Cmd->getEvent());
991 createGraphForCommand(NewCmd.get(), NewCmd->getCG(),
993 FusionCmd->auxiliaryCommands());
997 FusionCmd->addToFusionList(NewCmd.get());
998 NewCmd->MFusionCmd = FusionCmd;
999 std::vector<Command *> ToCleanUp;
1002 auto ConnectionCmd = FusionCmd->addDep(NewCmd->getEvent(), ToCleanUp);
1003 if (ConnectionCmd) {
1004 FusionCmd->auxiliaryCommands().push_back(ConnectionCmd);
1006 return {NewCmd.release(), FusionCmd->getEvent(),
false};
1008 createGraphForCommand(NewCmd.get(), NewCmd->getCG(),
1011 auto Event = NewCmd->getEvent();
1012 return {NewCmd.release(), Event,
true};
1015 void Scheduler::GraphBuilder::createGraphForCommand(
1017 std::vector<Requirement *> &Reqs,
1018 const std::vector<detail::EventImplPtr> &Events,
QueueImplPtr Queue,
1019 std::vector<Command *> &ToEnqueue) {
1021 if (MPrintOptionsArray[BeforeAddCG])
1022 printGraphAsDot(
"before_addCG");
1028 std::vector<Command *> ToCleanUp;
1033 bool isSameCtx =
false;
1039 Record = getOrInsertMemObjRecord(QueueForAlloca, Req, ToEnqueue);
1040 markModifiedIfWrite(Record, Req);
1043 getOrCreateAllocaForReq(Record, Req, QueueForAlloca, ToEnqueue);
1046 sameCtx(QueueForAlloca->getContextImplPtr(), Record->MCurContext);
1056 remapMemoryObject(Record, Req,
1066 bool NeedMemMoveToHost =
false;
1067 auto MemMoveTargetQueue = Queue;
1069 if (isInteropTask) {
1070 const detail::CGHostTask &HT =
static_cast<detail::CGHostTask &
>(CG);
1072 if (HT.MQueue->getContextImplPtr() != Record->
MCurContext) {
1073 NeedMemMoveToHost =
true;
1074 MemMoveTargetQueue = HT.MQueue;
1076 }
else if (!Queue->is_host() && !Record->
MCurContext->is_host())
1077 NeedMemMoveToHost =
true;
1079 if (NeedMemMoveToHost)
1080 insertMemoryMove(Record, Req,
1083 insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue);
1085 std::set<Command *> Deps =
1086 findDepsForReq(Record, Req, Queue->getContextImplPtr());
1089 if (Dep != NewCmd) {
1091 NewCmd->
addDep(DepDesc{Dep, Req, AllocaCmd}, ToCleanUp);
1093 ToEnqueue.push_back(ConnCmd);
1102 std::vector<DepDesc> Deps = NewCmd->
MDeps;
1103 for (DepDesc &Dep : Deps) {
1106 updateLeaves({Dep.MDepCommand}, Record, Req->MAccessMode, ToCleanUp);
1107 addNodeToLeaves(Record, NewCmd, Req->MAccessMode, ToEnqueue);
1112 if (e->getCommand() && e->getCommand() == NewCmd) {
1116 ToEnqueue.push_back(ConnCmd);
1119 if (MPrintOptionsArray[AfterAddCG])
1120 printGraphAsDot(
"after_addCG");
1122 for (
Command *Cmd : ToCleanUp) {
1123 cleanupCommand(Cmd);
1132 cleanupCommand(Cmd);
1137 cleanupCommand(Cmd);
1142 std::vector<AllocaCommandBase *> &AllocaCommands = Record->
MAllocaCommands;
1143 if (AllocaCommands.empty())
1146 assert(MCmdsToVisit.empty());
1147 MVisitedCmds.clear();
1151 for (
Command *AllocaCmd : AllocaCommands) {
1157 if (UserCmd->
getType() != Command::CommandType::ALLOCA)
1158 MCmdsToVisit.push(UserCmd);
1165 AllocaCmd->
MUsers.clear();
1183 while (!MCmdsToVisit.empty()) {
1184 Command *Cmd = MCmdsToVisit.front();
1191 if (UserCmd->
getType() != Command::CommandType::ALLOCA)
1192 MCmdsToVisit.push(UserCmd);
1196 std::map<Command *, bool> ShouldBeUpdated;
1197 auto NewEnd = std::remove_if(
1199 if (std::find(AllocaCommands.begin(), AllocaCommands.end(),
1200 Dep.MAllocaCmd) != AllocaCommands.end()) {
1201 ShouldBeUpdated.insert({Dep.MDepCommand, true});
1207 Cmd->
MDeps.erase(NewEnd, Cmd->MDeps.end());
1210 for (
auto DepCmdIt : ShouldBeUpdated) {
1211 if (!DepCmdIt.second)
1213 DepCmdIt.first->MUsers.erase(Cmd);
1218 if (Cmd->MDeps.empty()) {
1219 Cmd->MUsers.clear();
1222 if (!Cmd->MMarkedForCleanup)
1223 Cmd->MMarks.MToBeDeleted =
true;
// Fragment of cleanupCommand: prints, once per process, a deprecation warning
// for the old cleanup-disabling env var. NOTE(review): "enviroment" in the
// warning string is a typo ("environment"), but it is a runtime string inside
// an incomplete definition, so it is only flagged here, not changed. The
// static flag is not synchronized — presumably callers hold the scheduler
// lock; TODO confirm.
1231 Command *Cmd, [[maybe_unused]]
bool AllowUnsubmitted) {
1233 static bool DeprWarningPrinted =
false;
1234 if (!DeprWarningPrinted) {
1235 std::cerr <<
"WARNING: The enviroment variable "
1236 "SYCL_DISABLE_POST_ENQUEUE_CLEANUP is deprecated. Please "
1237 "use SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP instead.\n";
1238 DeprWarningPrinted =
true;
1270 DepCmd->
MUsers.erase(Cmd);
1281 Cmd->
getEvent()->setCommand(
nullptr);
1286 const auto It = std::find_if(
1287 MMemObjs.begin(), MMemObjs.end(),
1288 [MemObject](
const SYCLMemObjI *Obj) { return Obj == MemObject; });
1289 if (It != MMemObjs.end())
1309 std::vector<Command *> &ToCleanUp) {
1318 std::move(HT), {}, {}, {},
1327 }
catch (
const std::bad_alloc &) {
1328 throw runtime_error(
"Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
1335 (void)ConnectCmd->
addDep(Dep, ToCleanUp);
1336 assert(
reinterpret_cast<Command *
>(DepEvent->getCommand()) ==
1345 std::ignore = Cmd->
addDep(DepOnConnect, ToCleanUp);
1350 if (
Command *DepCmd =
reinterpret_cast<Command *
>(DepEvent->getCommand()))
1353 std::ignore = ConnectCmd->
addDep(DepEvent, ToCleanUp);
1365 auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
1366 MFusionMap.emplace(QUniqueID, std::make_unique<KernelFusionCommand>(Queue));
1370 sycl::detail::queue_impl *Queue) {
1371 auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue);
1374 "Queue already in fusion mode"};
1376 auto OldFusionCmd = findFusionList(QUniqueID);
1377 if (OldFusionCmd != MFusionMap.end()) {
1382 OldFusionCmd->second->setFusionStatus(
1384 cleanupCommand(OldFusionCmd->second.release());
1385 MFusionMap.erase(OldFusionCmd);
1389 void Scheduler::GraphBuilder::removeNodeFromGraph(
1390 Command *Node, std::vector<Command *> &ToEnqueue) {
1393 for (
auto &Dep : Node->
MDeps) {
1394 auto AccessMode = Dep.MDepRequirement->MAccessMode;
1397 Node->
MLeafCounter -= Record->MReadLeaves.remove(Node);
1398 Node->
MLeafCounter -= Record->MWriteLeaves.remove(Node);
1402 for (
auto PrevDep : Dep.MDepCommand->MDeps) {
1403 auto *DepReq = PrevDep.MDepRequirement;
1405 if (DepRecord == Record) {
1408 assert(Dep.MDepCommand);
1409 addNodeToLeaves(Record, Dep.MDepCommand, DepReq->MAccessMode,
1414 Dep.MDepCommand->MUsers.erase(Node);
1424 std::vector<Command *> &ToEnqueue) {
1425 auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
1431 auto *PlaceholderCmd = (*FusionList).second.get();
1434 auto FusedCmdList = PlaceholderCmd->getFusionList();
1435 ToEnqueue.insert(ToEnqueue.end(), FusedCmdList.begin(), FusedCmdList.end());
1439 ToEnqueue.insert(ToEnqueue.end(), PlaceholderCmd->auxiliaryCommands().begin(),
1440 PlaceholderCmd->auxiliaryCommands().end());
1442 ToEnqueue.push_back(PlaceholderCmd);
1444 if (MPrintOptionsArray[AfterFusionCancel]) {
1445 printGraphAsDot(
"after_fusionCancel");
1454 return static_cast<ExecCGCommand *
>(Cmd)->MFusionCmd == Fusion;
1471 return !PredPartOfFusion;
1479 for (
auto &Dep : Cmd->
MDeps) {
1480 auto *DepCmd = Dep.MDepCommand;
1489 auto *EvDepCmd =
static_cast<Command *
>(Ev->getCommand());
1498 auto *EvDepCmd =
static_cast<Command *
>(Ev->getCommand());
1511 std::vector<Command *> &ToEnqueue,
1513 auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
1514 #if SYCL_EXT_CODEPLAY_KERNEL_FUSION
1516 auto InactiveFusionList = findFusionList(QUniqueID);
1517 if (InactiveFusionList == MFusionMap.end()) {
1520 "Calling complete_fusion on a queue not in fusion mode"};
1522 return InactiveFusionList->second->getEvent();
1526 auto *PlaceholderCmd =
FusionList->second.get();
1527 auto &CmdList = PlaceholderCmd->getFusionList();
1535 bool CreatesCircularDep =
1537 return checkForCircularDependency(Cmd, true, PlaceholderCmd);
1539 if (CreatesCircularDep) {
1542 "Aborting fusion because it would create a circular dependency");
1543 auto LastEvent = PlaceholderCmd->getEvent();
1550 Queue, CmdList, PropList);
1556 auto LastEvent = PlaceholderCmd->getEvent();
1562 std::vector<EventImplPtr> FusedEventDeps;
1563 for (
auto *Cmd : CmdList) {
1564 FusedEventDeps.insert(FusedEventDeps.end(),
1567 FusedEventDeps.insert(FusedEventDeps.end(),
1574 FusedEventDeps.erase(
1575 std::remove_if(FusedEventDeps.begin(), FusedEventDeps.end(),
1577 if (E->getCommand() == PlaceholderCmd) {
1580 if (E->getCommand() &&
1584 static_cast<ExecCGCommand *>(E->getCommand());
1585 if (RunCGCmd->MFusionCmd == PlaceholderCmd) {
1591 FusedEventDeps.end());
1593 auto FusedKernelCmd =
1594 std::make_unique<ExecCGCommand>(std::move(FusedCG), Queue);
1596 assert(PlaceholderCmd->MDeps.empty());
1601 for (
auto OldCmd = CmdList.rbegin(); OldCmd != CmdList.rend(); ++OldCmd) {
1602 removeNodeFromGraph(*OldCmd, ToEnqueue);
1603 cleanupCommand(*OldCmd,
true);
1606 createGraphForCommand(FusedKernelCmd.get(), FusedKernelCmd->getCG(),
false,
1607 FusedKernelCmd->getCG().getRequirements(),
1608 FusedEventDeps, Queue, ToEnqueue);
1610 ToEnqueue.push_back(FusedKernelCmd.get());
1612 std::vector<Command *> ToCleanUp;
1614 auto *ConnectToPlaceholder =
1615 PlaceholderCmd->addDep(FusedKernelCmd->getEvent(), ToCleanUp);
1616 if (ConnectToPlaceholder) {
1617 ToEnqueue.push_back(ConnectToPlaceholder);
1619 for (Command *Cmd : ToCleanUp) {
1620 cleanupCommand(Cmd);
1622 ToEnqueue.push_back(PlaceholderCmd);
1624 if (MPrintOptionsArray[AfterFusionComplete]) {
1625 printGraphAsDot(
"after_fusionComplete");
1631 return FusedKernelCmd.release()->getEvent();
1632 #else // SYCL_EXT_CODEPLAY_KERNEL_FUSION
1633 printFusionWarning(
"Kernel fusion not supported by this build");
1636 auto *PlaceholderCmd =
FusionList->second.get();
1637 auto LastEvent = PlaceholderCmd->getEvent();
1638 this->cancelFusion(Queue, ToEnqueue);
1640 #endif // SYCL_EXT_CODEPLAY_KERNEL_FUSION
1643 bool Scheduler::GraphBuilder::isInFusionMode(
QueueIdT Id) {