13 #include <sycl/feature_test.hpp>
14 #if SYCL_EXT_CODEPLAY_KERNEL_FUSION
36 inline namespace _V1 {
55 return LHS == RHS || (LHS->is_host() && RHS->is_host());
67 return (Required == Current);
69 assert(
false &&
"Write only access is expected to be mapped as read_write");
99 std::string GraphPrintOpts(EnvVarCStr);
100 bool EnableAlways = GraphPrintOpts.find(
"always") != std::string::npos;
102 if (GraphPrintOpts.find(
"before_addCG") != std::string::npos ||
104 MPrintOptionsArray[BeforeAddCG] =
true;
105 if (GraphPrintOpts.find(
"after_addCG") != std::string::npos || EnableAlways)
106 MPrintOptionsArray[AfterAddCG] =
true;
107 if (GraphPrintOpts.find(
"before_addCopyBack") != std::string::npos ||
109 MPrintOptionsArray[BeforeAddCopyBack] =
true;
110 if (GraphPrintOpts.find(
"after_addCopyBack") != std::string::npos ||
112 MPrintOptionsArray[AfterAddCopyBack] =
true;
113 if (GraphPrintOpts.find(
"before_addHostAcc") != std::string::npos ||
115 MPrintOptionsArray[BeforeAddHostAcc] =
true;
116 if (GraphPrintOpts.find(
"after_addHostAcc") != std::string::npos ||
118 MPrintOptionsArray[AfterAddHostAcc] =
true;
119 if (GraphPrintOpts.find(
"after_fusionComplete") != std::string::npos ||
121 MPrintOptionsArray[AfterFusionComplete] =
true;
122 if (GraphPrintOpts.find(
"after_fusionCancel") != std::string::npos ||
124 MPrintOptionsArray[AfterFusionCancel] =
true;
129 assert(Cmd &&
"Cmd can't be nullptr");
133 Visited.push_back(Cmd);
139 Cmd->MMarks.MVisited =
false;
144 if (Cmd->MMarks.MToBeDeleted) {
153 Cmd->getEvent()->setCommand(
nullptr);
156 Cmd->MMarks.MVisited =
false;
161 std::vector<Command *> &Visited,
Command *Cmd) {
171 void Scheduler::GraphBuilder::printGraphAsDot(
const char *ModeName) {
172 static size_t Counter = 0;
173 std::string ModeNameStr(ModeName);
174 std::string FileName =
175 "graph_" + std::to_string(Counter) + ModeNameStr +
".dot";
179 std::fstream Stream(FileName, std::ios::out);
180 Stream <<
"strict digraph {" << std::endl;
182 MVisitedCmds.clear();
184 for (SYCLMemObjI *MemObject : MMemObjs)
185 for (
Command *AllocaCmd : MemObject->MRecord->MAllocaCommands)
188 Stream <<
"}" << std::endl;
194 return MemObject->
MRecord.get();
199 std::vector<Command *> &ToEnqueue) {
203 if (
nullptr != Record)
206 const size_t LeafLimit = 8;
212 DepDesc Dep = findDepForRecord(Dependant, Record);
214 std::vector<Command *> ToCleanUp;
217 ToEnqueue.push_back(ConnectionCmd);
219 --(Dependency->MLeafCounter);
220 if (Dependency->readyForCleanup())
221 ToCleanUp.push_back(Dependency);
232 std::vector<sycl::device> Devices =
233 InteropCtxPtr->get_info<info::context::devices>();
234 assert(Devices.size() != 0);
241 Dev, InteropCtxPtr, {}, {}}};
244 new MemObjRecord{InteropCtxPtr, LeafLimit, AllocateDependency});
245 getOrCreateAllocaForReq(MemObject->
MRecord.get(), Req, InteropQueuePtr,
249 LeafLimit, AllocateDependency});
251 MMemObjs.push_back(MemObject);
252 return MemObject->
MRecord.get();
258 std::vector<Command *> &ToCleanUp) {
265 bool WasLeaf = Cmd->MLeafCounter > 0;
268 if (WasLeaf && Cmd->readyForCleanup()) {
269 ToCleanUp.push_back(Cmd);
276 std::vector<Command *> &ToEnqueue) {
280 if (Leaves.push_back(Cmd, ToEnqueue))
286 std::vector<Command *> &ToEnqueue) {
288 findAllocaForReq(Record, Req, Queue->getContextImplPtr());
289 assert(AllocaCmd &&
"There must be alloca for requirement!");
296 std::set<Command *> Deps =
297 findDepsForReq(Record, Req, Queue->getContextImplPtr());
298 std::vector<Command *> ToCleanUp;
301 UpdateCommand->
addDep(
DepDesc{Dep, StoredReq, AllocaCmd}, ToCleanUp);
303 ToEnqueue.push_back(ConnCmd);
305 updateLeaves(Deps, Record, Req->
MAccessMode, ToCleanUp);
306 addNodeToLeaves(Record, UpdateCommand, Req->
MAccessMode, ToEnqueue);
309 return UpdateCommand;
318 "Expected linked alloca commands");
320 "Expected source alloca command to be active");
322 if (AllocaCmdSrc->
getQueue()->is_host()) {
341 Command *Scheduler::GraphBuilder::insertMemoryMove(
343 std::vector<Command *> &ToEnqueue) {
345 AllocaCommandBase *AllocaCmdDst =
346 getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue);
348 throw runtime_error(
"Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
350 std::set<Command *> Deps =
351 findDepsForReq(Record, Req, Queue->getContextImplPtr());
352 Deps.insert(AllocaCmdDst);
355 if (AllocaCmdDst->getType() == Command::CommandType::ALLOCA_SUB_BUF)
357 static_cast<AllocaSubBufCommand *
>(AllocaCmdDst)->getParentAlloca();
360 AllocaCommandBase *AllocaCmdSrc =
361 findAllocaForReq(Record, Req, Record->MCurContext);
366 auto IsSuitableAlloca = [Record](AllocaCommandBase *AllocaCmd) {
368 Record->MCurContext) &&
370 AllocaCmd->
getType() == Command::CommandType::ALLOCA;
374 std::find_if(Record->MAllocaCommands.begin(),
375 Record->MAllocaCommands.end(), IsSuitableAlloca);
376 AllocaCmdSrc = (Record->MAllocaCommands.end() != It) ? *It :
nullptr;
379 throw runtime_error(
"Cannot find buffer allocation",
380 PI_ERROR_INVALID_VALUE);
383 if (AllocaCmdSrc->getType() == Command::CommandType::ALLOCA_SUB_BUF)
385 static_cast<AllocaSubBufCommand *
>(AllocaCmdSrc)->getParentAlloca();
386 else if (AllocaCmdSrc->getSYCLMemObj() != Req->MSYCLMemObj)
387 assert(
false &&
"Inappropriate alloca command.");
392 if (AllocaCmdSrc->MLinkedAllocaCmd == AllocaCmdDst) {
398 Record->MHostAccess = MapMode;
403 Record->MCurContext = Queue->getContextImplPtr();
409 new MemCpyCommand(*AllocaCmdSrc->getRequirement(), AllocaCmdSrc,
410 *AllocaCmdDst->getRequirement(), AllocaCmdDst,
411 AllocaCmdSrc->getQueue(), AllocaCmdDst->getQueue());
414 std::vector<Command *> ToCleanUp;
417 DepDesc{Dep, NewCmd->getRequirement(), AllocaCmdDst}, ToCleanUp);
419 ToEnqueue.push_back(ConnCmd);
425 Record->MCurContext = Queue->getContextImplPtr();
429 Command *Scheduler::GraphBuilder::remapMemoryObject(
430 MemObjRecord *Record,
Requirement *Req, AllocaCommandBase *HostAllocaCmd,
431 std::vector<Command *> &ToEnqueue) {
432 assert(HostAllocaCmd->getQueue()->is_host() &&
433 "Host alloca command expected");
434 assert(HostAllocaCmd->MIsActive &&
"Active alloca command expected");
436 AllocaCommandBase *LinkedAllocaCmd = HostAllocaCmd->MLinkedAllocaCmd;
437 assert(LinkedAllocaCmd &&
"Linked alloca command expected");
439 std::set<Command *> Deps = findDepsForReq(Record, Req, Record->MCurContext);
441 UnMapMemObject *UnMapCmd =
new UnMapMemObject(
442 LinkedAllocaCmd, *LinkedAllocaCmd->getRequirement(),
443 &HostAllocaCmd->MMemAllocation, LinkedAllocaCmd->getQueue());
449 MapMemObject *MapCmd =
new MapMemObject(
450 LinkedAllocaCmd, *LinkedAllocaCmd->getRequirement(),
451 &HostAllocaCmd->MMemAllocation, LinkedAllocaCmd->getQueue(), MapMode);
453 std::vector<Command *> ToCleanUp;
455 Command *ConnCmd = UnMapCmd->addDep(
456 DepDesc{Dep, UnMapCmd->getRequirement(), LinkedAllocaCmd}, ToCleanUp);
458 ToEnqueue.push_back(ConnCmd);
461 Command *ConnCmd = MapCmd->addDep(
462 DepDesc{UnMapCmd, MapCmd->getRequirement(), HostAllocaCmd}, ToCleanUp);
464 ToEnqueue.push_back(ConnCmd);
470 Record->MHostAccess = MapMode;
478 std::vector<Command *> &ToEnqueue) {
482 if (Record && MPrintOptionsArray[BeforeAddCopyBack])
483 printGraphAsDot(
"before_addCopyBack");
489 std::set<Command *> Deps =
490 findDepsForReq(Record, Req, HostQueue->getContextImplPtr());
492 findAllocaForReq(Record, Req, Record->
MCurContext);
494 auto MemCpyCmdUniquePtr = std::make_unique<MemCpyCommandHost>(
496 SrcAllocaCmd->
getQueue(), std::move(HostQueue));
498 if (!MemCpyCmdUniquePtr)
499 throw runtime_error(
"Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
503 std::vector<Command *> ToCleanUp;
508 ToEnqueue.push_back(ConnCmd);
511 updateLeaves(Deps, Record, Req->
MAccessMode, ToCleanUp);
512 addNodeToLeaves(Record, MemCpyCmd, Req->
MAccessMode, ToEnqueue);
515 if (MPrintOptionsArray[AfterAddCopyBack])
516 printGraphAsDot(
"after_addCopyBack");
524 std::vector<Command *> &ToEnqueue) {
533 MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req, ToEnqueue);
534 if (MPrintOptionsArray[BeforeAddHostAcc])
535 printGraphAsDot(
"before_addHostAccessor");
536 markModifiedIfWrite(Record, Req);
539 getOrCreateAllocaForReq(Record, Req, HostQueue, ToEnqueue);
544 remapMemoryObject(Record, Req,
552 insertMemoryMove(Record, Req, HostQueue, ToEnqueue);
555 insertUpdateHostReqCmd(Record, Req, HostQueue, ToEnqueue);
559 addEmptyCmd(UpdateHostAccCmd, {Req}, HostQueue,
564 if (MPrintOptionsArray[AfterAddHostAcc])
565 printGraphAsDot(
"after_addHostAccessor");
567 return UpdateHostAccCmd;
571 std::unique_ptr<detail::CG> CommandGroup,
const QueueImplPtr &HostQueue,
572 std::vector<Command *> &ToEnqueue) {
574 auto UpdateHost =
static_cast<CGUpdateHost *
>(CommandGroup.get());
577 MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req, ToEnqueue);
578 return insertMemoryMove(Record, Req, HostQueue, ToEnqueue);
590 Scheduler::GraphBuilder::findDepsForReq(
MemObjRecord *Record,
593 std::set<Command *> RetDeps;
594 std::vector<Command *> Visited;
602 ToAnalyze.insert(ToAnalyze.begin(), V.begin(), V.end());
605 while (!ToAnalyze.empty()) {
606 Command *DepCmd = ToAnalyze.back();
607 ToAnalyze.pop_back();
609 std::vector<Command *> NewAnalyze;
611 for (
const DepDesc &Dep : DepCmd->MDeps) {
612 if (Dep.MDepRequirement->MSYCLMemObj != Req->
MSYCLMemObj)
615 bool CanBypassDep =
false;
621 CanBypassDep |= !
doOverlap(Dep.MDepRequirement, Req);
626 sameCtx(Context, Dep.MDepCommand->getQueue()->getContextImplPtr());
629 RetDeps.insert(DepCmd);
637 NewAnalyze.push_back(Dep.MDepCommand);
639 ToAnalyze.insert(ToAnalyze.end(), NewAnalyze.begin(), NewAnalyze.end());
654 assert(
false &&
"No dependency found for a leaf of the record");
655 return {
nullptr,
nullptr,
nullptr};
663 auto IsSuitableAlloca = [&Context, Req,
665 bool Res =
sameCtx(AllocaCmd->
getQueue()->getContextImplPtr(), Context);
668 Res &= AllocaCmd->
getType() == Command::CommandType::ALLOCA_SUB_BUF;
671 Res &= AllowConst || !AllocaCmd->
MIsConst;
682 if (std::strcmp(HUMConfig,
"0") == 0)
683 return Ctx->is_host();
684 if (std::strcmp(HUMConfig,
"1") == 0)
687 for (
const device &Device : Ctx->getDevices()) {
688 if (!Device.get_info<info::device::host_unified_memory>())
698 AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
700 std::vector<Command *> &ToEnqueue) {
702 AllocaCommandBase *AllocaCmd = findAllocaForReq(
703 Record, Req, Queue->getContextImplPtr(),
false);
706 std::vector<Command *> ToCleanUp;
710 range<3> ParentRange{Req->MSYCLMemObj->getSizeInBytes(), 1, 1};
712 {0, 0, 0}, ParentRange, ParentRange,
714 sizeof(char),
size_t(0));
717 getOrCreateAllocaForReq(Record, &ParentRequirement, Queue, ToEnqueue);
718 AllocaCmd =
new AllocaSubBufCommand(Queue, *Req, ParentAlloca, ToEnqueue,
722 const Requirement FullReq( {0, 0, 0}, Req->MMemoryRange,
724 Req->MSYCLMemObj, Req->MDims, Req->MElemSize,
733 const bool HostUnifiedMemory =
735 SYCLMemObjI *MemObj = Req->MSYCLMemObj;
736 const bool InitFromUserData = Record->MAllocaCommands.empty() &&
737 (HostUnifiedMemory || MemObj->isInterop());
738 AllocaCommandBase *LinkedAllocaCmd =
nullptr;
743 if (Record->MAllocaCommands.empty()) {
744 if (!HostUnifiedMemory &&
749 if (MemObj->hasUserDataPtr()) {
752 AllocaCommand *HostAllocaCmd =
new AllocaCommand(
755 MemObj->isHostPointerReadOnly() );
756 Record->MAllocaCommands.push_back(HostAllocaCmd);
757 Record->MWriteLeaves.push_back(HostAllocaCmd, ToEnqueue);
758 ++(HostAllocaCmd->MLeafCounter);
766 if (Req->MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer)
771 if (Queue->is_host() != Record->MCurContext->is_host()) {
780 bool PinnedHostMemory = MemObj->usesPinnedHostMemory();
782 bool HostUnifiedMemoryOnNonHostDevice =
785 if (PinnedHostMemory || HostUnifiedMemoryOnNonHostDevice) {
786 AllocaCommandBase *LinkedAllocaCmdCand = findAllocaForReq(
787 Record, Req, Record->MCurContext,
false);
790 if (LinkedAllocaCmdCand &&
791 !LinkedAllocaCmdCand->MLinkedAllocaCmd) {
792 LinkedAllocaCmd = LinkedAllocaCmdCand;
799 new AllocaCommand(Queue, FullReq, InitFromUserData, LinkedAllocaCmd);
802 if (LinkedAllocaCmd) {
803 Command *ConnCmd = AllocaCmd->addDep(
804 DepDesc{LinkedAllocaCmd, AllocaCmd->getRequirement(),
808 ToEnqueue.push_back(ConnCmd);
809 LinkedAllocaCmd->MLinkedAllocaCmd = AllocaCmd;
812 ConnCmd = AllocaCmd->getReleaseCmd()->addDep(
813 DepDesc(LinkedAllocaCmd->getReleaseCmd(),
814 AllocaCmd->getRequirement(), LinkedAllocaCmd),
817 ToEnqueue.push_back(ConnCmd);
823 if (Queue->is_host()) {
824 AllocaCmd->MIsActive =
false;
826 LinkedAllocaCmd->MIsActive =
false;
827 Record->MCurContext = Queue->getContextImplPtr();
829 std::set<Command *> Deps =
830 findDepsForReq(Record, Req, Queue->getContextImplPtr());
832 Command *ConnCmd = AllocaCmd->addDep(
833 DepDesc{Dep, Req, LinkedAllocaCmd}, ToCleanUp);
835 ToEnqueue.push_back(ConnCmd);
837 updateLeaves(Deps, Record, Req->MAccessMode, ToCleanUp);
838 addNodeToLeaves(Record, AllocaCmd, Req->MAccessMode, ToEnqueue);
843 Record->MAllocaCommands.push_back(AllocaCmd);
844 Record->MWriteLeaves.push_back(AllocaCmd, ToEnqueue);
845 ++(AllocaCmd->MLeafCounter);
853 void Scheduler::GraphBuilder::markModifiedIfWrite(MemObjRecord *Record,
855 switch (Req->MAccessMode) {
861 Record->MMemModified =
true;
868 EmptyCommand *Scheduler::GraphBuilder::addEmptyCmd(
869 Command *Cmd,
const std::vector<Requirement *> &Reqs,
871 std::vector<Command *> &ToEnqueue,
const bool AddDepsToLeaves) {
872 EmptyCommand *EmptyCmd =
876 throw runtime_error(
"Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
878 EmptyCmd->MIsBlockable =
true;
880 EmptyCmd->MBlockReason = Reason;
883 MemObjRecord *Record = getOrInsertMemObjRecord(Queue, Req, ToEnqueue);
884 AllocaCommandBase *AllocaCmd =
885 getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue);
886 EmptyCmd->addRequirement(Cmd, AllocaCmd, Req);
891 Cmd->addUser(EmptyCmd);
893 if (AddDepsToLeaves) {
894 const std::vector<DepDesc> &Deps = Cmd->MDeps;
895 std::vector<Command *> ToCleanUp;
896 for (
const DepDesc &Dep : Deps) {
900 updateLeaves({Cmd}, Record, Req->MAccessMode, ToCleanUp);
901 addNodeToLeaves(Record, EmptyCmd, Req->MAccessMode, ToEnqueue);
911 if (Cmd->
getCG().
getType() != CG::CGTYPE::CodeplayHostTask)
921 std::unordered_map<SYCLMemObjI *, access::mode> CombinedModes;
922 bool HasDuplicateMemObjects =
false;
924 auto Result = CombinedModes.insert(
925 std::make_pair(Req->MSYCLMemObj, Req->MAccessMode));
926 if (!Result.second) {
927 Result.first->second =
929 HasDuplicateMemObjects =
true;
933 if (!HasDuplicateMemObjects)
936 Req->MAccessMode = CombinedModes[Req->MSYCLMemObj];
941 std::unique_ptr<detail::CG> CommandGroup,
const QueueImplPtr &Queue,
942 std::vector<Command *> &ToEnqueue,
944 const std::vector<sycl::detail::pi::PiExtSyncPoint> &Dependencies) {
945 std::vector<Requirement *> &Reqs = CommandGroup->getRequirements();
946 std::vector<detail::EventImplPtr> &Events = CommandGroup->getEvents();
948 auto NewCmd = std::make_unique<ExecCGCommand>(
949 std::move(CommandGroup), Queue, CommandBuffer, std::move(Dependencies));
952 throw runtime_error(
"Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
958 auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
960 if (NewCmd->isFusable()) {
961 auto *FusionCmd = findFusionList(QUniqueID)->second.get();
963 bool dependsOnFusion =
false;
964 for (
auto Ev = Events.begin(); Ev != Events.end();) {
965 auto *EvDepCmd =
static_cast<Command *
>((*Ev)->getCommand());
972 if (EvDepCmd->getQueue() != Queue && isPartOfActiveFusion(EvDepCmd)) {
974 "Aborting fusion because of event dependency from a "
980 if (EvDepCmd == FusionCmd) {
981 Ev = Events.erase(Ev);
982 dependsOnFusion =
true;
992 if (dependsOnFusion) {
993 for (
auto *Cmd : FusionCmd->getFusionList()) {
994 Events.push_back(Cmd->getEvent());
1001 createGraphForCommand(NewCmd.get(), NewCmd->getCG(),
1003 Queue, FusionCmd->auxiliaryCommands());
1007 FusionCmd->addToFusionList(NewCmd.get());
1008 NewCmd->MFusionCmd = FusionCmd;
1009 std::vector<Command *> ToCleanUp;
1012 auto ConnectionCmd = FusionCmd->addDep(NewCmd->getEvent(), ToCleanUp);
1013 if (ConnectionCmd) {
1014 FusionCmd->auxiliaryCommands().push_back(ConnectionCmd);
1016 return {NewCmd.release(), FusionCmd->getEvent(),
false};
1019 std::stringstream ss(s);
1020 if (NewCmd->getCG().getType() == CG::CGTYPE::Kernel) {
1021 ss <<
"Not fusing kernel with 'use_root_sync' property. Can only fuse "
1022 "non-cooperative device kernels.";
1024 ss <<
"Not fusing '" << NewCmd->getTypeString()
1025 <<
"' command group. Can only fuse device kernel command groups.";
1027 printFusionWarning(ss.str());
1030 createGraphForCommand(NewCmd.get(), NewCmd->getCG(),
1033 auto Event = NewCmd->getEvent();
1034 return {NewCmd.release(), Event,
true};
1037 void Scheduler::GraphBuilder::createGraphForCommand(
1039 std::vector<Requirement *> &Reqs,
1040 const std::vector<detail::EventImplPtr> &Events,
QueueImplPtr Queue,
1041 std::vector<Command *> &ToEnqueue) {
1043 if (MPrintOptionsArray[BeforeAddCG])
1044 printGraphAsDot(
"before_addCG");
1050 std::vector<Command *> ToCleanUp;
1055 bool isSameCtx =
false;
1061 Record = getOrInsertMemObjRecord(QueueForAlloca, Req, ToEnqueue);
1062 markModifiedIfWrite(Record, Req);
1065 getOrCreateAllocaForReq(Record, Req, QueueForAlloca, ToEnqueue);
1068 sameCtx(QueueForAlloca->getContextImplPtr(), Record->MCurContext);
1078 remapMemoryObject(Record, Req,
1088 bool NeedMemMoveToHost =
false;
1089 auto MemMoveTargetQueue = Queue;
1091 if (isInteropTask) {
1092 const detail::CGHostTask &HT =
static_cast<detail::CGHostTask &
>(CG);
1094 if (HT.MQueue->getContextImplPtr() != Record->
MCurContext) {
1095 NeedMemMoveToHost =
true;
1096 MemMoveTargetQueue = HT.MQueue;
1098 }
else if (!Queue->is_host() && !Record->
MCurContext->is_host())
1099 NeedMemMoveToHost =
true;
1101 if (NeedMemMoveToHost)
1102 insertMemoryMove(Record, Req,
1105 insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue);
1107 std::set<Command *> Deps =
1108 findDepsForReq(Record, Req, Queue->getContextImplPtr());
1111 if (Dep != NewCmd) {
1113 NewCmd->
addDep(DepDesc{Dep, Req, AllocaCmd}, ToCleanUp);
1115 ToEnqueue.push_back(ConnCmd);
1124 std::vector<DepDesc> Deps = NewCmd->
MDeps;
1125 for (DepDesc &Dep : Deps) {
1128 updateLeaves({Dep.MDepCommand}, Record, Req->MAccessMode, ToCleanUp);
1129 addNodeToLeaves(Record, NewCmd, Req->MAccessMode, ToEnqueue);
1134 if (e->getCommand() && e->getCommand() == NewCmd) {
1138 ToEnqueue.push_back(ConnCmd);
1141 if (MPrintOptionsArray[AfterAddCG])
1142 printGraphAsDot(
"after_addCG");
1144 for (
Command *Cmd : ToCleanUp) {
1145 cleanupCommand(Cmd);
1154 cleanupCommand(Cmd);
1159 cleanupCommand(Cmd);
1164 std::vector<AllocaCommandBase *> &AllocaCommands = Record->
MAllocaCommands;
1165 if (AllocaCommands.empty())
1168 assert(MCmdsToVisit.empty());
1169 MVisitedCmds.clear();
1173 for (
Command *AllocaCmd : AllocaCommands) {
1179 if (UserCmd->
getType() != Command::CommandType::ALLOCA)
1180 MCmdsToVisit.push(UserCmd);
1187 AllocaCmd->
MUsers.clear();
1205 while (!MCmdsToVisit.empty()) {
1206 Command *Cmd = MCmdsToVisit.front();
1213 if (UserCmd->
getType() != Command::CommandType::ALLOCA)
1214 MCmdsToVisit.push(UserCmd);
1218 std::map<Command *, bool> ShouldBeUpdated;
1219 auto NewEnd = std::remove_if(
1221 if (std::find(AllocaCommands.begin(), AllocaCommands.end(),
1222 Dep.MAllocaCmd) != AllocaCommands.end()) {
1223 ShouldBeUpdated.insert({Dep.MDepCommand, true});
1229 Cmd->
MDeps.erase(NewEnd, Cmd->MDeps.end());
1232 for (
auto DepCmdIt : ShouldBeUpdated) {
1233 if (!DepCmdIt.second)
1235 DepCmdIt.first->MUsers.erase(Cmd);
1240 if (Cmd->MDeps.empty()) {
1241 Cmd->MUsers.clear();
1244 if (!Cmd->MMarkedForCleanup)
1245 Cmd->MMarks.MToBeDeleted =
true;
1253 Command *Cmd, [[maybe_unused]]
bool AllowUnsubmitted) {
1255 static bool DeprWarningPrinted =
false;
1256 if (!DeprWarningPrinted) {
1257 std::cerr <<
"WARNING: The enviroment variable "
1258 "SYCL_DISABLE_POST_ENQUEUE_CLEANUP is deprecated. Please "
1259 "use SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP instead.\n";
1260 DeprWarningPrinted =
true;
1292 DepCmd->
MUsers.erase(Cmd);
1303 Cmd->
getEvent()->setCommand(
nullptr);
1308 const auto It = std::find_if(
1309 MMemObjs.begin(), MMemObjs.end(),
1310 [MemObject](
const SYCLMemObjI *Obj) { return Obj == MemObject; });
1311 if (It != MMemObjs.end())
1331 std::vector<Command *> &ToCleanUp) {
1340 std::move(HT), {}, {}, {},
1349 }
catch (
const std::bad_alloc &) {
1350 throw runtime_error(
"Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
1357 (void)ConnectCmd->
addDep(Dep, ToCleanUp);
1358 assert(
reinterpret_cast<Command *
>(DepEvent->getCommand()) ==
1367 std::ignore = Cmd->
addDep(DepOnConnect, ToCleanUp);
1372 if (
Command *DepCmd =
reinterpret_cast<Command *
>(DepEvent->getCommand()))
1375 std::ignore = ConnectCmd->
addDep(DepEvent, ToCleanUp);
1387 auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
1388 MFusionMap.emplace(QUniqueID, std::make_unique<KernelFusionCommand>(Queue));
1392 sycl::detail::queue_impl *Queue) {
1393 auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue);
1396 "Queue already in fusion mode"};
1398 auto OldFusionCmd = findFusionList(QUniqueID);
1399 if (OldFusionCmd != MFusionMap.end()) {
1404 OldFusionCmd->second->setFusionStatus(
1406 cleanupCommand(OldFusionCmd->second.release());
1407 MFusionMap.erase(OldFusionCmd);
1411 void Scheduler::GraphBuilder::removeNodeFromGraph(
1412 Command *Node, std::vector<Command *> &ToEnqueue) {
1415 for (
auto &Dep : Node->
MDeps) {
1416 auto AccessMode = Dep.MDepRequirement->MAccessMode;
1419 Node->
MLeafCounter -= Record->MReadLeaves.remove(Node);
1420 Node->
MLeafCounter -= Record->MWriteLeaves.remove(Node);
1424 for (
auto PrevDep : Dep.MDepCommand->MDeps) {
1425 auto *DepReq = PrevDep.MDepRequirement;
1427 if (DepRecord == Record) {
1430 assert(Dep.MDepCommand);
1431 addNodeToLeaves(Record, Dep.MDepCommand, DepReq->MAccessMode,
1436 Dep.MDepCommand->MUsers.erase(Node);
1446 std::vector<Command *> &ToEnqueue) {
1447 auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
1453 auto *PlaceholderCmd = (*FusionList).second.get();
1456 auto FusedCmdList = PlaceholderCmd->getFusionList();
1457 ToEnqueue.insert(ToEnqueue.end(), FusedCmdList.begin(), FusedCmdList.end());
1461 ToEnqueue.insert(ToEnqueue.end(), PlaceholderCmd->auxiliaryCommands().begin(),
1462 PlaceholderCmd->auxiliaryCommands().end());
1464 ToEnqueue.push_back(PlaceholderCmd);
1466 if (MPrintOptionsArray[AfterFusionCancel]) {
1467 printGraphAsDot(
"after_fusionCancel");
1476 return static_cast<ExecCGCommand *
>(Cmd)->MFusionCmd == Fusion;
1493 return !PredPartOfFusion;
1501 for (
auto &Dep : Cmd->
MDeps) {
1502 auto *DepCmd = Dep.MDepCommand;
1511 auto *EvDepCmd =
static_cast<Command *
>(Ev->getCommand());
1520 auto *EvDepCmd =
static_cast<Command *
>(Ev->getCommand());
1533 std::vector<Command *> &ToEnqueue,
1535 auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
1536 #if SYCL_EXT_CODEPLAY_KERNEL_FUSION
1538 auto InactiveFusionList = findFusionList(QUniqueID);
1539 if (InactiveFusionList == MFusionMap.end()) {
1542 "Calling complete_fusion on a queue not in fusion mode"};
1544 return InactiveFusionList->second->getEvent();
1548 auto *PlaceholderCmd =
FusionList->second.get();
1549 auto &CmdList = PlaceholderCmd->getFusionList();
1563 bool CreatesCircularDep =
1564 MFusionMap.size() > 1 &&
1566 return checkForCircularDependency(Cmd, true, PlaceholderCmd);
1568 if (CreatesCircularDep) {
1571 "Aborting fusion because it would create a circular dependency");
1572 auto LastEvent = PlaceholderCmd->getEvent();
1579 Queue, CmdList, PropList);
1585 auto LastEvent = PlaceholderCmd->getEvent();
1591 std::vector<EventImplPtr> FusedEventDeps;
1592 for (
auto *Cmd : CmdList) {
1593 FusedEventDeps.insert(FusedEventDeps.end(),
1596 FusedEventDeps.insert(FusedEventDeps.end(),
1603 FusedEventDeps.erase(
1604 std::remove_if(FusedEventDeps.begin(), FusedEventDeps.end(),
1606 if (E->getCommand() == PlaceholderCmd) {
1609 if (E->getCommand() &&
1613 static_cast<ExecCGCommand *>(E->getCommand());
1614 if (RunCGCmd->MFusionCmd == PlaceholderCmd) {
1620 FusedEventDeps.end());
1622 auto FusedKernelCmd =
1623 std::make_unique<ExecCGCommand>(std::move(FusedCG), Queue);
1627 PlaceholderCmd->getEvent());
1628 assert(PlaceholderCmd->MDeps.empty());
1633 for (
auto OldCmd = CmdList.rbegin(); OldCmd != CmdList.rend(); ++OldCmd) {
1634 removeNodeFromGraph(*OldCmd, ToEnqueue);
1635 cleanupCommand(*OldCmd,
true);
1638 createGraphForCommand(FusedKernelCmd.get(), FusedKernelCmd->getCG(),
false,
1639 FusedKernelCmd->getCG().getRequirements(),
1640 FusedEventDeps, Queue, ToEnqueue);
1642 ToEnqueue.push_back(FusedKernelCmd.get());
1644 std::vector<Command *> ToCleanUp;
1646 auto *ConnectToPlaceholder =
1647 PlaceholderCmd->addDep(FusedKernelCmd->getEvent(), ToCleanUp);
1648 if (ConnectToPlaceholder) {
1649 ToEnqueue.push_back(ConnectToPlaceholder);
1651 for (Command *Cmd : ToCleanUp) {
1652 cleanupCommand(Cmd);
1654 ToEnqueue.push_back(PlaceholderCmd);
1656 if (MPrintOptionsArray[AfterFusionComplete]) {
1657 printGraphAsDot(
"after_fusionComplete");
1663 return FusedKernelCmd.release()->getEvent();
1665 printFusionWarning(
"Kernel fusion not supported by this build");
1668 auto *PlaceholderCmd =
FusionList->second.get();
1669 auto LastEvent = PlaceholderCmd->getEvent();
1670 this->cancelFusion(Queue, ToEnqueue);
1675 bool Scheduler::GraphBuilder::isInFusionMode(
QueueIdT Id) {
1683 Command *Scheduler::GraphBuilder::addCommandGraphUpdate(
1685 std::vector<std::shared_ptr<ext::oneapi::experimental::detail::node_impl>>
1687 const QueueImplPtr &Queue, std::vector<Requirement *> Requirements,
1688 std::vector<detail::EventImplPtr> &Events,
1689 std::vector<Command *> &ToEnqueue) {
1691 std::make_unique<UpdateCommandBufferCommand>(Queue, Graph, Nodes);
1696 std::vector<Command *> ToCleanUp;
1701 bool isSameCtx =
false;
1705 Record = getOrInsertMemObjRecord(Queue, Req, ToEnqueue);
1706 markModifiedIfWrite(Record, Req);
1708 AllocaCmd = getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue);
1716 bool NeedMemMoveToHost =
false;
1717 auto MemMoveTargetQueue = Queue;
1719 if (!Queue->is_host() && !Record->
MCurContext->is_host())
1720 NeedMemMoveToHost =
true;
1722 if (NeedMemMoveToHost)
1723 insertMemoryMove(Record, Req,
1724 Scheduler::getInstance().getDefaultHostQueue(),
1726 insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue);
1728 std::set<Command *> Deps =
1729 findDepsForReq(Record, Req, Queue->getContextImplPtr());
1732 if (Dep != NewCmd.get()) {
1736 ToEnqueue.push_back(ConnCmd);
1745 std::vector<DepDesc> Deps = NewCmd->
MDeps;
1749 updateLeaves({Dep.MDepCommand}, Record, Req->
MAccessMode, ToCleanUp);
1750 addNodeToLeaves(Record, NewCmd.get(), Req->
MAccessMode, ToEnqueue);
1755 if (e->getCommand() &&
1756 e->getCommand() ==
static_cast<Command *
>(NewCmd.get())) {
1760 ToEnqueue.push_back(ConnCmd);
1763 if (MPrintOptionsArray[AfterAddCG])
1764 printGraphAsDot(
"after_addCG");
1766 for (
Command *Cmd : ToCleanUp) {
1767 cleanupCommand(Cmd);
1770 return NewCmd.release();
detail::SYCLMemObjI * MSYCLMemObj
range< 3 > & MAccessRange
Base class for memory allocation commands.
const Requirement * getRequirement() const final
AllocaCommandBase * MLinkedAllocaCmd
Alloca command linked with current command.
bool MIsActive
Indicates that current alloca is active one.
The AllocaSubBuf command enqueues creation of sub-buffer of memory object.
std::unique_ptr< HostTask > MHostTask
"Update host" command group class.
Base class for all types of command groups.
The Command class represents some action that needs to be performed on one or more memory objects.
bool isSuccessfullyEnqueued() const
const std::vector< EventImplPtr > & getPreparedDepsEvents() const
const std::vector< EventImplPtr > & getPreparedHostDepsEvents() const
unsigned MLeafCounter
Counts the number of memory objects this command is a leaf for.
std::unordered_set< Command * > MUsers
Contains list of commands that depend on the command.
virtual const ContextImplPtr & getWorkerContext() const
Get the context of the queue this command will be submitted to.
Marks MMarks
Used for marking the node during graph traversal.
std::vector< DepDesc > MDeps
Contains list of dependencies(edges)
virtual bool readyForCleanup() const
Returns true iff this command is ready to be submitted for cleanup.
void addUser(Command *NewUser)
void clearAllDependencies()
Clear all dependency events. This should only be used if a command is about to be deleted without being executed first.
const EventImplPtr & getEvent() const
CommandType getType() const
virtual void printDot(std::ostream &Stream) const =0
Command * addDep(DepDesc NewDep, std::vector< Command * > &ToCleanUp)
const QueueImplPtr & getQueue() const
The empty command does nothing during enqueue.
The exec CG command enqueues execution of kernel or explicit memory operation.
detail::CG & getCG() const
The KernelFusionCommand is placed in the execution graph together with the individual kernels of the ...
bool readyForDeletion() const
A wrapper for CircularBuffer class along with collection for host accessor's EmptyCommands.
size_t remove(value_type Cmd)
Replacement for std::remove with subsequent call to erase(newEnd, end()).
std::function< void(Command *, Command *, MemObjRecord *, EnqueueListT &)> AllocateDependencyF
std::vector< value_type > toVector() const
std::vector< Command * > EnqueueListT
The map command enqueues mapping of device memory onto host memory.
The mem copy host command enqueues memory copy between two instances of memory object.
const Requirement * getRequirement() const final
virtual ContextImplPtr getInteropContext() const =0
std::shared_ptr< MemObjRecord > MRecord
void handleWriteAccessorCreation()
void startFusion(QueueImplPtr Queue)
void cleanupCommand(Command *Cmd, bool AllowUnsubmitted=false)
void decrementLeafCountersForRecord(MemObjRecord *Record)
Decrements leaf counters for all leaves of the record.
MemObjRecord * getMemObjRecord(SYCLMemObjI *MemObject)
EventImplPtr completeFusion(QueueImplPtr Queue, std::vector< Command * > &ToEnqueue, const property_list &)
Command * addHostAccessor(Requirement *Req, std::vector< Command * > &ToEnqueue)
Enqueues a command to create a host accessor.
void cleanupCommandsForRecord(MemObjRecord *Record)
Removes commands that use the given MemObjRecord from the graph.
GraphBuildResult addCG(std::unique_ptr< detail::CG > CommandGroup, const QueueImplPtr &Queue, std::vector< Command * > &ToEnqueue, sycl::detail::pi::PiExtCommandBuffer CommandBuffer=nullptr, const std::vector< sycl::detail::pi::PiExtSyncPoint > &Dependencies={})
Registers command group and adds it to the dependency graph.
void removeRecordForMemObj(SYCLMemObjI *MemObject)
Removes the MemObjRecord for the memory object passed.
Command * addCopyBack(Requirement *Req, std::vector< Command * > &ToEnqueue)
Enqueues a command to update memory to the latest state.
Command * connectDepEvent(Command *const Cmd, const EventImplPtr &DepEvent, const DepDesc &Dep, std::vector< Command * > &ToCleanUp)
Perform connection of events in multiple contexts.
Command * addCGUpdateHost(std::unique_ptr< detail::CG > CommandGroup, const QueueImplPtr &HostQueue, std::vector< Command * > &ToEnqueue)
Registers a command group that updates host memory to the latest state.
MemObjRecord * getOrInsertMemObjRecord(const QueueImplPtr &Queue, const Requirement *Req, std::vector< Command * > &ToEnqueue)
void cancelFusion(QueueImplPtr Queue, std::vector< Command * > &ToEnqueue)
void addNodeToLeaves(MemObjRecord *Record, Command *Cmd, access::mode AccessMode, std::vector< Command * > &ToEnqueue)
Adds new command to leaves if needed.
void cleanUpCmdFusion(sycl::detail::queue_impl *Queue)
Clean up the internal fusion commands held for the given queue.
AllocaCommandBase * findAllocaForReq(MemObjRecord *Record, const Requirement *Req, const ContextImplPtr &Context, bool AllowConst=true)
Searches for suitable alloca in memory record.
DepDesc findDepForRecord(Command *Cmd, MemObjRecord *Record)
Finds a command dependency corresponding to the record.
void updateLeaves(const std::set< Command * > &Cmds, MemObjRecord *Record, access::mode AccessMode, std::vector< Command * > &ToCleanUp)
Removes commands from leaves.
QueueImplPtr getDefaultHostQueue()
bool isInFusionMode(QueueIdT Queue)
void cancelFusion(QueueImplPtr Queue)
static MemObjRecord * getMemObjRecord(const Requirement *const Req)
static Scheduler & getInstance()
void cleanUpCmdFusion(sycl::detail::queue_impl *Queue)
void takeAuxiliaryResources(const EventImplPtr &Dst, const EventImplPtr &Src)
Assigns Src's auxiliary resources to Dst.
QueueImplPtr DefaultHostQueue
The unmap command removes mapping of host memory onto device memory.
const Requirement * getRequirement() const final
std::unique_ptr< detail::CG > fuseKernels(QueueImplPtr Queue, std::vector< ExecCGCommand * > &InputKernels, const property_list &)
static jit_compiler & get_instance()
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Class representing the implementation of command_graph<executable>.
Objects of the property_list class are containers for the SYCL properties.
__SYCL_EXTERN_STREAM_ATTRS ostream cerr
Linked to standard error (unbuffered)
static bool doOverlap(const Requirement *LHS, const Requirement *RHS)
Checks whether two requirements overlap or not.
static bool createsCircularDependency(Command *Cmd, bool PredPartOfFusion, KernelFusionCommand *Fusion)
static bool isInteropHostTask(ExecCGCommand *Cmd)
static void unmarkVisitedNodes(std::vector< Command * > &Visited)
static bool checkForCircularDependency(Command *, bool, KernelFusionCommand *)
std::hash< std::shared_ptr< detail::queue_impl > >::result_type QueueIdT
static access::mode combineAccessModes(access::mode A, access::mode B)
Combines two access modes into a single one that allows both.
static bool markNodeAsVisited(Command *Cmd, std::vector< Command * > &Visited)
std::shared_ptr< sycl::detail::context_impl > ContextImplPtr
std::unique_ptr< KernelFusionCommand > FusionList
decltype(Obj::impl) getSyclObjImpl(const Obj &SyclObject)
static bool isPartOfFusion(Command *Cmd, KernelFusionCommand *Fusion)
std::shared_ptr< event_impl > EventImplPtr
static void printDotRecursive(std::fstream &Stream, std::vector< Command * > &Visited, Command *Cmd)
AccessorImplHost Requirement
std::shared_ptr< device_impl > DeviceImplPtr
static void handleVisitedNodes(std::vector< Command * > &Visited)
static bool IsSuitableSubReq(const Requirement *Req)
Checks if the current requirement is a requirement for a sub-buffer.
static Command * insertMapUnmapForLinkedCmds(AllocaCommandBase *AllocaCmdSrc, AllocaCommandBase *AllocaCmdDst, access::mode MapMode)
static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx)
std::shared_ptr< sycl::detail::queue_impl > QueueImplPtr
static bool isAccessModeAllowed(access::mode Required, access::mode Current)
Checks if the required access mode is allowed under the current one.
static void combineAccessModesOfReqs(std::vector< Requirement * > &Reqs)
static bool sameCtx(const ContextImplPtr &LHS, const ContextImplPtr &RHS)
constexpr if (sizeof(T) == 8)
class __SYCL_EBO __SYCL_SPECIAL_CLASS __SYCL_TYPE(local_accessor) local_accessor
class __SYCL_EBO __SYCL_SPECIAL_CLASS AccessMode
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using E and sycl_category().
bool any_of(const simd_mask< _Tp, _Abi > &) noexcept
bool MVisited
Used for marking the node as visited during graph traversal.
bool MToBeDeleted
Used for marking the node for deletion during cleanup.
Dependency between two commands.
const Requirement * MDepRequirement
Requirement for the dependency.
Command * MDepCommand
The actual dependency command.
AllocaCommandBase * MAllocaCmd
Allocation command for the memory object we have requirement for.
LeavesCollection MWriteLeaves
ContextImplPtr MCurContext
std::vector< AllocaCommandBase * > MAllocaCommands
LeavesCollection MReadLeaves