13 #include <sycl/feature_test.hpp>
14 #if SYCL_EXT_CODEPLAY_KERNEL_FUSION
36 inline namespace _V1 {
68 return (Required == Current);
70 assert(
false &&
"Write only access is expected to be mapped as read_write");
100 std::string GraphPrintOpts(EnvVarCStr);
101 bool EnableAlways = GraphPrintOpts.find(
"always") != std::string::npos;
103 if (GraphPrintOpts.find(
"before_addCG") != std::string::npos ||
105 MPrintOptionsArray[BeforeAddCG] =
true;
106 if (GraphPrintOpts.find(
"after_addCG") != std::string::npos || EnableAlways)
107 MPrintOptionsArray[AfterAddCG] =
true;
108 if (GraphPrintOpts.find(
"before_addCopyBack") != std::string::npos ||
110 MPrintOptionsArray[BeforeAddCopyBack] =
true;
111 if (GraphPrintOpts.find(
"after_addCopyBack") != std::string::npos ||
113 MPrintOptionsArray[AfterAddCopyBack] =
true;
114 if (GraphPrintOpts.find(
"before_addHostAcc") != std::string::npos ||
116 MPrintOptionsArray[BeforeAddHostAcc] =
true;
117 if (GraphPrintOpts.find(
"after_addHostAcc") != std::string::npos ||
119 MPrintOptionsArray[AfterAddHostAcc] =
true;
120 if (GraphPrintOpts.find(
"after_fusionComplete") != std::string::npos ||
122 MPrintOptionsArray[AfterFusionComplete] =
true;
123 if (GraphPrintOpts.find(
"after_fusionCancel") != std::string::npos ||
125 MPrintOptionsArray[AfterFusionCancel] =
true;
130 assert(Cmd &&
"Cmd can't be nullptr");
134 Visited.push_back(Cmd);
140 Cmd->MMarks.MVisited =
false;
145 if (Cmd->MMarks.MToBeDeleted) {
154 Cmd->getEvent()->setCommand(
nullptr);
157 Cmd->MMarks.MVisited =
false;
162 std::vector<Command *> &Visited,
Command *Cmd) {
172 void Scheduler::GraphBuilder::printGraphAsDot(
const char *ModeName) {
173 static size_t Counter = 0;
174 std::string ModeNameStr(ModeName);
175 std::string FileName =
176 "graph_" + std::to_string(Counter) + ModeNameStr +
".dot";
180 std::fstream Stream(FileName, std::ios::out);
181 Stream <<
"strict digraph {" << std::endl;
183 MVisitedCmds.clear();
185 for (SYCLMemObjI *MemObject : MMemObjs)
186 for (
Command *AllocaCmd : MemObject->MRecord->MAllocaCommands)
189 Stream <<
"}" << std::endl;
195 return MemObject->
MRecord.get();
204 if (
nullptr != Record)
207 const size_t LeafLimit = 8;
213 DepDesc Dep = findDepForRecord(Dependant, Record);
215 std::vector<Command *> ToCleanUp;
218 ToEnqueue.push_back(ConnectionCmd);
220 --(Dependency->MLeafCounter);
221 if (Dependency->readyForCleanup())
222 ToCleanUp.push_back(Dependency);
233 std::vector<sycl::device> Devices =
234 InteropCtxPtr->get_info<info::context::devices>();
235 assert(Devices.size() != 0);
242 Dev, InteropCtxPtr, {}, {}}};
245 new MemObjRecord{InteropCtxPtr, LeafLimit, AllocateDependency});
246 std::vector<Command *> ToEnqueue;
247 getOrCreateAllocaForReq(MemObject->
MRecord.get(), Req, InteropQueuePtr,
249 assert(ToEnqueue.empty() &&
"Creation of the first alloca for a record "
250 "shouldn't lead to any enqueuing (no linked "
251 "alloca or exceeding the leaf limit).");
254 LeafLimit, AllocateDependency});
256 MMemObjs.push_back(MemObject);
257 return MemObject->
MRecord.get();
263 std::vector<Command *> &ToCleanUp) {
270 bool WasLeaf = Cmd->MLeafCounter > 0;
273 if (WasLeaf && Cmd->readyForCleanup()) {
274 ToCleanUp.push_back(Cmd);
281 std::vector<Command *> &ToEnqueue) {
285 if (Leaves.push_back(Cmd, ToEnqueue))
291 std::vector<Command *> &ToEnqueue) {
294 assert(AllocaCmd &&
"There must be alloca for requirement!");
301 std::set<Command *> Deps = findDepsForReq(Record, Req, Context);
302 std::vector<Command *> ToCleanUp;
305 UpdateCommand->
addDep(
DepDesc{Dep, StoredReq, AllocaCmd}, ToCleanUp);
307 ToEnqueue.push_back(ConnCmd);
309 updateLeaves(Deps, Record, Req->
MAccessMode, ToCleanUp);
310 addNodeToLeaves(Record, UpdateCommand, Req->
MAccessMode, ToEnqueue);
313 return UpdateCommand;
322 "Expected linked alloca commands");
324 "Expected source alloca command to be active");
345 Command *Scheduler::GraphBuilder::insertMemoryMove(
347 std::vector<Command *> &ToEnqueue) {
349 AllocaCommandBase *AllocaCmdDst =
350 getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue);
353 "Out of host memory");
356 std::set<Command *> Deps = findDepsForReq(Record, Req, Context);
357 Deps.insert(AllocaCmdDst);
360 if (AllocaCmdDst->getType() == Command::CommandType::ALLOCA_SUB_BUF)
362 static_cast<AllocaSubBufCommand *
>(AllocaCmdDst)->getParentAlloca();
365 AllocaCommandBase *AllocaCmdSrc =
366 findAllocaForReq(Record, Req, Record->MCurContext);
371 auto IsSuitableAlloca = [Record](AllocaCommandBase *AllocaCmd) {
374 AllocaCmd->
getType() == Command::CommandType::ALLOCA;
378 std::find_if(Record->MAllocaCommands.begin(),
379 Record->MAllocaCommands.end(), IsSuitableAlloca);
380 AllocaCmdSrc = (Record->MAllocaCommands.end() != It) ? *It :
nullptr;
384 "Cannot find buffer allocation");
387 if (AllocaCmdSrc->getType() == Command::CommandType::ALLOCA_SUB_BUF)
389 static_cast<AllocaSubBufCommand *
>(AllocaCmdSrc)->getParentAlloca();
390 else if (AllocaCmdSrc->getSYCLMemObj() != Req->MSYCLMemObj)
391 assert(
false &&
"Inappropriate alloca command.");
396 if (AllocaCmdSrc->MLinkedAllocaCmd == AllocaCmdDst) {
402 Record->MHostAccess = MapMode;
407 Record->MCurContext = Context;
413 new MemCpyCommand(*AllocaCmdSrc->getRequirement(), AllocaCmdSrc,
414 *AllocaCmdDst->getRequirement(), AllocaCmdDst,
415 AllocaCmdSrc->getQueue(), AllocaCmdDst->getQueue());
418 std::vector<Command *> ToCleanUp;
421 DepDesc{Dep, NewCmd->getRequirement(), AllocaCmdDst}, ToCleanUp);
423 ToEnqueue.push_back(ConnCmd);
429 Record->MCurContext = Context;
433 Command *Scheduler::GraphBuilder::remapMemoryObject(
434 MemObjRecord *Record,
Requirement *Req, AllocaCommandBase *HostAllocaCmd,
435 std::vector<Command *> &ToEnqueue) {
436 assert(!HostAllocaCmd->getQueue() &&
"Host alloca command expected");
437 assert(HostAllocaCmd->MIsActive &&
"Active alloca command expected");
439 AllocaCommandBase *LinkedAllocaCmd = HostAllocaCmd->MLinkedAllocaCmd;
440 assert(LinkedAllocaCmd &&
"Linked alloca command expected");
442 std::set<Command *> Deps = findDepsForReq(Record, Req, Record->MCurContext);
444 UnMapMemObject *UnMapCmd =
new UnMapMemObject(
445 LinkedAllocaCmd, *LinkedAllocaCmd->getRequirement(),
446 &HostAllocaCmd->MMemAllocation, LinkedAllocaCmd->getQueue());
452 MapMemObject *MapCmd =
new MapMemObject(
453 LinkedAllocaCmd, *LinkedAllocaCmd->getRequirement(),
454 &HostAllocaCmd->MMemAllocation, LinkedAllocaCmd->getQueue(), MapMode);
456 std::vector<Command *> ToCleanUp;
458 Command *ConnCmd = UnMapCmd->addDep(
459 DepDesc{Dep, UnMapCmd->getRequirement(), LinkedAllocaCmd}, ToCleanUp);
461 ToEnqueue.push_back(ConnCmd);
464 Command *ConnCmd = MapCmd->addDep(
465 DepDesc{UnMapCmd, MapCmd->getRequirement(), HostAllocaCmd}, ToCleanUp);
467 ToEnqueue.push_back(ConnCmd);
473 Record->MHostAccess = MapMode;
481 std::vector<Command *> &ToEnqueue) {
484 if (Record && MPrintOptionsArray[BeforeAddCopyBack])
485 printGraphAsDot(
"before_addCopyBack");
491 std::set<Command *> Deps = findDepsForReq(Record, Req,
nullptr);
493 findAllocaForReq(Record, Req, Record->
MCurContext);
495 auto MemCpyCmdUniquePtr = std::make_unique<MemCpyCommandHost>(
499 if (!MemCpyCmdUniquePtr)
501 "Out of host memory");
505 std::vector<Command *> ToCleanUp;
510 ToEnqueue.push_back(ConnCmd);
513 updateLeaves(Deps, Record, Req->
MAccessMode, ToCleanUp);
514 addNodeToLeaves(Record, MemCpyCmd, Req->
MAccessMode, ToEnqueue);
517 if (MPrintOptionsArray[AfterAddCopyBack])
518 printGraphAsDot(
"after_addCopyBack");
526 std::vector<Command *> &ToEnqueue) {
534 MemObjRecord *Record = getOrInsertMemObjRecord(
nullptr, Req);
535 if (MPrintOptionsArray[BeforeAddHostAcc])
536 printGraphAsDot(
"before_addHostAccessor");
537 markModifiedIfWrite(Record, Req);
540 getOrCreateAllocaForReq(Record, Req,
nullptr, ToEnqueue);
544 remapMemoryObject(Record, Req,
552 insertMemoryMove(Record, Req,
nullptr, ToEnqueue);
555 insertUpdateHostReqCmd(Record, Req,
nullptr, ToEnqueue);
563 if (MPrintOptionsArray[AfterAddHostAcc])
564 printGraphAsDot(
"after_addHostAccessor");
566 return UpdateHostAccCmd;
570 std::unique_ptr<detail::CG> CommandGroup,
571 std::vector<Command *> &ToEnqueue) {
576 MemObjRecord *Record = getOrInsertMemObjRecord(
nullptr, Req);
577 return insertMemoryMove(Record, Req,
nullptr, ToEnqueue);
589 Scheduler::GraphBuilder::findDepsForReq(
MemObjRecord *Record,
592 std::set<Command *> RetDeps;
593 std::vector<Command *> Visited;
601 ToAnalyze.insert(ToAnalyze.begin(), V.begin(), V.end());
604 while (!ToAnalyze.empty()) {
605 Command *DepCmd = ToAnalyze.back();
606 ToAnalyze.pop_back();
608 std::vector<Command *> NewAnalyze;
610 for (
const DepDesc &Dep : DepCmd->MDeps) {
611 if (Dep.MDepRequirement->MSYCLMemObj != Req->
MSYCLMemObj)
614 bool CanBypassDep =
false;
620 CanBypassDep |= !
doOverlap(Dep.MDepRequirement, Req);
623 if (Dep.MDepCommand) {
624 auto DepQueue = Dep.MDepCommand->getQueue();
629 RetDeps.insert(DepCmd);
637 NewAnalyze.push_back(Dep.MDepCommand);
639 ToAnalyze.insert(ToAnalyze.end(), NewAnalyze.begin(), NewAnalyze.end());
654 assert(
false &&
"No dependency found for a leaf of the record");
655 return {
nullptr,
nullptr,
nullptr};
663 auto IsSuitableAlloca = [&Context, Req,
668 Res &= AllocaCmd->
getType() == Command::CommandType::ALLOCA_SUB_BUF;
671 Res &= AllowConst || !AllocaCmd->
MIsConst;
682 if (std::strcmp(HUMConfig,
"0") == 0)
683 return Ctx ==
nullptr;
684 if (std::strcmp(HUMConfig,
"1") == 0)
692 for (
const device &Device : Ctx->getDevices()) {
693 if (!Device.get_info<info::device::host_unified_memory>())
703 AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
705 std::vector<Command *> &ToEnqueue) {
707 AllocaCommandBase *AllocaCmd =
708 findAllocaForReq(Record, Req, Context,
false);
711 std::vector<Command *> ToCleanUp;
715 range<3> ParentRange{Req->MSYCLMemObj->getSizeInBytes(), 1, 1};
717 {0, 0, 0}, ParentRange, ParentRange,
719 sizeof(char),
size_t(0));
722 getOrCreateAllocaForReq(Record, &ParentRequirement, Queue, ToEnqueue);
723 AllocaCmd =
new AllocaSubBufCommand(Queue, *Req, ParentAlloca, ToEnqueue,
727 const Requirement FullReq( {0, 0, 0}, Req->MMemoryRange,
729 Req->MSYCLMemObj, Req->MDims, Req->MElemSize,
739 SYCLMemObjI *MemObj = Req->MSYCLMemObj;
740 const bool InitFromUserData = Record->MAllocaCommands.empty() &&
741 (HostUnifiedMemory || MemObj->isInterop());
742 AllocaCommandBase *LinkedAllocaCmd =
nullptr;
747 if (Record->MAllocaCommands.empty()) {
748 if (!HostUnifiedMemory &&
753 if (MemObj->hasUserDataPtr()) {
754 AllocaCommand *HostAllocaCmd =
new AllocaCommand(
755 nullptr, FullReq,
true ,
757 MemObj->isHostPointerReadOnly() );
758 Record->MAllocaCommands.push_back(HostAllocaCmd);
759 Record->MWriteLeaves.push_back(HostAllocaCmd, ToEnqueue);
760 ++(HostAllocaCmd->MLeafCounter);
761 Record->MCurContext =
nullptr;
768 if (Req->MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer)
773 if ((Context ==
nullptr) != (Record->MCurContext ==
nullptr)) {
782 bool PinnedHostMemory = MemObj->usesPinnedHostMemory();
784 bool HostUnifiedMemoryOnNonHostDevice =
787 if (PinnedHostMemory || HostUnifiedMemoryOnNonHostDevice) {
788 AllocaCommandBase *LinkedAllocaCmdCand = findAllocaForReq(
789 Record, Req, Record->MCurContext,
false);
792 if (LinkedAllocaCmdCand &&
793 !LinkedAllocaCmdCand->MLinkedAllocaCmd) {
794 LinkedAllocaCmd = LinkedAllocaCmdCand;
801 new AllocaCommand(Queue, FullReq, InitFromUserData, LinkedAllocaCmd);
804 if (LinkedAllocaCmd) {
805 Command *ConnCmd = AllocaCmd->addDep(
806 DepDesc{LinkedAllocaCmd, AllocaCmd->getRequirement(),
810 ToEnqueue.push_back(ConnCmd);
811 LinkedAllocaCmd->MLinkedAllocaCmd = AllocaCmd;
814 ConnCmd = AllocaCmd->getReleaseCmd()->addDep(
815 DepDesc(LinkedAllocaCmd->getReleaseCmd(),
816 AllocaCmd->getRequirement(), LinkedAllocaCmd),
819 ToEnqueue.push_back(ConnCmd);
825 if (Queue ==
nullptr) {
826 AllocaCmd->MIsActive =
false;
828 LinkedAllocaCmd->MIsActive =
false;
829 Record->MCurContext = Context;
831 std::set<Command *> Deps = findDepsForReq(Record, Req, Context);
833 Command *ConnCmd = AllocaCmd->addDep(
834 DepDesc{Dep, Req, LinkedAllocaCmd}, ToCleanUp);
836 ToEnqueue.push_back(ConnCmd);
838 updateLeaves(Deps, Record, Req->MAccessMode, ToCleanUp);
839 addNodeToLeaves(Record, AllocaCmd, Req->MAccessMode, ToEnqueue);
844 Record->MAllocaCommands.push_back(AllocaCmd);
845 Record->MWriteLeaves.push_back(AllocaCmd, ToEnqueue);
846 ++(AllocaCmd->MLeafCounter);
854 void Scheduler::GraphBuilder::markModifiedIfWrite(MemObjRecord *Record,
856 switch (Req->MAccessMode) {
862 Record->MMemModified =
true;
869 EmptyCommand *Scheduler::GraphBuilder::addEmptyCmd(
870 Command *Cmd,
const std::vector<Requirement *> &Reqs,
872 EmptyCommand *EmptyCmd =
new EmptyCommand();
876 "Out of host memory");
878 EmptyCmd->MIsBlockable =
true;
880 EmptyCmd->MBlockReason = Reason;
883 MemObjRecord *Record = getOrInsertMemObjRecord(
nullptr, Req);
884 AllocaCommandBase *AllocaCmd =
885 getOrCreateAllocaForReq(Record, Req,
nullptr, ToEnqueue);
886 EmptyCmd->addRequirement(Cmd, AllocaCmd, Req);
891 Cmd->addUser(EmptyCmd);
893 const std::vector<DepDesc> &Deps = Cmd->MDeps;
894 std::vector<Command *> ToCleanUp;
895 for (
const DepDesc &Dep : Deps) {
899 updateLeaves({Cmd}, Record, Req->MAccessMode, ToCleanUp);
900 addNodeToLeaves(Record, EmptyCmd, Req->MAccessMode, ToEnqueue);
919 std::unordered_map<SYCLMemObjI *, access::mode> CombinedModes;
920 bool HasDuplicateMemObjects =
false;
922 auto Result = CombinedModes.insert(
923 std::make_pair(Req->MSYCLMemObj, Req->MAccessMode));
924 if (!Result.second) {
925 Result.first->second =
927 HasDuplicateMemObjects =
true;
931 if (!HasDuplicateMemObjects)
934 Req->MAccessMode = CombinedModes[Req->MSYCLMemObj];
939 std::unique_ptr<detail::CG> CommandGroup,
const QueueImplPtr &Queue,
940 std::vector<Command *> &ToEnqueue,
bool EventNeeded,
942 const std::vector<sycl::detail::pi::PiExtSyncPoint> &Dependencies) {
943 std::vector<Requirement *> &Reqs = CommandGroup->getRequirements();
944 std::vector<detail::EventImplPtr> &Events = CommandGroup->getEvents();
946 auto NewCmd = std::make_unique<ExecCGCommand>(std::move(CommandGroup), Queue,
947 EventNeeded, CommandBuffer,
948 std::move(Dependencies));
952 "Out of host memory");
958 auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
960 if (NewCmd->isFusable()) {
961 auto *FusionCmd = findFusionList(QUniqueID)->second.get();
963 bool dependsOnFusion =
false;
964 for (
auto Ev = Events.begin(); Ev != Events.end();) {
965 auto *EvDepCmd =
static_cast<Command *
>((*Ev)->getCommand());
972 if (EvDepCmd->getQueue() != Queue && isPartOfActiveFusion(EvDepCmd)) {
974 "Aborting fusion because of event dependency from a "
980 if (EvDepCmd == FusionCmd) {
981 Ev = Events.erase(Ev);
982 dependsOnFusion =
true;
992 if (dependsOnFusion) {
993 for (
auto *Cmd : FusionCmd->getFusionList()) {
994 Events.push_back(Cmd->getEvent());
1001 createGraphForCommand(NewCmd.get(), NewCmd->getCG(),
1003 Queue, FusionCmd->auxiliaryCommands());
1007 FusionCmd->addToFusionList(NewCmd.get());
1008 NewCmd->MFusionCmd = FusionCmd;
1009 std::vector<Command *> ToCleanUp;
1012 auto ConnectionCmd = FusionCmd->addDep(NewCmd->getEvent(), ToCleanUp);
1013 if (ConnectionCmd) {
1014 FusionCmd->auxiliaryCommands().push_back(ConnectionCmd);
1016 return {NewCmd.release(), FusionCmd->getEvent(),
false};
1019 std::stringstream ss(s);
1021 ss <<
"Not fusing kernel with 'use_root_sync' property. Can only fuse "
1022 "non-cooperative device kernels.";
1024 ss <<
"Not fusing '" << NewCmd->getTypeString()
1025 <<
"' command group. Can only fuse device kernel command groups.";
1027 printFusionWarning(ss.str());
1030 createGraphForCommand(NewCmd.get(), NewCmd->getCG(),
1033 auto Event = NewCmd->getEvent();
1034 return {NewCmd.release(), Event,
true};
1037 void Scheduler::GraphBuilder::createGraphForCommand(
1039 std::vector<Requirement *> &Reqs,
1040 const std::vector<detail::EventImplPtr> &Events,
QueueImplPtr Queue,
1041 std::vector<Command *> &ToEnqueue) {
1043 if (MPrintOptionsArray[BeforeAddCG])
1044 printGraphAsDot(
"before_addCG");
1050 std::vector<Command *> ToCleanUp;
1055 bool isSameCtx =
false;
1061 Record = getOrInsertMemObjRecord(QueueForAlloca, Req);
1062 markModifiedIfWrite(Record, Req);
1065 getOrCreateAllocaForReq(Record, Req, QueueForAlloca, ToEnqueue);
1077 remapMemoryObject(Record, Req,
1087 bool NeedMemMoveToHost =
false;
1088 auto MemMoveTargetQueue = Queue;
1090 if (isInteropTask) {
1091 const detail::CGHostTask &HT =
static_cast<detail::CGHostTask &
>(CG);
1094 NeedMemMoveToHost =
true;
1095 MemMoveTargetQueue = HT.MQueue;
1098 NeedMemMoveToHost =
true;
1100 if (NeedMemMoveToHost)
1101 insertMemoryMove(Record, Req,
nullptr, ToEnqueue);
1102 insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue);
1105 std::set<Command *> Deps =
1109 if (Dep != NewCmd) {
1111 NewCmd->
addDep(DepDesc{Dep, Req, AllocaCmd}, ToCleanUp);
1113 ToEnqueue.push_back(ConnCmd);
1122 std::vector<DepDesc> Deps = NewCmd->
MDeps;
1123 for (DepDesc &Dep : Deps) {
1126 updateLeaves({Dep.MDepCommand}, Record, Req->MAccessMode, ToCleanUp);
1127 addNodeToLeaves(Record, NewCmd, Req->MAccessMode, ToEnqueue);
1132 if (e->getCommand() && e->getCommand() == NewCmd) {
1136 ToEnqueue.push_back(ConnCmd);
1139 if (MPrintOptionsArray[AfterAddCG])
1140 printGraphAsDot(
"after_addCG");
1142 for (
Command *Cmd : ToCleanUp) {
1143 cleanupCommand(Cmd);
1152 cleanupCommand(Cmd);
1157 cleanupCommand(Cmd);
1162 std::vector<AllocaCommandBase *> &AllocaCommands = Record->
MAllocaCommands;
1163 if (AllocaCommands.empty())
1166 assert(MCmdsToVisit.empty());
1167 MVisitedCmds.clear();
1171 for (
Command *AllocaCmd : AllocaCommands) {
1177 if (UserCmd->
getType() != Command::CommandType::ALLOCA)
1178 MCmdsToVisit.push(UserCmd);
1185 AllocaCmd->
MUsers.clear();
1203 while (!MCmdsToVisit.empty()) {
1204 Command *Cmd = MCmdsToVisit.front();
1211 if (UserCmd->
getType() != Command::CommandType::ALLOCA)
1212 MCmdsToVisit.push(UserCmd);
1216 std::map<Command *, bool> ShouldBeUpdated;
1217 auto NewEnd = std::remove_if(
1219 if (std::find(AllocaCommands.begin(), AllocaCommands.end(),
1220 Dep.MAllocaCmd) != AllocaCommands.end()) {
1221 ShouldBeUpdated.insert({Dep.MDepCommand, true});
1227 Cmd->
MDeps.erase(NewEnd, Cmd->MDeps.end());
1230 for (
auto DepCmdIt : ShouldBeUpdated) {
1231 if (!DepCmdIt.second)
1233 DepCmdIt.first->MUsers.erase(Cmd);
1238 if (Cmd->MDeps.empty()) {
1239 Cmd->MUsers.clear();
1242 if (!Cmd->MMarkedForCleanup)
1243 Cmd->MMarks.MToBeDeleted =
true;
1251 Command *Cmd, [[maybe_unused]]
bool AllowUnsubmitted) {
1253 static bool DeprWarningPrinted =
false;
1254 if (!DeprWarningPrinted) {
1255 std::cerr <<
"WARNING: The enviroment variable "
1256 "SYCL_DISABLE_POST_ENQUEUE_CLEANUP is deprecated. Please "
1257 "use SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP instead.\n";
1258 DeprWarningPrinted =
true;
1290 DepCmd->
MUsers.erase(Cmd);
1301 Cmd->
getEvent()->setCommand(
nullptr);
1306 const auto It = std::find_if(
1307 MMemObjs.begin(), MMemObjs.end(),
1308 [MemObject](
const SYCLMemObjI *Obj) { return Obj == MemObject; });
1309 if (It != MMemObjs.end())
1329 std::vector<Command *> &ToCleanUp) {
1338 std::move(HT), Cmd->
getQueue(), {},
1346 ConnectCmd =
new ExecCGCommand(std::move(ConnectCG),
nullptr,
1348 }
catch (
const std::bad_alloc &) {
1350 "Out of host memory");
1357 (void)ConnectCmd->
addDep(Dep, ToCleanUp);
1358 assert(
reinterpret_cast<Command *
>(DepEvent->getCommand()) ==
1367 std::ignore = Cmd->
addDep(DepOnConnect, ToCleanUp);
1372 if (
Command *DepCmd =
reinterpret_cast<Command *
>(DepEvent->getCommand()))
1375 std::ignore = ConnectCmd->
addDep(DepEvent, ToCleanUp);
1387 auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
1388 MFusionMap.emplace(QUniqueID, std::make_unique<KernelFusionCommand>(Queue));
1392 sycl::detail::queue_impl *Queue) {
1393 auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue);
1396 "Queue already in fusion mode"};
1398 auto OldFusionCmd = findFusionList(QUniqueID);
1399 if (OldFusionCmd != MFusionMap.end()) {
1404 OldFusionCmd->second->setFusionStatus(
1406 cleanupCommand(OldFusionCmd->second.release());
1407 MFusionMap.erase(OldFusionCmd);
1411 void Scheduler::GraphBuilder::removeNodeFromGraph(
1412 Command *Node, std::vector<Command *> &ToEnqueue) {
1415 for (
auto &Dep : Node->
MDeps) {
1416 auto AccessMode = Dep.MDepRequirement->MAccessMode;
1419 Node->
MLeafCounter -= Record->MReadLeaves.remove(Node);
1420 Node->
MLeafCounter -= Record->MWriteLeaves.remove(Node);
1424 for (
auto PrevDep : Dep.MDepCommand->MDeps) {
1425 auto *DepReq = PrevDep.MDepRequirement;
1427 if (DepRecord == Record) {
1430 assert(Dep.MDepCommand);
1431 addNodeToLeaves(Record, Dep.MDepCommand, DepReq->MAccessMode,
1436 Dep.MDepCommand->MUsers.erase(Node);
1446 std::vector<Command *> &ToEnqueue) {
1447 auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
1453 auto *PlaceholderCmd = (*FusionList).second.get();
1456 auto FusedCmdList = PlaceholderCmd->getFusionList();
1457 ToEnqueue.insert(ToEnqueue.end(), FusedCmdList.begin(), FusedCmdList.end());
1461 ToEnqueue.insert(ToEnqueue.end(), PlaceholderCmd->auxiliaryCommands().begin(),
1462 PlaceholderCmd->auxiliaryCommands().end());
1464 ToEnqueue.push_back(PlaceholderCmd);
1466 if (MPrintOptionsArray[AfterFusionCancel]) {
1467 printGraphAsDot(
"after_fusionCancel");
1476 return static_cast<ExecCGCommand *
>(Cmd)->MFusionCmd == Fusion;
1493 return !PredPartOfFusion;
1501 for (
auto &Dep : Cmd->
MDeps) {
1502 auto *DepCmd = Dep.MDepCommand;
1511 auto *EvDepCmd =
static_cast<Command *
>(Ev->getCommand());
1520 auto *EvDepCmd =
static_cast<Command *
>(Ev->getCommand());
1533 std::vector<Command *> &ToEnqueue,
1535 auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
1536 #if SYCL_EXT_CODEPLAY_KERNEL_FUSION
1538 auto InactiveFusionList = findFusionList(QUniqueID);
1539 if (InactiveFusionList == MFusionMap.end()) {
1542 "Calling complete_fusion on a queue not in fusion mode"};
1544 return InactiveFusionList->second->getEvent();
1548 auto *PlaceholderCmd =
FusionList->second.get();
1549 auto &CmdList = PlaceholderCmd->getFusionList();
1563 bool CreatesCircularDep =
1564 MFusionMap.size() > 1 &&
1566 return checkForCircularDependency(Cmd, true, PlaceholderCmd);
1568 if (CreatesCircularDep) {
1571 "Aborting fusion because it would create a circular dependency");
1572 auto LastEvent = PlaceholderCmd->getEvent();
1579 Queue, CmdList, PropList);
1585 auto LastEvent = PlaceholderCmd->getEvent();
1591 std::vector<EventImplPtr> FusedEventDeps;
1592 for (
auto *Cmd : CmdList) {
1593 FusedEventDeps.insert(FusedEventDeps.end(),
1596 FusedEventDeps.insert(FusedEventDeps.end(),
1603 FusedEventDeps.erase(
1604 std::remove_if(FusedEventDeps.begin(), FusedEventDeps.end(),
1606 if (E->getCommand() == PlaceholderCmd) {
1609 if (E->getCommand() &&
1613 static_cast<ExecCGCommand *>(E->getCommand());
1614 if (RunCGCmd->MFusionCmd == PlaceholderCmd) {
1620 FusedEventDeps.end());
1622 auto FusedKernelCmd = std::make_unique<ExecCGCommand>(
1623 std::move(FusedCG), Queue,
true);
1627 PlaceholderCmd->getEvent());
1628 assert(PlaceholderCmd->MDeps.empty());
1633 for (
auto OldCmd = CmdList.rbegin(); OldCmd != CmdList.rend(); ++OldCmd) {
1634 removeNodeFromGraph(*OldCmd, ToEnqueue);
1635 cleanupCommand(*OldCmd,
true);
1638 createGraphForCommand(FusedKernelCmd.get(), FusedKernelCmd->getCG(),
false,
1639 FusedKernelCmd->getCG().getRequirements(),
1640 FusedEventDeps, Queue, ToEnqueue);
1642 ToEnqueue.push_back(FusedKernelCmd.get());
1644 std::vector<Command *> ToCleanUp;
1646 auto *ConnectToPlaceholder =
1647 PlaceholderCmd->addDep(FusedKernelCmd->getEvent(), ToCleanUp);
1648 if (ConnectToPlaceholder) {
1649 ToEnqueue.push_back(ConnectToPlaceholder);
1651 for (Command *Cmd : ToCleanUp) {
1652 cleanupCommand(Cmd);
1654 ToEnqueue.push_back(PlaceholderCmd);
1656 if (MPrintOptionsArray[AfterFusionComplete]) {
1657 printGraphAsDot(
"after_fusionComplete");
1663 return FusedKernelCmd.release()->getEvent();
1665 printFusionWarning(
"Kernel fusion not supported by this build");
1668 auto *PlaceholderCmd =
FusionList->second.get();
1669 auto LastEvent = PlaceholderCmd->getEvent();
1670 this->cancelFusion(Queue, ToEnqueue);
1675 bool Scheduler::GraphBuilder::isInFusionMode(
QueueIdT Id) {
1683 Command *Scheduler::GraphBuilder::addCommandGraphUpdate(
1685 std::vector<std::shared_ptr<ext::oneapi::experimental::detail::node_impl>>
1687 const QueueImplPtr &Queue, std::vector<Requirement *> Requirements,
1688 std::vector<detail::EventImplPtr> &Events,
1689 std::vector<Command *> &ToEnqueue) {
1691 std::make_unique<UpdateCommandBufferCommand>(Queue, Graph, Nodes);
1696 std::vector<Command *> ToCleanUp;
1701 bool isSameCtx =
false;
1705 Record = getOrInsertMemObjRecord(Queue, Req);
1706 markModifiedIfWrite(Record, Req);
1708 AllocaCmd = getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue);
1716 bool NeedMemMoveToHost =
false;
1717 auto MemMoveTargetQueue = Queue;
1720 NeedMemMoveToHost =
true;
1722 if (NeedMemMoveToHost)
1723 insertMemoryMove(Record, Req,
nullptr, ToEnqueue);
1724 insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue);
1726 std::set<Command *> Deps =
1730 if (Dep != NewCmd.get()) {
1734 ToEnqueue.push_back(ConnCmd);
1743 std::vector<DepDesc> Deps = NewCmd->
MDeps;
1747 updateLeaves({Dep.MDepCommand}, Record, Req->
MAccessMode, ToCleanUp);
1748 addNodeToLeaves(Record, NewCmd.get(), Req->
MAccessMode, ToEnqueue);
1753 if (e->getCommand() &&
1754 e->getCommand() ==
static_cast<Command *
>(NewCmd.get())) {
1758 ToEnqueue.push_back(ConnCmd);
1761 if (MPrintOptionsArray[AfterAddCG])
1762 printGraphAsDot(
"after_addCG");
1764 for (
Command *Cmd : ToCleanUp) {
1765 cleanupCommand(Cmd);
1768 return NewCmd.release();
detail::SYCLMemObjI * MSYCLMemObj
range< 3 > & MAccessRange
Base class for memory allocation commands.
const Requirement * getRequirement() const final
AllocaCommandBase * MLinkedAllocaCmd
Alloca command linked with current command.
bool MIsActive
Indicates that current alloca is active one.
The AllocaSubBuf command enqueues creation of a sub-buffer of a memory object.
std::shared_ptr< HostTask > MHostTask
"Update host" command group class.
Base class for all types of command groups.
The Command class represents some action that needs to be performed on one or more memory objects.
bool isSuccessfullyEnqueued() const
const std::vector< EventImplPtr > & getPreparedDepsEvents() const
const std::vector< EventImplPtr > & getPreparedHostDepsEvents() const
unsigned MLeafCounter
Counts the number of memory objects this command is a leaf for.
std::unordered_set< Command * > MUsers
Contains list of commands that depend on the command.
Marks MMarks
Used for marking the node during graph traversal.
std::vector< DepDesc > MDeps
Contains list of dependencies(edges)
virtual bool readyForCleanup() const
Returns true iff this command is ready to be submitted for cleanup.
void addUser(Command *NewUser)
virtual ContextImplPtr getWorkerContext() const
Get the context of the queue this command will be submitted to.
void clearAllDependencies()
Clear all dependency events. This should only be used if a command is about to be deleted without being...
const EventImplPtr & getEvent() const
CommandType getType() const
virtual void printDot(std::ostream &Stream) const =0
Command * addDep(DepDesc NewDep, std::vector< Command * > &ToCleanUp)
const QueueImplPtr & getQueue() const
The empty command does nothing during enqueue.
The exec CG command enqueues execution of kernel or explicit memory operation.
detail::CG & getCG() const
The KernelFusionCommand is placed in the execution graph together with the individual kernels of the ...
bool readyForDeletion() const
A wrapper for the CircularBuffer class along with a collection for host accessors' EmptyCommands.
size_t remove(value_type Cmd)
Replacement for std::remove with subsequent call to erase(newEnd, end()).
std::function< void(Command *, Command *, MemObjRecord *, EnqueueListT &)> AllocateDependencyF
std::vector< value_type > toVector() const
std::vector< Command * > EnqueueListT
The map command enqueues mapping of device memory onto host memory.
The mem copy host command enqueues memory copy between two instances of memory object.
const Requirement * getRequirement() const final
virtual ContextImplPtr getInteropContext() const =0
std::shared_ptr< MemObjRecord > MRecord
void handleWriteAccessorCreation()
void startFusion(QueueImplPtr Queue)
void cleanupCommand(Command *Cmd, bool AllowUnsubmitted=false)
MemObjRecord * getOrInsertMemObjRecord(const QueueImplPtr &Queue, const Requirement *Req)
Command * addCGUpdateHost(std::unique_ptr< detail::CG > CommandGroup, std::vector< Command * > &ToEnqueue)
Registers a command group that updates host memory to the latest state.
void decrementLeafCountersForRecord(MemObjRecord *Record)
Decrements leaf counters for all leaves of the record.
MemObjRecord * getMemObjRecord(SYCLMemObjI *MemObject)
EventImplPtr completeFusion(QueueImplPtr Queue, std::vector< Command * > &ToEnqueue, const property_list &)
Command * addHostAccessor(Requirement *Req, std::vector< Command * > &ToEnqueue)
Enqueues a command to create a host accessor.
void cleanupCommandsForRecord(MemObjRecord *Record)
Removes commands that use the given MemObjRecord from the graph.
void removeRecordForMemObj(SYCLMemObjI *MemObject)
Removes the MemObjRecord for the memory object passed.
Command * addCopyBack(Requirement *Req, std::vector< Command * > &ToEnqueue)
Enqueues a command to update memory to the latest state.
Command * connectDepEvent(Command *const Cmd, const EventImplPtr &DepEvent, const DepDesc &Dep, std::vector< Command * > &ToCleanUp)
Perform connection of events in multiple contexts.
GraphBuildResult addCG(std::unique_ptr< detail::CG > CommandGroup, const QueueImplPtr &Queue, std::vector< Command * > &ToEnqueue, bool EventNeeded, sycl::detail::pi::PiExtCommandBuffer CommandBuffer=nullptr, const std::vector< sycl::detail::pi::PiExtSyncPoint > &Dependencies={})
Registers command group and adds it to the dependency graph.
void cancelFusion(QueueImplPtr Queue, std::vector< Command * > &ToEnqueue)
void addNodeToLeaves(MemObjRecord *Record, Command *Cmd, access::mode AccessMode, std::vector< Command * > &ToEnqueue)
Adds new command to leaves if needed.
void cleanUpCmdFusion(sycl::detail::queue_impl *Queue)
Clean up the internal fusion commands held for the given queue.
AllocaCommandBase * findAllocaForReq(MemObjRecord *Record, const Requirement *Req, const ContextImplPtr &Context, bool AllowConst=true)
Searches for suitable alloca in memory record.
DepDesc findDepForRecord(Command *Cmd, MemObjRecord *Record)
Finds a command dependency corresponding to the record.
void updateLeaves(const std::set< Command * > &Cmds, MemObjRecord *Record, access::mode AccessMode, std::vector< Command * > &ToCleanUp)
Removes commands from leaves.
bool isInFusionMode(QueueIdT Queue)
void cancelFusion(QueueImplPtr Queue)
static MemObjRecord * getMemObjRecord(const Requirement *const Req)
static Scheduler & getInstance()
void cleanUpCmdFusion(sycl::detail::queue_impl *Queue)
void takeAuxiliaryResources(const EventImplPtr &Dst, const EventImplPtr &Src)
Assign Src's auxiliary resources to Dst.
The unmap command removes mapping of host memory onto device memory.
const Requirement * getRequirement() const final
std::unique_ptr< detail::CG > fuseKernels(QueueImplPtr Queue, std::vector< ExecCGCommand * > &InputKernels, const property_list &)
static jit_compiler & get_instance()
static ContextImplPtr getContext(const QueueImplPtr &Queue)
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Class representing the implementation of command_graph<executable>.
Objects of the property_list class are containers for the SYCL properties.
__SYCL_EXTERN_STREAM_ATTRS ostream cerr
Linked to standard error (unbuffered)
static bool doOverlap(const Requirement *LHS, const Requirement *RHS)
Checks whether two requirements overlap or not.
decltype(Obj::impl) const & getSyclObjImpl(const Obj &SyclObject)
static bool createsCircularDependency(Command *Cmd, bool PredPartOfFusion, KernelFusionCommand *Fusion)
static bool isInteropHostTask(ExecCGCommand *Cmd)
static void unmarkVisitedNodes(std::vector< Command * > &Visited)
static bool checkForCircularDependency(Command *, bool, KernelFusionCommand *)
std::hash< std::shared_ptr< detail::queue_impl > >::result_type QueueIdT
static access::mode combineAccessModes(access::mode A, access::mode B)
Combines two access modes into a single one that allows both.
static bool markNodeAsVisited(Command *Cmd, std::vector< Command * > &Visited)
std::shared_ptr< sycl::detail::context_impl > ContextImplPtr
std::unique_ptr< KernelFusionCommand > FusionList
static bool isPartOfFusion(Command *Cmd, KernelFusionCommand *Fusion)
std::shared_ptr< event_impl > EventImplPtr
static void printDotRecursive(std::fstream &Stream, std::vector< Command * > &Visited, Command *Cmd)
AccessorImplHost Requirement
std::shared_ptr< device_impl > DeviceImplPtr
static void handleVisitedNodes(std::vector< Command * > &Visited)
static bool IsSuitableSubReq(const Requirement *Req)
Checks if the current requirement is a requirement for a sub-buffer.
static Command * insertMapUnmapForLinkedCmds(AllocaCommandBase *AllocaCmdSrc, AllocaCommandBase *AllocaCmdDst, access::mode MapMode)
static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx)
std::shared_ptr< sycl::detail::queue_impl > QueueImplPtr
static bool isAccessModeAllowed(access::mode Required, access::mode Current)
Checks if the required access mode is allowed under the current one.
static void combineAccessModesOfReqs(std::vector< Requirement * > &Reqs)
static ContextImplPtr getContext(const QueueImplPtr &Queue)
static bool isOnSameContext(const ContextImplPtr Context, const QueueImplPtr &Queue)
constexpr if(sizeof(T)==8)
class __SYCL_EBO __SYCL_SPECIAL_CLASS AccessMode
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using E and sycl_category()
bool any_of(const simd_mask< _Tp, _Abi > &) noexcept
bool MVisited
Used for marking the node as visited during graph traversal.
bool MToBeDeleted
Used for marking the node for deletion during cleanup.
Dependency between two commands.
const Requirement * MDepRequirement
Requirement for the dependency.
Command * MDepCommand
The actual dependency command.
AllocaCommandBase * MAllocaCmd
Allocation command for the memory object we have requirement for.
LeavesCollection MWriteLeaves
ContextImplPtr MCurContext
std::vector< AllocaCommandBase * > MAllocaCommands
LeavesCollection MReadLeaves