13 #include <sycl/feature_test.hpp>
14 #if SYCL_EXT_CODEPLAY_KERNEL_FUSION
35 inline namespace _V1 {
54 return LHS == RHS || (LHS->is_host() && RHS->is_host());
66 return (Required == Current);
68 assert(
false &&
"Write only access is expected to be mapped as read_write");
99 bool EnableAlways = GraphPrintOpts.find(
"always") != std::string::npos;
101 if (GraphPrintOpts.find(
"before_addCG") != std::string::npos ||
103 MPrintOptionsArray[BeforeAddCG] =
true;
104 if (GraphPrintOpts.find(
"after_addCG") != std::string::npos || EnableAlways)
105 MPrintOptionsArray[AfterAddCG] =
true;
106 if (GraphPrintOpts.find(
"before_addCopyBack") != std::string::npos ||
108 MPrintOptionsArray[BeforeAddCopyBack] =
true;
109 if (GraphPrintOpts.find(
"after_addCopyBack") != std::string::npos ||
111 MPrintOptionsArray[AfterAddCopyBack] =
true;
112 if (GraphPrintOpts.find(
"before_addHostAcc") != std::string::npos ||
114 MPrintOptionsArray[BeforeAddHostAcc] =
true;
115 if (GraphPrintOpts.find(
"after_addHostAcc") != std::string::npos ||
117 MPrintOptionsArray[AfterAddHostAcc] =
true;
118 if (GraphPrintOpts.find(
"after_fusionComplete") != std::string::npos ||
120 MPrintOptionsArray[AfterFusionComplete] =
true;
121 if (GraphPrintOpts.find(
"after_fusionCancel") != std::string::npos ||
123 MPrintOptionsArray[AfterFusionCancel] =
true;
128 assert(Cmd &&
"Cmd can't be nullptr");
132 Visited.push_back(Cmd);
138 Cmd->MMarks.MVisited =
false;
143 if (Cmd->MMarks.MToBeDeleted) {
152 Cmd->getEvent()->setCommand(
nullptr);
155 Cmd->MMarks.MVisited =
false;
160 std::vector<Command *> &Visited,
Command *Cmd) {
170 void Scheduler::GraphBuilder::printGraphAsDot(
const char *ModeName) {
171 static size_t Counter = 0;
174 "graph_" + std::to_string(Counter) + ModeNameStr +
".dot";
178 std::fstream Stream(FileName, std::ios::out);
179 Stream <<
"strict digraph {" << std::endl;
181 MVisitedCmds.clear();
183 for (SYCLMemObjI *MemObject : MMemObjs)
184 for (
Command *AllocaCmd : MemObject->MRecord->MAllocaCommands)
187 Stream <<
"}" << std::endl;
193 return MemObject->
MRecord.get();
198 std::vector<Command *> &ToEnqueue) {
202 if (
nullptr != Record)
205 const size_t LeafLimit = 8;
211 DepDesc Dep = findDepForRecord(Dependant, Record);
213 std::vector<Command *> ToCleanUp;
216 ToEnqueue.push_back(ConnectionCmd);
218 --(Dependency->MLeafCounter);
219 if (Dependency->readyForCleanup())
220 ToCleanUp.push_back(Dependency);
231 std::vector<sycl::device> Devices =
232 InteropCtxPtr->get_info<info::context::devices>();
233 assert(Devices.size() != 0);
240 Dev, InteropCtxPtr, {}, {}}};
243 new MemObjRecord{InteropCtxPtr, LeafLimit, AllocateDependency});
244 getOrCreateAllocaForReq(MemObject->
MRecord.get(), Req, InteropQueuePtr,
248 LeafLimit, AllocateDependency});
250 MMemObjs.push_back(MemObject);
251 return MemObject->
MRecord.get();
257 std::vector<Command *> &ToCleanUp) {
264 bool WasLeaf = Cmd->MLeafCounter > 0;
267 if (WasLeaf && Cmd->readyForCleanup()) {
268 ToCleanUp.push_back(Cmd);
275 std::vector<Command *> &ToEnqueue) {
279 if (Leaves.push_back(Cmd, ToEnqueue))
285 std::vector<Command *> &ToEnqueue) {
287 findAllocaForReq(Record, Req, Queue->getContextImplPtr());
288 assert(AllocaCmd &&
"There must be alloca for requirement!");
295 std::set<Command *> Deps =
296 findDepsForReq(Record, Req, Queue->getContextImplPtr());
297 std::vector<Command *> ToCleanUp;
300 UpdateCommand->
addDep(
DepDesc{Dep, StoredReq, AllocaCmd}, ToCleanUp);
302 ToEnqueue.push_back(ConnCmd);
304 updateLeaves(Deps, Record, Req->
MAccessMode, ToCleanUp);
305 addNodeToLeaves(Record, UpdateCommand, Req->
MAccessMode, ToEnqueue);
308 return UpdateCommand;
317 "Expected linked alloca commands");
319 "Expected source alloca command to be active");
321 if (AllocaCmdSrc->
getQueue()->is_host()) {
340 Command *Scheduler::GraphBuilder::insertMemoryMove(
342 std::vector<Command *> &ToEnqueue) {
344 AllocaCommandBase *AllocaCmdDst =
345 getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue);
347 throw runtime_error(
"Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
349 std::set<Command *> Deps =
350 findDepsForReq(Record, Req, Queue->getContextImplPtr());
351 Deps.insert(AllocaCmdDst);
354 if (AllocaCmdDst->getType() == Command::CommandType::ALLOCA_SUB_BUF)
356 static_cast<AllocaSubBufCommand *
>(AllocaCmdDst)->getParentAlloca();
359 AllocaCommandBase *AllocaCmdSrc =
360 findAllocaForReq(Record, Req, Record->MCurContext);
365 auto IsSuitableAlloca = [Record](AllocaCommandBase *AllocaCmd) {
367 Record->MCurContext) &&
369 AllocaCmd->
getType() == Command::CommandType::ALLOCA;
373 std::find_if(Record->MAllocaCommands.begin(),
374 Record->MAllocaCommands.end(), IsSuitableAlloca);
375 AllocaCmdSrc = (Record->MAllocaCommands.end() != It) ? *It :
nullptr;
378 throw runtime_error(
"Cannot find buffer allocation",
379 PI_ERROR_INVALID_VALUE);
382 if (AllocaCmdSrc->getType() == Command::CommandType::ALLOCA_SUB_BUF)
384 static_cast<AllocaSubBufCommand *
>(AllocaCmdSrc)->getParentAlloca();
385 else if (AllocaCmdSrc->getSYCLMemObj() != Req->MSYCLMemObj)
386 assert(
false &&
"Inappropriate alloca command.");
391 if (AllocaCmdSrc->MLinkedAllocaCmd == AllocaCmdDst) {
397 Record->MHostAccess = MapMode;
402 Record->MCurContext = Queue->getContextImplPtr();
408 new MemCpyCommand(*AllocaCmdSrc->getRequirement(), AllocaCmdSrc,
409 *AllocaCmdDst->getRequirement(), AllocaCmdDst,
410 AllocaCmdSrc->getQueue(), AllocaCmdDst->getQueue());
413 std::vector<Command *> ToCleanUp;
416 DepDesc{Dep, NewCmd->getRequirement(), AllocaCmdDst}, ToCleanUp);
418 ToEnqueue.push_back(ConnCmd);
424 Record->MCurContext = Queue->getContextImplPtr();
428 Command *Scheduler::GraphBuilder::remapMemoryObject(
429 MemObjRecord *Record,
Requirement *Req, AllocaCommandBase *HostAllocaCmd,
430 std::vector<Command *> &ToEnqueue) {
431 assert(HostAllocaCmd->getQueue()->is_host() &&
432 "Host alloca command expected");
433 assert(HostAllocaCmd->MIsActive &&
"Active alloca command expected");
435 AllocaCommandBase *LinkedAllocaCmd = HostAllocaCmd->MLinkedAllocaCmd;
436 assert(LinkedAllocaCmd &&
"Linked alloca command expected");
438 std::set<Command *> Deps = findDepsForReq(Record, Req, Record->MCurContext);
440 UnMapMemObject *UnMapCmd =
new UnMapMemObject(
441 LinkedAllocaCmd, *LinkedAllocaCmd->getRequirement(),
442 &HostAllocaCmd->MMemAllocation, LinkedAllocaCmd->getQueue());
448 MapMemObject *MapCmd =
new MapMemObject(
449 LinkedAllocaCmd, *LinkedAllocaCmd->getRequirement(),
450 &HostAllocaCmd->MMemAllocation, LinkedAllocaCmd->getQueue(), MapMode);
452 std::vector<Command *> ToCleanUp;
454 Command *ConnCmd = UnMapCmd->addDep(
455 DepDesc{Dep, UnMapCmd->getRequirement(), LinkedAllocaCmd}, ToCleanUp);
457 ToEnqueue.push_back(ConnCmd);
460 Command *ConnCmd = MapCmd->addDep(
461 DepDesc{UnMapCmd, MapCmd->getRequirement(), HostAllocaCmd}, ToCleanUp);
463 ToEnqueue.push_back(ConnCmd);
469 Record->MHostAccess = MapMode;
477 std::vector<Command *> &ToEnqueue) {
481 if (Record && MPrintOptionsArray[BeforeAddCopyBack])
482 printGraphAsDot(
"before_addCopyBack");
488 std::set<Command *> Deps =
489 findDepsForReq(Record, Req, HostQueue->getContextImplPtr());
491 findAllocaForReq(Record, Req, Record->
MCurContext);
493 auto MemCpyCmdUniquePtr = std::make_unique<MemCpyCommandHost>(
495 SrcAllocaCmd->
getQueue(), std::move(HostQueue));
497 if (!MemCpyCmdUniquePtr)
498 throw runtime_error(
"Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
502 std::vector<Command *> ToCleanUp;
507 ToEnqueue.push_back(ConnCmd);
510 updateLeaves(Deps, Record, Req->
MAccessMode, ToCleanUp);
511 addNodeToLeaves(Record, MemCpyCmd, Req->
MAccessMode, ToEnqueue);
514 if (MPrintOptionsArray[AfterAddCopyBack])
515 printGraphAsDot(
"after_addCopyBack");
523 std::vector<Command *> &ToEnqueue) {
532 MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req, ToEnqueue);
533 if (MPrintOptionsArray[BeforeAddHostAcc])
534 printGraphAsDot(
"before_addHostAccessor");
535 markModifiedIfWrite(Record, Req);
538 getOrCreateAllocaForReq(Record, Req, HostQueue, ToEnqueue);
543 remapMemoryObject(Record, Req,
551 insertMemoryMove(Record, Req, HostQueue, ToEnqueue);
554 insertUpdateHostReqCmd(Record, Req, HostQueue, ToEnqueue);
558 addEmptyCmd(UpdateHostAccCmd, {Req}, HostQueue,
563 if (MPrintOptionsArray[AfterAddHostAcc])
564 printGraphAsDot(
"after_addHostAccessor");
566 return UpdateHostAccCmd;
570 std::unique_ptr<detail::CG> CommandGroup,
const QueueImplPtr &HostQueue,
571 std::vector<Command *> &ToEnqueue) {
573 auto UpdateHost =
static_cast<CGUpdateHost *
>(CommandGroup.get());
576 MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req, ToEnqueue);
577 return insertMemoryMove(Record, Req, HostQueue, ToEnqueue);
589 Scheduler::GraphBuilder::findDepsForReq(
MemObjRecord *Record,
592 std::set<Command *> RetDeps;
593 std::vector<Command *> Visited;
601 ToAnalyze.insert(ToAnalyze.begin(), V.begin(), V.end());
604 while (!ToAnalyze.empty()) {
605 Command *DepCmd = ToAnalyze.back();
606 ToAnalyze.pop_back();
608 std::vector<Command *> NewAnalyze;
610 for (
const DepDesc &Dep : DepCmd->MDeps) {
611 if (Dep.MDepRequirement->MSYCLMemObj != Req->
MSYCLMemObj)
614 bool CanBypassDep =
false;
620 CanBypassDep |= !
doOverlap(Dep.MDepRequirement, Req);
625 sameCtx(Context, Dep.MDepCommand->getQueue()->getContextImplPtr());
628 RetDeps.insert(DepCmd);
636 NewAnalyze.push_back(Dep.MDepCommand);
638 ToAnalyze.insert(ToAnalyze.end(), NewAnalyze.begin(), NewAnalyze.end());
653 assert(
false &&
"No dependency found for a leaf of the record");
654 return {
nullptr,
nullptr,
nullptr};
662 auto IsSuitableAlloca = [&Context, Req,
664 bool Res =
sameCtx(AllocaCmd->
getQueue()->getContextImplPtr(), Context);
667 Res &= AllocaCmd->
getType() == Command::CommandType::ALLOCA_SUB_BUF;
670 Res &= AllowConst || !AllocaCmd->
MIsConst;
681 if (std::strcmp(HUMConfig,
"0") == 0)
682 return Ctx->is_host();
683 if (std::strcmp(HUMConfig,
"1") == 0)
686 for (
const device &Device : Ctx->getDevices()) {
687 if (!Device.get_info<info::device::host_unified_memory>())
697 AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
699 std::vector<Command *> &ToEnqueue) {
701 AllocaCommandBase *AllocaCmd = findAllocaForReq(
702 Record, Req, Queue->getContextImplPtr(),
false);
705 std::vector<Command *> ToCleanUp;
709 range<3> ParentRange{Req->MSYCLMemObj->getSizeInBytes(), 1, 1};
711 {0, 0, 0}, ParentRange, ParentRange,
713 sizeof(char),
size_t(0));
716 getOrCreateAllocaForReq(Record, &ParentRequirement, Queue, ToEnqueue);
717 AllocaCmd =
new AllocaSubBufCommand(Queue, *Req, ParentAlloca, ToEnqueue,
721 const Requirement FullReq( {0, 0, 0}, Req->MMemoryRange,
723 Req->MSYCLMemObj, Req->MDims, Req->MElemSize,
732 const bool HostUnifiedMemory =
734 SYCLMemObjI *MemObj = Req->MSYCLMemObj;
735 const bool InitFromUserData = Record->MAllocaCommands.empty() &&
736 (HostUnifiedMemory || MemObj->isInterop());
737 AllocaCommandBase *LinkedAllocaCmd =
nullptr;
742 if (Record->MAllocaCommands.empty()) {
743 if (!HostUnifiedMemory &&
748 if (MemObj->hasUserDataPtr()) {
751 AllocaCommand *HostAllocaCmd =
new AllocaCommand(
754 MemObj->isHostPointerReadOnly() );
755 Record->MAllocaCommands.push_back(HostAllocaCmd);
756 Record->MWriteLeaves.push_back(HostAllocaCmd, ToEnqueue);
757 ++(HostAllocaCmd->MLeafCounter);
765 if (Req->MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer)
770 if (Queue->is_host() != Record->MCurContext->is_host()) {
779 bool PinnedHostMemory = MemObj->usesPinnedHostMemory();
781 bool HostUnifiedMemoryOnNonHostDevice =
784 if (PinnedHostMemory || HostUnifiedMemoryOnNonHostDevice) {
785 AllocaCommandBase *LinkedAllocaCmdCand = findAllocaForReq(
786 Record, Req, Record->MCurContext,
false);
789 if (LinkedAllocaCmdCand &&
790 !LinkedAllocaCmdCand->MLinkedAllocaCmd) {
791 LinkedAllocaCmd = LinkedAllocaCmdCand;
798 new AllocaCommand(Queue, FullReq, InitFromUserData, LinkedAllocaCmd);
801 if (LinkedAllocaCmd) {
802 Command *ConnCmd = AllocaCmd->addDep(
803 DepDesc{LinkedAllocaCmd, AllocaCmd->getRequirement(),
807 ToEnqueue.push_back(ConnCmd);
808 LinkedAllocaCmd->MLinkedAllocaCmd = AllocaCmd;
811 ConnCmd = AllocaCmd->getReleaseCmd()->addDep(
812 DepDesc(LinkedAllocaCmd->getReleaseCmd(),
813 AllocaCmd->getRequirement(), LinkedAllocaCmd),
816 ToEnqueue.push_back(ConnCmd);
822 if (Queue->is_host()) {
823 AllocaCmd->MIsActive =
false;
825 LinkedAllocaCmd->MIsActive =
false;
826 Record->MCurContext = Queue->getContextImplPtr();
828 std::set<Command *> Deps =
829 findDepsForReq(Record, Req, Queue->getContextImplPtr());
831 Command *ConnCmd = AllocaCmd->addDep(
832 DepDesc{Dep, Req, LinkedAllocaCmd}, ToCleanUp);
834 ToEnqueue.push_back(ConnCmd);
836 updateLeaves(Deps, Record, Req->MAccessMode, ToCleanUp);
837 addNodeToLeaves(Record, AllocaCmd, Req->MAccessMode, ToEnqueue);
842 Record->MAllocaCommands.push_back(AllocaCmd);
843 Record->MWriteLeaves.push_back(AllocaCmd, ToEnqueue);
844 ++(AllocaCmd->MLeafCounter);
852 void Scheduler::GraphBuilder::markModifiedIfWrite(MemObjRecord *Record,
854 switch (Req->MAccessMode) {
860 Record->MMemModified =
true;
867 EmptyCommand *Scheduler::GraphBuilder::addEmptyCmd(
868 Command *Cmd,
const std::vector<Requirement *> &Reqs,
870 std::vector<Command *> &ToEnqueue,
const bool AddDepsToLeaves) {
871 EmptyCommand *EmptyCmd =
875 throw runtime_error(
"Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
877 EmptyCmd->MIsBlockable =
true;
879 EmptyCmd->MBlockReason = Reason;
882 MemObjRecord *Record = getOrInsertMemObjRecord(Queue, Req, ToEnqueue);
883 AllocaCommandBase *AllocaCmd =
884 getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue);
885 EmptyCmd->addRequirement(Cmd, AllocaCmd, Req);
890 Cmd->addUser(EmptyCmd);
892 if (AddDepsToLeaves) {
893 const std::vector<DepDesc> &Deps = Cmd->MDeps;
894 std::vector<Command *> ToCleanUp;
895 for (
const DepDesc &Dep : Deps) {
899 updateLeaves({Cmd}, Record, Req->MAccessMode, ToCleanUp);
900 addNodeToLeaves(Record, EmptyCmd, Req->MAccessMode, ToEnqueue);
910 if (Cmd->
getCG().
getType() != CG::CGTYPE::CodeplayHostTask)
920 std::unordered_map<SYCLMemObjI *, access::mode> CombinedModes;
921 bool HasDuplicateMemObjects =
false;
923 auto Result = CombinedModes.insert(
924 std::make_pair(Req->MSYCLMemObj, Req->MAccessMode));
925 if (!Result.second) {
926 Result.first->second =
928 HasDuplicateMemObjects =
true;
932 if (!HasDuplicateMemObjects)
935 Req->MAccessMode = CombinedModes[Req->MSYCLMemObj];
940 std::unique_ptr<detail::CG> CommandGroup,
const QueueImplPtr &Queue,
941 std::vector<Command *> &ToEnqueue,
943 const std::vector<sycl::detail::pi::PiExtSyncPoint> &Dependencies) {
944 std::vector<Requirement *> &Reqs = CommandGroup->getRequirements();
945 std::vector<detail::EventImplPtr> &Events = CommandGroup->getEvents();
947 auto NewCmd = std::make_unique<ExecCGCommand>(
948 std::move(CommandGroup), Queue, CommandBuffer, std::move(Dependencies));
951 throw runtime_error(
"Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
957 auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
959 if (NewCmd->isFusable()) {
960 auto *FusionCmd = findFusionList(QUniqueID)->second.get();
962 bool dependsOnFusion =
false;
963 for (
auto Ev = Events.begin(); Ev != Events.end();) {
964 auto *EvDepCmd =
static_cast<Command *
>((*Ev)->getCommand());
971 if (EvDepCmd->getQueue() != Queue && isPartOfActiveFusion(EvDepCmd)) {
973 "Aborting fusion because of event dependency from a "
979 if (EvDepCmd == FusionCmd) {
980 Ev = Events.erase(Ev);
981 dependsOnFusion =
true;
991 if (dependsOnFusion) {
992 for (
auto *Cmd : FusionCmd->getFusionList()) {
993 Events.push_back(Cmd->getEvent());
1000 createGraphForCommand(NewCmd.get(), NewCmd->getCG(),
1002 Queue, FusionCmd->auxiliaryCommands());
1006 FusionCmd->addToFusionList(NewCmd.get());
1007 NewCmd->MFusionCmd = FusionCmd;
1008 std::vector<Command *> ToCleanUp;
1011 auto ConnectionCmd = FusionCmd->addDep(NewCmd->getEvent(), ToCleanUp);
1012 if (ConnectionCmd) {
1013 FusionCmd->auxiliaryCommands().push_back(ConnectionCmd);
1015 return {NewCmd.release(), FusionCmd->getEvent(),
false};
1018 std::stringstream ss(
s);
1019 if (NewCmd->getCG().getType() == CG::CGTYPE::Kernel) {
1020 ss <<
"Not fusing kernel with 'use_root_sync' property. Can only fuse "
1021 "non-cooperative device kernels.";
1023 ss <<
"Not fusing '" << NewCmd->getTypeString()
1024 <<
"' command group. Can only fuse device kernel command groups.";
1026 printFusionWarning(ss.str());
1029 createGraphForCommand(NewCmd.get(), NewCmd->getCG(),
1032 auto Event = NewCmd->getEvent();
1033 return {NewCmd.release(), Event,
true};
1036 void Scheduler::GraphBuilder::createGraphForCommand(
1038 std::vector<Requirement *> &Reqs,
1039 const std::vector<detail::EventImplPtr> &Events,
QueueImplPtr Queue,
1040 std::vector<Command *> &ToEnqueue) {
1042 if (MPrintOptionsArray[BeforeAddCG])
1043 printGraphAsDot(
"before_addCG");
1049 std::vector<Command *> ToCleanUp;
1054 bool isSameCtx =
false;
1060 Record = getOrInsertMemObjRecord(QueueForAlloca, Req, ToEnqueue);
1061 markModifiedIfWrite(Record, Req);
1064 getOrCreateAllocaForReq(Record, Req, QueueForAlloca, ToEnqueue);
1067 sameCtx(QueueForAlloca->getContextImplPtr(), Record->MCurContext);
1077 remapMemoryObject(Record, Req,
1087 bool NeedMemMoveToHost =
false;
1088 auto MemMoveTargetQueue = Queue;
1090 if (isInteropTask) {
1091 const detail::CGHostTask &HT =
static_cast<detail::CGHostTask &
>(CG);
1093 if (HT.MQueue->getContextImplPtr() != Record->
MCurContext) {
1094 NeedMemMoveToHost =
true;
1095 MemMoveTargetQueue = HT.MQueue;
1097 }
else if (!Queue->is_host() && !Record->
MCurContext->is_host())
1098 NeedMemMoveToHost =
true;
1100 if (NeedMemMoveToHost)
1101 insertMemoryMove(Record, Req,
1104 insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue);
1106 std::set<Command *> Deps =
1107 findDepsForReq(Record, Req, Queue->getContextImplPtr());
1110 if (Dep != NewCmd) {
1112 NewCmd->
addDep(DepDesc{Dep, Req, AllocaCmd}, ToCleanUp);
1114 ToEnqueue.push_back(ConnCmd);
1123 std::vector<DepDesc> Deps = NewCmd->
MDeps;
1124 for (DepDesc &Dep : Deps) {
1127 updateLeaves({Dep.MDepCommand}, Record, Req->MAccessMode, ToCleanUp);
1128 addNodeToLeaves(Record, NewCmd, Req->MAccessMode, ToEnqueue);
1133 if (e->getCommand() && e->getCommand() == NewCmd) {
1137 ToEnqueue.push_back(ConnCmd);
1140 if (MPrintOptionsArray[AfterAddCG])
1141 printGraphAsDot(
"after_addCG");
1143 for (
Command *Cmd : ToCleanUp) {
1144 cleanupCommand(Cmd);
1153 cleanupCommand(Cmd);
1158 cleanupCommand(Cmd);
1163 std::vector<AllocaCommandBase *> &AllocaCommands = Record->
MAllocaCommands;
1164 if (AllocaCommands.empty())
1167 assert(MCmdsToVisit.empty());
1168 MVisitedCmds.clear();
1172 for (
Command *AllocaCmd : AllocaCommands) {
1178 if (UserCmd->
getType() != Command::CommandType::ALLOCA)
1179 MCmdsToVisit.push(UserCmd);
1186 AllocaCmd->
MUsers.clear();
1204 while (!MCmdsToVisit.empty()) {
1205 Command *Cmd = MCmdsToVisit.front();
1212 if (UserCmd->
getType() != Command::CommandType::ALLOCA)
1213 MCmdsToVisit.push(UserCmd);
1217 std::map<Command *, bool> ShouldBeUpdated;
1218 auto NewEnd = std::remove_if(
1220 if (std::find(AllocaCommands.begin(), AllocaCommands.end(),
1221 Dep.MAllocaCmd) != AllocaCommands.end()) {
1222 ShouldBeUpdated.insert({Dep.MDepCommand, true});
1228 Cmd->
MDeps.erase(NewEnd, Cmd->MDeps.end());
1231 for (
auto DepCmdIt : ShouldBeUpdated) {
1232 if (!DepCmdIt.second)
1234 DepCmdIt.first->MUsers.erase(Cmd);
1239 if (Cmd->MDeps.empty()) {
1240 Cmd->MUsers.clear();
1243 if (!Cmd->MMarkedForCleanup)
1244 Cmd->MMarks.MToBeDeleted =
true;
1252 Command *Cmd, [[maybe_unused]]
bool AllowUnsubmitted) {
1254 static bool DeprWarningPrinted =
false;
1255 if (!DeprWarningPrinted) {
1256 std::cerr <<
"WARNING: The enviroment variable "
1257 "SYCL_DISABLE_POST_ENQUEUE_CLEANUP is deprecated. Please "
1258 "use SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP instead.\n";
1259 DeprWarningPrinted =
true;
1291 DepCmd->
MUsers.erase(Cmd);
1302 Cmd->
getEvent()->setCommand(
nullptr);
1307 const auto It = std::find_if(
1308 MMemObjs.begin(), MMemObjs.end(),
1309 [MemObject](
const SYCLMemObjI *Obj) { return Obj == MemObject; });
1310 if (It != MMemObjs.end())
1330 std::vector<Command *> &ToCleanUp) {
1339 std::move(HT), {}, {}, {},
1348 }
catch (
const std::bad_alloc &) {
1349 throw runtime_error(
"Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
1356 (void)ConnectCmd->
addDep(Dep, ToCleanUp);
1357 assert(
reinterpret_cast<Command *
>(DepEvent->getCommand()) ==
1366 std::ignore = Cmd->
addDep(DepOnConnect, ToCleanUp);
1371 if (
Command *DepCmd =
reinterpret_cast<Command *
>(DepEvent->getCommand()))
1374 std::ignore = ConnectCmd->
addDep(DepEvent, ToCleanUp);
1386 auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
1387 MFusionMap.emplace(QUniqueID, std::make_unique<KernelFusionCommand>(Queue));
1391 sycl::detail::queue_impl *Queue) {
1392 auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue);
1395 "Queue already in fusion mode"};
1397 auto OldFusionCmd = findFusionList(QUniqueID);
1398 if (OldFusionCmd != MFusionMap.end()) {
1403 OldFusionCmd->second->setFusionStatus(
1405 cleanupCommand(OldFusionCmd->second.release());
1406 MFusionMap.erase(OldFusionCmd);
1410 void Scheduler::GraphBuilder::removeNodeFromGraph(
1411 Command *Node, std::vector<Command *> &ToEnqueue) {
1414 for (
auto &Dep : Node->
MDeps) {
1415 auto AccessMode = Dep.MDepRequirement->MAccessMode;
1418 Node->
MLeafCounter -= Record->MReadLeaves.remove(Node);
1419 Node->
MLeafCounter -= Record->MWriteLeaves.remove(Node);
1423 for (
auto PrevDep : Dep.MDepCommand->MDeps) {
1424 auto *DepReq = PrevDep.MDepRequirement;
1426 if (DepRecord == Record) {
1429 assert(Dep.MDepCommand);
1430 addNodeToLeaves(Record, Dep.MDepCommand, DepReq->MAccessMode,
1435 Dep.MDepCommand->MUsers.erase(Node);
1445 std::vector<Command *> &ToEnqueue) {
1446 auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
1452 auto *PlaceholderCmd = (*FusionList).second.get();
1455 auto FusedCmdList = PlaceholderCmd->getFusionList();
1456 ToEnqueue.insert(ToEnqueue.end(), FusedCmdList.begin(), FusedCmdList.end());
1460 ToEnqueue.insert(ToEnqueue.end(), PlaceholderCmd->auxiliaryCommands().begin(),
1461 PlaceholderCmd->auxiliaryCommands().end());
1463 ToEnqueue.push_back(PlaceholderCmd);
1465 if (MPrintOptionsArray[AfterFusionCancel]) {
1466 printGraphAsDot(
"after_fusionCancel");
1475 return static_cast<ExecCGCommand *
>(Cmd)->MFusionCmd == Fusion;
1492 return !PredPartOfFusion;
1500 for (
auto &Dep : Cmd->
MDeps) {
1501 auto *DepCmd = Dep.MDepCommand;
1510 auto *EvDepCmd =
static_cast<Command *
>(Ev->getCommand());
1519 auto *EvDepCmd =
static_cast<Command *
>(Ev->getCommand());
1532 std::vector<Command *> &ToEnqueue,
1534 auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
1535 #if SYCL_EXT_CODEPLAY_KERNEL_FUSION
1537 auto InactiveFusionList = findFusionList(QUniqueID);
1538 if (InactiveFusionList == MFusionMap.end()) {
1541 "Calling complete_fusion on a queue not in fusion mode"};
1543 return InactiveFusionList->second->getEvent();
1547 auto *PlaceholderCmd =
FusionList->second.get();
1548 auto &CmdList = PlaceholderCmd->getFusionList();
1562 bool CreatesCircularDep =
1563 MFusionMap.size() > 1 &&
1565 return checkForCircularDependency(Cmd, true, PlaceholderCmd);
1567 if (CreatesCircularDep) {
1570 "Aborting fusion because it would create a circular dependency");
1571 auto LastEvent = PlaceholderCmd->getEvent();
1578 Queue, CmdList, PropList);
1584 auto LastEvent = PlaceholderCmd->getEvent();
1590 std::vector<EventImplPtr> FusedEventDeps;
1591 for (
auto *Cmd : CmdList) {
1592 FusedEventDeps.insert(FusedEventDeps.end(),
1595 FusedEventDeps.insert(FusedEventDeps.end(),
1602 FusedEventDeps.erase(
1603 std::remove_if(FusedEventDeps.begin(), FusedEventDeps.end(),
1605 if (E->getCommand() == PlaceholderCmd) {
1608 if (E->getCommand() &&
1612 static_cast<ExecCGCommand *>(E->getCommand());
1613 if (RunCGCmd->MFusionCmd == PlaceholderCmd) {
1619 FusedEventDeps.end());
1621 auto FusedKernelCmd =
1622 std::make_unique<ExecCGCommand>(std::move(FusedCG), Queue);
1626 PlaceholderCmd->getEvent());
1627 assert(PlaceholderCmd->MDeps.empty());
1632 for (
auto OldCmd = CmdList.rbegin(); OldCmd != CmdList.rend(); ++OldCmd) {
1633 removeNodeFromGraph(*OldCmd, ToEnqueue);
1634 cleanupCommand(*OldCmd,
true);
1637 createGraphForCommand(FusedKernelCmd.get(), FusedKernelCmd->getCG(),
false,
1638 FusedKernelCmd->getCG().getRequirements(),
1639 FusedEventDeps, Queue, ToEnqueue);
1641 ToEnqueue.push_back(FusedKernelCmd.get());
1643 std::vector<Command *> ToCleanUp;
1645 auto *ConnectToPlaceholder =
1646 PlaceholderCmd->addDep(FusedKernelCmd->getEvent(), ToCleanUp);
1647 if (ConnectToPlaceholder) {
1648 ToEnqueue.push_back(ConnectToPlaceholder);
1650 for (Command *Cmd : ToCleanUp) {
1651 cleanupCommand(Cmd);
1653 ToEnqueue.push_back(PlaceholderCmd);
1655 if (MPrintOptionsArray[AfterFusionComplete]) {
1656 printGraphAsDot(
"after_fusionComplete");
1662 return FusedKernelCmd.release()->getEvent();
1664 printFusionWarning(
"Kernel fusion not supported by this build");
1667 auto *PlaceholderCmd =
FusionList->second.get();
1668 auto LastEvent = PlaceholderCmd->getEvent();
1669 this->cancelFusion(Queue, ToEnqueue);
1674 bool Scheduler::GraphBuilder::isInFusionMode(
QueueIdT Id) {
detail::SYCLMemObjI * MSYCLMemObj
range< 3 > & MAccessRange
Base class for memory allocation commands.
const Requirement * getRequirement() const final
AllocaCommandBase * MLinkedAllocaCmd
Alloca command linked with current command.
bool MIsActive
Indicates that current alloca is active one.
The AllocaSubBuf command enqueues creation of sub-buffer of memory object.
std::unique_ptr< HostTask > MHostTask
"Update host" command group class.
Base class for all types of command groups.
The Command class represents some action that needs to be performed on one or more memory objects.
bool isSuccessfullyEnqueued() const
const std::vector< EventImplPtr > & getPreparedDepsEvents() const
const std::vector< EventImplPtr > & getPreparedHostDepsEvents() const
unsigned MLeafCounter
Counts the number of memory objects this command is a leaf for.
std::unordered_set< Command * > MUsers
Contains list of commands that depend on the command.
virtual const ContextImplPtr & getWorkerContext() const
Get the context of the queue this command will be submitted to.
Marks MMarks
Used for marking the node during graph traversal.
std::vector< DepDesc > MDeps
Contains list of dependencies(edges)
virtual bool readyForCleanup() const
Returns true iff this command is ready to be submitted for cleanup.
void addUser(Command *NewUser)
void clearAllDependencies()
Clear all dependency events This should only be used if a command is about to be deleted without bein...
const EventImplPtr & getEvent() const
CommandType getType() const
virtual void printDot(std::ostream &Stream) const =0
Command * addDep(DepDesc NewDep, std::vector< Command * > &ToCleanUp)
const QueueImplPtr & getQueue() const
The empty command does nothing during enqueue.
The exec CG command enqueues execution of kernel or explicit memory operation.
detail::CG & getCG() const
The KernelFusionCommand is placed in the execution graph together with the individual kernels of the ...
bool readyForDeletion() const
A wrapper for CircularBuffer class along with collection for host accessor's EmptyCommands.
size_t remove(value_type Cmd)
Replacement for std::remove with subsequent call to erase(newEnd, end()).
std::function< void(Command *, Command *, MemObjRecord *, EnqueueListT &)> AllocateDependencyF
std::vector< value_type > toVector() const
std::vector< Command * > EnqueueListT
The map command enqueues mapping of device memory onto host memory.
The mem copy host command enqueues memory copy between two instances of memory object.
const Requirement * getRequirement() const final
virtual ContextImplPtr getInteropContext() const =0
std::shared_ptr< MemObjRecord > MRecord
void handleWriteAccessorCreation()
void startFusion(QueueImplPtr Queue)
void cleanupCommand(Command *Cmd, bool AllowUnsubmitted=false)
void decrementLeafCountersForRecord(MemObjRecord *Record)
Decrements leaf counters for all leaves of the record.
MemObjRecord * getMemObjRecord(SYCLMemObjI *MemObject)
EventImplPtr completeFusion(QueueImplPtr Queue, std::vector< Command * > &ToEnqueue, const property_list &)
Command * addHostAccessor(Requirement *Req, std::vector< Command * > &ToEnqueue)
Enqueues a command to create a host accessor.
void cleanupCommandsForRecord(MemObjRecord *Record)
Removes commands that use the given MemObjRecord from the graph.
GraphBuildResult addCG(std::unique_ptr< detail::CG > CommandGroup, const QueueImplPtr &Queue, std::vector< Command * > &ToEnqueue, sycl::detail::pi::PiExtCommandBuffer CommandBuffer=nullptr, const std::vector< sycl::detail::pi::PiExtSyncPoint > &Dependencies={})
Registers command group and adds it to the dependency graph.
void removeRecordForMemObj(SYCLMemObjI *MemObject)
Removes the MemObjRecord for the memory object passed.
Command * addCopyBack(Requirement *Req, std::vector< Command * > &ToEnqueue)
Enqueues a command to update memory to the latest state.
Command * connectDepEvent(Command *const Cmd, const EventImplPtr &DepEvent, const DepDesc &Dep, std::vector< Command * > &ToCleanUp)
Perform connection of events in multiple contexts.
Command * addCGUpdateHost(std::unique_ptr< detail::CG > CommandGroup, const QueueImplPtr &HostQueue, std::vector< Command * > &ToEnqueue)
Registers a command group that updates host memory to the latest state.
MemObjRecord * getOrInsertMemObjRecord(const QueueImplPtr &Queue, const Requirement *Req, std::vector< Command * > &ToEnqueue)
void cancelFusion(QueueImplPtr Queue, std::vector< Command * > &ToEnqueue)
void addNodeToLeaves(MemObjRecord *Record, Command *Cmd, access::mode AccessMode, std::vector< Command * > &ToEnqueue)
Adds new command to leaves if needed.
void cleanUpCmdFusion(sycl::detail::queue_impl *Queue)
Clean up the internal fusion commands held for the given queue.
AllocaCommandBase * findAllocaForReq(MemObjRecord *Record, const Requirement *Req, const ContextImplPtr &Context, bool AllowConst=true)
Searches for suitable alloca in memory record.
DepDesc findDepForRecord(Command *Cmd, MemObjRecord *Record)
Finds a command dependency corresponding to the record.
void updateLeaves(const std::set< Command * > &Cmds, MemObjRecord *Record, access::mode AccessMode, std::vector< Command * > &ToCleanUp)
Removes commands from leaves.
QueueImplPtr getDefaultHostQueue()
bool isInFusionMode(QueueIdT Queue)
void cancelFusion(QueueImplPtr Queue)
static MemObjRecord * getMemObjRecord(const Requirement *const Req)
static Scheduler & getInstance()
void cleanUpCmdFusion(sycl::detail::queue_impl *Queue)
void takeAuxiliaryResources(const EventImplPtr &Dst, const EventImplPtr &Src)
Assign Src's auxiliary resources to Dst.
QueueImplPtr DefaultHostQueue
The unmap command removes mapping of host memory onto device memory.
const Requirement * getRequirement() const final
std::unique_ptr< detail::CG > fuseKernels(QueueImplPtr Queue, std::vector< ExecCGCommand * > &InputKernels, const property_list &)
static jit_compiler & get_instance()
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Objects of the property_list class are containers for the SYCL properties.
__SYCL_EXTERN_STREAM_ATTRS ostream cerr
Linked to standard error (unbuffered)
static bool doOverlap(const Requirement *LHS, const Requirement *RHS)
Checks whether two requirements overlap or not.
static bool createsCircularDependency(Command *Cmd, bool PredPartOfFusion, KernelFusionCommand *Fusion)
static bool isInteropHostTask(ExecCGCommand *Cmd)
static void unmarkVisitedNodes(std::vector< Command * > &Visited)
static bool checkForCircularDependency(Command *, bool, KernelFusionCommand *)
std::hash< std::shared_ptr< detail::queue_impl > >::result_type QueueIdT
static access::mode combineAccessModes(access::mode A, access::mode B)
Combines two access modes into a single one that allows both.
static bool markNodeAsVisited(Command *Cmd, std::vector< Command * > &Visited)
std::shared_ptr< sycl::detail::context_impl > ContextImplPtr
std::unique_ptr< KernelFusionCommand > FusionList
decltype(Obj::impl) getSyclObjImpl(const Obj &SyclObject)
static bool isPartOfFusion(Command *Cmd, KernelFusionCommand *Fusion)
std::shared_ptr< event_impl > EventImplPtr
static void printDotRecursive(std::fstream &Stream, std::vector< Command * > &Visited, Command *Cmd)
AccessorImplHost Requirement
std::shared_ptr< device_impl > DeviceImplPtr
static void handleVisitedNodes(std::vector< Command * > &Visited)
static bool IsSuitableSubReq(const Requirement *Req)
Checks if current requirement is requirement for sub buffer.
static Command * insertMapUnmapForLinkedCmds(AllocaCommandBase *AllocaCmdSrc, AllocaCommandBase *AllocaCmdDst, access::mode MapMode)
static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx)
std::shared_ptr< sycl::detail::queue_impl > QueueImplPtr
static bool isAccessModeAllowed(access::mode Required, access::mode Current)
Checks if the required access mode is allowed under the current one.
static void combineAccessModesOfReqs(std::vector< Requirement * > &Reqs)
static bool sameCtx(const ContextImplPtr &LHS, const ContextImplPtr &RHS)
constexpr if(sizeof(T)==8)
class __SYCL_EBO __SYCL_SPECIAL_CLASS __SYCL_TYPE(local_accessor) local_accessor class __SYCL_EBO __SYCL_SPECIAL_CLASS AccessMode
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using e and sycl_category()
bool any_of(const simd_mask< _Tp, _Abi > &) noexcept
bool MVisited
Used for marking the node as visited during graph traversal.
bool MToBeDeleted
Used for marking the node for deletion during cleanup.
Dependency between two commands.
const Requirement * MDepRequirement
Requirement for the dependency.
Command * MDepCommand
The actual dependency command.
AllocaCommandBase * MAllocaCmd
Allocation command for the memory object we have requirement for.
LeavesCollection MWriteLeaves
ContextImplPtr MCurContext
std::vector< AllocaCommandBase * > MAllocaCommands
LeavesCollection MReadLeaves