DPC++ Runtime
Runtime libraries for oneAPI DPC++
graph_builder.cpp
1 //===-- graph_builder.cpp - SYCL Graph Builder ------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "detail/config.hpp"
10 #include <detail/context_impl.hpp>
11 #include <detail/event_impl.hpp>
12 #include <sstream>
13 #include <sycl/feature_test.hpp>
14 #if SYCL_EXT_CODEPLAY_KERNEL_FUSION
15 #include <detail/jit_compiler.hpp>
16 #endif
17 #include <detail/graph_impl.hpp>
18 #include <detail/memory_manager.hpp>
19 #include <detail/queue_impl.hpp>
20 #include <detail/scheduler/scheduler.hpp>
21 #include <detail/sycl_mem_obj_i.hpp>
22 #include <sycl/access/access.hpp>
23 #include <sycl/exception.hpp>
24 
25 #include <algorithm>
26 #include <cstdlib>
27 #include <cstring>
28 #include <fstream>
29 #include <map>
30 #include <memory>
31 #include <queue>
32 #include <set>
33 #include <vector>
34 
35 namespace sycl {
36 inline namespace _V1 {
37 namespace detail {
38 
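39 // Checks whether two requirements overlap or not.
40 //
41 // This information can be used to prove that executing two kernels that
42 // work on different parts of the memory object in parallel is legal.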
43 // TODO merge with LeavesCollection's version of doOverlap (see
44 // leaves_collection.cpp).
45 static bool doOverlap(const Requirement *LHS, const Requirement *RHS) {
46  return (LHS->MOffsetInBytes + LHS->MAccessRange.size() * LHS->MElemSize >=
47  RHS->MOffsetInBytes) ||
48  (RHS->MOffsetInBytes + RHS->MAccessRange.size() * RHS->MElemSize >=
49  LHS->MOffsetInBytes);
50 }
51 
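52 // Checks whether the requirement is a sub-buffer requirement.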
53 static bool IsSuitableSubReq(const Requirement *Req) {
54  return Req->MIsSubBuffer;
55 }
56 
57 static bool isOnSameContext(const ContextImplPtr Context,
58  const QueueImplPtr &Queue) {
59  // Covers case for host usage (nullptr == nullptr) and existing device
60  // contexts comparison.
61  return Context == queue_impl::getContext(Queue);
62 }
63 
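64 // Checks if the required access mode is allowed under the current one.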
65 static bool isAccessModeAllowed(access::mode Required, access::mode Current) {
66  switch (Current) {
67  case access::mode::read:
68  return (Required == Current);
69  case access::mode::write:
70  assert(false && "Write only access is expected to be mapped as read_write");
71  return (Required == Current || Required == access::mode::discard_write);
72  case access::mode::read_write:
73  case access::mode::atomic:
74  case access::mode::discard_write:
75  case access::mode::discard_read_write:
76  return true;
77  }
78  assert(false);
79  return false;
80 }
81 
82 // Combines two access modes into a single one that allows both.
83 static access::mode combineAccessModes(access::mode A, access::mode B) {
84  if (A == B)
85  return A;
86 
87  if (A == access::mode::discard_write &&
88  (B == access::mode::discard_read_write || B == access::mode::write))
89  return B;
90 
91  if (B == access::mode::discard_write &&
92  (A == access::mode::discard_read_write || A == access::mode::write))
93  return A;
94 
95  return access::mode::read_write;
96 }
97 
98 Scheduler::GraphBuilder::GraphBuilder() {
99  if (const char *EnvVarCStr = SYCLConfig<SYCL_PRINT_EXECUTION_GRAPH>::get()) {
100  std::string GraphPrintOpts(EnvVarCStr);
101  bool EnableAlways = GraphPrintOpts.find("always") != std::string::npos;
102 
103  if (GraphPrintOpts.find("before_addCG") != std::string::npos ||
104  EnableAlways)
105  MPrintOptionsArray[BeforeAddCG] = true;
106  if (GraphPrintOpts.find("after_addCG") != std::string::npos || EnableAlways)
107  MPrintOptionsArray[AfterAddCG] = true;
108  if (GraphPrintOpts.find("before_addCopyBack") != std::string::npos ||
109  EnableAlways)
110  MPrintOptionsArray[BeforeAddCopyBack] = true;
111  if (GraphPrintOpts.find("after_addCopyBack") != std::string::npos ||
112  EnableAlways)
113  MPrintOptionsArray[AfterAddCopyBack] = true;
114  if (GraphPrintOpts.find("before_addHostAcc") != std::string::npos ||
115  EnableAlways)
116  MPrintOptionsArray[BeforeAddHostAcc] = true;
117  if (GraphPrintOpts.find("after_addHostAcc") != std::string::npos ||
118  EnableAlways)
119  MPrintOptionsArray[AfterAddHostAcc] = true;
120  if (GraphPrintOpts.find("after_fusionComplete") != std::string::npos ||
121  EnableAlways)
122  MPrintOptionsArray[AfterFusionComplete] = true;
123  if (GraphPrintOpts.find("after_fusionCancel") != std::string::npos ||
124  EnableAlways)
125  MPrintOptionsArray[AfterFusionCancel] = true;
126  }
127 }
128 
129 static bool markNodeAsVisited(Command *Cmd, std::vector<Command *> &Visited) {
130  assert(Cmd && "Cmd can't be nullptr");
131  if (Cmd->MMarks.MVisited)
132  return false;
133  Cmd->MMarks.MVisited = true;
134  Visited.push_back(Cmd);
135  return true;
136 }
137 
138 static void unmarkVisitedNodes(std::vector<Command *> &Visited) {
139  for (Command *Cmd : Visited)
140  Cmd->MMarks.MVisited = false;
141 }
142 
143 static void handleVisitedNodes(std::vector<Command *> &Visited) {
144  for (Command *Cmd : Visited) {
145  if (Cmd->MMarks.MToBeDeleted) {
146  if (Cmd->getType() == Command::FUSION &&
147  !static_cast<KernelFusionCommand *>(Cmd)->readyForDeletion()) {
148  // Fusion commands might still be needed because fusion might be
149  // aborted, but a later call to complete_fusion still needs to be able
150  // to return a valid event. Clean-up of fusion commands is therefore
151  // explicitly handled by start fusion.
152  return;
153  }
154  Cmd->getEvent()->setCommand(nullptr);
155  delete Cmd;
156  } else
157  Cmd->MMarks.MVisited = false;
158  }
159 }
160 
161 static void printDotRecursive(std::fstream &Stream,
162  std::vector<Command *> &Visited, Command *Cmd) {
163  if (!markNodeAsVisited(Cmd, Visited))
164  return;
165  for (Command *User : Cmd->MUsers) {
166  if (User)
167  printDotRecursive(Stream, Visited, User);
168  }
169  Cmd->printDot(Stream);
170 }
171 
172 void Scheduler::GraphBuilder::printGraphAsDot(const char *ModeName) {
173  static size_t Counter = 0;
174  std::string ModeNameStr(ModeName);
175  std::string FileName =
176  "graph_" + std::to_string(Counter) + ModeNameStr + ".dot";
177 
178  Counter++;
179 
180  std::fstream Stream(FileName, std::ios::out);
181  Stream << "strict digraph {" << std::endl;
182 
183  MVisitedCmds.clear();
184 
185  for (SYCLMemObjI *MemObject : MMemObjs)
186  for (Command *AllocaCmd : MemObject->MRecord->MAllocaCommands)
187  printDotRecursive(Stream, MVisitedCmds, AllocaCmd);
188 
189  Stream << "}" << std::endl;
190 
191  unmarkVisitedNodes(MVisitedCmds);
192 }
193 
194 MemObjRecord *Scheduler::GraphBuilder::getMemObjRecord(SYCLMemObjI *MemObject) {
195  return MemObject->MRecord.get();
196 }
197 
198 MemObjRecord *
199 Scheduler::GraphBuilder::getOrInsertMemObjRecord(const QueueImplPtr &Queue,
200  const Requirement *Req) {
201  SYCLMemObjI *MemObject = Req->MSYCLMemObj;
202  MemObjRecord *Record = getMemObjRecord(MemObject);
203 
204  if (nullptr != Record)
205  return Record;
206 
207  const size_t LeafLimit = 8;
208  LeavesCollection::AllocateDependencyF AllocateDependency =
209  [this](Command *Dependant, Command *Dependency, MemObjRecord *Record,
210  LeavesCollection::EnqueueListT &ToEnqueue) {
211  // Add the old leaf as a dependency for the new one by duplicating one
212  // of the requirements for the current record
213  DepDesc Dep = findDepForRecord(Dependant, Record);
214  Dep.MDepCommand = Dependency;
215  std::vector<Command *> ToCleanUp;
216  Command *ConnectionCmd = Dependant->addDep(Dep, ToCleanUp);
217  if (ConnectionCmd)
218  ToEnqueue.push_back(ConnectionCmd);
219 
220  --(Dependency->MLeafCounter);
221  if (Dependency->readyForCleanup())
222  ToCleanUp.push_back(Dependency);
223  for (Command *Cmd : ToCleanUp)
224  cleanupCommand(Cmd);
225  };
226 
227  const ContextImplPtr &InteropCtxPtr = Req->MSYCLMemObj->getInteropContext();
228  if (InteropCtxPtr) {
229  // The memory object has been constructed using an interoperability
230  // constructor, which means that there is already an allocation (cl_mem) in
231  // some context. Register this allocation in the SYCL graph.
232 
233  std::vector<sycl::device> Devices =
234  InteropCtxPtr->get_info<info::context::devices>();
235  assert(Devices.size() != 0);
236  DeviceImplPtr Dev = detail::getSyclObjImpl(Devices[0]);
237 
238  // Since all the Scheduler commands require a queue, but we only have a context
239  // here, we need to create a dummy queue bound to the context and one of the
240  // devices from the context.
241  QueueImplPtr InteropQueuePtr{new detail::queue_impl{
242  Dev, InteropCtxPtr, /*AsyncHandler=*/{}, /*PropertyList=*/{}}};
243 
244  MemObject->MRecord.reset(
245  new MemObjRecord{InteropCtxPtr, LeafLimit, AllocateDependency});
246  std::vector<Command *> ToEnqueue;
247  getOrCreateAllocaForReq(MemObject->MRecord.get(), Req, InteropQueuePtr,
248  ToEnqueue);
249  assert(ToEnqueue.empty() && "Creation of the first alloca for a record "
250  "shouldn't lead to any enqueuing (no linked "
251  "alloca or exceeding the leaf limit).");
252  } else
253  MemObject->MRecord.reset(new MemObjRecord{queue_impl::getContext(Queue),
254  LeafLimit, AllocateDependency});
255 
256  MMemObjs.push_back(MemObject);
257  return MemObject->MRecord.get();
258 }
259 
260 void Scheduler::GraphBuilder::updateLeaves(const std::set<Command *> &Cmds,
261  MemObjRecord *Record,
262  access::mode AccessMode,
263  std::vector<Command *> &ToCleanUp) {
264 
265  const bool ReadOnlyReq = AccessMode == access::mode::read;
266  if (ReadOnlyReq)
267  return;
268 
269  for (Command *Cmd : Cmds) {
270  bool WasLeaf = Cmd->MLeafCounter > 0;
271  Cmd->MLeafCounter -= Record->MReadLeaves.remove(Cmd);
272  Cmd->MLeafCounter -= Record->MWriteLeaves.remove(Cmd);
273  if (WasLeaf && Cmd->readyForCleanup()) {
274  ToCleanUp.push_back(Cmd);
275  }
276  }
277 }
278 
279 void Scheduler::GraphBuilder::addNodeToLeaves(
280  MemObjRecord *Record, Command *Cmd, access::mode AccessMode,
281  std::vector<Command *> &ToEnqueue) {
282  LeavesCollection &Leaves{AccessMode == access::mode::read
283  ? Record->MReadLeaves
284  : Record->MWriteLeaves};
285  if (Leaves.push_back(Cmd, ToEnqueue))
286  ++Cmd->MLeafCounter;
287 }
288 
289 UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd(
290  MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue,
291  std::vector<Command *> &ToEnqueue) {
292  auto Context = queue_impl::getContext(Queue);
293  AllocaCommandBase *AllocaCmd = findAllocaForReq(Record, Req, Context);
294  assert(AllocaCmd && "There must be alloca for requirement!");
295  UpdateHostRequirementCommand *UpdateCommand =
296  new UpdateHostRequirementCommand(Queue, *Req, AllocaCmd, &Req->MData);
297  // Need a copy of the requirement because, after the host accessor's destructor
298  // runs, dependencies become invalid if the requirement is stored by pointer.
299  const Requirement *StoredReq = UpdateCommand->getRequirement();
300 
301  std::set<Command *> Deps = findDepsForReq(Record, Req, Context);
302  std::vector<Command *> ToCleanUp;
303  for (Command *Dep : Deps) {
304  Command *ConnCmd =
305  UpdateCommand->addDep(DepDesc{Dep, StoredReq, AllocaCmd}, ToCleanUp);
306  if (ConnCmd)
307  ToEnqueue.push_back(ConnCmd);
308  }
309  updateLeaves(Deps, Record, Req->MAccessMode, ToCleanUp);
310  addNodeToLeaves(Record, UpdateCommand, Req->MAccessMode, ToEnqueue);
311  for (Command *Cmd : ToCleanUp)
312  cleanupCommand(Cmd);
313  return UpdateCommand;
314 }
315 
316 // Takes linked alloca commands and makes the AllocaCmdDst command active
317 // using a map or unmap operation.
318 static Command *insertMapUnmapForLinkedCmds(AllocaCommandBase *AllocaCmdSrc,
319  AllocaCommandBase *AllocaCmdDst,
320  access::mode MapMode) {
320  access::mode MapMode) {
321  assert(AllocaCmdSrc->MLinkedAllocaCmd == AllocaCmdDst &&
322  "Expected linked alloca commands");
323  assert(AllocaCmdSrc->MIsActive &&
324  "Expected source alloca command to be active");
325 
326  if (!AllocaCmdSrc->getQueue()) {
327  UnMapMemObject *UnMapCmd = new UnMapMemObject(
328  AllocaCmdDst, *AllocaCmdDst->getRequirement(),
329  &AllocaCmdSrc->MMemAllocation, AllocaCmdDst->getQueue());
330 
331  std::swap(AllocaCmdSrc->MIsActive, AllocaCmdDst->MIsActive);
332 
333  return UnMapCmd;
334  }
335 
336  MapMemObject *MapCmd = new MapMemObject(
337  AllocaCmdSrc, *AllocaCmdSrc->getRequirement(),
338  &AllocaCmdDst->MMemAllocation, AllocaCmdSrc->getQueue(), MapMode);
339 
340  std::swap(AllocaCmdSrc->MIsActive, AllocaCmdDst->MIsActive);
341 
342  return MapCmd;
343 }
344 
345 Command *Scheduler::GraphBuilder::insertMemoryMove(
346  MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue,
347  std::vector<Command *> &ToEnqueue) {
348 
349  AllocaCommandBase *AllocaCmdDst =
350  getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue);
351  if (!AllocaCmdDst)
352  throw exception(make_error_code(errc::memory_allocation),
353  "Out of host memory");
354 
355  auto Context = queue_impl::getContext(Queue);
356  std::set<Command *> Deps = findDepsForReq(Record, Req, Context);
357  Deps.insert(AllocaCmdDst);
358  // Get parent allocation of sub buffer to perform full copy of whole buffer
359  if (IsSuitableSubReq(Req)) {
360  if (AllocaCmdDst->getType() == Command::CommandType::ALLOCA_SUB_BUF)
361  AllocaCmdDst =
362  static_cast<AllocaSubBufCommand *>(AllocaCmdDst)->getParentAlloca();
363  }
364 
365  AllocaCommandBase *AllocaCmdSrc =
366  findAllocaForReq(Record, Req, Record->MCurContext);
367  if (!AllocaCmdSrc && IsSuitableSubReq(Req)) {
368  // Since no alloca command for the sub buffer requirement was found in the
369  // current context, need to find a parent alloca command for it (it must be
370  // there)
371  auto IsSuitableAlloca = [Record](AllocaCommandBase *AllocaCmd) {
372  bool Res = isOnSameContext(Record->MCurContext, AllocaCmd->getQueue()) &&
373  // Looking for a parent buffer alloca command
374  AllocaCmd->getType() == Command::CommandType::ALLOCA;
375  return Res;
376  };
377  const auto It =
378  std::find_if(Record->MAllocaCommands.begin(),
379  Record->MAllocaCommands.end(), IsSuitableAlloca);
380  AllocaCmdSrc = (Record->MAllocaCommands.end() != It) ? *It : nullptr;
381  }
382  if (!AllocaCmdSrc)
383  throw exception(make_error_code(errc::runtime),
384  "Cannot find buffer allocation");
385  // Get parent allocation of sub buffer to perform full copy of whole buffer
386  if (IsSuitableSubReq(Req)) {
387  if (AllocaCmdSrc->getType() == Command::CommandType::ALLOCA_SUB_BUF)
388  AllocaCmdSrc =
389  static_cast<AllocaSubBufCommand *>(AllocaCmdSrc)->getParentAlloca();
390  else if (AllocaCmdSrc->getSYCLMemObj() != Req->MSYCLMemObj)
391  assert(false && "Inappropriate alloca command.");
392  }
393 
394  Command *NewCmd = nullptr;
395 
396  if (AllocaCmdSrc->MLinkedAllocaCmd == AllocaCmdDst) {
397  // Map write only as read-write
398  access::mode MapMode = Req->MAccessMode;
399  if (MapMode == access::mode::write)
400  MapMode = access::mode::read_write;
401  NewCmd = insertMapUnmapForLinkedCmds(AllocaCmdSrc, AllocaCmdDst, MapMode);
402  Record->MHostAccess = MapMode;
403  } else {
404 
405  if ((Req->MAccessMode == access::mode::discard_write) ||
406  (Req->MAccessMode == access::mode::discard_read_write)) {
407  Record->MCurContext = Context;
408  return nullptr;
409  } else {
410  // A full copy of the buffer is needed to avoid loss of data that may be
411  // caused by copying a specific range from host to device and back.
412  NewCmd =
413  new MemCpyCommand(*AllocaCmdSrc->getRequirement(), AllocaCmdSrc,
414  *AllocaCmdDst->getRequirement(), AllocaCmdDst,
415  AllocaCmdSrc->getQueue(), AllocaCmdDst->getQueue());
416  }
417  }
418  std::vector<Command *> ToCleanUp;
419  for (Command *Dep : Deps) {
420  Command *ConnCmd = NewCmd->addDep(
421  DepDesc{Dep, NewCmd->getRequirement(), AllocaCmdDst}, ToCleanUp);
422  if (ConnCmd)
423  ToEnqueue.push_back(ConnCmd);
424  }
425  updateLeaves(Deps, Record, access::mode::read_write, ToCleanUp);
426  addNodeToLeaves(Record, NewCmd, access::mode::read_write, ToEnqueue);
427  for (Command *Cmd : ToCleanUp)
428  cleanupCommand(Cmd);
429  Record->MCurContext = Context;
430  return NewCmd;
431 }
432 
433 Command *Scheduler::GraphBuilder::remapMemoryObject(
434  MemObjRecord *Record, Requirement *Req, AllocaCommandBase *HostAllocaCmd,
435  std::vector<Command *> &ToEnqueue) {
436  assert(!HostAllocaCmd->getQueue() && "Host alloca command expected");
437  assert(HostAllocaCmd->MIsActive && "Active alloca command expected");
438 
439  AllocaCommandBase *LinkedAllocaCmd = HostAllocaCmd->MLinkedAllocaCmd;
440  assert(LinkedAllocaCmd && "Linked alloca command expected");
441 
442  std::set<Command *> Deps = findDepsForReq(Record, Req, Record->MCurContext);
443 
444  UnMapMemObject *UnMapCmd = new UnMapMemObject(
445  LinkedAllocaCmd, *LinkedAllocaCmd->getRequirement(),
446  &HostAllocaCmd->MMemAllocation, LinkedAllocaCmd->getQueue());
447 
448  // Map write only as read-write
449  access::mode MapMode = Req->MAccessMode;
450  if (MapMode == access::mode::write)
451  MapMode = access::mode::read_write;
452  MapMemObject *MapCmd = new MapMemObject(
453  LinkedAllocaCmd, *LinkedAllocaCmd->getRequirement(),
454  &HostAllocaCmd->MMemAllocation, LinkedAllocaCmd->getQueue(), MapMode);
455 
456  std::vector<Command *> ToCleanUp;
457  for (Command *Dep : Deps) {
458  Command *ConnCmd = UnMapCmd->addDep(
459  DepDesc{Dep, UnMapCmd->getRequirement(), LinkedAllocaCmd}, ToCleanUp);
460  if (ConnCmd)
461  ToEnqueue.push_back(ConnCmd);
462  }
463 
464  Command *ConnCmd = MapCmd->addDep(
465  DepDesc{UnMapCmd, MapCmd->getRequirement(), HostAllocaCmd}, ToCleanUp);
466  if (ConnCmd)
467  ToEnqueue.push_back(ConnCmd);
468 
469  updateLeaves(Deps, Record, access::mode::read_write, ToCleanUp);
470  addNodeToLeaves(Record, MapCmd, access::mode::read_write, ToEnqueue);
471  for (Command *Cmd : ToCleanUp)
472  cleanupCommand(Cmd);
473  Record->MHostAccess = MapMode;
474  return MapCmd;
475 }
476 
477 // The function adds a copy operation of the most up-to-date memory to the
478 // memory pointed to by Req.
479 Command *
480 Scheduler::GraphBuilder::addCopyBack(Requirement *Req,
481  std::vector<Command *> &ToEnqueue) {
482  SYCLMemObjI *MemObj = Req->MSYCLMemObj;
483  MemObjRecord *Record = getMemObjRecord(MemObj);
484  if (Record && MPrintOptionsArray[BeforeAddCopyBack])
485  printGraphAsDot("before_addCopyBack");
486 
487  // Do nothing if there were no operations, or only read operations, on the memory object.
488  if (nullptr == Record || !Record->MMemModified)
489  return nullptr;
490 
491  std::set<Command *> Deps = findDepsForReq(Record, Req, nullptr);
492  AllocaCommandBase *SrcAllocaCmd =
493  findAllocaForReq(Record, Req, Record->MCurContext);
494 
495  auto MemCpyCmdUniquePtr = std::make_unique<MemCpyCommandHost>(
496  *SrcAllocaCmd->getRequirement(), SrcAllocaCmd, *Req, &Req->MData,
497  SrcAllocaCmd->getQueue(), nullptr);
498 
499  if (!MemCpyCmdUniquePtr)
500  throw exception(make_error_code(errc::memory_allocation),
501  "Out of host memory");
502 
503  MemCpyCommandHost *MemCpyCmd = MemCpyCmdUniquePtr.release();
504 
505  std::vector<Command *> ToCleanUp;
506  for (Command *Dep : Deps) {
507  Command *ConnCmd = MemCpyCmd->addDep(
508  DepDesc{Dep, MemCpyCmd->getRequirement(), SrcAllocaCmd}, ToCleanUp);
509  if (ConnCmd)
510  ToEnqueue.push_back(ConnCmd);
511  }
512 
513  updateLeaves(Deps, Record, Req->MAccessMode, ToCleanUp);
514  addNodeToLeaves(Record, MemCpyCmd, Req->MAccessMode, ToEnqueue);
515  for (Command *Cmd : ToCleanUp)
516  cleanupCommand(Cmd);
517  if (MPrintOptionsArray[AfterAddCopyBack])
518  printGraphAsDot("after_addCopyBack");
519  return MemCpyCmd;
520 }
521 
522 // The function implements SYCL host accessor logic: host accessor
523 // should provide access to the buffer in user space.
524 Command *
525 Scheduler::GraphBuilder::addHostAccessor(Requirement *Req,
526  std::vector<Command *> &ToEnqueue) {
527 
528  if (Req->MAccessMode != sycl::access_mode::read) {
529  auto SYCLMemObj = static_cast<detail::SYCLMemObjT *>(Req->MSYCLMemObj);
530  SYCLMemObj->handleWriteAccessorCreation();
531  }
532  // A host accessor is not attached to any queue, so no QueueImplPtr object is
533  // passed to getOrInsertMemObjRecord.
534  MemObjRecord *Record = getOrInsertMemObjRecord(nullptr, Req);
535  if (MPrintOptionsArray[BeforeAddHostAcc])
536  printGraphAsDot("before_addHostAccessor");
537  markModifiedIfWrite(Record, Req);
538 
539  AllocaCommandBase *HostAllocaCmd =
540  getOrCreateAllocaForReq(Record, Req, nullptr, ToEnqueue);
541 
542  if (isOnSameContext(Record->MCurContext, HostAllocaCmd->getQueue())) {
543  if (!isAccessModeAllowed(Req->MAccessMode, Record->MHostAccess)) {
544  remapMemoryObject(Record, Req,
545  Req->MIsSubBuffer ? (static_cast<AllocaSubBufCommand *>(
546  HostAllocaCmd))
547  ->getParentAlloca()
548  : HostAllocaCmd,
549  ToEnqueue);
550  }
551  } else
552  insertMemoryMove(Record, Req, nullptr, ToEnqueue);
553 
554  Command *UpdateHostAccCmd =
555  insertUpdateHostReqCmd(Record, Req, nullptr, ToEnqueue);
556 
557  // Need an empty command that stays blocked until the host accessor is destroyed.
558  EmptyCommand *EmptyCmd = addEmptyCmd(
559  UpdateHostAccCmd, {Req}, Command::BlockReason::HostAccessor, ToEnqueue);
560 
561  Req->MBlockedCmd = EmptyCmd;
562 
563  if (MPrintOptionsArray[AfterAddHostAcc])
564  printGraphAsDot("after_addHostAccessor");
565 
566  return UpdateHostAccCmd;
567 }
568 
569 Command *Scheduler::GraphBuilder::addCGUpdateHost(
570  std::unique_ptr<detail::CG> CommandGroup,
571  std::vector<Command *> &ToEnqueue) {
572 
573  auto UpdateHost = static_cast<CGUpdateHost *>(CommandGroup.get());
574  Requirement *Req = UpdateHost->getReqToUpdate();
575 
576  MemObjRecord *Record = getOrInsertMemObjRecord(nullptr, Req);
577  return insertMemoryMove(Record, Req, nullptr, ToEnqueue);
578 }
579 
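580 // Start the search for dependencies from the list of the record's "leaf"
581 // commands and check whether the examined command can be executed in
582 // parallel with the new one with regard to the memory object. If it can,
583 // continue the search through the dependencies of that command. The rules
584 // used are:
585 // 1. New and examined commands only read -> can bypass.
586 // 2. New and examined commands have non-overlapping requirements -> can bypass.
587 // 3. New and examined commands are on different contexts -> cannot bypass.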
588 std::set<Command *>
589 Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record,
590  const Requirement *Req,
591  const ContextImplPtr &Context) {
592  std::set<Command *> RetDeps;
593  std::vector<Command *> Visited;
594  const bool ReadOnlyReq = Req->MAccessMode == access::mode::read;
595 
596  std::vector<Command *> ToAnalyze{Record->MWriteLeaves.toVector()};
597 
598  if (!ReadOnlyReq) {
599  std::vector<Command *> V{Record->MReadLeaves.toVector()};
600 
601  ToAnalyze.insert(ToAnalyze.begin(), V.begin(), V.end());
602  }
603 
604  while (!ToAnalyze.empty()) {
605  Command *DepCmd = ToAnalyze.back();
606  ToAnalyze.pop_back();
607 
608  std::vector<Command *> NewAnalyze;
609 
610  for (const DepDesc &Dep : DepCmd->MDeps) {
611  if (Dep.MDepRequirement->MSYCLMemObj != Req->MSYCLMemObj)
612  continue;
613 
614  bool CanBypassDep = false;
615  // If both only read
616  CanBypassDep |=
617  Dep.MDepRequirement->MAccessMode == access::mode::read && ReadOnlyReq;
618 
619  // If not overlap
620  CanBypassDep |= !doOverlap(Dep.MDepRequirement, Req);
621 
622  // Bypassing dependencies between different contexts is not supported, as
623  // that would skip copying memory between contexts.
623  if (Dep.MDepCommand) {
624  auto DepQueue = Dep.MDepCommand->getQueue();
625  CanBypassDep &= isOnSameContext(Context, DepQueue);
626  }
627 
628  if (!CanBypassDep) {
629  RetDeps.insert(DepCmd);
630  // No need to analyze the dependencies of the examined command, as it is a
631  // dependency itself.
632  NewAnalyze.clear();
633  break;
634  }
635 
636  if (markNodeAsVisited(Dep.MDepCommand, Visited))
637  NewAnalyze.push_back(Dep.MDepCommand);
638  }
639  ToAnalyze.insert(ToAnalyze.end(), NewAnalyze.begin(), NewAnalyze.end());
640  }
641  unmarkVisitedNodes(Visited);
642  return RetDeps;
643 }
644 
645 // A helper function for finding a command dependency on a specific memory
646 // object
647 DepDesc Scheduler::GraphBuilder::findDepForRecord(Command *Cmd,
648  MemObjRecord *Record) {
649  for (const DepDesc &DD : Cmd->MDeps) {
650  if (getMemObjRecord(DD.MDepRequirement->MSYCLMemObj) == Record) {
651  return DD;
652  }
653  }
654  assert(false && "No dependency found for a leaf of the record");
655  return {nullptr, nullptr, nullptr};
656 }
657 
658 // The function searches for the alloca command matching context and
659 // requirement.
660 AllocaCommandBase *Scheduler::GraphBuilder::findAllocaForReq(
661  MemObjRecord *Record, const Requirement *Req, const ContextImplPtr &Context,
662  bool AllowConst) {
663  auto IsSuitableAlloca = [&Context, Req,
664  AllowConst](AllocaCommandBase *AllocaCmd) {
665  bool Res = isOnSameContext(Context, AllocaCmd->getQueue());
666  if (IsSuitableSubReq(Req)) {
667  const Requirement *TmpReq = AllocaCmd->getRequirement();
668  Res &= AllocaCmd->getType() == Command::CommandType::ALLOCA_SUB_BUF;
669  Res &= TmpReq->MOffsetInBytes == Req->MOffsetInBytes;
670  Res &= TmpReq->MAccessRange == Req->MAccessRange;
671  Res &= AllowConst || !AllocaCmd->MIsConst;
672  }
673  return Res;
674  };
675  const auto It = std::find_if(Record->MAllocaCommands.begin(),
676  Record->MAllocaCommands.end(), IsSuitableAlloca);
677  return (Record->MAllocaCommands.end() != It) ? *It : nullptr;
678 }
679 
680 static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx) {
681  if (const char *HUMConfig = SYCLConfig<SYCL_HOST_UNIFIED_MEMORY>::get()) {
682  if (std::strcmp(HUMConfig, "0") == 0)
683  return Ctx == nullptr;
684  if (std::strcmp(HUMConfig, "1") == 0)
685  return true;
686  }
687  // Host tasks and host accessors have no device context but provide the
688  // required support.
689  if (Ctx == nullptr)
690  return true;
691 
692  for (const device &Device : Ctx->getDevices()) {
693  if (!Device.get_info<info::device::host_unified_memory>())
694  return false;
695  }
696  return true;
697 }
698 
699 // The function searches for the alloca command matching context and
700 // requirement. If none exists, new allocation command is created.
701 // Note, creation of new allocation command can lead to the current context
702 // (Record->MCurContext) change.
703 AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
704  MemObjRecord *Record, const Requirement *Req, const QueueImplPtr &Queue,
705  std::vector<Command *> &ToEnqueue) {
706  auto Context = queue_impl::getContext(Queue);
707  AllocaCommandBase *AllocaCmd =
708  findAllocaForReq(Record, Req, Context, /*AllowConst=*/false);
709 
710  if (!AllocaCmd) {
711  std::vector<Command *> ToCleanUp;
712  if (IsSuitableSubReq(Req)) {
713  // Get the parent requirement. It's hard to get the right parent range,
714  // so the full parent requirement has its range represented in bytes.
715  range<3> ParentRange{Req->MSYCLMemObj->getSizeInBytes(), 1, 1};
716  Requirement ParentRequirement(
717  /*Offset*/ {0, 0, 0}, ParentRange, ParentRange,
718  access::mode::read_write, Req->MSYCLMemObj, /*Dims*/ 1,
719  /*Working with bytes*/ sizeof(char), /*offset*/ size_t(0));
720 
721  auto *ParentAlloca =
722  getOrCreateAllocaForReq(Record, &ParentRequirement, Queue, ToEnqueue);
723  AllocaCmd = new AllocaSubBufCommand(Queue, *Req, ParentAlloca, ToEnqueue,
724  ToCleanUp);
725  } else {
726 
727  const Requirement FullReq(/*Offset*/ {0, 0, 0}, Req->MMemoryRange,
728  Req->MMemoryRange, access::mode::read_write,
729  Req->MSYCLMemObj, Req->MDims, Req->MElemSize,
730  0 /*ReMOffsetInBytes*/, false /*MIsSubBuffer*/);
731  // Can reuse user data for the first allocation. Do so if host unified
732  // memory is supported regardless of the access mode (the pointer will be
733  // reused). For devices without host unified memory the initialization
734  // will be performed as a write operation.
735  // TODO the case where the first alloca is made with a discard mode and
736  // the user pointer is read-only is still not handled: it leads to
737  // unnecessary copy on devices with unified host memory support.
738  const bool HostUnifiedMemory = checkHostUnifiedMemory(Context);
739  SYCLMemObjI *MemObj = Req->MSYCLMemObj;
740  const bool InitFromUserData = Record->MAllocaCommands.empty() &&
741  (HostUnifiedMemory || MemObj->isInterop());
742  AllocaCommandBase *LinkedAllocaCmd = nullptr;
743 
744  // For the first allocation on a device without host unified memory we
745  // might need to also create a host alloca right away in order to perform
746  // the initial memory write.
747  if (Record->MAllocaCommands.empty()) {
748  if (!HostUnifiedMemory &&
749  Req->MAccessMode != access::mode::discard_write &&
750  Req->MAccessMode != access::mode::discard_read_write) {
751  // There's no need to make a host allocation if the buffer is not
752  // initialized with user data.
753  if (MemObj->hasUserDataPtr()) {
754  AllocaCommand *HostAllocaCmd = new AllocaCommand(
755  nullptr, FullReq, true /* InitFromUserData */,
756  nullptr /* LinkedAllocaCmd */,
757  MemObj->isHostPointerReadOnly() /* IsConst */);
758  Record->MAllocaCommands.push_back(HostAllocaCmd);
759  Record->MWriteLeaves.push_back(HostAllocaCmd, ToEnqueue);
760  ++(HostAllocaCmd->MLeafCounter);
761  Record->MCurContext = nullptr;
762  }
763  }
764  } else {
765  // If it is not the first allocation, try to set up a link.
766  // FIXME: Temporary limitation, linked alloca commands for an image are
767  // not supported because the map operation is not implemented for images.
768  if (Req->MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer)
769  // The current limitation is to set up a link between the current allocation
770  // and the new one. There could be situations where we could set up a link
771  // with a non-current allocation, but that would require a memory copy.
772  // A link can only be set up between device (cl) and host allocations.
773  if ((Context == nullptr) != (Record->MCurContext == nullptr)) {
774  // Linked commands assume that the host allocation is reused by the
775  // plugin runtime and that can lead to unnecessary copy overhead on
776  // devices that do not support host unified memory. Do not link the
777  // allocations in this case.
778  // However, if the user explicitly requests use of pinned host
779  // memory, map/unmap operations are expected to work faster than
780  // read/write from/to an arbitrary host pointer. Link such commands
781  // regardless of host unified memory support.
782  bool PinnedHostMemory = MemObj->usesPinnedHostMemory();
783 
784  bool HostUnifiedMemoryOnNonHostDevice =
785  Queue == nullptr ? checkHostUnifiedMemory(Record->MCurContext)
786  : HostUnifiedMemory;
787  if (PinnedHostMemory || HostUnifiedMemoryOnNonHostDevice) {
788  AllocaCommandBase *LinkedAllocaCmdCand = findAllocaForReq(
789  Record, Req, Record->MCurContext, /*AllowConst=*/false);
790 
791  // Cannot setup link if candidate is linked already
792  if (LinkedAllocaCmdCand &&
793  !LinkedAllocaCmdCand->MLinkedAllocaCmd) {
794  LinkedAllocaCmd = LinkedAllocaCmdCand;
795  }
796  }
797  }
798  }
799 
800  AllocaCmd =
801  new AllocaCommand(Queue, FullReq, InitFromUserData, LinkedAllocaCmd);
802 
803  // Update linked command
804  if (LinkedAllocaCmd) {
805  Command *ConnCmd = AllocaCmd->addDep(
806  DepDesc{LinkedAllocaCmd, AllocaCmd->getRequirement(),
807  LinkedAllocaCmd},
808  ToCleanUp);
809  if (ConnCmd)
810  ToEnqueue.push_back(ConnCmd);
811  LinkedAllocaCmd->MLinkedAllocaCmd = AllocaCmd;
812 
813  // To ensure that the leader allocation is removed first
814  ConnCmd = AllocaCmd->getReleaseCmd()->addDep(
815  DepDesc(LinkedAllocaCmd->getReleaseCmd(),
816  AllocaCmd->getRequirement(), LinkedAllocaCmd),
817  ToCleanUp);
818  if (ConnCmd)
819  ToEnqueue.push_back(ConnCmd);
820 
821  // Device allocation takes ownership of the host ptr during
822  // construction, host allocation doesn't. So, device allocation should
823  // always be active here. Also if the "follower" command is a device one
824  // we have to change current context to the device one.
825  if (Queue == nullptr) {
826  AllocaCmd->MIsActive = false;
827  } else {
828  LinkedAllocaCmd->MIsActive = false;
829  Record->MCurContext = Context;
830 
831  std::set<Command *> Deps = findDepsForReq(Record, Req, Context);
832  for (Command *Dep : Deps) {
833  Command *ConnCmd = AllocaCmd->addDep(
834  DepDesc{Dep, Req, LinkedAllocaCmd}, ToCleanUp);
835  if (ConnCmd)
836  ToEnqueue.push_back(ConnCmd);
837  }
838  updateLeaves(Deps, Record, Req->MAccessMode, ToCleanUp);
839  addNodeToLeaves(Record, AllocaCmd, Req->MAccessMode, ToEnqueue);
840  }
841  }
842  }
843 
844  Record->MAllocaCommands.push_back(AllocaCmd);
845  Record->MWriteLeaves.push_back(AllocaCmd, ToEnqueue);
846  ++(AllocaCmd->MLeafCounter);
847  for (Command *Cmd : ToCleanUp)
848  cleanupCommand(Cmd);
849  }
850  return AllocaCmd;
851 }
852 
853 // The function sets MemModified flag in record if requirement has write access.
854 void Scheduler::GraphBuilder::markModifiedIfWrite(MemObjRecord *Record,
855  Requirement *Req) {
856  switch (Req->MAccessMode) {
857  case access::mode::write:
858  case access::mode::read_write:
859  case access::mode::discard_write:
860  case access::mode::discard_read_write:
861  case access::mode::atomic:
862  Record->MMemModified = true;
863  break;
864  case access::mode::read:
865  break;
866  }
867 }
868 
869 EmptyCommand *Scheduler::GraphBuilder::addEmptyCmd(
870  Command *Cmd, const std::vector<Requirement *> &Reqs,
871  Command::BlockReason Reason, std::vector<Command *> &ToEnqueue) {
872  EmptyCommand *EmptyCmd = new EmptyCommand();
873 
874  if (!EmptyCmd)
875  throw exception(make_error_code(errc::memory_allocation),
876  "Out of host memory");
877 
878  EmptyCmd->MIsBlockable = true;
879  EmptyCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueBlocked;
880  EmptyCmd->MBlockReason = Reason;
881 
882  for (Requirement *Req : Reqs) {
883  MemObjRecord *Record = getOrInsertMemObjRecord(nullptr, Req);
884  AllocaCommandBase *AllocaCmd =
885  getOrCreateAllocaForReq(Record, Req, nullptr, ToEnqueue);
886  EmptyCmd->addRequirement(Cmd, AllocaCmd, Req);
887  }
888  // addRequirement above calls addDep, which already registers EmptyCmd as a
889  // user of Cmd. Reqs may be empty, so add the user explicitly in that case.
890  if (Reqs.empty())
891  Cmd->addUser(EmptyCmd);
892 
893  const std::vector<DepDesc> &Deps = Cmd->MDeps;
894  std::vector<Command *> ToCleanUp;
895  for (const DepDesc &Dep : Deps) {
896  const Requirement *Req = Dep.MDepRequirement;
897  MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj);
898 
899  updateLeaves({Cmd}, Record, Req->MAccessMode, ToCleanUp);
900  addNodeToLeaves(Record, EmptyCmd, Req->MAccessMode, ToEnqueue);
901  }
902  for (Command *Cmd : ToCleanUp)
903  cleanupCommand(Cmd);
904 
905  return EmptyCmd;
906 }
907 
908 static bool isInteropHostTask(ExecCGCommand *Cmd) {
909  if (Cmd->getCG().getType() != CGType::CodeplayHostTask)
910  return false;
911 
912  const detail::CGHostTask &HT =
913  static_cast<detail::CGHostTask &>(Cmd->getCG());
914 
915  return HT.MHostTask->isInteropTask();
916 }
917 
918 static void combineAccessModesOfReqs(std::vector<Requirement *> &Reqs) {
919  std::unordered_map<SYCLMemObjI *, access::mode> CombinedModes;
920  bool HasDuplicateMemObjects = false;
921  for (const Requirement *Req : Reqs) {
922  auto Result = CombinedModes.insert(
923  std::make_pair(Req->MSYCLMemObj, Req->MAccessMode));
924  if (!Result.second) {
925  Result.first->second =
926  combineAccessModes(Result.first->second, Req->MAccessMode);
927  HasDuplicateMemObjects = true;
928  }
929  }
930 
931  if (!HasDuplicateMemObjects)
932  return;
933  for (Requirement *Req : Reqs) {
934  Req->MAccessMode = CombinedModes[Req->MSYCLMemObj];
935  }
936 }
937 
938 Scheduler::GraphBuildResult Scheduler::GraphBuilder::addCG(
939  std::unique_ptr<detail::CG> CommandGroup, const QueueImplPtr &Queue,
940  std::vector<Command *> &ToEnqueue, bool EventNeeded,
941  sycl::detail::pi::PiExtCommandBuffer CommandBuffer,
942  const std::vector<sycl::detail::pi::PiExtSyncPoint> &Dependencies) {
943  std::vector<Requirement *> &Reqs = CommandGroup->getRequirements();
944  std::vector<detail::EventImplPtr> &Events = CommandGroup->getEvents();
945 
946  auto NewCmd = std::make_unique<ExecCGCommand>(std::move(CommandGroup), Queue,
947  EventNeeded, CommandBuffer,
948  std::move(Dependencies));
949 
950  if (!NewCmd)
951  throw exception(make_error_code(errc::memory_allocation),
952  "Out of host memory");
953 
954  // Only device kernel command groups can participate in fusion. Otherwise,
955  // command groups take the regular route. If they create any requirement or
956  // event dependency on any of the kernels in the fusion list, this will lead
957  // to cancellation of the fusion in the GraphProcessor.
958  auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
959  if (isInFusionMode(QUniqueID)) {
960  if (NewCmd->isFusable()) {
961  auto *FusionCmd = findFusionList(QUniqueID)->second.get();
962 
963  bool dependsOnFusion = false;
964  for (auto Ev = Events.begin(); Ev != Events.end();) {
965  auto *EvDepCmd = static_cast<Command *>((*Ev)->getCommand());
966  if (!EvDepCmd) {
967  ++Ev;
968  continue;
969  }
970  // Event dependencies on commands part of another active fusion are
971  // handled by cancelling fusion in that other queue.
972  if (EvDepCmd->getQueue() != Queue && isPartOfActiveFusion(EvDepCmd)) {
973  printFusionWarning(
974  "Aborting fusion because of event dependency from a "
975  "different fusion");
976  cancelFusion(EvDepCmd->getQueue(), ToEnqueue);
977  }
978  // Check if this command depends on the placeholder command for the
979  // fusion it participates in.
980  if (EvDepCmd == FusionCmd) {
981  Ev = Events.erase(Ev);
982  dependsOnFusion = true;
983  } else {
984  ++Ev;
985  }
986  }
987 
988  // If this command has an explicit event dependency on the placeholder
989  // command for this fusion (because it used depends_on on the event
990  // returned by submitting another kernel to this fusion earlier), add a
991  // dependency on all the commands in the fusion list so far.
992  if (dependsOnFusion) {
993  for (auto *Cmd : FusionCmd->getFusionList()) {
994  Events.push_back(Cmd->getEvent());
995  }
996  }
997 
998  // Add the kernel to the graph, but delay the enqueue of any auxiliary
999  // commands (e.g., allocations) resulting from that process by adding them
1000  // to the list of auxiliary commands of the fusion command.
1001  createGraphForCommand(NewCmd.get(), NewCmd->getCG(),
1002  isInteropHostTask(NewCmd.get()), Reqs, Events,
1003  Queue, FusionCmd->auxiliaryCommands());
1004 
1005  // Set the fusion command, so we recognize when another command depends on
1006  // a kernel in the fusion list.
1007  FusionCmd->addToFusionList(NewCmd.get());
1008  NewCmd->MFusionCmd = FusionCmd;
1009  std::vector<Command *> ToCleanUp;
1010  // Add an event dependency from the fusion placeholder command to the new
1011  // kernel.
1012  auto ConnectionCmd = FusionCmd->addDep(NewCmd->getEvent(), ToCleanUp);
1013  if (ConnectionCmd) {
1014  FusionCmd->auxiliaryCommands().push_back(ConnectionCmd);
1015  }
1016  return {NewCmd.release(), FusionCmd->getEvent(), false};
1017  } else {
1018  std::string s;
1019  std::stringstream ss(s);
1020  if (NewCmd->getCG().getType() == CGType::Kernel) {
1021  ss << "Not fusing kernel with 'use_root_sync' property. Can only fuse "
1022  "non-cooperative device kernels.";
1023  } else {
1024  ss << "Not fusing '" << NewCmd->getTypeString()
1025  << "' command group. Can only fuse device kernel command groups.";
1026  }
1027  printFusionWarning(ss.str());
1028  }
1029  }
1030  createGraphForCommand(NewCmd.get(), NewCmd->getCG(),
1031  isInteropHostTask(NewCmd.get()), Reqs, Events, Queue,
1032  ToEnqueue);
1033  auto Event = NewCmd->getEvent();
1034  return {NewCmd.release(), Event, true};
1035 }
1036 
1037 void Scheduler::GraphBuilder::createGraphForCommand(
1038  Command *NewCmd, CG &CG, bool isInteropTask,
1039  std::vector<Requirement *> &Reqs,
1040  const std::vector<detail::EventImplPtr> &Events, QueueImplPtr Queue,
1041  std::vector<Command *> &ToEnqueue) {
1042 
1043  if (MPrintOptionsArray[BeforeAddCG])
1044  printGraphAsDot("before_addCG");
1045 
1046  // If there are multiple requirements for the same memory object, its
1047  // AllocaCommand creation will be dependent on the access mode of the first
1048  // requirement. Combine these access modes to take all of them into account.
1049  combineAccessModesOfReqs(Reqs);
1050  std::vector<Command *> ToCleanUp;
1051  for (Requirement *Req : Reqs) {
1052  MemObjRecord *Record = nullptr;
1053  AllocaCommandBase *AllocaCmd = nullptr;
1054 
1055  bool isSameCtx = false;
1056 
1057  {
1058  const QueueImplPtr &QueueForAlloca =
1059  isInteropTask ? static_cast<detail::CGHostTask &>(CG).MQueue : Queue;
1060 
1061  Record = getOrInsertMemObjRecord(QueueForAlloca, Req);
1062  markModifiedIfWrite(Record, Req);
1063 
1064  AllocaCmd =
1065  getOrCreateAllocaForReq(Record, Req, QueueForAlloca, ToEnqueue);
1066 
1067  isSameCtx = isOnSameContext(Record->MCurContext, QueueForAlloca);
1068  }
1069 
1070  // If there is an alloca command, we need to check if the latest memory is in
1071  // the required context.
1072  if (isSameCtx) {
1073  // If the memory is already in the required host context, check if the
1074  // required access mode is valid, remap if not.
1075  if (!Record->MCurContext &&
1076  !isAccessModeAllowed(Req->MAccessMode, Record->MHostAccess)) {
1077  remapMemoryObject(Record, Req,
1078  Req->MIsSubBuffer
1079  ? (static_cast<AllocaSubBufCommand *>(AllocaCmd))
1080  ->getParentAlloca()
1081  : AllocaCmd,
1082  ToEnqueue);
1083  }
1084  } else {
1085  // Cannot directly copy memory from OpenCL device to OpenCL device -
1086  // create two copies: device->host and host->device.
1087  bool NeedMemMoveToHost = false;
1088  auto MemMoveTargetQueue = Queue;
1089 
1090  if (isInteropTask) {
1091  const detail::CGHostTask &HT = static_cast<detail::CGHostTask &>(CG);
1092 
1093  if (!isOnSameContext(Record->MCurContext, HT.MQueue)) {
1094  NeedMemMoveToHost = true;
1095  MemMoveTargetQueue = HT.MQueue;
1096  }
1097  } else if (Queue && Record->MCurContext)
1098  NeedMemMoveToHost = true;
1099 
1100  if (NeedMemMoveToHost)
1101  insertMemoryMove(Record, Req, nullptr, ToEnqueue);
1102  insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue);
1103  }
1104 
1105  std::set<Command *> Deps =
1106  findDepsForReq(Record, Req, queue_impl::getContext(Queue));
1107 
1108  for (Command *Dep : Deps) {
1109  if (Dep != NewCmd) {
1110  Command *ConnCmd =
1111  NewCmd->addDep(DepDesc{Dep, Req, AllocaCmd}, ToCleanUp);
1112  if (ConnCmd)
1113  ToEnqueue.push_back(ConnCmd);
1114  }
1115  }
1116  }
1117 
1118  // Set new command as user for dependencies and update leaves.
1119  // Node dependencies can be modified further when adding the node to leaves,
1120  // iterate over their copy.
1121  // FIXME employ a reference here to eliminate copying of a vector
1122  std::vector<DepDesc> Deps = NewCmd->MDeps;
1123  for (DepDesc &Dep : Deps) {
1124  const Requirement *Req = Dep.MDepRequirement;
1125  MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj);
1126  updateLeaves({Dep.MDepCommand}, Record, Req->MAccessMode, ToCleanUp);
1127  addNodeToLeaves(Record, NewCmd, Req->MAccessMode, ToEnqueue);
1128  }
1129 
1130  // Register all the events as dependencies
1131  for (detail::EventImplPtr e : Events) {
1132  if (e->getCommand() && e->getCommand() == NewCmd) {
1133  continue;
1134  }
1135  if (Command *ConnCmd = NewCmd->addDep(e, ToCleanUp))
1136  ToEnqueue.push_back(ConnCmd);
1137  }
1138 
1139  if (MPrintOptionsArray[AfterAddCG])
1140  printGraphAsDot("after_addCG");
1141 
1142  for (Command *Cmd : ToCleanUp) {
1143  cleanupCommand(Cmd);
1144  }
1145 }
1146 
1147 void Scheduler::GraphBuilder::decrementLeafCountersForRecord(
1148  MemObjRecord *Record) {
1149  for (Command *Cmd : Record->MReadLeaves) {
1150  --(Cmd->MLeafCounter);
1151  if (Cmd->readyForCleanup())
1152  cleanupCommand(Cmd);
1153  }
1154  for (Command *Cmd : Record->MWriteLeaves) {
1155  --(Cmd->MLeafCounter);
1156  if (Cmd->readyForCleanup())
1157  cleanupCommand(Cmd);
1158  }
1159 }
1160 
1161 void Scheduler::GraphBuilder::cleanupCommandsForRecord(MemObjRecord *Record) {
1162  std::vector<AllocaCommandBase *> &AllocaCommands = Record->MAllocaCommands;
1163  if (AllocaCommands.empty())
1164  return;
1165 
1166  assert(MCmdsToVisit.empty());
1167  MVisitedCmds.clear();
1168 
1169  // First, mark all allocas for deletion and their direct users for traversal
1170  // Dependencies of the users will be cleaned up during the traversal
1171  for (Command *AllocaCmd : AllocaCommands) {
1172  markNodeAsVisited(AllocaCmd, MVisitedCmds);
1173 
1174  for (Command *UserCmd : AllocaCmd->MUsers)
1175  // Linked alloca cmd may be in users of this alloca. We're not going to
1176  // visit it.
1177  if (UserCmd->getType() != Command::CommandType::ALLOCA)
1178  MCmdsToVisit.push(UserCmd);
1179  else
1180  markNodeAsVisited(UserCmd, MVisitedCmds);
1181 
1182  AllocaCmd->MMarks.MToBeDeleted = true;
1183  // These commands will be deleted later, clear users now to avoid
1184  // updating them during edge removal
1185  AllocaCmd->MUsers.clear();
1186  }
1187 
1188  // Make sure the Linked Allocas are marked visited by the previous walk.
1189  // Remove allocation commands from the users of their dependencies.
1190  for (AllocaCommandBase *AllocaCmd : AllocaCommands) {
1191  AllocaCommandBase *LinkedCmd = AllocaCmd->MLinkedAllocaCmd;
1192 
1193  if (LinkedCmd) {
1194  assert(LinkedCmd->MMarks.MVisited);
1195  }
1196 
1197  for (DepDesc &Dep : AllocaCmd->MDeps)
1198  if (Dep.MDepCommand)
1199  Dep.MDepCommand->MUsers.erase(AllocaCmd);
1200  }
1201 
1202  // Traverse the graph using BFS
1203  while (!MCmdsToVisit.empty()) {
1204  Command *Cmd = MCmdsToVisit.front();
1205  MCmdsToVisit.pop();
1206 
1207  if (!markNodeAsVisited(Cmd, MVisitedCmds))
1208  continue;
1209 
1210  for (Command *UserCmd : Cmd->MUsers)
1211  if (UserCmd->getType() != Command::CommandType::ALLOCA)
1212  MCmdsToVisit.push(UserCmd);
1213 
1214  // Delete all dependencies on any allocations being removed
1215  // Track which commands should have their users updated
1216  std::map<Command *, bool> ShouldBeUpdated;
1217  auto NewEnd = std::remove_if(
1218  Cmd->MDeps.begin(), Cmd->MDeps.end(), [&](const DepDesc &Dep) {
1219  if (std::find(AllocaCommands.begin(), AllocaCommands.end(),
1220  Dep.MAllocaCmd) != AllocaCommands.end()) {
1221  ShouldBeUpdated.insert({Dep.MDepCommand, true});
1222  return true;
1223  }
1224  ShouldBeUpdated[Dep.MDepCommand] = false;
1225  return false;
1226  });
1227  Cmd->MDeps.erase(NewEnd, Cmd->MDeps.end());
1228 
1229  // Update users of removed dependencies
1230  for (auto DepCmdIt : ShouldBeUpdated) {
1231  if (!DepCmdIt.second)
1232  continue;
1233  DepCmdIt.first->MUsers.erase(Cmd);
1234  }
1235 
1236  // If all dependencies have been removed this way, mark the command for
1237  // deletion
1238  if (Cmd->MDeps.empty()) {
1239  Cmd->MUsers.clear();
1240  // Do not delete the node if it's scheduled for post-enqueue cleanup to
1241  // avoid double free.
1242  if (!Cmd->MMarkedForCleanup)
1243  Cmd->MMarks.MToBeDeleted = true;
1244  }
1245  }
1246 
1247  handleVisitedNodes(MVisitedCmds);
1248 }
1249 
1250 void Scheduler::GraphBuilder::cleanupCommand(
1251  Command *Cmd, [[maybe_unused]] bool AllowUnsubmitted) {
1252  if (SYCLConfig<SYCL_DISABLE_POST_ENQUEUE_CLEANUP>::get()) {
1253  static bool DeprWarningPrinted = false;
1254  if (!DeprWarningPrinted) {
1255  std::cerr << "WARNING: The environment variable "
1256  "SYCL_DISABLE_POST_ENQUEUE_CLEANUP is deprecated. Please "
1257  "use SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP instead.\n";
1258  DeprWarningPrinted = true;
1259  }
1260  return;
1261  }
1262  if (SYCLConfig<SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP>::get())
1263  return;
1264 
1265  assert(Cmd->MLeafCounter == 0 &&
1266  (Cmd->isSuccessfullyEnqueued() || AllowUnsubmitted));
1267  Command::CommandType CmdT = Cmd->getType();
1268 
1269  assert(CmdT != Command::ALLOCA && CmdT != Command::ALLOCA_SUB_BUF);
1270  assert(CmdT != Command::RELEASE);
1271  (void)CmdT;
1272 
1273  for (Command *UserCmd : Cmd->MUsers) {
1274  for (DepDesc &Dep : UserCmd->MDeps) {
1275  // Link the users of the command to the alloca command(s) instead
1276  if (Dep.MDepCommand == Cmd) {
1277  // ... unless the user is the alloca itself.
1278  if (Dep.MAllocaCmd == UserCmd) {
1279  Dep.MDepCommand = nullptr;
1280  } else {
1281  Dep.MDepCommand = Dep.MAllocaCmd;
1282  Dep.MDepCommand->MUsers.insert(UserCmd);
1283  }
1284  }
1285  }
1286  }
1287  // Update dependency users
1288  for (DepDesc &Dep : Cmd->MDeps) {
1289  Command *DepCmd = Dep.MDepCommand;
1290  DepCmd->MUsers.erase(Cmd);
1291  }
1292 
1293  if (Cmd->getType() == Command::FUSION &&
1294  !static_cast<KernelFusionCommand *>(Cmd)->readyForDeletion()) {
1295  // Fusion commands might still be needed because fusion might be aborted,
1296  // but a later call to complete_fusion still needs to be able to return a
1297  // valid event. Clean-up of fusion commands is therefore explicitly handled
1298  // by start fusion.
1299  return;
1300  }
1301  Cmd->getEvent()->setCommand(nullptr);
1302  delete Cmd;
1303 }
1304 
1305 void Scheduler::GraphBuilder::removeRecordForMemObj(SYCLMemObjI *MemObject) {
1306  const auto It = std::find_if(
1307  MMemObjs.begin(), MMemObjs.end(),
1308  [MemObject](const SYCLMemObjI *Obj) { return Obj == MemObject; });
1309  if (It != MMemObjs.end())
1310  MMemObjs.erase(It);
1311  MemObject->MRecord.reset();
1312 }
1313 
1314 // Make Cmd depend on DepEvent from different context. Connection is performed
1315 // via distinct ConnectCmd with host task command group on host queue. Cmd will
1316 // depend on ConnectCmd's host event.
1317 // DepEvent may not have a command associated with it in at least two cases:
1318 // - the command was deleted upon cleanup process;
1319 // - DepEvent is user event.
1320 // In both of these cases the only thing we can do is to make ConnectCmd depend
1321 // on DepEvent.
1322 // Otherwise, when there is a command associated with DepEvent, we make
1323 // ConnectCmd depend on this command. If there is a valid, i.e. non-null,
1324 // requirement in Dep, we make ConnectCmd depend on DepEvent's command with this
1325 // requirement.
1326 // Optionality of Dep is set by Dep.MDepCommand equal to nullptr.
1327 Command *Scheduler::GraphBuilder::connectDepEvent(
1328  Command *const Cmd, const EventImplPtr &DepEvent, const DepDesc &Dep,
1329  std::vector<Command *> &ToCleanUp) {
1330  assert(Cmd->getWorkerContext() != DepEvent->getContextImpl());
1331 
1332  // construct Host Task type command manually and make it depend on DepEvent
1333  ExecCGCommand *ConnectCmd = nullptr;
1334 
1335  try {
1336  std::shared_ptr<detail::HostTask> HT(new detail::HostTask);
1337  std::unique_ptr<detail::CG> ConnectCG(new detail::CGHostTask(
1338  std::move(HT), /* Queue = */ Cmd->getQueue(), /* Context = */ {},
1339  /* Args = */ {},
1340  detail::CG::StorageInitHelper(
1341  /* ArgsStorage = */ {}, /* AccStorage = */ {},
1342  /* SharedPtrStorage = */ {}, /* Requirements = */ {},
1343  /* DepEvents = */ {DepEvent}),
1344  CGType::CodeplayHostTask,
1345  /* Payload */ {}));
1346  ConnectCmd = new ExecCGCommand(std::move(ConnectCG), nullptr,
1347  /*EventNeeded=*/true);
1348  } catch (const std::bad_alloc &) {
1349  throw exception(make_error_code(errc::memory_allocation),
1350  "Out of host memory");
1351  }
1352 
1353  if (Dep.MDepRequirement) {
1354  // make ConnectCmd depend on requirement
1355  // Dismiss the result here as it's not a connection now,
1356  // because ConnectCmd is a host command.
1357  (void)ConnectCmd->addDep(Dep, ToCleanUp);
1358  assert(reinterpret_cast<Command *>(DepEvent->getCommand()) ==
1359  Dep.MDepCommand);
1360  // add user to Dep.MDepCommand is already performed beyond this if branch
1361  {
1362  DepDesc DepOnConnect = Dep;
1363  DepOnConnect.MDepCommand = ConnectCmd;
1364 
1365  // Dismiss the result here as it's not a connection now,
1366  // because ConnectCmd is a host command.
1367  std::ignore = Cmd->addDep(DepOnConnect, ToCleanUp);
1368  }
1369  } else {
1370  // In the requirement branch above, addDep registers the user itself; here
1371  // the user has to be added explicitly before adding the event dependency.
1372  if (Command *DepCmd = reinterpret_cast<Command *>(DepEvent->getCommand()))
1373  DepCmd->addUser(ConnectCmd);
1374 
1375  std::ignore = ConnectCmd->addDep(DepEvent, ToCleanUp);
1376 
1377  std::ignore = Cmd->addDep(ConnectCmd->getEvent(), ToCleanUp);
1378 
1379  ConnectCmd->addUser(Cmd);
1380  }
1381 
1382  return ConnectCmd;
1383 }
1384 
1385 void Scheduler::GraphBuilder::startFusion(QueueImplPtr Queue) {
1386  cleanUpCmdFusion(Queue.get());
1387  auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
1388  MFusionMap.emplace(QUniqueID, std::make_unique<KernelFusionCommand>(Queue));
1389 }
1390 
1391 void Scheduler::GraphBuilder::cleanUpCmdFusion(
1392  sycl::detail::queue_impl *Queue) {
1393  auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue);
1394  if (isInFusionMode(QUniqueID)) {
1395  throw sycl::exception{sycl::make_error_code(sycl::errc::invalid),
1396  "Queue already in fusion mode"};
1397  }
1398  auto OldFusionCmd = findFusionList(QUniqueID);
1399  if (OldFusionCmd != MFusionMap.end()) {
1400  // If fusion was used on this queue previously, the old fusion command might
1401  // still be around to make sure that even after
1402  // cancellation of the fusion due to synchronization, complete_fusion is
1403  // still able to return a valid event.
1404  OldFusionCmd->second->setFusionStatus(
1405  KernelFusionCommand::FusionStatus::DELETED);
1406  cleanupCommand(OldFusionCmd->second.release());
1407  MFusionMap.erase(OldFusionCmd);
1408  }
1409 }
1410 
1411 void Scheduler::GraphBuilder::removeNodeFromGraph(
1412  Command *Node, std::vector<Command *> &ToEnqueue) {
1413  // Remove the placeholder command as leaf of all its requirements and from the
1414  // user list of all its dependencies.
1415  for (auto &Dep : Node->MDeps) {
1416  auto AccessMode = Dep.MDepRequirement->MAccessMode;
1417  auto *Record = getMemObjRecord(Dep.MDepRequirement->MSYCLMemObj);
1418 
1419  Node->MLeafCounter -= Record->MReadLeaves.remove(Node);
1420  Node->MLeafCounter -= Record->MWriteLeaves.remove(Node);
1421  // If the placeholder had a write-requirement on this record, we need to
1422  // restore the previous leaves.
1423  if (AccessMode != access::mode::read) {
1424  for (auto PrevDep : Dep.MDepCommand->MDeps) {
1425  auto *DepReq = PrevDep.MDepRequirement;
1426  auto *DepRecord = getMemObjRecord(DepReq->MSYCLMemObj);
1427  if (DepRecord == Record) {
1428  // Need to restore this as a leaf, because we removed it from the
1429  // leaves when adding the placeholder command.
1430  assert(Dep.MDepCommand);
1431  addNodeToLeaves(Record, Dep.MDepCommand, DepReq->MAccessMode,
1432  ToEnqueue);
1433  }
1434  }
1435  }
1436  Dep.MDepCommand->MUsers.erase(Node);
1437  }
1438 
1439  // Clear all the dependencies to avoid cleanDepEventsThroughOneLevel, called
1440  // from the destructor of the command to delete the dependencies of the
1441  // command this command depends on.
1442  Node->clearAllDependencies();
1443 }
1444 
1445 void Scheduler::GraphBuilder::cancelFusion(QueueImplPtr Queue,
1446  std::vector<Command *> &ToEnqueue) {
1447  auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
1448  if (!isInFusionMode(QUniqueID)) {
1449  return;
1450  }
1451  auto FusionList = findFusionList(QUniqueID);
1452 
1453  auto *PlaceholderCmd = (*FusionList).second.get();
1454 
1455  // Enqueue all the kernels/commands from the fusion list
1456  auto FusedCmdList = PlaceholderCmd->getFusionList();
1457  ToEnqueue.insert(ToEnqueue.end(), FusedCmdList.begin(), FusedCmdList.end());
1458 
1459  // The commands establishing an event dependency between the fusion
1460  // placeholder command and the individual kernels need to be enqueued.
1461  ToEnqueue.insert(ToEnqueue.end(), PlaceholderCmd->auxiliaryCommands().begin(),
1462  PlaceholderCmd->auxiliaryCommands().end());
1463 
1464  ToEnqueue.push_back(PlaceholderCmd);
1465 
1466  if (MPrintOptionsArray[AfterFusionCancel]) {
1467  printGraphAsDot("after_fusionCancel");
1468  }
1469 
1470  // Set the status for the fusion command
1471  PlaceholderCmd->setFusionStatus(KernelFusionCommand::FusionStatus::CANCELLED);
1472 }
1473 
1474 static bool isPartOfFusion(Command *Cmd, KernelFusionCommand *Fusion) {
1475  if (Cmd->getType() == Command::RUN_CG) {
1476  return static_cast<ExecCGCommand *>(Cmd)->MFusionCmd == Fusion;
1477  }
1478  return false;
1479 }
1480 
1481 static bool checkForCircularDependency(Command *, bool, KernelFusionCommand *);
1482 
1483 static bool createsCircularDependency(Command *Cmd, bool PredPartOfFusion,
1484  KernelFusionCommand *Fusion) {
1485  if (isPartOfFusion(Cmd, Fusion)) {
1486  // If this is part of the fusion and the predecessor also was, we can stop
1487  // the traversal here. A direct dependency between two kernels in the same
1488  // fusion will never form a cyclic dependency and by iterating over all
1489  // commands in a fusion, we will detect any cycles originating from the
1490  // current command.
1491  // If the predecessor was not part of the fusion, but the current command
1492  // is, we have found a potential cycle in the dependency graph.
1493  return !PredPartOfFusion;
1494  }
1495  return checkForCircularDependency(Cmd, false, Fusion);
1496 }
1497 
1498 static bool checkForCircularDependency(Command *Cmd, bool IsPartOfFusion,
1499  KernelFusionCommand *Fusion) {
1500  // Check the requirement dependencies.
1501  for (auto &Dep : Cmd->MDeps) {
1502  auto *DepCmd = Dep.MDepCommand;
1503  if (!DepCmd) {
1504  continue;
1505  }
1506  if (createsCircularDependency(DepCmd, IsPartOfFusion, Fusion)) {
1507  return true;
1508  }
1509  }
1510  for (auto &Ev : Cmd->getPreparedDepsEvents()) {
1511  auto *EvDepCmd = static_cast<Command *>(Ev->getCommand());
1512  if (!EvDepCmd) {
1513  continue;
1514  }
1515  if (createsCircularDependency(EvDepCmd, IsPartOfFusion, Fusion)) {
1516  return true;
1517  }
1518  }
1519  for (auto &Ev : Cmd->getPreparedHostDepsEvents()) {
1520  auto *EvDepCmd = static_cast<Command *>(Ev->getCommand());
1521  if (!EvDepCmd) {
1522  continue;
1523  }
1524  if (createsCircularDependency(EvDepCmd, IsPartOfFusion, Fusion)) {
1525  return true;
1526  }
1527  }
1528  return false;
1529 }
1530 
1531 EventImplPtr
1532 Scheduler::GraphBuilder::completeFusion(QueueImplPtr Queue,
1533  std::vector<Command *> &ToEnqueue,
1534  const property_list &PropList) {
1535  auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
1536 #if SYCL_EXT_CODEPLAY_KERNEL_FUSION
1537  if (!isInFusionMode(QUniqueID)) {
1538  auto InactiveFusionList = findFusionList(QUniqueID);
1539  if (InactiveFusionList == MFusionMap.end()) {
1540  throw sycl::exception{
1541  sycl::make_error_code(sycl::errc::invalid),
1542  "Calling complete_fusion on a queue not in fusion mode"};
1543  }
1544  return InactiveFusionList->second->getEvent();
1545  }
1546 
1547  auto FusionList = findFusionList(QUniqueID);
1548  auto *PlaceholderCmd = FusionList->second.get();
1549  auto &CmdList = PlaceholderCmd->getFusionList();
1550 
1551  // If there is more than one queue currently in fusion mode, we need to check
1552  // if fusing the kernel would create a circular dependency. A circular
1553  // dependency would arise, if a kernel in the fusion list *indirectly* depends
1554  // on another kernel in the fusion list. Here, indirectly means, that the
1555  // dependency is created through a third command not part of the fusion, on
1556  // which this kernel depends and which in turn depends on another kernel in
1557  // fusion list.
1558  //
1559  // Note that we only have to consider dependencies via fusion queues here:
1560  // Let K1 be a kernel submitted to a queue Q1 in fusion mode. If a kernel K2
1561  // is submitted to a non-fusion queue Q2 and K2 depends on K1, fusion on Q1 is
1562  // cancelled automatically.
1563  bool CreatesCircularDep =
1564  MFusionMap.size() > 1 &&
1565  std::any_of(CmdList.begin(), CmdList.end(), [&](ExecCGCommand *Cmd) {
1566  return checkForCircularDependency(Cmd, true, PlaceholderCmd);
1567  });
1568  if (CreatesCircularDep) {
1569  // If fusing would create a circular dependency, cancel the fusion.
1570  printFusionWarning(
1571  "Aborting fusion because it would create a circular dependency");
1572  auto LastEvent = PlaceholderCmd->getEvent();
1573  this->cancelFusion(Queue, ToEnqueue);
1574  return LastEvent;
1575  }
1576 
1577  // Call the JIT compiler to generate a new fused kernel.
1578  auto FusedCG = detail::jit_compiler::get_instance().fuseKernels(
1579  Queue, CmdList, PropList);
1580 
1581  if (!FusedCG) {
1582  // If the JIT compiler returns a nullptr, JIT compilation of the fused
1583  // kernel failed. In that case, simply cancel the fusion and run each kernel
1584  // on its own.
1585  auto LastEvent = PlaceholderCmd->getEvent();
1586  this->cancelFusion(Queue, ToEnqueue);
1587  return LastEvent;
1588  }
1589 
1590  // Inherit all event dependencies from the input commands in the fusion list.
1591  std::vector<EventImplPtr> FusedEventDeps;
1592  for (auto *Cmd : CmdList) {
1593  FusedEventDeps.insert(FusedEventDeps.end(),
1594  Cmd->getPreparedDepsEvents().begin(),
1595  Cmd->getPreparedDepsEvents().end());
1596  FusedEventDeps.insert(FusedEventDeps.end(),
1597  Cmd->getPreparedHostDepsEvents().begin(),
1598  Cmd->getPreparedHostDepsEvents().end());
1599  }
1600 
1601  // Remove internal explicit dependencies, i.e., explicit dependencies from one
1602  // kernel in the fusion list to another kernel also in the fusion list.
1603  FusedEventDeps.erase(
1604  std::remove_if(FusedEventDeps.begin(), FusedEventDeps.end(),
1605  [&](EventImplPtr &E) {
1606  if (E->getCommand() == PlaceholderCmd) {
1607  return true;
1608  }
1609  if (E->getCommand() &&
1610  static_cast<Command *>(E->getCommand())->getType() ==
1611  Command::RUN_CG) {
1612  auto *RunCGCmd =
1613  static_cast<ExecCGCommand *>(E->getCommand());
1614  if (RunCGCmd->MFusionCmd == PlaceholderCmd) {
1615  return true;
1616  }
1617  }
1618  return false;
1619  }),
1620  FusedEventDeps.end());
1621 
1622  auto FusedKernelCmd = std::make_unique<ExecCGCommand>(
1623  std::move(FusedCG), Queue, /*EventNeeded=*/true);
1624 
1625  // Inherit auxiliary resources from fused command groups
1626  Scheduler::getInstance().takeAuxiliaryResources(FusedKernelCmd->getEvent(),
1627  PlaceholderCmd->getEvent());
1628  assert(PlaceholderCmd->MDeps.empty());
1629  // Next, iterate backwards over all the commands in the fusion list and remove
1630  // them from the graph to restore the state before fusion was started, so we
1631  // can add the fused kernel to the graph in the next step.
1632  // Clean up the old commands after successfully fusing them.
1633  for (auto OldCmd = CmdList.rbegin(); OldCmd != CmdList.rend(); ++OldCmd) {
1634  removeNodeFromGraph(*OldCmd, ToEnqueue);
1635  cleanupCommand(*OldCmd, /* AllowUnsubmitted */ true);
1636  }
1637 
1638  createGraphForCommand(FusedKernelCmd.get(), FusedKernelCmd->getCG(), false,
1639  FusedKernelCmd->getCG().getRequirements(),
1640  FusedEventDeps, Queue, ToEnqueue);
1641 
1642  ToEnqueue.push_back(FusedKernelCmd.get());
1643 
1644  std::vector<Command *> ToCleanUp;
1645  // Make the placeholder command depend on the execution of the fused kernel
1646  auto *ConnectToPlaceholder =
1647  PlaceholderCmd->addDep(FusedKernelCmd->getEvent(), ToCleanUp);
1648  if (ConnectToPlaceholder) {
1649  ToEnqueue.push_back(ConnectToPlaceholder);
1650  }
1651  for (Command *Cmd : ToCleanUp) {
1652  cleanupCommand(Cmd);
1653  }
1654  ToEnqueue.push_back(PlaceholderCmd);
1655 
1656  if (MPrintOptionsArray[AfterFusionComplete]) {
1657  printGraphAsDot("after_fusionComplete");
1658  }
1659 
1660  // Set the status for the fusion command.
1661  PlaceholderCmd->setFusionStatus(KernelFusionCommand::FusionStatus::COMPLETE);
1662 
1663  return FusedKernelCmd.release()->getEvent();
1664 #else // SYCL_EXT_CODEPLAY_KERNEL_FUSION
1665  printFusionWarning("Kernel fusion not supported by this build");
1666  (void)PropList;
1667  auto FusionList = findFusionList(QUniqueID);
1668  auto *PlaceholderCmd = FusionList->second.get();
1669  auto LastEvent = PlaceholderCmd->getEvent();
1670  this->cancelFusion(Queue, ToEnqueue);
1671  return LastEvent;
1672 #endif // SYCL_EXT_CODEPLAY_KERNEL_FUSION
1673 }
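For context, completeFusion is reached when user code closes a fusion region on a queue. A minimal usage sketch, assuming the fusion_wrapper API and the enable_fusion queue property proposed by the SYCL_EXT_CODEPLAY_KERNEL_FUSION extension (these names come from the extension proposal, not from this file, and may differ between releases):

#include <sycl/sycl.hpp>

int main() {
  namespace cpexp = sycl::ext::codeplay::experimental;
  // The extension requires fusion to be enabled when the queue is created.
  sycl::queue Q{cpexp::property::queue::enable_fusion{}};
  cpexp::fusion_wrapper FW{Q};

  FW.start_fusion(); // the queue enters fusion mode
  Q.submit([&](sycl::handler &CGH) { CGH.single_task([] { /* kernel A */ }); });
  Q.submit([&](sycl::handler &CGH) { CGH.single_task([] { /* kernel B */ }); });
  // Ends the fusion region; the scheduler either JIT-fuses A and B
  // (GraphBuilder::completeFusion) or cancels and runs them separately.
  FW.complete_fusion();
  Q.wait();
  return 0;
}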
1674 
1675 bool Scheduler::GraphBuilder::isInFusionMode(QueueIdT Id) {
1676  auto FusionList = findFusionList(Id);
1677  if (FusionList == MFusionMap.end()) {
1678  return false;
1679  }
1680  return FusionList->second->isActive();
1681 }
1682 
1683 Command *Scheduler::GraphBuilder::addCommandGraphUpdate(
1684  ext::oneapi::experimental::detail::exec_graph_impl *Graph,
1685  std::vector<std::shared_ptr<ext::oneapi::experimental::detail::node_impl>>
1686  Nodes,
1687  const QueueImplPtr &Queue, std::vector<Requirement *> Requirements,
1688  std::vector<detail::EventImplPtr> &Events,
1689  std::vector<Command *> &ToEnqueue) {
1690  auto NewCmd =
1691  std::make_unique<UpdateCommandBufferCommand>(Queue, Graph, Nodes);
1692  // If there are multiple requirements for the same memory object, its
1693  // AllocaCommand creation will be dependent on the access mode of the first
1694  // requirement. Combine these access modes to take all of them into account.
1695  combineAccessModesOfReqs(Requirements);
1696  std::vector<Command *> ToCleanUp;
1697  for (Requirement *Req : Requirements) {
1698  MemObjRecord *Record = nullptr;
1699  AllocaCommandBase *AllocaCmd = nullptr;
1700 
1701  bool isSameCtx = false;
1702 
1703  {
1704 
1705  Record = getOrInsertMemObjRecord(Queue, Req);
1706  markModifiedIfWrite(Record, Req);
1707 
1708  AllocaCmd = getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue);
1709 
1710  isSameCtx = isOnSameContext(Record->MCurContext, Queue);
1711  }
1712 
1713  if (!isSameCtx) {
1714  // Cannot directly copy memory from OpenCL device to OpenCL device -
1715  // create two copies: device->host and host->device.
1716  bool NeedMemMoveToHost = false;
1717  auto MemMoveTargetQueue = Queue;
1718 
1719  if (Queue && Record->MCurContext)
1720  NeedMemMoveToHost = true;
1721 
1722  if (NeedMemMoveToHost)
1723  insertMemoryMove(Record, Req, nullptr, ToEnqueue);
1724  insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue);
1725  }
1726  std::set<Command *> Deps =
1727  findDepsForReq(Record, Req, queue_impl::getContext(Queue));
1728 
1729  for (Command *Dep : Deps) {
1730  if (Dep != NewCmd.get()) {
1731  Command *ConnCmd =
1732  NewCmd->addDep(DepDesc{Dep, Req, AllocaCmd}, ToCleanUp);
1733  if (ConnCmd)
1734  ToEnqueue.push_back(ConnCmd);
1735  }
1736  }
1737  }
1738 
1739  // Set new command as user for dependencies and update leaves.
1740  // Node dependencies can be modified further when adding the node to leaves,
1741  // so iterate over a copy of them.
1742  // FIXME employ a reference here to eliminate copying of a vector
1743  std::vector<DepDesc> Deps = NewCmd->MDeps;
1744  for (DepDesc &Dep : Deps) {
1745  const Requirement *Req = Dep.MDepRequirement;
1746  MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj);
1747  updateLeaves({Dep.MDepCommand}, Record, Req->MAccessMode, ToCleanUp);
1748  addNodeToLeaves(Record, NewCmd.get(), Req->MAccessMode, ToEnqueue);
1749  }
1750 
1751  // Register all the events as dependencies
1752  for (detail::EventImplPtr e : Events) {
1753  if (e->getCommand() &&
1754  e->getCommand() == static_cast<Command *>(NewCmd.get())) {
1755  continue;
1756  }
1757  if (Command *ConnCmd = NewCmd->addDep(e, ToCleanUp))
1758  ToEnqueue.push_back(ConnCmd);
1759  }
1760 
1761  if (MPrintOptionsArray[AfterAddCG])
1762  printGraphAsDot("after_addCG");
1763 
1764  for (Command *Cmd : ToCleanUp) {
1765  cleanupCommand(Cmd);
1766  }
1767 
1768  return NewCmd.release();
1769 }
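addCommandGraphUpdate handles updates to an executable command_graph whose refreshed nodes still carry buffer requirements or event dependencies, so the new UpdateCommandBufferCommand must be wired into the execution graph like any other command. A hedged usage sketch of how such an update might be triggered, assuming the sycl_ext_oneapi_graph extension API (command_graph, the updatable property, and executable-graph update are taken from the extension proposal and may differ between releases):

#include <sycl/sycl.hpp>

namespace exp = sycl::ext::oneapi::experimental;

int main() {
  sycl::queue Q;
  exp::command_graph<exp::graph_state::modifiable> G{Q.get_context(),
                                                     Q.get_device()};

  G.begin_recording(Q);
  Q.submit([&](sycl::handler &CGH) { CGH.single_task([] { /* kernel */ }); });
  G.end_recording(Q);

  // Finalizing with the updatable property keeps the backend command buffer
  // modifiable, so later updates can be applied without a full rebuild.
  auto ExecG = G.finalize(exp::property::graph::updatable{});
  Q.ext_oneapi_graph(ExecG).wait();

  // Whole-graph update: the executable graph's nodes are refreshed from G.
  // Updates whose nodes involve accessors or outstanding events are routed
  // through the scheduler and end up in the command built by this function.
  ExecG.update(G);
  Q.ext_oneapi_graph(ExecG).wait();
  return 0;
}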
1770 
1771 } // namespace detail
1772 } // namespace _V1
1773 } // namespace sycl