DPC++ Runtime
Runtime libraries for oneAPI DPC++
graph_builder.cpp
1 //===-- graph_builder.cpp - SYCL Graph Builder ------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "detail/config.hpp"
10 #include <detail/context_impl.hpp>
11 #include <detail/event_impl.hpp>
12 #include <sstream>
13 #include <sycl/feature_test.hpp>
14 #if SYCL_EXT_CODEPLAY_KERNEL_FUSION
15 #include <detail/jit_compiler.hpp>
16 #endif
17 #include <detail/graph_impl.hpp>
19 #include <detail/queue_impl.hpp>
22 #include <sycl/access/access.hpp>
23 #include <sycl/exception.hpp>
24 
25 #include <algorithm>
26 #include <cstdlib>
27 #include <cstring>
28 #include <fstream>
29 #include <map>
30 #include <memory>
31 #include <queue>
32 #include <set>
33 #include <vector>
34 
35 namespace sycl {
36 inline namespace _V1 {
37 namespace detail {
38 
43 // TODO merge with LeavesCollection's version of doOverlap (see
44 // leaves_collection.cpp).
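// A conservative check for overlapping byte ranges of two requirements on the
// same memory object.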
45 static bool doOverlap(const Requirement *LHS, const Requirement *RHS) {
46  return (LHS->MOffsetInBytes + LHS->MAccessRange.size() * LHS->MElemSize >=
47  RHS->MOffsetInBytes) ||
48  (RHS->MOffsetInBytes + RHS->MAccessRange.size() * RHS->MElemSize >=
49  LHS->MOffsetInBytes);
50 }
51 
52 static bool sameCtx(const ContextImplPtr &LHS, const ContextImplPtr &RHS) {
53  // Consider two different host contexts to be the same to avoid additional
54  // allocation on the host
55  return LHS == RHS || (LHS->is_host() && RHS->is_host());
56 }
57 
59 static bool IsSuitableSubReq(const Requirement *Req) {
60  return Req->MIsSubBuffer;
61 }
62 
64 static bool isAccessModeAllowed(access::mode Required, access::mode Current) {
65  switch (Current) {
66  case access::mode::read:
67  return (Required == Current);
68  case access::mode::write:
69  assert(false && "Write only access is expected to be mapped as read_write");
70  return (Required == Current || Required == access::mode::discard_write);
71  case access::mode::read_write:
72  case access::mode::discard_write:
73  case access::mode::discard_read_write:
74  case access::mode::atomic:
75  return true;
76  }
77  assert(false);
78  return false;
79 }
80 
82 static access::mode combineAccessModes(access::mode A, access::mode B) {
83  if (A == B)
84  return A;
85 
86  if (A == access::mode::discard_write &&
87  (B == access::mode::discard_read_write || B == access::mode::write))
88  return B;
89 
90  if (B == access::mode::discard_write &&
91  (A == access::mode::discard_read_write || A == access::mode::write))
92  return A;
93 
94  return access::mode::read_write;
95 }
96 
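// The SYCL_PRINT_EXECUTION_GRAPH environment variable is matched by substring,
// so a value such as "before_addCG,after_addCG" (or simply "always") enables
// the corresponding .dot dumps below.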
97 Scheduler::GraphBuilder::GraphBuilder() {
98  if (const char *EnvVarCStr = SYCLConfig<SYCL_PRINT_EXECUTION_GRAPH>::get()) {
99  std::string GraphPrintOpts(EnvVarCStr);
100  bool EnableAlways = GraphPrintOpts.find("always") != std::string::npos;
101 
102  if (GraphPrintOpts.find("before_addCG") != std::string::npos ||
103  EnableAlways)
104  MPrintOptionsArray[BeforeAddCG] = true;
105  if (GraphPrintOpts.find("after_addCG") != std::string::npos || EnableAlways)
106  MPrintOptionsArray[AfterAddCG] = true;
107  if (GraphPrintOpts.find("before_addCopyBack") != std::string::npos ||
108  EnableAlways)
109  MPrintOptionsArray[BeforeAddCopyBack] = true;
110  if (GraphPrintOpts.find("after_addCopyBack") != std::string::npos ||
111  EnableAlways)
112  MPrintOptionsArray[AfterAddCopyBack] = true;
113  if (GraphPrintOpts.find("before_addHostAcc") != std::string::npos ||
114  EnableAlways)
115  MPrintOptionsArray[BeforeAddHostAcc] = true;
116  if (GraphPrintOpts.find("after_addHostAcc") != std::string::npos ||
117  EnableAlways)
118  MPrintOptionsArray[AfterAddHostAcc] = true;
119  if (GraphPrintOpts.find("after_fusionComplete") != std::string::npos ||
120  EnableAlways)
121  MPrintOptionsArray[AfterFusionComplete] = true;
122  if (GraphPrintOpts.find("after_fusionCancel") != std::string::npos ||
123  EnableAlways)
124  MPrintOptionsArray[AfterFusionCancel] = true;
125  }
126 }
127 
128 static bool markNodeAsVisited(Command *Cmd, std::vector<Command *> &Visited) {
129  assert(Cmd && "Cmd can't be nullptr");
130  if (Cmd->MMarks.MVisited)
131  return false;
132  Cmd->MMarks.MVisited = true;
133  Visited.push_back(Cmd);
134  return true;
135 }
136 
137 static void unmarkVisitedNodes(std::vector<Command *> &Visited) {
138  for (Command *Cmd : Visited)
139  Cmd->MMarks.MVisited = false;
140 }
141 
142 static void handleVisitedNodes(std::vector<Command *> &Visited) {
143  for (Command *Cmd : Visited) {
144  if (Cmd->MMarks.MToBeDeleted) {
145  if (Cmd->getType() == Command::FUSION &&
146  !static_cast<KernelFusionCommand *>(Cmd)->readyForDeletion()) {
147  // Fusion commands might still be needed because fusion might be
148  // aborted, but a later call to complete_fusion still needs to be able
149  // to return a valid event. Clean-up of fusion commands is therefore
150  // explicitly handled by start fusion.
151  return;
152  }
153  Cmd->getEvent()->setCommand(nullptr);
154  delete Cmd;
155  } else
156  Cmd->MMarks.MVisited = false;
157  }
158 }
159 
160 static void printDotRecursive(std::fstream &Stream,
161  std::vector<Command *> &Visited, Command *Cmd) {
162  if (!markNodeAsVisited(Cmd, Visited))
163  return;
164  for (Command *User : Cmd->MUsers) {
165  if (User)
166  printDotRecursive(Stream, Visited, User);
167  }
168  Cmd->printDot(Stream);
169 }
170 
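// Dumps the graph reachable from the alloca commands of all registered memory
// objects to a Graphviz file; e.g. the first dump for mode "before_addCG" is
// written to "graph_0before_addCG.dot".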
171 void Scheduler::GraphBuilder::printGraphAsDot(const char *ModeName) {
172  static size_t Counter = 0;
173  std::string ModeNameStr(ModeName);
174  std::string FileName =
175  "graph_" + std::to_string(Counter) + ModeNameStr + ".dot";
176 
177  Counter++;
178 
179  std::fstream Stream(FileName, std::ios::out);
180  Stream << "strict digraph {" << std::endl;
181 
182  MVisitedCmds.clear();
183 
184  for (SYCLMemObjI *MemObject : MMemObjs)
185  for (Command *AllocaCmd : MemObject->MRecord->MAllocaCommands)
186  printDotRecursive(Stream, MVisitedCmds, AllocaCmd);
187 
188  Stream << "}" << std::endl;
189 
190  unmarkVisitedNodes(MVisitedCmds);
191 }
192 
193 MemObjRecord *Scheduler::GraphBuilder::getMemObjRecord(SYCLMemObjI *MemObject) {
194  return MemObject->MRecord.get();
195 }
196 
197 MemObjRecord *Scheduler::GraphBuilder::getOrInsertMemObjRecord(
198  const QueueImplPtr &Queue, const Requirement *Req,
199  std::vector<Command *> &ToEnqueue) {
200  SYCLMemObjI *MemObject = Req->MSYCLMemObj;
201  MemObjRecord *Record = getMemObjRecord(MemObject);
202 
203  if (nullptr != Record)
204  return Record;
205 
206  const size_t LeafLimit = 8;
207  LeavesCollection::AllocateDependencyF AllocateDependency =
208  [this](Command *Dependant, Command *Dependency, MemObjRecord *Record,
209  LeavesCollection::EnqueueListT &ToEnqueue) {
210  // Add the old leaf as a dependency for the new one by duplicating one
211  // of the requirements for the current record
212  DepDesc Dep = findDepForRecord(Dependant, Record);
213  Dep.MDepCommand = Dependency;
214  std::vector<Command *> ToCleanUp;
215  Command *ConnectionCmd = Dependant->addDep(Dep, ToCleanUp);
216  if (ConnectionCmd)
217  ToEnqueue.push_back(ConnectionCmd);
218 
219  --(Dependency->MLeafCounter);
220  if (Dependency->readyForCleanup())
221  ToCleanUp.push_back(Dependency);
222  for (Command *Cmd : ToCleanUp)
223  cleanupCommand(Cmd);
224  };
225 
226  const ContextImplPtr &InteropCtxPtr = Req->MSYCLMemObj->getInteropContext();
227  if (InteropCtxPtr) {
228  // The memory object has been constructed using an interoperability
229  // constructor, which means there is already an allocation (cl_mem) in some
230  // context. Register this allocation in the SYCL graph.
231 
232  std::vector<sycl::device> Devices =
233  InteropCtxPtr->get_info<info::context::devices>();
234  assert(Devices.size() != 0);
235  DeviceImplPtr Dev = detail::getSyclObjImpl(Devices[0]);
236 
237  // Since all Scheduler commands require a queue, but we only have a context
238  // here, we need to create a dummy queue bound to the context and to one of
239  // the devices from that context.
240  QueueImplPtr InteropQueuePtr{new detail::queue_impl{
241  Dev, InteropCtxPtr, /*AsyncHandler=*/{}, /*PropertyList=*/{}}};
242 
243  MemObject->MRecord.reset(
244  new MemObjRecord{InteropCtxPtr, LeafLimit, AllocateDependency});
245  getOrCreateAllocaForReq(MemObject->MRecord.get(), Req, InteropQueuePtr,
246  ToEnqueue);
247  } else
248  MemObject->MRecord.reset(new MemObjRecord{Queue->getContextImplPtr(),
249  LeafLimit, AllocateDependency});
250 
251  MMemObjs.push_back(MemObject);
252  return MemObject->MRecord.get();
253 }
254 
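// Removes Cmds from the record's read/write leaf lists unless the new access
// is read-only; commands that become ready for cleanup are collected in
// ToCleanUp.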
255 void Scheduler::GraphBuilder::updateLeaves(const std::set<Command *> &Cmds,
256  MemObjRecord *Record,
257  access::mode AccessMode,
258  std::vector<Command *> &ToCleanUp) {
259 
260  const bool ReadOnlyReq = AccessMode == access::mode::read;
261  if (ReadOnlyReq)
262  return;
263 
264  for (Command *Cmd : Cmds) {
265  bool WasLeaf = Cmd->MLeafCounter > 0;
266  Cmd->MLeafCounter -= Record->MReadLeaves.remove(Cmd);
267  Cmd->MLeafCounter -= Record->MWriteLeaves.remove(Cmd);
268  if (WasLeaf && Cmd->readyForCleanup()) {
269  ToCleanUp.push_back(Cmd);
270  }
271  }
272 }
273 
274 void Scheduler::GraphBuilder::addNodeToLeaves(
275  MemObjRecord *Record, Command *Cmd, access::mode AccessMode,
276  std::vector<Command *> &ToEnqueue) {
277  LeavesCollection &Leaves{AccessMode == access::mode::read
278  ? Record->MReadLeaves
279  : Record->MWriteLeaves};
280  if (Leaves.push_back(Cmd, ToEnqueue))
281  ++Cmd->MLeafCounter;
282 }
283 
284 UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd(
285  MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue,
286  std::vector<Command *> &ToEnqueue) {
287  AllocaCommandBase *AllocaCmd =
288  findAllocaForReq(Record, Req, Queue->getContextImplPtr());
289  assert(AllocaCmd && "There must be alloca for requirement!");
290  UpdateHostRequirementCommand *UpdateCommand =
291  new UpdateHostRequirementCommand(Queue, *Req, AllocaCmd, &Req->MData);
292  // A copy of the requirement is needed because, if it were stored by pointer,
293  // the dependencies would become invalid after the host accessor's destructor runs.
294  const Requirement *StoredReq = UpdateCommand->getRequirement();
295 
296  std::set<Command *> Deps =
297  findDepsForReq(Record, Req, Queue->getContextImplPtr());
298  std::vector<Command *> ToCleanUp;
299  for (Command *Dep : Deps) {
300  Command *ConnCmd =
301  UpdateCommand->addDep(DepDesc{Dep, StoredReq, AllocaCmd}, ToCleanUp);
302  if (ConnCmd)
303  ToEnqueue.push_back(ConnCmd);
304  }
305  updateLeaves(Deps, Record, Req->MAccessMode, ToCleanUp);
306  addNodeToLeaves(Record, UpdateCommand, Req->MAccessMode, ToEnqueue);
307  for (Command *Cmd : ToCleanUp)
308  cleanupCommand(Cmd);
309  return UpdateCommand;
310 }
311 
312 // Takes linked alloca commands. Makes AllocaCmdDst command active using map
313 // or unmap operation.
314 static Command *insertMapUnmapForLinkedCmds(AllocaCommandBase *AllocaCmdSrc,
315  AllocaCommandBase *AllocaCmdDst,
316  access::mode MapMode) {
317  assert(AllocaCmdSrc->MLinkedAllocaCmd == AllocaCmdDst &&
318  "Expected linked alloca commands");
319  assert(AllocaCmdSrc->MIsActive &&
320  "Expected source alloca command to be active");
321 
322  if (AllocaCmdSrc->getQueue()->is_host()) {
323  UnMapMemObject *UnMapCmd = new UnMapMemObject(
324  AllocaCmdDst, *AllocaCmdDst->getRequirement(),
325  &AllocaCmdSrc->MMemAllocation, AllocaCmdDst->getQueue());
326 
327  std::swap(AllocaCmdSrc->MIsActive, AllocaCmdDst->MIsActive);
328 
329  return UnMapCmd;
330  }
331 
332  MapMemObject *MapCmd = new MapMemObject(
333  AllocaCmdSrc, *AllocaCmdSrc->getRequirement(),
334  &AllocaCmdDst->MMemAllocation, AllocaCmdSrc->getQueue(), MapMode);
335 
336  std::swap(AllocaCmdSrc->MIsActive, AllocaCmdDst->MIsActive);
337 
338  return MapCmd;
339 }
340 
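// Moves the most recent copy of the memory described by Req to the context of
// Queue: linked allocations are switched via a map/unmap command, discard
// accesses only update the current context, and otherwise a full MemCpyCommand
// between contexts is inserted.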
341 Command *Scheduler::GraphBuilder::insertMemoryMove(
342  MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue,
343  std::vector<Command *> &ToEnqueue) {
344 
345  AllocaCommandBase *AllocaCmdDst =
346  getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue);
347  if (!AllocaCmdDst)
348  throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
349 
350  std::set<Command *> Deps =
351  findDepsForReq(Record, Req, Queue->getContextImplPtr());
352  Deps.insert(AllocaCmdDst);
353  // Get the parent allocation of the sub-buffer to perform a full copy of the whole buffer
354  if (IsSuitableSubReq(Req)) {
355  if (AllocaCmdDst->getType() == Command::CommandType::ALLOCA_SUB_BUF)
356  AllocaCmdDst =
357  static_cast<AllocaSubBufCommand *>(AllocaCmdDst)->getParentAlloca();
358  }
359 
360  AllocaCommandBase *AllocaCmdSrc =
361  findAllocaForReq(Record, Req, Record->MCurContext);
362  if (!AllocaCmdSrc && IsSuitableSubReq(Req)) {
363  // Since no alloca command for the sub buffer requirement was found in the
364  // current context, need to find a parent alloca command for it (it must be
365  // there)
366  auto IsSuitableAlloca = [Record](AllocaCommandBase *AllocaCmd) {
367  bool Res = sameCtx(AllocaCmd->getQueue()->getContextImplPtr(),
368  Record->MCurContext) &&
369  // Looking for a parent buffer alloca command
370  AllocaCmd->getType() == Command::CommandType::ALLOCA;
371  return Res;
372  };
373  const auto It =
374  std::find_if(Record->MAllocaCommands.begin(),
375  Record->MAllocaCommands.end(), IsSuitableAlloca);
376  AllocaCmdSrc = (Record->MAllocaCommands.end() != It) ? *It : nullptr;
377  }
378  if (!AllocaCmdSrc)
379  throw runtime_error("Cannot find buffer allocation",
380  PI_ERROR_INVALID_VALUE);
381  // Get the parent allocation of the sub-buffer to perform a full copy of the whole buffer
382  if (IsSuitableSubReq(Req)) {
383  if (AllocaCmdSrc->getType() == Command::CommandType::ALLOCA_SUB_BUF)
384  AllocaCmdSrc =
385  static_cast<AllocaSubBufCommand *>(AllocaCmdSrc)->getParentAlloca();
386  else if (AllocaCmdSrc->getSYCLMemObj() != Req->MSYCLMemObj)
387  assert(false && "Inappropriate alloca command.");
388  }
389 
390  Command *NewCmd = nullptr;
391 
392  if (AllocaCmdSrc->MLinkedAllocaCmd == AllocaCmdDst) {
393  // Map write only as read-write
394  access::mode MapMode = Req->MAccessMode;
395  if (MapMode == access::mode::write)
396  MapMode = access::mode::read_write;
397  NewCmd = insertMapUnmapForLinkedCmds(AllocaCmdSrc, AllocaCmdDst, MapMode);
398  Record->MHostAccess = MapMode;
399  } else {
400 
401  if ((Req->MAccessMode == access::mode::discard_write) ||
402  (Req->MAccessMode == access::mode::discard_read_write)) {
403  Record->MCurContext = Queue->getContextImplPtr();
404  return nullptr;
405  } else {
406  // A full copy of the buffer is needed to avoid data loss that could be caused
407  // by copying only a specific range from host to device and back.
408  NewCmd =
409  new MemCpyCommand(*AllocaCmdSrc->getRequirement(), AllocaCmdSrc,
410  *AllocaCmdDst->getRequirement(), AllocaCmdDst,
411  AllocaCmdSrc->getQueue(), AllocaCmdDst->getQueue());
412  }
413  }
414  std::vector<Command *> ToCleanUp;
415  for (Command *Dep : Deps) {
416  Command *ConnCmd = NewCmd->addDep(
417  DepDesc{Dep, NewCmd->getRequirement(), AllocaCmdDst}, ToCleanUp);
418  if (ConnCmd)
419  ToEnqueue.push_back(ConnCmd);
420  }
421  updateLeaves(Deps, Record, access::mode::read_write, ToCleanUp);
422  addNodeToLeaves(Record, NewCmd, access::mode::read_write, ToEnqueue);
423  for (Command *Cmd : ToCleanUp)
424  cleanupCommand(Cmd);
425  Record->MCurContext = Queue->getContextImplPtr();
426  return NewCmd;
427 }
428 
429 Command *Scheduler::GraphBuilder::remapMemoryObject(
430  MemObjRecord *Record, Requirement *Req, AllocaCommandBase *HostAllocaCmd,
431  std::vector<Command *> &ToEnqueue) {
432  assert(HostAllocaCmd->getQueue()->is_host() &&
433  "Host alloca command expected");
434  assert(HostAllocaCmd->MIsActive && "Active alloca command expected");
435 
436  AllocaCommandBase *LinkedAllocaCmd = HostAllocaCmd->MLinkedAllocaCmd;
437  assert(LinkedAllocaCmd && "Linked alloca command expected");
438 
439  std::set<Command *> Deps = findDepsForReq(Record, Req, Record->MCurContext);
440 
441  UnMapMemObject *UnMapCmd = new UnMapMemObject(
442  LinkedAllocaCmd, *LinkedAllocaCmd->getRequirement(),
443  &HostAllocaCmd->MMemAllocation, LinkedAllocaCmd->getQueue());
444 
445  // Map write only as read-write
446  access::mode MapMode = Req->MAccessMode;
447  if (MapMode == access::mode::write)
448  MapMode = access::mode::read_write;
449  MapMemObject *MapCmd = new MapMemObject(
450  LinkedAllocaCmd, *LinkedAllocaCmd->getRequirement(),
451  &HostAllocaCmd->MMemAllocation, LinkedAllocaCmd->getQueue(), MapMode);
452 
453  std::vector<Command *> ToCleanUp;
454  for (Command *Dep : Deps) {
455  Command *ConnCmd = UnMapCmd->addDep(
456  DepDesc{Dep, UnMapCmd->getRequirement(), LinkedAllocaCmd}, ToCleanUp);
457  if (ConnCmd)
458  ToEnqueue.push_back(ConnCmd);
459  }
460 
461  Command *ConnCmd = MapCmd->addDep(
462  DepDesc{UnMapCmd, MapCmd->getRequirement(), HostAllocaCmd}, ToCleanUp);
463  if (ConnCmd)
464  ToEnqueue.push_back(ConnCmd);
465 
466  updateLeaves(Deps, Record, access::mode::read_write, ToCleanUp);
467  addNodeToLeaves(Record, MapCmd, access::mode::read_write, ToEnqueue);
468  for (Command *Cmd : ToCleanUp)
469  cleanupCommand(Cmd);
470  Record->MHostAccess = MapMode;
471  return MapCmd;
472 }
473 
474 // The function adds a copy operation that copies the most up-to-date memory
475 // to the memory pointed to by Req.
476 Command *
477 Scheduler::GraphBuilder::addCopyBack(Requirement *Req,
478  std::vector<Command *> &ToEnqueue) {
479  QueueImplPtr HostQueue = Scheduler::getInstance().getDefaultHostQueue();
480  SYCLMemObjI *MemObj = Req->MSYCLMemObj;
481  MemObjRecord *Record = getMemObjRecord(MemObj);
482  if (Record && MPrintOptionsArray[BeforeAddCopyBack])
483  printGraphAsDot("before_addCopyBack");
484 
485  // Do nothing if there were no operations, or only read operations, on the memory object.
486  if (nullptr == Record || !Record->MMemModified)
487  return nullptr;
488 
489  std::set<Command *> Deps =
490  findDepsForReq(Record, Req, HostQueue->getContextImplPtr());
491  AllocaCommandBase *SrcAllocaCmd =
492  findAllocaForReq(Record, Req, Record->MCurContext);
493 
494  auto MemCpyCmdUniquePtr = std::make_unique<MemCpyCommandHost>(
495  *SrcAllocaCmd->getRequirement(), SrcAllocaCmd, *Req, &Req->MData,
496  SrcAllocaCmd->getQueue(), std::move(HostQueue));
497 
498  if (!MemCpyCmdUniquePtr)
499  throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
500 
501  MemCpyCommandHost *MemCpyCmd = MemCpyCmdUniquePtr.release();
502 
503  std::vector<Command *> ToCleanUp;
504  for (Command *Dep : Deps) {
505  Command *ConnCmd = MemCpyCmd->addDep(
506  DepDesc{Dep, MemCpyCmd->getRequirement(), SrcAllocaCmd}, ToCleanUp);
507  if (ConnCmd)
508  ToEnqueue.push_back(ConnCmd);
509  }
510 
511  updateLeaves(Deps, Record, Req->MAccessMode, ToCleanUp);
512  addNodeToLeaves(Record, MemCpyCmd, Req->MAccessMode, ToEnqueue);
513  for (Command *Cmd : ToCleanUp)
514  cleanupCommand(Cmd);
515  if (MPrintOptionsArray[AfterAddCopyBack])
516  printGraphAsDot("after_addCopyBack");
517  return MemCpyCmd;
518 }
519 
520 // The function implements SYCL host accessor logic: host accessor
521 // should provide access to the buffer in user space.
522 Command *
523 Scheduler::GraphBuilder::addHostAccessor(Requirement *Req,
524  std::vector<Command *> &ToEnqueue) {
525 
526  if (Req->MAccessMode != sycl::access_mode::read) {
527  auto SYCLMemObj = static_cast<detail::SYCLMemObjT *>(Req->MSYCLMemObj);
528  SYCLMemObj->handleWriteAccessorCreation();
529  }
530 
531  const QueueImplPtr &HostQueue = getInstance().getDefaultHostQueue();
532 
533  MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req, ToEnqueue);
534  if (MPrintOptionsArray[BeforeAddHostAcc])
535  printGraphAsDot("before_addHostAccessor");
536  markModifiedIfWrite(Record, Req);
537 
538  AllocaCommandBase *HostAllocaCmd =
539  getOrCreateAllocaForReq(Record, Req, HostQueue, ToEnqueue);
540 
541  if (sameCtx(HostAllocaCmd->getQueue()->getContextImplPtr(),
542  Record->MCurContext)) {
543  if (!isAccessModeAllowed(Req->MAccessMode, Record->MHostAccess)) {
544  remapMemoryObject(Record, Req,
545  Req->MIsSubBuffer ? (static_cast<AllocaSubBufCommand *>(
546  HostAllocaCmd))
547  ->getParentAlloca()
548  : HostAllocaCmd,
549  ToEnqueue);
550  }
551  } else
552  insertMemoryMove(Record, Req, HostQueue, ToEnqueue);
553 
554  Command *UpdateHostAccCmd =
555  insertUpdateHostReqCmd(Record, Req, HostQueue, ToEnqueue);
556 
557  // Need empty command to be blocked until host accessor is destructed
558  EmptyCommand *EmptyCmd =
559  addEmptyCmd(UpdateHostAccCmd, {Req}, HostQueue,
560  Command::BlockReason::HostAccessor, ToEnqueue);
561 
562  Req->MBlockedCmd = EmptyCmd;
563 
564  if (MPrintOptionsArray[AfterAddHostAcc])
565  printGraphAsDot("after_addHostAccessor");
566 
567  return UpdateHostAccCmd;
568 }
569 
570 Command *Scheduler::GraphBuilder::addCGUpdateHost(
571  std::unique_ptr<detail::CG> CommandGroup, const QueueImplPtr &HostQueue,
572  std::vector<Command *> &ToEnqueue) {
573 
574  auto UpdateHost = static_cast<CGUpdateHost *>(CommandGroup.get());
575  Requirement *Req = UpdateHost->getReqToUpdate();
576 
577  MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req, ToEnqueue);
578  return insertMemoryMove(Record, Req, HostQueue, ToEnqueue);
579 }
580 
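// Walks the record's leaves and their dependencies to find the commands the
// new requirement Req must depend on. A dependency can be bypassed when both
// accesses only read or their ranges do not overlap, provided the dependency
// command lives in the same context.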
589 std::set<Command *>
590 Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record,
591  const Requirement *Req,
592  const ContextImplPtr &Context) {
593  std::set<Command *> RetDeps;
594  std::vector<Command *> Visited;
595  const bool ReadOnlyReq = Req->MAccessMode == access::mode::read;
596 
597  std::vector<Command *> ToAnalyze{Record->MWriteLeaves.toVector()};
598 
599  if (!ReadOnlyReq) {
600  std::vector<Command *> V{Record->MReadLeaves.toVector()};
601 
602  ToAnalyze.insert(ToAnalyze.begin(), V.begin(), V.end());
603  }
604 
605  while (!ToAnalyze.empty()) {
606  Command *DepCmd = ToAnalyze.back();
607  ToAnalyze.pop_back();
608 
609  std::vector<Command *> NewAnalyze;
610 
611  for (const DepDesc &Dep : DepCmd->MDeps) {
612  if (Dep.MDepRequirement->MSYCLMemObj != Req->MSYCLMemObj)
613  continue;
614 
615  bool CanBypassDep = false;
616  // If both only read
617  CanBypassDep |=
618  Dep.MDepRequirement->MAccessMode == access::mode::read && ReadOnlyReq;
619 
620  // If not overlap
621  CanBypassDep |= !doOverlap(Dep.MDepRequirement, Req);
622 
623  // Going through copying memory between contexts is not supported.
624  if (Dep.MDepCommand)
625  CanBypassDep &=
626  sameCtx(Context, Dep.MDepCommand->getQueue()->getContextImplPtr());
627 
628  if (!CanBypassDep) {
629  RetDeps.insert(DepCmd);
630  // No need to analyze the dependencies of the command being examined, since
631  // it is itself a dependency.
632  NewAnalyze.clear();
633  break;
634  }
635 
636  if (markNodeAsVisited(Dep.MDepCommand, Visited))
637  NewAnalyze.push_back(Dep.MDepCommand);
638  }
639  ToAnalyze.insert(ToAnalyze.end(), NewAnalyze.begin(), NewAnalyze.end());
640  }
641  unmarkVisitedNodes(Visited);
642  return RetDeps;
643 }
644 
645 // A helper function for finding a command dependency on a specific memory
646 // object
647 DepDesc Scheduler::GraphBuilder::findDepForRecord(Command *Cmd,
648  MemObjRecord *Record) {
649  for (const DepDesc &DD : Cmd->MDeps) {
650  if (getMemObjRecord(DD.MDepRequirement->MSYCLMemObj) == Record) {
651  return DD;
652  }
653  }
654  assert(false && "No dependency found for a leaf of the record");
655  return {nullptr, nullptr, nullptr};
656 }
657 
658 // The function searches for the alloca command matching context and
659 // requirement.
660 AllocaCommandBase *Scheduler::GraphBuilder::findAllocaForReq(
661  MemObjRecord *Record, const Requirement *Req, const ContextImplPtr &Context,
662  bool AllowConst) {
663  auto IsSuitableAlloca = [&Context, Req,
664  AllowConst](AllocaCommandBase *AllocaCmd) {
665  bool Res = sameCtx(AllocaCmd->getQueue()->getContextImplPtr(), Context);
666  if (IsSuitableSubReq(Req)) {
667  const Requirement *TmpReq = AllocaCmd->getRequirement();
668  Res &= AllocaCmd->getType() == Command::CommandType::ALLOCA_SUB_BUF;
669  Res &= TmpReq->MOffsetInBytes == Req->MOffsetInBytes;
670  Res &= TmpReq->MAccessRange == Req->MAccessRange;
671  Res &= AllowConst || !AllocaCmd->MIsConst;
672  }
673  return Res;
674  };
675  const auto It = std::find_if(Record->MAllocaCommands.begin(),
676  Record->MAllocaCommands.end(), IsSuitableAlloca);
677  return (Record->MAllocaCommands.end() != It) ? *It : nullptr;
678 }
679 
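// SYCL_HOST_UNIFIED_MEMORY=1 forces host unified memory to be assumed and =0
// forces it off (except for the host context); otherwise every device in the
// context must report host_unified_memory support.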
680 static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx) {
681  if (const char *HUMConfig = SYCLConfig<SYCL_HOST_UNIFIED_MEMORY>::get()) {
682  if (std::strcmp(HUMConfig, "0") == 0)
683  return Ctx->is_host();
684  if (std::strcmp(HUMConfig, "1") == 0)
685  return true;
686  }
687  for (const device &Device : Ctx->getDevices()) {
688  if (!Device.get_info<info::device::host_unified_memory>())
689  return false;
690  }
691  return true;
692 }
693 
694 // The function searches for the alloca command matching context and
695 // requirement. If none exists, new allocation command is created.
696 // Note, creation of new allocation command can lead to the current context
697 // (Record->MCurContext) change.
698 AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
699  MemObjRecord *Record, const Requirement *Req, const QueueImplPtr &Queue,
700  std::vector<Command *> &ToEnqueue) {
701 
702  AllocaCommandBase *AllocaCmd = findAllocaForReq(
703  Record, Req, Queue->getContextImplPtr(), /*AllowConst=*/false);
704 
705  if (!AllocaCmd) {
706  std::vector<Command *> ToCleanUp;
707  if (IsSuitableSubReq(Req)) {
708  // Get the parent requirement. It is hard to get the parent's range right,
709  // so the full parent requirement has its range represented in bytes.
710  range<3> ParentRange{Req->MSYCLMemObj->getSizeInBytes(), 1, 1};
711  Requirement ParentRequirement(
712  /*Offset*/ {0, 0, 0}, ParentRange, ParentRange,
713  access::mode::read_write, Req->MSYCLMemObj, /*Dims*/ 1,
714  /*Working with bytes*/ sizeof(char), /*offset*/ size_t(0));
715 
716  auto *ParentAlloca =
717  getOrCreateAllocaForReq(Record, &ParentRequirement, Queue, ToEnqueue);
718  AllocaCmd = new AllocaSubBufCommand(Queue, *Req, ParentAlloca, ToEnqueue,
719  ToCleanUp);
720  } else {
721 
722  const Requirement FullReq(/*Offset*/ {0, 0, 0}, Req->MMemoryRange,
723  Req->MMemoryRange, access::mode::read_write,
724  Req->MSYCLMemObj, Req->MDims, Req->MElemSize,
725  0 /*MOffsetInBytes*/, false /*MIsSubBuffer*/);
726  // Can reuse user data for the first allocation. Do so if host unified
727  // memory is supported regardless of the access mode (the pointer will be
728  // reused). For devices without host unified memory the initialization
729  // will be performed as a write operation.
730  // TODO the case where the first alloca is made with a discard mode and
731  // the user pointer is read-only is still not handled: it leads to
732  // unnecessary copy on devices with unified host memory support.
733  const bool HostUnifiedMemory =
734  checkHostUnifiedMemory(Queue->getContextImplPtr());
735  SYCLMemObjI *MemObj = Req->MSYCLMemObj;
736  const bool InitFromUserData = Record->MAllocaCommands.empty() &&
737  (HostUnifiedMemory || MemObj->isInterop());
738  AllocaCommandBase *LinkedAllocaCmd = nullptr;
739 
740  // For the first allocation on a device without host unified memory we
741  // might need to also create a host alloca right away in order to perform
742  // the initial memory write.
743  if (Record->MAllocaCommands.empty()) {
744  if (!HostUnifiedMemory &&
745  Req->MAccessMode != access::mode::discard_write &&
746  Req->MAccessMode != access::mode::discard_read_write) {
747  // There's no need to make a host allocation if the buffer is not
748  // initialized with user data.
749  if (MemObj->hasUserDataPtr()) {
750  QueueImplPtr DefaultHostQueue =
751  Scheduler::getInstance().getDefaultHostQueue();
752  AllocaCommand *HostAllocaCmd = new AllocaCommand(
753  DefaultHostQueue, FullReq, true /* InitFromUserData */,
754  nullptr /* LinkedAllocaCmd */,
755  MemObj->isHostPointerReadOnly() /* IsConst */);
756  Record->MAllocaCommands.push_back(HostAllocaCmd);
757  Record->MWriteLeaves.push_back(HostAllocaCmd, ToEnqueue);
758  ++(HostAllocaCmd->MLeafCounter);
759  Record->MCurContext = DefaultHostQueue->getContextImplPtr();
760  }
761  }
762  } else {
763  // If it is not the first allocation, try to setup a link
764  // FIXME: Temporary limitation, linked alloca commands for an image are
765  // not supported because the map operation is not implemented for images.
766  if (Req->MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer)
767  // The current limitation is to set up a link between the current allocation
768  // and the new one. There could be situations where we could link a
769  // non-current allocation instead, but that would require a memory copy.
770  // A link can only be set up between cl and host allocations.
771  if (Queue->is_host() != Record->MCurContext->is_host()) {
772  // Linked commands assume that the host allocation is reused by the
773  // plugin runtime and that can lead to unnecessary copy overhead on
774  // devices that do not support host unified memory. Do not link the
775  // allocations in this case.
776  // However, if the user explicitly requests use of pinned host
777  // memory, map/unmap operations are expected to work faster than
778  // read/write from/to an arbitrary host pointer. Link such commands
779  // regardless of host unified memory support.
780  bool PinnedHostMemory = MemObj->usesPinnedHostMemory();
781 
782  bool HostUnifiedMemoryOnNonHostDevice =
783  Queue->is_host() ? checkHostUnifiedMemory(Record->MCurContext)
784  : HostUnifiedMemory;
785  if (PinnedHostMemory || HostUnifiedMemoryOnNonHostDevice) {
786  AllocaCommandBase *LinkedAllocaCmdCand = findAllocaForReq(
787  Record, Req, Record->MCurContext, /*AllowConst=*/false);
788 
789  // Cannot setup link if candidate is linked already
790  if (LinkedAllocaCmdCand &&
791  !LinkedAllocaCmdCand->MLinkedAllocaCmd) {
792  LinkedAllocaCmd = LinkedAllocaCmdCand;
793  }
794  }
795  }
796  }
797 
798  AllocaCmd =
799  new AllocaCommand(Queue, FullReq, InitFromUserData, LinkedAllocaCmd);
800 
801  // Update linked command
802  if (LinkedAllocaCmd) {
803  Command *ConnCmd = AllocaCmd->addDep(
804  DepDesc{LinkedAllocaCmd, AllocaCmd->getRequirement(),
805  LinkedAllocaCmd},
806  ToCleanUp);
807  if (ConnCmd)
808  ToEnqueue.push_back(ConnCmd);
809  LinkedAllocaCmd->MLinkedAllocaCmd = AllocaCmd;
810 
811  // To ensure that the leader allocation is removed first
812  ConnCmd = AllocaCmd->getReleaseCmd()->addDep(
813  DepDesc(LinkedAllocaCmd->getReleaseCmd(),
814  AllocaCmd->getRequirement(), LinkedAllocaCmd),
815  ToCleanUp);
816  if (ConnCmd)
817  ToEnqueue.push_back(ConnCmd);
818 
819  // Device allocation takes ownership of the host ptr during
820  // construction, host allocation doesn't. So, device allocation should
821  // always be active here. Also if the "follower" command is a device one
822  // we have to change current context to the device one.
823  if (Queue->is_host()) {
824  AllocaCmd->MIsActive = false;
825  } else {
826  LinkedAllocaCmd->MIsActive = false;
827  Record->MCurContext = Queue->getContextImplPtr();
828 
829  std::set<Command *> Deps =
830  findDepsForReq(Record, Req, Queue->getContextImplPtr());
831  for (Command *Dep : Deps) {
832  Command *ConnCmd = AllocaCmd->addDep(
833  DepDesc{Dep, Req, LinkedAllocaCmd}, ToCleanUp);
834  if (ConnCmd)
835  ToEnqueue.push_back(ConnCmd);
836  }
837  updateLeaves(Deps, Record, Req->MAccessMode, ToCleanUp);
838  addNodeToLeaves(Record, AllocaCmd, Req->MAccessMode, ToEnqueue);
839  }
840  }
841  }
842 
843  Record->MAllocaCommands.push_back(AllocaCmd);
844  Record->MWriteLeaves.push_back(AllocaCmd, ToEnqueue);
845  ++(AllocaCmd->MLeafCounter);
846  for (Command *Cmd : ToCleanUp)
847  cleanupCommand(Cmd);
848  }
849  return AllocaCmd;
850 }
851 
852 // The function sets MemModified flag in record if requirement has write access.
853 void Scheduler::GraphBuilder::markModifiedIfWrite(MemObjRecord *Record,
854  Requirement *Req) {
855  switch (Req->MAccessMode) {
856  case access::mode::write:
857  case access::mode::read_write:
858  case access::mode::discard_write:
859  case access::mode::discard_read_write:
860  case access::mode::atomic:
861  Record->MMemModified = true;
862  break;
863  case access::mode::read:
864  break;
865  }
866 }
867 
868 EmptyCommand *Scheduler::GraphBuilder::addEmptyCmd(
869  Command *Cmd, const std::vector<Requirement *> &Reqs,
870  const QueueImplPtr &Queue, Command::BlockReason Reason,
871  std::vector<Command *> &ToEnqueue, const bool AddDepsToLeaves) {
872  EmptyCommand *EmptyCmd =
873  new EmptyCommand(Scheduler::getInstance().getDefaultHostQueue());
874 
875  if (!EmptyCmd)
876  throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
877 
878  EmptyCmd->MIsBlockable = true;
879  EmptyCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueBlocked;
880  EmptyCmd->MBlockReason = Reason;
881 
882  for (Requirement *Req : Reqs) {
883  MemObjRecord *Record = getOrInsertMemObjRecord(Queue, Req, ToEnqueue);
884  AllocaCommandBase *AllocaCmd =
885  getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue);
886  EmptyCmd->addRequirement(Cmd, AllocaCmd, Req);
887  }
888  // addRequirement above calls addDep, which already adds EmptyCmd as a user of
889  // Cmd. Reqs' size is not checked here, so assume it is possible that no Reqs were passed.
890  if (!Reqs.size())
891  Cmd->addUser(EmptyCmd);
892 
893  if (AddDepsToLeaves) {
894  const std::vector<DepDesc> &Deps = Cmd->MDeps;
895  std::vector<Command *> ToCleanUp;
896  for (const DepDesc &Dep : Deps) {
897  const Requirement *Req = Dep.MDepRequirement;
898  MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj);
899 
900  updateLeaves({Cmd}, Record, Req->MAccessMode, ToCleanUp);
901  addNodeToLeaves(Record, EmptyCmd, Req->MAccessMode, ToEnqueue);
902  }
903  for (Command *Cmd : ToCleanUp)
904  cleanupCommand(Cmd);
905  }
906 
907  return EmptyCmd;
908 }
909 
910 static bool isInteropHostTask(ExecCGCommand *Cmd) {
911  if (Cmd->getCG().getType() != CG::CGTYPE::CodeplayHostTask)
912  return false;
913 
914  const detail::CGHostTask &HT =
915  static_cast<detail::CGHostTask &>(Cmd->getCG());
916 
917  return HT.MHostTask->isInteropTask();
918 }
919 
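// If several requirements in the list target the same memory object, widen
// each of them to the combined access mode so that the first alloca sees the
// union of accesses.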
920 static void combineAccessModesOfReqs(std::vector<Requirement *> &Reqs) {
921  std::unordered_map<SYCLMemObjI *, access::mode> CombinedModes;
922  bool HasDuplicateMemObjects = false;
923  for (const Requirement *Req : Reqs) {
924  auto Result = CombinedModes.insert(
925  std::make_pair(Req->MSYCLMemObj, Req->MAccessMode));
926  if (!Result.second) {
927  Result.first->second =
928  combineAccessModes(Result.first->second, Req->MAccessMode);
929  HasDuplicateMemObjects = true;
930  }
931  }
932 
933  if (!HasDuplicateMemObjects)
934  return;
935  for (Requirement *Req : Reqs) {
936  Req->MAccessMode = CombinedModes[Req->MSYCLMemObj];
937  }
938 }
939 
940 Scheduler::GraphBuildResult Scheduler::GraphBuilder::addCG(
941  std::unique_ptr<detail::CG> CommandGroup, const QueueImplPtr &Queue,
942  std::vector<Command *> &ToEnqueue,
943  sycl::detail::pi::PiExtCommandBuffer CommandBuffer,
944  const std::vector<sycl::detail::pi::PiExtSyncPoint> &Dependencies) {
945  std::vector<Requirement *> &Reqs = CommandGroup->getRequirements();
946  std::vector<detail::EventImplPtr> &Events = CommandGroup->getEvents();
947 
948  auto NewCmd = std::make_unique<ExecCGCommand>(
949  std::move(CommandGroup), Queue, CommandBuffer, std::move(Dependencies));
950 
951  if (!NewCmd)
952  throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
953 
954  // Only device kernel command groups can participate in fusion. Otherwise,
955  // command groups take the regular route. If they create any requirement or
956  // event dependency on any of the kernels in the fusion list, this will lead
957  // to cancellation of the fusion in the GraphProcessor.
958  auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
959  if (isInFusionMode(QUniqueID)) {
960  if (NewCmd->isFusable()) {
961  auto *FusionCmd = findFusionList(QUniqueID)->second.get();
962 
963  bool dependsOnFusion = false;
964  for (auto Ev = Events.begin(); Ev != Events.end();) {
965  auto *EvDepCmd = static_cast<Command *>((*Ev)->getCommand());
966  if (!EvDepCmd) {
967  ++Ev;
968  continue;
969  }
970  // Event dependencies on commands part of another active fusion are
971  // handled by cancelling fusion in that other queue.
972  if (EvDepCmd->getQueue() != Queue && isPartOfActiveFusion(EvDepCmd)) {
973  printFusionWarning(
974  "Aborting fusion because of event dependency from a "
975  "different fusion");
976  cancelFusion(EvDepCmd->getQueue(), ToEnqueue);
977  }
978  // Check if this command depends on the placeholder command for the
979  // fusion this command itself participates in.
980  if (EvDepCmd == FusionCmd) {
981  Ev = Events.erase(Ev);
982  dependsOnFusion = true;
983  } else {
984  ++Ev;
985  }
986  }
987 
988  // If this command has an explicit event dependency on the placeholder
989  // command for this fusion (because it used depends_on on the event
990  // returned by submitting another kernel to this fusion earlier), add a
991  // dependency on all the commands in the fusion list so far.
992  if (dependsOnFusion) {
993  for (auto *Cmd : FusionCmd->getFusionList()) {
994  Events.push_back(Cmd->getEvent());
995  }
996  }
997 
998  // Add the kernel to the graph, but delay the enqueue of any auxiliary
999  // commands (e.g., allocations) resulting from that process by adding them
1000  // to the list of auxiliary commands of the fusion command.
1001  createGraphForCommand(NewCmd.get(), NewCmd->getCG(),
1002  isInteropHostTask(NewCmd.get()), Reqs, Events,
1003  Queue, FusionCmd->auxiliaryCommands());
1004 
1005  // Set the fusion command, so we recognize when another command depends on
1006  // a kernel in the fusion list.
1007  FusionCmd->addToFusionList(NewCmd.get());
1008  NewCmd->MFusionCmd = FusionCmd;
1009  std::vector<Command *> ToCleanUp;
1010  // Add an event dependency from the fusion placeholder command to the new
1011  // kernel.
1012  auto ConnectionCmd = FusionCmd->addDep(NewCmd->getEvent(), ToCleanUp);
1013  if (ConnectionCmd) {
1014  FusionCmd->auxiliaryCommands().push_back(ConnectionCmd);
1015  }
1016  return {NewCmd.release(), FusionCmd->getEvent(), false};
1017  } else {
1018  std::string s;
1019  std::stringstream ss(s);
1020  if (NewCmd->getCG().getType() == CG::CGTYPE::Kernel) {
1021  ss << "Not fusing kernel with 'use_root_sync' property. Can only fuse "
1022  "non-cooperative device kernels.";
1023  } else {
1024  ss << "Not fusing '" << NewCmd->getTypeString()
1025  << "' command group. Can only fuse device kernel command groups.";
1026  }
1027  printFusionWarning(ss.str());
1028  }
1029  }
1030  createGraphForCommand(NewCmd.get(), NewCmd->getCG(),
1031  isInteropHostTask(NewCmd.get()), Reqs, Events, Queue,
1032  ToEnqueue);
1033  auto Event = NewCmd->getEvent();
1034  return {NewCmd.release(), Event, true};
1035 }
1036 
1037 void Scheduler::GraphBuilder::createGraphForCommand(
1038  Command *NewCmd, CG &CG, bool isInteropTask,
1039  std::vector<Requirement *> &Reqs,
1040  const std::vector<detail::EventImplPtr> &Events, QueueImplPtr Queue,
1041  std::vector<Command *> &ToEnqueue) {
1042 
1043  if (MPrintOptionsArray[BeforeAddCG])
1044  printGraphAsDot("before_addCG");
1045 
1046  // If there are multiple requirements for the same memory object, its
1047  // AllocaCommand creation will be dependent on the access mode of the first
1048  // requirement. Combine these access modes to take all of them into account.
1049  combineAccessModesOfReqs(Reqs);
1050  std::vector<Command *> ToCleanUp;
1051  for (Requirement *Req : Reqs) {
1052  MemObjRecord *Record = nullptr;
1053  AllocaCommandBase *AllocaCmd = nullptr;
1054 
1055  bool isSameCtx = false;
1056 
1057  {
1058  const QueueImplPtr &QueueForAlloca =
1059  isInteropTask ? static_cast<detail::CGHostTask &>(CG).MQueue : Queue;
1060 
1061  Record = getOrInsertMemObjRecord(QueueForAlloca, Req, ToEnqueue);
1062  markModifiedIfWrite(Record, Req);
1063 
1064  AllocaCmd =
1065  getOrCreateAllocaForReq(Record, Req, QueueForAlloca, ToEnqueue);
1066 
1067  isSameCtx =
1068  sameCtx(QueueForAlloca->getContextImplPtr(), Record->MCurContext);
1069  }
1070 
1071  // If there is an alloca command, we need to check whether the latest memory
1072  // is in the required context.
1073  if (isSameCtx) {
1074  // If the memory is already in the required host context, check if the
1075  // required access mode is valid, remap if not.
1076  if (Record->MCurContext->is_host() &&
1077  !isAccessModeAllowed(Req->MAccessMode, Record->MHostAccess)) {
1078  remapMemoryObject(Record, Req,
1079  Req->MIsSubBuffer
1080  ? (static_cast<AllocaSubBufCommand *>(AllocaCmd))
1081  ->getParentAlloca()
1082  : AllocaCmd,
1083  ToEnqueue);
1084  }
1085  } else {
1086  // Cannot directly copy memory from OpenCL device to OpenCL device -
1087  // create two copies: device->host and host->device.
1088  bool NeedMemMoveToHost = false;
1089  auto MemMoveTargetQueue = Queue;
1090 
1091  if (isInteropTask) {
1092  const detail::CGHostTask &HT = static_cast<detail::CGHostTask &>(CG);
1093 
1094  if (HT.MQueue->getContextImplPtr() != Record->MCurContext) {
1095  NeedMemMoveToHost = true;
1096  MemMoveTargetQueue = HT.MQueue;
1097  }
1098  } else if (!Queue->is_host() && !Record->MCurContext->is_host())
1099  NeedMemMoveToHost = true;
1100 
1101  if (NeedMemMoveToHost)
1102  insertMemoryMove(Record, Req,
1103  Scheduler::getInstance().getDefaultHostQueue(),
1104  ToEnqueue);
1105  insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue);
1106  }
1107  std::set<Command *> Deps =
1108  findDepsForReq(Record, Req, Queue->getContextImplPtr());
1109 
1110  for (Command *Dep : Deps) {
1111  if (Dep != NewCmd) {
1112  Command *ConnCmd =
1113  NewCmd->addDep(DepDesc{Dep, Req, AllocaCmd}, ToCleanUp);
1114  if (ConnCmd)
1115  ToEnqueue.push_back(ConnCmd);
1116  }
1117  }
1118  }
1119 
1120  // Set new command as user for dependencies and update leaves.
1121  // Node dependencies can be modified further when adding the node to leaves,
1122  // iterate over their copy.
1123  // FIXME employ a reference here to eliminate copying of a vector
1124  std::vector<DepDesc> Deps = NewCmd->MDeps;
1125  for (DepDesc &Dep : Deps) {
1126  const Requirement *Req = Dep.MDepRequirement;
1127  MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj);
1128  updateLeaves({Dep.MDepCommand}, Record, Req->MAccessMode, ToCleanUp);
1129  addNodeToLeaves(Record, NewCmd, Req->MAccessMode, ToEnqueue);
1130  }
1131 
1132  // Register all the events as dependencies
1133  for (detail::EventImplPtr e : Events) {
1134  if (e->getCommand() && e->getCommand() == NewCmd) {
1135  continue;
1136  }
1137  if (Command *ConnCmd = NewCmd->addDep(e, ToCleanUp))
1138  ToEnqueue.push_back(ConnCmd);
1139  }
1140 
1141  if (MPrintOptionsArray[AfterAddCG])
1142  printGraphAsDot("after_addCG");
1143 
1144  for (Command *Cmd : ToCleanUp) {
1145  cleanupCommand(Cmd);
1146  }
1147 }
1148 
1149 void Scheduler::GraphBuilder::decrementLeafCountersForRecord(
1150  MemObjRecord *Record) {
1151  for (Command *Cmd : Record->MReadLeaves) {
1152  --(Cmd->MLeafCounter);
1153  if (Cmd->readyForCleanup())
1154  cleanupCommand(Cmd);
1155  }
1156  for (Command *Cmd : Record->MWriteLeaves) {
1157  --(Cmd->MLeafCounter);
1158  if (Cmd->readyForCleanup())
1159  cleanupCommand(Cmd);
1160  }
1161 }
1162 
1163 void Scheduler::GraphBuilder::cleanupCommandsForRecord(MemObjRecord *Record) {
1164  std::vector<AllocaCommandBase *> &AllocaCommands = Record->MAllocaCommands;
1165  if (AllocaCommands.empty())
1166  return;
1167 
1168  assert(MCmdsToVisit.empty());
1169  MVisitedCmds.clear();
1170 
1171  // First, mark all allocas for deletion and their direct users for traversal
1172  // Dependencies of the users will be cleaned up during the traversal
1173  for (Command *AllocaCmd : AllocaCommands) {
1174  markNodeAsVisited(AllocaCmd, MVisitedCmds);
1175 
1176  for (Command *UserCmd : AllocaCmd->MUsers)
1177  // Linked alloca cmd may be in users of this alloca. We're not going to
1178  // visit it.
1179  if (UserCmd->getType() != Command::CommandType::ALLOCA)
1180  MCmdsToVisit.push(UserCmd);
1181  else
1182  markNodeAsVisited(UserCmd, MVisitedCmds);
1183 
1184  AllocaCmd->MMarks.MToBeDeleted = true;
1185  // These commands will be deleted later, clear users now to avoid
1186  // updating them during edge removal
1187  AllocaCmd->MUsers.clear();
1188  }
1189 
1190  // Make sure the Linked Allocas are marked visited by the previous walk.
1191  // Remove allocation commands from the users of their dependencies.
1192  for (AllocaCommandBase *AllocaCmd : AllocaCommands) {
1193  AllocaCommandBase *LinkedCmd = AllocaCmd->MLinkedAllocaCmd;
1194 
1195  if (LinkedCmd) {
1196  assert(LinkedCmd->MMarks.MVisited);
1197  }
1198 
1199  for (DepDesc &Dep : AllocaCmd->MDeps)
1200  if (Dep.MDepCommand)
1201  Dep.MDepCommand->MUsers.erase(AllocaCmd);
1202  }
1203 
1204  // Traverse the graph using BFS
1205  while (!MCmdsToVisit.empty()) {
1206  Command *Cmd = MCmdsToVisit.front();
1207  MCmdsToVisit.pop();
1208 
1209  if (!markNodeAsVisited(Cmd, MVisitedCmds))
1210  continue;
1211 
1212  for (Command *UserCmd : Cmd->MUsers)
1213  if (UserCmd->getType() != Command::CommandType::ALLOCA)
1214  MCmdsToVisit.push(UserCmd);
1215 
1216  // Delete all dependencies on any allocations being removed
1217  // Track which commands should have their users updated
1218  std::map<Command *, bool> ShouldBeUpdated;
1219  auto NewEnd = std::remove_if(
1220  Cmd->MDeps.begin(), Cmd->MDeps.end(), [&](const DepDesc &Dep) {
1221  if (std::find(AllocaCommands.begin(), AllocaCommands.end(),
1222  Dep.MAllocaCmd) != AllocaCommands.end()) {
1223  ShouldBeUpdated.insert({Dep.MDepCommand, true});
1224  return true;
1225  }
1226  ShouldBeUpdated[Dep.MDepCommand] = false;
1227  return false;
1228  });
1229  Cmd->MDeps.erase(NewEnd, Cmd->MDeps.end());
1230 
1231  // Update users of removed dependencies
1232  for (auto DepCmdIt : ShouldBeUpdated) {
1233  if (!DepCmdIt.second)
1234  continue;
1235  DepCmdIt.first->MUsers.erase(Cmd);
1236  }
1237 
1238  // If all dependencies have been removed this way, mark the command for
1239  // deletion
1240  if (Cmd->MDeps.empty()) {
1241  Cmd->MUsers.clear();
1242  // Do not delete the node if it's scheduled for post-enqueue cleanup to
1243  // avoid double free.
1244  if (!Cmd->MMarkedForCleanup)
1245  Cmd->MMarks.MToBeDeleted = true;
1246  }
1247  }
1248 
1249  handleVisitedNodes(MVisitedCmds);
1250 }
1251 
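// Detaches a finished command from the graph: its users are re-linked to the
// corresponding alloca commands and the command itself is deleted, unless it
// is a fusion command that must stay alive until complete_fusion.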
1252 void Scheduler::GraphBuilder::cleanupCommand(
1253  Command *Cmd, [[maybe_unused]] bool AllowUnsubmitted) {
1254  if (SYCLConfig<SYCL_DISABLE_POST_ENQUEUE_CLEANUP>::get()) {
1255  static bool DeprWarningPrinted = false;
1256  if (!DeprWarningPrinted) {
1257  std::cerr << "WARNING: The environment variable "
1258  "SYCL_DISABLE_POST_ENQUEUE_CLEANUP is deprecated. Please "
1259  "use SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP instead.\n";
1260  DeprWarningPrinted = true;
1261  }
1262  return;
1263  }
1264  if (SYCLConfig<SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP>::get())
1265  return;
1266 
1267  assert(Cmd->MLeafCounter == 0 &&
1268  (Cmd->isSuccessfullyEnqueued() || AllowUnsubmitted));
1269  Command::CommandType CmdT = Cmd->getType();
1270 
1271  assert(CmdT != Command::ALLOCA && CmdT != Command::ALLOCA_SUB_BUF);
1272  assert(CmdT != Command::RELEASE);
1273  (void)CmdT;
1274 
1275  for (Command *UserCmd : Cmd->MUsers) {
1276  for (DepDesc &Dep : UserCmd->MDeps) {
1277  // Link the users of the command to the alloca command(s) instead
1278  if (Dep.MDepCommand == Cmd) {
1279  // ... unless the user is the alloca itself.
1280  if (Dep.MAllocaCmd == UserCmd) {
1281  Dep.MDepCommand = nullptr;
1282  } else {
1283  Dep.MDepCommand = Dep.MAllocaCmd;
1284  Dep.MDepCommand->MUsers.insert(UserCmd);
1285  }
1286  }
1287  }
1288  }
1289  // Update dependency users
1290  for (DepDesc &Dep : Cmd->MDeps) {
1291  Command *DepCmd = Dep.MDepCommand;
1292  DepCmd->MUsers.erase(Cmd);
1293  }
1294 
1295  if (Cmd->getType() == Command::FUSION &&
1296  !static_cast<KernelFusionCommand *>(Cmd)->readyForDeletion()) {
1297  // Fusion commands might still be needed because fusion might be aborted,
1298  // but a later call to complete_fusion still needs to be able to return a
1299  // valid event. Clean-up of fusion commands is therefore explicitly handled
1300  // by start fusion.
1301  return;
1302  }
1303  Cmd->getEvent()->setCommand(nullptr);
1304  delete Cmd;
1305 }
1306 
1307 void Scheduler::GraphBuilder::removeRecordForMemObj(SYCLMemObjI *MemObject) {
1308  const auto It = std::find_if(
1309  MMemObjs.begin(), MMemObjs.end(),
1310  [MemObject](const SYCLMemObjI *Obj) { return Obj == MemObject; });
1311  if (It != MMemObjs.end())
1312  MMemObjs.erase(It);
1313  MemObject->MRecord.reset();
1314 }
1315 
1316 // Make Cmd depend on DepEvent from different context. Connection is performed
1317 // via distinct ConnectCmd with host task command group on host queue. Cmd will
1318 // depend on ConnectCmd's host event.
1319 // DepEvent may not have a command associated with it in at least two cases:
1320 // - the command was deleted upon cleanup process;
1321 // - DepEvent is user event.
1322 // In both of these cases the only thing we can do is to make ConnectCmd depend
1323 // on DepEvent.
1324 // Otherwise, when there is a command associated with DepEvent, we make
1325  // ConnectCmd depend on this command. If there is a valid, i.e. non-null,
1326  // requirement in Dep, we make ConnectCmd depend on DepEvent's command with this
1327 // requirement.
1328 // Optionality of Dep is set by Dep.MDepCommand equal to nullptr.
1329 Command *Scheduler::GraphBuilder::connectDepEvent(
1330  Command *const Cmd, const EventImplPtr &DepEvent, const DepDesc &Dep,
1331  std::vector<Command *> &ToCleanUp) {
1332  assert(Cmd->getWorkerContext() != DepEvent->getContextImpl());
1333 
1334  // construct Host Task type command manually and make it depend on DepEvent
1335  ExecCGCommand *ConnectCmd = nullptr;
1336 
1337  try {
1338  std::unique_ptr<detail::HostTask> HT(new detail::HostTask);
1339  std::unique_ptr<detail::CG> ConnectCG(new detail::CGHostTask(
1340  std::move(HT), /* Queue = */ {}, /* Context = */ {}, /* Args = */ {},
1341  detail::CG::StorageInitHelper(
1342  /* ArgsStorage = */ {}, /* AccStorage = */ {},
1343  /* SharedPtrStorage = */ {}, /* Requirements = */ {},
1344  /* DepEvents = */ {DepEvent}),
1345  CG::CodeplayHostTask,
1346  /* Payload */ {}));
1347  ConnectCmd = new ExecCGCommand(
1348  std::move(ConnectCG), Scheduler::getInstance().getDefaultHostQueue());
1349  } catch (const std::bad_alloc &) {
1350  throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
1351  }
1352 
1353  if (Dep.MDepRequirement) {
1354  // make ConnectCmd depend on requirement
1355  // Dismiss the result here as it's not a connection now,
1356  // 'cause ConnectCmd is host one
1357  (void)ConnectCmd->addDep(Dep, ToCleanUp);
1358  assert(reinterpret_cast<Command *>(DepEvent->getCommand()) ==
1359  Dep.MDepCommand);
1360  // add user to Dep.MDepCommand is already performed beyond this if branch
1361  {
1362  DepDesc DepOnConnect = Dep;
1363  DepOnConnect.MDepCommand = ConnectCmd;
1364 
1365  // Dismiss the result here as it's not a connection now,
1366  // 'cause ConnectCmd is host one
1367  std::ignore = Cmd->addDep(DepOnConnect, ToCleanUp);
1368  }
1369  } else {
1370  // In the requirement path above this is handled by addDep, which also
1371  // registers the user; here addUser has to be called explicitly.
1372  if (Command *DepCmd = reinterpret_cast<Command *>(DepEvent->getCommand()))
1373  DepCmd->addUser(ConnectCmd);
1374 
1375  std::ignore = ConnectCmd->addDep(DepEvent, ToCleanUp);
1376 
1377  std::ignore = Cmd->addDep(ConnectCmd->getEvent(), ToCleanUp);
1378 
1379  ConnectCmd->addUser(Cmd);
1380  }
1381 
1382  return ConnectCmd;
1383 }
1384 
1385 void Scheduler::GraphBuilder::startFusion(QueueImplPtr Queue) {
1386  cleanUpCmdFusion(Queue.get());
1387  auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
1388  MFusionMap.emplace(QUniqueID, std::make_unique<KernelFusionCommand>(Queue));
1389 }
1390 
1391 void Scheduler::GraphBuilder::cleanUpCmdFusion(
1392  sycl::detail::queue_impl *Queue) {
1393  auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue);
1394  if (isInFusionMode(QUniqueID)) {
1395  throw sycl::exception{sycl::make_error_code(sycl::errc::invalid),
1396  "Queue already in fusion mode"};
1397  }
1398  auto OldFusionCmd = findFusionList(QUniqueID);
1399  if (OldFusionCmd != MFusionMap.end()) {
1400  // If fusion was used on this queue previously, the old fusion command might
1401  // still be around to make sure that even after
1402  // cancellation of the fusion due to synchronization, complete_fusion is
1403  // still able to return a valid event.
1404  OldFusionCmd->second->setFusionStatus(
1405  KernelFusionCommand::FusionStatus::DELETED);
1406  cleanupCommand(OldFusionCmd->second.release());
1407  MFusionMap.erase(OldFusionCmd);
1408  }
1409 }
1410 
1411 void Scheduler::GraphBuilder::removeNodeFromGraph(
1412  Command *Node, std::vector<Command *> &ToEnqueue) {
1413  // Remove the placeholder command as leaf of all its requirements and from the
1414  // user list of all its dependencies.
1415  for (auto &Dep : Node->MDeps) {
1416  auto AccessMode = Dep.MDepRequirement->MAccessMode;
1417  auto *Record = getMemObjRecord(Dep.MDepRequirement->MSYCLMemObj);
1418 
1419  Node->MLeafCounter -= Record->MReadLeaves.remove(Node);
1420  Node->MLeafCounter -= Record->MWriteLeaves.remove(Node);
1421  // If the placeholder had a write-requirement on this record, we need to
1422  // restore the previous leaves.
1423  if (AccessMode != access::mode::read) {
1424  for (auto PrevDep : Dep.MDepCommand->MDeps) {
1425  auto *DepReq = PrevDep.MDepRequirement;
1426  auto *DepRecord = getMemObjRecord(DepReq->MSYCLMemObj);
1427  if (DepRecord == Record) {
1428  // Need to restore this as a leaf, because we pushed it from the
1429  // leaves when adding the placeholder command.
1430  assert(Dep.MDepCommand);
1431  addNodeToLeaves(Record, Dep.MDepCommand, DepReq->MAccessMode,
1432  ToEnqueue);
1433  }
1434  }
1435  }
1436  Dep.MDepCommand->MUsers.erase(Node);
1437  }
1438 
1439  // Clear all the dependencies to avoid cleanDepEventsThroughOneLevel, called
1440  // from the destructor of the command to delete the dependencies of the
1441  // command this command depends on.
1442  Node->clearAllDependencies();
1443 }
1444 
1445 void Scheduler::GraphBuilder::cancelFusion(QueueImplPtr Queue,
1446  std::vector<Command *> &ToEnqueue) {
1447  auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
1448  if (!isInFusionMode(QUniqueID)) {
1449  return;
1450  }
1451  auto FusionList = findFusionList(QUniqueID);
1452 
1453  auto *PlaceholderCmd = (*FusionList).second.get();
1454 
1455  // Enqueue all the kernels/commands from the fusion list
1456  auto FusedCmdList = PlaceholderCmd->getFusionList();
1457  ToEnqueue.insert(ToEnqueue.end(), FusedCmdList.begin(), FusedCmdList.end());
1458 
1459  // The commands establishing an event dependency between the fusion
1460  // placeholder command and the individual kernels need to be enqueued.
1461  ToEnqueue.insert(ToEnqueue.end(), PlaceholderCmd->auxiliaryCommands().begin(),
1462  PlaceholderCmd->auxiliaryCommands().end());
1463 
1464  ToEnqueue.push_back(PlaceholderCmd);
1465 
1466  if (MPrintOptionsArray[AfterFusionCancel]) {
1467  printGraphAsDot("after_fusionCancel");
1468  }
1469 
1470  // Set the status for the fusion command
1471  PlaceholderCmd->setFusionStatus(KernelFusionCommand::FusionStatus::CANCELLED);
1472 }
1473 
1474 static bool isPartOfFusion(Command *Cmd, KernelFusionCommand *Fusion) {
1475  if (Cmd->getType() == Command::RUN_CG) {
1476  return static_cast<ExecCGCommand *>(Cmd)->MFusionCmd == Fusion;
1477  }
1478  return false;
1479 }
1480 
1481 static bool checkForCircularDependency(Command *, bool, KernelFusionCommand *);
1482 
1483 static bool createsCircularDependency(Command *Cmd, bool PredPartOfFusion,
1484  KernelFusionCommand *Fusion) {
1485  if (isPartOfFusion(Cmd, Fusion)) {
1486  // If this is part of the fusion and the predecessor also was, we can stop
1487  // the traversal here. A direct dependency between two kernels in the same
1488  // fusion will never form a cyclic dependency and by iterating over all
1489  // commands in a fusion, we will detect any cycles originating from the
1490  // current command.
1491  // If the predecessor was not part of the fusion, but the current command
1492  // is, we have found a potential cycle in the dependency graph.
1493  return !PredPartOfFusion;
1494  }
1495  return checkForCircularDependency(Cmd, false, Fusion);
1496 }
1497 
1498 static bool checkForCircularDependency(Command *Cmd, bool IsPartOfFusion,
1499  KernelFusionCommand *Fusion) {
1500  // Check the requirement dependencies.
1501  for (auto &Dep : Cmd->MDeps) {
1502  auto *DepCmd = Dep.MDepCommand;
1503  if (!DepCmd) {
1504  continue;
1505  }
1506  if (createsCircularDependency(DepCmd, IsPartOfFusion, Fusion)) {
1507  return true;
1508  }
1509  }
1510  for (auto &Ev : Cmd->getPreparedDepsEvents()) {
1511  auto *EvDepCmd = static_cast<Command *>(Ev->getCommand());
1512  if (!EvDepCmd) {
1513  continue;
1514  }
1515  if (createsCircularDependency(EvDepCmd, IsPartOfFusion, Fusion)) {
1516  return true;
1517  }
1518  }
1519  for (auto &Ev : Cmd->getPreparedHostDepsEvents()) {
1520  auto *EvDepCmd = static_cast<Command *>(Ev->getCommand());
1521  if (!EvDepCmd) {
1522  continue;
1523  }
1524  if (createsCircularDependency(EvDepCmd, IsPartOfFusion, Fusion)) {
1525  return true;
1526  }
1527  }
1528  return false;
1529 }
1530 
1532 EventImplPtr Scheduler::GraphBuilder::completeFusion(QueueImplPtr Queue,
1533  std::vector<Command *> &ToEnqueue,
1534  const property_list &PropList) {
1535  auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
1536 #if SYCL_EXT_CODEPLAY_KERNEL_FUSION
1537  if (!isInFusionMode(QUniqueID)) {
1538  auto InactiveFusionList = findFusionList(QUniqueID);
1539  if (InactiveFusionList == MFusionMap.end()) {
1540  throw sycl::exception{
1541  sycl::make_error_code(sycl::errc::invalid),
1542  "Calling complete_fusion on a queue not in fusion mode"};
1543  }
1544  return InactiveFusionList->second->getEvent();
1545  }
1546 
1547  auto FusionList = findFusionList(QUniqueID);
1548  auto *PlaceholderCmd = FusionList->second.get();
1549  auto &CmdList = PlaceholderCmd->getFusionList();
1550 
1551  // If there is more than one queue currently in fusion mode, we need to check
1552  // if fusing the kernel would create a circular dependency. A circular
1554  // dependency would arise if a kernel in the fusion list *indirectly* depends
1555  // on another kernel in the fusion list. Here, indirectly means that the
1556  // dependency is created through a third command not part of the fusion, on
1557  // which this kernel depends and which in turn depends on another kernel in
1558  // the fusion list.
1558  //
1559  // Note that we only have to consider dependencies via fusion queues here:
1560  // Let K1 be a kernel submitted to a queue Q1 in fusion mode. If a kernel K2
1561  // is submitted to a non-fusion queue Q2 and K2 depends on K1, fusion on Q1 is
1562  // cancelled automatically.
1563  bool CreatesCircularDep =
1564  MFusionMap.size() > 1 &&
1565  std::any_of(CmdList.begin(), CmdList.end(), [&](ExecCGCommand *Cmd) {
1566  return checkForCircularDependency(Cmd, true, PlaceholderCmd);
1567  });
1568  if (CreatesCircularDep) {
1569  // If fusing would create a circular dependency, cancel the fusion.
1570  printFusionWarning(
1571  "Aborting fusion because it would create a circular dependency");
1572  auto LastEvent = PlaceholderCmd->getEvent();
1573  this->cancelFusion(Queue, ToEnqueue);
1574  return LastEvent;
1575  }
1576 
1577  // Call the JIT compiler to generate a new fused kernel.
1578  auto FusedCG = detail::jit_compiler::get_instance().fuseKernels(
1579  Queue, CmdList, PropList);
1580 
1581  if (!FusedCG) {
1582  // If the JIT compiler returns a nullptr, JIT compilation of the fused
1583  // kernel failed. In that case, simply cancel the fusion and run each kernel
1584  // on its own.
1585  auto LastEvent = PlaceholderCmd->getEvent();
1586  this->cancelFusion(Queue, ToEnqueue);
1587  return LastEvent;
1588  }
1589 
1590  // Inherit all event dependencies from the input commands in the fusion list.
1591  std::vector<EventImplPtr> FusedEventDeps;
1592  for (auto *Cmd : CmdList) {
1593  FusedEventDeps.insert(FusedEventDeps.end(),
1594  Cmd->getPreparedDepsEvents().begin(),
1595  Cmd->getPreparedDepsEvents().end());
1596  FusedEventDeps.insert(FusedEventDeps.end(),
1597  Cmd->getPreparedHostDepsEvents().begin(),
1598  Cmd->getPreparedHostDepsEvents().end());
1599  }
1600 
1601  // Remove internal explicit dependencies, i.e., explicit dependencies from one
1602  // kernel in the fusion list to another kernel also in the fusion list.
1603  FusedEventDeps.erase(
1604  std::remove_if(FusedEventDeps.begin(), FusedEventDeps.end(),
1605  [&](EventImplPtr &E) {
1606  if (E->getCommand() == PlaceholderCmd) {
1607  return true;
1608  }
1609  if (E->getCommand() &&
1610  static_cast<Command *>(E->getCommand())->getType() ==
1611  Command::RUN_CG) {
1612  auto *RunCGCmd =
1613  static_cast<ExecCGCommand *>(E->getCommand());
1614  if (RunCGCmd->MFusionCmd == PlaceholderCmd) {
1615  return true;
1616  }
1617  }
1618  return false;
1619  }),
1620  FusedEventDeps.end());
1621 
1622  auto FusedKernelCmd =
1623  std::make_unique<ExecCGCommand>(std::move(FusedCG), Queue);
1624 
1625  // Inherit auxiliary resources from fused command groups
1626  Scheduler::getInstance().takeAuxiliaryResources(FusedKernelCmd->getEvent(),
1627  PlaceholderCmd->getEvent());
1628  assert(PlaceholderCmd->MDeps.empty());
1629  // Next, iterate backwards over all the commands in the fusion list and remove
1630  // them from the graph to restore the state before starting fusion, so we can
1631  // add the fused kernel to the graph in the next step.
1632  // Clean up the old commands after successfully fusing them.
1633  for (auto OldCmd = CmdList.rbegin(); OldCmd != CmdList.rend(); ++OldCmd) {
1634  removeNodeFromGraph(*OldCmd, ToEnqueue);
1635  cleanupCommand(*OldCmd, /* AllowUnsubmitted */ true);
1636  }
1637 
1638  createGraphForCommand(FusedKernelCmd.get(), FusedKernelCmd->getCG(), false,
1639  FusedKernelCmd->getCG().getRequirements(),
1640  FusedEventDeps, Queue, ToEnqueue);
1641 
1642  ToEnqueue.push_back(FusedKernelCmd.get());
1643 
1644  std::vector<Command *> ToCleanUp;
1645  // Make the placeholder command depend on the execution of the fused kernel
1646  auto *ConnectToPlaceholder =
1647  PlaceholderCmd->addDep(FusedKernelCmd->getEvent(), ToCleanUp);
1648  if (ConnectToPlaceholder) {
1649  ToEnqueue.push_back(ConnectToPlaceholder);
1650  }
1651  for (Command *Cmd : ToCleanUp) {
1652  cleanupCommand(Cmd);
1653  }
1654  ToEnqueue.push_back(PlaceholderCmd);
1655 
1656  if (MPrintOptionsArray[AfterFusionComplete]) {
1657  printGraphAsDot("after_fusionComplete");
1658  }
1659 
1660  // Set the status for the fusion command.
1661  PlaceholderCmd->setFusionStatus(KernelFusionCommand::FusionStatus::COMPLETE);
1662 
1663  return FusedKernelCmd.release()->getEvent();
1664 #else // SYCL_EXT_CODEPLAY_KERNEL_FUSION
1665  printFusionWarning("Kernel fusion not supported by this build");
1666  (void)PropList;
1667  auto FusionList = findFusionList(QUniqueID);
1668  auto *PlaceholderCmd = FusionList->second.get();
1669  auto LastEvent = PlaceholderCmd->getEvent();
1670  this->cancelFusion(Queue, ToEnqueue);
1671  return LastEvent;
1672 #endif // SYCL_EXT_CODEPLAY_KERNEL_FUSION
1673 }
1674 
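completeFusion is reached from the user-facing kernel fusion extension: the application places a fusion-enabled queue into fusion mode, submits kernels, and then requests that the collected kernels be JIT-compiled into one fused kernel (falling back to cancellation, as above, if that is not possible). A minimal usage sketch, assuming the sycl_ext_codeplay_kernel_fusion interface (fusion_wrapper, start_fusion/complete_fusion and the enable_fusion queue property); spellings are taken from that extension and may differ between revisions.

#include <sycl/sycl.hpp>

namespace syclex = sycl::ext::codeplay::experimental;

int main() {
  // The queue must be created with fusion enabled so the scheduler keeps a
  // fusion list (a KernelFusionCommand placeholder) for it.
  sycl::queue Q{syclex::property::queue::enable_fusion{}};

  constexpr size_t N = 1024;
  sycl::buffer<int> A{sycl::range<1>{N}}, B{sycl::range<1>{N}};

  syclex::fusion_wrapper FW{Q};
  FW.start_fusion(); // subsequent submissions are collected, not executed

  Q.submit([&](sycl::handler &CGH) {
    sycl::accessor AccA{A, CGH, sycl::write_only};
    CGH.parallel_for(sycl::range<1>{N},
                     [=](sycl::id<1> I) { AccA[I] = static_cast<int>(I[0]); });
  });
  Q.submit([&](sycl::handler &CGH) {
    sycl::accessor AccA{A, CGH, sycl::read_only};
    sycl::accessor AccB{B, CGH, sycl::write_only};
    CGH.parallel_for(sycl::range<1>{N},
                     [=](sycl::id<1> I) { AccB[I] = AccA[I] * 2; });
  });

  // Ends up in Scheduler::GraphBuilder::completeFusion: fuse the collected
  // kernels, or cancel and run them individually if fusion is not possible.
  FW.complete_fusion();

  Q.wait();
  return 0;
}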
1675 bool Scheduler::GraphBuilder::isInFusionMode(QueueIdT Id) {
1676  auto FusionList = findFusionList(Id);
1677  if (FusionList == MFusionMap.end()) {
1678  return false;
1679  }
1680  return FusionList->second->isActive();
1681 }
1682 
1683 Command *Scheduler::GraphBuilder::addCommandGraphUpdate(
1684  ext::oneapi::experimental::detail::exec_graph_impl *Graph,
1685  std::vector<std::shared_ptr<ext::oneapi::experimental::detail::node_impl>>
1686  Nodes,
1687  const QueueImplPtr &Queue, std::vector<Requirement *> Requirements,
1688  std::vector<detail::EventImplPtr> &Events,
1689  std::vector<Command *> &ToEnqueue) {
1690  auto NewCmd =
1691  std::make_unique<UpdateCommandBufferCommand>(Queue, Graph, Nodes);
1692  // If there are multiple requirements for the same memory object, its
1693  // AllocaCommand creation will be dependent on the access mode of the first
1694  // requirement. Combine these access modes to take all of them into account.
1695  combineAccessModesOfReqs(Requirements);
1696  std::vector<Command *> ToCleanUp;
1697  for (Requirement *Req : Requirements) {
1698  MemObjRecord *Record = nullptr;
1699  AllocaCommandBase *AllocaCmd = nullptr;
1700 
1701  bool isSameCtx = false;
1702 
1703  {
1704 
1705  Record = getOrInsertMemObjRecord(Queue, Req, ToEnqueue);
1706  markModifiedIfWrite(Record, Req);
1707 
1708  AllocaCmd = getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue);
1709 
1710  isSameCtx = sameCtx(Queue->getContextImplPtr(), Record->MCurContext);
1711  }
1712 
1713  if (!isSameCtx) {
1714  // Cannot directly copy memory from OpenCL device to OpenCL device -
1715  // create two copies: device->host and host->device.
1716  bool NeedMemMoveToHost = false;
1717  auto MemMoveTargetQueue = Queue;
1718 
1719  if (!Queue->is_host() && !Record->MCurContext->is_host())
1720  NeedMemMoveToHost = true;
1721 
1722  if (NeedMemMoveToHost)
1723  insertMemoryMove(Record, Req,
1724  Scheduler::getInstance().getDefaultHostQueue(),
1725  ToEnqueue);
1726  insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue);
1727  }
1728  std::set<Command *> Deps =
1729  findDepsForReq(Record, Req, Queue->getContextImplPtr());
1730 
1731  for (Command *Dep : Deps) {
1732  if (Dep != NewCmd.get()) {
1733  Command *ConnCmd =
1734  NewCmd->addDep(DepDesc{Dep, Req, AllocaCmd}, ToCleanUp);
1735  if (ConnCmd)
1736  ToEnqueue.push_back(ConnCmd);
1737  }
1738  }
1739  }
1740 
1741  // Set the new command as a user of its dependencies and update leaves.
1742  // Node dependencies can be modified further when adding the node to leaves,
1743  // so iterate over a copy.
1744  // FIXME employ a reference here to eliminate copying of a vector
1745  std::vector<DepDesc> Deps = NewCmd->MDeps;
1746  for (DepDesc &Dep : Deps) {
1747  const Requirement *Req = Dep.MDepRequirement;
1748  MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj);
1749  updateLeaves({Dep.MDepCommand}, Record, Req->MAccessMode, ToCleanUp);
1750  addNodeToLeaves(Record, NewCmd.get(), Req->MAccessMode, ToEnqueue);
1751  }
1752 
1753  // Register all the events as dependencies
1754  for (detail::EventImplPtr e : Events) {
1755  if (e->getCommand() &&
1756  e->getCommand() == static_cast<Command *>(NewCmd.get())) {
1757  continue;
1758  }
1759  if (Command *ConnCmd = NewCmd->addDep(e, ToCleanUp))
1760  ToEnqueue.push_back(ConnCmd);
1761  }
1762 
1763  if (MPrintOptionsArray[AfterAddCG])
1764  printGraphAsDot("after_addCG");
1765 
1766  for (Command *Cmd : ToCleanUp) {
1767  cleanupCommand(Cmd);
1768  }
1769 
1770  return NewCmd.release();
1771 }
1772 
1773 } // namespace detail
1774 } // namespace _V1
1775 } // namespace sycl
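addCommandGraphUpdate backs the whole-graph update path of the SYCL graphs extension: the nodes of an already-finalized executable graph are retargeted to new requirements, and a single UpdateCommandBufferCommand carrying those requirements and event dependencies is inserted into the execution graph. A hedged sketch of the API that drives it, assuming the sycl_ext_oneapi_graph interface (command_graph, begin_recording/end_recording, finalize with the updatable property, queue::ext_oneapi_graph, and the executable graph's update(...)); exact spellings may vary across extension revisions.

#include <sycl/sycl.hpp>

namespace syclex = sycl::ext::oneapi::experimental;

int main() {
  sycl::queue Q;
  constexpr size_t N = 1024;
  int *PtrA = sycl::malloc_device<int>(N, Q);
  int *PtrB = sycl::malloc_device<int>(N, Q);

  // Record a graph whose single kernel writes through PtrA.
  syclex::command_graph GraphA{Q.get_context(), Q.get_device()};
  GraphA.begin_recording(Q);
  Q.parallel_for(sycl::range<1>{N},
                 [=](sycl::id<1> I) { PtrA[I[0]] = static_cast<int>(I[0]); });
  GraphA.end_recording(Q);
  auto ExecGraph = GraphA.finalize(syclex::property::graph::updatable{});

  // Record a structurally identical graph that writes through PtrB instead.
  syclex::command_graph GraphB{Q.get_context(), Q.get_device()};
  GraphB.begin_recording(Q);
  Q.parallel_for(sycl::range<1>{N},
                 [=](sycl::id<1> I) { PtrB[I[0]] = static_cast<int>(I[0]); });
  GraphB.end_recording(Q);

  Q.ext_oneapi_graph(ExecGraph).wait();

  // Whole-graph update: retarget ExecGraph's nodes to GraphB's arguments.
  // This is the request that the scheduler turns into addCommandGraphUpdate.
  ExecGraph.update(GraphB);
  Q.ext_oneapi_graph(ExecGraph).wait();

  sycl::free(PtrA, Q);
  sycl::free(PtrB, Q);
  return 0;
}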
Base class for memory allocation commands.
Definition: commands.hpp:458
const Requirement * getRequirement() const final
Definition: commands.hpp:469
AllocaCommandBase * MLinkedAllocaCmd
Alloca command linked with current command.
Definition: commands.hpp:486
bool MIsActive
Indicates that current alloca is active one.
Definition: commands.hpp:488
The AllocaSubBuf command enqueues creation of sub-buffer of memory object.
Definition: commands.hpp:523
std::unique_ptr< HostTask > MHostTask
Definition: cg.hpp:338
"Update host" command group class.
Definition: cg.hpp:260
Base class for all types of command groups.
Definition: cg.hpp:53
CGTYPE getType() const
Definition: cg.hpp:128
The Command class represents some action that needs to be performed on one or more memory objects.
Definition: commands.hpp:107
bool isSuccessfullyEnqueued() const
Definition: commands.hpp:158
const std::vector< EventImplPtr > & getPreparedDepsEvents() const
Definition: commands.hpp:300
const std::vector< EventImplPtr > & getPreparedHostDepsEvents() const
Definition: commands.hpp:296
unsigned MLeafCounter
Counts the number of memory objects this command is a leaf for.
Definition: commands.hpp:326
std::unordered_set< Command * > MUsers
Contains list of commands that depend on the command.
Definition: commands.hpp:322
virtual const ContextImplPtr & getWorkerContext() const
Get the context of the queue this command will be submitted to.
Definition: commands.cpp:741
Marks MMarks
Used for marking the node during graph traversal.
Definition: commands.hpp:335
std::vector< DepDesc > MDeps
Contains list of dependencies(edges)
Definition: commands.hpp:320
virtual bool readyForCleanup() const
Returns true iff this command is ready to be submitted for cleanup.
Definition: commands.cpp:754
void addUser(Command *NewUser)
Definition: commands.hpp:141
void clearAllDependencies()
Clear all dependency events This should only be used if a command is about to be deleted without bein...
Definition: commands.hpp:313
const EventImplPtr & getEvent() const
Definition: commands.hpp:180
CommandType getType() const
Definition: commands.hpp:144
virtual void printDot(std::ostream &Stream) const =0
Command * addDep(DepDesc NewDep, std::vector< Command * > &ToCleanUp)
Definition: commands.cpp:759
const QueueImplPtr & getQueue() const
Definition: commands.hpp:178
The empty command does nothing during enqueue.
Definition: commands.hpp:416
The exec CG command enqueues execution of kernel or explicit memory operation.
Definition: commands.hpp:647
detail::CG & getCG() const
Definition: commands.hpp:662
The KernelFusionCommand is placed in the execution graph together with the individual kernels of the ...
Definition: commands.hpp:724
A wrapper for CircularBuffer class along with collection for host accessor's EmptyCommands.
size_t remove(value_type Cmd)
Replacement for std::remove with subsequent call to erase(newEnd, end()).
std::function< void(Command *, Command *, MemObjRecord *, EnqueueListT &)> AllocateDependencyF
std::vector< value_type > toVector() const
std::vector< Command * > EnqueueListT
The map command enqueues mapping of device memory onto host memory.
Definition: commands.hpp:542
The mem copy host command enqueues memory copy between two instances of memory object.
Definition: commands.hpp:605
const Requirement * getRequirement() const final
Definition: commands.hpp:612
virtual ContextImplPtr getInteropContext() const =0
std::shared_ptr< MemObjRecord > MRecord
void cleanupCommand(Command *Cmd, bool AllowUnsubmitted=false)
void decrementLeafCountersForRecord(MemObjRecord *Record)
Decrements leaf counters for all leaves of the record.
MemObjRecord * getMemObjRecord(SYCLMemObjI *MemObject)
EventImplPtr completeFusion(QueueImplPtr Queue, std::vector< Command * > &ToEnqueue, const property_list &)
Command * addHostAccessor(Requirement *Req, std::vector< Command * > &ToEnqueue)
Enqueues a command to create a host accessor.
void cleanupCommandsForRecord(MemObjRecord *Record)
Removes commands that use the given MemObjRecord from the graph.
GraphBuildResult addCG(std::unique_ptr< detail::CG > CommandGroup, const QueueImplPtr &Queue, std::vector< Command * > &ToEnqueue, sycl::detail::pi::PiExtCommandBuffer CommandBuffer=nullptr, const std::vector< sycl::detail::pi::PiExtSyncPoint > &Dependencies={})
Registers command group and adds it to the dependency graph.
void removeRecordForMemObj(SYCLMemObjI *MemObject)
Removes the MemObjRecord for the memory object passed.
Command * addCopyBack(Requirement *Req, std::vector< Command * > &ToEnqueue)
Enqueues a command to update memory to the latest state.
Command * connectDepEvent(Command *const Cmd, const EventImplPtr &DepEvent, const DepDesc &Dep, std::vector< Command * > &ToCleanUp)
Perform connection of events in multiple contexts.
Command * addCGUpdateHost(std::unique_ptr< detail::CG > CommandGroup, const QueueImplPtr &HostQueue, std::vector< Command * > &ToEnqueue)
Registers a command group that updates host memory to the latest state.
MemObjRecord * getOrInsertMemObjRecord(const QueueImplPtr &Queue, const Requirement *Req, std::vector< Command * > &ToEnqueue)
void cancelFusion(QueueImplPtr Queue, std::vector< Command * > &ToEnqueue)
void addNodeToLeaves(MemObjRecord *Record, Command *Cmd, access::mode AccessMode, std::vector< Command * > &ToEnqueue)
Adds new command to leaves if needed.
void cleanUpCmdFusion(sycl::detail::queue_impl *Queue)
Clean up the internal fusion commands held for the given queue.
AllocaCommandBase * findAllocaForReq(MemObjRecord *Record, const Requirement *Req, const ContextImplPtr &Context, bool AllowConst=true)
Searches for suitable alloca in memory record.
DepDesc findDepForRecord(Command *Cmd, MemObjRecord *Record)
Finds a command dependency corresponding to the record.
void updateLeaves(const std::set< Command * > &Cmds, MemObjRecord *Record, access::mode AccessMode, std::vector< Command * > &ToCleanUp)
Removes commands from leaves.
QueueImplPtr getDefaultHostQueue()
Definition: scheduler.hpp:453
bool isInFusionMode(QueueIdT Queue)
Definition: scheduler.cpp:645
void cancelFusion(QueueImplPtr Queue)
Definition: scheduler.cpp:621
static MemObjRecord * getMemObjRecord(const Requirement *const Req)
Definition: scheduler.cpp:428
static Scheduler & getInstance()
Definition: scheduler.cpp:261
void cleanUpCmdFusion(sycl::detail::queue_impl *Queue)
Definition: scheduler.cpp:614
void takeAuxiliaryResources(const EventImplPtr &Dst, const EventImplPtr &Src)
Assign Src's auxiliary resources to Dst.
Definition: scheduler.cpp:574
The unmap command removes mapping of host memory onto device memory.
Definition: commands.hpp:561
const Requirement * getRequirement() const final
Definition: commands.hpp:711
std::unique_ptr< detail::CG > fuseKernels(QueueImplPtr Queue, std::vector< ExecCGCommand * > &InputKernels, const property_list &)
static jit_compiler & get_instance()
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Definition: device.hpp:64
Class representing the implementation of command_graph<executable>.
Objects of the property_list class are containers for the SYCL properties.
size_t size() const
Definition: range.hpp:56
__SYCL_EXTERN_STREAM_ATTRS ostream cerr
Linked to standard error (unbuffered)
static bool doOverlap(const Requirement *LHS, const Requirement *RHS)
Checks whether two requirements overlap or not.
static bool createsCircularDependency(Command *Cmd, bool PredPartOfFusion, KernelFusionCommand *Fusion)
static bool isInteropHostTask(ExecCGCommand *Cmd)
static void unmarkVisitedNodes(std::vector< Command * > &Visited)
static bool checkForCircularDependency(Command *, bool, KernelFusionCommand *)
std::hash< std::shared_ptr< detail::queue_impl > >::result_type QueueIdT
Definition: scheduler.hpp:191
static access::mode combineAccessModes(access::mode A, access::mode B)
Combines two access modes into a single one that allows both.
static bool markNodeAsVisited(Command *Cmd, std::vector< Command * > &Visited)
std::shared_ptr< sycl::detail::context_impl > ContextImplPtr
Definition: event_impl.hpp:32
std::unique_ptr< KernelFusionCommand > FusionList
Definition: scheduler.hpp:193
decltype(Obj::impl) getSyclObjImpl(const Obj &SyclObject)
Definition: impl_utils.hpp:30
static bool isPartOfFusion(Command *Cmd, KernelFusionCommand *Fusion)
std::shared_ptr< event_impl > EventImplPtr
Definition: cg.hpp:43
static void printDotRecursive(std::fstream &Stream, std::vector< Command * > &Visited, Command *Cmd)
AccessorImplHost Requirement
std::shared_ptr< device_impl > DeviceImplPtr
static void handleVisitedNodes(std::vector< Command * > &Visited)
static bool IsSuitableSubReq(const Requirement *Req)
Checks if current requirement is requirement for sub buffer.
static Command * insertMapUnmapForLinkedCmds(AllocaCommandBase *AllocaCmdSrc, AllocaCommandBase *AllocaCmdDst, access::mode MapMode)
static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx)
std::shared_ptr< sycl::detail::queue_impl > QueueImplPtr
Definition: event_impl.hpp:34
static bool isAccessModeAllowed(access::mode Required, access::mode Current)
Checks if the required access mode is allowed under the current one.
static void combineAccessModesOfReqs(std::vector< Requirement * > &Reqs)
static bool sameCtx(const ContextImplPtr &LHS, const ContextImplPtr &RHS)
constexpr if(sizeof(T)==8)
class __SYCL_EBO __SYCL_SPECIAL_CLASS __SYCL_TYPE(local_accessor) local_accessor class __SYCL_EBO __SYCL_SPECIAL_CLASS AccessMode
Definition: accessor.hpp:3233
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using e and sycl_category()
Definition: exception.cpp:87
Definition: access.hpp:18
bool any_of(const simd_mask< _Tp, _Abi > &) noexcept
bool MVisited
Used for marking the node as visited during graph traversal.
Definition: commands.hpp:330
bool MToBeDeleted
Used for marking the node for deletion during cleanup.
Definition: commands.hpp:332
Dependency between two commands.
Definition: commands.hpp:81
const Requirement * MDepRequirement
Requirement for the dependency.
Definition: commands.hpp:94
Command * MDepCommand
The actual dependency command.
Definition: commands.hpp:92
AllocaCommandBase * MAllocaCmd
Allocation command for the memory object we have requirement for.
Definition: commands.hpp:97
Memory Object Record.
Definition: scheduler.hpp:202
std::vector< AllocaCommandBase * > MAllocaCommands
Definition: scheduler.hpp:209