1 //===-- graph_builder.cpp - SYCL Graph Builder ------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "detail/config.hpp"
10 #include <detail/context_impl.hpp>
11 #include <detail/event_impl.hpp>
12 #include <sstream>
13 #include <sycl/feature_test.hpp>
14 #if SYCL_EXT_CODEPLAY_KERNEL_FUSION
15 #include <detail/jit_compiler.hpp>
16 #endif
18 #include <detail/queue_impl.hpp>
21 #include <sycl/access/access.hpp>
22 #include <sycl/exception.hpp>
23 
24 #include <algorithm>
25 #include <cstdlib>
26 #include <cstring>
27 #include <fstream>
28 #include <map>
29 #include <memory>
30 #include <queue>
31 #include <set>
32 #include <vector>
33 
34 namespace sycl {
35 inline namespace _V1 {
36 namespace detail {
37 
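38 /// Checks whether two requirements overlap or not.
39 ///
40 /// This information can be used to prove that executing two kernels that
41 /// work on different parts of the memory object in parallel is legal.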
42 // TODO merge with LeavesCollection's version of doOverlap (see
43 // leaves_collection.cpp).
44 static bool doOverlap(const Requirement *LHS, const Requirement *RHS) {
45  return (LHS->MOffsetInBytes + LHS->MAccessRange.size() * LHS->MElemSize >=
46  RHS->MOffsetInBytes) ||
47  (RHS->MOffsetInBytes + RHS->MAccessRange.size() * RHS->MElemSize >=
48  LHS->MOffsetInBytes);
49 }
50 
51 static bool sameCtx(const ContextImplPtr &LHS, const ContextImplPtr &RHS) {
52  // Consider two different host contexts to be the same to avoid additional
53  // allocation on the host
54  return LHS == RHS || (LHS->is_host() && RHS->is_host());
55 }
56 
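57 /// Checks if current requirement is requirement for sub buffer.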
58 static bool IsSuitableSubReq(const Requirement *Req) {
59  return Req->MIsSubBuffer;
60 }
61 
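62 /// Checks if the required access mode is allowed under the current one.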
63 static bool isAccessModeAllowed(access::mode Required, access::mode Current) {
64  switch (Current) {
65  case access::mode::read:
66  return (Required == Current);
67  case access::mode::write:
68  assert(false && "Write only access is expected to be mapped as read_write");
69  return (Required == Current || Required == access::mode::discard_write);
70  case access::mode::read_write:
71  case access::mode::atomic:
72  case access::mode::discard_write:
73  case access::mode::discard_read_write:
74  return true;
75  }
76  assert(false);
77  return false;
78 }
79 
80 /// Combines two access modes into a single one that allows both.
81 static access::mode combineAccessModes(access::mode A, access::mode B) {
82  if (A == B)
83  return A;
84 
85  if (A == access::mode::discard_write &&
86  (B == access::mode::discard_read_write || B == access::mode::write))
87  return B;
88 
89  if (B == access::mode::discard_write &&
90  (A == access::mode::discard_read_write || A == access::mode::write))
91  return A;
92 
93  return access::mode::read_write;
94 }
95 
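// The constructor below enables the graph dump points from the
// SYCL_PRINT_EXECUTION_GRAPH environment variable: the value is searched for
// the substrings "before_addCG", "after_addCG", "before_addCopyBack",
// "after_addCopyBack", "before_addHostAcc", "after_addHostAcc",
// "after_fusionComplete" and "after_fusionCancel"; "always" enables all of
// them. Each enabled point dumps the graph via printGraphAsDot(), which
// writes a graph_<counter><mode>.dot file.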
96 Scheduler::GraphBuilder::GraphBuilder() {
97  if (const char *EnvVarCStr = SYCLConfig<SYCL_PRINT_EXECUTION_GRAPH>::get()) {
98  std::string GraphPrintOpts(EnvVarCStr);
99  bool EnableAlways = GraphPrintOpts.find("always") != std::string::npos;
100 
101  if (GraphPrintOpts.find("before_addCG") != std::string::npos ||
102  EnableAlways)
103  MPrintOptionsArray[BeforeAddCG] = true;
104  if (GraphPrintOpts.find("after_addCG") != std::string::npos || EnableAlways)
105  MPrintOptionsArray[AfterAddCG] = true;
106  if (GraphPrintOpts.find("before_addCopyBack") != std::string::npos ||
107  EnableAlways)
108  MPrintOptionsArray[BeforeAddCopyBack] = true;
109  if (GraphPrintOpts.find("after_addCopyBack") != std::string::npos ||
110  EnableAlways)
111  MPrintOptionsArray[AfterAddCopyBack] = true;
112  if (GraphPrintOpts.find("before_addHostAcc") != std::string::npos ||
113  EnableAlways)
114  MPrintOptionsArray[BeforeAddHostAcc] = true;
115  if (GraphPrintOpts.find("after_addHostAcc") != std::string::npos ||
116  EnableAlways)
117  MPrintOptionsArray[AfterAddHostAcc] = true;
118  if (GraphPrintOpts.find("after_fusionComplete") != std::string::npos ||
119  EnableAlways)
120  MPrintOptionsArray[AfterFusionComplete] = true;
121  if (GraphPrintOpts.find("after_fusionCancel") != std::string::npos ||
122  EnableAlways)
123  MPrintOptionsArray[AfterFusionCancel] = true;
124  }
125 }
126 
127 static bool markNodeAsVisited(Command *Cmd, std::vector<Command *> &Visited) {
128  assert(Cmd && "Cmd can't be nullptr");
129  if (Cmd->MMarks.MVisited)
130  return false;
131  Cmd->MMarks.MVisited = true;
132  Visited.push_back(Cmd);
133  return true;
134 }
135 
136 static void unmarkVisitedNodes(std::vector<Command *> &Visited) {
137  for (Command *Cmd : Visited)
138  Cmd->MMarks.MVisited = false;
139 }
140 
141 static void handleVisitedNodes(std::vector<Command *> &Visited) {
142  for (Command *Cmd : Visited) {
143  if (Cmd->MMarks.MToBeDeleted) {
144  if (Cmd->getType() == Command::FUSION &&
145  !static_cast<KernelFusionCommand *>(Cmd)->readyForDeletion()) {
146  // Fusion commands might still be needed because fusion might be
147  // aborted, but a later call to complete_fusion still needs to be able
148  // to return a valid event. Clean-up of fusion commands is therefore
149  // explicitly handled by start fusion.
150  return;
151  }
152  Cmd->getEvent()->setCommand(nullptr);
153  delete Cmd;
154  } else
155  Cmd->MMarks.MVisited = false;
156  }
157 }
158 
159 static void printDotRecursive(std::fstream &Stream,
160  std::vector<Command *> &Visited, Command *Cmd) {
161  if (!markNodeAsVisited(Cmd, Visited))
162  return;
163  for (Command *User : Cmd->MUsers) {
164  if (User)
165  printDotRecursive(Stream, Visited, User);
166  }
167  Cmd->printDot(Stream);
168 }
169 
170 void Scheduler::GraphBuilder::printGraphAsDot(const char *ModeName) {
171  static size_t Counter = 0;
172  std::string ModeNameStr(ModeName);
173  std::string FileName =
174  "graph_" + std::to_string(Counter) + ModeNameStr + ".dot";
175 
176  Counter++;
177 
178  std::fstream Stream(FileName, std::ios::out);
179  Stream << "strict digraph {" << std::endl;
180 
181  MVisitedCmds.clear();
182 
183  for (SYCLMemObjI *MemObject : MMemObjs)
184  for (Command *AllocaCmd : MemObject->MRecord->MAllocaCommands)
185  printDotRecursive(Stream, MVisitedCmds, AllocaCmd);
186 
187  Stream << "}" << std::endl;
188 
189  unmarkVisitedNodes(MVisitedCmds);
190 }
191 
192 MemObjRecord *Scheduler::GraphBuilder::getMemObjRecord(SYCLMemObjI *MemObject) {
193  return MemObject->MRecord.get();
194 }
195 
196 MemObjRecord *Scheduler::GraphBuilder::getOrInsertMemObjRecord(
197  const QueueImplPtr &Queue, const Requirement *Req,
198  std::vector<Command *> &ToEnqueue) {
199  SYCLMemObjI *MemObject = Req->MSYCLMemObj;
200  MemObjRecord *Record = getMemObjRecord(MemObject);
201 
202  if (nullptr != Record)
203  return Record;
204 
205  const size_t LeafLimit = 8;
206  LeavesCollection::AllocateDependencyF AllocateDependency =
207  [this](Command *Dependant, Command *Dependency, MemObjRecord *Record,
208  LeavesCollection::EnqueueListT &ToEnqueue) {
209  // Add the old leaf as a dependency for the new one by duplicating one
210  // of the requirements for the current record
211  DepDesc Dep = findDepForRecord(Dependant, Record);
212  Dep.MDepCommand = Dependency;
213  std::vector<Command *> ToCleanUp;
214  Command *ConnectionCmd = Dependant->addDep(Dep, ToCleanUp);
215  if (ConnectionCmd)
216  ToEnqueue.push_back(ConnectionCmd);
217 
218  --(Dependency->MLeafCounter);
219  if (Dependency->readyForCleanup())
220  ToCleanUp.push_back(Dependency);
221  for (Command *Cmd : ToCleanUp)
222  cleanupCommand(Cmd);
223  };
224 
225  const ContextImplPtr &InteropCtxPtr = Req->MSYCLMemObj->getInteropContext();
226  if (InteropCtxPtr) {
227  // The memory object has been constructed using an interoperability
228  // constructor, which means there is already an allocation (cl_mem) in some
229  // context. Register this allocation in the SYCL graph.
230 
231  std::vector<sycl::device> Devices =
232  InteropCtxPtr->get_info<info::context::devices>();
233  assert(Devices.size() != 0);
234  DeviceImplPtr Dev = detail::getSyclObjImpl(Devices[0]);
235 
236  // Since all the Scheduler commands require a queue, but only a context is
237  // available here, we need to create a dummy queue bound to the context and
238  // one of the devices from the context.
239  QueueImplPtr InteropQueuePtr{new detail::queue_impl{
240  Dev, InteropCtxPtr, /*AsyncHandler=*/{}, /*PropertyList=*/{}}};
241 
242  MemObject->MRecord.reset(
243  new MemObjRecord{InteropCtxPtr, LeafLimit, AllocateDependency});
244  getOrCreateAllocaForReq(MemObject->MRecord.get(), Req, InteropQueuePtr,
245  ToEnqueue);
246  } else
247  MemObject->MRecord.reset(new MemObjRecord{Queue->getContextImplPtr(),
248  LeafLimit, AllocateDependency});
249 
250  MMemObjs.push_back(MemObject);
251  return MemObject->MRecord.get();
252 }
253 
254 void Scheduler::GraphBuilder::updateLeaves(const std::set<Command *> &Cmds,
255  MemObjRecord *Record,
256  access::mode AccessMode,
257  std::vector<Command *> &ToCleanUp) {
258 
259  const bool ReadOnlyReq = AccessMode == access::mode::read;
260  if (ReadOnlyReq)
261  return;
262 
263  for (Command *Cmd : Cmds) {
264  bool WasLeaf = Cmd->MLeafCounter > 0;
265  Cmd->MLeafCounter -= Record->MReadLeaves.remove(Cmd);
266  Cmd->MLeafCounter -= Record->MWriteLeaves.remove(Cmd);
267  if (WasLeaf && Cmd->readyForCleanup()) {
268  ToCleanUp.push_back(Cmd);
269  }
270  }
271 }
272 
273 void Scheduler::GraphBuilder::addNodeToLeaves(
274  MemObjRecord *Record, Command *Cmd, access::mode AccessMode,
275  std::vector<Command *> &ToEnqueue) {
276  LeavesCollection &Leaves{AccessMode == access::mode::read
277  ? Record->MReadLeaves
278  : Record->MWriteLeaves};
279  if (Leaves.push_back(Cmd, ToEnqueue))
280  ++Cmd->MLeafCounter;
281 }
282 
283 UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd(
284  MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue,
285  std::vector<Command *> &ToEnqueue) {
286  AllocaCommandBase *AllocaCmd =
287  findAllocaForReq(Record, Req, Queue->getContextImplPtr());
288  assert(AllocaCmd && "There must be alloca for requirement!");
289  UpdateHostRequirementCommand *UpdateCommand =
290  new UpdateHostRequirementCommand(Queue, *Req, AllocaCmd, &Req->MData);
291  // A copy of the requirement is needed: if it were stored by pointer, the
292  // dependencies would become invalid after the host accessor's destructor runs.
293  const Requirement *StoredReq = UpdateCommand->getRequirement();
294 
295  std::set<Command *> Deps =
296  findDepsForReq(Record, Req, Queue->getContextImplPtr());
297  std::vector<Command *> ToCleanUp;
298  for (Command *Dep : Deps) {
299  Command *ConnCmd =
300  UpdateCommand->addDep(DepDesc{Dep, StoredReq, AllocaCmd}, ToCleanUp);
301  if (ConnCmd)
302  ToEnqueue.push_back(ConnCmd);
303  }
304  updateLeaves(Deps, Record, Req->MAccessMode, ToCleanUp);
305  addNodeToLeaves(Record, UpdateCommand, Req->MAccessMode, ToEnqueue);
306  for (Command *Cmd : ToCleanUp)
307  cleanupCommand(Cmd);
308  return UpdateCommand;
309 }
310 
311 // Takes linked alloca commands. Makes AllocaCmdDst command active using map
312 // or unmap operation.
313 static Command *insertMapUnmapForLinkedCmds(AllocaCommandBase *AllocaCmdSrc,
314  AllocaCommandBase *AllocaCmdDst,
315  access::mode MapMode) {
316  assert(AllocaCmdSrc->MLinkedAllocaCmd == AllocaCmdDst &&
317  "Expected linked alloca commands");
318  assert(AllocaCmdSrc->MIsActive &&
319  "Expected source alloca command to be active");
320 
321  if (AllocaCmdSrc->getQueue()->is_host()) {
322  UnMapMemObject *UnMapCmd = new UnMapMemObject(
323  AllocaCmdDst, *AllocaCmdDst->getRequirement(),
324  &AllocaCmdSrc->MMemAllocation, AllocaCmdDst->getQueue());
325 
326  std::swap(AllocaCmdSrc->MIsActive, AllocaCmdDst->MIsActive);
327 
328  return UnMapCmd;
329  }
330 
331  MapMemObject *MapCmd = new MapMemObject(
332  AllocaCmdSrc, *AllocaCmdSrc->getRequirement(),
333  &AllocaCmdDst->MMemAllocation, AllocaCmdSrc->getQueue(), MapMode);
334 
335  std::swap(AllocaCmdSrc->MIsActive, AllocaCmdDst->MIsActive);
336 
337  return MapCmd;
338 }
339 
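// Inserts the command that brings the memory object up to date in the context
// of Queue: a map/unmap operation for linked allocations, or a full-buffer
// copy otherwise. For discard access modes no data transfer is needed and only
// the record's current context is switched.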
340 Command *Scheduler::GraphBuilder::insertMemoryMove(
341  MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue,
342  std::vector<Command *> &ToEnqueue) {
343 
344  AllocaCommandBase *AllocaCmdDst =
345  getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue);
346  if (!AllocaCmdDst)
347  throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
348 
349  std::set<Command *> Deps =
350  findDepsForReq(Record, Req, Queue->getContextImplPtr());
351  Deps.insert(AllocaCmdDst);
352  // Get parent allocation of sub buffer to perform full copy of whole buffer
353  if (IsSuitableSubReq(Req)) {
354  if (AllocaCmdDst->getType() == Command::CommandType::ALLOCA_SUB_BUF)
355  AllocaCmdDst =
356  static_cast<AllocaSubBufCommand *>(AllocaCmdDst)->getParentAlloca();
357  }
358 
359  AllocaCommandBase *AllocaCmdSrc =
360  findAllocaForReq(Record, Req, Record->MCurContext);
361  if (!AllocaCmdSrc && IsSuitableSubReq(Req)) {
362  // Since no alloca command for the sub buffer requirement was found in the
363  // current context, need to find a parent alloca command for it (it must be
364  // there)
365  auto IsSuitableAlloca = [Record](AllocaCommandBase *AllocaCmd) {
366  bool Res = sameCtx(AllocaCmd->getQueue()->getContextImplPtr(),
367  Record->MCurContext) &&
368  // Looking for a parent buffer alloca command
369  AllocaCmd->getType() == Command::CommandType::ALLOCA;
370  return Res;
371  };
372  const auto It =
373  std::find_if(Record->MAllocaCommands.begin(),
374  Record->MAllocaCommands.end(), IsSuitableAlloca);
375  AllocaCmdSrc = (Record->MAllocaCommands.end() != It) ? *It : nullptr;
376  }
377  if (!AllocaCmdSrc)
378  throw runtime_error("Cannot find buffer allocation",
379  PI_ERROR_INVALID_VALUE);
380  // Get parent allocation of sub buffer to perform full copy of whole buffer
381  if (IsSuitableSubReq(Req)) {
382  if (AllocaCmdSrc->getType() == Command::CommandType::ALLOCA_SUB_BUF)
383  AllocaCmdSrc =
384  static_cast<AllocaSubBufCommand *>(AllocaCmdSrc)->getParentAlloca();
385  else if (AllocaCmdSrc->getSYCLMemObj() != Req->MSYCLMemObj)
386  assert(false && "Inappropriate alloca command.");
387  }
388 
389  Command *NewCmd = nullptr;
390 
391  if (AllocaCmdSrc->MLinkedAllocaCmd == AllocaCmdDst) {
392  // Map write only as read-write
393  access::mode MapMode = Req->MAccessMode;
394  if (MapMode == access::mode::write)
395  MapMode = access::mode::read_write;
396  NewCmd = insertMapUnmapForLinkedCmds(AllocaCmdSrc, AllocaCmdDst, MapMode);
397  Record->MHostAccess = MapMode;
398  } else {
399 
400  if ((Req->MAccessMode == access::mode::discard_write) ||
401  (Req->MAccessMode == access::mode::discard_read_write)) {
402  Record->MCurContext = Queue->getContextImplPtr();
403  return nullptr;
404  } else {
405  // Full copy of buffer is needed to avoid loss of data that may be caused
406  // by copying specific range from host to device and backwards.
407  NewCmd =
408  new MemCpyCommand(*AllocaCmdSrc->getRequirement(), AllocaCmdSrc,
409  *AllocaCmdDst->getRequirement(), AllocaCmdDst,
410  AllocaCmdSrc->getQueue(), AllocaCmdDst->getQueue());
411  }
412  }
413  std::vector<Command *> ToCleanUp;
414  for (Command *Dep : Deps) {
415  Command *ConnCmd = NewCmd->addDep(
416  DepDesc{Dep, NewCmd->getRequirement(), AllocaCmdDst}, ToCleanUp);
417  if (ConnCmd)
418  ToEnqueue.push_back(ConnCmd);
419  }
420  updateLeaves(Deps, Record, access::mode::read_write, ToCleanUp);
421  addNodeToLeaves(Record, NewCmd, access::mode::read_write, ToEnqueue);
422  for (Command *Cmd : ToCleanUp)
423  cleanupCommand(Cmd);
424  Record->MCurContext = Queue->getContextImplPtr();
425  return NewCmd;
426 }
427 
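// Re-establishes the host mapping of a linked allocation with a different
// access mode: the device allocation is unmapped and then mapped again with
// the mode required by Req (write is widened to read_write).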
428 Command *Scheduler::GraphBuilder::remapMemoryObject(
429  MemObjRecord *Record, Requirement *Req, AllocaCommandBase *HostAllocaCmd,
430  std::vector<Command *> &ToEnqueue) {
431  assert(HostAllocaCmd->getQueue()->is_host() &&
432  "Host alloca command expected");
433  assert(HostAllocaCmd->MIsActive && "Active alloca command expected");
434 
435  AllocaCommandBase *LinkedAllocaCmd = HostAllocaCmd->MLinkedAllocaCmd;
436  assert(LinkedAllocaCmd && "Linked alloca command expected");
437 
438  std::set<Command *> Deps = findDepsForReq(Record, Req, Record->MCurContext);
439 
440  UnMapMemObject *UnMapCmd = new UnMapMemObject(
441  LinkedAllocaCmd, *LinkedAllocaCmd->getRequirement(),
442  &HostAllocaCmd->MMemAllocation, LinkedAllocaCmd->getQueue());
443 
444  // Map write only as read-write
445  access::mode MapMode = Req->MAccessMode;
446  if (MapMode == access::mode::write)
447  MapMode = access::mode::read_write;
448  MapMemObject *MapCmd = new MapMemObject(
449  LinkedAllocaCmd, *LinkedAllocaCmd->getRequirement(),
450  &HostAllocaCmd->MMemAllocation, LinkedAllocaCmd->getQueue(), MapMode);
451 
452  std::vector<Command *> ToCleanUp;
453  for (Command *Dep : Deps) {
454  Command *ConnCmd = UnMapCmd->addDep(
455  DepDesc{Dep, UnMapCmd->getRequirement(), LinkedAllocaCmd}, ToCleanUp);
456  if (ConnCmd)
457  ToEnqueue.push_back(ConnCmd);
458  }
459 
460  Command *ConnCmd = MapCmd->addDep(
461  DepDesc{UnMapCmd, MapCmd->getRequirement(), HostAllocaCmd}, ToCleanUp);
462  if (ConnCmd)
463  ToEnqueue.push_back(ConnCmd);
464 
465  updateLeaves(Deps, Record, access::mode::read_write, ToCleanUp);
466  addNodeToLeaves(Record, MapCmd, access::mode::read_write, ToEnqueue);
467  for (Command *Cmd : ToCleanUp)
468  cleanupCommand(Cmd);
469  Record->MHostAccess = MapMode;
470  return MapCmd;
471 }
472 
473 // The function adds a copy operation of the most up-to-date memory to the
474 // memory pointed to by Req.
475 Command *
476 Scheduler::GraphBuilder::addCopyBack(Requirement *Req,
477  std::vector<Command *> &ToEnqueue) {
478  QueueImplPtr HostQueue = Scheduler::getInstance().getDefaultHostQueue();
479  SYCLMemObjI *MemObj = Req->MSYCLMemObj;
480  MemObjRecord *Record = getMemObjRecord(MemObj);
481  if (Record && MPrintOptionsArray[BeforeAddCopyBack])
482  printGraphAsDot("before_addCopyBack");
483 
484  // Do nothing if there were no or only read operations with the memory object.
485  if (nullptr == Record || !Record->MMemModified)
486  return nullptr;
487 
488  std::set<Command *> Deps =
489  findDepsForReq(Record, Req, HostQueue->getContextImplPtr());
490  AllocaCommandBase *SrcAllocaCmd =
491  findAllocaForReq(Record, Req, Record->MCurContext);
492 
493  auto MemCpyCmdUniquePtr = std::make_unique<MemCpyCommandHost>(
494  *SrcAllocaCmd->getRequirement(), SrcAllocaCmd, *Req, &Req->MData,
495  SrcAllocaCmd->getQueue(), std::move(HostQueue));
496 
497  if (!MemCpyCmdUniquePtr)
498  throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
499 
500  MemCpyCommandHost *MemCpyCmd = MemCpyCmdUniquePtr.release();
501 
502  std::vector<Command *> ToCleanUp;
503  for (Command *Dep : Deps) {
504  Command *ConnCmd = MemCpyCmd->addDep(
505  DepDesc{Dep, MemCpyCmd->getRequirement(), SrcAllocaCmd}, ToCleanUp);
506  if (ConnCmd)
507  ToEnqueue.push_back(ConnCmd);
508  }
509 
510  updateLeaves(Deps, Record, Req->MAccessMode, ToCleanUp);
511  addNodeToLeaves(Record, MemCpyCmd, Req->MAccessMode, ToEnqueue);
512  for (Command *Cmd : ToCleanUp)
513  cleanupCommand(Cmd);
514  if (MPrintOptionsArray[AfterAddCopyBack])
515  printGraphAsDot("after_addCopyBack");
516  return MemCpyCmd;
517 }
518 
519 // The function implements SYCL host accessor logic: host accessor
520 // should provide access to the buffer in user space.
521 Command *
522 Scheduler::GraphBuilder::addHostAccessor(Requirement *Req,
523  std::vector<Command *> &ToEnqueue) {
524 
525  if (Req->MAccessMode != sycl::access_mode::read) {
526  auto SYCLMemObj = static_cast<detail::SYCLMemObjT *>(Req->MSYCLMemObj);
527  SYCLMemObj->handleWriteAccessorCreation();
528  }
529 
530  const QueueImplPtr &HostQueue = getInstance().getDefaultHostQueue();
531 
532  MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req, ToEnqueue);
533  if (MPrintOptionsArray[BeforeAddHostAcc])
534  printGraphAsDot("before_addHostAccessor");
535  markModifiedIfWrite(Record, Req);
536 
537  AllocaCommandBase *HostAllocaCmd =
538  getOrCreateAllocaForReq(Record, Req, HostQueue, ToEnqueue);
539 
540  if (sameCtx(HostAllocaCmd->getQueue()->getContextImplPtr(),
541  Record->MCurContext)) {
542  if (!isAccessModeAllowed(Req->MAccessMode, Record->MHostAccess)) {
543  remapMemoryObject(Record, Req,
544  Req->MIsSubBuffer ? (static_cast<AllocaSubBufCommand *>(
545  HostAllocaCmd))
546  ->getParentAlloca()
547  : HostAllocaCmd,
548  ToEnqueue);
549  }
550  } else
551  insertMemoryMove(Record, Req, HostQueue, ToEnqueue);
552 
553  Command *UpdateHostAccCmd =
554  insertUpdateHostReqCmd(Record, Req, HostQueue, ToEnqueue);
555 
556  // Need empty command to be blocked until host accessor is destructed
557  EmptyCommand *EmptyCmd =
558  addEmptyCmd(UpdateHostAccCmd, {Req}, HostQueue,
559  Command::BlockReason::HostAccessor, ToEnqueue);
560 
561  Req->MBlockedCmd = EmptyCmd;
562 
563  if (MPrintOptionsArray[AfterAddHostAcc])
564  printGraphAsDot("after_addHostAccessor");
565 
566  return UpdateHostAccCmd;
567 }
568 
569 Command *Scheduler::GraphBuilder::addCGUpdateHost(
570  std::unique_ptr<detail::CG> CommandGroup, const QueueImplPtr &HostQueue,
571  std::vector<Command *> &ToEnqueue) {
572 
573  auto UpdateHost = static_cast<CGUpdateHost *>(CommandGroup.get());
574  Requirement *Req = UpdateHost->getReqToUpdate();
575 
576  MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req, ToEnqueue);
577  return insertMemoryMove(Record, Req, HostQueue, ToEnqueue);
578 }
579 
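580 /// Start the search for the record from the list of "leaf" commands and check
581 /// if the examined command can be executed in parallel with the new one with
582 /// regard to the memory object. If it can, then continue searching through
583 /// dependencies of that command. There are several rules used:
584 ///
585 /// 1. New and examined commands only read -> can bypass
586 /// 2. New and examined commands have non-overlapping requirements -> can bypass
587 /// 3. New and examined commands have different contexts -> cannot bypass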
588 std::set<Command *>
589 Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record,
590  const Requirement *Req,
591  const ContextImplPtr &Context) {
592  std::set<Command *> RetDeps;
593  std::vector<Command *> Visited;
594  const bool ReadOnlyReq = Req->MAccessMode == access::mode::read;
595 
596  std::vector<Command *> ToAnalyze{Record->MWriteLeaves.toVector()};
597 
598  if (!ReadOnlyReq) {
599  std::vector<Command *> V{Record->MReadLeaves.toVector()};
600 
601  ToAnalyze.insert(ToAnalyze.begin(), V.begin(), V.end());
602  }
603 
604  while (!ToAnalyze.empty()) {
605  Command *DepCmd = ToAnalyze.back();
606  ToAnalyze.pop_back();
607 
608  std::vector<Command *> NewAnalyze;
609 
610  for (const DepDesc &Dep : DepCmd->MDeps) {
611  if (Dep.MDepRequirement->MSYCLMemObj != Req->MSYCLMemObj)
612  continue;
613 
614  bool CanBypassDep = false;
615  // If both only read
616  CanBypassDep |=
617  Dep.MDepRequirement->MAccessMode == access::mode::read && ReadOnlyReq;
618 
619  // If not overlap
620  CanBypassDep |= !doOverlap(Dep.MDepRequirement, Req);
621 
622  // Going through copying memory between contexts is not supported.
623  if (Dep.MDepCommand)
624  CanBypassDep &=
625  sameCtx(Context, Dep.MDepCommand->getQueue()->getContextImplPtr());
626 
627  if (!CanBypassDep) {
628  RetDeps.insert(DepCmd);
629  // No need to analyze deps of examining command as it's dependency
630  // itself.
631  NewAnalyze.clear();
632  break;
633  }
634 
635  if (markNodeAsVisited(Dep.MDepCommand, Visited))
636  NewAnalyze.push_back(Dep.MDepCommand);
637  }
638  ToAnalyze.insert(ToAnalyze.end(), NewAnalyze.begin(), NewAnalyze.end());
639  }
640  unmarkVisitedNodes(Visited);
641  return RetDeps;
642 }
643 
644 // A helper function for finding a command dependency on a specific memory
645 // object
646 DepDesc Scheduler::GraphBuilder::findDepForRecord(Command *Cmd,
647  MemObjRecord *Record) {
648  for (const DepDesc &DD : Cmd->MDeps) {
649  if (getMemObjRecord(DD.MDepRequirement->MSYCLMemObj) == Record) {
650  return DD;
651  }
652  }
653  assert(false && "No dependency found for a leaf of the record");
654  return {nullptr, nullptr, nullptr};
655 }
656 
657 // The function searches for the alloca command matching context and
658 // requirement.
659 AllocaCommandBase *Scheduler::GraphBuilder::findAllocaForReq(
660  MemObjRecord *Record, const Requirement *Req, const ContextImplPtr &Context,
661  bool AllowConst) {
662  auto IsSuitableAlloca = [&Context, Req,
663  AllowConst](AllocaCommandBase *AllocaCmd) {
664  bool Res = sameCtx(AllocaCmd->getQueue()->getContextImplPtr(), Context);
665  if (IsSuitableSubReq(Req)) {
666  const Requirement *TmpReq = AllocaCmd->getRequirement();
667  Res &= AllocaCmd->getType() == Command::CommandType::ALLOCA_SUB_BUF;
668  Res &= TmpReq->MOffsetInBytes == Req->MOffsetInBytes;
669  Res &= TmpReq->MAccessRange == Req->MAccessRange;
670  Res &= AllowConst || !AllocaCmd->MIsConst;
671  }
672  return Res;
673  };
674  const auto It = std::find_if(Record->MAllocaCommands.begin(),
675  Record->MAllocaCommands.end(), IsSuitableAlloca);
676  return (Record->MAllocaCommands.end() != It) ? *It : nullptr;
677 }
678 
679 static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx) {
680  if (const char *HUMConfig = SYCLConfig<SYCL_HOST_UNIFIED_MEMORY>::get()) {
681  if (std::strcmp(HUMConfig, "0") == 0)
682  return Ctx->is_host();
683  if (std::strcmp(HUMConfig, "1") == 0)
684  return true;
685  }
686  for (const device &Device : Ctx->getDevices()) {
687  if (!Device.get_info<info::device::host_unified_memory>())
688  return false;
689  }
690  return true;
691 }
692 
693 // The function searches for the alloca command matching context and
694 // requirement. If none exists, new allocation command is created.
695 // Note, creation of new allocation command can lead to the current context
696 // (Record->MCurContext) change.
697 AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
698  MemObjRecord *Record, const Requirement *Req, const QueueImplPtr &Queue,
699  std::vector<Command *> &ToEnqueue) {
700 
701  AllocaCommandBase *AllocaCmd = findAllocaForReq(
702  Record, Req, Queue->getContextImplPtr(), /*AllowConst=*/false);
703 
704  if (!AllocaCmd) {
705  std::vector<Command *> ToCleanUp;
706  if (IsSuitableSubReq(Req)) {
707  // Get the parent requirement. It's hard to get the parent's range right,
708  // so the full parent requirement expresses its range in bytes.
709  range<3> ParentRange{Req->MSYCLMemObj->getSizeInBytes(), 1, 1};
710  Requirement ParentRequirement(
711  /*Offset*/ {0, 0, 0}, ParentRange, ParentRange,
712  access::mode::read_write, Req->MSYCLMemObj, /*Dims*/ 1,
713  /*Working with bytes*/ sizeof(char), /*offset*/ size_t(0));
714 
715  auto *ParentAlloca =
716  getOrCreateAllocaForReq(Record, &ParentRequirement, Queue, ToEnqueue);
717  AllocaCmd = new AllocaSubBufCommand(Queue, *Req, ParentAlloca, ToEnqueue,
718  ToCleanUp);
719  } else {
720 
721  const Requirement FullReq(/*Offset*/ {0, 0, 0}, Req->MMemoryRange,
722  Req->MMemoryRange, access::mode::read_write,
723  Req->MSYCLMemObj, Req->MDims, Req->MElemSize,
724  0 /*ReMOffsetInBytes*/, false /*MIsSubBuffer*/);
725  // Can reuse user data for the first allocation. Do so if host unified
726  // memory is supported regardless of the access mode (the pointer will be
727  // reused). For devices without host unified memory the initialization
728  // will be performed as a write operation.
729  // TODO the case where the first alloca is made with a discard mode and
730  // the user pointer is read-only is still not handled: it leads to
731  // unnecessary copy on devices with unified host memory support.
732  const bool HostUnifiedMemory =
733  checkHostUnifiedMemory(Queue->getContextImplPtr());
734  SYCLMemObjI *MemObj = Req->MSYCLMemObj;
735  const bool InitFromUserData = Record->MAllocaCommands.empty() &&
736  (HostUnifiedMemory || MemObj->isInterop());
737  AllocaCommandBase *LinkedAllocaCmd = nullptr;
738 
739  // For the first allocation on a device without host unified memory we
740  // might need to also create a host alloca right away in order to perform
741  // the initial memory write.
742  if (Record->MAllocaCommands.empty()) {
743  if (!HostUnifiedMemory &&
744  Req->MAccessMode != access::mode::discard_write &&
745  Req->MAccessMode != access::mode::discard_read_write) {
746  // There's no need to make a host allocation if the buffer is not
747  // initialized with user data.
748  if (MemObj->hasUserDataPtr()) {
749  QueueImplPtr DefaultHostQueue =
750  Scheduler::getInstance().getDefaultHostQueue();
751  AllocaCommand *HostAllocaCmd = new AllocaCommand(
752  DefaultHostQueue, FullReq, true /* InitFromUserData */,
753  nullptr /* LinkedAllocaCmd */,
754  MemObj->isHostPointerReadOnly() /* IsConst */);
755  Record->MAllocaCommands.push_back(HostAllocaCmd);
756  Record->MWriteLeaves.push_back(HostAllocaCmd, ToEnqueue);
757  ++(HostAllocaCmd->MLeafCounter);
758  Record->MCurContext = DefaultHostQueue->getContextImplPtr();
759  }
760  }
761  } else {
762  // If it is not the first allocation, try to setup a link
763  // FIXME: Temporary limitation, linked alloca commands for an image is
764  // not supported because map operation is not implemented for an image.
765  if (Req->MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer)
766  // Current limitation is to setup link between current allocation and
767  // new one. There could be situations when we could setup link with
768  // "not" current allocation, but it will require memory copy.
769  // Can setup link between cl and host allocations only
770  if (Queue->is_host() != Record->MCurContext->is_host()) {
771  // Linked commands assume that the host allocation is reused by the
772  // plugin runtime and that can lead to unnecessary copy overhead on
773  // devices that do not support host unified memory. Do not link the
774  // allocations in this case.
775  // However, if the user explicitly requests use of pinned host
776  // memory, map/unmap operations are expected to work faster than
777  // read/write from/to an arbitrary host pointer. Link such commands
778  // regardless of host unified memory support.
779  bool PinnedHostMemory = MemObj->usesPinnedHostMemory();
780 
781  bool HostUnifiedMemoryOnNonHostDevice =
782  Queue->is_host() ? checkHostUnifiedMemory(Record->MCurContext)
783  : HostUnifiedMemory;
784  if (PinnedHostMemory || HostUnifiedMemoryOnNonHostDevice) {
785  AllocaCommandBase *LinkedAllocaCmdCand = findAllocaForReq(
786  Record, Req, Record->MCurContext, /*AllowConst=*/false);
787 
788  // Cannot setup link if candidate is linked already
789  if (LinkedAllocaCmdCand &&
790  !LinkedAllocaCmdCand->MLinkedAllocaCmd) {
791  LinkedAllocaCmd = LinkedAllocaCmdCand;
792  }
793  }
794  }
795  }
796 
797  AllocaCmd =
798  new AllocaCommand(Queue, FullReq, InitFromUserData, LinkedAllocaCmd);
799 
800  // Update linked command
801  if (LinkedAllocaCmd) {
802  Command *ConnCmd = AllocaCmd->addDep(
803  DepDesc{LinkedAllocaCmd, AllocaCmd->getRequirement(),
804  LinkedAllocaCmd},
805  ToCleanUp);
806  if (ConnCmd)
807  ToEnqueue.push_back(ConnCmd);
808  LinkedAllocaCmd->MLinkedAllocaCmd = AllocaCmd;
809 
810  // To ensure that the leader allocation is removed first
811  ConnCmd = AllocaCmd->getReleaseCmd()->addDep(
812  DepDesc(LinkedAllocaCmd->getReleaseCmd(),
813  AllocaCmd->getRequirement(), LinkedAllocaCmd),
814  ToCleanUp);
815  if (ConnCmd)
816  ToEnqueue.push_back(ConnCmd);
817 
818  // Device allocation takes ownership of the host ptr during
819  // construction, host allocation doesn't. So, device allocation should
820  // always be active here. Also if the "follower" command is a device one
821  // we have to change current context to the device one.
822  if (Queue->is_host()) {
823  AllocaCmd->MIsActive = false;
824  } else {
825  LinkedAllocaCmd->MIsActive = false;
826  Record->MCurContext = Queue->getContextImplPtr();
827 
828  std::set<Command *> Deps =
829  findDepsForReq(Record, Req, Queue->getContextImplPtr());
830  for (Command *Dep : Deps) {
831  Command *ConnCmd = AllocaCmd->addDep(
832  DepDesc{Dep, Req, LinkedAllocaCmd}, ToCleanUp);
833  if (ConnCmd)
834  ToEnqueue.push_back(ConnCmd);
835  }
836  updateLeaves(Deps, Record, Req->MAccessMode, ToCleanUp);
837  addNodeToLeaves(Record, AllocaCmd, Req->MAccessMode, ToEnqueue);
838  }
839  }
840  }
841 
842  Record->MAllocaCommands.push_back(AllocaCmd);
843  Record->MWriteLeaves.push_back(AllocaCmd, ToEnqueue);
844  ++(AllocaCmd->MLeafCounter);
845  for (Command *Cmd : ToCleanUp)
846  cleanupCommand(Cmd);
847  }
848  return AllocaCmd;
849 }
850 
851 // The function sets MemModified flag in record if requirement has write access.
852 void Scheduler::GraphBuilder::markModifiedIfWrite(MemObjRecord *Record,
853  Requirement *Req) {
854  switch (Req->MAccessMode) {
855  case access::mode::write:
856  case access::mode::read_write:
857  case access::mode::discard_write:
858  case access::mode::discard_read_write:
859  case access::mode::atomic:
860  Record->MMemModified = true;
861  break;
862  case access::mode::read:
863  break;
864  }
865 }
866 
867 EmptyCommand *Scheduler::GraphBuilder::addEmptyCmd(
868  Command *Cmd, const std::vector<Requirement *> &Reqs,
869  const QueueImplPtr &Queue, Command::BlockReason Reason,
870  std::vector<Command *> &ToEnqueue, const bool AddDepsToLeaves) {
871  EmptyCommand *EmptyCmd =
872  new EmptyCommand(Scheduler::getInstance().getDefaultHostQueue());
873 
874  if (!EmptyCmd)
875  throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
876 
877  EmptyCmd->MIsBlockable = true;
878  EmptyCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueBlocked;
879  EmptyCmd->MBlockReason = Reason;
880 
881  for (Requirement *Req : Reqs) {
882  MemObjRecord *Record = getOrInsertMemObjRecord(Queue, Req, ToEnqueue);
883  AllocaCommandBase *AllocaCmd =
884  getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue);
885  EmptyCmd->addRequirement(Cmd, AllocaCmd, Req);
886  }
887  // addRequirement above calls addDep, which already adds EmptyCmd as a user
888  // of Cmd. Reqs may be empty, in which case the user has to be added here.
889  if (!Reqs.size())
890  Cmd->addUser(EmptyCmd);
891 
892  if (AddDepsToLeaves) {
893  const std::vector<DepDesc> &Deps = Cmd->MDeps;
894  std::vector<Command *> ToCleanUp;
895  for (const DepDesc &Dep : Deps) {
896  const Requirement *Req = Dep.MDepRequirement;
897  MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj);
898 
899  updateLeaves({Cmd}, Record, Req->MAccessMode, ToCleanUp);
900  addNodeToLeaves(Record, EmptyCmd, Req->MAccessMode, ToEnqueue);
901  }
902  for (Command *Cmd : ToCleanUp)
903  cleanupCommand(Cmd);
904  }
905 
906  return EmptyCmd;
907 }
908 
909 static bool isInteropHostTask(ExecCGCommand *Cmd) {
910  if (Cmd->getCG().getType() != CG::CGTYPE::CodeplayHostTask)
911  return false;
912 
913  const detail::CGHostTask &HT =
914  static_cast<detail::CGHostTask &>(Cmd->getCG());
915 
916  return HT.MHostTask->isInteropTask();
917 }
918 
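// If several requirements in a command group refer to the same memory object,
// combine their access modes so that each object is handled with a single mode
// covering all of its uses.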
919 static void combineAccessModesOfReqs(std::vector<Requirement *> &Reqs) {
920  std::unordered_map<SYCLMemObjI *, access::mode> CombinedModes;
921  bool HasDuplicateMemObjects = false;
922  for (const Requirement *Req : Reqs) {
923  auto Result = CombinedModes.insert(
924  std::make_pair(Req->MSYCLMemObj, Req->MAccessMode));
925  if (!Result.second) {
926  Result.first->second =
927  combineAccessModes(Result.first->second, Req->MAccessMode);
928  HasDuplicateMemObjects = true;
929  }
930  }
931 
932  if (!HasDuplicateMemObjects)
933  return;
934  for (Requirement *Req : Reqs) {
935  Req->MAccessMode = CombinedModes[Req->MSYCLMemObj];
936  }
937 }
938 
939 Scheduler::GraphBuildResult Scheduler::GraphBuilder::addCG(
940  std::unique_ptr<detail::CG> CommandGroup, const QueueImplPtr &Queue,
941  std::vector<Command *> &ToEnqueue,
942  sycl::detail::pi::PiExtCommandBuffer CommandBuffer,
943  const std::vector<sycl::detail::pi::PiExtSyncPoint> &Dependencies) {
944  std::vector<Requirement *> &Reqs = CommandGroup->getRequirements();
945  std::vector<detail::EventImplPtr> &Events = CommandGroup->getEvents();
946 
947  auto NewCmd = std::make_unique<ExecCGCommand>(
948  std::move(CommandGroup), Queue, CommandBuffer, std::move(Dependencies));
949 
950  if (!NewCmd)
951  throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
952 
953  // Only device kernel command groups can participate in fusion. Otherwise,
954  // command groups take the regular route. If they create any requirement or
955  // event dependency on any of the kernels in the fusion list, this will lead
956  // to cancellation of the fusion in the GraphProcessor.
957  auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
958  if (isInFusionMode(QUniqueID)) {
959  if (NewCmd->isFusable()) {
960  auto *FusionCmd = findFusionList(QUniqueID)->second.get();
961 
962  bool dependsOnFusion = false;
963  for (auto Ev = Events.begin(); Ev != Events.end();) {
964  auto *EvDepCmd = static_cast<Command *>((*Ev)->getCommand());
965  if (!EvDepCmd) {
966  ++Ev;
967  continue;
968  }
969  // Event dependencies on commands part of another active fusion are
970  // handled by cancelling fusion in that other queue.
971  if (EvDepCmd->getQueue() != Queue && isPartOfActiveFusion(EvDepCmd)) {
972  printFusionWarning(
973  "Aborting fusion because of event dependency from a "
974  "different fusion");
975  cancelFusion(EvDepCmd->getQueue(), ToEnqueue);
976  }
977  // Check if this command depends on the placeholder command for the
978  // fusion itself participates in.
979  if (EvDepCmd == FusionCmd) {
980  Ev = Events.erase(Ev);
981  dependsOnFusion = true;
982  } else {
983  ++Ev;
984  }
985  }
986 
987  // If this command has an explicit event dependency on the placeholder
988  // command for this fusion (because it used depends_on on the event
989  // returned by submitting another kernel to this fusion earlier), add a
990  // dependency on all the commands in the fusion list so far.
991  if (dependsOnFusion) {
992  for (auto *Cmd : FusionCmd->getFusionList()) {
993  Events.push_back(Cmd->getEvent());
994  }
995  }
996 
997  // Add the kernel to the graph, but delay the enqueue of any auxiliary
998  // commands (e.g., allocations) resulting from that process by adding them
999  // to the list of auxiliary commands of the fusion command.
1000  createGraphForCommand(NewCmd.get(), NewCmd->getCG(),
1001  isInteropHostTask(NewCmd.get()), Reqs, Events,
1002  Queue, FusionCmd->auxiliaryCommands());
1003 
1004  // Set the fusion command, so we recognize when another command depends on
1005  // a kernel in the fusion list.
1006  FusionCmd->addToFusionList(NewCmd.get());
1007  NewCmd->MFusionCmd = FusionCmd;
1008  std::vector<Command *> ToCleanUp;
1009  // Add an event dependency from the fusion placeholder command to the new
1010  // kernel.
1011  auto ConnectionCmd = FusionCmd->addDep(NewCmd->getEvent(), ToCleanUp);
1012  if (ConnectionCmd) {
1013  FusionCmd->auxiliaryCommands().push_back(ConnectionCmd);
1014  }
1015  return {NewCmd.release(), FusionCmd->getEvent(), false};
1016  } else {
1017  std::string s;
1018  std::stringstream ss(s);
1019  if (NewCmd->getCG().getType() == CG::CGTYPE::Kernel) {
1020  ss << "Not fusing kernel with 'use_root_sync' property. Can only fuse "
1021  "non-cooperative device kernels.";
1022  } else {
1023  ss << "Not fusing '" << NewCmd->getTypeString()
1024  << "' command group. Can only fuse device kernel command groups.";
1025  }
1026  printFusionWarning(ss.str());
1027  }
1028  }
1029  createGraphForCommand(NewCmd.get(), NewCmd->getCG(),
1030  isInteropHostTask(NewCmd.get()), Reqs, Events, Queue,
1031  ToEnqueue);
1032  auto Event = NewCmd->getEvent();
1033  return {NewCmd.release(), Event, true};
1034 }
1035 
1036 void Scheduler::GraphBuilder::createGraphForCommand(
1037  Command *NewCmd, CG &CG, bool isInteropTask,
1038  std::vector<Requirement *> &Reqs,
1039  const std::vector<detail::EventImplPtr> &Events, QueueImplPtr Queue,
1040  std::vector<Command *> &ToEnqueue) {
1041 
1042  if (MPrintOptionsArray[BeforeAddCG])
1043  printGraphAsDot("before_addCG");
1044 
1045  // If there are multiple requirements for the same memory object, its
1046  // AllocaCommand creation will be dependent on the access mode of the first
1047  // requirement. Combine these access modes to take all of them into account.
1048  combineAccessModesOfReqs(Reqs);
1049  std::vector<Command *> ToCleanUp;
1050  for (Requirement *Req : Reqs) {
1051  MemObjRecord *Record = nullptr;
1052  AllocaCommandBase *AllocaCmd = nullptr;
1053 
1054  bool isSameCtx = false;
1055 
1056  {
1057  const QueueImplPtr &QueueForAlloca =
1058  isInteropTask ? static_cast<detail::CGHostTask &>(CG).MQueue : Queue;
1059 
1060  Record = getOrInsertMemObjRecord(QueueForAlloca, Req, ToEnqueue);
1061  markModifiedIfWrite(Record, Req);
1062 
1063  AllocaCmd =
1064  getOrCreateAllocaForReq(Record, Req, QueueForAlloca, ToEnqueue);
1065 
1066  isSameCtx =
1067  sameCtx(QueueForAlloca->getContextImplPtr(), Record->MCurContext);
1068  }
1069 
1070  // If there is alloca command we need to check if the latest memory is in
1071  // required context.
1072  if (isSameCtx) {
1073  // If the memory is already in the required host context, check if the
1074  // required access mode is valid, remap if not.
1075  if (Record->MCurContext->is_host() &&
1076  !isAccessModeAllowed(Req->MAccessMode, Record->MHostAccess)) {
1077  remapMemoryObject(Record, Req,
1078  Req->MIsSubBuffer
1079  ? (static_cast<AllocaSubBufCommand *>(AllocaCmd))
1080  ->getParentAlloca()
1081  : AllocaCmd,
1082  ToEnqueue);
1083  }
1084  } else {
1085  // Cannot directly copy memory from OpenCL device to OpenCL device -
1086  // create two copies: device->host and host->device.
1087  bool NeedMemMoveToHost = false;
1088  auto MemMoveTargetQueue = Queue;
1089 
1090  if (isInteropTask) {
1091  const detail::CGHostTask &HT = static_cast<detail::CGHostTask &>(CG);
1092 
1093  if (HT.MQueue->getContextImplPtr() != Record->MCurContext) {
1094  NeedMemMoveToHost = true;
1095  MemMoveTargetQueue = HT.MQueue;
1096  }
1097  } else if (!Queue->is_host() && !Record->MCurContext->is_host())
1098  NeedMemMoveToHost = true;
1099 
1100  if (NeedMemMoveToHost)
1101  insertMemoryMove(Record, Req,
1102  Scheduler::getInstance().getDefaultHostQueue(),
1103  ToEnqueue);
1104  insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue);
1105  }
1106  std::set<Command *> Deps =
1107  findDepsForReq(Record, Req, Queue->getContextImplPtr());
1108 
1109  for (Command *Dep : Deps) {
1110  if (Dep != NewCmd) {
1111  Command *ConnCmd =
1112  NewCmd->addDep(DepDesc{Dep, Req, AllocaCmd}, ToCleanUp);
1113  if (ConnCmd)
1114  ToEnqueue.push_back(ConnCmd);
1115  }
1116  }
1117  }
1118 
1119  // Set new command as user for dependencies and update leaves.
1120  // Node dependencies can be modified further when adding the node to leaves,
1121  // iterate over their copy.
1122  // FIXME employ a reference here to eliminate copying of a vector
1123  std::vector<DepDesc> Deps = NewCmd->MDeps;
1124  for (DepDesc &Dep : Deps) {
1125  const Requirement *Req = Dep.MDepRequirement;
1126  MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj);
1127  updateLeaves({Dep.MDepCommand}, Record, Req->MAccessMode, ToCleanUp);
1128  addNodeToLeaves(Record, NewCmd, Req->MAccessMode, ToEnqueue);
1129  }
1130 
1131  // Register all the events as dependencies
1132  for (detail::EventImplPtr e : Events) {
1133  if (e->getCommand() && e->getCommand() == NewCmd) {
1134  continue;
1135  }
1136  if (Command *ConnCmd = NewCmd->addDep(e, ToCleanUp))
1137  ToEnqueue.push_back(ConnCmd);
1138  }
1139 
1140  if (MPrintOptionsArray[AfterAddCG])
1141  printGraphAsDot("after_addCG");
1142 
1143  for (Command *Cmd : ToCleanUp) {
1144  cleanupCommand(Cmd);
1145  }
1146 }
1147 
1148 void Scheduler::GraphBuilder::decrementLeafCountersForRecord(
1149  MemObjRecord *Record) {
1150  for (Command *Cmd : Record->MReadLeaves) {
1151  --(Cmd->MLeafCounter);
1152  if (Cmd->readyForCleanup())
1153  cleanupCommand(Cmd);
1154  }
1155  for (Command *Cmd : Record->MWriteLeaves) {
1156  --(Cmd->MLeafCounter);
1157  if (Cmd->readyForCleanup())
1158  cleanupCommand(Cmd);
1159  }
1160 }
1161 
1161 
1162 void Scheduler::GraphBuilder::cleanupCommandsForRecord(MemObjRecord *Record) {
1163  std::vector<AllocaCommandBase *> &AllocaCommands = Record->MAllocaCommands;
1164  if (AllocaCommands.empty())
1165  return;
1166 
1167  assert(MCmdsToVisit.empty());
1168  MVisitedCmds.clear();
1169 
1170  // First, mark all allocas for deletion and their direct users for traversal
1171  // Dependencies of the users will be cleaned up during the traversal
1172  for (Command *AllocaCmd : AllocaCommands) {
1173  markNodeAsVisited(AllocaCmd, MVisitedCmds);
1174 
1175  for (Command *UserCmd : AllocaCmd->MUsers)
1176  // Linked alloca cmd may be in users of this alloca. We're not going to
1177  // visit it.
1178  if (UserCmd->getType() != Command::CommandType::ALLOCA)
1179  MCmdsToVisit.push(UserCmd);
1180  else
1181  markNodeAsVisited(UserCmd, MVisitedCmds);
1182 
1183  AllocaCmd->MMarks.MToBeDeleted = true;
1184  // These commands will be deleted later, clear users now to avoid
1185  // updating them during edge removal
1186  AllocaCmd->MUsers.clear();
1187  }
1188 
1189  // Make sure the Linked Allocas are marked visited by the previous walk.
1190  // Remove allocation commands from the users of their dependencies.
1191  for (AllocaCommandBase *AllocaCmd : AllocaCommands) {
1192  AllocaCommandBase *LinkedCmd = AllocaCmd->MLinkedAllocaCmd;
1193 
1194  if (LinkedCmd) {
1195  assert(LinkedCmd->MMarks.MVisited);
1196  }
1197 
1198  for (DepDesc &Dep : AllocaCmd->MDeps)
1199  if (Dep.MDepCommand)
1200  Dep.MDepCommand->MUsers.erase(AllocaCmd);
1201  }
1202 
1203  // Traverse the graph using BFS
1204  while (!MCmdsToVisit.empty()) {
1205  Command *Cmd = MCmdsToVisit.front();
1206  MCmdsToVisit.pop();
1207 
1208  if (!markNodeAsVisited(Cmd, MVisitedCmds))
1209  continue;
1210 
1211  for (Command *UserCmd : Cmd->MUsers)
1212  if (UserCmd->getType() != Command::CommandType::ALLOCA)
1213  MCmdsToVisit.push(UserCmd);
1214 
1215  // Delete all dependencies on any allocations being removed
1216  // Track which commands should have their users updated
1217  std::map<Command *, bool> ShouldBeUpdated;
1218  auto NewEnd = std::remove_if(
1219  Cmd->MDeps.begin(), Cmd->MDeps.end(), [&](const DepDesc &Dep) {
1220  if (std::find(AllocaCommands.begin(), AllocaCommands.end(),
1221  Dep.MAllocaCmd) != AllocaCommands.end()) {
1222  ShouldBeUpdated.insert({Dep.MDepCommand, true});
1223  return true;
1224  }
1225  ShouldBeUpdated[Dep.MDepCommand] = false;
1226  return false;
1227  });
1228  Cmd->MDeps.erase(NewEnd, Cmd->MDeps.end());
1229 
1230  // Update users of removed dependencies
1231  for (auto DepCmdIt : ShouldBeUpdated) {
1232  if (!DepCmdIt.second)
1233  continue;
1234  DepCmdIt.first->MUsers.erase(Cmd);
1235  }
1236 
1237  // If all dependencies have been removed this way, mark the command for
1238  // deletion
1239  if (Cmd->MDeps.empty()) {
1240  Cmd->MUsers.clear();
1241  // Do not delete the node if it's scheduled for post-enqueue cleanup to
1242  // avoid double free.
1243  if (!Cmd->MMarkedForCleanup)
1244  Cmd->MMarks.MToBeDeleted = true;
1245  }
1246  }
1247 
1248  handleVisitedNodes(MVisitedCmds);
1249 }
1250 
1251 void Scheduler::GraphBuilder::cleanupCommand(
1252  Command *Cmd, [[maybe_unused]] bool AllowUnsubmitted) {
1253  if (SYCLConfig<SYCL_DISABLE_POST_ENQUEUE_CLEANUP>::get()) {
1254  static bool DeprWarningPrinted = false;
1255  if (!DeprWarningPrinted) {
1256  std::cerr << "WARNING: The environment variable "
1257  "SYCL_DISABLE_POST_ENQUEUE_CLEANUP is deprecated. Please "
1258  "use SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP instead.\n";
1259  DeprWarningPrinted = true;
1260  }
1261  return;
1262  }
1263  if (Cmd->MMarks.MToBeDeleted || Cmd->MMarks.MVisited)
1264  return;
1265 
1266  assert(Cmd->MLeafCounter == 0 &&
1267  (Cmd->isSuccessfullyEnqueued() || AllowUnsubmitted));
1268  Command::CommandType CmdT = Cmd->getType();
1269 
1270  assert(CmdT != Command::ALLOCA && CmdT != Command::ALLOCA_SUB_BUF);
1271  assert(CmdT != Command::RELEASE);
1272  (void)CmdT;
1273 
1274  for (Command *UserCmd : Cmd->MUsers) {
1275  for (DepDesc &Dep : UserCmd->MDeps) {
1276  // Link the users of the command to the alloca command(s) instead
1277  if (Dep.MDepCommand == Cmd) {
1278  // ... unless the user is the alloca itself.
1279  if (Dep.MAllocaCmd == UserCmd) {
1280  Dep.MDepCommand = nullptr;
1281  } else {
1282  Dep.MDepCommand = Dep.MAllocaCmd;
1283  Dep.MDepCommand->MUsers.insert(UserCmd);
1284  }
1285  }
1286  }
1287  }
1288  // Update dependency users
1289  for (DepDesc &Dep : Cmd->MDeps) {
1290  Command *DepCmd = Dep.MDepCommand;
1291  DepCmd->MUsers.erase(Cmd);
1292  }
1293 
1294  if (Cmd->getType() == Command::FUSION &&
1295  !static_cast<KernelFusionCommand *>(Cmd)->readyForDeletion()) {
1296  // Fusion commands might still be needed because fusion might be aborted,
1297  // but a later call to complete_fusion still needs to be able to return a
1298  // valid event. Clean-up of fusion commands is therefore explicitly handled
1299  // by start fusion.
1300  return;
1301  }
1302  Cmd->getEvent()->setCommand(nullptr);
1303  delete Cmd;
1304 }
1305 
1306 void Scheduler::GraphBuilder::removeRecordForMemObj(SYCLMemObjI *MemObject) {
1307  const auto It = std::find_if(
1308  MMemObjs.begin(), MMemObjs.end(),
1309  [MemObject](const SYCLMemObjI *Obj) { return Obj == MemObject; });
1310  if (It != MMemObjs.end())
1311  MMemObjs.erase(It);
1312  MemObject->MRecord.reset();
1313 }
1314 
1315 // Make Cmd depend on DepEvent from different context. Connection is performed
1316 // via distinct ConnectCmd with host task command group on host queue. Cmd will
1317 // depend on ConnectCmd's host event.
1318 // DepEvent may not have a command associated with it in at least two cases:
1319 // - the command was deleted upon cleanup process;
1320 // - DepEvent is user event.
1321 // In both of these cases the only thing we can do is to make ConnectCmd depend
1322 // on DepEvent.
1323 // Otherwise, when there is a command associated with DepEvent, we make
1324 // ConnectCmd depend on this command. If there is a valid, i.e. non-null,
1325 // requirement in Dep we make ConnectCmd depend on DepEvent's command with this
1326 // requirement.
1327 // Optionality of Dep is set by Dep.MDepCommand equal to nullptr.
1328 Command *Scheduler::GraphBuilder::connectDepEvent(
1329  Command *const Cmd, const EventImplPtr &DepEvent, const DepDesc &Dep,
1330  std::vector<Command *> &ToCleanUp) {
1331  assert(Cmd->getWorkerContext() != DepEvent->getContextImpl());
1332 
1333  // construct Host Task type command manually and make it depend on DepEvent
1334  ExecCGCommand *ConnectCmd = nullptr;
1335 
1336  try {
1337  std::unique_ptr<detail::HostTask> HT(new detail::HostTask);
1338  std::unique_ptr<detail::CG> ConnectCG(new detail::CGHostTask(
1339  std::move(HT), /* Queue = */ {}, /* Context = */ {}, /* Args = */ {},
1340  detail::CG::StorageInitHelper(
1341  /* ArgsStorage = */ {}, /* AccStorage = */ {},
1342  /* SharedPtrStorage = */ {}, /* Requirements = */ {},
1343  /* DepEvents = */ {DepEvent}),
1344  CG::CGTYPE::CodeplayHostTask,
1345  /* Payload */ {}));
1346  ConnectCmd = new ExecCGCommand(
1347  std::move(ConnectCG), Scheduler::getInstance().getDefaultHostQueue());
1348  } catch (const std::bad_alloc &) {
1349  throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
1350  }
1351 
1352  if (Dep.MDepRequirement) {
1353  // make ConnectCmd depend on requirement
1354  // Dismiss the result here as it's not a connection now,
1355  // 'cause ConnectCmd is host one
1356  (void)ConnectCmd->addDep(Dep, ToCleanUp);
1357  assert(reinterpret_cast<Command *>(DepEvent->getCommand()) ==
1358  Dep.MDepCommand);
1359  // add user to Dep.MDepCommand is already performed beyond this if branch
1360  {
1361  DepDesc DepOnConnect = Dep;
1362  DepOnConnect.MDepCommand = ConnectCmd;
1363 
1364  // Dismiss the result here as it's not a connection now,
1365  // 'cause ConnectCmd is host one
1366  std::ignore = Cmd->addDep(DepOnConnect, ToCleanUp);
1367  }
1368  } else {
1369  // In the requirement-based branch above, addDep registers the user itself;
1370  // here it has to be done explicitly.
1371  if (Command *DepCmd = reinterpret_cast<Command *>(DepEvent->getCommand()))
1372  DepCmd->addUser(ConnectCmd);
1373 
1374  std::ignore = ConnectCmd->addDep(DepEvent, ToCleanUp);
1375 
1376  std::ignore = Cmd->addDep(ConnectCmd->getEvent(), ToCleanUp);
1377 
1378  ConnectCmd->addUser(Cmd);
1379  }
1380 
1381  return ConnectCmd;
1382 }
1383 
1384 void Scheduler::GraphBuilder::startFusion(QueueImplPtr Queue) {
1385  cleanUpCmdFusion(Queue.get());
1386  auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
1387  MFusionMap.emplace(QUniqueID, std::make_unique<KernelFusionCommand>(Queue));
1388 }
1389 
1390 void Scheduler::GraphBuilder::cleanUpCmdFusion(
1391  sycl::detail::queue_impl *Queue) {
1392  auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue);
1393  if (isInFusionMode(QUniqueID)) {
1394  throw sycl::exception{sycl::make_error_code(sycl::errc::invalid),
1395  "Queue already in fusion mode"};
1396  }
1397  auto OldFusionCmd = findFusionList(QUniqueID);
1398  if (OldFusionCmd != MFusionMap.end()) {
1399  // If fusion was used on this queue previously, the old fusion command might
1400  // still be around to make sure that even after
1401  // cancellation of the fusion due to synchronization, complete_fusion is
1402  // still able to return a valid event.
1403  OldFusionCmd->second->setFusionStatus(
1404  KernelFusionCommand::FusionStatus::DELETED);
1405  cleanupCommand(OldFusionCmd->second.release());
1406  MFusionMap.erase(OldFusionCmd);
1407  }
1408 }
1409 
1410 void Scheduler::GraphBuilder::removeNodeFromGraph(
1411  Command *Node, std::vector<Command *> &ToEnqueue) {
1412  // Remove the placeholder command as leaf of all its requirements and from the
1413  // user list of all its dependencies.
1414  for (auto &Dep : Node->MDeps) {
1415  auto AccessMode = Dep.MDepRequirement->MAccessMode;
1416  auto *Record = getMemObjRecord(Dep.MDepRequirement->MSYCLMemObj);
1417 
1418  Node->MLeafCounter -= Record->MReadLeaves.remove(Node);
1419  Node->MLeafCounter -= Record->MWriteLeaves.remove(Node);
1420  // If the placeholder had a write-requirement on this record, we need to
1421  // restore the previous leaves.
1422  if (AccessMode != access::mode::read) {
1423  for (auto PrevDep : Dep.MDepCommand->MDeps) {
1424  auto *DepReq = PrevDep.MDepRequirement;
1425  auto *DepRecord = getMemObjRecord(DepReq->MSYCLMemObj);
1426  if (DepRecord == Record) {
1427  // Need to restore this as a leaf, because we pushed it from the
1428  // leaves when adding the placeholder command.
1429  assert(Dep.MDepCommand);
1430  addNodeToLeaves(Record, Dep.MDepCommand, DepReq->MAccessMode,
1431  ToEnqueue);
1432  }
1433  }
1434  }
1435  Dep.MDepCommand->MUsers.erase(Node);
1436  }
1437 
1438  // Clear all the dependencies to avoid cleanDepEventsThroughOneLevel, called
1439  // from the destructor of the command to delete the dependencies of the
1440  // command this command depends on.
1441  Node->clearAllDependencies();
1442 }
1443 
1444 void Scheduler::GraphBuilder::cancelFusion(QueueImplPtr Queue,
1445  std::vector<Command *> &ToEnqueue) {
1446  auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
1447  if (!isInFusionMode(QUniqueID)) {
1448  return;
1449  }
1450  auto FusionList = findFusionList(QUniqueID);
1451 
1452  auto *PlaceholderCmd = (*FusionList).second.get();
1453 
1454  // Enqueue all the kernels/commands from the fusion list
1455  auto FusedCmdList = PlaceholderCmd->getFusionList();
1456  ToEnqueue.insert(ToEnqueue.end(), FusedCmdList.begin(), FusedCmdList.end());
1457 
1458  // The commands establishing an event dependency between the fusion
1459  // placeholder command and the individual kernels need to be enqueued.
1460  ToEnqueue.insert(ToEnqueue.end(), PlaceholderCmd->auxiliaryCommands().begin(),
1461  PlaceholderCmd->auxiliaryCommands().end());
1462 
1463  ToEnqueue.push_back(PlaceholderCmd);
1464 
1465  if (MPrintOptionsArray[AfterFusionCancel]) {
1466  printGraphAsDot("after_fusionCancel");
1467  }
1468 
1469  // Set the status for the fusion command
1470  PlaceholderCmd->setFusionStatus(KernelFusionCommand::FusionStatus::CANCELLED);
1471 }
1472 
1473 static bool isPartOfFusion(Command *Cmd, KernelFusionCommand *Fusion) {
1474  if (Cmd->getType() == Command::RUN_CG) {
1475  return static_cast<ExecCGCommand *>(Cmd)->MFusionCmd == Fusion;
1476  }
1477  return false;
1478 }
1479 
1480 static bool checkForCircularDependency(Command *, bool, KernelFusionCommand *);
1481 
1482 static bool createsCircularDependency(Command *Cmd, bool PredPartOfFusion,
1483  KernelFusionCommand *Fusion) {
1484  if (isPartOfFusion(Cmd, Fusion)) {
1485  // If this is part of the fusion and the predecessor also was, we can stop
1486  // the traversal here. A direct dependency between two kernels in the same
1487  // fusion will never form a cyclic dependency and by iterating over all
1488  // commands in a fusion, we will detect any cycles originating from the
1489  // current command.
1490  // If the predecessor was not part of the fusion, but the current command
1491  // is, we have found a potential cycle in the dependency graph.
1492  return !PredPartOfFusion;
1493  }
1494  return checkForCircularDependency(Cmd, false, Fusion);
1495 }
1496 
1497 static bool checkForCircularDependency(Command *Cmd, bool IsPartOfFusion,
1498  KernelFusionCommand *Fusion) {
1499  // Check the requirement dependencies.
1500  for (auto &Dep : Cmd->MDeps) {
1501  auto *DepCmd = Dep.MDepCommand;
1502  if (!DepCmd) {
1503  continue;
1504  }
1505  if (createsCircularDependency(DepCmd, IsPartOfFusion, Fusion)) {
1506  return true;
1507  }
1508  }
1509  for (auto &Ev : Cmd->getPreparedDepsEvents()) {
1510  auto *EvDepCmd = static_cast<Command *>(Ev->getCommand());
1511  if (!EvDepCmd) {
1512  continue;
1513  }
1514  if (createsCircularDependency(EvDepCmd, IsPartOfFusion, Fusion)) {
1515  return true;
1516  }
1517  }
1518  for (auto &Ev : Cmd->getPreparedHostDepsEvents()) {
1519  auto *EvDepCmd = static_cast<Command *>(Ev->getCommand());
1520  if (!EvDepCmd) {
1521  continue;
1522  }
1523  if (createsCircularDependency(EvDepCmd, IsPartOfFusion, Fusion)) {
1524  return true;
1525  }
1526  }
1527  return false;
1528 }
1529 
1530 EventImplPtr
1531 Scheduler::GraphBuilder::completeFusion(QueueImplPtr Queue,
1532  std::vector<Command *> &ToEnqueue,
1533  const property_list &PropList) {
1534  auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
1535 #if SYCL_EXT_CODEPLAY_KERNEL_FUSION
1536  if (!isInFusionMode(QUniqueID)) {
1537  auto InactiveFusionList = findFusionList(QUniqueID);
1538  if (InactiveFusionList == MFusionMap.end()) {
1539  throw sycl::exception{
1540  sycl::make_error_code(sycl::errc::invalid),
1541  "Calling complete_fusion on a queue not in fusion mode"};
1542  }
1543  return InactiveFusionList->second->getEvent();
1544  }
1545 
1546  auto FusionList = findFusionList(QUniqueID);
1547  auto *PlaceholderCmd = FusionList->second.get();
1548  auto &CmdList = PlaceholderCmd->getFusionList();
1549 
1550  // If there is more than one queue currently in fusion mode, we need to check
1551  // whether fusing the kernels would create a circular dependency. A circular
1552  // dependency would arise if a kernel in the fusion list *indirectly* depends
1553  // on another kernel in the fusion list. Here, indirectly means that the
1554  // dependency is created through a third command not part of the fusion, on
1555  // which this kernel depends and which in turn depends on another kernel in
1556  // the fusion list.
1557  //
1558  // Note that we only have to consider dependencies via fusion queues here:
1559  // Let K1 be a kernel submitted to a queue Q1 in fusion mode. If a kernel K2
1560  // is submitted to a non-fusion queue Q2 and K2 depends on K1, fusion on Q1 is
1561  // cancelled automatically.
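  //
  // Illustration with hypothetical kernels (not taken from the code): let Q1
  // and Q2 both be in fusion mode, with K1 and K3 in Q1's fusion list and K2
  // in Q2's fusion list, where K2 depends on K1 and K3 depends on K2. Fusing
  // K1 and K3 into a single kernel would make that kernel a predecessor of K2
  // (through K1) and a successor of K2 (through K3) at the same time, i.e. a
  // circular dependency.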
1562  bool CreatesCircularDep =
1563  MFusionMap.size() > 1 &&
1564  std::any_of(CmdList.begin(), CmdList.end(), [&](ExecCGCommand *Cmd) {
1565  return checkForCircularDependency(Cmd, true, PlaceholderCmd);
1566  });
1567  if (CreatesCircularDep) {
1568  // If fusing would create a circular dependency, cancel the fusion.
1569  printFusionWarning(
1570  "Aborting fusion because it would create a circular dependency");
1571  auto LastEvent = PlaceholderCmd->getEvent();
1572  this->cancelFusion(Queue, ToEnqueue);
1573  return LastEvent;
1574  }
1575 
1576  // Call the JIT compiler to generate a new fused kernel.
1577  auto FusedCG = detail::jit_compiler::get_instance().fuseKernels(
1578  Queue, CmdList, PropList);
1579 
1580  if (!FusedCG) {
1581  // If the JIT compiler returns a nullptr, JIT compilation of the fused
1582  // kernel failed. In that case, simply cancel the fusion and run each kernel
1583  // on its own.
1584  auto LastEvent = PlaceholderCmd->getEvent();
1585  this->cancelFusion(Queue, ToEnqueue);
1586  return LastEvent;
1587  }
1588 
1589  // Inherit all event dependencies from the input commands in the fusion list.
1590  std::vector<EventImplPtr> FusedEventDeps;
1591  for (auto *Cmd : CmdList) {
1592  FusedEventDeps.insert(FusedEventDeps.end(),
1593  Cmd->getPreparedDepsEvents().begin(),
1594  Cmd->getPreparedDepsEvents().end());
1595  FusedEventDeps.insert(FusedEventDeps.end(),
1596  Cmd->getPreparedHostDepsEvents().begin(),
1597  Cmd->getPreparedHostDepsEvents().end());
1598  }
1599 
1600  // Remove internal explicit dependencies, i.e., explicit dependencies from one
1601  // kernel in the fusion list to another kernel also in the fusion list.
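  // An event dependency is internal if its command is the placeholder itself,
  // or an ExecCGCommand whose MFusionCmd is this placeholder, i.e. one of the
  // kernels that are about to be replaced by the fused kernel.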
1602  FusedEventDeps.erase(
1603  std::remove_if(FusedEventDeps.begin(), FusedEventDeps.end(),
1604  [&](EventImplPtr &E) {
1605  if (E->getCommand() == PlaceholderCmd) {
1606  return true;
1607  }
1608  if (E->getCommand() &&
1609  static_cast<Command *>(E->getCommand())->getType() ==
1610  Command::RUN_CG) {
1611  auto *RunCGCmd =
1612  static_cast<ExecCGCommand *>(E->getCommand());
1613  if (RunCGCmd->MFusionCmd == PlaceholderCmd) {
1614  return true;
1615  }
1616  }
1617  return false;
1618  }),
1619  FusedEventDeps.end());
1620 
1621  auto FusedKernelCmd =
1622  std::make_unique<ExecCGCommand>(std::move(FusedCG), Queue);
1623 
1624  // Inherit auxiliary resources from fused command groups
1625  Scheduler::getInstance().takeAuxiliaryResources(FusedKernelCmd->getEvent(),
1626  PlaceholderCmd->getEvent());
1627  assert(PlaceholderCmd->MDeps.empty());
1628  // Next, iterate backwards over all the commands in the fusion list and remove
1629  // them from the graph to restore the state from before fusion started, so we
1630  // can add the fused kernel to the graph in the next step.
1631  // Clean up the old commands after successfully fusing them.
1632  for (auto OldCmd = CmdList.rbegin(); OldCmd != CmdList.rend(); ++OldCmd) {
1633  removeNodeFromGraph(*OldCmd, ToEnqueue);
1634  cleanupCommand(*OldCmd, /* AllowUnsubmitted */ true);
1635  }
1636 
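  // Register the fused kernel in the dependency graph, using the requirements
  // of the fused command group and the event dependencies inherited above.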
1637  createGraphForCommand(FusedKernelCmd.get(), FusedKernelCmd->getCG(), false,
1638  FusedKernelCmd->getCG().getRequirements(),
1639  FusedEventDeps, Queue, ToEnqueue);
1640 
1641  ToEnqueue.push_back(FusedKernelCmd.get());
1642 
1643  std::vector<Command *> ToCleanUp;
1644  // Make the placeholder command depend on the execution of the fused kernel
1645  auto *ConnectToPlaceholder =
1646  PlaceholderCmd->addDep(FusedKernelCmd->getEvent(), ToCleanUp);
1647  if (ConnectToPlaceholder) {
1648  ToEnqueue.push_back(ConnectToPlaceholder);
1649  }
1650  for (Command *Cmd : ToCleanUp) {
1651  cleanupCommand(Cmd);
1652  }
1653  ToEnqueue.push_back(PlaceholderCmd);
1654 
1655  if (MPrintOptionsArray[AfterFusionComplete]) {
1656  printGraphAsDot("after_fusionComplete");
1657  }
1658 
1659  // Set the status for the fusion command.
1660  PlaceholderCmd->setFusionStatus(KernelFusionCommand::FusionStatus::COMPLETE);
1661 
1662  return FusedKernelCmd.release()->getEvent();
1663 #else // SYCL_EXT_CODEPLAY_KERNEL_FUSION
1664  printFusionWarning("Kernel fusion not supported by this build");
1665  (void)PropList;
1666  auto FusionList = findFusionList(QUniqueID);
1667  auto *PlaceholderCmd = FusionList->second.get();
1668  auto LastEvent = PlaceholderCmd->getEvent();
1669  this->cancelFusion(Queue, ToEnqueue);
1670  return LastEvent;
1671 #endif // SYCL_EXT_CODEPLAY_KERNEL_FUSION
1672 }
1673 
1674 bool Scheduler::GraphBuilder::isInFusionMode(QueueIdT Id) {
1675  auto FusionList = findFusionList(Id);
1676  if (FusionList == MFusionMap.end()) {
1677  return false;
1678  }
1679  return FusionList->second->isActive();
1680 }
1681 
1682 } // namespace detail
1683 } // namespace _V1
1684 } // namespace sycl