DPC++ Runtime
Runtime libraries for oneAPI DPC++
graph_builder.cpp
//===-- graph_builder.cpp - SYCL Graph Builder ------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "detail/config.hpp"
#include <detail/context_impl.hpp>
#include <detail/event_impl.hpp>
#include <sycl/feature_test.hpp>
#if SYCL_EXT_CODEPLAY_KERNEL_FUSION
#include <detail/jit_compiler.hpp>
#endif
#include <detail/memory_manager.hpp>
#include <detail/queue_impl.hpp>
#include <detail/scheduler/scheduler.hpp>
#include <sycl/access/access.hpp>
#include <sycl/exception.hpp>

#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <map>
#include <memory>
#include <queue>
#include <set>
#include <unordered_map>
#include <vector>

namespace sycl {
inline namespace _V1 {
namespace detail {

/// Checks whether two requirements overlap or not.
///
/// This information can be used to prove that executing two kernels that
/// work on different parts of the memory object in parallel is legal.
// TODO merge with LeavesCollection's version of doOverlap (see
// leaves_collection.cpp).
static bool doOverlap(const Requirement *LHS, const Requirement *RHS) {
  return (LHS->MOffsetInBytes + LHS->MAccessRange.size() * LHS->MElemSize >=
          RHS->MOffsetInBytes) ||
         (RHS->MOffsetInBytes + RHS->MAccessRange.size() * RHS->MElemSize >=
          LHS->MOffsetInBytes);
}

static bool sameCtx(const ContextImplPtr &LHS, const ContextImplPtr &RHS) {
  // Consider two different host contexts to be the same to avoid additional
  // allocation on the host
  return LHS == RHS || (LHS->is_host() && RHS->is_host());
}

/// Checks if the current requirement is a requirement for a sub buffer.
static bool IsSuitableSubReq(const Requirement *Req) {
  return Req->MIsSubBuffer;
}

/// Checks if the required access mode is allowed under the current one.
static bool isAccessModeAllowed(access::mode Required, access::mode Current) {
  switch (Current) {
  case access::mode::read:
    return (Required == Current);
  case access::mode::write:
    assert(false && "Write only access is expected to be mapped as read_write");
    return (Required == Current || Required == access::mode::discard_write);
  case access::mode::read_write:
  case access::mode::atomic:
  case access::mode::discard_write:
  case access::mode::discard_read_write:
    return true;
  }
  assert(false);
  return false;
}

/// Combines two access modes into a single one that allows both.
static access::mode combineAccessModes(access::mode A, access::mode B) {
  if (A == B)
    return A;

  if (A == access::mode::discard_write &&
      (B == access::mode::discard_read_write || B == access::mode::write))
    return B;

  if (B == access::mode::discard_write &&
      (A == access::mode::discard_read_write || A == access::mode::write))
    return A;

  return access::mode::read_write;
}

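/// Parses SYCL_PRINT_EXECUTION_GRAPH to decide at which points the execution
/// graph is dumped to a .dot file. Matching is substring-based, so several
/// options can be combined in a single value, and "always" enables every dump
/// point. A sketch of an assumed invocation:
///   SYCL_PRINT_EXECUTION_GRAPH=before_addCG,after_addCopyBack ./my_sycl_app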
Scheduler::GraphBuilder::GraphBuilder() {
  if (const char *EnvVarCStr = SYCLConfig<SYCL_PRINT_EXECUTION_GRAPH>::get()) {
    std::string GraphPrintOpts(EnvVarCStr);
    bool EnableAlways = GraphPrintOpts.find("always") != std::string::npos;

    if (GraphPrintOpts.find("before_addCG") != std::string::npos ||
        EnableAlways)
      MPrintOptionsArray[BeforeAddCG] = true;
    if (GraphPrintOpts.find("after_addCG") != std::string::npos || EnableAlways)
      MPrintOptionsArray[AfterAddCG] = true;
    if (GraphPrintOpts.find("before_addCopyBack") != std::string::npos ||
        EnableAlways)
      MPrintOptionsArray[BeforeAddCopyBack] = true;
    if (GraphPrintOpts.find("after_addCopyBack") != std::string::npos ||
        EnableAlways)
      MPrintOptionsArray[AfterAddCopyBack] = true;
    if (GraphPrintOpts.find("before_addHostAcc") != std::string::npos ||
        EnableAlways)
      MPrintOptionsArray[BeforeAddHostAcc] = true;
    if (GraphPrintOpts.find("after_addHostAcc") != std::string::npos ||
        EnableAlways)
      MPrintOptionsArray[AfterAddHostAcc] = true;
    if (GraphPrintOpts.find("after_fusionComplete") != std::string::npos ||
        EnableAlways)
      MPrintOptionsArray[AfterFusionComplete] = true;
    if (GraphPrintOpts.find("after_fusionCancel") != std::string::npos ||
        EnableAlways)
      MPrintOptionsArray[AfterFusionCancel] = true;
  }
}

static bool markNodeAsVisited(Command *Cmd, std::vector<Command *> &Visited) {
  assert(Cmd && "Cmd can't be nullptr");
  if (Cmd->MMarks.MVisited)
    return false;
  Cmd->MMarks.MVisited = true;
  Visited.push_back(Cmd);
  return true;
}

static void unmarkVisitedNodes(std::vector<Command *> &Visited) {
  for (Command *Cmd : Visited)
    Cmd->MMarks.MVisited = false;
}

static void handleVisitedNodes(std::vector<Command *> &Visited) {
  for (Command *Cmd : Visited) {
    if (Cmd->MMarks.MToBeDeleted) {
      if (Cmd->getType() == Command::FUSION &&
          !static_cast<KernelFusionCommand *>(Cmd)->readyForDeletion()) {
        // Fusion commands might still be needed because fusion might be
        // aborted, but a later call to complete_fusion still needs to be able
        // to return a valid event. Clean-up of fusion commands is therefore
        // explicitly handled by start fusion.
        return;
      }
      Cmd->getEvent()->setCommand(nullptr);
      delete Cmd;
    } else
      Cmd->MMarks.MVisited = false;
  }
}

static void printDotRecursive(std::fstream &Stream,
                              std::vector<Command *> &Visited, Command *Cmd) {
  if (!markNodeAsVisited(Cmd, Visited))
    return;
  for (Command *User : Cmd->MUsers) {
    if (User)
      printDotRecursive(Stream, Visited, User);
  }
  Cmd->printDot(Stream);
}

void Scheduler::GraphBuilder::printGraphAsDot(const char *ModeName) {
  static size_t Counter = 0;
  std::string ModeNameStr(ModeName);
  std::string FileName =
      "graph_" + std::to_string(Counter) + ModeNameStr + ".dot";

  Counter++;

  std::fstream Stream(FileName, std::ios::out);
  Stream << "strict digraph {" << std::endl;

  MVisitedCmds.clear();

  for (SYCLMemObjI *MemObject : MMemObjs)
    for (Command *AllocaCmd : MemObject->MRecord->MAllocaCommands)
      printDotRecursive(Stream, MVisitedCmds, AllocaCmd);

  Stream << "}" << std::endl;

  unmarkVisitedNodes(MVisitedCmds);
}

MemObjRecord *Scheduler::GraphBuilder::getMemObjRecord(SYCLMemObjI *MemObject) {
  return MemObject->MRecord.get();
}

MemObjRecord *Scheduler::GraphBuilder::getOrInsertMemObjRecord(
    const QueueImplPtr &Queue, const Requirement *Req,
    std::vector<Command *> &ToEnqueue) {
  SYCLMemObjI *MemObject = Req->MSYCLMemObj;
  MemObjRecord *Record = getMemObjRecord(MemObject);

  if (nullptr != Record)
    return Record;

  const size_t LeafLimit = 8;
  LeavesCollection::AllocateDependencyF AllocateDependency =
      [this](Command *Dependant, Command *Dependency, MemObjRecord *Record,
             LeavesCollection::EnqueueListT &ToEnqueue) {
        // Add the old leaf as a dependency for the new one by duplicating one
        // of the requirements for the current record
        DepDesc Dep = findDepForRecord(Dependant, Record);
        Dep.MDepCommand = Dependency;
        std::vector<Command *> ToCleanUp;
        Command *ConnectionCmd = Dependant->addDep(Dep, ToCleanUp);
        if (ConnectionCmd)
          ToEnqueue.push_back(ConnectionCmd);

        --(Dependency->MLeafCounter);
        if (Dependency->readyForCleanup())
          ToCleanUp.push_back(Dependency);
        for (Command *Cmd : ToCleanUp)
          cleanupCommand(Cmd);
      };

  const ContextImplPtr &InteropCtxPtr = Req->MSYCLMemObj->getInteropContext();
  if (InteropCtxPtr) {
    // The memory object has been constructed using an interoperability
    // constructor, which means that there is already an allocation (cl_mem) in
    // some context. Register this allocation in the SYCL graph.

    std::vector<sycl::device> Devices =
        InteropCtxPtr->get_info<info::context::devices>();
    assert(Devices.size() != 0);
    DeviceImplPtr Dev = detail::getSyclObjImpl(Devices[0]);

    // Since all the Scheduler commands require a queue but we only have a
    // context here, we need to create a dummy queue bound to the context and
    // one of the devices from the context.
    QueueImplPtr InteropQueuePtr{new detail::queue_impl{
        Dev, InteropCtxPtr, /*AsyncHandler=*/{}, /*PropertyList=*/{}}};

    MemObject->MRecord.reset(
        new MemObjRecord{InteropCtxPtr, LeafLimit, AllocateDependency});
    getOrCreateAllocaForReq(MemObject->MRecord.get(), Req, InteropQueuePtr,
                            ToEnqueue);
  } else
    MemObject->MRecord.reset(new MemObjRecord{Queue->getContextImplPtr(),
                                              LeafLimit, AllocateDependency});

  MMemObjs.push_back(MemObject);
  return MemObject->MRecord.get();
}

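/// Removes the given commands from the record's read and write leaves when the
/// new access is not read-only; commands that were leaves and report
/// readyForCleanup() afterwards are collected into ToCleanUp.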
void Scheduler::GraphBuilder::updateLeaves(const std::set<Command *> &Cmds,
                                           MemObjRecord *Record,
                                           access::mode AccessMode,
                                           std::vector<Command *> &ToCleanUp) {

  const bool ReadOnlyReq = AccessMode == access::mode::read;
  if (ReadOnlyReq)
    return;

  for (Command *Cmd : Cmds) {
    bool WasLeaf = Cmd->MLeafCounter > 0;
    Cmd->MLeafCounter -= Record->MReadLeaves.remove(Cmd);
    Cmd->MLeafCounter -= Record->MWriteLeaves.remove(Cmd);
    if (WasLeaf && Cmd->readyForCleanup()) {
      ToCleanUp.push_back(Cmd);
    }
  }
}

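/// Registers a command as a leaf of the record: in the read leaves for
/// read-only access, in the write leaves otherwise.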
void Scheduler::GraphBuilder::addNodeToLeaves(
    MemObjRecord *Record, Command *Cmd, access::mode AccessMode,
    std::vector<Command *> &ToEnqueue) {
  LeavesCollection &Leaves{AccessMode == access::mode::read
                               ? Record->MReadLeaves
                               : Record->MWriteLeaves};
  if (Leaves.push_back(Cmd, ToEnqueue))
    ++Cmd->MLeafCounter;
}

UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd(
    MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue,
    std::vector<Command *> &ToEnqueue) {
  AllocaCommandBase *AllocaCmd =
      findAllocaForReq(Record, Req, Queue->getContextImplPtr());
  assert(AllocaCmd && "There must be alloca for requirement!");
  UpdateHostRequirementCommand *UpdateCommand =
      new UpdateHostRequirementCommand(Queue, *Req, AllocaCmd, &Req->MData);
  // A copy of the requirement is needed because the dependencies become
  // invalid after the host accessor's destructor runs if the requirement is
  // stored by pointer.
  const Requirement *StoredReq = UpdateCommand->getRequirement();

  std::set<Command *> Deps =
      findDepsForReq(Record, Req, Queue->getContextImplPtr());
  std::vector<Command *> ToCleanUp;
  for (Command *Dep : Deps) {
    Command *ConnCmd =
        UpdateCommand->addDep(DepDesc{Dep, StoredReq, AllocaCmd}, ToCleanUp);
    if (ConnCmd)
      ToEnqueue.push_back(ConnCmd);
  }
  updateLeaves(Deps, Record, Req->MAccessMode, ToCleanUp);
  addNodeToLeaves(Record, UpdateCommand, Req->MAccessMode, ToEnqueue);
  for (Command *Cmd : ToCleanUp)
    cleanupCommand(Cmd);
  return UpdateCommand;
}

// Takes linked alloca commands. Makes AllocaCmdDst command active using a map
// or unmap operation.
static Command *insertMapUnmapForLinkedCmds(AllocaCommandBase *AllocaCmdSrc,
                                            AllocaCommandBase *AllocaCmdDst,
                                            access::mode MapMode) {
  assert(AllocaCmdSrc->MLinkedAllocaCmd == AllocaCmdDst &&
         "Expected linked alloca commands");
  assert(AllocaCmdSrc->MIsActive &&
         "Expected source alloca command to be active");

  if (AllocaCmdSrc->getQueue()->is_host()) {
    UnMapMemObject *UnMapCmd = new UnMapMemObject(
        AllocaCmdDst, *AllocaCmdDst->getRequirement(),
        &AllocaCmdSrc->MMemAllocation, AllocaCmdDst->getQueue());

    std::swap(AllocaCmdSrc->MIsActive, AllocaCmdDst->MIsActive);

    return UnMapCmd;
  }

  MapMemObject *MapCmd = new MapMemObject(
      AllocaCmdSrc, *AllocaCmdSrc->getRequirement(),
      &AllocaCmdDst->MMemAllocation, AllocaCmdSrc->getQueue(), MapMode);

  std::swap(AllocaCmdSrc->MIsActive, AllocaCmdDst->MIsActive);

  return MapCmd;
}

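/// Ensures an up-to-date allocation for Req in the context of Queue: linked
/// allocations are switched with a map/unmap pair, any other pair gets a
/// full-buffer copy. Returns nullptr when a discard access mode makes the
/// transfer unnecessary.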
Command *Scheduler::GraphBuilder::insertMemoryMove(
    MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue,
    std::vector<Command *> &ToEnqueue) {

  AllocaCommandBase *AllocaCmdDst =
      getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue);
  if (!AllocaCmdDst)
    throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);

  std::set<Command *> Deps =
      findDepsForReq(Record, Req, Queue->getContextImplPtr());
  Deps.insert(AllocaCmdDst);
  // Get parent allocation of sub buffer to perform full copy of whole buffer
  if (IsSuitableSubReq(Req)) {
    if (AllocaCmdDst->getType() == Command::CommandType::ALLOCA_SUB_BUF)
      AllocaCmdDst =
          static_cast<AllocaSubBufCommand *>(AllocaCmdDst)->getParentAlloca();
  }

  AllocaCommandBase *AllocaCmdSrc =
      findAllocaForReq(Record, Req, Record->MCurContext);
  if (!AllocaCmdSrc && IsSuitableSubReq(Req)) {
    // Since no alloca command for the sub buffer requirement was found in the
    // current context, need to find a parent alloca command for it (it must be
    // there)
    auto IsSuitableAlloca = [Record](AllocaCommandBase *AllocaCmd) {
      bool Res = sameCtx(AllocaCmd->getQueue()->getContextImplPtr(),
                         Record->MCurContext) &&
                 // Looking for a parent buffer alloca command
                 AllocaCmd->getType() == Command::CommandType::ALLOCA;
      return Res;
    };
    const auto It =
        std::find_if(Record->MAllocaCommands.begin(),
                     Record->MAllocaCommands.end(), IsSuitableAlloca);
    AllocaCmdSrc = (Record->MAllocaCommands.end() != It) ? *It : nullptr;
  }
  if (!AllocaCmdSrc)
    throw runtime_error("Cannot find buffer allocation",
                        PI_ERROR_INVALID_VALUE);
  // Get parent allocation of sub buffer to perform full copy of whole buffer
  if (IsSuitableSubReq(Req)) {
    if (AllocaCmdSrc->getType() == Command::CommandType::ALLOCA_SUB_BUF)
      AllocaCmdSrc =
          static_cast<AllocaSubBufCommand *>(AllocaCmdSrc)->getParentAlloca();
    else if (AllocaCmdSrc->getSYCLMemObj() != Req->MSYCLMemObj)
      assert(false && "Inappropriate alloca command.");
  }

  Command *NewCmd = nullptr;

  if (AllocaCmdSrc->MLinkedAllocaCmd == AllocaCmdDst) {
    // Map write only as read-write
    access::mode MapMode = Req->MAccessMode;
    if (MapMode == access::mode::write)
      MapMode = access::mode::read_write;
    NewCmd = insertMapUnmapForLinkedCmds(AllocaCmdSrc, AllocaCmdDst, MapMode);
    Record->MHostAccess = MapMode;
  } else {
    if ((Req->MAccessMode == access::mode::discard_write) ||
        (Req->MAccessMode == access::mode::discard_read_write)) {
      Record->MCurContext = Queue->getContextImplPtr();
      return nullptr;
    } else {
      // Full copy of buffer is needed to avoid loss of data that may be caused
      // by copying specific range from host to device and backwards.
      NewCmd =
          new MemCpyCommand(*AllocaCmdSrc->getRequirement(), AllocaCmdSrc,
                            *AllocaCmdDst->getRequirement(), AllocaCmdDst,
                            AllocaCmdSrc->getQueue(), AllocaCmdDst->getQueue());
    }
  }
  std::vector<Command *> ToCleanUp;
  for (Command *Dep : Deps) {
    Command *ConnCmd = NewCmd->addDep(
        DepDesc{Dep, NewCmd->getRequirement(), AllocaCmdDst}, ToCleanUp);
    if (ConnCmd)
      ToEnqueue.push_back(ConnCmd);
  }
  updateLeaves(Deps, Record, access::mode::read_write, ToCleanUp);
  addNodeToLeaves(Record, NewCmd, access::mode::read_write, ToEnqueue);
  for (Command *Cmd : ToCleanUp)
    cleanupCommand(Cmd);
  Record->MCurContext = Queue->getContextImplPtr();
  return NewCmd;
}

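/// Re-maps the linked host allocation with the access mode required by Req by
/// inserting an unmap/map command pair, so that, for example, a mapping
/// created for read-only access can be upgraded to read-write.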
Command *Scheduler::GraphBuilder::remapMemoryObject(
    MemObjRecord *Record, Requirement *Req, AllocaCommandBase *HostAllocaCmd,
    std::vector<Command *> &ToEnqueue) {
  assert(HostAllocaCmd->getQueue()->is_host() &&
         "Host alloca command expected");
  assert(HostAllocaCmd->MIsActive && "Active alloca command expected");

  AllocaCommandBase *LinkedAllocaCmd = HostAllocaCmd->MLinkedAllocaCmd;
  assert(LinkedAllocaCmd && "Linked alloca command expected");

  std::set<Command *> Deps = findDepsForReq(Record, Req, Record->MCurContext);

  UnMapMemObject *UnMapCmd = new UnMapMemObject(
      LinkedAllocaCmd, *LinkedAllocaCmd->getRequirement(),
      &HostAllocaCmd->MMemAllocation, LinkedAllocaCmd->getQueue());

  // Map write only as read-write
  access::mode MapMode = Req->MAccessMode;
  if (MapMode == access::mode::write)
    MapMode = access::mode::read_write;
  MapMemObject *MapCmd = new MapMemObject(
      LinkedAllocaCmd, *LinkedAllocaCmd->getRequirement(),
      &HostAllocaCmd->MMemAllocation, LinkedAllocaCmd->getQueue(), MapMode);

  std::vector<Command *> ToCleanUp;
  for (Command *Dep : Deps) {
    Command *ConnCmd = UnMapCmd->addDep(
        DepDesc{Dep, UnMapCmd->getRequirement(), LinkedAllocaCmd}, ToCleanUp);
    if (ConnCmd)
      ToEnqueue.push_back(ConnCmd);
  }

  Command *ConnCmd = MapCmd->addDep(
      DepDesc{UnMapCmd, MapCmd->getRequirement(), HostAllocaCmd}, ToCleanUp);
  if (ConnCmd)
    ToEnqueue.push_back(ConnCmd);

  updateLeaves(Deps, Record, access::mode::read_write, ToCleanUp);
  addNodeToLeaves(Record, MapCmd, access::mode::read_write, ToEnqueue);
  for (Command *Cmd : ToCleanUp)
    cleanupCommand(Cmd);
  Record->MHostAccess = MapMode;
  return MapCmd;
}

// The function adds a copy operation of the most up-to-date memory to the
// memory pointed to by Req.
Command *
Scheduler::GraphBuilder::addCopyBack(Requirement *Req,
                                     std::vector<Command *> &ToEnqueue) {
  QueueImplPtr HostQueue = Scheduler::getInstance().getDefaultHostQueue();
  SYCLMemObjI *MemObj = Req->MSYCLMemObj;
  MemObjRecord *Record = getMemObjRecord(MemObj);
  if (Record && MPrintOptionsArray[BeforeAddCopyBack])
    printGraphAsDot("before_addCopyBack");

  // Do nothing if there were no or only read operations with the memory object.
  if (nullptr == Record || !Record->MMemModified)
    return nullptr;

  std::set<Command *> Deps =
      findDepsForReq(Record, Req, HostQueue->getContextImplPtr());
  AllocaCommandBase *SrcAllocaCmd =
      findAllocaForReq(Record, Req, Record->MCurContext);

  auto MemCpyCmdUniquePtr = std::make_unique<MemCpyCommandHost>(
      *SrcAllocaCmd->getRequirement(), SrcAllocaCmd, *Req, &Req->MData,
      SrcAllocaCmd->getQueue(), std::move(HostQueue));

  if (!MemCpyCmdUniquePtr)
    throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);

  MemCpyCommandHost *MemCpyCmd = MemCpyCmdUniquePtr.release();

  std::vector<Command *> ToCleanUp;
  for (Command *Dep : Deps) {
    Command *ConnCmd = MemCpyCmd->addDep(
        DepDesc{Dep, MemCpyCmd->getRequirement(), SrcAllocaCmd}, ToCleanUp);
    if (ConnCmd)
      ToEnqueue.push_back(ConnCmd);
  }

  updateLeaves(Deps, Record, Req->MAccessMode, ToCleanUp);
  addNodeToLeaves(Record, MemCpyCmd, Req->MAccessMode, ToEnqueue);
  for (Command *Cmd : ToCleanUp)
    cleanupCommand(Cmd);
  if (MPrintOptionsArray[AfterAddCopyBack])
    printGraphAsDot("after_addCopyBack");
  return MemCpyCmd;
}

// The function implements SYCL host accessor logic: a host accessor
// should provide access to the buffer in user space.
Command *
Scheduler::GraphBuilder::addHostAccessor(Requirement *Req,
                                         std::vector<Command *> &ToEnqueue) {

  const QueueImplPtr &HostQueue = getInstance().getDefaultHostQueue();

  MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req, ToEnqueue);
  if (MPrintOptionsArray[BeforeAddHostAcc])
    printGraphAsDot("before_addHostAccessor");
  markModifiedIfWrite(Record, Req);

  AllocaCommandBase *HostAllocaCmd =
      getOrCreateAllocaForReq(Record, Req, HostQueue, ToEnqueue);

  if (sameCtx(HostAllocaCmd->getQueue()->getContextImplPtr(),
              Record->MCurContext)) {
    if (!isAccessModeAllowed(Req->MAccessMode, Record->MHostAccess)) {
      remapMemoryObject(Record, Req,
                        Req->MIsSubBuffer ? (static_cast<AllocaSubBufCommand *>(
                                                 HostAllocaCmd))
                                                ->getParentAlloca()
                                          : HostAllocaCmd,
                        ToEnqueue);
    }
  } else
    insertMemoryMove(Record, Req, HostQueue, ToEnqueue);

  Command *UpdateHostAccCmd =
      insertUpdateHostReqCmd(Record, Req, HostQueue, ToEnqueue);

  // Need empty command to be blocked until host accessor is destructed
  EmptyCommand *EmptyCmd =
      addEmptyCmd(UpdateHostAccCmd, {Req}, HostQueue,
                  Command::BlockReason::HostAccessor, ToEnqueue);

  Req->MBlockedCmd = EmptyCmd;

  if (MPrintOptionsArray[AfterAddHostAcc])
    printGraphAsDot("after_addHostAccessor");

  return UpdateHostAccCmd;
}

Command *Scheduler::GraphBuilder::addCGUpdateHost(
    std::unique_ptr<detail::CG> CommandGroup, const QueueImplPtr &HostQueue,
    std::vector<Command *> &ToEnqueue) {

  auto UpdateHost = static_cast<CGUpdateHost *>(CommandGroup.get());
  Requirement *Req = UpdateHost->getReqToUpdate();

  MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req, ToEnqueue);
  return insertMemoryMove(Record, Req, HostQueue, ToEnqueue);
}

/// Starts the search for the record from the list of "leaf" commands and
/// checks if the examined command can be executed in parallel with the new one
/// with regard to the memory object. If it can, the search continues through
/// the dependencies of that command. There are several rules used:
///
/// 1. New and examined commands only read -> can bypass
/// 2. New and examined commands have non-overlapping requirements -> can bypass
/// 3. New and examined commands have different contexts -> cannot bypass
std::set<Command *>
Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record,
                                        const Requirement *Req,
                                        const ContextImplPtr &Context) {
  std::set<Command *> RetDeps;
  std::vector<Command *> Visited;
  const bool ReadOnlyReq = Req->MAccessMode == access::mode::read;

  std::vector<Command *> ToAnalyze{Record->MWriteLeaves.toVector()};

  if (!ReadOnlyReq) {
    std::vector<Command *> V{Record->MReadLeaves.toVector()};

    ToAnalyze.insert(ToAnalyze.begin(), V.begin(), V.end());
  }

  while (!ToAnalyze.empty()) {
    Command *DepCmd = ToAnalyze.back();
    ToAnalyze.pop_back();

    std::vector<Command *> NewAnalyze;

    for (const DepDesc &Dep : DepCmd->MDeps) {
      if (Dep.MDepRequirement->MSYCLMemObj != Req->MSYCLMemObj)
        continue;

      bool CanBypassDep = false;
      // If both only read
      CanBypassDep |=
          Dep.MDepRequirement->MAccessMode == access::mode::read && ReadOnlyReq;

      // If not overlap
      CanBypassDep |= !doOverlap(Dep.MDepRequirement, Req);

      // Going through copying memory between contexts is not supported.
      if (Dep.MDepCommand)
        CanBypassDep &=
            sameCtx(Context, Dep.MDepCommand->getQueue()->getContextImplPtr());

      if (!CanBypassDep) {
        RetDeps.insert(DepCmd);
        // No need to analyze the deps of the examined command as it is a
        // dependency itself.
        NewAnalyze.clear();
        break;
      }

      if (markNodeAsVisited(Dep.MDepCommand, Visited))
        NewAnalyze.push_back(Dep.MDepCommand);
    }
    ToAnalyze.insert(ToAnalyze.end(), NewAnalyze.begin(), NewAnalyze.end());
  }
  unmarkVisitedNodes(Visited);
  return RetDeps;
}

// A helper function for finding a command dependency on a specific memory
// object
DepDesc Scheduler::GraphBuilder::findDepForRecord(Command *Cmd,
                                                  MemObjRecord *Record) {
  for (const DepDesc &DD : Cmd->MDeps) {
    if (getMemObjRecord(DD.MDepRequirement->MSYCLMemObj) == Record) {
      return DD;
    }
  }
  assert(false && "No dependency found for a leaf of the record");
  return {nullptr, nullptr, nullptr};
}

// The function searches for the alloca command matching context and
// requirement.
AllocaCommandBase *Scheduler::GraphBuilder::findAllocaForReq(
    MemObjRecord *Record, const Requirement *Req, const ContextImplPtr &Context,
    bool AllowConst) {
  auto IsSuitableAlloca = [&Context, Req,
                           AllowConst](AllocaCommandBase *AllocaCmd) {
    bool Res = sameCtx(AllocaCmd->getQueue()->getContextImplPtr(), Context);
    if (IsSuitableSubReq(Req)) {
      const Requirement *TmpReq = AllocaCmd->getRequirement();
      Res &= AllocaCmd->getType() == Command::CommandType::ALLOCA_SUB_BUF;
      Res &= TmpReq->MOffsetInBytes == Req->MOffsetInBytes;
      Res &= TmpReq->MAccessRange == Req->MAccessRange;
      Res &= AllowConst || !AllocaCmd->MIsConst;
    }
    return Res;
  };
  const auto It = std::find_if(Record->MAllocaCommands.begin(),
                               Record->MAllocaCommands.end(), IsSuitableAlloca);
  return (Record->MAllocaCommands.end() != It) ? *It : nullptr;
}

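/// Returns true when every device in the context shares physical memory with
/// the host. A sketch of the assumed environment override:
///   SYCL_HOST_UNIFIED_MEMORY=0 -> only host contexts count as unified
///   SYCL_HOST_UNIFIED_MEMORY=1 -> every context is treated as unified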
static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx) {
  if (const char *HUMConfig = SYCLConfig<SYCL_HOST_UNIFIED_MEMORY>::get()) {
    if (std::strcmp(HUMConfig, "0") == 0)
      return Ctx->is_host();
    if (std::strcmp(HUMConfig, "1") == 0)
      return true;
  }
  for (const device &Device : Ctx->getDevices()) {
    if (!Device.get_info<info::device::host_unified_memory>())
      return false;
  }
  return true;
}

// The function searches for the alloca command matching context and
// requirement. If none exists, a new allocation command is created.
// Note: creation of a new allocation command can lead to the current context
// (Record->MCurContext) changing.
AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
    MemObjRecord *Record, const Requirement *Req, const QueueImplPtr &Queue,
    std::vector<Command *> &ToEnqueue) {

  AllocaCommandBase *AllocaCmd = findAllocaForReq(
      Record, Req, Queue->getContextImplPtr(), /*AllowConst=*/false);

  if (!AllocaCmd) {
    std::vector<Command *> ToCleanUp;
    if (IsSuitableSubReq(Req)) {
      // Get the parent requirement. It's hard to get the parent's range right,
      // so the full parent requirement has its range represented in bytes.
      range<3> ParentRange{Req->MSYCLMemObj->getSizeInBytes(), 1, 1};
      Requirement ParentRequirement(
          /*Offset*/ {0, 0, 0}, ParentRange, ParentRange,
          access::mode::read_write, Req->MSYCLMemObj, /*Dims*/ 1,
          /*Working with bytes*/ sizeof(char), /*offset*/ size_t(0));

      auto *ParentAlloca =
          getOrCreateAllocaForReq(Record, &ParentRequirement, Queue, ToEnqueue);
      AllocaCmd = new AllocaSubBufCommand(Queue, *Req, ParentAlloca, ToEnqueue,
                                          ToCleanUp);
    } else {

      const Requirement FullReq(/*Offset*/ {0, 0, 0}, Req->MMemoryRange,
                                Req->MMemoryRange, access::mode::read_write,
                                Req->MSYCLMemObj, Req->MDims, Req->MElemSize,
                                0 /*ReMOffsetInBytes*/, false /*MIsSubBuffer*/);
      // User data can be reused for the first allocation. Do so if host
      // unified memory is supported regardless of the access mode (the pointer
      // will be reused). For devices without host unified memory the
      // initialization will be performed as a write operation.
      // TODO the case where the first alloca is made with a discard mode and
      // the user pointer is read-only is still not handled: it leads to an
      // unnecessary copy on devices with unified host memory support.
      const bool HostUnifiedMemory =
          checkHostUnifiedMemory(Queue->getContextImplPtr());
      SYCLMemObjI *MemObj = Req->MSYCLMemObj;
      const bool InitFromUserData = Record->MAllocaCommands.empty() &&
                                    (HostUnifiedMemory || MemObj->isInterop());
      AllocaCommandBase *LinkedAllocaCmd = nullptr;

      // For the first allocation on a device without host unified memory we
      // might need to also create a host alloca right away in order to perform
      // the initial memory write.
      if (Record->MAllocaCommands.empty()) {
        if (!HostUnifiedMemory &&
            Req->MAccessMode != access::mode::discard_write &&
            Req->MAccessMode != access::mode::discard_read_write) {
          // There's no need to make a host allocation if the buffer is not
          // initialized with user data.
          if (MemObj->hasUserDataPtr()) {
            QueueImplPtr DefaultHostQueue =
                Scheduler::getInstance().getDefaultHostQueue();
            AllocaCommand *HostAllocaCmd = new AllocaCommand(
                DefaultHostQueue, FullReq, true /* InitFromUserData */,
                nullptr /* LinkedAllocaCmd */,
                MemObj->isHostPointerReadOnly() /* IsConst */);
            Record->MAllocaCommands.push_back(HostAllocaCmd);
            Record->MWriteLeaves.push_back(HostAllocaCmd, ToEnqueue);
            ++(HostAllocaCmd->MLeafCounter);
            Record->MCurContext = DefaultHostQueue->getContextImplPtr();
          }
        }
      } else {
        // If it is not the first allocation, try to set up a link
        // FIXME: Temporary limitation, linked alloca commands for an image are
        // not supported because the map operation is not implemented for an
        // image.
        if (Req->MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer)
          // The current limitation is to set up a link between the current
          // allocation and the new one. There could be situations where we
          // could set up a link with a non-current allocation, but it would
          // require a memory copy.
          // A link can only be set up between cl and host allocations.
          if (Queue->is_host() != Record->MCurContext->is_host()) {
            // Linked commands assume that the host allocation is reused by the
            // plugin runtime and that can lead to unnecessary copy overhead on
            // devices that do not support host unified memory. Do not link the
            // allocations in this case.
            // However, if the user explicitly requests use of pinned host
            // memory, map/unmap operations are expected to work faster than
            // read/write from/to an arbitrary host pointer. Link such commands
            // regardless of host unified memory support.
            bool PinnedHostMemory = MemObj->usesPinnedHostMemory();

            bool HostUnifiedMemoryOnNonHostDevice =
                Queue->is_host() ? checkHostUnifiedMemory(Record->MCurContext)
                                 : HostUnifiedMemory;
            if (PinnedHostMemory || HostUnifiedMemoryOnNonHostDevice) {
              AllocaCommandBase *LinkedAllocaCmdCand = findAllocaForReq(
                  Record, Req, Record->MCurContext, /*AllowConst=*/false);

              // Cannot set up a link if the candidate is linked already
              if (LinkedAllocaCmdCand &&
                  !LinkedAllocaCmdCand->MLinkedAllocaCmd) {
                LinkedAllocaCmd = LinkedAllocaCmdCand;
              }
            }
          }
      }

      AllocaCmd =
          new AllocaCommand(Queue, FullReq, InitFromUserData, LinkedAllocaCmd);

      // Update linked command
      if (LinkedAllocaCmd) {
        Command *ConnCmd = AllocaCmd->addDep(
            DepDesc{LinkedAllocaCmd, AllocaCmd->getRequirement(),
                    LinkedAllocaCmd},
            ToCleanUp);
        if (ConnCmd)
          ToEnqueue.push_back(ConnCmd);
        LinkedAllocaCmd->MLinkedAllocaCmd = AllocaCmd;

        // To ensure that the leader allocation is removed first
        ConnCmd = AllocaCmd->getReleaseCmd()->addDep(
            DepDesc(LinkedAllocaCmd->getReleaseCmd(),
                    AllocaCmd->getRequirement(), LinkedAllocaCmd),
            ToCleanUp);
        if (ConnCmd)
          ToEnqueue.push_back(ConnCmd);

        // The device allocation takes ownership of the host pointer during
        // construction, the host allocation doesn't. So the device allocation
        // should always be active here. Also, if the "follower" command is a
        // device one, we have to change the current context to the device one.
        if (Queue->is_host()) {
          AllocaCmd->MIsActive = false;
        } else {
          LinkedAllocaCmd->MIsActive = false;
          Record->MCurContext = Queue->getContextImplPtr();

          std::set<Command *> Deps =
              findDepsForReq(Record, Req, Queue->getContextImplPtr());
          for (Command *Dep : Deps) {
            Command *ConnCmd = AllocaCmd->addDep(
                DepDesc{Dep, Req, LinkedAllocaCmd}, ToCleanUp);
            if (ConnCmd)
              ToEnqueue.push_back(ConnCmd);
          }
          updateLeaves(Deps, Record, Req->MAccessMode, ToCleanUp);
          addNodeToLeaves(Record, AllocaCmd, Req->MAccessMode, ToEnqueue);
        }
      }
    }

    Record->MAllocaCommands.push_back(AllocaCmd);
    Record->MWriteLeaves.push_back(AllocaCmd, ToEnqueue);
    ++(AllocaCmd->MLeafCounter);
    for (Command *Cmd : ToCleanUp)
      cleanupCommand(Cmd);
  }
  return AllocaCmd;
}

// The function sets MemModified flag in record if requirement has write access.
void Scheduler::GraphBuilder::markModifiedIfWrite(MemObjRecord *Record,
                                                  Requirement *Req) {
  switch (Req->MAccessMode) {
  case access::mode::write:
  case access::mode::read_write:
  case access::mode::discard_write:
  case access::mode::discard_read_write:
  case access::mode::atomic:
    Record->MMemModified = true;
    break;
  case access::mode::read:
    break;
  }
}

EmptyCommand *Scheduler::GraphBuilder::addEmptyCmd(
    Command *Cmd, const std::vector<Requirement *> &Reqs,
    const QueueImplPtr &Queue, Command::BlockReason Reason,
    std::vector<Command *> &ToEnqueue, const bool AddDepsToLeaves) {
  EmptyCommand *EmptyCmd =
      new EmptyCommand(Scheduler::getInstance().getDefaultHostQueue());

  if (!EmptyCmd)
    throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);

  EmptyCmd->MIsBlockable = true;
  EmptyCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueBlocked;
  EmptyCmd->MBlockReason = Reason;

  for (Requirement *Req : Reqs) {
    MemObjRecord *Record = getOrInsertMemObjRecord(Queue, Req, ToEnqueue);
    AllocaCommandBase *AllocaCmd =
        getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue);
    EmptyCmd->addRequirement(Cmd, AllocaCmd, Req);
  }
  // The addRequirement calls above invoke addDep, which already adds EmptyCmd
  // as a user of Cmd. There is no check of the size of Reqs here, so assume it
  // is possible for no Reqs to be passed.
  if (!Reqs.size())
    Cmd->addUser(EmptyCmd);

  if (AddDepsToLeaves) {
    const std::vector<DepDesc> &Deps = Cmd->MDeps;
    std::vector<Command *> ToCleanUp;
    for (const DepDesc &Dep : Deps) {
      const Requirement *Req = Dep.MDepRequirement;
      MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj);

      updateLeaves({Cmd}, Record, Req->MAccessMode, ToCleanUp);
      addNodeToLeaves(Record, EmptyCmd, Req->MAccessMode, ToEnqueue);
    }
    for (Command *Cmd : ToCleanUp)
      cleanupCommand(Cmd);
  }

  return EmptyCmd;
}

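// Returns true if the command is a host task created from an interop task,
// i.e. one that is presumably given handles to the native backend objects.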
static bool isInteropHostTask(ExecCGCommand *Cmd) {
  if (Cmd->getCG().getType() != CG::CGTYPE::CodeplayHostTask)
    return false;

  const detail::CGHostTask &HT =
      static_cast<detail::CGHostTask &>(Cmd->getCG());

  return HT.MHostTask->isInteropTask();
}

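// Collapses multiple requirements on the same memory object into one combined
// access mode per requirement. For example, under the rules of
// combineAccessModes above, a read and a write requirement on the same buffer
// both end up as read_write.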
static void combineAccessModesOfReqs(std::vector<Requirement *> &Reqs) {
  std::unordered_map<SYCLMemObjI *, access::mode> CombinedModes;
  bool HasDuplicateMemObjects = false;
  for (const Requirement *Req : Reqs) {
    auto Result = CombinedModes.insert(
        std::make_pair(Req->MSYCLMemObj, Req->MAccessMode));
    if (!Result.second) {
      Result.first->second =
          combineAccessModes(Result.first->second, Req->MAccessMode);
      HasDuplicateMemObjects = true;
    }
  }

  if (!HasDuplicateMemObjects)
    return;
  for (Requirement *Req : Reqs) {
    Req->MAccessMode = CombinedModes[Req->MSYCLMemObj];
  }
}

Scheduler::GraphBuildResult Scheduler::GraphBuilder::addCG(
    std::unique_ptr<detail::CG> CommandGroup, const QueueImplPtr &Queue,
    std::vector<Command *> &ToEnqueue,
    sycl::detail::pi::PiExtCommandBuffer CommandBuffer,
    const std::vector<sycl::detail::pi::PiExtSyncPoint> &Dependencies) {
  std::vector<Requirement *> &Reqs = CommandGroup->getRequirements();
  std::vector<detail::EventImplPtr> &Events = CommandGroup->getEvents();

  auto NewCmd = std::make_unique<ExecCGCommand>(
      std::move(CommandGroup), Queue, CommandBuffer, std::move(Dependencies));

  if (!NewCmd)
    throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);

  // Host tasks cannot participate in fusion. They take the regular route. If
  // they create any requirement or event dependency on any of the kernels in
  // the fusion list, this will lead to cancellation of the fusion in the
  // GraphProcessor.
  auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
  if (isInFusionMode(QUniqueID) && !NewCmd->isHostTask()) {
    auto *FusionCmd = findFusionList(QUniqueID)->second.get();

    bool dependsOnFusion = false;
    for (auto Ev = Events.begin(); Ev != Events.end();) {
      auto *EvDepCmd = static_cast<Command *>((*Ev)->getCommand());
      if (!EvDepCmd) {
        // Advance the iterator before continuing; otherwise an event without
        // an associated command would cause an infinite loop.
        ++Ev;
        continue;
      }
      // Handle event dependencies on any commands part of another active
      // fusion.
      if (EvDepCmd->getQueue() != Queue && isPartOfActiveFusion(EvDepCmd)) {
        printFusionWarning("Aborting fusion because of event dependency from a "
                           "different fusion");
        cancelFusion(EvDepCmd->getQueue(), ToEnqueue);
      }
      // Check if this command depends on the placeholder command for the
      // fusion this command itself participates in.
      if (EvDepCmd == FusionCmd) {
        Ev = Events.erase(Ev);
        dependsOnFusion = true;
      } else {
        ++Ev;
      }
    }

    // If this command has an explicit event dependency on the placeholder
    // command for this fusion (because it used depends_on on the event returned
    // by submitting another kernel to this fusion earlier), add a dependency on
    // all the commands in the fusion list so far.
    if (dependsOnFusion) {
      for (auto *Cmd : FusionCmd->getFusionList()) {
        Events.push_back(Cmd->getEvent());
      }
    }

    // Add the kernel to the graph, but delay the enqueue of any auxiliary
    // commands (e.g., allocations) resulting from that process by adding them
    // to the list of auxiliary commands of the fusion command.
    createGraphForCommand(NewCmd.get(), NewCmd->getCG(),
                          isInteropHostTask(NewCmd.get()), Reqs, Events, Queue,
                          FusionCmd->auxiliaryCommands());

    // Set the fusion command, so we recognize when another command depends on
    // a kernel in the fusion list.
    FusionCmd->addToFusionList(NewCmd.get());
    NewCmd->MFusionCmd = FusionCmd;
    std::vector<Command *> ToCleanUp;
    // Add an event dependency from the fusion placeholder command to the new
    // kernel.
    auto ConnectionCmd = FusionCmd->addDep(NewCmd->getEvent(), ToCleanUp);
    if (ConnectionCmd) {
      FusionCmd->auxiliaryCommands().push_back(ConnectionCmd);
    }
    return {NewCmd.release(), FusionCmd->getEvent(), false};
  }
  createGraphForCommand(NewCmd.get(), NewCmd->getCG(),
                        isInteropHostTask(NewCmd.get()), Reqs, Events, Queue,
                        ToEnqueue);
  auto Event = NewCmd->getEvent();
  return {NewCmd.release(), Event, true};
}

void Scheduler::GraphBuilder::createGraphForCommand(
    Command *NewCmd, CG &CG, bool isInteropTask,
    std::vector<Requirement *> &Reqs,
    const std::vector<detail::EventImplPtr> &Events, QueueImplPtr Queue,
    std::vector<Command *> &ToEnqueue) {

  if (MPrintOptionsArray[BeforeAddCG])
    printGraphAsDot("before_addCG");

  // If there are multiple requirements for the same memory object, its
  // AllocaCommand creation will be dependent on the access mode of the first
  // requirement. Combine these access modes to take all of them into account.
  combineAccessModesOfReqs(Reqs);
  std::vector<Command *> ToCleanUp;
  for (Requirement *Req : Reqs) {
    MemObjRecord *Record = nullptr;
    AllocaCommandBase *AllocaCmd = nullptr;

    bool isSameCtx = false;

    {
      const QueueImplPtr &QueueForAlloca =
          isInteropTask ? static_cast<detail::CGHostTask &>(CG).MQueue : Queue;

      Record = getOrInsertMemObjRecord(QueueForAlloca, Req, ToEnqueue);
      markModifiedIfWrite(Record, Req);

      AllocaCmd =
          getOrCreateAllocaForReq(Record, Req, QueueForAlloca, ToEnqueue);

      isSameCtx =
          sameCtx(QueueForAlloca->getContextImplPtr(), Record->MCurContext);
    }

    // If there is an alloca command, we need to check that the latest memory
    // is in the required context.
    if (isSameCtx) {
      // If the memory is already in the required host context, check if the
      // required access mode is valid, remap if not.
      if (Record->MCurContext->is_host() &&
          !isAccessModeAllowed(Req->MAccessMode, Record->MHostAccess)) {
        remapMemoryObject(Record, Req,
                          Req->MIsSubBuffer
                              ? (static_cast<AllocaSubBufCommand *>(AllocaCmd))
                                    ->getParentAlloca()
                              : AllocaCmd,
                          ToEnqueue);
      }
    } else {
      // Cannot directly copy memory from OpenCL device to OpenCL device -
      // create two copies: device->host and host->device.
      bool NeedMemMoveToHost = false;
      auto MemMoveTargetQueue = Queue;

      if (isInteropTask) {
        const detail::CGHostTask &HT = static_cast<detail::CGHostTask &>(CG);

        if (HT.MQueue->getContextImplPtr() != Record->MCurContext) {
          NeedMemMoveToHost = true;
          MemMoveTargetQueue = HT.MQueue;
        }
      } else if (!Queue->is_host() && !Record->MCurContext->is_host())
        NeedMemMoveToHost = true;

      if (NeedMemMoveToHost)
        insertMemoryMove(Record, Req,
                         Scheduler::getInstance().getDefaultHostQueue(),
                         ToEnqueue);
      insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue);
    }
    std::set<Command *> Deps =
        findDepsForReq(Record, Req, Queue->getContextImplPtr());

    for (Command *Dep : Deps) {
      if (Dep != NewCmd) {
        Command *ConnCmd =
            NewCmd->addDep(DepDesc{Dep, Req, AllocaCmd}, ToCleanUp);
        if (ConnCmd)
          ToEnqueue.push_back(ConnCmd);
      }
    }
  }

  // Set new command as user for dependencies and update leaves.
  // Node dependencies can be modified further when adding the node to leaves,
  // iterate over their copy.
  // FIXME employ a reference here to eliminate copying of a vector
  std::vector<DepDesc> Deps = NewCmd->MDeps;
  for (DepDesc &Dep : Deps) {
    const Requirement *Req = Dep.MDepRequirement;
    MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj);
    updateLeaves({Dep.MDepCommand}, Record, Req->MAccessMode, ToCleanUp);
    addNodeToLeaves(Record, NewCmd, Req->MAccessMode, ToEnqueue);
  }

  // Register all the events as dependencies
  for (detail::EventImplPtr e : Events) {
    if (e->getCommand() && e->getCommand() == NewCmd) {
      continue;
    }
    if (Command *ConnCmd = NewCmd->addDep(e, ToCleanUp))
      ToEnqueue.push_back(ConnCmd);
  }

  if (MPrintOptionsArray[AfterAddCG])
    printGraphAsDot("after_addCG");

  for (Command *Cmd : ToCleanUp) {
    cleanupCommand(Cmd);
  }
}

void Scheduler::GraphBuilder::decrementLeafCountersForRecord(
    MemObjRecord *Record) {
  for (Command *Cmd : Record->MReadLeaves) {
    --(Cmd->MLeafCounter);
    if (Cmd->readyForCleanup())
      cleanupCommand(Cmd);
  }
  for (Command *Cmd : Record->MWriteLeaves) {
    --(Cmd->MLeafCounter);
    if (Cmd->readyForCleanup())
      cleanupCommand(Cmd);
  }
}

void Scheduler::GraphBuilder::cleanupCommandsForRecord(MemObjRecord *Record) {
  std::vector<AllocaCommandBase *> &AllocaCommands = Record->MAllocaCommands;
  if (AllocaCommands.empty())
    return;

  assert(MCmdsToVisit.empty());
  MVisitedCmds.clear();

  // First, mark all allocas for deletion and their direct users for traversal
  // Dependencies of the users will be cleaned up during the traversal
  for (Command *AllocaCmd : AllocaCommands) {
    markNodeAsVisited(AllocaCmd, MVisitedCmds);

    for (Command *UserCmd : AllocaCmd->MUsers)
      // Linked alloca cmd may be in users of this alloca. We're not going to
      // visit it.
      if (UserCmd->getType() != Command::CommandType::ALLOCA)
        MCmdsToVisit.push(UserCmd);
      else
        markNodeAsVisited(UserCmd, MVisitedCmds);

    AllocaCmd->MMarks.MToBeDeleted = true;
    // These commands will be deleted later, clear users now to avoid
    // updating them during edge removal
    AllocaCmd->MUsers.clear();
  }

  // Make sure the Linked Allocas are marked visited by the previous walk.
  // Remove allocation commands from the users of their dependencies.
  for (AllocaCommandBase *AllocaCmd : AllocaCommands) {
    AllocaCommandBase *LinkedCmd = AllocaCmd->MLinkedAllocaCmd;

    if (LinkedCmd) {
      assert(LinkedCmd->MMarks.MVisited);
    }

    for (DepDesc &Dep : AllocaCmd->MDeps)
      if (Dep.MDepCommand)
        Dep.MDepCommand->MUsers.erase(AllocaCmd);
  }

  // Traverse the graph using BFS
  while (!MCmdsToVisit.empty()) {
    Command *Cmd = MCmdsToVisit.front();
    MCmdsToVisit.pop();

    if (!markNodeAsVisited(Cmd, MVisitedCmds))
      continue;

    for (Command *UserCmd : Cmd->MUsers)
      if (UserCmd->getType() != Command::CommandType::ALLOCA)
        MCmdsToVisit.push(UserCmd);

    // Delete all dependencies on any allocations being removed
    // Track which commands should have their users updated
    std::map<Command *, bool> ShouldBeUpdated;
    auto NewEnd = std::remove_if(
        Cmd->MDeps.begin(), Cmd->MDeps.end(), [&](const DepDesc &Dep) {
          if (std::find(AllocaCommands.begin(), AllocaCommands.end(),
                        Dep.MAllocaCmd) != AllocaCommands.end()) {
            ShouldBeUpdated.insert({Dep.MDepCommand, true});
            return true;
          }
          ShouldBeUpdated[Dep.MDepCommand] = false;
          return false;
        });
    Cmd->MDeps.erase(NewEnd, Cmd->MDeps.end());

    // Update users of removed dependencies
    for (auto DepCmdIt : ShouldBeUpdated) {
      if (!DepCmdIt.second)
        continue;
      DepCmdIt.first->MUsers.erase(Cmd);
    }

    // If all dependencies have been removed this way, mark the command for
    // deletion
    if (Cmd->MDeps.empty()) {
      Cmd->MUsers.clear();
      // Do not delete the node if it's scheduled for post-enqueue cleanup to
      // avoid double free.
      if (!Cmd->MMarkedForCleanup)
        Cmd->MMarks.MToBeDeleted = true;
    }
  }

  handleVisitedNodes(MVisitedCmds);
}

void Scheduler::GraphBuilder::cleanupCommand(
    Command *Cmd, [[maybe_unused]] bool AllowUnsubmitted) {
  if (SYCLConfig<SYCL_DISABLE_POST_ENQUEUE_CLEANUP>::get()) {
    static bool DeprWarningPrinted = false;
    if (!DeprWarningPrinted) {
      std::cerr << "WARNING: The environment variable "
                   "SYCL_DISABLE_POST_ENQUEUE_CLEANUP is deprecated. Please "
                   "use SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP instead.\n";
      DeprWarningPrinted = true;
    }
    return;
  }
  if (Cmd->MMarkedForCleanup)
    return;

  assert(Cmd->MLeafCounter == 0 &&
         (Cmd->isSuccessfullyEnqueued() || AllowUnsubmitted));
  Command::CommandType CmdT = Cmd->getType();

  assert(CmdT != Command::ALLOCA && CmdT != Command::ALLOCA_SUB_BUF);
  assert(CmdT != Command::RELEASE);
  (void)CmdT;

  for (Command *UserCmd : Cmd->MUsers) {
    for (DepDesc &Dep : UserCmd->MDeps) {
      // Link the users of the command to the alloca command(s) instead
      if (Dep.MDepCommand == Cmd) {
        // ... unless the user is the alloca itself.
        if (Dep.MAllocaCmd == UserCmd) {
          Dep.MDepCommand = nullptr;
        } else {
          Dep.MDepCommand = Dep.MAllocaCmd;
          Dep.MDepCommand->MUsers.insert(UserCmd);
        }
      }
    }
  }
  // Update dependency users
  for (DepDesc &Dep : Cmd->MDeps) {
    Command *DepCmd = Dep.MDepCommand;
    DepCmd->MUsers.erase(Cmd);
  }

  if (Cmd->getType() == Command::FUSION &&
      !static_cast<KernelFusionCommand *>(Cmd)->readyForDeletion()) {
    // Fusion commands might still be needed because fusion might be aborted,
    // but a later call to complete_fusion still needs to be able to return a
    // valid event. Clean-up of fusion commands is therefore explicitly handled
    // by start fusion.
    return;
  }
  Cmd->getEvent()->setCommand(nullptr);
  delete Cmd;
}

void Scheduler::GraphBuilder::removeRecordForMemObj(SYCLMemObjI *MemObject) {
  const auto It = std::find_if(
      MMemObjs.begin(), MMemObjs.end(),
      [MemObject](const SYCLMemObjI *Obj) { return Obj == MemObject; });
  if (It != MMemObjs.end())
    MMemObjs.erase(It);
  MemObject->MRecord.reset();
}

// Make Cmd depend on DepEvent from a different context. The connection is
// performed via a distinct ConnectCmd with a host task command group on the
// host queue. Cmd will depend on ConnectCmd's host event.
// DepEvent may not have a command associated with it in at least two cases:
//  - the command was deleted during the cleanup process;
//  - DepEvent is a user event.
// In both of these cases the only thing we can do is to make ConnectCmd depend
// on DepEvent.
// Otherwise, when there is a command associated with DepEvent, we make
// ConnectCmd depend on this command. If there is a valid, i.e. non-nil,
// requirement in Dep, we make ConnectCmd depend on DepEvent's command with
// this requirement.
// Optionality of Dep is expressed by Dep.MDepCommand being equal to nullptr.
Command *Scheduler::GraphBuilder::connectDepEvent(
    Command *const Cmd, const EventImplPtr &DepEvent, const DepDesc &Dep,
    std::vector<Command *> &ToCleanUp) {
  assert(Cmd->getWorkerContext() != DepEvent->getContextImpl());

  // Construct a host-task-type command manually and make it depend on DepEvent
  ExecCGCommand *ConnectCmd = nullptr;

  try {
    std::unique_ptr<detail::HostTask> HT(new detail::HostTask);
    std::unique_ptr<detail::CG> ConnectCG(new detail::CGHostTask(
        std::move(HT), /* Queue = */ {}, /* Context = */ {}, /* Args = */ {},
        detail::CG::StorageInitHelper(
            /* ArgsStorage = */ {}, /* AccStorage = */ {},
            /* SharedPtrStorage = */ {}, /* Requirements = */ {},
            /* DepEvents = */ {DepEvent}),
        CG::CodeplayHostTask,
        /* Payload */ {}));
    ConnectCmd = new ExecCGCommand(
        std::move(ConnectCG), Scheduler::getInstance().getDefaultHostQueue());
  } catch (const std::bad_alloc &) {
    throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
  }

  if (Dep.MDepRequirement) {
    // make ConnectCmd depend on requirement
    // Dismiss the result here as it's not a connection now,
    // 'cause ConnectCmd is a host one
    (void)ConnectCmd->addDep(Dep, ToCleanUp);
    assert(reinterpret_cast<Command *>(DepEvent->getCommand()) ==
           Dep.MDepCommand);
    // adding the user to Dep.MDepCommand is already performed beyond this if
    // branch
    {
      DepDesc DepOnConnect = Dep;
      DepOnConnect.MDepCommand = ConnectCmd;

      // Dismiss the result here as it's not a connection now,
      // 'cause ConnectCmd is a host one
      std::ignore = Cmd->addDep(DepOnConnect, ToCleanUp);
    }
  } else {
    // It is a required condition in another path and addUser will be set in
    // addDep
    if (Command *DepCmd = reinterpret_cast<Command *>(DepEvent->getCommand()))
      DepCmd->addUser(ConnectCmd);

    std::ignore = ConnectCmd->addDep(DepEvent, ToCleanUp);

    std::ignore = Cmd->addDep(ConnectCmd->getEvent(), ToCleanUp);

    ConnectCmd->addUser(Cmd);
  }

  return ConnectCmd;
}

void Scheduler::GraphBuilder::startFusion(QueueImplPtr Queue) {
  cleanUpCmdFusion(Queue.get());
  auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
  MFusionMap.emplace(QUniqueID, std::make_unique<KernelFusionCommand>(Queue));
}

void Scheduler::GraphBuilder::cleanUpCmdFusion(
    sycl::detail::queue_impl *Queue) {
  auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue);
  if (isInFusionMode(QUniqueID)) {
    throw sycl::exception{sycl::make_error_code(sycl::errc::invalid),
                          "Queue already in fusion mode"};
  }
  auto OldFusionCmd = findFusionList(QUniqueID);
  if (OldFusionCmd != MFusionMap.end()) {
    // If fusion was used on this queue previously, the old fusion command
    // might still be around to make sure that even after cancellation of the
    // fusion due to synchronization, complete_fusion is still able to return a
    // valid event.
    OldFusionCmd->second->setFusionStatus(
        KernelFusionCommand::FusionStatus::DELETED);
    cleanupCommand(OldFusionCmd->second.release());
    MFusionMap.erase(OldFusionCmd);
  }
}

void Scheduler::GraphBuilder::removeNodeFromGraph(
    Command *Node, std::vector<Command *> &ToEnqueue) {
  // Remove the placeholder command as leaf of all its requirements and from
  // the user list of all its dependencies.
  for (auto &Dep : Node->MDeps) {
    auto AccessMode = Dep.MDepRequirement->MAccessMode;
    auto *Record = getMemObjRecord(Dep.MDepRequirement->MSYCLMemObj);

    Node->MLeafCounter -= Record->MReadLeaves.remove(Node);
    Node->MLeafCounter -= Record->MWriteLeaves.remove(Node);
    // If the placeholder had a write-requirement on this record, we need to
    // restore the previous leaves.
    if (AccessMode != access::mode::read) {
      for (auto PrevDep : Dep.MDepCommand->MDeps) {
        auto *DepReq = PrevDep.MDepRequirement;
        auto *DepRecord = getMemObjRecord(DepReq->MSYCLMemObj);
        if (DepRecord == Record) {
          // Need to restore this as a leaf, because we pushed it from the
          // leaves when adding the placeholder command.
          assert(Dep.MDepCommand);
          addNodeToLeaves(Record, Dep.MDepCommand, DepReq->MAccessMode,
                          ToEnqueue);
        }
      }
    }
    Dep.MDepCommand->MUsers.erase(Node);
  }

  // Clear all the dependencies to avoid cleanDepEventsThroughOneLevel, which
  // is called from the destructor of the command, deleting the dependencies of
  // the commands this command depends on.
  Node->clearAllDependencies();
}

void Scheduler::GraphBuilder::cancelFusion(QueueImplPtr Queue,
                                           std::vector<Command *> &ToEnqueue) {
  auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
  if (!isInFusionMode(QUniqueID)) {
    return;
  }
  auto FusionList = findFusionList(QUniqueID);

  auto *PlaceholderCmd = (*FusionList).second.get();

  // Enqueue all the kernels/commands from the fusion list
  auto FusedCmdList = PlaceholderCmd->getFusionList();
  ToEnqueue.insert(ToEnqueue.end(), FusedCmdList.begin(), FusedCmdList.end());

  // The commands establishing an event dependency between the fusion
  // placeholder command and the individual kernels need to be enqueued.
  ToEnqueue.insert(ToEnqueue.end(), PlaceholderCmd->auxiliaryCommands().begin(),
                   PlaceholderCmd->auxiliaryCommands().end());

  ToEnqueue.push_back(PlaceholderCmd);

  if (MPrintOptionsArray[AfterFusionCancel]) {
    printGraphAsDot("after_fusionCancel");
  }

  // Set the status for the fusion command
  PlaceholderCmd->setFusionStatus(KernelFusionCommand::FusionStatus::CANCELLED);
}

static bool isPartOfFusion(Command *Cmd, KernelFusionCommand *Fusion) {
  if (Cmd->getType() == Command::RUN_CG) {
    return static_cast<ExecCGCommand *>(Cmd)->MFusionCmd == Fusion;
  }
  return false;
}

static bool checkForCircularDependency(Command *, bool, KernelFusionCommand *);

static bool createsCircularDependency(Command *Cmd, bool PredPartOfFusion,
                                      KernelFusionCommand *Fusion) {
  if (isPartOfFusion(Cmd, Fusion)) {
    // If this is part of the fusion and the predecessor also was, we can stop
    // the traversal here. A direct dependency between two kernels in the same
    // fusion will never form a cyclic dependency and by iterating over all
    // commands in a fusion, we will detect any cycles originating from the
    // current command.
    // If the predecessor was not part of the fusion, but the current command
    // is, we have found a potential cycle in the dependency graph.
    return !PredPartOfFusion;
  }
  return checkForCircularDependency(Cmd, false, Fusion);
}

static bool checkForCircularDependency(Command *Cmd, bool IsPartOfFusion,
                                       KernelFusionCommand *Fusion) {
  // Check the requirement dependencies.
  for (auto &Dep : Cmd->MDeps) {
    auto *DepCmd = Dep.MDepCommand;
    if (!DepCmd) {
      continue;
    }
    if (createsCircularDependency(DepCmd, IsPartOfFusion, Fusion)) {
      return true;
    }
  }
  for (auto &Ev : Cmd->getPreparedDepsEvents()) {
    auto *EvDepCmd = static_cast<Command *>(Ev->getCommand());
    if (!EvDepCmd) {
      continue;
    }
    if (createsCircularDependency(EvDepCmd, IsPartOfFusion, Fusion)) {
      return true;
    }
  }
  for (auto &Ev : Cmd->getPreparedHostDepsEvents()) {
    auto *EvDepCmd = static_cast<Command *>(Ev->getCommand());
    if (!EvDepCmd) {
      continue;
    }
    if (createsCircularDependency(EvDepCmd, IsPartOfFusion, Fusion)) {
      return true;
    }
  }
  return false;
}
1508 
1511  std::vector<Command *> &ToEnqueue,
1512  const property_list &PropList) {
1513  auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
1514 #if SYCL_EXT_CODEPLAY_KERNEL_FUSION
1515  if (!isInFusionMode(QUniqueID)) {
1516  auto InactiveFusionList = findFusionList(QUniqueID);
1517  if (InactiveFusionList == MFusionMap.end()) {
1518  throw sycl::exception{
1519  sycl::make_error_code(sycl::errc::invalid),
1520  "Calling complete_fusion on a queue not in fusion mode"};
1521  }
1522  return InactiveFusionList->second->getEvent();
1523  }
1524 
1525  auto FusionList = findFusionList(QUniqueID);
1526  auto *PlaceholderCmd = FusionList->second.get();
1527  auto &CmdList = PlaceholderCmd->getFusionList();
1528 
1529  // We need to check if fusing the kernel would create a circular dependency. A
1530  // circular dependency would arise, if a kernel in the fusion list
1531  // *indirectly* depends on another kernel in the fusion list. Here, indirectly
1532  // means, that the dependency is created through a third command not part of
1533  // the fusion, on which this kernel depends and which in turn depends on
1534  // another kernel in fusion list.
1535  bool CreatesCircularDep =
1536  std::any_of(CmdList.begin(), CmdList.end(), [&](ExecCGCommand *Cmd) {
1537  return checkForCircularDependency(Cmd, true, PlaceholderCmd);
1538  });
1539  if (CreatesCircularDep) {
1540  // If fusing would create a fused kernel, cancel the fusion.
1541  printFusionWarning(
1542  "Aborting fusion because it would create a circular dependency");
1543  auto LastEvent = PlaceholderCmd->getEvent();
1544  this->cancelFusion(Queue, ToEnqueue);
1545  return LastEvent;
1546  }
1547 
1548  // Call the JIT compiler to generate a new fused kernel.
1550  Queue, CmdList, PropList);
1551 
1552  if (!FusedCG) {
1553  // If the JIT compiler returns a nullptr, JIT compilation of the fused
1554  // kernel failed. In that case, simply cancel the fusion and run each kernel
1555  // on its own.
1556  auto LastEvent = PlaceholderCmd->getEvent();
1557  this->cancelFusion(Queue, ToEnqueue);
1558  return LastEvent;
1559  }
1560 
1561  // Inherit all event dependencies from the input commands in the fusion list.
1562  std::vector<EventImplPtr> FusedEventDeps;
1563  for (auto *Cmd : CmdList) {
1564  FusedEventDeps.insert(FusedEventDeps.end(),
1565  Cmd->getPreparedDepsEvents().begin(),
1566  Cmd->getPreparedDepsEvents().end());
1567  FusedEventDeps.insert(FusedEventDeps.end(),
1568  Cmd->getPreparedHostDepsEvents().begin(),
1569  Cmd->getPreparedHostDepsEvents().end());
1570  }
1571 
1572  // Remove internal explicit dependencies, i.e., explicit dependencies from one
1573  // kernel in the fusion list to another kernel also in the fusion list.
1574  FusedEventDeps.erase(
1575  std::remove_if(FusedEventDeps.begin(), FusedEventDeps.end(),
1576  [&](EventImplPtr &E) {
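 // The event is produced by the fusion placeholder command itself.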
1577  if (E->getCommand() == PlaceholderCmd) {
1578  return true;
1579  }
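 // The event stems from a kernel in this fusion list: its RUN_CG
 // command references the placeholder through MFusionCmd.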
1580  if (E->getCommand() &&
1581  static_cast<Command *>(E->getCommand())->getType() ==
1582  Command::RUN_CG) {
1583  auto *RunCGCmd =
1584  static_cast<ExecCGCommand *>(E->getCommand());
1585  if (RunCGCmd->MFusionCmd == PlaceholderCmd) {
1586  return true;
1587  }
1588  }
1589  return false;
1590  }),
1591  FusedEventDeps.end());
1592 
1593  auto FusedKernelCmd =
1594  std::make_unique<ExecCGCommand>(std::move(FusedCG), Queue);
1595 
1596  assert(PlaceholderCmd->MDeps.empty());
1597  // Next, iterate backwards over the commands in the fusion list and remove
1598  // them from the graph to restore the state from before fusion started, so
1599  // that the fused kernel can be added to the graph in the next step.
1600  // Clean up the old commands after successfully fusing them.
1601  for (auto OldCmd = CmdList.rbegin(); OldCmd != CmdList.rend(); ++OldCmd) {
1602  removeNodeFromGraph(*OldCmd, ToEnqueue);
1603  cleanupCommand(*OldCmd, /* AllowUnsubmitted */ true);
1604  }
1605 
1606  createGraphForCommand(FusedKernelCmd.get(), FusedKernelCmd->getCG(), false,
1607  FusedKernelCmd->getCG().getRequirements(),
1608  FusedEventDeps, Queue, ToEnqueue);
1609 
1610  ToEnqueue.push_back(FusedKernelCmd.get());
1611 
1612  std::vector<Command *> ToCleanUp;
1613  // Make the placeholder command depend on the execution of the fused kernel
1614  auto *ConnectToPlaceholder =
1615  PlaceholderCmd->addDep(FusedKernelCmd->getEvent(), ToCleanUp);
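 // addDep may create an auxiliary connection command (cf. connectDepEvent,
 // used for events from a different context); if so, it must be enqueued too.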
1616  if (ConnectToPlaceholder) {
1617  ToEnqueue.push_back(ConnectToPlaceholder);
1618  }
1619  for (Command *Cmd : ToCleanUp) {
1620  cleanupCommand(Cmd);
1621  }
1622  ToEnqueue.push_back(PlaceholderCmd);
1623 
1624  if (MPrintOptionsArray[AfterFusionComplete]) {
1625  printGraphAsDot("after_fusionComplete");
1626  }
1627 
1628  // Set the status for the fusion command.
1629  PlaceholderCmd->setFusionStatus(KernelFusionCommand::FusionStatus::COMPLETE);
1630 
1631  return FusedKernelCmd.release()->getEvent();
1632 #else // SYCL_EXT_CODEPLAY_KERNEL_FUSION
1633  printFusionWarning("Kernel fusion not supported by this build");
1634  (void)PropList;
1635  auto FusionList = findFusionList(QUniqueID);
1636  auto *PlaceholderCmd = FusionList->second.get();
1637  auto LastEvent = PlaceholderCmd->getEvent();
1638  this->cancelFusion(Queue, ToEnqueue);
1639  return LastEvent;
1640 #endif // SYCL_EXT_CODEPLAY_KERNEL_FUSION
1641 }
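// For orientation: completeFusion is typically reached from user code through
// the sycl_ext_codeplay_kernel_fusion extension. A minimal sketch, assuming
// the API names from that extension proposal (illustrative, not part of this
// file):
//   namespace exp = sycl::ext::codeplay::experimental;
//   sycl::queue Q{sycl::property_list{exp::property::queue::enable_fusion{}}};
//   exp::fusion_wrapper fw{Q};
//   fw.start_fusion();                // GraphBuilder::startFusion
//   Q.submit(K1); Q.submit(K2);       // kernels are collected, not yet run
//   auto Ev = fw.complete_fusion();   // GraphBuilder::completeFusion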
1642 
1643 bool Scheduler::GraphBuilder::isInFusionMode(QueueIdT Id) {
1644  auto FusionList = findFusionList(Id);
1645  if (FusionList == MFusionMap.end()) {
1646  return false;
1647  }
1648  return FusionList->second->isActive();
1649 }
1650 
1651 } // namespace detail
1652 } // namespace _V1
1653 } // namespace sycl
sycl::_V1::property_list
Objects of the property_list class are containers for the SYCL properties.
Definition: property_list.hpp:31
sycl::_V1::detail::Command
The Command class represents some action that needs to be performed on one or more memory objects.
Definition: commands.hpp:99
sycl::_V1::detail::Command::readyForCleanup
virtual bool readyForCleanup() const
Returns true iff this command is ready to be submitted for cleanup.
Definition: commands.cpp:733
sycl::_V1::detail::LeavesCollection::remove
size_t remove(value_type Cmd)
Replacement for std::remove with subsequent call to erase(newEnd, end()).
Definition: leaves_collection.cpp:38
sycl::_V1::access::mode::discard_read_write
@ discard_read_write
event_impl.hpp
sycl::_V1::detail::Scheduler::Command
friend class Command
Definition: scheduler.hpp:918
sycl::_V1::detail::Scheduler::GraphBuilder::startFusion
void startFusion(QueueImplPtr Queue)
Definition: graph_builder.cpp:1363
sycl::_V1::access::mode
mode
Definition: access.hpp:34
sycl::_V1::detail::markNodeAsVisited
static bool markNodeAsVisited(Command *Cmd, std::vector< Command * > &Visited)
Definition: graph_builder.cpp:126
sycl::_V1::detail::AllocaCommandBase::MLinkedAllocaCmd
AllocaCommandBase * MLinkedAllocaCmd
Alloca command linked with current command.
Definition: commands.hpp:475
sycl::_V1::detail::createsCircularDependency
static bool createsCircularDependency(Command *Cmd, bool PredPartOfFusion, KernelFusionCommand *Fusion)
Definition: graph_builder.cpp:1461
sycl::_V1::detail::CGUpdateHost
"Update host" command group class.
Definition: cg.hpp:254
context_impl.hpp
sycl::_V1::detail::Command::isSuccessfullyEnqueued
bool isSuccessfullyEnqueued() const
Definition: commands.hpp:149
sycl::_V1::detail::ContextImplPtr
std::shared_ptr< sycl::detail::context_impl > ContextImplPtr
Definition: event_impl.hpp:33
sycl::_V1::make_error_code
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using e and sycl_category()
Definition: exception.cpp:94
sycl::_V1::detail::Command::MMarks
Marks MMarks
Used for marking the node during graph traversal.
Definition: commands.hpp:324
config.hpp
sycl::_V1::detail::Scheduler::GraphBuilder::getOrInsertMemObjRecord
MemObjRecord * getOrInsertMemObjRecord(const QueueImplPtr &Queue, const Requirement *Req, std::vector< Command * > &ToEnqueue)
Definition: graph_builder.cpp:195
sycl::_V1::detail::AccessorImplHost::MIsSubBuffer
bool MIsSubBuffer
Definition: accessor_impl.hpp:137
sycl::_V1::detail::QueueIdT
std::hash< std::shared_ptr< detail::queue_impl > >::result_type QueueIdT
Definition: scheduler.hpp:187
sycl::_V1::detail::AccessorImplHost
Definition: accessor_impl.hpp:42
sycl::_V1::detail::jit_compiler::fuseKernels
std::unique_ptr< detail::CG > fuseKernels(QueueImplPtr Queue, std::vector< ExecCGCommand * > &InputKernels, const property_list &)
sycl::_V1::detail::AccessorImplHost::MAccessMode
access::mode MAccessMode
Definition: accessor_impl.hpp:130
sycl::_V1::detail::combineAccessModesOfReqs
static void combineAccessModesOfReqs(std::vector< Requirement * > &Reqs)
Definition: graph_builder.cpp:913
sycl::_V1::detail::Scheduler::GraphBuilder::completeFusion
EventImplPtr completeFusion(QueueImplPtr Queue, std::vector< Command * > &ToEnqueue, const property_list &)
Definition: graph_builder.cpp:1510
sycl::_V1::detail::DeviceImplPtr
std::shared_ptr< device_impl > DeviceImplPtr
Definition: program_manager.hpp:65
sycl::_V1::detail::MemCpyCommandHost::getRequirement
const Requirement * getRequirement() const final
Definition: commands.hpp:601
sycl::_V1::detail::AccessorImplHost::MElemSize
unsigned int MElemSize
Definition: accessor_impl.hpp:135
sycl::_V1::detail::Command::ALLOCA
@ ALLOCA
Definition: commands.hpp:104
sycl::_V1::detail::MapMemObject
The map command enqueues mapping of device memory onto host memory.
Definition: commands.hpp:531
sycl::_V1::detail::Command::getQueue
const QueueImplPtr & getQueue() const
Definition: commands.hpp:169
sycl::_V1::detail::SYCLConfig
Definition: config.hpp:111
sycl::_V1::detail::Scheduler::GraphBuilder::getMemObjRecord
MemObjRecord * getMemObjRecord(SYCLMemObjI *MemObject)
Definition: graph_builder.cpp:191
sycl::_V1::detail::isInteropHostTask
static bool isInteropHostTask(ExecCGCommand *Cmd)
Definition: graph_builder.cpp:903
sycl::_V1::detail::KernelFusionCommand::FusionStatus::CANCELLED
@ CANCELLED
detail
Error handling, matching OpenCL plugin semantics.
Definition: common.hpp:44
sycl::_V1::access::mode::atomic
@ atomic
sycl::_V1::detail::Scheduler::GraphBuilder::updateLeaves
void updateLeaves(const std::set< Command * > &Cmds, MemObjRecord *Record, access::mode AccessMode, std::vector< Command * > &ToCleanUp)
Removes commands from leaves.
Definition: graph_builder.cpp:253
sycl
Definition: access.hpp:18
sycl::_V1::detail::EmptyCommand
The empty command does nothing during enqueue.
Definition: commands.hpp:405
sycl::_V1::detail::Command::Marks::MToBeDeleted
bool MToBeDeleted
Used for marking the node for deletion during cleanup.
Definition: commands.hpp:321
sycl::_V1::detail::EnqueueResultT::SyclEnqueueBlocked
@ SyclEnqueueBlocked
Definition: commands.hpp:58
sycl::_V1::detail::doOverlap
static bool doOverlap(const Requirement *LHS, const Requirement *RHS)
Checks whether two requirements overlap or not.
Definition: graph_builder.cpp:43
sycl::_V1::detail::Command::printDot
virtual void printDot(std::ostream &Stream) const =0
access.hpp
queue_impl.hpp
sycl::_V1::detail::KernelFusionCommand
The KernelFusionCommand is placed in the execution graph together with the individual kernels of the ...
Definition: commands.hpp:709
sycl::_V1::detail::Command::addUser
void addUser(Command *NewUser)
Definition: commands.hpp:132
sycl::_V1::detail::ExecCGCommand::getCG
detail::CG & getCG() const
Definition: commands.hpp:649
sycl::_V1::detail::Scheduler::GraphBuilder::cleanupCommandsForRecord
void cleanupCommandsForRecord(MemObjRecord *Record)
Removes commands that use the given MemObjRecord from the graph.
Definition: graph_builder.cpp:1141
sycl::_V1::detail::Command::MLeafCounter
unsigned MLeafCounter
Counts the number of memory objects this command is a leaf for.
Definition: commands.hpp:315
sycl::_V1::detail::Requirement
AccessorImplHost Requirement
Definition: accessor_impl.hpp:210
scheduler.hpp
sycl::_V1::detail::Scheduler::cancelFusion
void cancelFusion(QueueImplPtr Queue)
Definition: scheduler.cpp:587
sycl::_V1::detail::MemObjRecord::MMemModified
bool MMemModified
Definition: scheduler.hpp:222
sycl::_V1::detail::Command::FUSION
@ FUSION
Definition: commands.hpp:112
sycl::_V1::detail::Scheduler::GraphBuilder::cleanupCommand
void cleanupCommand(Command *Cmd, bool AllowUnsubmitted=false)
Definition: graph_builder.cpp:1230
sycl::_V1::detail::AllocaSubBufCommand
The AllocaSubBuf command enqueues creation of sub-buffer of memory object.
Definition: commands.hpp:512
_pi_ext_command_buffer
Definition: pi_cuda.hpp:80
sycl::_V1::detail::Command::MDeps
std::vector< DepDesc > MDeps
Contains list of dependencies(edges)
Definition: commands.hpp:309
sycl::_V1::range< 3 >
sycl::_V1::detail::MemObjRecord::MReadLeaves
LeavesCollection MReadLeaves
Definition: scheduler.hpp:208
sycl::_V1::detail::DepDesc::MAllocaCmd
AllocaCommandBase * MAllocaCmd
Allocation command for the memory object we have requirement for.
Definition: commands.hpp:89
sycl::_V1::detail::AllocaCommandBase::getRequirement
const Requirement * getRequirement() const final
Definition: commands.hpp:458
sycl::_V1::detail::UpdateHostRequirementCommand::getRequirement
const Requirement * getRequirement() const final
Definition: commands.hpp:696
sycl::_V1::detail::LeavesCollection
A wrapper for CircularBuffer class along with collection for host accessor's EmptyCommands.
Definition: leaves_collection.hpp:38
sycl::_V1::detail::checkForCircularDependency
static bool checkForCircularDependency(Command *, bool, KernelFusionCommand *)
Definition: graph_builder.cpp:1476
sycl::_V1::detail::Command::BlockReason
BlockReason
Definition: commands.hpp:326
sycl::_V1::detail::DepDesc::MDepRequirement
const Requirement * MDepRequirement
Requirement for the dependency.
Definition: commands.hpp:86
sycl::_V1::detail::Scheduler::GraphBuilder::removeRecordForMemObj
void removeRecordForMemObj(SYCLMemObjI *MemObject)
Removes the MemObjRecord for the memory object passed.
Definition: graph_builder.cpp:1285
sycl::_V1::detail::DepDesc::MDepCommand
Command * MDepCommand
The actual dependency command.
Definition: commands.hpp:84
sycl::_V1::detail::Scheduler::getMemObjRecord
static MemObjRecord * getMemObjRecord(const Requirement *const Req)
Definition: scheduler.cpp:426
sycl::_V1::detail::SYCLMemObjI::getInteropContext
virtual ContextImplPtr getInteropContext() const =0
sycl::_V1::detail::insertMapUnmapForLinkedCmds
static Command * insertMapUnmapForLinkedCmds(AllocaCommandBase *AllocaCmdSrc, AllocaCommandBase *AllocaCmdDst, access::mode MapMode)
Definition: graph_builder.cpp:312
sycl::_V1::detail::AccessorImplHost::MSYCLMemObj
detail::SYCLMemObjI * MSYCLMemObj
Definition: accessor_impl.hpp:132
sycl::_V1::detail::MemObjRecord
Memory Object Record.
Definition: scheduler.hpp:198
sycl::_V1::detail::Scheduler::GraphBuilder::cancelFusion
void cancelFusion(QueueImplPtr Queue, std::vector< Command * > &ToEnqueue)
Definition: graph_builder.cpp:1423
sycl::_V1::detail::Command::getType
CommandType getType() const
Definition: commands.hpp:135
sycl::_V1::detail::MemObjRecord::MWriteLeaves
LeavesCollection MWriteLeaves
Definition: scheduler.hpp:211
sycl::_V1::detail::Scheduler::GraphBuilder::findAllocaForReq
AllocaCommandBase * findAllocaForReq(MemObjRecord *Record, const Requirement *Req, const ContextImplPtr &Context, bool AllowConst=true)
Searches for suitable alloca in memory record.
Definition: graph_builder.cpp:653
sycl::_V1::detail::IsSuitableSubReq
static bool IsSuitableSubReq(const Requirement *Req)
Checks if current requirement is requirement for sub buffer.
Definition: graph_builder.cpp:57
sycl::_V1::detail::CGHostTask::MHostTask
std::unique_ptr< HostTask > MHostTask
Definition: cg.hpp:332
sycl::_V1::detail::DepDesc
Dependency between two commands.
Definition: commands.hpp:73
sycl::_V1::detail::MemObjRecord::MCurContext
ContextImplPtr MCurContext
Definition: scheduler.hpp:214
sycl::_V1::detail::Command::getEvent
const EventImplPtr & getEvent() const
Definition: commands.hpp:171
sycl::_V1::detail::isPartOfFusion
static bool isPartOfFusion(Command *Cmd, KernelFusionCommand *Fusion)
Definition: graph_builder.cpp:1452
sycl::_V1::range::size
size_t size() const
Definition: range.hpp:56
sycl::_V1::detail::HostTask
Definition: cg_types.hpp:232
sycl::_V1::detail::isAccessModeAllowed
static bool isAccessModeAllowed(access::mode Required, access::mode Current)
Checks if the required access mode is allowed under the current one.
Definition: graph_builder.cpp:62
sycl::_V1::detail::KernelFusionCommand::FusionStatus::COMPLETE
@ COMPLETE
sycl::_V1::detail::AccessorImplHost::MBlockedCmd
Command * MBlockedCmd
Definition: accessor_impl.hpp:141
sycl::_V1::exception
Definition: exception.hpp:68
std::cerr
__SYCL_EXTERN_STREAM_ATTRS ostream cerr
Linked to standard error (unbuffered)
sycl::_V1::detail::MemObjRecord::MHostAccess
access::mode MHostAccess
Definition: scheduler.hpp:218
sycl::_V1::detail::CG::StorageInitHelper
Definition: cg.hpp:79
sycl::_V1::device
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Definition: device.hpp:59
sycl::_V1::detail::Scheduler::cleanUpCmdFusion
void cleanUpCmdFusion(sycl::detail::queue_impl *Queue)
Definition: scheduler.cpp:580
sycl::_V1::detail::EventImplPtr
std::shared_ptr< event_impl > EventImplPtr
Definition: cg.hpp:39
sycl::_V1::detail::LeavesCollection::EnqueueListT
std::vector< Command * > EnqueueListT
Definition: leaves_collection.hpp:42
sycl::_V1::detail::AllocaCommandBase::MMemAllocation
void * MMemAllocation
Definition: commands.hpp:468
sycl::_V1::detail::Scheduler::DefaultHostQueue
QueueImplPtr DefaultHostQueue
Definition: scheduler.hpp:916
sycl::_V1::detail::Scheduler::GraphBuildResult
Definition: scheduler.hpp:540
sycl::_V1::detail::LeavesCollection::AllocateDependencyF
std::function< void(Command *, Command *, MemObjRecord *, EnqueueListT &)> AllocateDependencyF
Definition: leaves_collection.hpp:46
sycl::_V1::detail::Command::clearAllDependencies
void clearAllDependencies()
Clear all dependency events This should only be used if a command is about to be deleted without bein...
Definition: commands.hpp:302
sycl::_V1::detail::CG::getType
CGTYPE getType()
Definition: cg.hpp:124
sycl::_V1::detail::Scheduler::GraphBuilder::addNodeToLeaves
void addNodeToLeaves(MemObjRecord *Record, Command *Cmd, access::mode AccessMode, std::vector< Command * > &ToEnqueue)
Adds new command to leaves if needed.
Definition: graph_builder.cpp:272
sycl::_V1::detail::Command::getPreparedDepsEvents
const std::vector< EventImplPtr > & getPreparedDepsEvents() const
Definition: commands.hpp:289
sycl::_V1::detail::queue_impl
Definition: queue_impl.hpp:70
sycl::_V1::detail::Command::MUsers
std::unordered_set< Command * > MUsers
Contains list of commands that depend on the command.
Definition: commands.hpp:311
sycl::_V1::detail::Command::RELEASE
@ RELEASE
Definition: commands.hpp:106
sycl::_V1::detail::ExecCGCommand
The exec CG command enqueues execution of kernel or explicit memory operation.
Definition: commands.hpp:635
sycl::_V1::detail::printDotRecursive
static void printDotRecursive(std::fstream &Stream, std::vector< Command * > &Visited, Command *Cmd)
Definition: graph_builder.cpp:158
sycl::_V1::detail::combineAccessModes
static access::mode combineAccessModes(access::mode A, access::mode B)
Combines two access modes into a single one that allows both.
Definition: graph_builder.cpp:80
sycl::_V1::detail::CGHostTask
Definition: cg.hpp:330
sycl::_V1::detail::Command::CommandType
CommandType
Definition: commands.hpp:101
sycl::_V1::detail::Command::Marks::MVisited
bool MVisited
Used for marking the node as visited during graph traversal.
Definition: commands.hpp:319
sycl::_V1::detail::Command::BlockReason::HostAccessor
@ HostAccessor
sycl::_V1::detail::AllocaCommandBase::MIsActive
bool MIsActive
Indicates that current alloca is active one.
Definition: commands.hpp:477
sycl::_V1::detail::Scheduler::GraphBuilder::addCopyBack
Command * addCopyBack(Requirement *Req, std::vector< Command * > &ToEnqueue)
Enqueues a command to update memory to the latest state.
Definition: graph_builder.cpp:475
sycl::_V1::detail::Scheduler::GraphBuilder::cleanUpCmdFusion
void cleanUpCmdFusion(sycl::detail::queue_impl *Queue)
Clean up the internal fusion commands held for the given queue.
Definition: graph_builder.cpp:1369
sycl::_V1::detail::Scheduler::GraphBuilder::decrementLeafCountersForRecord
void decrementLeafCountersForRecord(MemObjRecord *Record)
Decrements leaf counters for all leaves of the record.
Definition: graph_builder.cpp:1127
exception.hpp
sycl::_V1::detail::Scheduler::getDefaultHostQueue
QueueImplPtr getDefaultHostQueue()
Definition: scheduler.hpp:444
sycl::_V1::detail::Command::getPreparedHostDepsEvents
const std::vector< EventImplPtr > & getPreparedHostDepsEvents() const
Definition: commands.hpp:285
sycl::_V1::detail::handleVisitedNodes
static void handleVisitedNodes(std::vector< Command * > &Visited)
Definition: graph_builder.cpp:140
sycl::_V1::detail::QueueImplPtr
std::shared_ptr< sycl::detail::queue_impl > QueueImplPtr
Definition: event_impl.hpp:35
sycl::_V1::detail::sameCtx
static bool sameCtx(const ContextImplPtr &LHS, const ContextImplPtr &RHS)
Definition: graph_builder.cpp:50
sycl::_V1::detail::Scheduler::GraphBuilder::connectDepEvent
Command * connectDepEvent(Command *const Cmd, const EventImplPtr &DepEvent, const DepDesc &Dep, std::vector< Command * > &ToCleanUp)
Perform connection of events in multiple contexts.
Definition: graph_builder.cpp:1307
sycl::_V1::detail::Scheduler::getInstance
static Scheduler & getInstance()
Definition: scheduler.cpp:260
sycl::_V1::detail::Command::addDep
Command * addDep(DepDesc NewDep, std::vector< Command * > &ToCleanUp)
Definition: commands.cpp:738
sycl::_V1::detail::SYCLMemObjI
Definition: sycl_mem_obj_i.hpp:28
sycl::_V1::detail::AccessorImplHost::MOffsetInBytes
size_t MOffsetInBytes
Definition: accessor_impl.hpp:136
any_of
bool any_of(const simd_mask< _Tp, _Abi > &) noexcept
sycl::_V1::detail::checkHostUnifiedMemory
static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx)
Definition: graph_builder.cpp:673
sycl::_V1::detail::AccessorImplHost::MData
void *& MData
Definition: accessor_impl.hpp:139
sycl::_V1::detail::AllocaCommandBase::MIsConst
bool MIsConst
Definition: commands.hpp:483
sycl::_V1::detail::Scheduler::GraphBuilder::GraphBuilder
GraphBuilder()
Definition: graph_builder.cpp:95
sycl::_V1::detail::AccessorImplHost::MAccessRange
range< 3 > & MAccessRange
Definition: accessor_impl.hpp:127
sycl::_V1::detail::Scheduler::GraphBuilder::addCGUpdateHost
Command * addCGUpdateHost(std::unique_ptr< detail::CG > CommandGroup, const QueueImplPtr &HostQueue, std::vector< Command * > &ToEnqueue)
Registers a command group that updates host memory to the latest state.
Definition: graph_builder.cpp:563
sycl::_V1::detail::Scheduler::GraphBuilder::addCG
GraphBuildResult addCG(std::unique_ptr< detail::CG > CommandGroup, const QueueImplPtr &Queue, std::vector< Command * > &ToEnqueue, sycl::detail::pi::PiExtCommandBuffer CommandBuffer=nullptr, const std::vector< sycl::detail::pi::PiExtSyncPoint > &Dependencies={})
Registers command group and adds it to the dependency graph.
Definition: graph_builder.cpp:933
sycl::_V1::detail::CG::CodeplayHostTask
@ CodeplayHostTask
Definition: cg.hpp:65
sycl::_V1::detail::LeavesCollection::toVector
std::vector< value_type > toVector() const
Definition: leaves_collection.cpp:64
sycl::_V1::detail::unmarkVisitedNodes
static void unmarkVisitedNodes(std::vector< Command * > &Visited)
Definition: graph_builder.cpp:135
sycl::_V1::detail::Scheduler::GraphBuilder::addHostAccessor
Command * addHostAccessor(Requirement *Req, std::vector< Command * > &ToEnqueue)
Enqueues a command to create a host accessor.
Definition: graph_builder.cpp:521
sycl::_V1::detail::CG
Base class for all types of command groups.
Definition: cg.hpp:49
sycl::_V1::detail::jit_compiler::get_instance
static jit_compiler & get_instance()
Definition: jit_compiler.hpp:39
sycl::_V1::detail::UpdateHostRequirementCommand
Definition: commands.hpp:690
sycl::_V1::detail::KernelFusionCommand::readyForDeletion
bool readyForDeletion() const
Definition: commands.hpp:737
sycl::_V1::detail::AllocaCommandBase
Base class for memory allocation commands.
Definition: commands.hpp:447
sycl::_V1::detail::FusionList
std::unique_ptr< KernelFusionCommand > FusionList
Definition: scheduler.hpp:189
sycl::_V1::AccessMode
class __SYCL_EBO __SYCL_SPECIAL_CLASS __SYCL_TYPE(local_accessor) local_accessor class __SYCL_EBO __SYCL_SPECIAL_CLASS AccessMode
Definition: accessor.hpp:3215
sycl::_V1::access::mode::read_write
@ read_write
sycl::_V1::detail::Command::RUN_CG
@ RUN_CG
Definition: commands.hpp:102
sycl::_V1::detail::UnMapMemObject
The unmap command removes mapping of host memory onto device memory.
Definition: commands.hpp:550
jit_compiler.hpp
sycl::_V1::access::mode::write
@ write
sycl::_V1::detail::MemObjRecord::MAllocaCommands
std::vector< AllocaCommandBase * > MAllocaCommands
Definition: scheduler.hpp:205
sycl::_V1::detail::MemCpyCommandHost
The mem copy host command enqueues memory copy between two instances of memory object.
Definition: commands.hpp:594
sycl::_V1::detail::SYCLMemObjI::MRecord
std::shared_ptr< MemObjRecord > MRecord
Definition: sycl_mem_obj_i.hpp:83
sycl::_V1::access::mode::read
@ read
sycl::_V1::detail::Scheduler::GraphBuilder::findDepForRecord
DepDesc findDepForRecord(Command *Cmd, MemObjRecord *Record)
Finds a command dependency corresponding to the record.
Definition: graph_builder.cpp:640
sycl_mem_obj_t.hpp
sycl::_V1::detail::KernelFusionCommand::FusionStatus::DELETED
@ DELETED
sycl::_V1::access::mode::discard_write
@ discard_write
sycl::_V1::detail::getSyclObjImpl
decltype(Obj::impl) getSyclObjImpl(const Obj &SyclObject)
Definition: impl_utils.hpp:30
sycl::_V1::detail::Scheduler::isInFusionMode
bool isInFusionMode(QueueIdT Queue)
Definition: scheduler.cpp:611
sycl::_V1::detail::Command::ALLOCA_SUB_BUF
@ ALLOCA_SUB_BUF
Definition: commands.hpp:105
sycl::_V1::detail::Command::getWorkerContext
virtual const ContextImplPtr & getWorkerContext() const
Get the context of the queue this command will be submitted to.
Definition: commands.cpp:720
memory_manager.hpp