DPC++ Runtime
Runtime libraries for oneAPI Data Parallel C++
graph_builder.cpp
//===-- graph_builder.cpp - SYCL Graph Builder ------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "detail/config.hpp"
#include <CL/sycl/access/access.hpp>
#include <CL/sycl/detail/memory_manager.hpp>
#include <CL/sycl/exception.hpp>
#include <detail/context_impl.hpp>
#include <detail/event_impl.hpp>
#include <detail/queue_impl.hpp>
#include <detail/scheduler/scheduler.hpp>

#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <map>
#include <memory>
#include <queue>
#include <set>
#include <unordered_map>
#include <vector>

__SYCL_INLINE_NAMESPACE(cl) {
namespace sycl {
namespace detail {

/// Checks whether two requirements overlap or not.
// TODO merge with LeavesCollection's version of doOverlap (see
// leaves_collection.cpp).
static bool doOverlap(const Requirement *LHS, const Requirement *RHS) {
  return (LHS->MOffsetInBytes + LHS->MAccessRange.size() * LHS->MElemSize >=
          RHS->MOffsetInBytes) ||
         (RHS->MOffsetInBytes + RHS->MAccessRange.size() * RHS->MElemSize >=
          LHS->MOffsetInBytes);
}

static bool sameCtx(const ContextImplPtr &LHS, const ContextImplPtr &RHS) {
  // Consider two different host contexts to be the same to avoid additional
  // allocation on the host
  return LHS == RHS || (LHS->is_host() && RHS->is_host());
}

/// Checks if current requirement is requirement for sub buffer.
static bool IsSuitableSubReq(const Requirement *Req) {
  return Req->MIsSubBuffer;
}

/// Checks if the required access mode is allowed under the current one.
static bool isAccessModeAllowed(access::mode Required, access::mode Current) {
  switch (Current) {
  case access::mode::read:
    return (Required == Current);
  case access::mode::write:
    assert(false && "Write only access is expected to be mapped as read_write");
    return (Required == Current || Required == access::mode::discard_write);
  case access::mode::read_write:
  case access::mode::atomic:
  case access::mode::discard_write:
  case access::mode::discard_read_write:
    return true;
  }
  assert(false);
  return false;
}

/// Combines two access modes into a single one that allows both.
static access::mode combineAccessModes(access::mode A, access::mode B) {
  if (A == B)
    return A;

  if (A == access::mode::discard_write &&
      (B == access::mode::discard_read_write || B == access::mode::write))
    return B;

  if (B == access::mode::discard_write &&
      (A == access::mode::discard_read_write || A == access::mode::write))
    return A;

  return access::mode::read_write;
}
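// Worked examples (an added note, derived from the rules above):
//   combineAccessModes(read, write)                       -> read_write
//   combineAccessModes(discard_write, write)              -> write
//   combineAccessModes(discard_write, discard_read_write) -> discard_read_write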

Scheduler::GraphBuilder::GraphBuilder() {
  if (const char *EnvVarCStr = SYCLConfig<SYCL_PRINT_EXECUTION_GRAPH>::get()) {
    std::string GraphPrintOpts(EnvVarCStr);
    bool EnableAlways = GraphPrintOpts.find("always") != std::string::npos;

    if (GraphPrintOpts.find("before_addCG") != std::string::npos ||
        EnableAlways)
      MPrintOptionsArray[BeforeAddCG] = true;
    if (GraphPrintOpts.find("after_addCG") != std::string::npos || EnableAlways)
      MPrintOptionsArray[AfterAddCG] = true;
    if (GraphPrintOpts.find("before_addCopyBack") != std::string::npos ||
        EnableAlways)
      MPrintOptionsArray[BeforeAddCopyBack] = true;
    if (GraphPrintOpts.find("after_addCopyBack") != std::string::npos ||
        EnableAlways)
      MPrintOptionsArray[AfterAddCopyBack] = true;
    if (GraphPrintOpts.find("before_addHostAcc") != std::string::npos ||
        EnableAlways)
      MPrintOptionsArray[BeforeAddHostAcc] = true;
    if (GraphPrintOpts.find("after_addHostAcc") != std::string::npos ||
        EnableAlways)
      MPrintOptionsArray[AfterAddHostAcc] = true;
  }
}
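// Usage sketch (an added note): since the checks above are plain substring
// searches, setting e.g. SYCL_PRINT_EXECUTION_GRAPH=before_addCG,after_addCG
// enables the dumps around addCG only, while SYCL_PRINT_EXECUTION_GRAPH=always
// enables all of the dump points.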

static bool markNodeAsVisited(Command *Cmd, std::vector<Command *> &Visited) {
  assert(Cmd && "Cmd can't be nullptr");
  if (Cmd->MMarks.MVisited)
    return false;
  Cmd->MMarks.MVisited = true;
  Visited.push_back(Cmd);
  return true;
}

static void unmarkVisitedNodes(std::vector<Command *> &Visited) {
  for (Command *Cmd : Visited)
    Cmd->MMarks.MVisited = false;
}

static void handleVisitedNodes(std::vector<Command *> &Visited) {
  for (Command *Cmd : Visited) {
    if (Cmd->MMarks.MToBeDeleted) {
      Cmd->getEvent()->setCommand(nullptr);
      delete Cmd;
    } else
      Cmd->MMarks.MVisited = false;
  }
}

static void printDotRecursive(std::fstream &Stream,
                              std::vector<Command *> &Visited, Command *Cmd) {
  if (!markNodeAsVisited(Cmd, Visited))
    return;
  for (Command *User : Cmd->MUsers) {
    if (User)
      printDotRecursive(Stream, Visited, User);
  }
  Cmd->printDot(Stream);
}

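// Dumps the commands reachable from every registered memory object's alloca
// commands into a "graph_<counter><mode>.dot" file in the current directory.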
void Scheduler::GraphBuilder::printGraphAsDot(const char *ModeName) {
  static size_t Counter = 0;
  std::string ModeNameStr(ModeName);
  std::string FileName =
      "graph_" + std::to_string(Counter) + ModeNameStr + ".dot";

  Counter++;

  std::fstream Stream(FileName, std::ios::out);
  Stream << "strict digraph {" << std::endl;

  MVisitedCmds.clear();

  for (SYCLMemObjI *MemObject : MMemObjs)
    for (Command *AllocaCmd : MemObject->MRecord->MAllocaCommands)
      printDotRecursive(Stream, MVisitedCmds, AllocaCmd);

  Stream << "}" << std::endl;

  unmarkVisitedNodes(MVisitedCmds);
}

MemObjRecord *Scheduler::GraphBuilder::getMemObjRecord(SYCLMemObjI *MemObject) {
  return MemObject->MRecord.get();
}

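// Returns the record for the memory object of Req, creating it on first use.
// For interop memory objects the pre-existing allocation is registered in the
// graph via a dummy queue bound to the interop context.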
MemObjRecord *Scheduler::GraphBuilder::getOrInsertMemObjRecord(
    const QueueImplPtr &Queue, const Requirement *Req,
    std::vector<Command *> &ToEnqueue) {
  SYCLMemObjI *MemObject = Req->MSYCLMemObj;
  MemObjRecord *Record = getMemObjRecord(MemObject);

  if (nullptr != Record)
    return Record;

  const size_t LeafLimit = 8;
  LeavesCollection::AllocateDependencyF AllocateDependency =
      [this](Command *Dependant, Command *Dependency, MemObjRecord *Record,
             LeavesCollection::EnqueueListT &ToEnqueue) {
        // Add the old leaf as a dependency for the new one by duplicating one
        // of the requirements for the current record
        DepDesc Dep = findDepForRecord(Dependant, Record);
        Dep.MDepCommand = Dependency;
        if (Command *ConnectionCmd = Dependant->addDep(Dep))
          ToEnqueue.push_back(ConnectionCmd);
        Dependency->addUser(Dependant);
        --(Dependency->MLeafCounter);
      };

  const ContextImplPtr &InteropCtxPtr = Req->MSYCLMemObj->getInteropContext();
  if (InteropCtxPtr) {
    // The memory object has been constructed using an interoperability
    // constructor, which means that there is already an allocation (cl_mem)
    // in some context. Register this allocation in the SYCL graph.

    std::vector<sycl::device> Devices =
        InteropCtxPtr->get_info<info::context::devices>();
    assert(Devices.size() != 0);
    DeviceImplPtr Dev = detail::getSyclObjImpl(Devices[0]);

    // Since all the Scheduler commands require a queue but we only have a
    // context here, we need to create a dummy queue bound to the context and
    // one of the devices from the context.
    QueueImplPtr InteropQueuePtr{new detail::queue_impl{
        Dev, InteropCtxPtr, /*AsyncHandler=*/{}, /*PropertyList=*/{}}};

    MemObject->MRecord.reset(
        new MemObjRecord{InteropCtxPtr, LeafLimit, AllocateDependency});
    getOrCreateAllocaForReq(MemObject->MRecord.get(), Req, InteropQueuePtr,
                            ToEnqueue);
  } else
    MemObject->MRecord.reset(new MemObjRecord{Queue->getContextImplPtr(),
                                              LeafLimit, AllocateDependency});

  MMemObjs.push_back(MemObject);
  return MemObject->MRecord.get();
}

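// Removes Cmds from the read and write leaves of Record and decrements their
// leaf counters accordingly. Read-only requirements leave the leaves intact.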
void Scheduler::GraphBuilder::updateLeaves(const std::set<Command *> &Cmds,
                                           MemObjRecord *Record,
                                           access::mode AccessMode) {

  const bool ReadOnlyReq = AccessMode == access::mode::read;
  if (ReadOnlyReq)
    return;

  for (Command *Cmd : Cmds) {
    Cmd->MLeafCounter -= Record->MReadLeaves.remove(Cmd);
    Cmd->MLeafCounter -= Record->MWriteLeaves.remove(Cmd);
  }
}

void Scheduler::GraphBuilder::addNodeToLeaves(
    MemObjRecord *Record, Command *Cmd, access::mode AccessMode,
    std::vector<Command *> &ToEnqueue) {
  LeavesCollection &Leaves{AccessMode == access::mode::read
                               ? Record->MReadLeaves
                               : Record->MWriteLeaves};
  if (Leaves.push_back(Cmd, ToEnqueue))
    ++Cmd->MLeafCounter;
}

UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd(
    MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue,
    std::vector<Command *> &ToEnqueue) {
  AllocaCommandBase *AllocaCmd =
      findAllocaForReq(Record, Req, Queue->getContextImplPtr());
  assert(AllocaCmd && "There must be alloca for requirement!");
  UpdateHostRequirementCommand *UpdateCommand =
      new UpdateHostRequirementCommand(Queue, *Req, AllocaCmd, &Req->MData);
  // A copy of the requirement is needed: if the requirement were stored by
  // pointer, the dependencies would become invalid once the host accessor's
  // destructor runs.
  const Requirement *StoredReq = UpdateCommand->getRequirement();

  std::set<Command *> Deps =
      findDepsForReq(Record, Req, Queue->getContextImplPtr());
  for (Command *Dep : Deps) {
    Command *ConnCmd =
        UpdateCommand->addDep(DepDesc{Dep, StoredReq, AllocaCmd});
    if (ConnCmd)
      ToEnqueue.push_back(ConnCmd);
    Dep->addUser(UpdateCommand);
  }
  updateLeaves(Deps, Record, Req->MAccessMode);
  addNodeToLeaves(Record, UpdateCommand, Req->MAccessMode, ToEnqueue);
  return UpdateCommand;
}

// Takes linked alloca commands. Makes AllocaCmdDst command active using a map
// or unmap operation.
static Command *insertMapUnmapForLinkedCmds(AllocaCommandBase *AllocaCmdSrc,
                                            AllocaCommandBase *AllocaCmdDst,
                                            access::mode MapMode) {
  assert(AllocaCmdSrc->MLinkedAllocaCmd == AllocaCmdDst &&
         "Expected linked alloca commands");
  assert(AllocaCmdSrc->MIsActive &&
         "Expected source alloca command to be active");

  if (AllocaCmdSrc->getQueue()->is_host()) {
    UnMapMemObject *UnMapCmd = new UnMapMemObject(
        AllocaCmdDst, *AllocaCmdDst->getRequirement(),
        &AllocaCmdSrc->MMemAllocation, AllocaCmdDst->getQueue());

    std::swap(AllocaCmdSrc->MIsActive, AllocaCmdDst->MIsActive);

    return UnMapCmd;
  }

  MapMemObject *MapCmd = new MapMemObject(
      AllocaCmdSrc, *AllocaCmdSrc->getRequirement(),
      &AllocaCmdDst->MMemAllocation, AllocaCmdSrc->getQueue(), MapMode);

  std::swap(AllocaCmdSrc->MIsActive, AllocaCmdDst->MIsActive);

  return MapCmd;
}

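// Makes the most recent copy of Req's memory available in Queue's context:
// either flips the active side of a linked (map/unmap) allocation pair or
// inserts a full MemCpyCommand between contexts.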
Command *Scheduler::GraphBuilder::insertMemoryMove(
    MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue,
    std::vector<Command *> &ToEnqueue) {

  AllocaCommandBase *AllocaCmdDst =
      getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue);
  if (!AllocaCmdDst)
    throw runtime_error("Out of host memory", PI_OUT_OF_HOST_MEMORY);

  std::set<Command *> Deps =
      findDepsForReq(Record, Req, Queue->getContextImplPtr());
  Deps.insert(AllocaCmdDst);
  // Get the parent allocation of the sub-buffer to perform a full copy of the
  // whole buffer
  if (IsSuitableSubReq(Req)) {
    if (AllocaCmdDst->getType() == Command::CommandType::ALLOCA_SUB_BUF)
      AllocaCmdDst =
          static_cast<AllocaSubBufCommand *>(AllocaCmdDst)->getParentAlloca();
  }

  AllocaCommandBase *AllocaCmdSrc =
      findAllocaForReq(Record, Req, Record->MCurContext);
  if (!AllocaCmdSrc && IsSuitableSubReq(Req)) {
    // Since no alloca command for the sub-buffer requirement was found in the
    // current context, find the parent alloca command for it (it must be
    // there)
    auto IsSuitableAlloca = [Record](AllocaCommandBase *AllocaCmd) {
      bool Res = sameCtx(AllocaCmd->getQueue()->getContextImplPtr(),
                         Record->MCurContext) &&
                 // Looking for a parent buffer alloca command
                 AllocaCmd->getType() == Command::CommandType::ALLOCA;
      return Res;
    };
    const auto It =
        std::find_if(Record->MAllocaCommands.begin(),
                     Record->MAllocaCommands.end(), IsSuitableAlloca);
    AllocaCmdSrc = (Record->MAllocaCommands.end() != It) ? *It : nullptr;
  }
  if (!AllocaCmdSrc)
    throw runtime_error("Cannot find buffer allocation", PI_INVALID_VALUE);
  // Get the parent allocation of the sub-buffer to perform a full copy of the
  // whole buffer
  if (IsSuitableSubReq(Req)) {
    if (AllocaCmdSrc->getType() == Command::CommandType::ALLOCA_SUB_BUF)
      AllocaCmdSrc =
          static_cast<AllocaSubBufCommand *>(AllocaCmdSrc)->getParentAlloca();
    else if (AllocaCmdSrc->getSYCLMemObj() != Req->MSYCLMemObj)
      assert(false && "Inappropriate alloca command.");
  }

  Command *NewCmd = nullptr;

  if (AllocaCmdSrc->MLinkedAllocaCmd == AllocaCmdDst) {
    // Map write-only as read-write
    access::mode MapMode = Req->MAccessMode;
    if (MapMode == access::mode::write)
      MapMode = access::mode::read_write;
    NewCmd = insertMapUnmapForLinkedCmds(AllocaCmdSrc, AllocaCmdDst, MapMode);
    Record->MHostAccess = MapMode;
  } else {

    if ((Req->MAccessMode == access::mode::discard_write) ||
        (Req->MAccessMode == access::mode::discard_read_write)) {
      Record->MCurContext = Queue->getContextImplPtr();
      return nullptr;
    } else {
      // A full copy of the buffer is needed to avoid the loss of data that
      // could be caused by copying a specific range from host to device and
      // back.
      NewCmd =
          new MemCpyCommand(*AllocaCmdSrc->getRequirement(), AllocaCmdSrc,
                            *AllocaCmdDst->getRequirement(), AllocaCmdDst,
                            AllocaCmdSrc->getQueue(), AllocaCmdDst->getQueue());
    }
  }

  for (Command *Dep : Deps) {
    Command *ConnCmd =
        NewCmd->addDep(DepDesc{Dep, NewCmd->getRequirement(), AllocaCmdDst});
    if (ConnCmd)
      ToEnqueue.push_back(ConnCmd);
    Dep->addUser(NewCmd);
  }
  updateLeaves(Deps, Record, access::mode::read_write);
  addNodeToLeaves(Record, NewCmd, access::mode::read_write, ToEnqueue);
  Record->MCurContext = Queue->getContextImplPtr();
  return NewCmd;
}

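// Re-establishes host access to a linked allocation under a different access
// mode by inserting an unmap command followed by a map command with the new
// mode.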
Command *Scheduler::GraphBuilder::remapMemoryObject(
    MemObjRecord *Record, Requirement *Req, AllocaCommandBase *HostAllocaCmd,
    std::vector<Command *> &ToEnqueue) {
  assert(HostAllocaCmd->getQueue()->is_host() &&
         "Host alloca command expected");
  assert(HostAllocaCmd->MIsActive && "Active alloca command expected");

  AllocaCommandBase *LinkedAllocaCmd = HostAllocaCmd->MLinkedAllocaCmd;
  assert(LinkedAllocaCmd && "Linked alloca command expected");

  std::set<Command *> Deps = findDepsForReq(Record, Req, Record->MCurContext);

  UnMapMemObject *UnMapCmd = new UnMapMemObject(
      LinkedAllocaCmd, *LinkedAllocaCmd->getRequirement(),
      &HostAllocaCmd->MMemAllocation, LinkedAllocaCmd->getQueue());

  // Map write-only as read-write
  access::mode MapMode = Req->MAccessMode;
  if (MapMode == access::mode::write)
    MapMode = access::mode::read_write;
  MapMemObject *MapCmd = new MapMemObject(
      LinkedAllocaCmd, *LinkedAllocaCmd->getRequirement(),
      &HostAllocaCmd->MMemAllocation, LinkedAllocaCmd->getQueue(), MapMode);

  for (Command *Dep : Deps) {
    Command *ConnCmd = UnMapCmd->addDep(
        DepDesc{Dep, UnMapCmd->getRequirement(), LinkedAllocaCmd});
    if (ConnCmd)
      ToEnqueue.push_back(ConnCmd);
    Dep->addUser(UnMapCmd);
  }

  Command *ConnCmd = MapCmd->addDep(
      DepDesc{UnMapCmd, MapCmd->getRequirement(), HostAllocaCmd});
  if (ConnCmd)
    ToEnqueue.push_back(ConnCmd);
  UnMapCmd->addUser(MapCmd);

  updateLeaves(Deps, Record, access::mode::read_write);
  addNodeToLeaves(Record, MapCmd, access::mode::read_write, ToEnqueue);
  Record->MHostAccess = MapMode;
  return MapCmd;
}

// The function adds a copy operation of the most recent memory to the memory
// pointed to by Req.
Command *
Scheduler::GraphBuilder::addCopyBack(Requirement *Req,
                                     std::vector<Command *> &ToEnqueue) {
  QueueImplPtr HostQueue = Scheduler::getInstance().getDefaultHostQueue();
  SYCLMemObjI *MemObj = Req->MSYCLMemObj;
  MemObjRecord *Record = getMemObjRecord(MemObj);
  if (Record && MPrintOptionsArray[BeforeAddCopyBack])
    printGraphAsDot("before_addCopyBack");

  // Do nothing if there were no operations, or only read operations, with the
  // memory object.
  if (nullptr == Record || !Record->MMemModified)
    return nullptr;

  std::set<Command *> Deps =
      findDepsForReq(Record, Req, HostQueue->getContextImplPtr());
  AllocaCommandBase *SrcAllocaCmd =
      findAllocaForReq(Record, Req, Record->MCurContext);

  auto MemCpyCmdUniquePtr = std::make_unique<MemCpyCommandHost>(
      *SrcAllocaCmd->getRequirement(), SrcAllocaCmd, *Req, &Req->MData,
      SrcAllocaCmd->getQueue(), std::move(HostQueue));

  if (!MemCpyCmdUniquePtr)
    throw runtime_error("Out of host memory", PI_OUT_OF_HOST_MEMORY);

  MemCpyCommandHost *MemCpyCmd = MemCpyCmdUniquePtr.release();
  for (Command *Dep : Deps) {
    Command *ConnCmd = MemCpyCmd->addDep(
        DepDesc{Dep, MemCpyCmd->getRequirement(), SrcAllocaCmd});
    if (ConnCmd)
      ToEnqueue.push_back(ConnCmd);
    Dep->addUser(MemCpyCmd);
  }

  updateLeaves(Deps, Record, Req->MAccessMode);
  addNodeToLeaves(Record, MemCpyCmd, Req->MAccessMode, ToEnqueue);
  if (MPrintOptionsArray[AfterAddCopyBack])
    printGraphAsDot("after_addCopyBack");
  return MemCpyCmd;
}

// The function implements the SYCL host accessor logic: the host accessor
// should provide access to the buffer in user space.
Command *
Scheduler::GraphBuilder::addHostAccessor(Requirement *Req,
                                         std::vector<Command *> &ToEnqueue) {

  const QueueImplPtr &HostQueue = getInstance().getDefaultHostQueue();

  MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req, ToEnqueue);
  if (MPrintOptionsArray[BeforeAddHostAcc])
    printGraphAsDot("before_addHostAccessor");
  markModifiedIfWrite(Record, Req);

  AllocaCommandBase *HostAllocaCmd =
      getOrCreateAllocaForReq(Record, Req, HostQueue, ToEnqueue);

  if (sameCtx(HostAllocaCmd->getQueue()->getContextImplPtr(),
              Record->MCurContext)) {
    if (!isAccessModeAllowed(Req->MAccessMode, Record->MHostAccess))
      remapMemoryObject(Record, Req, HostAllocaCmd, ToEnqueue);
  } else
    insertMemoryMove(Record, Req, HostQueue, ToEnqueue);

  Command *UpdateHostAccCmd =
      insertUpdateHostReqCmd(Record, Req, HostQueue, ToEnqueue);

  // An empty command is needed; it stays blocked until the host accessor is
  // destroyed.
  EmptyCommand *EmptyCmd =
      addEmptyCmd<Requirement>(UpdateHostAccCmd, {Req}, HostQueue,
                               Command::BlockReason::HostAccessor, ToEnqueue);

  Req->MBlockedCmd = EmptyCmd;

  if (MPrintOptionsArray[AfterAddHostAcc])
    printGraphAsDot("after_addHostAccessor");

  return UpdateHostAccCmd;
}

Command *Scheduler::GraphBuilder::addCGUpdateHost(
    std::unique_ptr<detail::CG> CommandGroup, QueueImplPtr HostQueue,
    std::vector<Command *> &ToEnqueue) {

  auto UpdateHost = static_cast<CGUpdateHost *>(CommandGroup.get());
  Requirement *Req = UpdateHost->getReqToUpdate();

  MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req, ToEnqueue);
  return insertMemoryMove(Record, Req, HostQueue, ToEnqueue);
}

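// Finds the dependencies for the requirement by walking the graph from the
// leaves of the record toward the roots. An examined command becomes a
// dependency unless it can be bypassed, i.e. it lives in the same context and
// either both commands only read or the requirements do not overlap.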
std::set<Command *>
Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record,
                                        const Requirement *Req,
                                        const ContextImplPtr &Context) {
  std::set<Command *> RetDeps;
  std::vector<Command *> Visited;
  const bool ReadOnlyReq = Req->MAccessMode == access::mode::read;

  std::vector<Command *> ToAnalyze{Record->MWriteLeaves.toVector()};

  if (!ReadOnlyReq) {
    std::vector<Command *> V{Record->MReadLeaves.toVector()};

    ToAnalyze.insert(ToAnalyze.begin(), V.begin(), V.end());
  }

  while (!ToAnalyze.empty()) {
    Command *DepCmd = ToAnalyze.back();
    ToAnalyze.pop_back();

    std::vector<Command *> NewAnalyze;

    for (const DepDesc &Dep : DepCmd->MDeps) {
      if (Dep.MDepRequirement->MSYCLMemObj != Req->MSYCLMemObj)
        continue;

      bool CanBypassDep = false;
      // If both commands only read
      CanBypassDep |=
          Dep.MDepRequirement->MAccessMode == access::mode::read && ReadOnlyReq;

      // If the requirements do not overlap
      CanBypassDep |= !doOverlap(Dep.MDepRequirement, Req);

      // Dependencies in other contexts cannot be bypassed, since copying
      // memory between contexts is not supported here.
      if (Dep.MDepCommand)
        CanBypassDep &=
            sameCtx(Context, Dep.MDepCommand->getQueue()->getContextImplPtr());

      if (!CanBypassDep) {
        RetDeps.insert(DepCmd);
        // No need to analyze the dependencies of the examined command, as it
        // is a dependency itself.
        NewAnalyze.clear();
        break;
      }

      if (markNodeAsVisited(Dep.MDepCommand, Visited))
        NewAnalyze.push_back(Dep.MDepCommand);
    }
    ToAnalyze.insert(ToAnalyze.end(), NewAnalyze.begin(), NewAnalyze.end());
  }
  unmarkVisitedNodes(Visited);
  return RetDeps;
}

// A helper function for finding a command dependency on a specific memory
// object
DepDesc Scheduler::GraphBuilder::findDepForRecord(Command *Cmd,
                                                  MemObjRecord *Record) {
  for (const DepDesc &DD : Cmd->MDeps) {
    if (getMemObjRecord(DD.MDepRequirement->MSYCLMemObj) == Record) {
      return DD;
    }
  }
  assert(false && "No dependency found for a leaf of the record");
  return {nullptr, nullptr, nullptr};
}

// The function searches for the alloca command matching the context and
// requirement.
AllocaCommandBase *
Scheduler::GraphBuilder::findAllocaForReq(MemObjRecord *Record,
                                          const Requirement *Req,
                                          const ContextImplPtr &Context) {
  auto IsSuitableAlloca = [&Context, Req](AllocaCommandBase *AllocaCmd) {
    bool Res = sameCtx(AllocaCmd->getQueue()->getContextImplPtr(), Context);
    if (IsSuitableSubReq(Req)) {
      const Requirement *TmpReq = AllocaCmd->getRequirement();
      Res &= AllocaCmd->getType() == Command::CommandType::ALLOCA_SUB_BUF;
      Res &= TmpReq->MOffsetInBytes == Req->MOffsetInBytes;
      Res &= TmpReq->MSYCLMemObj->getSize() == Req->MSYCLMemObj->getSize();
    }
    return Res;
  };
  const auto It = std::find_if(Record->MAllocaCommands.begin(),
                               Record->MAllocaCommands.end(), IsSuitableAlloca);
  return (Record->MAllocaCommands.end() != It) ? *It : nullptr;
}

static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx) {
  if (const char *HUMConfig = SYCLConfig<SYCL_HOST_UNIFIED_MEMORY>::get()) {
    if (std::strcmp(HUMConfig, "0") == 0)
      return Ctx->is_host();
    if (std::strcmp(HUMConfig, "1") == 0)
      return true;
  }
  for (const device &Device : Ctx->getDevices()) {
    if (!Device.get_info<info::device::host_unified_memory>())
      return false;
  }
  return true;
}

// The function searches for the alloca command matching the context and
// requirement. If none exists, a new allocation command is created.
// Note that the creation of a new allocation command can change the current
// context (Record->MCurContext).
AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
    MemObjRecord *Record, const Requirement *Req, QueueImplPtr Queue,
    std::vector<Command *> &ToEnqueue) {

  AllocaCommandBase *AllocaCmd =
      findAllocaForReq(Record, Req, Queue->getContextImplPtr());

  if (!AllocaCmd) {
    if (IsSuitableSubReq(Req)) {
      // Get the parent requirement. It's hard to get the parent's range
      // right, so the full parent requirement has its range represented in
      // bytes.
      range<3> ParentRange{Req->MSYCLMemObj->getSize(), 1, 1};
      Requirement ParentRequirement(/*Offset*/ {0, 0, 0}, ParentRange,
                                    ParentRange, access::mode::read_write,
                                    Req->MSYCLMemObj, /*Dims*/ 1,
                                    /*Working with bytes*/ sizeof(char));

      auto *ParentAlloca =
          getOrCreateAllocaForReq(Record, &ParentRequirement, Queue, ToEnqueue);
      AllocaCmd = new AllocaSubBufCommand(Queue, *Req, ParentAlloca, ToEnqueue);
    } else {

      const Requirement FullReq(/*Offset*/ {0, 0, 0}, Req->MMemoryRange,
                                Req->MMemoryRange, access::mode::read_write,
                                Req->MSYCLMemObj, Req->MDims, Req->MElemSize,
                                0 /*MOffsetInBytes*/, false /*MIsSubBuffer*/);
      // User data can be reused for the first allocation. Do so if host
      // unified memory is supported, regardless of the access mode (the
      // pointer will be reused). For devices without host unified memory the
      // initialization will be performed as a write operation.
      // TODO the case where the first alloca is made with a discard mode and
      // the user pointer is read-only is still not handled: it leads to an
      // unnecessary copy on devices with unified host memory support.
      const bool HostUnifiedMemory =
          checkHostUnifiedMemory(Queue->getContextImplPtr());
      // TODO a cast is required here to get the necessary information
      // without breaking the ABI; replace it with the next major version.
      auto *MemObj = static_cast<SYCLMemObjT *>(Req->MSYCLMemObj);
      const bool InitFromUserData = Record->MAllocaCommands.empty() &&
                                    (HostUnifiedMemory || MemObj->isInterop());
      AllocaCommandBase *LinkedAllocaCmd = nullptr;

      // For the first allocation on a device without host unified memory we
      // might need to also create a host alloca right away in order to
      // perform the initial memory write.
      if (Record->MAllocaCommands.empty()) {
        if (!HostUnifiedMemory &&
            Req->MAccessMode != access::mode::discard_write &&
            Req->MAccessMode != access::mode::discard_read_write) {
          // There's no need to make a host allocation if the buffer is not
          // initialized with user data.
          if (MemObj->hasUserDataPtr()) {
            QueueImplPtr DefaultHostQueue =
                Scheduler::getInstance().getDefaultHostQueue();
            AllocaCommand *HostAllocaCmd = new AllocaCommand(
                DefaultHostQueue, FullReq, true /* InitFromUserData */,
                nullptr /* LinkedAllocaCmd */);
            Record->MAllocaCommands.push_back(HostAllocaCmd);
            Record->MWriteLeaves.push_back(HostAllocaCmd, ToEnqueue);
            ++(HostAllocaCmd->MLeafCounter);
            Record->MCurContext = DefaultHostQueue->getContextImplPtr();
          }
        }
      } else {
        // If it is not the first allocation, try to set up a link.
        // FIXME: Temporary limitation: linked alloca commands are not
        // supported for images because the map operation is not implemented
        // for an image.
        if (Req->MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer)
          // The current limitation is to set up a link between the current
          // allocation and the new one. There could be situations when we
          // could set up a link with a "non-current" allocation, but that
          // would require a memory copy.
          // A link can only be set up between cl and host allocations.
          if (Queue->is_host() != Record->MCurContext->is_host()) {
            // Linked commands assume that the host allocation is reused by
            // the plugin runtime, which can lead to unnecessary copy overhead
            // on devices that do not support host unified memory. Do not link
            // the allocations in this case.
            // However, if the user explicitly requests use of pinned host
            // memory, map/unmap operations are expected to work faster than
            // read/write from/to an arbitrary host pointer. Link such
            // commands regardless of host unified memory support.
            bool PinnedHostMemory = MemObj->has_property<
                sycl::ext::oneapi::property::buffer::use_pinned_host_memory>();

            bool HostUnifiedMemoryOnNonHostDevice =
                Queue->is_host() ? checkHostUnifiedMemory(Record->MCurContext)
                                 : HostUnifiedMemory;
            if (PinnedHostMemory || HostUnifiedMemoryOnNonHostDevice) {
              AllocaCommandBase *LinkedAllocaCmdCand =
                  findAllocaForReq(Record, Req, Record->MCurContext);

              // Cannot set up a link if the candidate is already linked
              if (LinkedAllocaCmdCand &&
                  !LinkedAllocaCmdCand->MLinkedAllocaCmd) {
                LinkedAllocaCmd = LinkedAllocaCmdCand;
              }
            }
          }
      }

      AllocaCmd =
          new AllocaCommand(Queue, FullReq, InitFromUserData, LinkedAllocaCmd);

      // Update the linked command
      if (LinkedAllocaCmd) {
        Command *ConnCmd = AllocaCmd->addDep(DepDesc{
            LinkedAllocaCmd, AllocaCmd->getRequirement(), LinkedAllocaCmd});
        if (ConnCmd)
          ToEnqueue.push_back(ConnCmd);
        LinkedAllocaCmd->addUser(AllocaCmd);
        LinkedAllocaCmd->MLinkedAllocaCmd = AllocaCmd;

        // To ensure that the leader allocation is removed first
        ConnCmd = AllocaCmd->getReleaseCmd()->addDep(
            DepDesc(LinkedAllocaCmd->getReleaseCmd(),
                    AllocaCmd->getRequirement(), LinkedAllocaCmd));
        if (ConnCmd)
          ToEnqueue.push_back(ConnCmd);

        // The device allocation takes ownership of the host pointer during
        // construction; the host allocation doesn't. So the device allocation
        // should always be the active one here. Also, if the "follower"
        // command is a device one, we have to change the current context to
        // the device one.
        if (Queue->is_host()) {
          AllocaCmd->MIsActive = false;
        } else {
          LinkedAllocaCmd->MIsActive = false;
          Record->MCurContext = Queue->getContextImplPtr();

          std::set<Command *> Deps =
              findDepsForReq(Record, Req, Queue->getContextImplPtr());
          for (Command *Dep : Deps) {
            Command *ConnCmd =
                AllocaCmd->addDep(DepDesc{Dep, Req, LinkedAllocaCmd});
            if (ConnCmd)
              ToEnqueue.push_back(ConnCmd);
            Dep->addUser(AllocaCmd);
          }
          updateLeaves(Deps, Record, Req->MAccessMode);
          addNodeToLeaves(Record, AllocaCmd, Req->MAccessMode, ToEnqueue);
        }
      }
    }

    Record->MAllocaCommands.push_back(AllocaCmd);
    Record->MWriteLeaves.push_back(AllocaCmd, ToEnqueue);
    ++(AllocaCmd->MLeafCounter);
  }
  return AllocaCmd;
}

// The function sets the MemModified flag in the record if the requirement has
// write access.
void Scheduler::GraphBuilder::markModifiedIfWrite(MemObjRecord *Record,
                                                  Requirement *Req) {
  switch (Req->MAccessMode) {
  case access::mode::write:
  case access::mode::read_write:
  case access::mode::discard_write:
  case access::mode::discard_read_write:
  case access::mode::atomic:
    Record->MMemModified = true;
    break;
  case access::mode::read:
    break;
  }
}

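// Creates a blockable EmptyCommand on the default host queue and attaches it
// to Cmd and the given requirements, so that commands submitted later stay
// blocked until the host accessor or host task that owns it completes.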
template <typename T>
typename detail::enable_if_t<
    std::is_same<typename std::remove_cv_t<T>, Requirement>::value,
    EmptyCommand *>
Scheduler::GraphBuilder::addEmptyCmd(Command *Cmd, const std::vector<T *> &Reqs,
                                     const QueueImplPtr &Queue,
                                     Command::BlockReason Reason,
                                     std::vector<Command *> &ToEnqueue) {
  EmptyCommand *EmptyCmd =
      new EmptyCommand(Scheduler::getInstance().getDefaultHostQueue());

  if (!EmptyCmd)
    throw runtime_error("Out of host memory", PI_OUT_OF_HOST_MEMORY);

  EmptyCmd->MIsBlockable = true;
  EmptyCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueBlocked;
  EmptyCmd->MBlockReason = Reason;

  for (T *Req : Reqs) {
    MemObjRecord *Record = getOrInsertMemObjRecord(Queue, Req, ToEnqueue);
    AllocaCommandBase *AllocaCmd =
        getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue);
    EmptyCmd->addRequirement(Cmd, AllocaCmd, Req);
  }

  Cmd->addUser(EmptyCmd);

  const std::vector<DepDesc> &Deps = Cmd->MDeps;
  for (const DepDesc &Dep : Deps) {
    const Requirement *Req = Dep.MDepRequirement;
    MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj);

    updateLeaves({Cmd}, Record, Req->MAccessMode);
    addNodeToLeaves(Record, EmptyCmd, Req->MAccessMode, ToEnqueue);
  }

  return EmptyCmd;
}

static bool isInteropHostTask(const std::unique_ptr<ExecCGCommand> &Cmd) {
  if (Cmd->getCG().getType() != CG::CGTYPE::CodeplayHostTask)
    return false;

  const detail::CGHostTask &HT =
      static_cast<detail::CGHostTask &>(Cmd->getCG());

  return HT.MHostTask->isInteropTask();
}

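// If several requirements refer to the same memory object, combine their
// access modes into one so that the allocation created for the object takes
// all of them into account.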
static void combineAccessModesOfReqs(std::vector<Requirement *> &Reqs) {
  std::unordered_map<SYCLMemObjI *, access::mode> CombinedModes;
  bool HasDuplicateMemObjects = false;
  for (const Requirement *Req : Reqs) {
    auto Result = CombinedModes.insert(
        std::make_pair(Req->MSYCLMemObj, Req->MAccessMode));
    if (!Result.second) {
      Result.first->second =
          combineAccessModes(Result.first->second, Req->MAccessMode);
      HasDuplicateMemObjects = true;
    }
  }

  if (!HasDuplicateMemObjects)
    return;
  for (Requirement *Req : Reqs) {
    Req->MAccessMode = CombinedModes[Req->MSYCLMemObj];
  }
}

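// Adds an execution command for the command group to the graph: allocates or
// moves memory as required by each requirement, connects the new command to
// its dependencies, and updates the leaves of the affected records.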
Command *
Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
                               QueueImplPtr Queue,
                               std::vector<Command *> &ToEnqueue) {
  std::vector<Requirement *> &Reqs = CommandGroup->MRequirements;
  const std::vector<detail::EventImplPtr> &Events = CommandGroup->MEvents;
  const CG::CGTYPE CGType = CommandGroup->getType();

  auto NewCmd = std::make_unique<ExecCGCommand>(std::move(CommandGroup), Queue);
  if (!NewCmd)
    throw runtime_error("Out of host memory", PI_OUT_OF_HOST_MEMORY);

  if (MPrintOptionsArray[BeforeAddCG])
    printGraphAsDot("before_addCG");

  // If there are multiple requirements for the same memory object, its
  // AllocaCommand creation will be dependent on the access mode of the first
  // requirement. Combine these access modes to take all of them into account.
  combineAccessModesOfReqs(Reqs);
  for (Requirement *Req : Reqs) {
    MemObjRecord *Record = nullptr;
    AllocaCommandBase *AllocaCmd = nullptr;

    bool isSameCtx = false;

    {
      const QueueImplPtr &QueueForAlloca =
          isInteropHostTask(NewCmd)
              ? static_cast<detail::CGHostTask &>(NewCmd->getCG()).MQueue
              : Queue;

      Record = getOrInsertMemObjRecord(QueueForAlloca, Req, ToEnqueue);
      markModifiedIfWrite(Record, Req);

      AllocaCmd =
          getOrCreateAllocaForReq(Record, Req, QueueForAlloca, ToEnqueue);

      isSameCtx =
          sameCtx(QueueForAlloca->getContextImplPtr(), Record->MCurContext);
    }

    // If there is an alloca command, we need to check whether the latest
    // memory is in the required context.
    if (isSameCtx) {
      // If the memory is already in the required host context, check if the
      // required access mode is valid; remap if not.
      if (Record->MCurContext->is_host() &&
          !isAccessModeAllowed(Req->MAccessMode, Record->MHostAccess))
        remapMemoryObject(Record, Req, AllocaCmd, ToEnqueue);
    } else {
      // Memory cannot be copied directly from one OpenCL device to another -
      // create two copies: device->host and host->device.
      bool NeedMemMoveToHost = false;
      auto MemMoveTargetQueue = Queue;

      if (isInteropHostTask(NewCmd)) {
        const detail::CGHostTask &HT =
            static_cast<detail::CGHostTask &>(NewCmd->getCG());

        if (HT.MQueue->getContextImplPtr() != Record->MCurContext) {
          NeedMemMoveToHost = true;
          MemMoveTargetQueue = HT.MQueue;
        }
      } else if (!Queue->is_host() && !Record->MCurContext->is_host())
        NeedMemMoveToHost = true;

      if (NeedMemMoveToHost)
        insertMemoryMove(Record, Req,
                         Scheduler::getInstance().getDefaultHostQueue(),
                         ToEnqueue);
      insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue);
    }
    std::set<Command *> Deps =
        findDepsForReq(Record, Req, Queue->getContextImplPtr());

    for (Command *Dep : Deps)
      if (Command *ConnCmd = NewCmd->addDep(DepDesc{Dep, Req, AllocaCmd}))
        ToEnqueue.push_back(ConnCmd);
  }

  // Set the new command as a user of its dependencies and update the leaves.
  // Node dependencies can be modified further when adding the node to the
  // leaves, so iterate over a copy.
  // FIXME employ a reference here to eliminate copying of a vector
  std::vector<DepDesc> Deps = NewCmd->MDeps;
  for (DepDesc &Dep : Deps) {
    Dep.MDepCommand->addUser(NewCmd.get());
    const Requirement *Req = Dep.MDepRequirement;
    MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj);
    updateLeaves({Dep.MDepCommand}, Record, Req->MAccessMode);
    addNodeToLeaves(Record, NewCmd.get(), Req->MAccessMode, ToEnqueue);
  }

  // Register all the events as dependencies
  for (detail::EventImplPtr e : Events) {
    if (Command *ConnCmd = NewCmd->addDep(e))
      ToEnqueue.push_back(ConnCmd);
  }

  if (CGType == CG::CGTYPE::CodeplayHostTask)
    NewCmd->MEmptyCmd =
        addEmptyCmd(NewCmd.get(), NewCmd->getCG().MRequirements, Queue,
                    Command::BlockReason::HostTask, ToEnqueue);

  if (MPrintOptionsArray[AfterAddCG])
    printGraphAsDot("after_addCG");

  return NewCmd.release();
}

void Scheduler::GraphBuilder::decrementLeafCountersForRecord(
    MemObjRecord *Record) {
  for (Command *Cmd : Record->MReadLeaves) {
    --(Cmd->MLeafCounter);
  }
  for (Command *Cmd : Record->MWriteLeaves) {
    --(Cmd->MLeafCounter);
  }
}

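// Removes every command reachable from the record's alloca commands from the
// graph, collecting the stream objects of the removed execution commands so
// the caller can deallocate them.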
void Scheduler::GraphBuilder::cleanupCommandsForRecord(
    MemObjRecord *Record,
    std::vector<std::shared_ptr<stream_impl>> &StreamsToDeallocate) {
  std::vector<AllocaCommandBase *> &AllocaCommands = Record->MAllocaCommands;
  if (AllocaCommands.empty())
    return;

  assert(MCmdsToVisit.empty());
  MVisitedCmds.clear();

  // First, mark all allocas for deletion and their direct users for traversal.
  // Dependencies of the users will be cleaned up during the traversal.
  for (Command *AllocaCmd : AllocaCommands) {
    markNodeAsVisited(AllocaCmd, MVisitedCmds);

    for (Command *UserCmd : AllocaCmd->MUsers)
      // A linked alloca cmd may be among the users of this alloca; it is not
      // going to be visited.
      if (UserCmd->getType() != Command::CommandType::ALLOCA)
        MCmdsToVisit.push(UserCmd);
      else
        markNodeAsVisited(UserCmd, MVisitedCmds);

    AllocaCmd->MMarks.MToBeDeleted = true;
    // These commands will be deleted later; clear their users now to avoid
    // updating them during edge removal.
    AllocaCmd->MUsers.clear();
  }

  // Make sure the linked allocas were marked visited by the previous walk.
  // Remove the allocation commands from the users of their dependencies.
  for (AllocaCommandBase *AllocaCmd : AllocaCommands) {
    AllocaCommandBase *LinkedCmd = AllocaCmd->MLinkedAllocaCmd;

    if (LinkedCmd) {
      assert(LinkedCmd->MMarks.MVisited);
    }

    for (DepDesc &Dep : AllocaCmd->MDeps)
      if (Dep.MDepCommand)
        Dep.MDepCommand->MUsers.erase(AllocaCmd);
  }

  // Traverse the graph using BFS
  while (!MCmdsToVisit.empty()) {
    Command *Cmd = MCmdsToVisit.front();
    MCmdsToVisit.pop();

    if (!markNodeAsVisited(Cmd, MVisitedCmds))
      continue;

    // Collect stream objects for the visited command.
    if (Cmd->getType() == Command::CommandType::RUN_CG) {
      auto ExecCmd = static_cast<ExecCGCommand *>(Cmd);
      std::vector<std::shared_ptr<stream_impl>> Streams = ExecCmd->getStreams();
      ExecCmd->clearStreams();
      StreamsToDeallocate.insert(StreamsToDeallocate.end(), Streams.begin(),
                                 Streams.end());
    }

    for (Command *UserCmd : Cmd->MUsers)
      if (UserCmd->getType() != Command::CommandType::ALLOCA)
        MCmdsToVisit.push(UserCmd);

    // Delete all dependencies on any allocations being removed.
    // Track which commands should have their users updated.
    std::map<Command *, bool> ShouldBeUpdated;
    auto NewEnd = std::remove_if(
        Cmd->MDeps.begin(), Cmd->MDeps.end(), [&](const DepDesc &Dep) {
          if (std::find(AllocaCommands.begin(), AllocaCommands.end(),
                        Dep.MAllocaCmd) != AllocaCommands.end()) {
            ShouldBeUpdated.insert({Dep.MDepCommand, true});
            return true;
          }
          ShouldBeUpdated[Dep.MDepCommand] = false;
          return false;
        });
    Cmd->MDeps.erase(NewEnd, Cmd->MDeps.end());

    // Update the users of the removed dependencies
    for (auto DepCmdIt : ShouldBeUpdated) {
      if (!DepCmdIt.second)
        continue;
      DepCmdIt.first->MUsers.erase(Cmd);
    }

    // If all dependencies have been removed this way, mark the command for
    // deletion
    if (Cmd->MDeps.empty()) {
      Cmd->MMarks.MToBeDeleted = true;
      Cmd->MUsers.clear();
    }
  }

  handleVisitedNodes(MVisitedCmds);
}

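// Detaches a finished command (and the finished commands it depends on) from
// the graph: its users are re-linked to the corresponding alloca commands,
// and the command itself is marked for deletion unless it is still a leaf or
// an allocation command.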
void Scheduler::GraphBuilder::cleanupFinishedCommands(
    Command *FinishedCmd,
    std::vector<std::shared_ptr<stream_impl>> &StreamsToDeallocate) {
  assert(MCmdsToVisit.empty());
  MCmdsToVisit.push(FinishedCmd);
  MVisitedCmds.clear();

  // Traverse the graph using BFS
  while (!MCmdsToVisit.empty()) {
    Command *Cmd = MCmdsToVisit.front();
    MCmdsToVisit.pop();

    if (!markNodeAsVisited(Cmd, MVisitedCmds))
      continue;

    // Collect stream objects for the visited command.
    if (Cmd->getType() == Command::CommandType::RUN_CG) {
      auto ExecCmd = static_cast<ExecCGCommand *>(Cmd);
      std::vector<std::shared_ptr<stream_impl>> Streams = ExecCmd->getStreams();
      ExecCmd->clearStreams();
      StreamsToDeallocate.insert(StreamsToDeallocate.end(), Streams.begin(),
                                 Streams.end());
    }

    for (const DepDesc &Dep : Cmd->MDeps) {
      if (Dep.MDepCommand)
        MCmdsToVisit.push(Dep.MDepCommand);
    }

    // Do not clean up the node if it is a leaf for any memory object
    if (Cmd->MLeafCounter > 0)
      continue;
    // Do not clean up allocation commands
    Command::CommandType CmdT = Cmd->getType();
    if (CmdT == Command::ALLOCA || CmdT == Command::ALLOCA_SUB_BUF)
      continue;

    for (Command *UserCmd : Cmd->MUsers) {
      for (DepDesc &Dep : UserCmd->MDeps) {
        // Link the users of the command to the alloca command(s) instead
        if (Dep.MDepCommand == Cmd) {
          Dep.MDepCommand = Dep.MAllocaCmd;
          Dep.MDepCommand->MUsers.insert(UserCmd);
        }
      }
    }
    // Update the dependency users
    for (DepDesc &Dep : Cmd->MDeps) {
      Command *DepCmd = Dep.MDepCommand;
      DepCmd->MUsers.erase(Cmd);
    }

    Cmd->MMarks.MToBeDeleted = true;
  }
  handleVisitedNodes(MVisitedCmds);
}

void Scheduler::GraphBuilder::removeRecordForMemObj(SYCLMemObjI *MemObject) {
  const auto It = std::find_if(
      MMemObjs.begin(), MMemObjs.end(),
      [MemObject](const SYCLMemObjI *Obj) { return Obj == MemObject; });
  if (It != MMemObjs.end())
    MMemObjs.erase(It);
  MemObject->MRecord.reset();
}

// Makes Cmd depend on DepEvent from a different context. The connection is
// performed via a distinct ConnectCmd with a host task command group on the
// host queue. Cmd will depend on ConnectCmd's host event.
// DepEvent may not have a command associated with it in at least two cases:
//  - the command was deleted during the cleanup process;
//  - DepEvent is a user event.
// In both of these cases the only thing we can do is to make ConnectCmd
// depend on DepEvent.
// Otherwise, when there is a command associated with DepEvent, we make
// ConnectCmd depend on this command. If there is a valid, i.e. non-nil,
// requirement in Dep, we make ConnectCmd depend on DepEvent's command with
// this requirement.
// The optionality of Dep is signalled by Dep.MDepCommand being nullptr.
Command *Scheduler::GraphBuilder::connectDepEvent(Command *const Cmd,
                                                  EventImplPtr DepEvent,
                                                  const DepDesc &Dep) {
  assert(Cmd->getWorkerContext() != DepEvent->getContextImpl());

  // Construct a host-task-type command manually and make it depend on DepEvent
  ExecCGCommand *ConnectCmd = nullptr;

  try {
    std::unique_ptr<detail::HostTask> HT(new detail::HostTask);
    std::unique_ptr<detail::CG> ConnectCG(new detail::CGHostTask(
        std::move(HT), /* Queue = */ {}, /* Context = */ {}, /* Args = */ {},
        /* ArgsStorage = */ {}, /* AccStorage = */ {},
        /* SharedPtrStorage = */ {}, /* Requirements = */ {},
        /* DepEvents = */ {DepEvent}, CG::CodeplayHostTask,
        /* Payload */ {}));
    ConnectCmd = new ExecCGCommand(
        std::move(ConnectCG), Scheduler::getInstance().getDefaultHostQueue());
  } catch (const std::bad_alloc &) {
    throw runtime_error("Out of host memory", PI_OUT_OF_HOST_MEMORY);
  }

  if (Command *DepCmd = reinterpret_cast<Command *>(DepEvent->getCommand()))
    DepCmd->addUser(ConnectCmd);

  EmptyCommand *EmptyCmd = nullptr;

  if (Dep.MDepRequirement) {
    // Make ConnectCmd depend on the requirement.
    // Dismiss the result here: it is not a connection, since ConnectCmd is a
    // host command.
    (void)ConnectCmd->addDep(Dep);
    assert(reinterpret_cast<Command *>(DepEvent->getCommand()) ==
           Dep.MDepCommand);
    // Adding the user to Dep.MDepCommand has already been performed above.

    MemObjRecord *Record = getMemObjRecord(Dep.MDepRequirement->MSYCLMemObj);

    updateLeaves({Dep.MDepCommand}, Record, Dep.MDepRequirement->MAccessMode);
    std::vector<Command *> ToEnqueue;
    addNodeToLeaves(Record, ConnectCmd, Dep.MDepRequirement->MAccessMode,
                    ToEnqueue);
    assert(ToEnqueue.size() == 0);

    const std::vector<const Requirement *> Reqs(1, Dep.MDepRequirement);
    EmptyCmd = addEmptyCmd(ConnectCmd, Reqs,
                           Scheduler::getInstance().getDefaultHostQueue(),
                           Command::BlockReason::HostTask, ToEnqueue);
    assert(ToEnqueue.size() == 0);
    // Dependencies for EmptyCmd are set in addEmptyCmd for the provided Reqs.

    // Make Cmd depend on the empty command
    {
      DepDesc CmdDep = Dep;
      CmdDep.MDepCommand = EmptyCmd;

      // Dismiss the result here: it is not a connection, since EmptyCmd is a
      // host command.
      (void)Cmd->addDep(CmdDep);
    }
  } else {
    std::vector<Command *> ToEnqueue;
    EmptyCmd = addEmptyCmd<Requirement>(
        ConnectCmd, {}, Scheduler::getInstance().getDefaultHostQueue(),
        Command::BlockReason::HostTask, ToEnqueue);
    assert(ToEnqueue.size() == 0);

    // There is no requirement, so the empty command will only depend on
    // ConnectCmd via its event.
    // Dismiss the results here: these are not connections, since ConnectCmd
    // and EmptyCmd are host commands.
    (void)EmptyCmd->addDep(ConnectCmd->getEvent());
    (void)ConnectCmd->addDep(DepEvent);

    // Make Cmd depend on the empty command
    (void)Cmd->addDep(EmptyCmd->getEvent());
  }

  EmptyCmd->addUser(Cmd);

  ConnectCmd->MEmptyCmd = EmptyCmd;

  return ConnectCmd;
}

} // namespace detail
} // namespace sycl
} // __SYCL_INLINE_NAMESPACE(cl)