DPC++ Runtime
Runtime libraries for oneAPI DPC++
graph_impl.cpp
Go to the documentation of this file.
1 //==--------- graph_impl.cpp - SYCL graph extension -----------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include <detail/graph_impl.hpp>
10 #include <detail/handler_impl.hpp>
13 #include <detail/queue_impl.hpp>
16 #include <sycl/feature_test.hpp>
17 #include <sycl/queue.hpp>
18 
19 namespace sycl {
20 inline namespace _V1 {
21 
22 namespace ext {
23 namespace oneapi {
24 namespace experimental {
25 namespace detail {
26 
27 namespace {
39 bool visitNodeDepthFirst(
40  std::shared_ptr<node_impl> Node,
41  std::set<std::shared_ptr<node_impl>> &VisitedNodes,
42  std::deque<std::shared_ptr<node_impl>> &NodeStack,
43  std::function<bool(std::shared_ptr<node_impl> &,
44  std::deque<std::shared_ptr<node_impl>> &)>
45  NodeFunc) {
46  auto EarlyReturn = NodeFunc(Node, NodeStack);
47  if (EarlyReturn) {
48  return true;
49  }
50  NodeStack.push_back(Node);
51  Node->MVisited = true;
52  VisitedNodes.emplace(Node);
53  for (auto &Successor : Node->MSuccessors) {
54  if (visitNodeDepthFirst(Successor.lock(), VisitedNodes, NodeStack,
55  NodeFunc)) {
56  return true;
57  }
58  }
59  NodeStack.pop_back();
60  return false;
61 }
62 
69 void sortTopological(std::shared_ptr<node_impl> NodeImpl,
70  std::list<std::shared_ptr<node_impl>> &Schedule,
71  bool PartitionBounded = false) {
72  for (auto &Succ : NodeImpl->MSuccessors) {
73  auto NextNode = Succ.lock();
74  if (PartitionBounded &&
75  (NextNode->MPartitionNum != NodeImpl->MPartitionNum)) {
76  continue;
77  }
78  // Check if we've already scheduled this node
79  if (std::find(Schedule.begin(), Schedule.end(), NextNode) ==
80  Schedule.end()) {
81  sortTopological(NextNode, Schedule, PartitionBounded);
82  }
83  }
84 
85  Schedule.push_front(NodeImpl);
86 }
87 
99 void propagatePartitionUp(std::shared_ptr<node_impl> Node, int PartitionNum) {
100  if (((Node->MPartitionNum != -1) && (Node->MPartitionNum <= PartitionNum)) ||
101  (Node->MCGType == sycl::detail::CG::CGTYPE::CodeplayHostTask)) {
102  return;
103  }
104  Node->MPartitionNum = PartitionNum;
105  for (auto &Predecessor : Node->MPredecessors) {
106  propagatePartitionUp(Predecessor.lock(), PartitionNum);
107  }
108 }
109 
117 void propagatePartitionDown(
118  std::shared_ptr<node_impl> Node, int PartitionNum,
119  std::list<std::shared_ptr<node_impl>> &HostTaskList) {
120  if (Node->MCGType == sycl::detail::CG::CGTYPE::CodeplayHostTask) {
121  if (Node->MPartitionNum != -1) {
122  HostTaskList.push_front(Node);
123  }
124  return;
125  }
126  Node->MPartitionNum = PartitionNum;
127  for (auto &Successor : Node->MSuccessors) {
128  propagatePartitionDown(Successor.lock(), PartitionNum, HostTaskList);
129  }
130 }
131 
136 bool isPartitionRoot(std::shared_ptr<node_impl> Node) {
137  for (auto &Predecessor : Node->MPredecessors) {
138  if (Predecessor.lock()->MPartitionNum == Node->MPartitionNum) {
139  return false;
140  }
141  }
142  return true;
143 }
144 
147 std::vector<node> createNodesFromImpls(
148  const std::vector<std::weak_ptr<detail::node_impl>> &Impls) {
149  std::vector<node> Nodes{};
150  Nodes.reserve(Impls.size());
151 
152  for (std::weak_ptr<detail::node_impl> Impl : Impls) {
153  Nodes.push_back(sycl::detail::createSyclObjFromImpl<node>(Impl.lock()));
154  }
155 
156  return Nodes;
157 }
158 
161 std::vector<node> createNodesFromImpls(
162  const std::vector<std::shared_ptr<detail::node_impl>> &Impls) {
163  std::vector<node> Nodes{};
164  Nodes.reserve(Impls.size());
165 
166  for (std::shared_ptr<detail::node_impl> Impl : Impls) {
167  Nodes.push_back(sycl::detail::createSyclObjFromImpl<node>(Impl));
168  }
169 
170  return Nodes;
171 }
172 
173 } // anonymous namespace
174 
176  if (MSchedule.empty()) {
177  for (auto &Node : MRoots) {
178  sortTopological(Node.lock(), MSchedule, true);
179  }
180  }
181 }
182 
184  int CurrentPartition = -1;
185  std::list<std::shared_ptr<node_impl>> HostTaskList;
186  // find all the host-tasks in the graph
187  for (auto &Node : MNodeStorage) {
188  if (Node->MCGType == sycl::detail::CG::CodeplayHostTask) {
189  HostTaskList.push_back(Node);
190  }
191  }
192 
193  // Annotate nodes
194  // The first step in graph partitioning is to annotate all nodes of the graph
195  // with a temporary partition or group number. This step allows us to group
196  // the graph nodes into sets of nodes with kind of meta-dependencies that must
197  // be enforced by the runtime. For example, Group 2 depends on Groups 0 and 1,
198  // which means that we should not try to run Group 2 before Groups 0 and 1
199  // have finished executing. Since host-tasks are currently the only tasks that
200  // require runtime dependency handling, groups of nodes are created from
201  // host-task nodes. We therefore loop over all the host-task nodes, and for
202  // each node:
203  // - Its predecessors are assigned to group number `n-1`
204  // - The node itself constitutes a group, group number `n`
205  // - Its successors are assigned to group number `n+1`
206  // Since running multiple partitions slows down the whole graph execution, we
207  // then try to reduce the number of partitions by merging them when possible.
208  // Typically, the grouping algorithm can create two successive partitions
209  // of target nodes in the following case:
210  // A host-task `A` is added to the graph. Later, another host task `B` is
211  // added to the graph. Consequently, the node `A` is stored before the node
212  // `B` in the node storage vector. Now, if `A` is placed as a successor of `B`
213  // (using make_edge function to make node `A` dependent on node `B`.) In this
214  // case, the host-task node `A` must be reprocessed after the node `B` and the
215  // group that includes the predecessor of `B` can be merged with the group of
216  // the predecessors of the node `A`.
217  while (HostTaskList.size() > 0) {
218  auto Node = HostTaskList.front();
219  HostTaskList.pop_front();
220  CurrentPartition++;
221  for (auto &Predecessor : Node->MPredecessors) {
222  propagatePartitionUp(Predecessor.lock(), CurrentPartition);
223  }
224  CurrentPartition++;
225  Node->MPartitionNum = CurrentPartition;
226  CurrentPartition++;
227  auto TmpSize = HostTaskList.size();
228  for (auto &Successor : Node->MSuccessors) {
229  propagatePartitionDown(Successor.lock(), CurrentPartition, HostTaskList);
230  }
231  if (HostTaskList.size() > TmpSize) {
232  // At least one HostTask has been re-numbered so group merge opportunities
233  for (const auto &HT : HostTaskList) {
234  auto HTPartitionNum = HT->MPartitionNum;
235  if (HTPartitionNum != -1) {
236  // can merge predecessors of node `Node` with predecessors of node
237  // `HT` (HTPartitionNum-1) since HT must be reprocessed
238  for (const auto &NodeImpl : MNodeStorage) {
239  if (NodeImpl->MPartitionNum == Node->MPartitionNum - 1) {
240  NodeImpl->MPartitionNum = HTPartitionNum - 1;
241  }
242  }
243  } else {
244  break;
245  }
246  }
247  }
248  }
249 
250  // Create partitions
251  int PartitionFinalNum = 0;
252  for (int i = -1; i <= CurrentPartition; i++) {
253  const std::shared_ptr<partition> &Partition = std::make_shared<partition>();
254  for (auto &Node : MNodeStorage) {
255  if (Node->MPartitionNum == i) {
256  MPartitionNodes[Node] = PartitionFinalNum;
257  if (isPartitionRoot(Node)) {
258  Partition->MRoots.insert(Node);
259  }
260  }
261  }
262  if (Partition->MRoots.size() > 0) {
263  Partition->schedule();
264  MPartitions.push_back(Partition);
265  PartitionFinalNum++;
266  }
267  }
268 
269  // Add an empty partition if there is no partition, i.e. empty graph
270  if (MPartitions.size() == 0) {
271  MPartitions.push_back(std::make_shared<partition>());
272  }
273 
274  // Make global schedule list
275  for (const auto &Partition : MPartitions) {
276  MSchedule.insert(MSchedule.end(), Partition->MSchedule.begin(),
277  Partition->MSchedule.end());
278  }
279 
280  // Compute partition dependencies
281  for (const auto &Partition : MPartitions) {
282  for (auto const &Root : Partition->MRoots) {
283  auto RootNode = Root.lock();
284  for (const auto &Dep : RootNode->MPredecessors) {
285  auto NodeDep = Dep.lock();
286  Partition->MPredecessors.push_back(
287  MPartitions[MPartitionNodes[NodeDep]]);
288  }
289  }
290  }
291 
292  // Reset node groups (if node have to be re-processed - e.g. subgraph)
293  for (auto &Node : MNodeStorage) {
294  Node->MPartitionNum = -1;
295  }
296 }
297 
299  clearQueues();
300  for (auto &MemObj : MMemObjs) {
301  MemObj->markNoLongerBeingUsedInGraph();
302  }
303 }
304 
305 std::shared_ptr<node_impl> graph_impl::addNodesToExits(
306  const std::shared_ptr<graph_impl> &Impl,
307  const std::list<std::shared_ptr<node_impl>> &NodeList) {
308  // Find all input and output nodes from the node list
309  std::vector<std::shared_ptr<node_impl>> Inputs;
310  std::vector<std::shared_ptr<node_impl>> Outputs;
311  for (auto &NodeImpl : NodeList) {
312  if (NodeImpl->MPredecessors.size() == 0) {
313  Inputs.push_back(NodeImpl);
314  }
315  if (NodeImpl->MSuccessors.size() == 0) {
316  Outputs.push_back(NodeImpl);
317  }
318  }
319 
320  // Find all exit nodes in the current graph and register the Inputs as
321  // successors
322  for (auto &NodeImpl : MNodeStorage) {
323  if (NodeImpl->MSuccessors.size() == 0) {
324  for (auto &Input : Inputs) {
325  NodeImpl->registerSuccessor(Input, NodeImpl);
326  }
327  }
328  }
329 
330  // Add all the new nodes to the node storage
331  for (auto &Node : NodeList) {
332  MNodeStorage.push_back(Node);
333  addEventForNode(Impl, std::make_shared<sycl::detail::event_impl>(), Node);
334  }
335 
336  return this->add(Impl, Outputs);
337 }
338 
339 void graph_impl::addRoot(const std::shared_ptr<node_impl> &Root) {
340  MRoots.insert(Root);
341 }
342 
343 void graph_impl::removeRoot(const std::shared_ptr<node_impl> &Root) {
344  MRoots.erase(Root);
345 }
346 
347 std::shared_ptr<node_impl>
348 graph_impl::add(const std::shared_ptr<graph_impl> &Impl,
349  const std::vector<std::shared_ptr<node_impl>> &Dep) {
350  // Copy deps so we can modify them
351  auto Deps = Dep;
352 
353  const std::shared_ptr<node_impl> &NodeImpl = std::make_shared<node_impl>();
354 
355  // Add any deps from the vector of extra dependencies
356  Deps.insert(Deps.end(), MExtraDependencies.begin(), MExtraDependencies.end());
357 
358  MNodeStorage.push_back(NodeImpl);
359 
360  addDepsToNode(NodeImpl, Deps);
361  // Add an event associated with this explicit node for mixed usage
362  addEventForNode(Impl, std::make_shared<sycl::detail::event_impl>(), NodeImpl);
363  return NodeImpl;
364 }
365 
366 std::shared_ptr<node_impl>
367 graph_impl::add(const std::shared_ptr<graph_impl> &Impl,
368  std::function<void(handler &)> CGF,
369  const std::vector<sycl::detail::ArgDesc> &Args,
370  const std::vector<std::shared_ptr<node_impl>> &Dep) {
371  (void)Args;
372  sycl::handler Handler{Impl};
373  CGF(Handler);
374  Handler.finalize();
375 
376  if (Handler.MCGType == sycl::detail::CG::Barrier) {
377  throw sycl::exception(
379  "The sycl_ext_oneapi_enqueue_barrier feature is not available with "
380  "SYCL Graph Explicit API. Please use empty nodes instead.");
381  }
382 
383  node_type NodeType =
384  Handler.MImpl->MUserFacingNodeType !=
386  ? Handler.MImpl->MUserFacingNodeType
388  Handler.MCGType);
389 
390  auto NodeImpl = this->add(NodeType, std::move(Handler.MGraphNodeCG), Dep);
391  NodeImpl->MNDRangeUsed = Handler.MImpl->MNDRangeUsed;
392  // Add an event associated with this explicit node for mixed usage
393  addEventForNode(Impl, std::make_shared<sycl::detail::event_impl>(), NodeImpl);
394 
395  // Retrieve any dynamic parameters which have been registered in the CGF and
396  // register the actual nodes with them.
397  auto &DynamicParams = Handler.MImpl->MDynamicParameters;
398 
399  if (NodeType != node_type::kernel && DynamicParams.size() > 0) {
401  "dynamic_parameters cannot be registered with graph "
402  "nodes which do not represent kernel executions");
403  }
404 
405  for (auto &[DynamicParam, ArgIndex] : DynamicParams) {
406  DynamicParam->registerNode(NodeImpl, ArgIndex);
407  }
408 
409  return NodeImpl;
410 }
411 
412 std::shared_ptr<node_impl>
413 graph_impl::add(const std::shared_ptr<graph_impl> &Impl,
414  const std::vector<sycl::detail::EventImplPtr> Events) {
415 
416  std::vector<std::shared_ptr<node_impl>> Deps;
417 
418  // Add any nodes specified by event dependencies into the dependency list
419  for (const auto &Dep : Events) {
420  if (auto NodeImpl = MEventsMap.find(Dep); NodeImpl != MEventsMap.end()) {
421  Deps.push_back(NodeImpl->second);
422  } else {
424  "Event dependency from handler::depends_on does "
425  "not correspond to a node within the graph");
426  }
427  }
428 
429  return this->add(Impl, Deps);
430 }
431 
432 std::shared_ptr<node_impl>
434  std::unique_ptr<sycl::detail::CG> CommandGroup,
435  const std::vector<std::shared_ptr<node_impl>> &Dep) {
436  // Copy deps so we can modify them
437  auto Deps = Dep;
438 
439  // A unique set of dependencies obtained by checking requirements and events
440  std::set<std::shared_ptr<node_impl>> UniqueDeps;
441  const auto &Requirements = CommandGroup->getRequirements();
442  if (!MAllowBuffers && Requirements.size()) {
444  "Cannot use buffers in a graph without passing the "
445  "assume_buffer_outlives_graph property on "
446  "Graph construction.");
447  }
448 
449  for (auto &Req : Requirements) {
450  // Track and mark the memory objects being used by the graph.
451  auto MemObj = static_cast<sycl::detail::SYCLMemObjT *>(Req->MSYCLMemObj);
452  bool WasInserted = MMemObjs.insert(MemObj).second;
453  if (WasInserted) {
454  MemObj->markBeingUsedInGraph();
455  }
456  // Look through the graph for nodes which share this requirement
457  for (auto &Node : MNodeStorage) {
458  if (Node->hasRequirementDependency(Req)) {
459  bool ShouldAddDep = true;
460  // If any of this node's successors have this requirement then we skip
461  // adding the current node as a dependency.
462  for (auto &Succ : Node->MSuccessors) {
463  if (Succ.lock()->hasRequirementDependency(Req)) {
464  ShouldAddDep = false;
465  break;
466  }
467  }
468  if (ShouldAddDep) {
469  UniqueDeps.insert(Node);
470  }
471  }
472  }
473  }
474 
475  // Add any nodes specified by event dependencies into the dependency list
476  for (auto &Dep : CommandGroup->getEvents()) {
477  if (auto NodeImpl = MEventsMap.find(Dep); NodeImpl != MEventsMap.end()) {
478  UniqueDeps.insert(NodeImpl->second);
479  } else {
481  "Event dependency from handler::depends_on does "
482  "not correspond to a node within the graph");
483  }
484  }
485  // Add any deps determined from requirements and events into the dependency
486  // list
487  Deps.insert(Deps.end(), UniqueDeps.begin(), UniqueDeps.end());
488 
489  // Add any deps from the extra dependencies vector
490  Deps.insert(Deps.end(), MExtraDependencies.begin(), MExtraDependencies.end());
491 
492  const std::shared_ptr<node_impl> &NodeImpl =
493  std::make_shared<node_impl>(NodeType, std::move(CommandGroup));
494  MNodeStorage.push_back(NodeImpl);
495 
496  addDepsToNode(NodeImpl, Deps);
497 
498  // Set barrier nodes as prerequisites (new start points) for subsequent nodes
499  if (NodeImpl->MCGType == sycl::detail::CG::Barrier) {
500  MExtraDependencies.push_back(NodeImpl);
501  }
502 
503  return NodeImpl;
504 }
505 
507  bool AnyQueuesCleared = false;
508  for (auto &Queue : MRecordingQueues) {
509  if (auto ValidQueue = Queue.lock(); ValidQueue) {
510  ValidQueue->setCommandGraph(nullptr);
511  AnyQueuesCleared = true;
512  }
513  }
514  MRecordingQueues.clear();
515 
516  return AnyQueuesCleared;
517 }
518 
519 void graph_impl::searchDepthFirst(
520  std::function<bool(std::shared_ptr<node_impl> &,
521  std::deque<std::shared_ptr<node_impl>> &)>
522  NodeFunc) {
523  // Track nodes visited during the search which can be used by NodeFunc in
524  // depth first search queries. Currently unusued but is an
525  // integral part of depth first searches.
526  std::set<std::shared_ptr<node_impl>> VisitedNodes;
527 
528  for (auto &Root : MRoots) {
529  std::deque<std::shared_ptr<node_impl>> NodeStack;
530  if (visitNodeDepthFirst(Root.lock(), VisitedNodes, NodeStack, NodeFunc)) {
531  break;
532  }
533  }
534 
535  // Reset the visited status of all nodes encountered in the search.
536  for (auto &Node : VisitedNodes) {
537  Node->MVisited = false;
538  }
539 }
540 
541 bool graph_impl::checkForCycles() {
542  // Using a depth-first search and checking if we vist a node more than once in
543  // the current path to identify if there are cycles.
544  bool CycleFound = false;
545  auto CheckFunc = [&](std::shared_ptr<node_impl> &Node,
546  std::deque<std::shared_ptr<node_impl>> &NodeStack) {
547  // If the current node has previously been found in the current path through
548  // the graph then we have a cycle and we end the search early.
549  if (std::find(NodeStack.begin(), NodeStack.end(), Node) !=
550  NodeStack.end()) {
551  CycleFound = true;
552  return true;
553  }
554  return false;
555  };
556  searchDepthFirst(CheckFunc);
557  return CycleFound;
558 }
559 
560 void graph_impl::makeEdge(std::shared_ptr<node_impl> Src,
561  std::shared_ptr<node_impl> Dest) {
562  throwIfGraphRecordingQueue("make_edge()");
563  if (Src == Dest) {
564  throw sycl::exception(
565  make_error_code(sycl::errc::invalid),
566  "make_edge() cannot be called when Src and Dest are the same.");
567  }
568 
569  bool SrcFound = false;
570  bool DestFound = false;
571  for (const auto &Node : MNodeStorage) {
572 
573  SrcFound |= Node == Src;
574  DestFound |= Node == Dest;
575 
576  if (SrcFound && DestFound) {
577  break;
578  }
579  }
580 
581  if (!SrcFound) {
582  throw sycl::exception(make_error_code(sycl::errc::invalid),
583  "Src must be a node inside the graph.");
584  }
585  if (!DestFound) {
586  throw sycl::exception(make_error_code(sycl::errc::invalid),
587  "Dest must be a node inside the graph.");
588  }
589 
590  // We need to add the edges first before checking for cycles
591  Src->registerSuccessor(Dest, Src);
592 
593  // We can skip cycle checks if either Dest has no successors (cycle not
594  // possible) or cycle checks have been disabled with the no_cycle_check
595  // property;
596  if (Dest->MSuccessors.empty() || !MSkipCycleChecks) {
597  bool CycleFound = checkForCycles();
598 
599  if (CycleFound) {
600  // Remove the added successor and predecessor
601  Src->MSuccessors.pop_back();
602  Dest->MPredecessors.pop_back();
603 
604  throw sycl::exception(make_error_code(sycl::errc::invalid),
605  "Command graphs cannot contain cycles.");
606  }
607  }
608  removeRoot(Dest); // remove receiver from root node list
609 }
610 
611 std::vector<sycl::detail::EventImplPtr> graph_impl::getExitNodesEvents() {
612  std::vector<sycl::detail::EventImplPtr> Events;
613 
614  for (auto &Node : MNodeStorage) {
615  if (Node->MSuccessors.empty()) {
616  Events.push_back(getEventForNode(Node));
617  }
618  }
619 
620  return Events;
621 }
622 
623 // Check if nodes are empty and if so loop back through predecessors until we
624 // find the real dependency.
625 void exec_graph_impl::findRealDeps(
626  std::vector<sycl::detail::pi::PiExtSyncPoint> &Deps,
627  std::shared_ptr<node_impl> CurrentNode, int ReferencePartitionNum) {
628  if (CurrentNode->isEmpty()) {
629  for (auto &N : CurrentNode->MPredecessors) {
630  auto NodeImpl = N.lock();
631  findRealDeps(Deps, NodeImpl, ReferencePartitionNum);
632  }
633  } else {
634  // Verify if CurrentNode belong the the same partition
635  if (MPartitionNodes[CurrentNode] == ReferencePartitionNum) {
636  // Verify that the sync point has actually been set for this node.
637  auto SyncPoint = MPiSyncPoints.find(CurrentNode);
638  assert(SyncPoint != MPiSyncPoints.end() &&
639  "No sync point has been set for node dependency.");
640  // Check if the dependency has already been added.
641  if (std::find(Deps.begin(), Deps.end(), SyncPoint->second) ==
642  Deps.end()) {
643  Deps.push_back(SyncPoint->second);
644  }
645  }
646  }
647 }
648 
649 sycl::detail::pi::PiExtSyncPoint exec_graph_impl::enqueueNodeDirect(
652  std::shared_ptr<node_impl> Node) {
653  std::vector<sycl::detail::pi::PiExtSyncPoint> Deps;
654  for (auto &N : Node->MPredecessors) {
655  findRealDeps(Deps, N.lock(), MPartitionNodes[Node]);
656  }
660  Ctx, DeviceImpl, CommandBuffer,
661  *static_cast<sycl::detail::CGExecKernel *>((Node->MCommandGroup.get())),
662  Deps, &NewSyncPoint, &NewCommand, nullptr);
663 
664  MCommandMap[Node] = NewCommand;
665 
666  if (Res != pi_result::PI_SUCCESS) {
668  "Failed to add kernel to PI command-buffer");
669  }
670 
671  return NewSyncPoint;
672 }
673 
674 sycl::detail::pi::PiExtSyncPoint exec_graph_impl::enqueueNode(
675  sycl::context Ctx, std::shared_ptr<sycl::detail::device_impl> DeviceImpl,
677  std::shared_ptr<node_impl> Node) {
678 
679  // Queue which will be used for allocation operations for accessors.
680  auto AllocaQueue = std::make_shared<sycl::detail::queue_impl>(
683 
684  std::vector<sycl::detail::pi::PiExtSyncPoint> Deps;
685  for (auto &N : Node->MPredecessors) {
686  findRealDeps(Deps, N.lock(), MPartitionNodes[Node]);
687  }
688 
690  sycl::detail::Scheduler::getInstance().addCG(
691  Node->getCGCopy(), AllocaQueue, CommandBuffer, Deps);
692 
693  MCommandMap[Node] = Event->getCommandBufferCommand();
694  return Event->getSyncPoint();
695 }
697  sycl::device Device, std::shared_ptr<partition> &Partition) {
698  sycl::detail::pi::PiExtCommandBuffer OutCommandBuffer;
701  MIsUpdatable};
702  auto ContextImpl = sycl::detail::getSyclObjImpl(MContext);
703  const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin();
704  auto DeviceImpl = sycl::detail::getSyclObjImpl(Device);
705  pi_result Res =
707  ContextImpl->getHandleRef(), DeviceImpl->getHandleRef(), &Desc,
708  &OutCommandBuffer);
709  if (Res != pi_result::PI_SUCCESS) {
710  throw sycl::exception(errc::invalid, "Failed to create PI command-buffer");
711  }
712 
713  Partition->MPiCommandBuffers[Device] = OutCommandBuffer;
714 
715  for (const auto &Node : Partition->MSchedule) {
716  // Empty nodes are not processed as other nodes, but only their
717  // dependencies are propagated in findRealDeps
718  if (Node->isEmpty())
719  continue;
720 
721  sycl::detail::CG::CGTYPE type = Node->MCGType;
722  // If the node is a kernel with no special requirements we can enqueue it
723  // directly.
724  if (type == sycl::detail::CG::Kernel &&
725  Node->MCommandGroup->getRequirements().size() +
726  static_cast<sycl::detail::CGExecKernel *>(
727  Node->MCommandGroup.get())
728  ->MStreams.size() ==
729  0) {
730  MPiSyncPoints[Node] =
731  enqueueNodeDirect(MContext, DeviceImpl, OutCommandBuffer, Node);
732  } else {
733  MPiSyncPoints[Node] =
734  enqueueNode(MContext, DeviceImpl, OutCommandBuffer, Node);
735  }
736 
737  // Append Node requirements to overall graph requirements
738  MRequirements.insert(MRequirements.end(),
739  Node->MCommandGroup->getRequirements().begin(),
740  Node->MCommandGroup->getRequirements().end());
741  // Also store the actual accessor to make sure they are kept alive when
742  // commands are submitted
743  MAccessors.insert(MAccessors.end(),
744  Node->MCommandGroup->getAccStorage().begin(),
745  Node->MCommandGroup->getAccStorage().end());
746  }
747 
748  Res =
750  OutCommandBuffer);
751  if (Res != pi_result::PI_SUCCESS) {
753  "Failed to finalize PI command-buffer");
754  }
755 }
756 
758  const std::shared_ptr<graph_impl> &GraphImpl,
759  const property_list &PropList)
760  : MSchedule(), MGraphImpl(GraphImpl), MPiSyncPoints(),
761  MDevice(GraphImpl->getDevice()), MContext(Context), MRequirements(),
762  MExecutionEvents(),
763  MIsUpdatable(PropList.has_property<property::graph::updatable>()) {
764 
765  // If the graph has been marked as updatable then check if the backend
766  // actually supports that. Devices supporting aspect::ext_oneapi_graph must
767  // have support for graph update.
768  if (MIsUpdatable) {
769  bool SupportsUpdate = MGraphImpl->getDevice().has(aspect::ext_oneapi_graph);
770  if (!SupportsUpdate) {
772  "Device does not support Command Graph update");
773  }
774  }
775  // Copy nodes from GraphImpl and merge any subgraph nodes into this graph.
776  duplicateNodes();
777 }
778 
780  const sycl::detail::PluginPtr &Plugin =
781  sycl::detail::getSyclObjImpl(MContext)->getPlugin();
782  MSchedule.clear();
783  // We need to wait on all command buffer executions before we can release
784  // them.
785  for (auto &Event : MExecutionEvents) {
786  Event->wait(Event);
787  }
788 
789  for (const auto &Partition : MPartitions) {
790  Partition->MSchedule.clear();
791  for (const auto &Iter : Partition->MPiCommandBuffers) {
792  if (auto CmdBuf = Iter.second; CmdBuf) {
793  pi_result Res = Plugin->call_nocheck<
795  (void)Res;
796  assert(Res == pi_result::PI_SUCCESS);
797  }
798  }
799  }
800 
801  for (auto &Iter : MCommandMap) {
802  if (auto Command = Iter.second; Command) {
803  pi_result Res = Plugin->call_nocheck<
805  (void)Res;
806  assert(Res == pi_result::PI_SUCCESS);
807  }
808  }
809 }
810 
812 exec_graph_impl::enqueue(const std::shared_ptr<sycl::detail::queue_impl> &Queue,
814  WriteLock Lock(MMutex);
815 
816  // Map of the partitions to their execution events
817  std::unordered_map<std::shared_ptr<partition>, sycl::detail::EventImplPtr>
818  PartitionsExecutionEvents;
819 
820  auto CreateNewEvent([&]() {
821  auto NewEvent = std::make_shared<sycl::detail::event_impl>(Queue);
822  NewEvent->setContextImpl(Queue->getContextImplPtr());
823  NewEvent->setStateIncomplete();
824  return NewEvent;
825  });
826 
828  std::vector<sycl::detail::EventImplPtr> BackupCGDataMEvents;
829  if (MPartitions.size() > 1) {
830  BackupCGDataMEvents = CGData.MEvents;
831  }
832  for (uint32_t currentPartitionsNum = 0;
833  currentPartitionsNum < MPartitions.size(); currentPartitionsNum++) {
834  auto CurrentPartition = MPartitions[currentPartitionsNum];
835  // restore initial MEvents to add only needed additional depenencies
836  if (currentPartitionsNum > 0) {
837  CGData.MEvents = BackupCGDataMEvents;
838  }
839 
840  for (auto const &DepPartition : CurrentPartition->MPredecessors) {
841  CGData.MEvents.push_back(PartitionsExecutionEvents[DepPartition]);
842  }
843 
844  auto CommandBuffer =
845  CurrentPartition->MPiCommandBuffers[Queue->get_device()];
846 
847  if (CommandBuffer) {
848  // if previous submissions are incompleted, we automatically
849  // add completion events of previous submissions as dependencies.
850  // With Level-Zero backend we cannot resubmit a command-buffer until the
851  // previous one has already completed.
852  // Indeed, since a command-list does not accept a list a dependencies at
853  // submission, we circumvent this lack by adding a barrier that waits on a
854  // specific event and then define the conditions to signal this event in
855  // another command-list. Consequently, if a second submission is
856  // performed, the signal conditions of this single event are redefined by
857  // this second submission. Thus, this can lead to an undefined behaviour
858  // and potential hangs. We have therefore to expliclty wait in the host
859  // for previous submission to complete before resubmitting the
860  // command-buffer for level-zero backend.
861  // TODO : add a check to release this constraint and allow multiple
862  // concurrent submissions if the exec_graph has been updated since the
863  // last submission.
864  for (std::vector<sycl::detail::EventImplPtr>::iterator It =
865  MExecutionEvents.begin();
866  It != MExecutionEvents.end();) {
867  auto Event = *It;
868  if (!Event->isCompleted()) {
869  if (Queue->get_device().get_backend() ==
870  sycl::backend::ext_oneapi_level_zero) {
871  Event->wait(Event);
872  } else {
873  auto &AttachedEventsList = Event->getPostCompleteEvents();
874  CGData.MEvents.reserve(AttachedEventsList.size() + 1);
875  CGData.MEvents.push_back(Event);
876  // Add events of the previous execution of all graph partitions.
877  for (auto &AttachedEvent : AttachedEventsList) {
878  CGData.MEvents.push_back(AttachedEvent);
879  }
880  }
881  ++It;
882  } else {
883  // Remove completed events
884  It = MExecutionEvents.erase(It);
885  }
886  }
887 
888  NewEvent = CreateNewEvent();
889  sycl::detail::pi::PiEvent *OutEvent = &NewEvent->getHandleRef();
890  // Merge requirements from the nodes into requirements (if any) from the
891  // handler.
892  CGData.MRequirements.insert(CGData.MRequirements.end(),
893  MRequirements.begin(), MRequirements.end());
894  CGData.MAccStorage.insert(CGData.MAccStorage.end(), MAccessors.begin(),
895  MAccessors.end());
896 
897  // If we have no requirements or dependent events for the command buffer,
898  // enqueue it directly
899  if (CGData.MRequirements.empty() && CGData.MEvents.empty()) {
900  if (NewEvent != nullptr)
901  NewEvent->setHostEnqueueTime();
902  pi_result Res =
903  Queue->getPlugin()
904  ->call_nocheck<
906  CommandBuffer, Queue->getHandleRef(), 0, nullptr, OutEvent);
907  if (Res == pi_result::PI_ERROR_INVALID_QUEUE_PROPERTIES) {
908  throw sycl::exception(
910  "Graphs cannot be submitted to a queue which uses "
911  "immediate command lists. Use "
912  "sycl::ext::intel::property::queue::no_immediate_"
913  "command_list to disable them.");
914  } else if (Res != pi_result::PI_SUCCESS) {
915  throw sycl::exception(
916  errc::event,
917  "Failed to enqueue event for command buffer submission");
918  }
919  } else {
920  std::unique_ptr<sycl::detail::CG> CommandGroup =
921  std::make_unique<sycl::detail::CGExecCommandBuffer>(
922  CommandBuffer, nullptr, std::move(CGData));
923 
924  NewEvent = sycl::detail::Scheduler::getInstance().addCG(
925  std::move(CommandGroup), Queue);
926  }
927  NewEvent->setEventFromSubmittedExecCommandBuffer(true);
928  } else if ((CurrentPartition->MSchedule.size() > 0) &&
929  (CurrentPartition->MSchedule.front()->MCGType ==
930  sycl::detail::CG::CGTYPE::CodeplayHostTask)) {
931  auto NodeImpl = CurrentPartition->MSchedule.front();
932  // Schedule host task
933  NodeImpl->MCommandGroup->getEvents().insert(
934  NodeImpl->MCommandGroup->getEvents().end(), CGData.MEvents.begin(),
935  CGData.MEvents.end());
936  // HostTask CG stores the Queue on which the task was submitted.
937  // In case of graph, this queue may differ from the actual execution
938  // queue. We therefore overload this Queue before submitting the task.
939  static_cast<sycl::detail::CGHostTask &>(*NodeImpl->MCommandGroup.get())
940  .MQueue = Queue;
941 
942  NewEvent = sycl::detail::Scheduler::getInstance().addCG(
943  NodeImpl->getCGCopy(), Queue);
944  } else {
945  std::vector<std::shared_ptr<sycl::detail::event_impl>> ScheduledEvents;
946  for (auto &NodeImpl : CurrentPartition->MSchedule) {
947  std::vector<sycl::detail::pi::PiEvent> RawEvents;
948 
949  // If the node has no requirements for accessors etc. then we skip the
950  // scheduler and enqueue directly.
951  if (NodeImpl->MCGType == sycl::detail::CG::Kernel &&
952  NodeImpl->MCommandGroup->getRequirements().size() +
953  static_cast<sycl::detail::CGExecKernel *>(
954  NodeImpl->MCommandGroup.get())
955  ->MStreams.size() ==
956  0) {
957  sycl::detail::CGExecKernel *CG =
958  static_cast<sycl::detail::CGExecKernel *>(
959  NodeImpl->MCommandGroup.get());
960  auto OutEvent = CreateNewEvent();
962  Queue, CG->MNDRDesc, CG->MArgs, CG->MKernelBundle,
963  CG->MSyclKernel, CG->MKernelName, RawEvents, OutEvent,
964  // TODO: Pass accessor mem allocations
965  nullptr,
966  // TODO: Extract from handler
967  PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT, CG->MKernelIsCooperative);
968  if (Res != pi_result::PI_SUCCESS) {
969  throw sycl::exception(
971  "Error during emulated graph command group submission.");
972  }
973  ScheduledEvents.push_back(NewEvent);
974  } else if (!NodeImpl->isEmpty()) {
975  // Empty nodes are node processed as other nodes, but only their
976  // dependencies are propagated in findRealDeps
977  sycl::detail::EventImplPtr EventImpl =
978  sycl::detail::Scheduler::getInstance().addCG(
979  NodeImpl->getCGCopy(), Queue);
980 
981  ScheduledEvents.push_back(EventImpl);
982  }
983  }
984  // Create an event which has all kernel events as dependencies
985  NewEvent = std::make_shared<sycl::detail::event_impl>(Queue);
986  NewEvent->setStateIncomplete();
987  NewEvent->getPreparedDepsEvents() = ScheduledEvents;
988  }
989  PartitionsExecutionEvents[CurrentPartition] = NewEvent;
990  }
991 
992  // Keep track of this execution event so we can make sure it's completed in
993  // the destructor.
994  MExecutionEvents.push_back(NewEvent);
995  // Attach events of previous partitions to ensure that when the returned event
996  // is complete all execution associated with the graph have been completed.
997  for (auto const &Elem : PartitionsExecutionEvents) {
998  if (Elem.second != NewEvent) {
999  NewEvent->attachEventToComplete(Elem.second);
1000  }
1001  }
1002  sycl::event QueueEvent =
1003  sycl::detail::createSyclObjFromImpl<sycl::event>(NewEvent);
1004  return QueueEvent;
1005 }
1006 
// Copies every node of the associated modifiable graph (MGraphImpl) into this
// executable graph's own MNodeStorage, rebuilds the edges between the copies,
// and replaces every subgraph node with the duplicated contents of its
// subgraph (splicing the subgraph's entry/exit nodes into the surrounding
// edge structure).
1007 void exec_graph_impl::duplicateNodes() {
1008  // Map of original modifiable nodes (keys) to new duplicated nodes (values)
1009  std::map<std::shared_ptr<node_impl>, std::shared_ptr<node_impl>> NodesMap;
1010 
1011  const std::vector<std::shared_ptr<node_impl>> &ModifiableNodes =
1012  MGraphImpl->MNodeStorage;
1013  std::deque<std::shared_ptr<node_impl>> NewNodes;
1014 
1015  for (size_t i = 0; i < ModifiableNodes.size(); i++) {
1016  auto OriginalNode = ModifiableNodes[i];
1017  std::shared_ptr<node_impl> NodeCopy =
1018  std::make_shared<node_impl>(*OriginalNode);
1019 
1020  // Associate the ID of the original node with the node copy for later quick
1021  // access
1022  MIDCache.insert(std::make_pair(OriginalNode->MID, NodeCopy));
1023 
1024  // Clear edges between nodes so that we can replace with new ones
1025  NodeCopy->MSuccessors.clear();
1026  NodeCopy->MPredecessors.clear();
1027  // Push the new node to the front of the stack
1028  NewNodes.push_back(NodeCopy);
1029  // Associate the new node with the old one for updating edges
1030  NodesMap.insert({OriginalNode, NodeCopy});
1031  }
1032 
1033  // Now that all nodes have been copied rebuild edges on new nodes. This must
1034  // be done as a separate step since successors may be out of order.
1035  for (size_t i = 0; i < ModifiableNodes.size(); i++) {
1036  auto OriginalNode = ModifiableNodes[i];
1037  auto NodeCopy = NewNodes[i];
1038  // Look through all the original node successors, find their copies and
1039  // register those as successors with the current copied node
1040  for (auto &NextNode : OriginalNode->MSuccessors) {
1041  auto Successor = NodesMap.at(NextNode.lock());
1042  // registerSuccessor presumably records NodeCopy as a predecessor of
1043  // Successor as well -- see node_impl.
1042  NodeCopy->registerSuccessor(Successor, NodeCopy);
1043  }
1044  }
1045 
1046  // Subgraph nodes need special handling, we extract all subgraph nodes and
1047  // merge them into the main node list
1048 
// Iterated in reverse so the erase+insert fix-up at the bottom of the loop
// can recompute a valid reverse_iterator and keep walking the remaining
// (earlier) nodes.
1049  for (auto NewNodeIt = NewNodes.rbegin(); NewNodeIt != NewNodes.rend();
1050  ++NewNodeIt) {
1051  auto NewNode = *NewNodeIt;
1052  if (NewNode->MNodeType != node_type::subgraph) {
1053  continue;
1054  }
1055  const std::vector<std::shared_ptr<node_impl>> &SubgraphNodes =
1056  NewNode->MSubGraphImpl->MNodeStorage;
1057  std::deque<std::shared_ptr<node_impl>> NewSubgraphNodes{};
1058 
1059  // Map of original subgraph nodes (keys) to new duplicated nodes (values)
1060  std::map<std::shared_ptr<node_impl>, std::shared_ptr<node_impl>>
1061  SubgraphNodesMap;
1062 
1063  // Copy subgraph nodes
1064  for (size_t i = 0; i < SubgraphNodes.size(); i++) {
1065  auto SubgraphNode = SubgraphNodes[i];
1066  auto NodeCopy = std::make_shared<node_impl>(*SubgraphNode);
1067  // Associate the ID of the original subgraph node with all extracted node
1068  // copies for future quick access.
1069  MIDCache.insert(std::make_pair(SubgraphNode->MID, NodeCopy));
1070 
1071  NewSubgraphNodes.push_back(NodeCopy);
1072  SubgraphNodesMap.insert({SubgraphNode, NodeCopy});
1073  NodeCopy->MSuccessors.clear();
1074  NodeCopy->MPredecessors.clear();
1075  }
1076 
1077  // Rebuild edges for new subgraph nodes
1078  for (size_t i = 0; i < SubgraphNodes.size(); i++) {
1079  auto SubgraphNode = SubgraphNodes[i];
1080  auto NodeCopy = NewSubgraphNodes[i];
1081 
1082  for (auto &NextNode : SubgraphNode->MSuccessors) {
1083  auto Successor = SubgraphNodesMap.at(NextNode.lock());
1084  NodeCopy->registerSuccessor(Successor, NodeCopy);
1085  }
1086  }
1087 
1088  // Collect input and output nodes for the subgraph
// Inputs: copied subgraph nodes with no predecessors; Outputs: those with
// no successors. These become the splice points into the outer graph.
1089  std::vector<std::shared_ptr<node_impl>> Inputs;
1090  std::vector<std::shared_ptr<node_impl>> Outputs;
1091  for (auto &NodeImpl : NewSubgraphNodes) {
1092  if (NodeImpl->MPredecessors.size() == 0) {
1093  Inputs.push_back(NodeImpl);
1094  }
1095  if (NodeImpl->MSuccessors.size() == 0) {
1096  Outputs.push_back(NodeImpl);
1097  }
1098  }
1099 
1100  // Update the predecessors and successors of the nodes which reference the
1101  // original subgraph node
1102 
1103  // Predecessors
1104  for (auto &PredNodeWeak : NewNode->MPredecessors) {
1105  auto PredNode = PredNodeWeak.lock();
1106  auto &Successors = PredNode->MSuccessors;
1107 
1108  // Remove the subgraph node from this nodes successors
1109  Successors.erase(std::remove_if(Successors.begin(), Successors.end(),
1110  [NewNode](auto WeakNode) {
1111  return WeakNode.lock() == NewNode;
1112  }),
1113  Successors.end());
1114 
1115  // Add all input nodes from the subgraph as successors for this node
1116  // instead
1117  for (auto &Input : Inputs) {
1118  PredNode->registerSuccessor(Input, PredNode);
1119  }
1120  }
1121 
1122  // Successors
1123  for (auto &SuccNodeWeak : NewNode->MSuccessors) {
1124  auto SuccNode = SuccNodeWeak.lock();
1125  auto &Predecessors = SuccNode->MPredecessors;
1126 
1127  // Remove the subgraph node from this nodes successors
1128  Predecessors.erase(std::remove_if(Predecessors.begin(),
1129  Predecessors.end(),
1130  [NewNode](auto WeakNode) {
1131  return WeakNode.lock() == NewNode;
1132  }),
1133  Predecessors.end());
1134 
1135  // Add all Output nodes from the subgraph as predecessors for this node
1136  // instead
1137  for (auto &Output : Outputs) {
1138  Output->registerSuccessor(SuccNode, Output);
1139  }
1140  }
1141 
1142  // Remove single subgraph node and add all new individual subgraph nodes
1143  // to the node storage in its place
1144  auto OldPositionIt =
1145  NewNodes.erase(std::find(NewNodes.begin(), NewNodes.end(), NewNode));
1146  // Also set the iterator to the newly added nodes so we can continue
1147  // iterating over all remaining nodes
1148  auto InsertIt = NewNodes.insert(OldPositionIt, NewSubgraphNodes.begin(),
1149  NewSubgraphNodes.end());
1150  // Since the new reverse_iterator will be at i - 1 we need to advance it
1151  // when constructing
1152  NewNodeIt = std::make_reverse_iterator(std::next(InsertIt));
1153  }
1154 
1155  // Store all the new nodes locally
1156  MNodeStorage.insert(MNodeStorage.begin(), NewNodes.begin(), NewNodes.end());
1157 }
1158 
// Whole-graph update: the incoming modifiable graph must match this
// executable graph in device, context, node count, per-node edge counts and
// per-node command-group types; each node of this graph is then updated from
// the corresponding node of GraphImpl via the vector overload below.
1159 void exec_graph_impl::update(std::shared_ptr<graph_impl> GraphImpl) {
1160 
1161  if (MDevice != GraphImpl->getDevice()) {
1162  throw sycl::exception(
1164  "Cannot update using a graph created with a different device.");
1165  }
1166  if (MContext != GraphImpl->getContext()) {
1167  throw sycl::exception(
1169  "Cannot update using a graph created with a different context.");
1170  }
1171 
// Topology check: node storages are compared index-by-index, so the two
// graphs are assumed to store their nodes in the same order.
1172  if (MNodeStorage.size() != GraphImpl->MNodeStorage.size()) {
1174  "Cannot update using a graph with a different "
1175  "topology. Mismatch found in the number of nodes.");
1176  } else {
1177  for (uint32_t i = 0; i < MNodeStorage.size(); ++i) {
1178  if (MNodeStorage[i]->MSuccessors.size() !=
1179  GraphImpl->MNodeStorage[i]->MSuccessors.size() ||
1180  MNodeStorage[i]->MPredecessors.size() !=
1181  GraphImpl->MNodeStorage[i]->MPredecessors.size()) {
1182  throw sycl::exception(
1184  "Cannot update using a graph with a different topology. Mismatch "
1185  "found in the number of edges.");
1186  }
1187 
1188  if (MNodeStorage[i]->MCGType != GraphImpl->MNodeStorage[i]->MCGType) {
1189  throw sycl::exception(
1191  "Cannot update using a graph with mismatched node types. Each pair "
1192  "of nodes being updated must have the same type");
1193  }
1194  }
1195  }
1196 
// Cache the mapping from the incoming graph's node IDs onto this graph's
// nodes so the per-node update path can locate its target exec node.
1197  for (uint32_t i = 0; i < MNodeStorage.size(); ++i) {
1198  MIDCache.insert(
1199  std::make_pair(GraphImpl->MNodeStorage[i]->MID, MNodeStorage[i]));
1200  }
1201 
1202  update(GraphImpl->MNodeStorage);
1203 }
1204 
1205 void exec_graph_impl::update(std::shared_ptr<node_impl> Node) {
1206  this->update(std::vector<std::shared_ptr<node_impl>>{Node});
1207 }
1208 
// Updates the given set of nodes in this executable graph. Requires the
// graph to have been finalized with property::updatable. Every node must
// already exist in this graph (via the MIDCache mapping) and must be a
// kernel node. Updates are routed through the scheduler when any node has
// accessor requirements or when previous executions are still in flight;
// otherwise updateImpl is invoked directly.
1210  const std::vector<std::shared_ptr<node_impl>> Nodes) {
1211 
1212  if (!MIsUpdatable) {
1214  "update() cannot be called on a executable graph "
1215  "which was not created with property::updatable");
1216  }
1217 
1218  // If there are any accessor requirements, we have to update through the
1219  // scheduler to ensure that any allocations have taken place before trying to
1220  // update.
1221  bool NeedScheduledUpdate = false;
1222  std::vector<sycl::detail::AccessorImplHost *> UpdateRequirements;
1223  // At worst we may have as many requirements as there are for the entire graph
1224  // for updating.
1225  UpdateRequirements.reserve(MRequirements.size());
1226  for (auto &Node : Nodes) {
1227  // Check if node(s) derived from this modifiable node exists in this graph
1228  if (MIDCache.count(Node->getID()) == 0) {
1229  throw sycl::exception(
1231  "Node passed to update() is not part of the graph.");
1232  }
1233  if (Node->MCGType != sycl::detail::CG::Kernel) {
1234  throw sycl::exception(errc::invalid, "Cannot update non-kernel nodes");
1235  }
1236 
1237  if (Node->MCommandGroup->getRequirements().size() == 0) {
1238  continue;
1239  }
1240  NeedScheduledUpdate = true;
1241 
1242  UpdateRequirements.insert(UpdateRequirements.end(),
1243  Node->MCommandGroup->getRequirements().begin(),
1244  Node->MCommandGroup->getRequirements().end());
1245  }
1246 
1247  // Clean up any execution events which have finished so we don't pass them to
1248  // the scheduler.
1249  for (auto It = MExecutionEvents.begin(); It != MExecutionEvents.end();) {
1250  if ((*It)->isCompleted()) {
1251  It = MExecutionEvents.erase(It);
1252  continue;
1253  }
1254  ++It;
1255  }
1256 
1257  // If we have previous execution events do the update through the scheduler to
1258  // ensure it is ordered correctly.
1259  NeedScheduledUpdate |= MExecutionEvents.size() > 0;
1260 
1261  if (NeedScheduledUpdate) {
// A throwaway queue on the graph's device/context used only so the
// scheduler has a queue to perform allocations against.
1262  auto AllocaQueue = std::make_shared<sycl::detail::queue_impl>(
1263  sycl::detail::getSyclObjImpl(MGraphImpl->getDevice()),
1264  sycl::detail::getSyclObjImpl(MGraphImpl->getContext()),
1266  // Don't need to care about the return event here because it is synchronous
1267  sycl::detail::Scheduler::getInstance().addCommandGraphUpdate(
1268  this, Nodes, AllocaQueue, UpdateRequirements, MExecutionEvents);
1269  } else {
1270  for (auto &Node : Nodes) {
1271  updateImpl(Node);
1272  }
1273  }
1274 
1275  // Rebuild cached requirements for this graph with updated nodes
1276  MRequirements.clear();
1277  for (auto &Node : MNodeStorage) {
1278  MRequirements.insert(MRequirements.end(),
1279  Node->MCommandGroup->getRequirements().begin(),
1280  Node->MCommandGroup->getRequirements().end());
1281  }
1282 }
1283 
// Applies the configuration of a single modifiable kernel node to the
// matching node of this executable graph, then pushes the new kernel launch
// parameters (args, ND-range, local size) to the backend command-buffer via
// the plugin's command-buffer update entry point.
1284 void exec_graph_impl::updateImpl(std::shared_ptr<node_impl> Node) {
1285  auto ContextImpl = sycl::detail::getSyclObjImpl(MContext);
1286  const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin();
1287  auto DeviceImpl = sycl::detail::getSyclObjImpl(MGraphImpl->getDevice());
1288 
1289  // Gather arg information from Node
// Node->MCommandGroup is cast unchecked to CGExecKernel: callers (see
// update()) have already rejected non-kernel nodes.
1290  auto &ExecCG =
1291  *(static_cast<sycl::detail::CGExecKernel *>(Node->MCommandGroup.get()))
1292  // Copy args because we may modify them
1293  std::vector<sycl::detail::ArgDesc> NodeArgs = ExecCG.getArguments();
1294  // Copy NDR desc since we need to modify it
1295  auto NDRDesc = ExecCG.MNDRDesc;
1296 
1297  pi_kernel PiKernel = nullptr;
1298  auto Kernel = ExecCG.MSyclKernel;
1299  auto KernelBundleImplPtr = ExecCG.MKernelBundle;
1300  std::shared_ptr<sycl::detail::kernel_impl> SyclKernelImpl = nullptr;
1301  const sycl::detail::KernelArgMask *EliminatedArgMask = nullptr;
1302 
1303  // Use kernel_bundle if available unless it is interop.
1304  // Interop bundles can't be used in the first branch, because the kernels
1305  // in interop kernel bundles (if any) do not have kernel_id
1306  // and can therefore not be looked up, but since they are self-contained
1307  // they can simply be launched directly.
1308  if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) {
1309  auto KernelName = ExecCG.MKernelName;
1310  kernel_id KernelID =
1311  sycl::detail::ProgramManager::getInstance().getSYCLKernelID(KernelName);
1312  kernel SyclKernel =
1313  KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr);
1314  SyclKernelImpl = sycl::detail::getSyclObjImpl(SyclKernel);
1315  PiKernel = SyclKernelImpl->getHandleRef();
1316  EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
1317  } else if (Kernel != nullptr) {
1318  PiKernel = Kernel->getHandleRef();
1319  EliminatedArgMask = Kernel->getKernelArgMask();
1320  } else {
1321  std::tie(PiKernel, std::ignore, EliminatedArgMask, std::ignore) =
1322  sycl::detail::ProgramManager::getInstance().getOrCreateKernel(
1323  ContextImpl, DeviceImpl, ExecCG.MKernelName);
1324  }
1325 
1326  // Remove eliminated args
// Arguments filtered by EliminatedArgMask are dropped; surviving args are
// re-indexed to NextTrueIndex so they line up with the compiled kernel.
1327  std::vector<sycl::detail::ArgDesc> MaskedArgs;
1328  MaskedArgs.reserve(NodeArgs.size());
1329 
1331  EliminatedArgMask, NodeArgs,
1332  [&MaskedArgs](sycl::detail::ArgDesc &Arg, int NextTrueIndex) {
1333  MaskedArgs.emplace_back(Arg.MType, Arg.MPtr, Arg.MSize, NextTrueIndex);
1334  });
1335 
1336  // Reverse kernel dims
1338 
// If the user supplied no local size, fall back to any compile-time
// required work-group size reported by the kernel (all-zero means none).
1339  size_t RequiredWGSize[3] = {0, 0, 0};
1340  size_t *LocalSize = nullptr;
1341 
1342  if (NDRDesc.LocalSize[0] != 0)
1343  LocalSize = &NDRDesc.LocalSize[0];
1344  else {
1346  PiKernel, DeviceImpl->getHandleRef(),
1347  PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, sizeof(RequiredWGSize),
1348  RequiredWGSize,
1349  /* param_value_size_ret = */ nullptr);
1350 
1351  const bool EnforcedLocalSize =
1352  (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
1353  RequiredWGSize[2] != 0);
1354  if (EnforcedLocalSize)
1355  LocalSize = RequiredWGSize;
1356  }
1357  // Create update descriptor
1358 
1359  // Storage for individual arg descriptors
1360  std::vector<pi_ext_command_buffer_update_memobj_arg_desc_t> MemobjDescs;
1361  std::vector<pi_ext_command_buffer_update_pointer_arg_desc_t> PtrDescs;
1362  std::vector<pi_ext_command_buffer_update_value_arg_desc_t> ValueDescs;
1363  MemobjDescs.reserve(MaskedArgs.size());
1364  PtrDescs.reserve(MaskedArgs.size());
1365  ValueDescs.reserve(MaskedArgs.size());
1366 
1368 
1369  // Collect arg descriptors and fill kernel launch descriptor
1371  for (size_t i = 0; i < MaskedArgs.size(); i++) {
1372  auto &NodeArg = MaskedArgs[i];
1373  switch (NodeArg.MType) {
1374  case kernel_param_kind_t::kind_pointer: {
1375  PtrDescs.push_back({static_cast<uint32_t>(NodeArg.MIndex), NodeArg.MPtr});
1376  } break;
1377  case kernel_param_kind_t::kind_std_layout: {
1378  ValueDescs.push_back({static_cast<uint32_t>(NodeArg.MIndex),
1379  static_cast<uint32_t>(NodeArg.MSize),
1380  NodeArg.MPtr});
1381  } break;
1382  case kernel_param_kind_t::kind_accessor: {
1384  static_cast<sycl::detail::Requirement *>(NodeArg.MPtr);
1385 
1386  pi_mem_obj_property MemObjData{};
1387 
1388  switch (Req->MAccessMode) {
1389  case access::mode::read: {
1390  MemObjData.mem_access = PI_ACCESS_READ_ONLY;
1391  break;
1392  }
1393  case access::mode::write:
1395  MemObjData.mem_access = PI_ACCESS_WRITE_ONLY;
1396  break;
1397  }
1398  default: {
1399  MemObjData.mem_access = PI_ACCESS_READ_WRITE;
1400  break;
1401  }
1402  }
1403  MemObjData.type = PI_KERNEL_ARG_MEM_OBJ_ACCESS;
// NOTE(review): &MemObjData points at a variable local to this case block,
// but the pointer is stored in MemobjDescs and consumed only by the plugin
// call at the end of the function -- it looks dangling by then. Confirm
// against the full source and, if real, hoist the storage out of the loop.
1404  MemobjDescs.push_back(pi_ext_command_buffer_update_memobj_arg_desc_t{
1405  static_cast<uint32_t>(NodeArg.MIndex), &MemObjData,
1406  static_cast<pi_mem>(Req->MData)});
1407 
1408  } break;
1409 
1410  default:
1411  break;
1412  }
1413  }
1414 
1415  UpdateDesc.num_mem_obj_args = MemobjDescs.size();
1416  UpdateDesc.mem_obj_arg_list = MemobjDescs.data();
1417  UpdateDesc.num_ptr_args = PtrDescs.size();
1418  UpdateDesc.ptr_arg_list = PtrDescs.data();
1419  UpdateDesc.num_value_args = ValueDescs.size();
1420  UpdateDesc.value_arg_list = ValueDescs.data();
1421 
1422  UpdateDesc.global_work_offset = &NDRDesc.GlobalOffset[0];
1423  UpdateDesc.global_work_size = &NDRDesc.GlobalSize[0];
1424  UpdateDesc.local_work_size = LocalSize;
1425  UpdateDesc.num_work_dim = NDRDesc.Dims;
1426 
1427  // Query the ID cache to find the equivalent exec node for the node passed to
1428  // this function.
1429  // TODO: Handle subgraphs or any other cases where multiple nodes may be
1430  // associated with a single key, once those node types are supported for
1431  // update.
1432  auto ExecNode = MIDCache.find(Node->MID);
1433  assert(ExecNode != MIDCache.end() && "Node ID was not found in ID cache");
1434 
1435  // Update ExecNode with new values from Node, in case we ever need to
1436  // rebuild the command buffers
1437  ExecNode->second->updateFromOtherNode(Node);
1438 
1440  MCommandMap[ExecNode->second];
1441  pi_result Res = Plugin->call_nocheck<
1443  Command, &UpdateDesc);
1444 
1445  if (Res != PI_SUCCESS) {
1446  throw sycl::exception(errc::invalid, "Error updating command_graph");
1447  }
1448 }
1449 
// modifiable_command_graph constructor: wraps a freshly created graph_impl
// for the explicitly supplied context and device.
1451  const sycl::context &SyclContext, const sycl::device &SyclDevice,
1452  const sycl::property_list &PropList)
1453  : impl(std::make_shared<detail::graph_impl>(SyclContext, SyclDevice,
1454  PropList)) {}
1455 
// Queue overload: the graph derives its context and device from the queue.
1457  const sycl::queue &SyclQueue, const sycl::property_list &PropList)
1458  : impl(std::make_shared<detail::graph_impl>(
1459  SyclQueue.get_context(), SyclQueue.get_device(), PropList)) {}
1460 
1461 node modifiable_command_graph::addImpl(const std::vector<node> &Deps) {
1462  impl->throwIfGraphRecordingQueue("Explicit API \"Add()\" function");
1463  std::vector<std::shared_ptr<detail::node_impl>> DepImpls;
1464  for (auto &D : Deps) {
1465  DepImpls.push_back(sycl::detail::getSyclObjImpl(D));
1466  }
1467 
1468  graph_impl::WriteLock Lock(impl->MMutex);
1469  std::shared_ptr<detail::node_impl> NodeImpl = impl->add(impl, DepImpls);
1470  return sycl::detail::createSyclObjFromImpl<node>(NodeImpl);
1471 }
1472 
// CGF overload of addImpl: adds a node whose command-group function will be
// processed through a handler, with the given dependency nodes.
1474  const std::vector<node> &Deps) {
1475  impl->throwIfGraphRecordingQueue("Explicit API \"Add()\" function");
1476  std::vector<std::shared_ptr<detail::node_impl>> DepImpls;
1477  for (auto &D : Deps) {
1478  DepImpls.push_back(sycl::detail::getSyclObjImpl(D));
1479  }
1480 
1481  graph_impl::WriteLock Lock(impl->MMutex);
1482  std::shared_ptr<detail::node_impl> NodeImpl =
1483  impl->add(impl, CGF, {}, DepImpls);
1484  return sycl::detail::createSyclObjFromImpl<node>(NodeImpl);
1485 }
1486 
// Makes every current exit node of the graph (node with no successors) a
// predecessor of the given node.
// NOTE(review): the initializer of DstImpl is not visible in this listing --
// presumably getSyclObjImpl(Node); confirm against the full source.
1488  // Find all exit nodes in the current graph and add them to the dependency
1489  // vector
1490  std::shared_ptr<detail::node_impl> DstImpl =
1492  graph_impl::WriteLock Lock(impl->MMutex);
1493  for (auto &NodeImpl : impl->MNodeStorage) {
1494  if ((NodeImpl->MSuccessors.size() == 0) && (NodeImpl != DstImpl)) {
1495  impl->makeEdge(NodeImpl, DstImpl);
1496  }
1497  }
1498 }
1499 
// Adds a dependency edge from the Src node to the Dest node under the
// graph's write lock.
1501  std::shared_ptr<detail::node_impl> SenderImpl =
1503  std::shared_ptr<detail::node_impl> ReceiverImpl =
1505 
1506  graph_impl::WriteLock Lock(impl->MMutex);
1507  impl->makeEdge(SenderImpl, ReceiverImpl);
1508 }
1509 
// Finalizes this modifiable graph into an executable command_graph built on
// the same graph_impl and context.
1512  // Graph is read and written in this scope so we lock
1513  // this graph with full privileges.
1514  graph_impl::WriteLock Lock(impl->MMutex);
1516  this->impl, this->impl->getContext(), PropList};
1517 }
1518 
// Puts RecordingQueue into recording mode so that its submissions are
// captured into this graph. Throws if the queue's context or device differ
// from the graph's, if the queue is in fusion mode, or if the queue is
// already recording to a different graph. PropList is currently unused.
1520  queue &RecordingQueue, const sycl::property_list &PropList) {
1521  std::ignore = PropList;
1522 
1523  auto QueueImpl = sycl::detail::getSyclObjImpl(RecordingQueue);
1524  assert(QueueImpl);
1525  if (QueueImpl->get_context() != impl->getContext()) {
1527  "begin_recording called for a queue whose context "
1528  "differs from the graph context.");
1529  }
1530  if (QueueImpl->get_device() != impl->getDevice()) {
1532  "begin_recording called for a queue whose device "
1533  "differs from the graph device.");
1534  }
1535 
1536  if (QueueImpl->is_in_fusion_mode()) {
1538  "SYCL queue in kernel in fusion mode "
1539  "can NOT be recorded.");
1540  }
1541 
// NOTE(review): the two checks below exactly duplicate the context/device
// validation performed above -- candidate for de-duplication.
1542  if (QueueImpl->get_context() != impl->getContext()) {
1544  "begin_recording called for a queue whose context "
1545  "differs from the graph context.");
1546  }
1547  if (QueueImpl->get_device() != impl->getDevice()) {
1549  "begin_recording called for a queue whose device "
1550  "differs from the graph device.");
1551  }
1552 
// Only attach the queue if it is not already recording somewhere; a queue
// found recording to another graph is an error.
1553  if (QueueImpl->getCommandGraph() == nullptr) {
1554  QueueImpl->setCommandGraph(impl);
1555  graph_impl::WriteLock Lock(impl->MMutex);
1556  impl->addQueue(QueueImpl);
1557  }
1558  if (QueueImpl->getCommandGraph() != impl) {
1560  "begin_recording called for a queue which is already "
1561  "recording to a different graph.");
1562  }
1563 }
1564 
// Convenience overload: begins recording on every queue in the list.
1566  const std::vector<queue> &RecordingQueues,
1567  const sycl::property_list &PropList) {
1568  for (queue Queue : RecordingQueues) {
1569  this->begin_recording(Queue, PropList);
1570  }
1571 }
1572 
// Ends recording on every queue currently recording to this graph.
1574  graph_impl::WriteLock Lock(impl->MMutex);
1575  impl->clearQueues();
1576 }
1577 
// Ends recording on a single queue, detaching it from this graph. Throws
// if the queue is still recording (i.e. it records to a different graph).
1579  auto QueueImpl = sycl::detail::getSyclObjImpl(RecordingQueue);
1580  if (QueueImpl && QueueImpl->getCommandGraph() == impl) {
1581  QueueImpl->setCommandGraph(nullptr);
1582  graph_impl::WriteLock Lock(impl->MMutex);
1583  impl->removeQueue(QueueImpl);
1584  }
// NOTE(review): QueueImpl is null-checked above but dereferenced
// unconditionally here -- potential null dereference if getSyclObjImpl can
// return null; confirm and make the two branches consistent.
1585  if (QueueImpl->getCommandGraph() != nullptr) {
1587  "end_recording called for a queue which is recording "
1588  "to a different graph.");
1589  }
1590 }
1591 
// Convenience overload: ends recording on every queue in the list.
1593  const std::vector<queue> &RecordingQueues) {
1594  for (queue Queue : RecordingQueues) {
1595  this->end_recording(Queue);
1596  }
1597 }
1598 
// Writes a DOT-formatted description of the graph to `path`. Only the
// ".dot" file extension is accepted; anything else throws.
1600  bool verbose) const {
1601  graph_impl::ReadLock Lock(impl->MMutex);
1602  if (path.substr(path.find_last_of(".") + 1) == "dot") {
1603  impl->printGraphAsDot(path, verbose);
1604  } else {
1605  throw sycl::exception(
1607  "DOT graph is the only format supported at the moment.");
1608  }
1609 }
1610 
1611 std::vector<node> modifiable_command_graph::get_nodes() const {
1612  return createNodesFromImpls(impl->MNodeStorage);
1613 }
1614 std::vector<node> modifiable_command_graph::get_root_nodes() const {
1615  auto &Roots = impl->MRoots;
1616  std::vector<std::weak_ptr<node_impl>> Impls{};
1617 
1618  std::copy(Roots.begin(), Roots.end(), std::back_inserter(Impls));
1619  return createNodesFromImpls(Impls);
1620 }
1621 
// executable_command_graph constructor: builds the exec_graph_impl for the
// given modifiable graph and context, then immediately finalizes it into a
// backend representation.
1623  const std::shared_ptr<detail::graph_impl> &Graph, const sycl::context &Ctx,
1624  const property_list &PropList)
1625  : impl(std::make_shared<detail::exec_graph_impl>(Ctx, Graph, PropList)) {
1626  finalizeImpl(); // Create backend representation for executable graph
1627 }
1628 
// Partitions the graph and creates a command buffer for every partition
// that does not consist of host tasks.
1630  impl->makePartitions();
1631 
1632  auto Device = impl->getGraphImpl()->getDevice();
1633  for (auto Partition : impl->getPartitions()) {
1634  if (!Partition->isHostTask()) {
1635  impl->createCommandBuffers(Device, Partition);
1636  }
1637  }
1638 }
1639 
// Updates this executable graph from a whole modifiable graph.
1642  impl->update(sycl::detail::getSyclObjImpl(Graph));
1643 }
1644 
// Updates a single node of this executable graph from a modifiable node.
1646  impl->update(sycl::detail::getSyclObjImpl(Node));
1647 }
1648 
1649 void executable_command_graph::update(const std::vector<node> &Nodes) {
1650  std::vector<std::shared_ptr<node_impl>> NodeImpls{};
1651  NodeImpls.reserve(Nodes.size());
1652  for (auto &Node : Nodes) {
1653  NodeImpls.push_back(sycl::detail::getSyclObjImpl(Node));
1654  }
1655 
1656  impl->update(NodeImpls);
1657 }
1658 
// dynamic_parameter_base constructor: creates the impl that holds the
// parameter's initial data and associates it with the given graph.
1660  command_graph<graph_state::modifiable> Graph, size_t ParamSize,
1661  const void *Data)
1662  : impl(std::make_shared<dynamic_parameter_impl>(
1663  sycl::detail::getSyclObjImpl(Graph), ParamSize, Data)) {}
1664 
// Forwards a new raw value for this dynamic parameter to the impl.
1665 void dynamic_parameter_base::updateValue(const void *NewValue, size_t Size) {
1666  impl->updateValue(NewValue, Size);
1667 }
1668 
// Forwards a new accessor for this dynamic parameter to the impl.
1670  const sycl::detail::AccessorBaseHost *Acc) {
1671  impl->updateAccessor(Acc);
1672 }
1673 
1674 } // namespace detail
1675 
1676 node_type node::get_type() const { return impl->MNodeType; }
1677 
1678 std::vector<node> node::get_predecessors() const {
1679  return detail::createNodesFromImpls(impl->MPredecessors);
1680 }
1681 
1682 std::vector<node> node::get_successors() const {
1683  return detail::createNodesFromImpls(impl->MSuccessors);
1684 }
1685 
// Looks up the graph node that was created for the queue submission which
// returned `nodeEvent` while the queue was recording to a graph.
1687  auto EventImpl = sycl::detail::getSyclObjImpl(nodeEvent);
1688  auto GraphImpl = EventImpl->getCommandGraph();
1689 
1690  return sycl::detail::createSyclObjFromImpl<node>(
1691  GraphImpl->getNodeForEvent(EventImpl));
1692 }
1693 
// Explicit specializations forwarding ND-range / range updates for the
// supported dimensionalities (1-3) to the node implementation.
1694 template <> void node::update_nd_range<1>(nd_range<1> NDRange) {
1695  impl->updateNDRange(NDRange);
1696 }
1697 template <> void node::update_nd_range<2>(nd_range<2> NDRange) {
1698  impl->updateNDRange(NDRange);
1699 }
1700 template <> void node::update_nd_range<3>(nd_range<3> NDRange) {
1701  impl->updateNDRange(NDRange);
1702 }
1703 template <> void node::update_range<1>(range<1> Range) {
1704  impl->updateRange(Range);
1705 }
1706 template <> void node::update_range<2>(range<2> Range) {
1707  impl->updateRange(Range);
1708 }
1709 template <> void node::update_range<3>(range<3> Range) {
1710  impl->updateRange(Range);
1711 }
1712 } // namespace experimental
1713 } // namespace oneapi
1714 } // namespace ext
1715 } // namespace _V1
1716 } // namespace sycl
The context class represents a SYCL context on which kernel functions may be executed.
Definition: context.hpp:50
CGTYPE
Type of the command group.
Definition: cg.hpp:56
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Definition: device.hpp:64
An event object can be used to synchronize memory transfers, enqueues of kernels and signaling barriers.
Definition: event.hpp:44
void updateAccessor(const sycl::detail::AccessorBaseHost *Acc)
dynamic_parameter_base(sycl::ext::oneapi::experimental::command_graph< graph_state::modifiable > Graph, size_t ParamSize, const void *Data)
std::shared_ptr< dynamic_parameter_impl > impl
Definition: graph.hpp:433
Class representing the implementation of command_graph<executable>.
exec_graph_impl(sycl::context Context, const std::shared_ptr< graph_impl > &GraphImpl, const property_list &PropList)
Constructor.
Definition: graph_impl.cpp:757
void createCommandBuffers(sycl::device Device, std::shared_ptr< partition > &Partition)
Turns the internal graph representation into UR command-buffers for a device.
Definition: graph_impl.cpp:696
void update(std::shared_ptr< graph_impl > GraphImpl)
void makePartitions()
Partition the graph nodes and put the partition in MPartitions.
Definition: graph_impl.cpp:183
void updateImpl(std::shared_ptr< node_impl > NodeImpl)
std::shared_mutex MMutex
Protects all the fields that can be changed by class' methods.
sycl::event enqueue(const std::shared_ptr< sycl::detail::queue_impl > &Queue, sycl::detail::CG::StorageInitHelper CGData)
Called by handler::ext_oneapi_command_graph() to schedule graph for execution.
Definition: graph_impl.cpp:812
void update(const command_graph< graph_state::modifiable > &Graph)
Update the inputs & output of the graph.
executable_command_graph()=delete
An executable command-graph is not user constructable.
void finalizeImpl()
Creates a backend representation of the graph in impl member variable.
Implementation details of command_graph<modifiable>.
Definition: graph_impl.hpp:794
std::vector< std::shared_ptr< node_impl > > MNodeStorage
Storage for all nodes contained within a graph.
Definition: graph_impl.hpp:959
void removeRoot(const std::shared_ptr< node_impl > &Root)
Remove node from list of root nodes.
Definition: graph_impl.cpp:343
std::unique_lock< std::shared_mutex > WriteLock
Definition: graph_impl.hpp:797
void makeEdge(std::shared_ptr< node_impl > Src, std::shared_ptr< node_impl > Dest)
Make an edge between two nodes in the graph.
Definition: graph_impl.cpp:560
std::vector< sycl::detail::EventImplPtr > getExitNodesEvents()
Traverse the graph recursively to get the events associated with the output nodes of this graph.
Definition: graph_impl.cpp:611
void throwIfGraphRecordingQueue(const std::string ExceptionMsg) const
Throws an invalid exception if this function is called while a queue is recording commands to the graph.
std::shared_ptr< sycl::detail::event_impl > getEventForNode(std::shared_ptr< node_impl > NodeImpl) const
Find the sycl event associated with a node.
Definition: graph_impl.hpp:911
std::set< std::weak_ptr< node_impl >, std::owner_less< std::weak_ptr< node_impl > > > MRoots
List of root nodes.
Definition: graph_impl.hpp:953
bool clearQueues()
Remove all queues which are recording to this graph, also sets all queues cleared back to the executing state.
Definition: graph_impl.cpp:506
std::shared_ptr< node_impl > add(node_type NodeType, std::unique_ptr< sycl::detail::CG > CommandGroup, const std::vector< std::shared_ptr< node_impl >> &Dep={})
Create a kernel node in the graph.
Definition: graph_impl.cpp:433
void addEventForNode(std::shared_ptr< graph_impl > GraphImpl, std::shared_ptr< sycl::detail::event_impl > EventImpl, std::shared_ptr< node_impl > NodeImpl)
Associate a sycl event with a node in the graph.
Definition: graph_impl.hpp:899
std::shared_lock< std::shared_mutex > ReadLock
Definition: graph_impl.hpp:796
command_graph< graph_state::executable > finalize(const property_list &PropList={}) const
Finalize modifiable graph into an executable graph.
void addGraphLeafDependencies(node Node)
Adds all graph leaves as dependencies.
void begin_recording(queue &RecordingQueue, const property_list &PropList={})
Change the state of a queue to be recording and associate this graph with it.
std::vector< node > get_nodes() const
Get a list of all nodes contained in this graph.
void end_recording()
Set all queues currently recording to this graph to the executing state.
void print_graph(const std::string path, bool verbose=false) const
Synchronous operation that writes a DOT formatted description of the graph to the provided path.
node addImpl(std::function< void(handler &)> CGF, const std::vector< node > &Dep)
Template-less implementation of add() for CGF nodes.
void make_edge(node &Src, node &Dest)
Add a dependency between two nodes.
modifiable_command_graph(const context &SyclContext, const device &SyclDevice, const property_list &PropList={})
Constructor.
std::vector< node > get_root_nodes() const
Get a list of all root nodes (nodes without dependencies) in this graph.
std::set< std::weak_ptr< node_impl >, std::owner_less< std::weak_ptr< node_impl > > > MRoots
List of root nodes.
Definition: graph_impl.hpp:774
std::list< std::shared_ptr< node_impl > > MSchedule
Execution schedule of nodes in the graph.
Definition: graph_impl.hpp:776
Class representing a node in the graph, returned by command_graph::add().
Definition: graph.hpp:105
node_type get_type() const
Get the type of command associated with this node.
static node get_node_from_event(event nodeEvent)
Get the node associated with a SYCL event returned from a queue recording submission.
std::vector< node > get_successors() const
Get a list of all nodes which depend on this node.
std::vector< node > get_predecessors() const
Get a list of all the node dependencies of this node.
Command group handler class.
Definition: handler.hpp:458
Objects of the class identify a kernel in some kernel_bundle related APIs.
Provides an abstraction of a SYCL kernel.
Definition: kernel.hpp:77
Defines the iteration domain of both the work-groups and the overall dispatch.
Definition: nd_range.hpp:22
Objects of the property_list class are containers for the SYCL properties.
Encapsulates a single SYCL queue which schedules kernels on a SYCL device.
Definition: queue.hpp:111
::pi_kernel PiKernel
Definition: pi.hpp:138
::pi_ext_sync_point PiExtSyncPoint
Definition: pi.hpp:156
::pi_ext_command_buffer_command PiExtCommandBufferCommand
Definition: pi.hpp:159
std::vector< bool > KernelArgMask
void ReverseRangeDimensionsForKernel(NDRDescT &NDR)
Definition: commands.cpp:2262
decltype(Obj::impl) getSyclObjImpl(const Obj &SyclObject)
Definition: impl_utils.hpp:30
std::shared_ptr< event_impl > EventImplPtr
Definition: cg.hpp:43
std::shared_ptr< plugin > PluginPtr
Definition: pi.hpp:48
AccessorImplHost Requirement
std::shared_ptr< device_impl > DeviceImplPtr
pi_int32 enqueueImpKernel(const QueueImplPtr &Queue, NDRDescT &NDRDesc, std::vector< ArgDesc > &Args, const std::shared_ptr< detail::kernel_bundle_impl > &KernelBundleImplPtr, const std::shared_ptr< detail::kernel_impl > &MSyclKernel, const std::string &KernelName, std::vector< sycl::detail::pi::PiEvent > &RawEvents, const detail::EventImplPtr &OutEventImpl, const std::function< void *(Requirement *Req)> &getMemAllocationFunc, sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig, const bool KernelIsCooperative)
Definition: commands.cpp:2559
std::shared_ptr< detail::kernel_bundle_impl > KernelBundleImplPtr
void applyFuncOnFilteredArgs(const KernelArgMask *EliminatedArgMask, std::vector< ArgDesc > &Args, std::function< void(detail::ArgDesc &Arg, int NextTrueIndex)> Func)
Definition: commands.cpp:111
auto tie(Ts &...Args)
Definition: tuple.hpp:39
pi_int32 enqueueImpCommandBufferKernel(context Ctx, DeviceImplPtr DeviceImpl, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, const CGExecKernel &CommandGroup, std::vector< sycl::detail::pi::PiExtSyncPoint > &SyncPoints, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint, sycl::detail::pi::PiExtCommandBufferCommand *OutCommand, const std::function< void *(Requirement *Req)> &getMemAllocationFunc)
Definition: commands.cpp:2455
node_type getNodeTypeFromCG(sycl::detail::CG::CGTYPE CGType)
Definition: graph_impl.hpp:41
void copy(handler &CGH, const T *Src, T *Dest, size_t Count)
static constexpr bool has_property()
std::function< void(sycl::exception_list)> async_handler
std::error_code make_error_code(sycl::errc E) noexcept
Constructs an error code using e and sycl_category()
Definition: exception.cpp:91
Definition: access.hpp:18
static device_ext & get_device(unsigned int id)
Util function to get a device by id.
Definition: device.hpp:777
@ PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT
Definition: pi.h:810
int32_t pi_int32
Definition: pi.h:212
_pi_result
Definition: pi.h:224
@ PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE
Definition: pi.h:520
pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer, pi_queue queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
API to submit the command-buffer to queue for execution, returns an error if the command-buffer is not finalized or the queue is invalid.
Definition: pi_cuda.cpp:1213
pi_result piextCommandBufferFinalize(pi_ext_command_buffer command_buffer)
API to stop command-buffer recording such that no more commands can be appended, and makes the command-buffer ready for enqueue.
Definition: pi_cuda.cpp:1083
pi_result piextCommandBufferRelease(pi_ext_command_buffer command_buffer)
API to decrement the reference count of the command-buffer.
Definition: pi_cuda.cpp:1079
@ PI_ACCESS_READ_WRITE
Definition: pi.h:1904
@ PI_ACCESS_READ_ONLY
Definition: pi.h:1906
@ PI_ACCESS_WRITE_ONLY
Definition: pi.h:1905
pi_result piKernelGetGroupInfo(pi_kernel kernel, pi_device device, pi_kernel_group_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_cuda.cpp:508
pi_result piextCommandBufferUpdateKernelLaunch(pi_ext_command_buffer_command command, pi_ext_command_buffer_update_kernel_launch_desc *desc)
API to update a kernel launch command inside of a command-buffer.
Definition: pi_cuda.cpp:1222
@ PI_EXT_STRUCTURE_TYPE_COMMAND_BUFFER_DESC
Definition: pi.h:2355
pi_result piextCommandBufferReleaseCommand(pi_ext_command_buffer_command command)
API to decrement the reference count of a command-buffer command.
Definition: pi_cuda.cpp:1234
@ PI_KERNEL_ARG_MEM_OBJ_ACCESS
Definition: pi.h:1912
pi_result piextCommandBufferCreate(pi_context context, pi_device device, const pi_ext_command_buffer_desc *desc, pi_ext_command_buffer *ret_command_buffer)
API to create a command-buffer.
Definition: pi_cuda.cpp:1068
pi_ext_command_buffer_update_pointer_arg_desc_t * ptr_arg_list
Definition: pi.h:2389
pi_ext_command_buffer_update_memobj_arg_desc_t * mem_obj_arg_list
Definition: pi.h:2388
pi_ext_command_buffer_update_value_arg_desc_t * value_arg_list
Definition: pi.h:2390
std::vector< detail::AccessorImplPtr > MAccStorage
Storage for accessors.
Definition: cg.hpp:101
std::vector< detail::EventImplPtr > MEvents
List of events that order the execution of this CG.
Definition: cg.hpp:109
std::vector< AccessorImplHost * > MRequirements
List of requirements that specify which memory is needed for the command group to be executed.
Definition: cg.hpp:107