DPC++ Runtime
Runtime libraries for oneAPI DPC++
commands.cpp
//===----------- commands.cpp - SYCL commands -------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <detail/context_impl.hpp>
#include <detail/event_impl.hpp>
#include <detail/kernel_impl.hpp>
#include <detail/kernel_info.hpp>
#include <detail/memory_manager.hpp>
#include <detail/program_impl.hpp>
#include <detail/queue_impl.hpp>
#include <detail/sampler_impl.hpp>
#include <detail/scheduler/commands.hpp>
#include <detail/scheduler/scheduler.hpp>
#include <detail/stream_impl.hpp>
#include <detail/xpti_registry.hpp>
#include <sycl/access/access.hpp>
#include <sycl/backend_types.hpp>
#include <sycl/detail/cg_types.hpp>
#include <sycl/sampler.hpp>

#include <algorithm>
#include <cassert>
#include <optional>
#include <string>
#include <vector>

#ifdef __has_include
#if __has_include(<cxxabi.h>)
#define __SYCL_ENABLE_GNU_DEMANGLING
#include <cstdlib>
#include <cxxabi.h>
#include <memory>
#endif
#endif

#ifdef XPTI_ENABLE_INSTRUMENTATION
#include "xpti/xpti_trace_framework.hpp"
#include <detail/xpti_registry.hpp>
#endif

namespace sycl {
inline namespace _V1 {
namespace detail {

#ifdef XPTI_ENABLE_INSTRUMENTATION
// Global graph for the application
extern xpti::trace_event_data_t *GSYCLGraphEvent;

bool CurrentCodeLocationValid() {
  detail::tls_code_loc_t Tls;
  auto CodeLoc = Tls.query();
  auto FileName = CodeLoc.fileName();
  auto FunctionName = CodeLoc.functionName();
  return (FileName && FileName[0] != '\0') ||
         (FunctionName && FunctionName[0] != '\0');
}

void emitInstrumentationGeneral(uint32_t StreamID, uint64_t InstanceID,
                                xpti_td *TraceEvent, uint16_t Type,
                                const void *Addr) {
  if (!(xptiCheckTraceEnabled(StreamID, Type) && TraceEvent))
    return;
  // Trace event notifier that emits a Type event
  xptiNotifySubscribers(StreamID, Type, detail::GSYCLGraphEvent,
                        static_cast<xpti_td *>(TraceEvent), InstanceID, Addr);
}
#endif

#ifdef __SYCL_ENABLE_GNU_DEMANGLING
struct DemangleHandle {
  char *p;
  DemangleHandle(char *ptr) : p(ptr) {}

  DemangleHandle(const DemangleHandle &) = delete;
  DemangleHandle &operator=(const DemangleHandle &) = delete;

  ~DemangleHandle() { std::free(p); }
};
static std::string demangleKernelName(std::string Name) {
  int Status = -1; // some arbitrary value to eliminate the compiler warning
  DemangleHandle result(abi::__cxa_demangle(Name.c_str(), NULL, NULL, &Status));
  return (Status == 0) ? result.p : Name;
}
#else
static std::string demangleKernelName(std::string Name) { return Name; }
#endif
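
// Illustrative sketch (not part of the original file): demangleKernelName
// turns a mangled C++ symbol into a readable name when <cxxabi.h> is
// available, and returns the input unchanged otherwise. For example:
//
//   demangleKernelName("_ZTS8MyKernel"); // -> "typeinfo name for MyKernel"
//   demangleKernelName("not_a_symbol");  // -> "not_a_symbol" (Status != 0)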

static std::string deviceToString(device Device) {
  if (getSyclObjImpl(Device)->is_host())
    return "HOST";
  else if (Device.is_cpu())
    return "CPU";
  else if (Device.is_gpu())
    return "GPU";
  else if (Device.is_accelerator())
    return "ACCELERATOR";
  else
    return "UNKNOWN";
}

void applyFuncOnFilteredArgs(
    const KernelArgMask *EliminatedArgMask, std::vector<ArgDesc> &Args,
    std::function<void(detail::ArgDesc &Arg, int NextTrueIndex)> Func) {
  if (!EliminatedArgMask) {
    for (ArgDesc &Arg : Args) {
      Func(Arg, Arg.MIndex);
    }
  } else {
    // TODO this is not necessary as long as we can guarantee that the
    // arguments are already sorted (e.g. handle the sorting in handler
    // if necessary due to set_arg(...) usage).
    std::sort(Args.begin(), Args.end(), [](const ArgDesc &A, const ArgDesc &B) {
      return A.MIndex < B.MIndex;
    });
    int LastIndex = -1;
    size_t NextTrueIndex = 0;

    for (ArgDesc &Arg : Args) {
      // Handle potential gaps in set arguments (e.g. if some of them are
      // set on the user side).
      for (int Idx = LastIndex + 1; Idx < Arg.MIndex; ++Idx)
        if (!(*EliminatedArgMask)[Idx])
          ++NextTrueIndex;
      LastIndex = Arg.MIndex;

      if ((*EliminatedArgMask)[Arg.MIndex])
        continue;

      Func(Arg, NextTrueIndex);
      ++NextTrueIndex;
    }
  }
}
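
// Illustrative sketch (not part of the original file): as the loop above is
// written, a true entry in the mask means the argument was eliminated from
// the kernel. With a mask of {true, false, true, false}, the arguments with
// MIndex 1 and 3 survive and Func receives them with the compacted indices
// 0 and 1 respectively.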

#ifdef XPTI_ENABLE_INSTRUMENTATION
static size_t deviceToID(const device &Device) {
  if (getSyclObjImpl(Device)->is_host())
    return 0;
  else
    return reinterpret_cast<size_t>(getSyclObjImpl(Device)->getHandleRef());
}
#endif

static std::string accessModeToString(access::mode Mode) {
  switch (Mode) {
  case access::mode::read:
    return "read";
  case access::mode::write:
    return "write";
  case access::mode::read_write:
    return "read_write";
  case access::mode::discard_write:
    return "discard_write";
  case access::mode::discard_read_write:
    return "discard_read_write";
  default:
    return "unknown";
  }
}

#ifdef XPTI_ENABLE_INSTRUMENTATION
// Using the command group type to create node types for the asynchronous task
// graph modeling
static std::string commandToNodeType(Command::CommandType Type) {
  switch (Type) {
  case Command::CommandType::RUN_CG:
    return "command_group_node";
  case Command::CommandType::COPY_MEMORY:
    return "memory_transfer_node";
  case Command::CommandType::ALLOCA:
    return "memory_allocation_node";
  case Command::CommandType::ALLOCA_SUB_BUF:
    return "sub_buffer_creation_node";
  case Command::CommandType::RELEASE:
    return "memory_deallocation_node";
  case Command::CommandType::MAP_MEM_OBJ:
    return "memory_transfer_node";
  case Command::CommandType::UNMAP_MEM_OBJ:
    return "memory_transfer_node";
  case Command::CommandType::UPDATE_REQUIREMENT:
    return "host_acc_create_buffer_lock_node";
  case Command::CommandType::EMPTY_TASK:
    return "host_acc_destroy_buffer_release_node";
  case Command::CommandType::FUSION:
    return "kernel_fusion_placeholder_node";
  default:
    return "unknown_node";
  }
}

// The names being generated and the strings used here are subject to change
// to something more meaningful for end-users, as this will be visible in
// analysis tools that subscribe to this data
static std::string commandToName(Command::CommandType Type) {
  switch (Type) {
  case Command::CommandType::RUN_CG:
    return "Command Group Action";
  case Command::CommandType::COPY_MEMORY:
    return "Memory Transfer (Copy)";
  case Command::CommandType::ALLOCA:
    return "Memory Allocation";
  case Command::CommandType::ALLOCA_SUB_BUF:
    return "Sub Buffer Creation";
  case Command::CommandType::RELEASE:
    return "Memory Deallocation";
  case Command::CommandType::MAP_MEM_OBJ:
    return "Memory Transfer (Map)";
  case Command::CommandType::UNMAP_MEM_OBJ:
    return "Memory Transfer (Unmap)";
  case Command::CommandType::UPDATE_REQUIREMENT:
    return "Host Accessor Creation/Buffer Lock";
  case Command::CommandType::EMPTY_TASK:
    return "Host Accessor Destruction/Buffer Lock Release";
  case Command::CommandType::FUSION:
    return "Kernel Fusion Placeholder";
  default:
    return "Unknown Action";
  }
}
#endif

std::vector<sycl::detail::pi::PiEvent>
Command::getPiEvents(const std::vector<EventImplPtr> &EventImpls) const {
  std::vector<sycl::detail::pi::PiEvent> RetPiEvents;
  for (auto &EventImpl : EventImpls) {
    if (EventImpl->getHandleRef() == nullptr)
      continue;

    // Do not add redundant event dependencies for in-order queues.
    // At this stage the dependency is definitely a PI task, but we need to
    // check whether the current command is a host task. In that case we
    // should not skip the PI event, because different task types on an
    // in-order queue use different synchronization mechanisms.
    const QueueImplPtr &WorkerQueue = getWorkerQueue();
    // MWorkerQueue in a command is never null, so the check
    // EventImpl->getWorkerQueue() != nullptr is implicit.
    if (EventImpl->getWorkerQueue() == WorkerQueue &&
        WorkerQueue->isInOrder() && !isHostTask())
      continue;

    RetPiEvents.push_back(EventImpl->getHandleRef());
  }

  return RetPiEvents;
}
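
// Illustrative sketch (not part of the original file): for two kernels
// submitted back-to-back to the same in-order queue, the second kernel's
// dependency on the first is already implied by the queue ordering, so no
// PI event for the first kernel is added to the wait list returned here.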

// This function is implemented (duplicating getPiEvents a lot) as a
// short-term solution for the issue that a barrier with a wait list could
// not handle empty PI event handles when a kernel is enqueued on host task
// completion.
std::vector<sycl::detail::pi::PiEvent> Command::getPiEventsBlocking(
    const std::vector<EventImplPtr> &EventImpls) const {
  std::vector<sycl::detail::pi::PiEvent> RetPiEvents;
  for (auto &EventImpl : EventImpls) {
    // Throwaway events created with the empty constructor will not have a
    // context (it is set lazily); calling getContextImpl() would set that
    // context, which we wish to avoid as it is expensive.
    // Skip host task and NOP events also.
    if (!EventImpl->isContextInitialized() || EventImpl->is_host() ||
        EventImpl->isNOP())
      continue;
    // In this path a nullptr native event means that the command has not been
    // enqueued. It may happen if async enqueue in a host task is involved.
    if (EventImpl->getHandleRef() == nullptr) {
      if (!EventImpl->getCommand() ||
          !static_cast<Command *>(EventImpl->getCommand())->producesPiEvent())
        continue;
      std::vector<Command *> AuxCmds;
      Scheduler::getInstance().enqueueCommandForCG(EventImpl, AuxCmds,
                                                   BLOCKING);
    }
    // Do not add redundant event dependencies for in-order queues.
    // At this stage the dependency is definitely a PI task, but we need to
    // check whether the current command is a host task. In that case we
    // should not skip the PI event, because different task types on an
    // in-order queue use different synchronization mechanisms.
    const QueueImplPtr &WorkerQueue = getWorkerQueue();
    // MWorkerQueue in a command is never null, so the check
    // EventImpl->getWorkerQueue() != nullptr is implicit.
    if (EventImpl->getWorkerQueue() == WorkerQueue &&
        WorkerQueue->isInOrder() && !isHostTask())
      continue;

    RetPiEvents.push_back(EventImpl->getHandleRef());
  }

  return RetPiEvents;
}

bool Command::isHostTask() const {
  return (MType == CommandType::RUN_CG) /* host task has this type also */ &&
         ((static_cast<const ExecCGCommand *>(this))->getCG().getType() ==
          CG::CGTYPE::CodeplayHostTask);
}

bool Command::isFusable() const {
  if ((MType != CommandType::RUN_CG)) {
    return false;
  }
  const auto &CG = (static_cast<const ExecCGCommand &>(*this)).getCG();
  return (CG.getType() == CG::CGTYPE::Kernel) &&
         (!static_cast<const CGExecKernel &>(CG).MKernelIsCooperative);
}

static void flushCrossQueueDeps(const std::vector<EventImplPtr> &EventImpls,
                                const QueueImplPtr &Queue) {
  for (auto &EventImpl : EventImpls) {
    EventImpl->flushIfNeeded(Queue);
  }
}

class DispatchHostTask {
  ExecCGCommand *MThisCmd;
  std::vector<interop_handle::ReqToMem> MReqToMem;

  pi_result waitForEvents() const {
    std::map<const PluginPtr, std::vector<EventImplPtr>>
        RequiredEventsPerPlugin;

    for (const EventImplPtr &Event : MThisCmd->MPreparedDepsEvents) {
      const PluginPtr &Plugin = Event->getPlugin();
      RequiredEventsPerPlugin[Plugin].push_back(Event);
    }

    // wait for dependency device events
    // FIXME The current implementation of waiting for events will make the
    // thread 'sleep' until all dependency events are complete. We need a more
    // sophisticated waiting mechanism that allows this thread to be utilized
    // for other available jobs and resumed once all required events are ready.
    for (auto &PluginWithEvents : RequiredEventsPerPlugin) {
      std::vector<sycl::detail::pi::PiEvent> RawEvents =
          MThisCmd->getPiEvents(PluginWithEvents.second);
      try {
        PluginWithEvents.first->call<PiApiKind::piEventsWait>(RawEvents.size(),
                                                              RawEvents.data());
      } catch (const sycl::exception &E) {
        CGHostTask &HostTask = static_cast<CGHostTask &>(MThisCmd->getCG());
        HostTask.MQueue->reportAsyncException(std::current_exception());
        return (pi_result)E.get_cl_code();
      } catch (...) {
        CGHostTask &HostTask = static_cast<CGHostTask &>(MThisCmd->getCG());
        HostTask.MQueue->reportAsyncException(std::current_exception());
        return PI_ERROR_UNKNOWN;
      }
    }

    // Wait for dependency host events.
    // Host events can't throw exceptions, so don't try to catch them.
    for (const EventImplPtr &Event : MThisCmd->MPreparedHostDepsEvents) {
      Event->waitInternal();
    }

    return PI_SUCCESS;
  }

public:
  DispatchHostTask(ExecCGCommand *ThisCmd,
                   std::vector<interop_handle::ReqToMem> ReqToMem)
      : MThisCmd{ThisCmd}, MReqToMem(std::move(ReqToMem)) {}

  void operator()() const {
    assert(MThisCmd->getCG().getType() == CG::CGTYPE::CodeplayHostTask);

    CGHostTask &HostTask = static_cast<CGHostTask &>(MThisCmd->getCG());

#ifdef XPTI_ENABLE_INSTRUMENTATION
    // A host task is executed asynchronously in a separate thread, which does
    // not allow the use of code location data stored in TLS. So we keep the
    // submission code location as a Command field and put it into TLS here so
    // that a thrown exception can query and report it.
    std::unique_ptr<detail::tls_code_loc_t> AsyncCodeLocationPtr;
    if (xptiTraceEnabled() && !CurrentCodeLocationValid()) {
      AsyncCodeLocationPtr.reset(
          new detail::tls_code_loc_t(MThisCmd->MSubmissionCodeLocation));
    }
#endif

    pi_result WaitResult = waitForEvents();
    if (WaitResult != PI_SUCCESS) {
      std::exception_ptr EPtr = std::make_exception_ptr(sycl::runtime_error(
          std::string("Couldn't wait for host-task's dependencies"),
          WaitResult));
      HostTask.MQueue->reportAsyncException(EPtr);
      // reset host-task's lambda and quit
      HostTask.MHostTask.reset();
      Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd);
      return;
    }

    try {
      // we're ready to call the user-defined lambda now
      if (HostTask.MHostTask->isInteropTask()) {
        interop_handle IH{MReqToMem, HostTask.MQueue,
                          HostTask.MQueue->getDeviceImplPtr(),
                          HostTask.MQueue->getContextImplPtr()};

        HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo(), IH);
      } else
        HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo());
    } catch (...) {
      auto CurrentException = std::current_exception();
#ifdef XPTI_ENABLE_INSTRUMENTATION
      // sycl::exception emits tracing of the message with code location if
      // available. For other types of exception we need to explicitly trigger
      // tracing by calling TraceEventXPTI.
      if (xptiTraceEnabled()) {
        try {
          rethrow_exception(CurrentException);
        } catch (const sycl::exception &) {
          // it is already traced, nothing to care about
        } catch (const std::exception &StdException) {
          GlobalHandler::instance().TraceEventXPTI(StdException.what());
        } catch (...) {
          GlobalHandler::instance().TraceEventXPTI(
              "Host task lambda thrown non standard exception");
        }
      }
#endif
      HostTask.MQueue->reportAsyncException(CurrentException);
    }

    HostTask.MHostTask.reset();

#ifdef XPTI_ENABLE_INSTRUMENTATION
    // The host task is done; clear its submission location so it does not
    // interfere with the submission of subsequent dependent kernels.
    AsyncCodeLocationPtr.reset();
#endif

    try {
      // If we enqueue blocked users - the PI level could throw an exception
      // that should be treated as asynchronous now.
      Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd);
    } catch (...) {
      auto CurrentException = std::current_exception();
      HostTask.MQueue->reportAsyncException(CurrentException);
    }
  }
};
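
// Illustrative sketch (not part of the original file): DispatchHostTask is
// the functor the runtime hands to its host-task thread pool for a command
// group submitted roughly like this:
//
//   sycl::queue Q;
//   Q.submit([&](sycl::handler &CGH) {
//     CGH.host_task([=]() { /* runs on a host thread, ordered by deps */ });
//   });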

void Command::waitForPreparedHostEvents() const {
  for (const EventImplPtr &HostEvent : MPreparedHostDepsEvents)
    HostEvent->waitInternal();
}

void Command::waitForEvents(QueueImplPtr Queue,
                            std::vector<EventImplPtr> &EventImpls,
                            sycl::detail::pi::PiEvent &Event) {
  if (!EventImpls.empty()) {
    if (Queue->is_host()) {
      // A host queue can wait for events from different contexts, i.e. it may
      // contain events with different contexts in its MPreparedDepsEvents.
      // The OpenCL 2.1 spec says that clWaitForEvents will return
      // CL_INVALID_CONTEXT if the events specified in the list do not belong
      // to the same context. Thus we split all the events into a per-context
      // map. An example: we have two queues for the same CPU device, Q1 and
      // Q2, and thus two different contexts for the same CPU device, C1 and
      // C2. We also have the default host queue, accessible via the
      // Scheduler. Now assume we have three different events: E1(C1),
      // E2(C1), E3(C2). The command's MPreparedDepsEvents will contain all
      // three events (E1, E2, E3). If piEventsWait were called for all three
      // events, it would fail with CL_INVALID_CONTEXT because these events
      // refer to different contexts.
      std::map<context_impl *, std::vector<EventImplPtr>>
          RequiredEventsPerContext;

      for (const EventImplPtr &Event : EventImpls) {
        ContextImplPtr Context = Event->getContextImpl();
        assert(Context.get() &&
               "Only non-host events are expected to be waited for here");
        RequiredEventsPerContext[Context.get()].push_back(Event);
      }

      for (auto &CtxWithEvents : RequiredEventsPerContext) {
        std::vector<sycl::detail::pi::PiEvent> RawEvents =
            getPiEvents(CtxWithEvents.second);
        CtxWithEvents.first->getPlugin()->call<PiApiKind::piEventsWait>(
            RawEvents.size(), RawEvents.data());
      }
    } else {
#ifndef NDEBUG
      for (const EventImplPtr &Event : EventImpls)
        assert(Event->getContextImpl().get() &&
               "Only non-host events are expected to be waited for here");
#endif

      std::vector<sycl::detail::pi::PiEvent> RawEvents =
          getPiEvents(EventImpls);
      flushCrossQueueDeps(EventImpls, getWorkerQueue());
      const PluginPtr &Plugin = Queue->getPlugin();

      if (MEvent != nullptr)
        MEvent->setHostEnqueueTime();
      Plugin->call<PiApiKind::piEnqueueEventsWait>(
          Queue->getHandleRef(), RawEvents.size(), &RawEvents[0], &Event);
    }
  }
}
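
// Illustrative sketch (not part of the original file): with the E1(C1),
// E2(C1), E3(C2) example above, RequiredEventsPerContext ends up as
//   { C1 -> [E1, E2], C2 -> [E3] }
// and piEventsWait is invoked once per context with that context's events.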

Command::Command(
    CommandType Type, QueueImplPtr Queue,
    sycl::detail::pi::PiExtCommandBuffer CommandBuffer,
    const std::vector<sycl::detail::pi::PiExtSyncPoint> &SyncPoints)
    : MQueue(std::move(Queue)),
      MEvent(std::make_shared<detail::event_impl>(MQueue)),
      MPreparedDepsEvents(MEvent->getPreparedDepsEvents()),
      MPreparedHostDepsEvents(MEvent->getPreparedHostDepsEvents()), MType(Type),
      MCommandBuffer(CommandBuffer), MSyncPointDeps(SyncPoints) {
  MWorkerQueue = MQueue;
  MEvent->setWorkerQueue(MWorkerQueue);
  MEvent->setSubmittedQueue(MWorkerQueue);
  MEvent->setCommand(this);
  MEvent->setContextImpl(MQueue->getContextImplPtr());
  MEvent->setStateIncomplete();
  MEnqueueStatus = EnqueueResultT::SyclEnqueueReady;

#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiTraceEnabled())
    return;
  // Obtain the stream ID so all commands can emit traces to that stream
  MStreamID = xptiRegisterStream(SYCL_STREAM_NAME);
#endif
}

void Command::emitInstrumentationDataProxy() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  emitInstrumentationData();
#endif
}

void Command::emitEdgeEventForCommandDependence(
    Command *Cmd, void *ObjAddr, bool IsCommand,
    std::optional<access::mode> AccMode) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  // Bail early if either the source or the target node for the given
  // dependency is undefined or NULL
  constexpr uint16_t NotificationTraceType = xpti::trace_edge_create;
  if (!(xptiCheckTraceEnabled(MStreamID, NotificationTraceType) &&
        MTraceEvent && Cmd && Cmd->MTraceEvent))
    return;

  // If all the information we need for creating an edge event is available,
  // then go ahead with creating it; if not, bail early!
  xpti::utils::StringHelper SH;
  std::string AddressStr = SH.addressAsString<void *>(ObjAddr);
  std::string Prefix = AccMode ? accessModeToString(AccMode.value()) : "Event";
  std::string TypeString = SH.nameWithAddressString(Prefix, AddressStr);
  // Create an edge with the dependent buffer address for which a command
  // object has been created as one of the properties of the edge
  xpti::payload_t Payload(TypeString.c_str(), MAddress);
  uint64_t EdgeInstanceNo;
  xpti_td *EdgeEvent =
      xptiMakeEvent(TypeString.c_str(), &Payload, xpti::trace_graph_event,
                    xpti_at::active, &EdgeInstanceNo);
  if (EdgeEvent) {
    xpti_td *SrcEvent = static_cast<xpti_td *>(Cmd->MTraceEvent);
    xpti_td *TgtEvent = static_cast<xpti_td *>(MTraceEvent);
    EdgeEvent->source_id = SrcEvent->unique_id;
    EdgeEvent->target_id = TgtEvent->unique_id;
    if (IsCommand) {
      xpti::addMetadata(EdgeEvent, "access_mode",
                        static_cast<int>(AccMode.value()));
      xpti::addMetadata(EdgeEvent, "memory_object",
                        reinterpret_cast<size_t>(ObjAddr));
    } else {
      xpti::addMetadata(EdgeEvent, "event", reinterpret_cast<size_t>(ObjAddr));
    }
    xptiNotifySubscribers(MStreamID, NotificationTraceType,
                          detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
                          nullptr);
  }
  // General comment: none of these are serious errors, as the instrumentation
  // layer MUST be tolerant of errors. If we need to let the end user know, we
  // may throw exceptions in the future.
#endif
}

void Command::emitEdgeEventForEventDependence(
    Command *Cmd, sycl::detail::pi::PiEvent &PiEventAddr) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  // If we have failed to create an event to represent the Command, then we
  // cannot emit an edge event. Bail early!
  if (!(xptiCheckTraceEnabled(MStreamID) && MTraceEvent))
    return;

  if (Cmd && Cmd->MTraceEvent) {
    // If the event is associated with a command, we use this command's trace
    // event as the source of the edge, hence modeling the control flow
    emitEdgeEventForCommandDependence(Cmd, (void *)PiEventAddr, false);
    return;
  }
  if (PiEventAddr) {
    xpti::utils::StringHelper SH;
    std::string AddressStr =
        SH.addressAsString<sycl::detail::pi::PiEvent>(PiEventAddr);
    // This is the case when an OCL event is enqueued by the user or another
    // event is registered by the runtime as a dependency. The dependency on
    // this occasion is an OCL event, so we build a virtual node in the graph
    // with the event as the metadata for the node
    std::string NodeName = SH.nameWithAddressString("virtual_node", AddressStr);
    // Node name is "virtual_node[<event_addr>]"
    xpti::payload_t VNPayload(NodeName.c_str(), MAddress);
    uint64_t VNodeInstanceNo;
    xpti_td *NodeEvent =
        xptiMakeEvent(NodeName.c_str(), &VNPayload, xpti::trace_graph_event,
                      xpti_at::active, &VNodeInstanceNo);
    // Emit the virtual node first
    xpti::addMetadata(NodeEvent, "kernel_name", NodeName);
    xptiNotifySubscribers(MStreamID, xpti::trace_node_create,
                          detail::GSYCLGraphEvent, NodeEvent, VNodeInstanceNo,
                          nullptr);
    // Create a new event for the edge
    std::string EdgeName = SH.nameWithAddressString("Event", AddressStr);
    xpti::payload_t EdgePayload(EdgeName.c_str(), MAddress);
    uint64_t EdgeInstanceNo;
    xpti_td *EdgeEvent =
        xptiMakeEvent(EdgeName.c_str(), &EdgePayload, xpti::trace_graph_event,
                      xpti_at::active, &EdgeInstanceNo);
    if (EdgeEvent && NodeEvent) {
      // The source node represents the event, and this event needs to be
      // completed before the target node can execute
      xpti_td *TgtEvent = static_cast<xpti_td *>(MTraceEvent);
      EdgeEvent->source_id = NodeEvent->unique_id;
      EdgeEvent->target_id = TgtEvent->unique_id;
      xpti::addMetadata(EdgeEvent, "event",
                        reinterpret_cast<size_t>(PiEventAddr));
      xptiNotifySubscribers(MStreamID, xpti::trace_edge_create,
                            detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
                            nullptr);
    }
    return;
  }
#endif
}

uint64_t Command::makeTraceEventProlog(void *MAddress) {
  uint64_t CommandInstanceNo = 0;
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiCheckTraceEnabled(MStreamID))
    return CommandInstanceNo;

  MTraceEventPrologComplete = true;
  // Set up the member variables with information needed for event notification
  MCommandNodeType = commandToNodeType(MType);
  MCommandName = commandToName(MType);
  xpti::utils::StringHelper SH;
  MAddressString = SH.addressAsString<void *>(MAddress);
  std::string CommandString =
      SH.nameWithAddressString(MCommandName, MAddressString);

  xpti::payload_t p(CommandString.c_str(), MAddress);
  xpti_td *CmdTraceEvent =
      xptiMakeEvent(CommandString.c_str(), &p, xpti::trace_graph_event,
                    xpti_at::active, &CommandInstanceNo);
  MInstanceID = CommandInstanceNo;
  if (CmdTraceEvent) {
    MTraceEvent = (void *)CmdTraceEvent;
    // If we are seeing this event again, then the instance ID will be greater
    // than 1; in this case, we must skip sending a notification to create a
    // node as this node has already been created. We return this value so the
    // epilog method can be called selectively.
    MFirstInstance = (CommandInstanceNo == 1);
  }
#endif
  return CommandInstanceNo;
}

void Command::makeTraceEventEpilog() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
  if (!(xptiCheckTraceEnabled(MStreamID, NotificationTraceType) && MTraceEvent))
    return;
  assert(MTraceEventPrologComplete);
  xptiNotifySubscribers(MStreamID, NotificationTraceType,
                        detail::GSYCLGraphEvent,
                        static_cast<xpti_td *>(MTraceEvent), MInstanceID,
                        static_cast<const void *>(MCommandNodeType.c_str()));
#endif
}

Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep,
                                  std::vector<Command *> &ToCleanUp) {
  const QueueImplPtr &WorkerQueue = getWorkerQueue();
  const ContextImplPtr &WorkerContext = WorkerQueue->getContextImplPtr();

  // 1. Async work is not supported for the host device.
  // 2. Non-host events can be ignored if they are not fully initialized.
  // 3. Some types of commands do not produce PI events after they are
  //    enqueued (e.g. alloca). Note that we can't check the PI event to make
  //    that distinction since the command might still be unenqueued at this
  //    point.
  bool PiEventExpected = (!DepEvent->is_host() && DepEvent->isInitialized());
  if (auto *DepCmd = static_cast<Command *>(DepEvent->getCommand()))
    PiEventExpected &= DepCmd->producesPiEvent();

  if (!PiEventExpected) {
    // The call to waitInternal() is in waitForPreparedHostEvents() as it's
    // called from the enqueue process functions
    MPreparedHostDepsEvents.push_back(DepEvent);
    return nullptr;
  }

  Command *ConnectionCmd = nullptr;

  ContextImplPtr DepEventContext = DepEvent->getContextImpl();
  // If contexts don't match we'll connect them using a host task
  if (DepEventContext != WorkerContext && !WorkerContext->is_host()) {
    Scheduler::GraphBuilder &GB = Scheduler::getInstance().MGraphBuilder;
    ConnectionCmd = GB.connectDepEvent(this, DepEvent, Dep, ToCleanUp);
  } else
    MPreparedDepsEvents.push_back(std::move(DepEvent));

  return ConnectionCmd;
}

const ContextImplPtr &Command::getWorkerContext() const {
  return MQueue->getContextImplPtr();
}

const QueueImplPtr &Command::getWorkerQueue() const {
  assert(MWorkerQueue && "MWorkerQueue must not be nullptr");
  return MWorkerQueue;
}

bool Command::producesPiEvent() const { return true; }

bool Command::supportsPostEnqueueCleanup() const { return true; }

bool Command::readyForCleanup() const {
  return MLeafCounter == 0 &&
         MEnqueueStatus == EnqueueResultT::SyclEnqueueSuccess;
}

Command *Command::addDep(DepDesc NewDep, std::vector<Command *> &ToCleanUp) {
  Command *ConnectionCmd = nullptr;

  if (NewDep.MDepCommand) {
    ConnectionCmd =
        processDepEvent(NewDep.MDepCommand->getEvent(), NewDep, ToCleanUp);
  }
  // ConnectionCmd insertion builds the following dependency structure:
  // this -> emptyCmd (for ConnectionCmd) -> ConnectionCmd -> NewDep
  // which means that this and NewDep are already dependent
  if (!ConnectionCmd) {
    MDeps.push_back(NewDep);
    if (NewDep.MDepCommand)
      NewDep.MDepCommand->addUser(this);
  }

#ifdef XPTI_ENABLE_INSTRUMENTATION
  emitEdgeEventForCommandDependence(NewDep.MDepCommand,
                                    (void *)NewDep.MDepRequirement->MSYCLMemObj,
                                    true, NewDep.MDepRequirement->MAccessMode);
#endif

  return ConnectionCmd;
}

Command *Command::addDep(EventImplPtr Event,
                         std::vector<Command *> &ToCleanUp) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  // We need this for just the instrumentation, so guarding it will prevent
  // unused variable warnings when instrumentation is turned off
  Command *Cmd = (Command *)Event->getCommand();
  sycl::detail::pi::PiEvent &PiEventAddr = Event->getHandleRef();
  // Now make an edge for the dependent event
  emitEdgeEventForEventDependence(Cmd, PiEventAddr);
#endif

  return processDepEvent(std::move(Event), DepDesc{nullptr, nullptr, nullptr},
                         ToCleanUp);
}

void Command::emitEnqueuedEventSignal(sycl::detail::pi::PiEvent &PiEventAddr) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  emitInstrumentationGeneral(
      MStreamID, MInstanceID, static_cast<xpti_td *>(MTraceEvent),
      xpti::trace_signal, static_cast<const void *>(PiEventAddr));
#endif
  std::ignore = PiEventAddr;
}

void Command::emitInstrumentation(uint16_t Type, const char *Txt) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  return emitInstrumentationGeneral(MStreamID, MInstanceID,
                                    static_cast<xpti_td *>(MTraceEvent), Type,
                                    static_cast<const void *>(Txt));
#else
  std::ignore = Type;
  std::ignore = Txt;
#endif
}

bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking,
                      std::vector<Command *> &ToCleanUp) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  // If the command is enqueued from the host-task thread, it will not have a
  // valid submission code location set. So we set it manually to properly
  // trace failures if the PI level reports any.
  std::unique_ptr<detail::tls_code_loc_t> AsyncCodeLocationPtr;
  if (xptiTraceEnabled() && !CurrentCodeLocationValid()) {
    AsyncCodeLocationPtr.reset(
        new detail::tls_code_loc_t(MSubmissionCodeLocation));
  }
#endif
  // Exit if already enqueued
  if (MEnqueueStatus == EnqueueResultT::SyclEnqueueSuccess)
    return true;

  // If the command is blocked from enqueueing
  if (MIsBlockable && MEnqueueStatus == EnqueueResultT::SyclEnqueueBlocked) {
    // Exit if enqueue type is not blocking
    if (!Blocking) {
      EnqueueResult = EnqueueResultT(EnqueueResultT::SyclEnqueueBlocked, this);
      return false;
    }

#ifdef XPTI_ENABLE_INSTRUMENTATION
    // Scoped trace event notifier that emits a barrier begin and barrier end
    // event, which models the barrier while enqueuing along with the blocked
    // reason, as determined by the scheduler
    std::string Info = "enqueue.barrier[";
    Info += std::string(getBlockReason()) + "]";
    emitInstrumentation(xpti::trace_barrier_begin, Info.c_str());
#endif

    // Wait if blocking
    while (MEnqueueStatus == EnqueueResultT::SyclEnqueueBlocked)
      ;
#ifdef XPTI_ENABLE_INSTRUMENTATION
    emitInstrumentation(xpti::trace_barrier_end, Info.c_str());
#endif
  }

  std::lock_guard<std::mutex> Lock(MEnqueueMtx);

  // Exit if the command is already enqueued
  if (MEnqueueStatus == EnqueueResultT::SyclEnqueueSuccess)
    return true;

#ifdef XPTI_ENABLE_INSTRUMENTATION
  emitInstrumentation(xpti::trace_task_begin, nullptr);
#endif

  if (MEnqueueStatus == EnqueueResultT::SyclEnqueueFailed) {
    EnqueueResult = EnqueueResultT(EnqueueResultT::SyclEnqueueFailed, this);
    return false;
  }

  // Command status set to "failed" beforehand, so this command
  // has already been marked as "failed" if enqueueImp throws an exception.
  // This will avoid execution of the same failed command twice.
  MEnqueueStatus = EnqueueResultT::SyclEnqueueFailed;
  MShouldCompleteEventIfPossible = true;
  pi_int32 Res = enqueueImp();

  if (PI_SUCCESS != Res)
    EnqueueResult =
        EnqueueResultT(EnqueueResultT::SyclEnqueueFailed, this, Res);
  else {
    if (MShouldCompleteEventIfPossible &&
        (MEvent->is_host() || MEvent->getHandleRef() == nullptr))
      MEvent->setComplete();

    // Consider the command is successfully enqueued if return code is
    // PI_SUCCESS
    MEnqueueStatus = EnqueueResultT::SyclEnqueueSuccess;
    if (MLeafCounter == 0 && supportsPostEnqueueCleanup() &&
        !SYCLConfig<SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP>::get() &&
        !SYCLConfig<SYCL_DISABLE_POST_ENQUEUE_CLEANUP>::get()) {
      assert(!MMarkedForCleanup);
      MMarkedForCleanup = true;
      ToCleanUp.push_back(this);
    }
  }

  // Emit this correlation signal before the task end
  emitEnqueuedEventSignal(MEvent->getHandleRef());
#ifdef XPTI_ENABLE_INSTRUMENTATION
  emitInstrumentation(xpti::trace_task_end, nullptr);
#endif
  return MEnqueueStatus == EnqueueResultT::SyclEnqueueSuccess;
}
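
// Illustrative sketch (not part of the original file): the scheduler drives
// this state machine roughly as follows when processing a graph leaf (the
// NON_BLOCKING constant and the MResult field name are assumptions here):
//
//   EnqueueResultT Res;
//   std::vector<Command *> ToCleanUp;
//   if (!Cmd->enqueue(Res, NON_BLOCKING, ToCleanUp))
//     /* Res reports SyclEnqueueBlocked or SyclEnqueueFailed */;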

void Command::resolveReleaseDependencies(std::set<Command *> &DepList) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  assert(MType == CommandType::RELEASE && "Expected release command");
  if (!MTraceEvent)
    return;
  // The current command is the target node for all dependencies as the source
  // nodes have to be completed first before the current node can begin to
  // execute; these edges model control flow
  xpti_td *TgtTraceEvent = static_cast<xpti_td *>(MTraceEvent);
  // We have all the Commands that must be completed before the release
  // command can be enqueued; here we'll find the command that is an Alloca
  // with the same SYCLMemObject address and create a dependency line (edge)
  // between them in our semantic modeling
  for (auto &Item : DepList) {
    if (Item->MTraceEvent && Item->MAddress == MAddress) {
      xpti::utils::StringHelper SH;
      std::string AddressStr = SH.addressAsString<void *>(MAddress);
      std::string TypeString =
          "Edge:" + SH.nameWithAddressString(commandToName(MType), AddressStr);

      // Create an edge with the dependent buffer address being one of the
      // properties of the edge
      xpti::payload_t p(TypeString.c_str(), MAddress);
      uint64_t EdgeInstanceNo;
      xpti_td *EdgeEvent =
          xptiMakeEvent(TypeString.c_str(), &p, xpti::trace_graph_event,
                        xpti_at::active, &EdgeInstanceNo);
      if (EdgeEvent) {
        xpti_td *SrcTraceEvent = static_cast<xpti_td *>(Item->MTraceEvent);
        EdgeEvent->target_id = TgtTraceEvent->unique_id;
        EdgeEvent->source_id = SrcTraceEvent->unique_id;
        xpti::addMetadata(EdgeEvent, "memory_object",
                          reinterpret_cast<size_t>(MAddress));
        xptiNotifySubscribers(MStreamID, xpti::trace_edge_create,
                              detail::GSYCLGraphEvent, EdgeEvent,
                              EdgeInstanceNo, nullptr);
      }
    }
  }
#endif
}

const char *Command::getBlockReason() const {
  switch (MBlockReason) {
  case BlockReason::HostAccessor:
    return "A Buffer is locked by the host accessor";
  case BlockReason::HostTask:
    return "Blocked by host task";
  }

  return "Unknown block reason";
}

void Command::copySubmissionCodeLocation() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiTraceEnabled())
    return;

  detail::tls_code_loc_t Tls;
  auto TData = Tls.query();
  if (TData.fileName())
    MSubmissionFileName = TData.fileName();
  if (TData.functionName())
    MSubmissionFunctionName = TData.functionName();
  if (MSubmissionFileName.size() || MSubmissionFunctionName.size())
    MSubmissionCodeLocation = {MSubmissionFileName.c_str(),
                               MSubmissionFunctionName.c_str(),
                               (int)TData.lineNumber(),
                               (int)TData.columnNumber()};
#endif
}

AllocaCommandBase::AllocaCommandBase(CommandType Type, QueueImplPtr Queue,
                                     Requirement Req,
                                     AllocaCommandBase *LinkedAllocaCmd,
                                     bool IsConst)
    : Command(Type, Queue), MLinkedAllocaCmd(LinkedAllocaCmd),
      MIsLeaderAlloca(nullptr == LinkedAllocaCmd), MIsConst(IsConst),
      MRequirement(std::move(Req)), MReleaseCmd(Queue, this) {
  MRequirement.MAccessMode = access::mode::read_write;
  emitInstrumentationDataProxy();
}

void AllocaCommandBase::emitInstrumentationData() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiCheckTraceEnabled(MStreamID))
    return;
  // Create a payload with the command name and an event using this payload to
  // emit a node_create
  MAddress = MRequirement.MSYCLMemObj;
  makeTraceEventProlog(MAddress);
  // Set the relevant meta data properties for this command
  if (MTraceEvent && MFirstInstance) {
    xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
    xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_type",
                      deviceToString(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_name",
                      getSyclObjImpl(MQueue->get_device())->getDeviceName());
    xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
    // Since we do NOT add the queue_id value to metadata, we stash it in TLS,
    // as this data is mutable and the metadata is supposed to be invariant
    xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
                                 MQueue->getQueueID());
  }
#endif
}

bool AllocaCommandBase::producesPiEvent() const { return false; }

bool AllocaCommandBase::supportsPostEnqueueCleanup() const { return false; }

bool AllocaCommandBase::readyForCleanup() const { return false; }

AllocaCommand::AllocaCommand(QueueImplPtr Queue, Requirement Req,
                             bool InitFromUserData,
                             AllocaCommandBase *LinkedAllocaCmd, bool IsConst)
    : AllocaCommandBase(CommandType::ALLOCA, std::move(Queue), std::move(Req),
                        LinkedAllocaCmd, IsConst),
      MInitFromUserData(InitFromUserData) {
  // The node event must be created before the dependent edge is added to this
  // node, so this call must be before the addDep() call.
  emitInstrumentationDataProxy();
  // "Nothing to depend on"
  std::vector<Command *> ToCleanUp;
  Command *ConnectionCmd =
      addDep(DepDesc(nullptr, getRequirement(), this), ToCleanUp);
  assert(ConnectionCmd == nullptr);
  assert(ToCleanUp.empty());
  (void)ConnectionCmd;
}

void AllocaCommand::emitInstrumentationData() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiCheckTraceEnabled(MStreamID))
    return;

  // Only if it is the first event, we emit a node create event
  if (MFirstInstance) {
    makeTraceEventEpilog();
  }
#endif
}

pi_int32 AllocaCommand::enqueueImp() {
  waitForPreparedHostEvents();
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;

  sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef();

  void *HostPtr = nullptr;
  if (!MIsLeaderAlloca) {

    if (MQueue->is_host()) {
      // We do not need to make an allocation if we have a linked device
      // allocation
      Command::waitForEvents(MQueue, EventImpls, Event);

      return PI_SUCCESS;
    }
    HostPtr = MLinkedAllocaCmd->getMemAllocation();
  }
  // TODO: Check if it is correct to use std::move on a stack variable and
  // delete RawEvents below.
  MMemAllocation = MemoryManager::allocate(
      MQueue->getContextImplPtr(), getSYCLMemObj(), MInitFromUserData, HostPtr,
      std::move(EventImpls), Event);

  return PI_SUCCESS;
}

void AllocaCommand::printDot(std::ostream &Stream) const {
  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\"";

  Stream << "ID = " << this << "\\n";
  Stream << "ALLOCA ON " << deviceToString(MQueue->get_device()) << "\\n";
  Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n";
  Stream << " Link : " << this->MLinkedAllocaCmd << "\\n";
  Stream << "\"];" << std::endl;

  for (const auto &Dep : MDeps) {
    if (Dep.MDepCommand == nullptr)
      continue;
    Stream << "  \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
           << " [ label = \"Access mode: "
           << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
           << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
           << std::endl;
  }
}
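
// Illustrative sketch (not part of the original file): printDot emits
// Graphviz DOT fragments of roughly this shape, with pointer values varying
// from run to run:
//
//   "0x55aa..." [style=filled, fillcolor="#FFD28A", label="ID = 0x55aa...\n
//   ALLOCA ON GPU\n MemObj : 0x55bb...\n Link : 0\n"];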

AllocaSubBufCommand::AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req,
                                         AllocaCommandBase *ParentAlloca,
                                         std::vector<Command *> &ToEnqueue,
                                         std::vector<Command *> &ToCleanUp)
    : AllocaCommandBase(CommandType::ALLOCA_SUB_BUF, std::move(Queue),
                        std::move(Req),
                        /*LinkedAllocaCmd*/ nullptr, /*IsConst*/ false),
      MParentAlloca(ParentAlloca) {
  // The node event must be created before the dependent edge
  // is added to this node, so this call must be before
  // the addDep() call.
  emitInstrumentationDataProxy();
  Command *ConnectionCmd = addDep(
      DepDesc(MParentAlloca, getRequirement(), MParentAlloca), ToCleanUp);
  if (ConnectionCmd)
    ToEnqueue.push_back(ConnectionCmd);
}

void AllocaSubBufCommand::emitInstrumentationData() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiCheckTraceEnabled(MStreamID))
    return;

  // Only if it is the first event, we emit a node create event and any meta
  // data that is available for the command
  if (MFirstInstance) {
    xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
    xpti::addMetadata(TE, "offset", this->MRequirement.MOffsetInBytes);
    xpti::addMetadata(TE, "access_range_start",
                      this->MRequirement.MAccessRange[0]);
    xpti::addMetadata(TE, "access_range_end",
                      this->MRequirement.MAccessRange[1]);
    xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
                                 MQueue->getQueueID());
    makeTraceEventEpilog();
  }
#endif
}

void *AllocaSubBufCommand::getMemAllocation() const {
  // In some cases the parent's memory allocation might change (e.g., after
  // map/unmap operations). If the parent's memory allocation changes, the
  // sub-buffer memory allocation should be changed as well.
  if (MQueue->is_host()) {
    return static_cast<void *>(
        static_cast<char *>(MParentAlloca->getMemAllocation()) +
        MRequirement.MOffsetInBytes);
  }
  return MMemAllocation;
}

pi_int32 AllocaSubBufCommand::enqueueImp() {
  waitForPreparedHostEvents();
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
  sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef();

  MMemAllocation = MemoryManager::allocateMemSubBuffer(
      MQueue->getContextImplPtr(), MParentAlloca->getMemAllocation(),
      MRequirement.MElemSize, MRequirement.MOffsetInBytes,
      MRequirement.MAccessRange, std::move(EventImpls), Event);

  XPTIRegistry::bufferAssociateNotification(MParentAlloca->getSYCLMemObj(),
                                            MMemAllocation);
  return PI_SUCCESS;
}

void AllocaSubBufCommand::printDot(std::ostream &Stream) const {
  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\"";

  Stream << "ID = " << this << "\\n";
  Stream << "ALLOCA SUB BUF ON " << deviceToString(MQueue->get_device())
         << "\\n";
  Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n";
  Stream << " Offset : " << this->MRequirement.MOffsetInBytes << "\\n";
  Stream << " Access range : " << this->MRequirement.MAccessRange[0] << "\\n";
  Stream << "\"];" << std::endl;

  for (const auto &Dep : MDeps) {
    if (Dep.MDepCommand == nullptr)
      continue;
    Stream << "  \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
           << " [ label = \"Access mode: "
           << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
           << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
           << std::endl;
  }
}

ReleaseCommand::ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd)
    : Command(CommandType::RELEASE, std::move(Queue)), MAllocaCmd(AllocaCmd) {
  emitInstrumentationDataProxy();
}

void ReleaseCommand::emitInstrumentationData() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiCheckTraceEnabled(MStreamID))
    return;
  // Create a payload with the command name and an event using this payload to
  // emit a node_create
  MAddress = MAllocaCmd->getSYCLMemObj();
  makeTraceEventProlog(MAddress);

  if (MFirstInstance) {
    xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
    xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_type",
                      deviceToString(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_name",
                      getSyclObjImpl(MQueue->get_device())->getDeviceName());
    xpti::addMetadata(TE, "allocation_type",
                      commandToName(MAllocaCmd->getType()));
    // Since we do NOT add the queue_id value to metadata, we stash it in TLS,
    // as this data is mutable and the metadata is supposed to be invariant
    xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
                                 MQueue->getQueueID());
    makeTraceEventEpilog();
  }
#endif
}

pi_int32 ReleaseCommand::enqueueImp() {
  waitForPreparedHostEvents();
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
  std::vector<sycl::detail::pi::PiEvent> RawEvents = getPiEvents(EventImpls);
  bool SkipRelease = false;

  // On the host side we only allocate memory for full buffers.
  // Thus, deallocating sub-buffers would lead to double memory freeing.
  SkipRelease |= MQueue->is_host() && MAllocaCmd->getType() == ALLOCA_SUB_BUF;

  const bool CurAllocaIsHost = MAllocaCmd->getQueue()->is_host();
  bool NeedUnmap = false;
  if (MAllocaCmd->MLinkedAllocaCmd) {

    // When releasing one of the "linked" allocations special rules take
    // place:
    // 1. Device allocation should always be released.
    // 2. Host allocation should be released if the host allocation is the
    //    "leader".
    // 3. The device alloca in the pair should be in the active state in
    //    order to be correctly released.

    // There is no actual memory allocation if a host alloca command is
    // created being linked to a device allocation.
    SkipRelease |= CurAllocaIsHost && !MAllocaCmd->MIsLeaderAlloca;

    NeedUnmap |= CurAllocaIsHost == MAllocaCmd->MIsActive;
  }

  if (NeedUnmap) {
    const QueueImplPtr &Queue = CurAllocaIsHost
                                    ? MAllocaCmd->MLinkedAllocaCmd->getQueue()
                                    : MAllocaCmd->getQueue();

    EventImplPtr UnmapEventImpl(new event_impl(Queue));
    UnmapEventImpl->setContextImpl(Queue->getContextImplPtr());
    UnmapEventImpl->setStateIncomplete();
    sycl::detail::pi::PiEvent &UnmapEvent = UnmapEventImpl->getHandleRef();

    void *Src = CurAllocaIsHost
                    ? MAllocaCmd->getMemAllocation()
                    : MAllocaCmd->MLinkedAllocaCmd->getMemAllocation();

    void *Dst = !CurAllocaIsHost
                    ? MAllocaCmd->getMemAllocation()
                    : MAllocaCmd->MLinkedAllocaCmd->getMemAllocation();

    MemoryManager::unmap(MAllocaCmd->getSYCLMemObj(), Dst, Queue, Src,
                         RawEvents, UnmapEvent);

    std::swap(MAllocaCmd->MIsActive, MAllocaCmd->MLinkedAllocaCmd->MIsActive);
    EventImpls.clear();
    EventImpls.push_back(UnmapEventImpl);
  }
  sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef();
  if (SkipRelease)
    Command::waitForEvents(MQueue, EventImpls, Event);
  else {
    MemoryManager::release(
        MQueue->getContextImplPtr(), MAllocaCmd->getSYCLMemObj(),
        MAllocaCmd->getMemAllocation(), std::move(EventImpls), Event);
  }
  return PI_SUCCESS;
}

void ReleaseCommand::printDot(std::ostream &Stream) const {
  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FF827A\", label=\"";

  Stream << "ID = " << this << " ; ";
  Stream << "RELEASE ON " << deviceToString(MQueue->get_device()) << "\\n";
  Stream << " Alloca : " << MAllocaCmd << "\\n";
  Stream << " MemObj : " << MAllocaCmd->getSYCLMemObj() << "\\n";
  Stream << "\"];" << std::endl;

  for (const auto &Dep : MDeps) {
    Stream << "  \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
           << " [ label = \"Access mode: "
           << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
           << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
           << std::endl;
  }
}

bool ReleaseCommand::producesPiEvent() const { return false; }

bool ReleaseCommand::supportsPostEnqueueCleanup() const { return false; }

bool ReleaseCommand::readyForCleanup() const { return false; }

MapMemObject::MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req,
                           void **DstPtr, QueueImplPtr Queue,
                           access::mode MapMode)
    : Command(CommandType::MAP_MEM_OBJ, std::move(Queue)),
      MSrcAllocaCmd(SrcAllocaCmd), MSrcReq(std::move(Req)), MDstPtr(DstPtr),
      MMapMode(MapMode) {
  emitInstrumentationDataProxy();
}

void MapMemObject::emitInstrumentationData() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiCheckTraceEnabled(MStreamID))
    return;
  // Create a payload with the command name and an event using this payload to
  // emit a node_create
  MAddress = MSrcAllocaCmd->getSYCLMemObj();
  makeTraceEventProlog(MAddress);

  if (MFirstInstance) {
    xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
    xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_type",
                      deviceToString(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_name",
                      getSyclObjImpl(MQueue->get_device())->getDeviceName());
    xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
    // Since we do NOT add the queue_id value to metadata, we stash it in TLS,
    // as this data is mutable and the metadata is supposed to be invariant
    xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
                                 MQueue->getQueueID());
    makeTraceEventEpilog();
  }
#endif
}

pi_int32 MapMemObject::enqueueImp() {
  waitForPreparedHostEvents();
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
  std::vector<sycl::detail::pi::PiEvent> RawEvents = getPiEvents(EventImpls);
  flushCrossQueueDeps(EventImpls, getWorkerQueue());

  sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef();
  *MDstPtr = MemoryManager::map(
      MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(), MQueue,
      MMapMode, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
      MSrcReq.MOffset, MSrcReq.MElemSize, std::move(RawEvents), Event);

  return PI_SUCCESS;
}

void MapMemObject::printDot(std::ostream &Stream) const {
  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#77AFFF\", label=\"";

  Stream << "ID = " << this << " ; ";
  Stream << "MAP ON " << deviceToString(MQueue->get_device()) << "\\n";

  Stream << "\"];" << std::endl;

  for (const auto &Dep : MDeps) {
    Stream << "  \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
           << " [ label = \"Access mode: "
           << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
           << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
           << std::endl;
  }
}

UnMapMemObject::UnMapMemObject(AllocaCommandBase *DstAllocaCmd, Requirement Req,
                               void **SrcPtr, QueueImplPtr Queue)
    : Command(CommandType::UNMAP_MEM_OBJ, std::move(Queue)),
      MDstAllocaCmd(DstAllocaCmd), MDstReq(std::move(Req)), MSrcPtr(SrcPtr) {
  emitInstrumentationDataProxy();
}

void UnMapMemObject::emitInstrumentationData() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiCheckTraceEnabled(MStreamID))
    return;
  // Create a payload with the command name and an event using this payload to
  // emit a node_create
  MAddress = MDstAllocaCmd->getSYCLMemObj();
  makeTraceEventProlog(MAddress);

  if (MFirstInstance) {
    xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
    xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_type",
                      deviceToString(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_name",
                      getSyclObjImpl(MQueue->get_device())->getDeviceName());
    xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
    // Since we do NOT add the queue_id value to metadata, we stash it in TLS,
    // as this data is mutable and the metadata is supposed to be invariant
    xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
                                 MQueue->getQueueID());
    makeTraceEventEpilog();
  }
#endif
}

bool UnMapMemObject::producesPiEvent() const {
  // TODO remove this workaround once the batching issue is addressed in the
  // Level Zero plugin.
  // Consider the following scenario on Level Zero:
  // 1. Kernel A, which uses buffer A, is submitted to queue A.
  // 2. Kernel B, which uses buffer B, is submitted to queue B.
  // 3. queueA.wait().
  // 4. queueB.wait().
  // The DPCPP runtime used to treat unmap/write commands for buffers A/B as
  // host dependencies (i.e. they were waited for prior to enqueueing any
  // command that's dependent on them). This allowed the Level Zero plugin to
  // detect that each queue is idle on steps 1/2 and submit the command list
  // right away. This is no longer the case since we started passing these
  // dependencies in an event waitlist and the Level Zero plugin attempts to
  // batch these commands, so the execution of kernel B starts only on step 4.
  // This workaround restores the old behavior in this case until this is
  // resolved.
  return MQueue->getDeviceImplPtr()->getBackend() !=
             backend::ext_oneapi_level_zero ||
         MEvent->getHandleRef() != nullptr;
}

pi_int32 UnMapMemObject::enqueueImp() {
  waitForPreparedHostEvents();
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
  std::vector<sycl::detail::pi::PiEvent> RawEvents = getPiEvents(EventImpls);
  flushCrossQueueDeps(EventImpls, getWorkerQueue());

  sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef();
  MemoryManager::unmap(MDstAllocaCmd->getSYCLMemObj(),
                       MDstAllocaCmd->getMemAllocation(), MQueue, *MSrcPtr,
                       std::move(RawEvents), Event);

  return PI_SUCCESS;
}

void UnMapMemObject::printDot(std::ostream &Stream) const {
  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#EBC40F\", label=\"";

  Stream << "ID = " << this << " ; ";
  Stream << "UNMAP ON " << deviceToString(MQueue->get_device()) << "\\n";

  Stream << "\"];" << std::endl;

  for (const auto &Dep : MDeps) {
    Stream << "  \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
           << " [ label = \"Access mode: "
           << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
           << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
           << std::endl;
  }
}

MemCpyCommand::MemCpyCommand(Requirement SrcReq,
                             AllocaCommandBase *SrcAllocaCmd,
                             Requirement DstReq,
                             AllocaCommandBase *DstAllocaCmd,
                             QueueImplPtr SrcQueue, QueueImplPtr DstQueue)
    : Command(CommandType::COPY_MEMORY, std::move(DstQueue)),
      MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)),
      MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)),
      MDstAllocaCmd(DstAllocaCmd) {
  if (!MSrcQueue->is_host()) {
    MEvent->setContextImpl(MSrcQueue->getContextImplPtr());
  }

  MWorkerQueue = MQueue->is_host() ? MSrcQueue : MQueue;
  MEvent->setWorkerQueue(MWorkerQueue);

  emitInstrumentationDataProxy();
}

void MemCpyCommand::emitInstrumentationData() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiCheckTraceEnabled(MStreamID))
    return;
  // Create a payload with the command name and an event using this payload to
  // emit a node_create
  MAddress = MSrcAllocaCmd->getSYCLMemObj();
  makeTraceEventProlog(MAddress);

  if (MFirstInstance) {
    xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
    xpti::addMetadata(CmdTraceEvent, "sycl_device",
                      deviceToID(MQueue->get_device()));
    xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
                      deviceToString(MQueue->get_device()));
    xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
                      getSyclObjImpl(MQueue->get_device())->getDeviceName());
    xpti::addMetadata(CmdTraceEvent, "memory_object",
                      reinterpret_cast<size_t>(MAddress));
    xpti::addMetadata(CmdTraceEvent, "copy_from",
                      reinterpret_cast<size_t>(
                          getSyclObjImpl(MSrcQueue->get_device()).get()));
    xpti::addMetadata(
        CmdTraceEvent, "copy_to",
        reinterpret_cast<size_t>(getSyclObjImpl(MQueue->get_device()).get()));
    // Since we do NOT add the queue_id value to metadata, we stash it in TLS,
    // as this data is mutable and the metadata is supposed to be invariant
    xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
                                 MQueue->getQueueID());
    makeTraceEventEpilog();
  }
#endif
}

const ContextImplPtr &MemCpyCommand::getWorkerContext() const {
  return getWorkerQueue()->getContextImplPtr();
}

bool MemCpyCommand::producesPiEvent() const {
  // TODO remove this workaround once the batching issue is addressed in the
  // Level Zero plugin.
  // Consider the following scenario on Level Zero:
  // 1. Kernel A, which uses buffer A, is submitted to queue A.
  // 2. Kernel B, which uses buffer B, is submitted to queue B.
  // 3. queueA.wait().
  // 4. queueB.wait().
  // The DPCPP runtime used to treat unmap/write commands for buffers A/B as
  // host dependencies (i.e. they were waited for prior to enqueueing any
  // command that's dependent on them). This allowed the Level Zero plugin to
  // detect that each queue is idle on steps 1/2 and submit the command list
  // right away. This is no longer the case since we started passing these
  // dependencies in an event waitlist and the Level Zero plugin attempts to
  // batch these commands, so the execution of kernel B starts only on step 4.
  // This workaround restores the old behavior in this case until this is
  // resolved.
  return MQueue->is_host() ||
         MQueue->getDeviceImplPtr()->getBackend() !=
             backend::ext_oneapi_level_zero ||
         MEvent->getHandleRef() != nullptr;
}

pi_int32 MemCpyCommand::enqueueImp() {
  waitForPreparedHostEvents();
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;

  sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef();

  auto RawEvents = getPiEvents(EventImpls);
  flushCrossQueueDeps(EventImpls, getWorkerQueue());

  MemoryManager::copy(
      MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(),
      MSrcQueue, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
      MSrcReq.MOffset, MSrcReq.MElemSize, MDstAllocaCmd->getMemAllocation(),
      MQueue, MDstReq.MDims, MDstReq.MMemoryRange, MDstReq.MAccessRange,
      MDstReq.MOffset, MDstReq.MElemSize, std::move(RawEvents), Event, MEvent);

  return PI_SUCCESS;
}

void MemCpyCommand::printDot(std::ostream &Stream) const {
  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#C7EB15\" label=\"";

  Stream << "ID = " << this << " ; ";
  Stream << "MEMCPY ON " << deviceToString(MQueue->get_device()) << "\\n";
  Stream << "From: " << MSrcAllocaCmd << " is host: " << MSrcQueue->is_host()
         << "\\n";
  Stream << "To: " << MDstAllocaCmd << " is host: " << MQueue->is_host()
         << "\\n";

  Stream << "\"];" << std::endl;

  for (const auto &Dep : MDeps) {
    Stream << "  \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
           << " [ label = \"Access mode: "
           << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
           << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
           << std::endl;
  }
}
1577 
1578 AllocaCommandBase *ExecCGCommand::getAllocaForReq(Requirement *Req) {
1579  for (const DepDesc &Dep : MDeps) {
1580  if (Dep.MDepRequirement == Req)
1581  return Dep.MAllocaCmd;
1582  }
1583  // Default-constructed accessors do not add dependencies, but they can be
1584  // passed to commands. Simply return nullptr, since they are empty and don't
1585  // really require any memory.
1586  return nullptr;
1587 }
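// A default-constructed accessor that makes getAllocaForReq return nullptr
// can come from user code such as this sketch (standard SYCL 2020 API):
//
//   sycl::accessor<int, 1, sycl::access_mode::read_write> Empty; // no buffer
//   Queue.submit([&](sycl::handler &CGH) {
//     CGH.parallel_for(sycl::range<1>{N}, [=](sycl::id<1>) {
//       if (!Empty.empty()) { /* never taken for an empty accessor */ }
//     });
//   });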
1588 
1589 std::vector<std::shared_ptr<const void>>
1590 ExecCGCommand::getAuxiliaryResources() const {
1591  if (MCommandGroup->getType() == CG::Kernel)
1592  return ((CGExecKernel *)MCommandGroup.get())->getAuxiliaryResources();
1593  return {};
1594 }
1595 
1596 void ExecCGCommand::clearAuxiliaryResources() {
1597  if (MCommandGroup->getType() == CG::Kernel)
1598  ((CGExecKernel *)MCommandGroup.get())->clearAuxiliaryResources();
1599 }
1600 
1601 pi_int32 UpdateHostRequirementCommand::enqueueImp() {
1602  waitForPreparedHostEvents();
1603  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1604  sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef();
1605  Command::waitForEvents(MQueue, EventImpls, Event);
1606 
1607  assert(MSrcAllocaCmd && "Expected valid alloca command");
1608  assert(MSrcAllocaCmd->getMemAllocation() && "Expected valid source pointer");
1609  assert(MDstPtr && "Expected valid target pointer");
1610  *MDstPtr = MSrcAllocaCmd->getMemAllocation();
1611 
1612  return PI_SUCCESS;
1613 }
1614 
1615 void UpdateHostRequirementCommand::printDot(std::ostream &Stream) const {
1616  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#f1337f\", label=\"";
1617 
1618  Stream << "ID = " << this << "\\n";
1619  Stream << "UPDATE REQ ON " << deviceToString(MQueue->get_device()) << "\\n";
1620  bool IsReqOnBuffer =
1621  MDstReq.MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer;
1622  Stream << "TYPE: " << (IsReqOnBuffer ? "Buffer" : "Image") << "\\n";
1623  if (IsReqOnBuffer)
1624  Stream << "Is sub buffer: " << std::boolalpha << MDstReq.MIsSubBuffer
1625  << "\\n";
1626 
1627  Stream << "\"];" << std::endl;
1628 
1629  for (const auto &Dep : MDeps) {
1630  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1631  << " [ label = \"Access mode: "
1632  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1633  << "MemObj: " << Dep.MAllocaCmd->getSYCLMemObj() << " \" ]"
1634  << std::endl;
1635  }
1636 }
1637 
1638 MemCpyCommandHost::MemCpyCommandHost(Requirement SrcReq,
1639  AllocaCommandBase *SrcAllocaCmd,
1640  Requirement DstReq, void **DstPtr,
1641  QueueImplPtr SrcQueue,
1642  QueueImplPtr DstQueue)
1643  : Command(CommandType::COPY_MEMORY, std::move(DstQueue)),
1644  MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)),
1645  MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)), MDstPtr(DstPtr) {
1646  if (!MSrcQueue->is_host()) {
1647  MEvent->setContextImpl(MSrcQueue->getContextImplPtr());
1648  }
1649 
1650  MWorkerQueue = MQueue->is_host() ? MSrcQueue : MQueue;
1651  MEvent->setWorkerQueue(MWorkerQueue);
1652 
1653  emitInstrumentationDataProxy();
1654 }
1655 
1656 void MemCpyCommandHost::emitInstrumentationData() {
1657 #ifdef XPTI_ENABLE_INSTRUMENTATION
1658  if (!xptiCheckTraceEnabled(MStreamID))
1659  return;
1660  // Create a payload with the command name and an event using this payload to
1661  // emit a node_create
1662  MAddress = MSrcAllocaCmd->getSYCLMemObj();
1663  makeTraceEventProlog(MAddress);
1664 
1665  if (MFirstInstance) {
1666  xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1667  xpti::addMetadata(CmdTraceEvent, "sycl_device",
1668  deviceToID(MQueue->get_device()));
1669  xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
1670  deviceToString(MQueue->get_device()));
1671  xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
1672  getSyclObjImpl(MQueue->get_device())->getDeviceName());
1673  xpti::addMetadata(CmdTraceEvent, "memory_object",
1674  reinterpret_cast<size_t>(MAddress));
1675  xpti::addMetadata(CmdTraceEvent, "copy_from",
1676  reinterpret_cast<size_t>(
1677  getSyclObjImpl(MSrcQueue->get_device()).get()));
1678  xpti::addMetadata(
1679  CmdTraceEvent, "copy_to",
1680  reinterpret_cast<size_t>(getSyclObjImpl(MQueue->get_device()).get()));
1681  // Since we do NOT add the queue_id value to the metadata, we stash it in
1682  // TLS, as this data is mutable and the metadata is supposed to be invariant
1683  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1684  MQueue->getQueueID());
1685    makeTraceEventEpilog();
1686  }
1687 #endif
1688 }
1689 
1690 const ContextImplPtr &MemCpyCommandHost::getWorkerContext() const {
1691  return getWorkerQueue()->getContextImplPtr();
1692 }
1693 
1694 pi_int32 MemCpyCommandHost::enqueueImp() {
1695  const QueueImplPtr &Queue = getWorkerQueue();
1696  waitForPreparedHostEvents();
1697  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1698  std::vector<sycl::detail::pi::PiEvent> RawEvents = getPiEvents(EventImpls);
1699 
1700  sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef();
1701  // Omit the copy if the access mode is a discard one.
1702  // TODO: Handle this at graph building time by, for example, creating an
1703  // empty node instead of a memcpy.
1704  if (MDstReq.MAccessMode == access::mode::discard_read_write ||
1705  MDstReq.MAccessMode == access::mode::discard_write) {
1706  Command::waitForEvents(Queue, EventImpls, Event);
1707 
1708  return PI_SUCCESS;
1709  }
1710 
1711  flushCrossQueueDeps(EventImpls, getWorkerQueue());
1712  MemoryManager::copy(
1713  MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(),
1714  MSrcQueue, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
1715  MSrcReq.MOffset, MSrcReq.MElemSize, *MDstPtr, MQueue, MDstReq.MDims,
1716  MDstReq.MMemoryRange, MDstReq.MAccessRange, MDstReq.MOffset,
1717  MDstReq.MElemSize, std::move(RawEvents), MEvent->getHandleRef(), MEvent);
1718 
1719  return PI_SUCCESS;
1720 }
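// The discard fast path above is taken, for example, when the user requests
// an accessor that does not preserve previous contents (sketch; standard
// SYCL 2020 API):
//
//   Queue.submit([&](sycl::handler &CGH) {
//     sycl::accessor Acc{Buf, CGH, sycl::write_only, sycl::no_init};
//     CGH.parallel_for(sycl::range<1>{N}, [=](sycl::id<1> I) { Acc[I] = 0; });
//   });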
1721 
1722 EmptyCommand::EmptyCommand(QueueImplPtr Queue)
1723  : Command(CommandType::EMPTY_TASK, std::move(Queue)) {
1724  emitInstrumentationDataProxy();
1725 }
1726 
1727 pi_int32 EmptyCommand::enqueueImp() {
1728  waitForPreparedHostEvents();
1729  waitForEvents(MQueue, MPreparedDepsEvents, MEvent->getHandleRef());
1730 
1731  return PI_SUCCESS;
1732 }
1733 
1734 void EmptyCommand::addRequirement(Command *DepCmd, AllocaCommandBase *AllocaCmd,
1735  const Requirement *Req) {
1736  const Requirement &ReqRef = *Req;
1737  MRequirements.emplace_back(ReqRef);
1738  const Requirement *const StoredReq = &MRequirements.back();
1739 
1740  // EmptyCommand is always a host command, so we expect the result of addDep
1741  // to be null (no connection command is needed)
1742  std::vector<Command *> ToCleanUp;
1743  Command *Cmd = addDep(DepDesc{DepCmd, StoredReq, AllocaCmd}, ToCleanUp);
1744  assert(Cmd == nullptr && "Connection command should be null for EmptyCommand");
1745  assert(ToCleanUp.empty() && "addDep should add a command for cleanup only if "
1746  "there's a connection command");
1747  (void)Cmd;
1748 }
1749 
1750 void EmptyCommand::emitInstrumentationData() {
1751 #ifdef XPTI_ENABLE_INSTRUMENTATION
1752  if (!xptiCheckTraceEnabled(MStreamID))
1753  return;
1754  // Create a payload with the command name and an event using this payload to
1755  // emit a node_create
1756  if (MRequirements.empty())
1757  return;
1758 
1759  Requirement &Req = *MRequirements.begin();
1760 
1761  MAddress = Req.MSYCLMemObj;
1762  makeTraceEventProlog(MAddress);
1763 
1764  if (MFirstInstance) {
1765  xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1766  xpti::addMetadata(CmdTraceEvent, "sycl_device",
1767  deviceToID(MQueue->get_device()));
1768  xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
1769  deviceToString(MQueue->get_device()));
1770  xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
1771  getSyclObjImpl(MQueue->get_device())->getDeviceName());
1772  xpti::addMetadata(CmdTraceEvent, "memory_object",
1773  reinterpret_cast<size_t>(MAddress));
1774  // Since we do NOT add the queue_id value to the metadata, we stash it in
1775  // TLS, as this data is mutable and the metadata is supposed to be invariant
1776  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1777  MQueue->getQueueID());
1778    makeTraceEventEpilog();
1779  }
1780 #endif
1781 }
1782 
1783 void EmptyCommand::printDot(std::ostream &Stream) const {
1784  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#8d8f29\", label=\"";
1785 
1786  Stream << "ID = " << this << "\\n";
1787  Stream << "EMPTY NODE"
1788  << "\\n";
1789 
1790  Stream << "\"];" << std::endl;
1791 
1792  for (const auto &Dep : MDeps) {
1793  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1794  << " [ label = \"Access mode: "
1795  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1796  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1797  << std::endl;
1798  }
1799 }
1800 
1801 bool EmptyCommand::producesPiEvent() const { return false; }
1802 
1803 void MemCpyCommandHost::printDot(std::ostream &Stream) const {
1804  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#B6A2EB\", label=\"";
1805 
1806  Stream << "ID = " << this << "\\n";
1807  Stream << "MEMCPY HOST ON " << deviceToString(MQueue->get_device()) << "\\n";
1808 
1809  Stream << "\"];" << std::endl;
1810 
1811  for (const auto &Dep : MDeps) {
1812  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1813  << " [ label = \"Access mode: "
1814  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1815  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1816  << std::endl;
1817  }
1818 }
1819 
1820 UpdateHostRequirementCommand::UpdateHostRequirementCommand(
1821  QueueImplPtr Queue, Requirement Req, AllocaCommandBase *SrcAllocaCmd,
1822  void **DstPtr)
1823  : Command(CommandType::UPDATE_REQUIREMENT, std::move(Queue)),
1824  MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(Req)), MDstPtr(DstPtr) {
1825 
1826  emitInstrumentationDataProxy();
1827 }
1828 
1829 void UpdateHostRequirementCommand::emitInstrumentationData() {
1830 #ifdef XPTI_ENABLE_INSTRUMENTATION
1831  if (!xptiCheckTraceEnabled(MStreamID))
1832  return;
1833  // Create a payload with the command name and an event using this payload to
1834  // emit a node_create
1835  MAddress = MSrcAllocaCmd->getSYCLMemObj();
1836  makeTraceEventProlog(MAddress);
1837 
1838  if (MFirstInstance) {
1839  xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1840  xpti::addMetadata(CmdTraceEvent, "sycl_device",
1841  deviceToID(MQueue->get_device()));
1842  xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
1843  deviceToString(MQueue->get_device()));
1844  xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
1845  getSyclObjImpl(MQueue->get_device())->getDeviceName());
1846  xpti::addMetadata(CmdTraceEvent, "memory_object",
1847  reinterpret_cast<size_t>(MAddress));
1848  // Since we do NOT add the queue_id value to the metadata, we stash it in
1849  // TLS, as this data is mutable and the metadata is supposed to be invariant
1850  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
1851  MQueue->getQueueID());
1852    makeTraceEventEpilog();
1853  }
1854 #endif
1855 }
1856 
1857 static std::string_view cgTypeToString(detail::CG::CGTYPE Type) {
1858  switch (Type) {
1859  case detail::CG::Kernel:
1860  return "Kernel";
1861  break;
1862  case detail::CG::UpdateHost:
1863  return "update_host";
1864  break;
1865  case detail::CG::Fill:
1866  return "fill";
1867  break;
1868  case detail::CG::CopyAccToAcc:
1869  return "copy acc to acc";
1870  break;
1871  case detail::CG::CopyAccToPtr:
1872  return "copy acc to ptr";
1873  break;
1874  case detail::CG::CopyPtrToAcc:
1875  return "copy ptr to acc";
1876  break;
1877  case detail::CG::Barrier:
1878  return "barrier";
1879  case detail::CG::BarrierWaitlist:
1880  return "barrier waitlist";
1881  case detail::CG::CopyUSM:
1882  return "copy usm";
1883  break;
1884  case detail::CG::FillUSM:
1885  return "fill usm";
1886  break;
1887  case detail::CG::PrefetchUSM:
1888  return "prefetch usm";
1889  break;
1890  case detail::CG::CodeplayHostTask:
1891  return "host task";
1892  break;
1893  case detail::CG::Copy2DUSM:
1894  return "copy 2d usm";
1895  break;
1896  case detail::CG::Fill2DUSM:
1897  return "fill 2d usm";
1898  break;
1899  case detail::CG::AdviseUSM:
1900  return "advise usm";
1901  case detail::CG::Memset2DUSM:
1902  return "memset 2d usm";
1903  break;
1904  case detail::CG::CopyToDeviceGlobal:
1905  return "copy to device_global";
1906  break;
1907  case detail::CG::CopyFromDeviceGlobal:
1908  return "copy from device_global";
1909  break;
1910  case detail::CG::ReadWriteHostPipe:
1911  return "read_write host pipe";
1912  case detail::CG::ExecCommandBuffer:
1913  return "exec command buffer";
1914  case detail::CG::CopyImage:
1915  return "copy image";
1916  case detail::CG::SemaphoreWait:
1917  return "semaphore wait";
1918  case detail::CG::SemaphoreSignal:
1919  return "semaphore signal";
1920  default:
1921  return "unknown";
1922  break;
1923  }
1924 }
1925 
1926 ExecCGCommand::ExecCGCommand(
1927  std::unique_ptr<detail::CG> CommandGroup, QueueImplPtr Queue,
1928  sycl::detail::pi::PiExtCommandBuffer CommandBuffer,
1929  const std::vector<sycl::detail::pi::PiExtSyncPoint> &Dependencies)
1930  : Command(CommandType::RUN_CG, std::move(Queue), CommandBuffer,
1931  Dependencies),
1932  MCommandGroup(std::move(CommandGroup)) {
1933  if (MCommandGroup->getType() == detail::CG::CodeplayHostTask) {
1934  MEvent->setSubmittedQueue(
1935  static_cast<detail::CGHostTask *>(MCommandGroup.get())->MQueue);
1936  }
1937 
1938  emitInstrumentationDataProxy();
1939 }
1940 
1941 #ifdef XPTI_ENABLE_INSTRUMENTATION
1942 std::string instrumentationGetKernelName(
1943  const std::shared_ptr<detail::kernel_impl> &SyclKernel,
1944  const std::string &FunctionName, const std::string &SyclKernelName,
1945  void *&Address, std::optional<bool> &FromSource) {
1946  std::string KernelName;
1947  if (SyclKernel && SyclKernel->isCreatedFromSource()) {
1948  FromSource = true;
1949  pi_kernel KernelHandle = SyclKernel->getHandleRef();
1950  Address = KernelHandle;
1951  KernelName = FunctionName;
1952  } else {
1953  FromSource = false;
1954  KernelName = demangleKernelName(SyclKernelName);
1955  }
1956  return KernelName;
1957 }
1958 
1959 void instrumentationAddExtraKernelMetadata(
1960  xpti_td *&CmdTraceEvent, const NDRDescT &NDRDesc,
1961  const std::shared_ptr<detail::kernel_bundle_impl> &KernelBundleImplPtr,
1962  const std::string &KernelName,
1963  const std::shared_ptr<detail::kernel_impl> &SyclKernel,
1964  const QueueImplPtr &Queue,
1965  std::vector<ArgDesc> &CGArgs) // CGArgs are not const since they could be
1966  // sorted in this function
1967 {
1968  std::vector<ArgDesc> Args;
1969 
1970  auto FilterArgs = [&Args](detail::ArgDesc &Arg, int NextTrueIndex) {
1971  Args.push_back({Arg.MType, Arg.MPtr, Arg.MSize, NextTrueIndex});
1972  };
1973  sycl::detail::pi::PiProgram Program = nullptr;
1974  sycl::detail::pi::PiKernel Kernel = nullptr;
1975  std::mutex *KernelMutex = nullptr;
1976  const KernelArgMask *EliminatedArgMask = nullptr;
1977 
1978  std::shared_ptr<kernel_impl> SyclKernelImpl;
1979  std::shared_ptr<device_image_impl> DeviceImageImpl;
1980 
1981  // Use kernel_bundle if available unless it is interop.
1982  // Interop bundles can't be used in the first branch, because the
1983  // kernels in interop kernel bundles (if any) do not have kernel_id and
1984  // can therefore not be looked up, but since they are self-contained
1985  // they can simply be launched directly.
1986  if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) {
1987  kernel_id KernelID =
1988  detail::ProgramManager::getInstance().getSYCLKernelID(KernelName);
1989  kernel SyclKernel =
1990  KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr);
1991  std::shared_ptr<kernel_impl> KernelImpl =
1992  detail::getSyclObjImpl(SyclKernel);
1993 
1994  EliminatedArgMask = KernelImpl->getKernelArgMask();
1995  Program = KernelImpl->getDeviceImage()->get_program_ref();
1996  } else if (nullptr != SyclKernel) {
1997  Program = SyclKernel->getProgramRef();
1998  if (!SyclKernel->isCreatedFromSource())
1999  EliminatedArgMask = SyclKernel->getKernelArgMask();
2000  } else {
2001  std::tie(Kernel, KernelMutex, EliminatedArgMask, Program) =
2002  detail::ProgramManager::getInstance().getOrCreateKernel(
2003  Queue->getContextImplPtr(), Queue->getDeviceImplPtr(), KernelName);
2004  }
2005 
2006  applyFuncOnFilteredArgs(EliminatedArgMask, CGArgs, FilterArgs);
2007 
2008  xpti::offload_kernel_enqueue_data_t KernelData{
2009  {NDRDesc.GlobalSize[0], NDRDesc.GlobalSize[1], NDRDesc.GlobalSize[2]},
2010  {NDRDesc.LocalSize[0], NDRDesc.LocalSize[1], NDRDesc.LocalSize[2]},
2011  {NDRDesc.GlobalOffset[0], NDRDesc.GlobalOffset[1],
2012  NDRDesc.GlobalOffset[2]},
2013  Args.size()};
2014  xpti::addMetadata(CmdTraceEvent, "enqueue_kernel_data", KernelData);
2015  for (size_t i = 0; i < Args.size(); i++) {
2016  std::string Prefix("arg");
2017  xpti::offload_kernel_arg_data_t arg{(int)Args[i].MType, Args[i].MPtr,
2018  Args[i].MSize, Args[i].MIndex};
2019  xpti::addMetadata(CmdTraceEvent, Prefix + std::to_string(i), arg);
2020  }
2021 }
2022 
2023 void instrumentationFillCommonData(const std::string &KernelName,
2024  const std::string &FileName, uint64_t Line,
2025  uint64_t Column, const void *const Address,
2026  const QueueImplPtr &Queue,
2027  std::optional<bool> &FromSource,
2028  uint64_t &OutInstanceID,
2029  xpti_td *&OutTraceEvent) {
2030  // Get source file, line number information from the CommandGroup object
2031  // and create payload using name, address, and source info
2032  //
2033  // On Windows, since support for the builtin functions is not available in
2034  // MSVC, MFileName and MLine will be set to nullptr and 0, respectively.
2035  // Handle this condition explicitly here.
2036  bool HasSourceInfo = false;
2037  xpti::payload_t Payload;
2038  if (!FileName.empty()) {
2039  // File name has a valid string
2040  Payload = xpti::payload_t(KernelName.c_str(), FileName.c_str(), Line,
2041  Column, Address);
2042  HasSourceInfo = true;
2043  } else if (Address) {
2044  // We have a valid function name and an address
2045  Payload = xpti::payload_t(KernelName.c_str(), Address);
2046  } else {
2047  // In any case, we will have a valid function name and we'll use that to
2048  // create the hash
2049  Payload = xpti::payload_t(KernelName.c_str());
2050  }
2051  uint64_t CGKernelInstanceNo;
2052  // Create event using the payload
2053  xpti_td *CmdTraceEvent =
2054  xptiMakeEvent("ExecCG", &Payload, xpti::trace_graph_event,
2055  xpti::trace_activity_type_t::active, &CGKernelInstanceNo);
2056  if (CmdTraceEvent) {
2057  OutInstanceID = CGKernelInstanceNo;
2058  OutTraceEvent = CmdTraceEvent;
2059  // If we are seeing this event again, then the instance ID will be greater
2060  // than 1; in this case, we will skip sending a notification to create a
2061  // node as this node has already been created.
2062  if (CGKernelInstanceNo > 1)
2063  return;
2064 
2065  xpti::addMetadata(CmdTraceEvent, "sycl_device",
2066  deviceToID(Queue->get_device()));
2067  xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
2068  deviceToString(Queue->get_device()));
2069  xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
2070  getSyclObjImpl(Queue->get_device())->getDeviceName());
2071  if (!KernelName.empty()) {
2072  xpti::addMetadata(CmdTraceEvent, "kernel_name", KernelName);
2073  }
2074  if (FromSource.has_value()) {
2075  xpti::addMetadata(CmdTraceEvent, "from_source", FromSource.value());
2076  }
2077  if (HasSourceInfo) {
2078  xpti::addMetadata(CmdTraceEvent, "sym_function_name", KernelName);
2079  xpti::addMetadata(CmdTraceEvent, "sym_source_file_name", FileName);
2080  xpti::addMetadata(CmdTraceEvent, "sym_line_no", static_cast<int>(Line));
2081  xpti::addMetadata(CmdTraceEvent, "sym_column_no",
2082  static_cast<int>(Column));
2083  }
2084  // We no longer set the 'queue_id' in the metadata structure, as it is a
2085  // mutable value and multiple threads using the same queue created at the
2086  // same location would overwrite the metadata values, creating inconsistencies.
2087  }
2088 }
2089 #endif
2090 
2091 #ifdef XPTI_ENABLE_INSTRUMENTATION
2092 std::pair<xpti_td *, uint64_t> emitKernelInstrumentationData(
2093  int32_t StreamID, const std::shared_ptr<detail::kernel_impl> &SyclKernel,
2094  const detail::code_location &CodeLoc, const std::string &SyclKernelName,
2095  const QueueImplPtr &Queue, const NDRDescT &NDRDesc,
2096  const std::shared_ptr<detail::kernel_bundle_impl> &KernelBundleImplPtr,
2097  std::vector<ArgDesc> &CGArgs) {
2098 
2099  auto XptiObjects = std::make_pair<xpti_td *, uint64_t>(nullptr, -1);
2100  constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
2101  if (!xptiCheckTraceEnabled(StreamID))
2102  return XptiObjects;
2103 
2104  void *Address = nullptr;
2105  std::optional<bool> FromSource;
2106  std::string KernelName = instrumentationGetKernelName(
2107  SyclKernel, std::string(CodeLoc.functionName()), SyclKernelName, Address,
2108  FromSource);
2109 
2110  auto &[CmdTraceEvent, InstanceID] = XptiObjects;
2111 
2112  std::string FileName =
2113  CodeLoc.fileName() ? CodeLoc.fileName() : std::string();
2114  instrumentationFillCommonData(KernelName, FileName, CodeLoc.lineNumber(),
2115  CodeLoc.columnNumber(), Address, Queue,
2116  FromSource, InstanceID, CmdTraceEvent);
2117 
2118  if (CmdTraceEvent) {
2119  // Stash the queue_id mutable metadata in TLS
2120  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
2121  Queue->getQueueID());
2122 
2123  instrumentationAddExtraKernelMetadata(CmdTraceEvent, NDRDesc,
2124  KernelBundleImplPtr, SyclKernelName,
2125  SyclKernel, Queue, CGArgs);
2126 
2127  xptiNotifySubscribers(
2128  StreamID, NotificationTraceType, detail::GSYCLGraphEvent, CmdTraceEvent,
2129  InstanceID,
2130  static_cast<const void *>(
2131  commandToNodeType(Command::CommandType::RUN_CG).c_str()));
2132  }
2133 
2134  return XptiObjects;
2135 }
2136 #endif
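// A tool can observe the node_create notifications emitted above by
// registering an XPTI subscriber. A minimal sketch, assuming the standard
// XPTI framework entry points (error handling omitted):
//
//   XPTI_CALLBACK_API void xptiTraceInit(unsigned int /*Major*/,
//                                        unsigned int /*Minor*/,
//                                        const char * /*Version*/,
//                                        const char *StreamName) {
//     uint8_t StreamID = xptiRegisterStream(StreamName);
//     xptiRegisterCallback(
//         StreamID, xpti::trace_node_create,
//         [](uint16_t /*Type*/, xpti::trace_event_data_t * /*Parent*/,
//            xpti::trace_event_data_t *Event, uint64_t /*InstanceID*/,
//            const void * /*UserData*/) {
//           // Event carries the kernel metadata added above (sycl_device,
//           // kernel_name, enqueue_kernel_data, ...).
//         });
//   }
//   XPTI_CALLBACK_API void xptiTraceFinish(const char * /*StreamName*/) {}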
2137 
2138 void ExecCGCommand::emitInstrumentationData() {
2139 #ifdef XPTI_ENABLE_INSTRUMENTATION
2140  constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
2141  if (!xptiCheckTraceEnabled(MStreamID))
2142  return;
2143 
2144  std::string KernelName;
2145  std::optional<bool> FromSource;
2146  switch (MCommandGroup->getType()) {
2147  case detail::CG::Kernel: {
2148  auto KernelCG =
2149  reinterpret_cast<detail::CGExecKernel *>(MCommandGroup.get());
2150  KernelName = instrumentationGetKernelName(
2151  KernelCG->MSyclKernel, MCommandGroup->MFunctionName,
2152  KernelCG->getKernelName(), MAddress, FromSource);
2153  } break;
2154  default:
2155  KernelName = getTypeString();
2156  break;
2157  }
2158 
2159  xpti_td *CmdTraceEvent = nullptr;
2160  instrumentationFillCommonData(KernelName, MCommandGroup->MFileName,
2161  MCommandGroup->MLine, MCommandGroup->MColumn,
2162  MAddress, MQueue, FromSource, MInstanceID,
2163  CmdTraceEvent);
2164 
2165  if (CmdTraceEvent) {
2166  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
2167  MQueue->getQueueID());
2168  MTraceEvent = static_cast<void *>(CmdTraceEvent);
2169  if (MCommandGroup->getType() == detail::CG::Kernel) {
2170  auto KernelCG =
2171  reinterpret_cast<detail::CGExecKernel *>(MCommandGroup.get());
2172  instrumentationAddExtraKernelMetadata(
2173  CmdTraceEvent, KernelCG->MNDRDesc, KernelCG->getKernelBundle(),
2174  KernelCG->MKernelName, KernelCG->MSyclKernel, MQueue,
2175  KernelCG->MArgs);
2176  }
2177 
2178  xptiNotifySubscribers(
2179  MStreamID, NotificationTraceType, detail::GSYCLGraphEvent,
2180  CmdTraceEvent, MInstanceID,
2181  static_cast<const void *>(commandToNodeType(MType).c_str()));
2182  }
2183 #endif
2184 }
2185 
2186 void ExecCGCommand::printDot(std::ostream &Stream) const {
2187  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\"";
2188 
2189  Stream << "ID = " << this << "\\n";
2190  Stream << "EXEC CG ON " << deviceToString(MQueue->get_device()) << "\\n";
2191 
2192  switch (MCommandGroup->getType()) {
2193  case detail::CG::Kernel: {
2194  auto KernelCG =
2195  reinterpret_cast<detail::CGExecKernel *>(MCommandGroup.get());
2196  Stream << "Kernel name: ";
2197  if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource())
2198  Stream << "created from source";
2199  else
2200  Stream << demangleKernelName(KernelCG->getKernelName());
2201  Stream << "\\n";
2202  break;
2203  }
2204  default:
2205  Stream << "CG type: " << getTypeString() << "\\n";
2206  break;
2207  }
2208 
2209  Stream << "\"];" << std::endl;
2210 
2211  for (const auto &Dep : MDeps) {
2212  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
2213  << " [ label = \"Access mode: "
2214  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
2215  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
2216  << std::endl;
2217  }
2218 }
2219 
2220 std::string_view ExecCGCommand::getTypeString() const {
2221  return cgTypeToString(MCommandGroup->getType());
2222 }
2223 
2224 // SYCL has a parallel_for_work_group variant where the only NDRange
2225 // characteristic set by a user is the number of work-groups. This does not
2226 // map to the OpenCL clEnqueueNDRangeKernel API, which requires the global
2227 // work size to be set as well. This function determines the local work size
2228 // based on the device characteristics and the number of work-groups requested
2229 // by the user, then calculates the global work size. SYCL specification
2230 // (from 4.8.5.3): The member function handler::parallel_for_work_group is
2231 // parameterized by the number of work-groups, such that the size of each
2232 // group is chosen by the runtime, or by the number of work-groups and number
2233 // of work-items for users who need more control.
2234 static void adjustNDRangePerKernel(NDRDescT &NDR,
2235  sycl::detail::pi::PiKernel Kernel,
2236  const device_impl &DeviceImpl) {
2237  if (NDR.GlobalSize[0] != 0)
2238  return; // GlobalSize is set - no need to adjust
2239  // check the prerequisites:
2240  assert(NDR.LocalSize[0] == 0);
2241  // TODO might be good to cache this info together with the kernel info to
2242  // avoid get_kernel_work_group_info on every kernel run
2243  sycl::range<3> WGSize = get_kernel_device_specific_info<
2244  sycl::info::kernel_device_specific::compile_work_group_size>(
2245  Kernel, DeviceImpl.getHandleRef(), DeviceImpl.getPlugin());
2246 
2247  if (WGSize[0] == 0) {
2248  WGSize = {1, 1, 1};
2249  }
2250  NDR.set(NDR.Dims, nd_range<3>(NDR.NumWorkGroups * WGSize, WGSize));
2251 }
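// User code that reaches this adjustment specifies only the number of
// work-groups, e.g. (sketch; standard SYCL 2020 API):
//
//   Queue.submit([&](sycl::handler &CGH) {
//     CGH.parallel_for_work_group(sycl::range<1>{8}, [=](sycl::group<1> G) {
//       // The work-group size is chosen by the runtime as implemented above.
//     });
//   });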
2252 
2253 // We have the following mapping between dimensions with SPIR-V builtins:
2254 // 1D: id[0] -> x
2255 // 2D: id[0] -> y, id[1] -> x
2256 // 3D: id[0] -> z, id[1] -> y, id[2] -> x
2257 // So in order to ensure the correctness we update all the kernel
2258 // parameters accordingly.
2259 // Initially we keep the order of the NDRDescT as provided by the user; this
2260 // simplifies overall handling, and we do the reversal only when the kernel
2261 // is enqueued.
2262 static void ReverseRangeDimensionsForKernel(NDRDescT &NDR) {
2263  if (NDR.Dims > 1) {
2264  std::swap(NDR.GlobalSize[0], NDR.GlobalSize[NDR.Dims - 1]);
2265  std::swap(NDR.LocalSize[0], NDR.LocalSize[NDR.Dims - 1]);
2266  std::swap(NDR.GlobalOffset[0], NDR.GlobalOffset[NDR.Dims - 1]);
2267  }
2268 }
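// Worked example: a 3D kernel submitted with GlobalSize = {GZ, GY, GX} in
// user order has elements 0 and 2 swapped by the function above, yielding
// {GX, GY, GZ}, so that the innermost user dimension lands on the SPIR-V x
// builtin. For Dims == 2, elements 0 and 1 are swapped; for Dims == 1,
// nothing changes.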
2269 
2270 static pi_mem_obj_access AccessModeToPi(access::mode AccessorMode) {
2271  switch (AccessorMode) {
2272  case access::mode::read:
2273  return PI_ACCESS_READ_ONLY;
2274  case access::mode::write:
2275  case access::mode::discard_write:
2276  return PI_ACCESS_WRITE_ONLY;
2277  default:
2278  return PI_ACCESS_READ_WRITE;
2279  }
2280 }
2281 
2282 void SetArgBasedOnType(
2283  const PluginPtr &Plugin, sycl::detail::pi::PiKernel Kernel,
2284  const std::shared_ptr<device_image_impl> &DeviceImageImpl,
2285  const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
2286  const sycl::context &Context, bool IsHost, detail::ArgDesc &Arg,
2287  size_t NextTrueIndex) {
2288  switch (Arg.MType) {
2289  case kernel_param_kind_t::kind_stream:
2290  break;
2291  case kernel_param_kind_t::kind_accessor: {
2292  Requirement *Req = (Requirement *)(Arg.MPtr);
2293 
2294  // getMemAllocationFunc is nullptr when there are no requirements. However,
2295  // we may pass default-constructed accessors to a command, which don't add
2296  // requirements. In such a case getMemAllocationFunc is nullptr, but it is
2297  // still a valid case, so we need to handle it properly.
2298  sycl::detail::pi::PiMem MemArg =
2299  getMemAllocationFunc
2300  ? (sycl::detail::pi::PiMem)getMemAllocationFunc(Req)
2301  : nullptr;
2302  if (Context.get_backend() == backend::opencl) {
2303  // clSetKernelArg (corresponding to piKernelSetArg) returns an error
2304  // when MemArg is null, which is the case when zero-sized buffers are
2305  // handled. Below assignment provides later call to clSetKernelArg with
2306  // acceptable arguments.
2307  if (!MemArg)
2308  MemArg = sycl::detail::pi::PiMem();
2309 
2310  Plugin->call<PiApiKind::piKernelSetArg>(
2311  Kernel, NextTrueIndex, sizeof(sycl::detail::pi::PiMem), &MemArg);
2312  } else {
2313  pi_mem_obj_property MemObjData{};
2314  MemObjData.mem_access = AccessModeToPi(Req->MAccessMode);
2315  MemObjData.type = PI_KERNEL_ARG_MEM_OBJ_ACCESS;
2316  Plugin->call<PiApiKind::piextKernelSetArgMemObj>(Kernel, NextTrueIndex,
2317  &MemObjData, &MemArg);
2318  }
2319  break;
2320  }
2321  case kernel_param_kind_t::kind_std_layout: {
2322  Plugin->call<PiApiKind::piKernelSetArg>(Kernel, NextTrueIndex, Arg.MSize,
2323  Arg.MPtr);
2324  break;
2325  }
2326  case kernel_param_kind_t::kind_sampler: {
2327  sampler *SamplerPtr = (sampler *)Arg.MPtr;
2328  sycl::detail::pi::PiSampler Sampler =
2329  detail::getSyclObjImpl(*SamplerPtr)->getOrCreateSampler(Context);
2330  Plugin->call<PiApiKind::piextKernelSetArgSampler>(Kernel, NextTrueIndex,
2331  &Sampler);
2332  break;
2333  }
2334  case kernel_param_kind_t::kind_pointer: {
2335  Plugin->call<PiApiKind::piextKernelSetArgPointer>(Kernel, NextTrueIndex,
2336  Arg.MSize, Arg.MPtr);
2337  break;
2338  }
2339  case kernel_param_kind_t::kind_specialization_constants_buffer: {
2340  if (IsHost) {
2341  throw sycl::exception(
2342  sycl::make_error_code(sycl::errc::feature_not_supported),
2343  "SYCL2020 specialization constants are not yet supported on host "
2344  "device " +
2345  codeToString(PI_ERROR_INVALID_OPERATION));
2346  }
2347  assert(DeviceImageImpl != nullptr);
2348  sycl::detail::pi::PiMem SpecConstsBuffer =
2349  DeviceImageImpl->get_spec_const_buffer_ref();
2350  // Avoid taking an address of nullptr
2351  sycl::detail::pi::PiMem *SpecConstsBufferArg =
2352  SpecConstsBuffer ? &SpecConstsBuffer : nullptr;
2353 
2354  pi_mem_obj_property MemObjData{};
2355  MemObjData.mem_access = PI_ACCESS_READ_ONLY;
2356  MemObjData.type = PI_KERNEL_ARG_MEM_OBJ_ACCESS;
2357  Plugin->call<PiApiKind::piextKernelSetArgMemObj>(
2358  Kernel, NextTrueIndex, &MemObjData, SpecConstsBufferArg);
2359  break;
2360  }
2361  case kernel_param_kind_t::kind_invalid:
2362  throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
2363  "Invalid kernel param kind " +
2364  codeToString(PI_ERROR_INVALID_VALUE));
2365  break;
2366  }
2367 }
2368 
2369 static pi_result SetKernelParamsAndLaunch(
2370  const QueueImplPtr &Queue, std::vector<ArgDesc> &Args,
2371  const std::shared_ptr<device_image_impl> &DeviceImageImpl,
2372  sycl::detail::pi::PiKernel Kernel, NDRDescT &NDRDesc,
2373  std::vector<sycl::detail::pi::PiEvent> &RawEvents,
2374  const detail::EventImplPtr &OutEventImpl,
2375  const KernelArgMask *EliminatedArgMask,
2376  const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
2377  bool IsCooperative) {
2378  const PluginPtr &Plugin = Queue->getPlugin();
2379 
2380  auto setFunc = [&Plugin, Kernel, &DeviceImageImpl, &getMemAllocationFunc,
2381  &Queue](detail::ArgDesc &Arg, size_t NextTrueIndex) {
2382  SetArgBasedOnType(Plugin, Kernel, DeviceImageImpl, getMemAllocationFunc,
2383  Queue->get_context(), Queue->is_host(), Arg,
2384  NextTrueIndex);
2385  };
2386 
2387  applyFuncOnFilteredArgs(EliminatedArgMask, Args, setFunc);
2388 
2389  adjustNDRangePerKernel(NDRDesc, Kernel, *(Queue->getDeviceImplPtr()));
2390 
2391  // Remember this information before the range dimensions are reversed
2392  const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0);
2393 
2394  ReverseRangeDimensionsForKernel(NDRDesc);
2395 
2396  size_t RequiredWGSize[3] = {0, 0, 0};
2397  size_t *LocalSize = nullptr;
2398 
2399  if (HasLocalSize)
2400  LocalSize = &NDRDesc.LocalSize[0];
2401  else {
2402  Plugin->call<PiApiKind::piKernelGetGroupInfo>(
2403  Kernel, Queue->getDeviceImplPtr()->getHandleRef(),
2404  PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, sizeof(RequiredWGSize),
2405  RequiredWGSize, /* param_value_size_ret = */ nullptr);
2406 
2407  const bool EnforcedLocalSize =
2408  (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
2409  RequiredWGSize[2] != 0);
2410  if (EnforcedLocalSize)
2411  LocalSize = RequiredWGSize;
2412  }
2413  if (OutEventImpl != nullptr)
2414  OutEventImpl->setHostEnqueueTime();
2415  pi_result Error =
2416  [&](auto... Args) {
2417  if (IsCooperative) {
2418  return Plugin
2419  ->call_nocheck<PiApiKind::piextEnqueueCooperativeKernelLaunch>(
2420  Args...);
2421  }
2422  return Plugin->call_nocheck<PiApiKind::piEnqueueKernelLaunch>(Args...);
2423  }(Queue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
2424  &NDRDesc.GlobalSize[0], LocalSize, RawEvents.size(),
2425  RawEvents.empty() ? nullptr : &RawEvents[0],
2426  OutEventImpl ? &OutEventImpl->getHandleRef() : nullptr);
2427  return Error;
2428 }
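// The compile-time work-group size consulted above is non-zero when the
// kernel declares a required size, e.g. (user-side sketch; standard SYCL
// 2020 attribute):
//
//   CGH.parallel_for(
//       sycl::nd_range<1>{sycl::range<1>{64}, sycl::range<1>{16}},
//       [=](sycl::nd_item<1> It) [[sycl::reqd_work_group_size(16)]] {
//         // The runtime then enforces a local size of 16, as above.
//       });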
2429 
2430 // This function initializes the accessors and calls the lambda.
2431 void DispatchNativeKernel(void *Blob) {
2432  void **CastedBlob = (void **)Blob;
2433 
2434  std::vector<Requirement *> *Reqs =
2435  static_cast<std::vector<Requirement *> *>(CastedBlob[0]);
2436 
2437  std::shared_ptr<HostKernelBase> *HostKernel =
2438  static_cast<std::shared_ptr<HostKernelBase> *>(CastedBlob[1]);
2439 
2440  NDRDescT *NDRDesc = static_cast<NDRDescT *>(CastedBlob[2]);
2441 
2442  // The remaining values are pointers to the buffers.
2443  void **NextArg = CastedBlob + 3;
2444  for (detail::Requirement *Req : *Reqs)
2445  Req->MData = *(NextArg++);
2446 
2447  (*HostKernel)->call(*NDRDesc, nullptr);
2448 
2449  // The ownership of these objects has been passed to us, so we need to clean up
2450  delete Reqs;
2451  delete HostKernel;
2452  delete NDRDesc;
2453 }
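// The blob consumed above is expected to be packed by the enqueueing side
// as follows (sketch inferred from the unpacking code; names hypothetical):
//
//   std::vector<void *> Blob(3 + NumReqs);
//   Blob[0] = new std::vector<Requirement *>(Reqs);        // deleted above
//   Blob[1] = new std::shared_ptr<HostKernelBase>(Kernel); // deleted above
//   Blob[2] = new NDRDescT(NDRDesc);                       // deleted above
//   for (size_t I = 0; I < NumReqs; ++I)
//     Blob[3 + I] = MemAllocations[I]; // raw pointers to the buffers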
2454 
2455 pi_int32 enqueueImpCommandBufferKernel(
2456  context Ctx, DeviceImplPtr DeviceImpl,
2457  sycl::detail::pi::PiExtCommandBuffer CommandBuffer,
2458  const CGExecKernel &CommandGroup,
2459  std::vector<sycl::detail::pi::PiExtSyncPoint> &SyncPoints,
2460  sycl::detail::pi::PiExtSyncPoint *OutSyncPoint,
2461  sycl::detail::pi::PiExtCommandBufferCommand *OutCommand,
2462  const std::function<void *(Requirement *Req)> &getMemAllocationFunc) {
2463  auto ContextImpl = sycl::detail::getSyclObjImpl(Ctx);
2464  const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin();
2465  pi_kernel PiKernel = nullptr;
2466  pi_program PiProgram = nullptr;
2467  std::shared_ptr<kernel_impl> SyclKernelImpl = nullptr;
2468  std::shared_ptr<device_image_impl> DeviceImageImpl = nullptr;
2469 
2470  auto Kernel = CommandGroup.MSyclKernel;
2471  auto KernelBundleImplPtr = CommandGroup.MKernelBundle;
2472  const KernelArgMask *EliminatedArgMask = nullptr;
2473 
2474  // Use kernel_bundle if available unless it is interop.
2475  // Interop bundles can't be used in the first branch, because the kernels
2476  // in interop kernel bundles (if any) do not have kernel_id
2477  // and can therefore not be looked up, but since they are self-contained
2478  // they can simply be launched directly.
2479  if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) {
2480  auto KernelName = CommandGroup.MKernelName;
2481  kernel_id KernelID =
2482  sycl::detail::ProgramManager::getInstance().getSYCLKernelID(KernelName);
2483  kernel SyclKernel =
2484  KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr);
2485  SyclKernelImpl = detail::getSyclObjImpl(SyclKernel);
2486  PiKernel = SyclKernelImpl->getHandleRef();
2487  DeviceImageImpl = SyclKernelImpl->getDeviceImage();
2488  PiProgram = DeviceImageImpl->get_program_ref();
2489  EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
2490  } else if (Kernel != nullptr) {
2491  PiKernel = Kernel->getHandleRef();
2492  PiProgram = Kernel->getProgramRef();
2493  EliminatedArgMask = Kernel->getKernelArgMask();
2494  } else {
2495  std::tie(PiKernel, std::ignore, EliminatedArgMask, PiProgram) =
2496  sycl::detail::ProgramManager::getInstance().getOrCreateKernel(
2497  ContextImpl, DeviceImpl, CommandGroup.MKernelName);
2498  }
2499 
2500  auto SetFunc = [&Plugin, &PiKernel, &DeviceImageImpl, &Ctx,
2501  &getMemAllocationFunc](sycl::detail::ArgDesc &Arg,
2502  size_t NextTrueIndex) {
2503  sycl::detail::SetArgBasedOnType(Plugin, PiKernel, DeviceImageImpl,
2504  getMemAllocationFunc, Ctx, false, Arg,
2505  NextTrueIndex);
2506  };
2507  // Copy args for modification
2508  auto Args = CommandGroup.MArgs;
2509  sycl::detail::applyFuncOnFilteredArgs(EliminatedArgMask, Args, SetFunc);
2510 
2511  // Remember this information before the range dimensions are reversed
2512  const bool HasLocalSize = (CommandGroup.MNDRDesc.LocalSize[0] != 0);
2513 
2514  // Copy NDRDesc for modification
2515  auto NDRDesc = CommandGroup.MNDRDesc;
2516  // Reverse kernel dims
2517  ReverseRangeDimensionsForKernel(NDRDesc);
2518 
2519  size_t RequiredWGSize[3] = {0, 0, 0};
2520  size_t *LocalSize = nullptr;
2521 
2522  if (HasLocalSize)
2523  LocalSize = &NDRDesc.LocalSize[0];
2524  else {
2525  Plugin->call<PiApiKind::piKernelGetGroupInfo>(
2526  PiKernel, DeviceImpl->getHandleRef(),
2527  PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, sizeof(RequiredWGSize),
2528  RequiredWGSize,
2529  /* param_value_size_ret = */ nullptr);
2530 
2531  const bool EnforcedLocalSize =
2532  (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
2533  RequiredWGSize[2] != 0);
2534  if (EnforcedLocalSize)
2535  LocalSize = RequiredWGSize;
2536  }
2537 
2538  pi_result Res = Plugin->call_nocheck<
2539  PiApiKind::piextCommandBufferNDRangeKernel>(
2540  CommandBuffer, PiKernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
2541  &NDRDesc.GlobalSize[0], LocalSize, SyncPoints.size(),
2542  SyncPoints.size() ? SyncPoints.data() : nullptr, OutSyncPoint,
2543  OutCommand);
2544 
2545  if (!SyclKernelImpl && !Kernel) {
2546  Plugin->call<PiApiKind::piKernelRelease>(PiKernel);
2547  Plugin->call<PiApiKind::piProgramRelease>(PiProgram);
2548  }
2549 
2550  if (Res != pi_result::PI_SUCCESS) {
2551  const device_impl &DeviceImplem = *(DeviceImpl);
2552  detail::enqueue_kernel_launch::handleErrorOrWarning(Res, DeviceImplem,
2553  PiKernel, NDRDesc);
2554  }
2555 
2556  return Res;
2557 }
2558 
2559 pi_int32 enqueueImpKernel(
2560  const QueueImplPtr &Queue, NDRDescT &NDRDesc, std::vector<ArgDesc> &Args,
2561  const std::shared_ptr<detail::kernel_bundle_impl> &KernelBundleImplPtr,
2562  const std::shared_ptr<detail::kernel_impl> &MSyclKernel,
2563  const std::string &KernelName,
2564  std::vector<sycl::detail::pi::PiEvent> &RawEvents,
2565  const detail::EventImplPtr &OutEventImpl,
2566  const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
2567  sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig,
2568  const bool KernelIsCooperative) {
2569 
2570  // Run OpenCL kernel
2571  auto ContextImpl = Queue->getContextImplPtr();
2572  auto DeviceImpl = Queue->getDeviceImplPtr();
2573  sycl::detail::pi::PiKernel Kernel = nullptr;
2574  std::mutex *KernelMutex = nullptr;
2575  sycl::detail::pi::PiProgram Program = nullptr;
2576  const KernelArgMask *EliminatedArgMask;
2577 
2578  std::shared_ptr<kernel_impl> SyclKernelImpl;
2579  std::shared_ptr<device_image_impl> DeviceImageImpl;
2580 
2581  // Use kernel_bundle if available unless it is interop.
2582  // Interop bundles can't be used in the first branch, because the kernels
2583  // in interop kernel bundles (if any) do not have kernel_id
2584  // and can therefore not be looked up, but since they are self-contained
2585  // they can simply be launched directly.
2586  if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) {
2587  kernel_id KernelID =
2588  detail::ProgramManager::getInstance().getSYCLKernelID(KernelName);
2589  kernel SyclKernel =
2590  KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr);
2591 
2592  SyclKernelImpl = detail::getSyclObjImpl(SyclKernel);
2593 
2594  Kernel = SyclKernelImpl->getHandleRef();
2595  DeviceImageImpl = SyclKernelImpl->getDeviceImage();
2596 
2597  Program = DeviceImageImpl->get_program_ref();
2598 
2599  EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
2600  KernelMutex = SyclKernelImpl->getCacheMutex();
2601  } else if (nullptr != MSyclKernel) {
2602  assert(MSyclKernel->get_info<info::kernel::context>() ==
2603  Queue->get_context());
2604  Kernel = MSyclKernel->getHandleRef();
2605  Program = MSyclKernel->getProgramRef();
2606 
2607  // Non-cacheable kernels use mutexes from kernel_impls.
2608  // TODO this can still result in a race condition if multiple SYCL
2609  // kernels are created with the same native handle. To address this,
2610  // we need to either store and use a pi_native_handle -> mutex map or
2611  // reuse and return existing SYCL kernels from make_native to avoid
2612  // their duplication in such cases.
2613  KernelMutex = &MSyclKernel->getNoncacheableEnqueueMutex();
2614  EliminatedArgMask = MSyclKernel->getKernelArgMask();
2615  } else {
2616  std::tie(Kernel, KernelMutex, EliminatedArgMask, Program) =
2617  detail::ProgramManager::getInstance().getOrCreateKernel(
2618  ContextImpl, DeviceImpl, KernelName, NDRDesc);
2619  }
2620 
2621  // We may need more events for the launch, so we make another reference.
2622  std::vector<sycl::detail::pi::PiEvent> &EventsWaitList = RawEvents;
2623 
2624  // Initialize device globals associated with this.
2625  std::vector<sycl::detail::pi::PiEvent> DeviceGlobalInitEvents =
2626  ContextImpl->initializeDeviceGlobals(Program, Queue);
2627  std::vector<sycl::detail::pi::PiEvent> EventsWithDeviceGlobalInits;
2628  if (!DeviceGlobalInitEvents.empty()) {
2629  EventsWithDeviceGlobalInits.reserve(RawEvents.size() +
2630  DeviceGlobalInitEvents.size());
2631  EventsWithDeviceGlobalInits.insert(EventsWithDeviceGlobalInits.end(),
2632  RawEvents.begin(), RawEvents.end());
2633  EventsWithDeviceGlobalInits.insert(EventsWithDeviceGlobalInits.end(),
2634  DeviceGlobalInitEvents.begin(),
2635  DeviceGlobalInitEvents.end());
2636  EventsWaitList = EventsWithDeviceGlobalInits;
2637  }
2638 
2639  pi_result Error = PI_SUCCESS;
2640  {
2641  // When KernelMutex is null, this means that in-memory caching is
2642  // disabled, which means that the kernel object is not shared, so no
2643  // locking is necessary.
2644  using LockT = std::unique_lock<std::mutex>;
2645  auto Lock = KernelMutex ? LockT(*KernelMutex) : LockT();
2646 
2647  // Set SLM/Cache configuration for the kernel if non-default value is
2648  // provided.
2649  if (KernelCacheConfig == PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM ||
2650  KernelCacheConfig == PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA) {
2651  const PluginPtr &Plugin = Queue->getPlugin();
2652  Plugin->call<PiApiKind::piKernelSetExecInfo>(
2653  Kernel, PI_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG,
2654  sizeof(sycl::detail::pi::PiKernelCacheConfig), &KernelCacheConfig);
2655  }
2656 
2657  Error = SetKernelParamsAndLaunch(Queue, Args, DeviceImageImpl, Kernel,
2658  NDRDesc, EventsWaitList, OutEventImpl,
2659  EliminatedArgMask, getMemAllocationFunc,
2660  KernelIsCooperative);
2661 
2662  const PluginPtr &Plugin = Queue->getPlugin();
2663  if (!SyclKernelImpl && !MSyclKernel) {
2664  Plugin->call<PiApiKind::piKernelRelease>(Kernel);
2665  Plugin->call<PiApiKind::piProgramRelease>(Program);
2666  }
2667  }
2668  if (PI_SUCCESS != Error) {
2669  // If we have got a non-success error code, analyze it to emit a nice
2670  // exception explaining what was wrong
2671  const device_impl &DeviceImpl = *(Queue->getDeviceImplPtr());
2672  detail::enqueue_kernel_launch::handleErrorOrWarning(Error, DeviceImpl,
2673  Kernel, NDRDesc);
2674  }
2675 
2676  return PI_SUCCESS;
2677 }
2678 
2679 pi_int32
2680 enqueueReadWriteHostPipe(const QueueImplPtr &Queue, const std::string &PipeName,
2681  bool blocking, void *ptr, size_t size,
2682  std::vector<sycl::detail::pi::PiEvent> &RawEvents,
2683  const detail::EventImplPtr &OutEventImpl, bool read) {
2684  detail::HostPipeMapEntry *hostPipeEntry =
2685  ProgramManager::getInstance().getHostPipeEntry(PipeName);
2686 
2687  sycl::detail::pi::PiProgram Program = nullptr;
2688  device Device = Queue->get_device();
2689  ContextImplPtr ContextImpl = Queue->getContextImplPtr();
2690  std::optional<sycl::detail::pi::PiProgram> CachedProgram =
2691  ContextImpl->getProgramForHostPipe(Device, hostPipeEntry);
2692  if (CachedProgram)
2693  Program = *CachedProgram;
2694  else {
2695  // If there was no cached program, build one.
2696  device_image_plain devImgPlain =
2697  ProgramManager::getInstance().getDeviceImageFromBinaryImage(
2698  hostPipeEntry->getDevBinImage(), Queue->get_context(),
2699  Queue->get_device());
2700  device_image_plain BuiltImage =
2701  ProgramManager::getInstance().build(devImgPlain, {Device}, {});
2702  Program = getSyclObjImpl(BuiltImage)->get_program_ref();
2703  }
2704  assert(Program && "Program for this hostpipe is not compiled.");
2705 
2706  // Get the plugin for calling the backend functions
2707  const PluginPtr &Plugin = Queue->getPlugin();
2708 
2709  pi_queue pi_q = Queue->getHandleRef();
2710  pi_result Error;
2711 
2712  auto OutEvent = OutEventImpl ? &OutEventImpl->getHandleRef() : nullptr;
2713  if (OutEventImpl != nullptr)
2714  OutEventImpl->setHostEnqueueTime();
2715  if (read) {
2716  Error =
2717  Plugin->call_nocheck<PiApiKind::piextEnqueueReadHostPipe>(
2718  pi_q, Program, PipeName.c_str(), blocking, ptr, size,
2719  RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0],
2720  OutEvent);
2721  } else {
2722  Error =
2723  Plugin
2724  ->call_nocheck<PiApiKind::piextEnqueueWriteHostPipe>(
2725  pi_q, Program, PipeName.c_str(), blocking, ptr, size,
2726  RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0],
2727  OutEvent);
2728  }
2729  return Error;
2730 }
2731 
2732 pi_int32 ExecCGCommand::enqueueImpCommandBuffer() {
2733  // Wait on host command dependencies
2734  waitForPreparedHostEvents();
2735 
2736  // Any device dependencies need to be waited on here since subsequent
2737  // submissions of the command buffer itself will not receive dependencies on
2738  // them, e.g. initial copies from host to device
2739  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
2740  flushCrossQueueDeps(EventImpls, getWorkerQueue());
2741  std::vector<sycl::detail::pi::PiEvent> RawEvents = getPiEvents(EventImpls);
2742  if (!RawEvents.empty()) {
2743  const PluginPtr &Plugin = MQueue->getPlugin();
2744  Plugin->call<PiApiKind::piEventsWait>(RawEvents.size(), &RawEvents[0]);
2745  }
2746 
2747  sycl::detail::pi::PiEvent *Event =
2748  (MQueue->supportsDiscardingPiEvents() &&
2749  MCommandGroup->getRequirements().size() == 0)
2750  ? nullptr
2751  : &MEvent->getHandleRef();
2752  sycl::detail::pi::PiExtSyncPoint OutSyncPoint;
2753  sycl::detail::pi::PiExtCommandBufferCommand OutCommand = nullptr;
2754  switch (MCommandGroup->getType()) {
2755  case CG::CGTYPE::Kernel: {
2756  CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
2757 
2758  auto getMemAllocationFunc = [this](Requirement *Req) {
2759  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2760  return AllocaCmd->getMemAllocation();
2761  };
2762 
2763  if (!Event) {
2764  // The kernel only uses assert if it is a non-interop one
2765  bool KernelUsesAssert =
2766  !(ExecKernel->MSyclKernel && ExecKernel->MSyclKernel->isInterop()) &&
2767  ProgramManager::getInstance().kernelUsesAssert(
2768  ExecKernel->MKernelName);
2769  if (KernelUsesAssert) {
2770  Event = &MEvent->getHandleRef();
2771  }
2772  }
2773  auto result = enqueueImpCommandBufferKernel(
2774  MQueue->get_context(), MQueue->getDeviceImplPtr(), MCommandBuffer,
2775  *ExecKernel, MSyncPointDeps, &OutSyncPoint, &OutCommand,
2776  getMemAllocationFunc);
2777  MEvent->setSyncPoint(OutSyncPoint);
2778  MEvent->setCommandBufferCommand(OutCommand);
2779  return result;
2780  }
2781  case CG::CGTYPE::CopyUSM: {
2782  CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
2783  MemoryManager::ext_oneapi_copy_usm_cmd_buffer(
2784  MQueue->getContextImplPtr(), Copy->getSrc(), MCommandBuffer,
2785  Copy->getLength(), Copy->getDst(), MSyncPointDeps, &OutSyncPoint);
2786  MEvent->setSyncPoint(OutSyncPoint);
2787  return PI_SUCCESS;
2788  }
2789  case CG::CGTYPE::CopyAccToAcc: {
2790  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2791  Requirement *ReqSrc = (Requirement *)(Copy->getSrc());
2792  Requirement *ReqDst = (Requirement *)(Copy->getDst());
2793 
2794  AllocaCommandBase *AllocaCmdSrc = getAllocaForReq(ReqSrc);
2795  AllocaCommandBase *AllocaCmdDst = getAllocaForReq(ReqDst);
2796 
2797  MemoryManager::ext_oneapi_copyD2D_cmd_buffer(
2798  MQueue->getContextImplPtr(), MCommandBuffer,
2799  AllocaCmdSrc->getSYCLMemObj(), AllocaCmdSrc->getMemAllocation(),
2800  ReqSrc->MDims, ReqSrc->MMemoryRange, ReqSrc->MAccessRange,
2801  ReqSrc->MOffset, ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(),
2802  ReqDst->MDims, ReqDst->MMemoryRange, ReqDst->MAccessRange,
2803  ReqDst->MOffset, ReqDst->MElemSize, std::move(MSyncPointDeps),
2804  &OutSyncPoint);
2805  MEvent->setSyncPoint(OutSyncPoint);
2806  return PI_SUCCESS;
2807  }
2808  case CG::CGTYPE::CopyAccToPtr: {
2809  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2810  Requirement *Req = (Requirement *)Copy->getSrc();
2811  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2812 
2813  MemoryManager::ext_oneapi_copyD2H_cmd_buffer(
2814  MQueue->getContextImplPtr(), MCommandBuffer, AllocaCmd->getSYCLMemObj(),
2815  AllocaCmd->getMemAllocation(), Req->MDims, Req->MMemoryRange,
2816  Req->MAccessRange, Req->MOffset, Req->MElemSize, (char *)Copy->getDst(),
2817  Req->MDims, Req->MAccessRange,
2818  /*DstOffset=*/{0, 0, 0}, Req->MElemSize, std::move(MSyncPointDeps),
2819  &OutSyncPoint);
2820  MEvent->setSyncPoint(OutSyncPoint);
2821  return PI_SUCCESS;
2822  }
2823  case CG::CGTYPE::CopyPtrToAcc: {
2824  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2825  Requirement *Req = (Requirement *)(Copy->getDst());
2826  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2827 
2828  MemoryManager::ext_oneapi_copyH2D_cmd_buffer(
2829  MQueue->getContextImplPtr(), MCommandBuffer, AllocaCmd->getSYCLMemObj(),
2830  (char *)Copy->getSrc(), Req->MDims, Req->MAccessRange,
2831  /*SrcOffset*/ {0, 0, 0}, Req->MElemSize, AllocaCmd->getMemAllocation(),
2832  Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2833  Req->MElemSize, std::move(MSyncPointDeps), &OutSyncPoint);
2834  MEvent->setSyncPoint(OutSyncPoint);
2835  return PI_SUCCESS;
2836  }
2837  case CG::CGTYPE::Fill: {
2838  CGFill *Fill = (CGFill *)MCommandGroup.get();
2839  Requirement *Req = (Requirement *)(Fill->getReqToFill());
2840  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2841 
2842  MemoryManager::ext_oneapi_fill_cmd_buffer(
2843  MQueue->getContextImplPtr(), MCommandBuffer, AllocaCmd->getSYCLMemObj(),
2844  AllocaCmd->getMemAllocation(), Fill->MPattern.size(),
2845  Fill->MPattern.data(), Req->MDims, Req->MMemoryRange, Req->MAccessRange,
2846  Req->MOffset, Req->MElemSize, std::move(MSyncPointDeps), &OutSyncPoint);
2847  MEvent->setSyncPoint(OutSyncPoint);
2848  return PI_SUCCESS;
2849  }
2850  case CG::CGTYPE::FillUSM: {
2851  CGFillUSM *Fill = (CGFillUSM *)MCommandGroup.get();
2852  MemoryManager::ext_oneapi_fill_usm_cmd_buffer(
2853  MQueue->getContextImplPtr(), MCommandBuffer, Fill->getDst(),
2854  Fill->getLength(), Fill->getFill(), std::move(MSyncPointDeps),
2855  &OutSyncPoint);
2856  MEvent->setSyncPoint(OutSyncPoint);
2857  return PI_SUCCESS;
2858  }
2859  case CG::CGTYPE::PrefetchUSM: {
2860  CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get();
2861  MemoryManager::ext_oneapi_prefetch_usm_cmd_buffer(
2862  MQueue->getContextImplPtr(), MCommandBuffer, Prefetch->getDst(),
2863  Prefetch->getLength(), std::move(MSyncPointDeps), &OutSyncPoint);
2864  MEvent->setSyncPoint(OutSyncPoint);
2865  return PI_SUCCESS;
2866  }
2867  case CG::CGTYPE::AdviseUSM: {
2868  CGAdviseUSM *Advise = (CGAdviseUSM *)MCommandGroup.get();
2869  MemoryManager::ext_oneapi_advise_usm_cmd_buffer(
2870  MQueue->getContextImplPtr(), MCommandBuffer, Advise->getDst(),
2871  Advise->getLength(), Advise->getAdvice(), std::move(MSyncPointDeps),
2872  &OutSyncPoint);
2873  MEvent->setSyncPoint(OutSyncPoint);
2874  return PI_SUCCESS;
2875  }
2876 
2877  default:
2878  throw runtime_error("CG type not implemented for command buffers.",
2879  PI_ERROR_INVALID_OPERATION);
2880  }
2881 }
2882 
2883 pi_int32 ExecCGCommand::enqueueImp() {
2884  if (MCommandBuffer) {
2885  return enqueueImpCommandBuffer();
2886  } else {
2887  return enqueueImpQueue();
2888  }
2889 }
2890 
2891 pi_int32 ExecCGCommand::enqueueImpQueue() {
2892  if (getCG().getType() != CG::CGTYPE::CodeplayHostTask)
2893  waitForPreparedHostEvents();
2894  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
2895  auto RawEvents = getPiEvents(EventImpls);
2896  flushCrossQueueDeps(EventImpls, getWorkerQueue());
2897 
2898  bool DiscardPiEvent = (MQueue->supportsDiscardingPiEvents() &&
2899  MCommandGroup->getRequirements().size() == 0);
2900  sycl::detail::pi::PiEvent *Event =
2901  DiscardPiEvent ? nullptr : &MEvent->getHandleRef();
2902  detail::EventImplPtr EventImpl = DiscardPiEvent ? nullptr : MEvent;
2903 
2904  switch (MCommandGroup->getType()) {
2905 
2906  case CG::CGTYPE::UpdateHost: {
2907  throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
2908  "Update host should be handled by the Scheduler. " +
2909  codeToString(PI_ERROR_INVALID_VALUE));
2910  }
2911  case CG::CGTYPE::CopyAccToPtr: {
2912  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2913  Requirement *Req = (Requirement *)Copy->getSrc();
2914  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2915 
2916  MemoryManager::copy(
2917  AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(), MQueue,
2918  Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2919  Req->MElemSize, Copy->getDst(),
2920  Scheduler::getInstance().getDefaultHostQueue(), Req->MDims,
2921  Req->MAccessRange, Req->MAccessRange, /*DstOffset=*/{0, 0, 0},
2922  Req->MElemSize, std::move(RawEvents), MEvent->getHandleRef(), MEvent);
2923 
2924  return PI_SUCCESS;
2925  }
2926  case CG::CGTYPE::CopyPtrToAcc: {
2927  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2928  Requirement *Req = (Requirement *)(Copy->getDst());
2929  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2930 
2932 
2933  MemoryManager::copy(
2934  AllocaCmd->getSYCLMemObj(), Copy->getSrc(),
2935  Scheduler::getInstance().getDefaultHostQueue(), Req->MDims,
2936  Req->MAccessRange, Req->MAccessRange,
2937  /*SrcOffset*/ {0, 0, 0}, Req->MElemSize, AllocaCmd->getMemAllocation(),
2938  MQueue, Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2939  Req->MElemSize, std::move(RawEvents), MEvent->getHandleRef(), MEvent);
2940 
2941  return PI_SUCCESS;
2942  }
2943  case CG::CGTYPE::CopyAccToAcc: {
2944  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2945  Requirement *ReqSrc = (Requirement *)(Copy->getSrc());
2946  Requirement *ReqDst = (Requirement *)(Copy->getDst());
2947 
2948  AllocaCommandBase *AllocaCmdSrc = getAllocaForReq(ReqSrc);
2949  AllocaCommandBase *AllocaCmdDst = getAllocaForReq(ReqDst);
2950 
2951  MemoryManager::copy(
2952  AllocaCmdSrc->getSYCLMemObj(), AllocaCmdSrc->getMemAllocation(), MQueue,
2953  ReqSrc->MDims, ReqSrc->MMemoryRange, ReqSrc->MAccessRange,
2954  ReqSrc->MOffset, ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(),
2955  MQueue, ReqDst->MDims, ReqDst->MMemoryRange, ReqDst->MAccessRange,
2956  ReqDst->MOffset, ReqDst->MElemSize, std::move(RawEvents),
2957  MEvent->getHandleRef(), MEvent);
2958 
2959  return PI_SUCCESS;
2960  }
2961  case CG::CGTYPE::Fill: {
2962  CGFill *Fill = (CGFill *)MCommandGroup.get();
2963  Requirement *Req = (Requirement *)(Fill->getReqToFill());
2964  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2965 
2966  MemoryManager::fill(
2967  AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(), MQueue,
2968  Fill->MPattern.size(), Fill->MPattern.data(), Req->MDims,
2969  Req->MMemoryRange, Req->MAccessRange, Req->MOffset, Req->MElemSize,
2970  std::move(RawEvents), MEvent->getHandleRef(), MEvent);
2971 
2972  return PI_SUCCESS;
2973  }
2974  case CG::CGTYPE::Kernel: {
2975  CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
2976 
2977  NDRDescT &NDRDesc = ExecKernel->MNDRDesc;
2978  std::vector<ArgDesc> &Args = ExecKernel->MArgs;
2979 
2980  if (MQueue->is_host() || (MQueue->getDeviceImplPtr()->getBackend() ==
2981  backend::ext_intel_esimd_emulator)) {
2982  for (ArgDesc &Arg : Args)
2983  if (kernel_param_kind_t::kind_accessor == Arg.MType) {
2984  Requirement *Req = (Requirement *)(Arg.MPtr);
2985  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2986  Req->MData = AllocaCmd->getMemAllocation();
2987  }
2988  if (!RawEvents.empty()) {
2989  // Assuming that the events are for devices from the same plugin.
2990  const PluginPtr &Plugin = EventImpls[0]->getPlugin();
2991  Plugin->call<PiApiKind::piEventsWait>(RawEvents.size(), &RawEvents[0]);
2992  }
2993 
2994  if (MQueue->is_host()) {
2995  ExecKernel->MHostKernel->call(NDRDesc,
2996  getEvent()->getHostProfilingInfo());
2997  } else {
2998  assert(MQueue->getDeviceImplPtr()->getBackend() ==
2999  backend::ext_intel_esimd_emulator);
3000  if (MEvent != nullptr)
3001  MEvent->setHostEnqueueTime();
3002  MQueue->getPlugin()->call<PiApiKind::piEnqueueKernelLaunch>(
3003  nullptr,
3004  reinterpret_cast<pi_kernel>(ExecKernel->MHostKernel->getPtr()),
3005  NDRDesc.Dims, &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0],
3006  &NDRDesc.LocalSize[0], 0, nullptr, nullptr);
3007  }
3008  return PI_SUCCESS;
3009  }
3010 
3011  auto getMemAllocationFunc = [this](Requirement *Req) {
3012  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
3013  // getAllocaForReq may return nullptr if Req is a default constructed
3014  // accessor. Simply return nullptr in such a case.
3015  return AllocaCmd ? AllocaCmd->getMemAllocation() : nullptr;
3016  };
3017 
3018  const std::shared_ptr<detail::kernel_impl> &SyclKernel =
3019  ExecKernel->MSyclKernel;
3020  const std::string &KernelName = ExecKernel->MKernelName;
3021 
3022  if (!EventImpl) {
3023  // The kernel only uses assert if it is a non-interop one
3024  bool KernelUsesAssert =
3025  !(SyclKernel && SyclKernel->isInterop()) &&
3026  ProgramManager::getInstance().kernelUsesAssert(KernelName);
3027  if (KernelUsesAssert) {
3028  EventImpl = MEvent;
3029  }
3030  }
3031 
3032  return enqueueImpKernel(
3033  MQueue, NDRDesc, Args, ExecKernel->getKernelBundle(), SyclKernel,
3034  KernelName, RawEvents, EventImpl, getMemAllocationFunc,
3035  ExecKernel->MKernelCacheConfig, ExecKernel->MKernelIsCooperative);
3036  }
3037  case CG::CGTYPE::CopyUSM: {
3038  CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
3039  MemoryManager::copy_usm(Copy->getSrc(), MQueue, Copy->getLength(),
3040  Copy->getDst(), std::move(RawEvents), Event,
3041  MEvent);
3042 
3043  return PI_SUCCESS;
3044  }
3045  case CG::CGTYPE::FillUSM: {
3046  CGFillUSM *Fill = (CGFillUSM *)MCommandGroup.get();
3047  MemoryManager::fill_usm(Fill->getDst(), MQueue, Fill->getLength(),
3048  Fill->getFill(), std::move(RawEvents), Event,
3049  MEvent);
3050 
3051  return PI_SUCCESS;
3052  }
3053  case CG::CGTYPE::PrefetchUSM: {
3054  CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get();
3055  MemoryManager::prefetch_usm(Prefetch->getDst(), MQueue,
3056  Prefetch->getLength(), std::move(RawEvents),
3057  Event, MEvent);
3058 
3059  return PI_SUCCESS;
3060  }
3061  case CG::CGTYPE::AdviseUSM: {
3062  CGAdviseUSM *Advise = (CGAdviseUSM *)MCommandGroup.get();
3063  MemoryManager::advise_usm(Advise->getDst(), MQueue, Advise->getLength(),
3064  Advise->getAdvice(), std::move(RawEvents), Event,
3065  MEvent);
3066 
3067  return PI_SUCCESS;
3068  }
3069  case CG::CGTYPE::Copy2DUSM: {
3070  CGCopy2DUSM *Copy = (CGCopy2DUSM *)MCommandGroup.get();
3071  MemoryManager::copy_2d_usm(Copy->getSrc(), Copy->getSrcPitch(), MQueue,
3072  Copy->getDst(), Copy->getDstPitch(),
3073  Copy->getWidth(), Copy->getHeight(),
3074  std::move(RawEvents), Event, MEvent);
3075  return PI_SUCCESS;
3076  }
3077  case CG::CGTYPE::Fill2DUSM: {
3078  CGFill2DUSM *Fill = (CGFill2DUSM *)MCommandGroup.get();
3079  MemoryManager::fill_2d_usm(Fill->getDst(), MQueue, Fill->getPitch(),
3080  Fill->getWidth(), Fill->getHeight(),
3081  Fill->getPattern(), std::move(RawEvents), Event,
3082  MEvent);
3083  return PI_SUCCESS;
3084  }
3085  case CG::CGTYPE::Memset2DUSM: {
3086  CGMemset2DUSM *Memset = (CGMemset2DUSM *)MCommandGroup.get();
3087  MemoryManager::memset_2d_usm(Memset->getDst(), MQueue, Memset->getPitch(),
3088  Memset->getWidth(), Memset->getHeight(),
3089  Memset->getValue(), std::move(RawEvents),
3090  Event, MEvent);
3091  return PI_SUCCESS;
3092  }
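// [Editorial note - illustrative sketch] The Copy2DUSM, Fill2DUSM and
// Memset2DUSM cases above back the sycl_ext_oneapi_memcpy2d extension
// entry points (exact API assumed):
//
//   Q.ext_oneapi_copy2d(Src, SrcPitch, Dst, DstPitch, Width, Height);
//   Q.ext_oneapi_memset2d(Dst, DstPitch, Value, Width, Height);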
3093  case CG::CGTYPE::CodeplayHostTask: {
3094  CGHostTask *HostTask = static_cast<CGHostTask *>(MCommandGroup.get());
3095 
3096  for (ArgDesc &Arg : HostTask->MArgs) {
3097  switch (Arg.MType) {
3098  case kernel_param_kind_t::kind_accessor: {
3099  Requirement *Req = static_cast<Requirement *>(Arg.MPtr);
3100  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
3101 
3102  if (AllocaCmd)
3103  Req->MData = AllocaCmd->getMemAllocation();
3104  break;
3105  }
3106  default:
3107  throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
3108  "Unsupported arg type " +
3109  codeToString(PI_ERROR_INVALID_VALUE));
3110  }
3111  }
3112 
3113  std::vector<interop_handle::ReqToMem> ReqToMem;
3114 
3115  if (HostTask->MHostTask->isInteropTask()) {
3116  // Extract the Mem Objects for all Requirements, to ensure they are
3117  // available if a user asks for them inside the interop task scope
3118  const std::vector<Requirement *> &HandlerReq =
3119  HostTask->getRequirements();
3120  auto ReqToMemConv = [&ReqToMem, HostTask](Requirement *Req) {
3121  const std::vector<AllocaCommandBase *> &AllocaCmds =
3122  Req->MSYCLMemObj->MRecord->MAllocaCommands;
3123 
3124  for (AllocaCommandBase *AllocaCmd : AllocaCmds)
3125  if (HostTask->MQueue->getContextImplPtr() ==
3126  AllocaCmd->getQueue()->getContextImplPtr()) {
3127  auto MemArg =
3128  reinterpret_cast<pi_mem>(AllocaCmd->getMemAllocation());
3129  ReqToMem.emplace_back(std::make_pair(Req, MemArg));
3130 
3131  return;
3132  }
3133 
3134  assert(false &&
3135  "Can't get memory object due to no allocation available");
3136 
3137  throw sycl::exception(
3138  sycl::make_error_code(sycl::errc::runtime),
3139  "Can't get memory object due to no allocation available " +
3140  codeToString(PI_ERROR_INVALID_MEM_OBJECT));
3141  };
3142  std::for_each(std::begin(HandlerReq), std::end(HandlerReq), ReqToMemConv);
3143  std::sort(std::begin(ReqToMem), std::end(ReqToMem));
3144  }
3145 
3146  // The host task is executed asynchronously, so record where it was
3147  // submitted in order to report the exception origin properly.
3148  copySubmissionCodeLocation();
3149 
3150  MQueue->getThreadPool().submit<DispatchHostTask>(
3151  DispatchHostTask(this, std::move(ReqToMem)));
3152 
3154 
3155  return PI_SUCCESS;
3156  }
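// [Editorial note - illustrative sketch] A host task reaching the
// CodeplayHostTask case above is submitted as:
//
//   Q.submit([&](sycl::handler &CGH) {
//     CGH.host_task([=] { /* runs on a host worker thread */ });
//   }); // dispatched asynchronously via the queue's thread pool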
3157  case CG::CGTYPE::Barrier: {
3158  if (MQueue->getDeviceImplPtr()->is_host()) {
3159  // NOP for host device.
3160  return PI_SUCCESS;
3161  }
3162  const PluginPtr &Plugin = MQueue->getPlugin();
3163  if (MEvent != nullptr)
3164  MEvent->setHostEnqueueTime();
3165  Plugin->call<PiApiKind::piEnqueueEventsWaitWithBarrier>(
3166  MQueue->getHandleRef(), 0, nullptr, Event);
3167 
3168  return PI_SUCCESS;
3169  }
3170  case CG::CGTYPE::BarrierWaitlist: {
3171  CGBarrier *Barrier = static_cast<CGBarrier *>(MCommandGroup.get());
3172  std::vector<detail::EventImplPtr> Events = Barrier->MEventsWaitWithBarrier;
3173  std::vector<sycl::detail::pi::PiEvent> PiEvents =
3174  getPiEventsBlocking(Events);
3175  if (MQueue->getDeviceImplPtr()->is_host() || PiEvents.empty()) {
3176  // NOP for host device.
3177  // If Events is empty, then the barrier has no effect.
3178  return PI_SUCCESS;
3179  }
3180  const PluginPtr &Plugin = MQueue->getPlugin();
3181  if (MEvent != nullptr)
3182  MEvent->setHostEnqueueTime();
3183  Plugin->call<PiApiKind::piEnqueueEventsWaitWithBarrier>(
3184  MQueue->getHandleRef(), PiEvents.size(), &PiEvents[0], Event);
3185 
3186  return PI_SUCCESS;
3187  }
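// [Editorial note - illustrative sketch] Both barrier flavors originate from
// the sycl_ext_oneapi_enqueue_barrier extension:
//
//   Q.ext_oneapi_submit_barrier();         // -> CG::CGTYPE::Barrier
//   Q.ext_oneapi_submit_barrier({E1, E2}); // -> CG::CGTYPE::BarrierWaitlist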
3188  case CG::CGTYPE::CopyToDeviceGlobal: {
3189  CGCopyToDeviceGlobal *Copy = (CGCopyToDeviceGlobal *)MCommandGroup.get();
3190  MemoryManager::copy_to_device_global(
3191  Copy->getDeviceGlobalPtr(), Copy->isDeviceImageScoped(), MQueue,
3192  Copy->getNumBytes(), Copy->getOffset(), Copy->getSrc(),
3193  std::move(RawEvents), Event, MEvent);
3194 
3195  return PI_SUCCESS;
3196  }
3197  case CG::CGTYPE::CopyFromDeviceGlobal: {
3198  CGCopyFromDeviceGlobal *Copy =
3199  (CGCopyFromDeviceGlobal *)MCommandGroup.get();
3200  MemoryManager::copy_from_device_global(
3201  Copy->getDeviceGlobalPtr(), Copy->isDeviceImageScoped(), MQueue,
3202  Copy->getNumBytes(), Copy->getOffset(), Copy->getDest(),
3203  std::move(RawEvents), Event, MEvent);
3204 
3205  return PI_SUCCESS;
3206  }
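// [Editorial note - illustrative sketch] The device_global cases above are
// reached through the queue copy overloads for device_global variables
// (declaration and overloads assumed from the device_global extension):
//
//   // sycl::ext::oneapi::experimental::device_global<int> DG; // ns scope
//   Q.copy(&Value, DG); // -> CG::CGTYPE::CopyToDeviceGlobal
//   Q.copy(DG, &Value); // -> CG::CGTYPE::CopyFromDeviceGlobal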
3207  case CG::CGTYPE::ReadWriteHostPipe: {
3208  CGReadWriteHostPipe *ExecReadWriteHostPipe =
3209  (CGReadWriteHostPipe *)MCommandGroup.get();
3210  std::string pipeName = ExecReadWriteHostPipe->getPipeName();
3211  void *hostPtr = ExecReadWriteHostPipe->getHostPtr();
3212  size_t typeSize = ExecReadWriteHostPipe->getTypeSize();
3213  bool blocking = ExecReadWriteHostPipe->isBlocking();
3214  bool read = ExecReadWriteHostPipe->isReadHostPipe();
3215 
3216  if (!EventImpl) {
3217  EventImpl = MEvent;
3218  }
3219  return enqueueReadWriteHostPipe(MQueue, pipeName, blocking, hostPtr,
3220  typeSize, RawEvents, EventImpl, read);
3221  }
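// [Editorial note - illustrative sketch] ReadWriteHostPipe services the
// experimental host-pipe API (names assumed):
//
//   using P = sycl::ext::intel::experimental::pipe<class PipeID, int>;
//   int V = P::read(Q); // blocking read  -> ReadWriteHostPipe
//   P::write(Q, V);     // blocking write -> ReadWriteHostPipe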
3222  case CG::CGTYPE::ExecCommandBuffer: {
3223  CGExecCommandBuffer *CmdBufferCG =
3224  static_cast<CGExecCommandBuffer *>(MCommandGroup.get());
3225  if (MEvent != nullptr)
3226  MEvent->setHostEnqueueTime();
3227  return MQueue->getPlugin()
3228  ->call_nocheck<PiApiKind::piextEnqueueCommandBuffer>(
3229  CmdBufferCG->MCommandBuffer, MQueue->getHandleRef(),
3230  RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0],
3231  Event);
3232  }
3233  case CG::CGTYPE::CopyImage: {
3234  CGCopyImage *Copy = (CGCopyImage *)MCommandGroup.get();
3235 
3236  sycl::detail::pi::PiMemImageDesc Desc = Copy->getDesc();
3237 
3238  MemoryManager::copy_image_bindless(
3239  Copy->getSrc(), MQueue, Copy->getDst(), Desc, Copy->getFormat(),
3240  Copy->getCopyFlags(), Copy->getSrcOffset(), Copy->getDstOffset(),
3241  Copy->getHostExtent(), Copy->getCopyExtent(), std::move(RawEvents),
3242  Event);
3243  return PI_SUCCESS;
3244  }
3245  case CG::CGTYPE::SemaphoreWait: {
3246  CGSemaphoreWait *SemWait = (CGSemaphoreWait *)MCommandGroup.get();
3247  if (MQueue->getDeviceImplPtr()->is_host()) {
3248  // NOP for host device.
3249  return PI_SUCCESS;
3250  }
3251 
3252  const detail::PluginPtr &Plugin = MQueue->getPlugin();
3253  Plugin->call<PiApiKind::piextWaitExternalSemaphore>(
3254  MQueue->getHandleRef(), SemWait->getInteropSemaphoreHandle(), 0,
3255  nullptr, nullptr);
3256 
3257  return PI_SUCCESS;
3258  }
3259  case CG::CGTYPE::SemaphoreSignal: {
3260  CGSemaphoreSignal *SemSignal = (CGSemaphoreSignal *)MCommandGroup.get();
3261  if (MQueue->getDeviceImplPtr()->is_host()) {
3262  // NOP for host device.
3263  return PI_SUCCESS;
3264  }
3265 
3266  const detail::PluginPtr &Plugin = MQueue->getPlugin();
3267  Plugin->call<PiApiKind::piextSignalExternalSemaphore>(
3268  MQueue->getHandleRef(), SemSignal->getInteropSemaphoreHandle(), 0,
3269  nullptr, nullptr);
3270 
3271  return PI_SUCCESS;
3272  }
3273  case CG::CGTYPE::None:
3274  throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
3275  "CG type not implemented. " +
3276  codeToString(PI_ERROR_INVALID_OPERATION));
3277  }
3278  return PI_ERROR_INVALID_OPERATION;
3279 }
3280 
3281 bool ExecCGCommand::producesPiEvent() const {
3282  return !MCommandBuffer &&
3283  MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask;
3284 }
3285 
3286 bool ExecCGCommand::supportsPostEnqueueCleanup() const {
3287  // Host tasks are cleaned up upon completion instead.
3288  return Command::supportsPostEnqueueCleanup() &&
3289  (MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask);
3290 }
3291 
3292 bool ExecCGCommand::readyForCleanup() const {
3293  if (MCommandGroup->getType() == CG::CGTYPE::CodeplayHostTask)
3294  return MLeafCounter == 0 && MEvent->isCompleted();
3295  return Command::readyForCleanup();
3296 }
3297 
3298 KernelFusionCommand::KernelFusionCommand(QueueImplPtr Queue)
3299  : Command(Command::CommandType::FUSION, Queue),
3300  MStatus(FusionStatus::ACTIVE) {
3301  emitInstrumentationDataProxy();
3302 }
3303 
3304 std::vector<Command *> &KernelFusionCommand::auxiliaryCommands() {
3305  return MAuxiliaryCommands;
3306 }
3307 
3308 void KernelFusionCommand::addToFusionList(ExecCGCommand *Kernel) {
3309  MFusionList.push_back(Kernel);
3310 }
3311 
3312 std::vector<ExecCGCommand *> &KernelFusionCommand::getFusionList() {
3313  return MFusionList;
3314 }
3315 
3316 bool KernelFusionCommand::producesPiEvent() const { return false; }
3317 
3318 pi_int32 KernelFusionCommand::enqueueImp() {
3319  waitForPreparedHostEvents();
3320  waitForEvents(MQueue, MPreparedDepsEvents, MEvent->getHandleRef());
3321 
3322  // We need to release the queue here because KernelFusionCommands are
3323  // held back by the scheduler and would thus prevent the queue from being deallocated.
3324  resetQueue();
3325  return PI_SUCCESS;
3326 }
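// [Editorial note - illustrative sketch] KernelFusionCommand is driven by the
// experimental kernel-fusion extension; a user-side sketch (API assumed):
//
//   sycl::ext::codeplay::experimental::fusion_wrapper FW{Q};
//   FW.start_fusion();
//   Q.submit(/* kernel A */); // collected into the fusion list
//   Q.submit(/* kernel B */);
//   FW.complete_fusion();     // JIT-fuses and enqueues the fused kernel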
3327 
3328 void KernelFusionCommand::setFusionStatus(FusionStatus Status) {
3329  MStatus = Status;
3330 }
3331 
3332 void KernelFusionCommand::resetQueue() {
3333  assert(MStatus != FusionStatus::ACTIVE &&
3334  "Cannot release the queue attached to the KernelFusionCommand if it "
3335  "is active.");
3336  MQueue.reset();
3337  MWorkerQueue.reset();
3338 }
3339 
3340 void KernelFusionCommand::emitInstrumentationData() {
3341 #ifdef XPTI_ENABLE_INSTRUMENTATION
3342  constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
3343  if (!xptiCheckTraceEnabled(MStreamID)) {
3344  return;
3345  }
3346  // Create a payload with the command name and an event using this payload to
3347  // emit a node_create
3348  MCommandNodeType = commandToNodeType(MType);
3349  MCommandName = commandToName(MType);
3350 
3351  static unsigned FusionNodeCount = 0;
3352  std::stringstream PayloadStr;
3353  PayloadStr << "Fusion command #" << FusionNodeCount++;
3354  xpti::payload_t Payload = xpti::payload_t(PayloadStr.str().c_str());
3355 
3356  uint64_t CommandInstanceNo = 0;
3357  xpti_td *CmdTraceEvent =
3358  xptiMakeEvent(MCommandName.c_str(), &Payload, xpti::trace_graph_event,
3359  xpti_at::active, &CommandInstanceNo);
3360 
3361  MInstanceID = CommandInstanceNo;
3362  if (CmdTraceEvent) {
3363  MTraceEvent = static_cast<void *>(CmdTraceEvent);
3364  // If we are seeing this event again, then the instance ID will be
3365  // greater than 1; in this case, we must skip sending a notification
3366  // to create a node, as this node has already been created.
3367  // We return this value so the epilog method can be called
3368  // selectively.
3369  // See makeTraceEventProlog.
3370 
3371  MFirstInstance = (CommandInstanceNo == 1);
3372  }
3373 
3374  // This function is called in the constructor of the command. At this point
3375  // the kernel fusion list is still empty, so there is not much
3376  // information we could attach to this node here.
3377  if (MFirstInstance && CmdTraceEvent) {
3378  xpti::addMetadata(CmdTraceEvent, "sycl_device",
3379  deviceToID(MQueue->get_device()));
3380  xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
3381  deviceToString(MQueue->get_device()));
3382  xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
3383  getSyclObjImpl(MQueue->get_device())->getDeviceName());
3384  }
3385  if (MFirstInstance) {
3386  // Since we do NOT add the queue_id value to the metadata, we stash it in
3387  // TLS, as this data is mutable and the metadata is supposed to be invariant.
3388  xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
3389  MQueue->getQueueID());
3390  xptiNotifySubscribers(MStreamID, NotificationTraceType,
3391  detail::GSYCLGraphEvent,
3392  static_cast<xpti_td *>(MTraceEvent), MInstanceID,
3393  static_cast<const void *>(MCommandNodeType.c_str()));
3394  }
3395 
3396 #endif
3397 }
3398 
3399 void KernelFusionCommand::printDot(std::ostream &Stream) const {
3400  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\"";
3401 
3402  Stream << "ID = " << this << "\\n";
3403  Stream << "KERNEL FUSION on " << deviceToString(MQueue->get_device()) << "\\n"
3404  << "FUSION LIST: {";
3405  bool Initial = true;
3406  for (auto *Cmd : MFusionList) {
3407  if (!Initial) {
3408  Stream << ",\\n";
3409  }
3410  Initial = false;
3411  auto *KernelCG = static_cast<detail::CGExecKernel *>(&Cmd->getCG());
3412  if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource()) {
3413  Stream << "created from source";
3414  } else {
3415  Stream << demangleKernelName(KernelCG->getKernelName());
3416  }
3417  }
3418  Stream << "}\\n";
3419 
3420  Stream << "\"];" << std::endl;
3421 
3422  for (const auto &Dep : MDeps) {
3423  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
3424  << " [ label = \"Access mode: "
3425  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
3426  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
3427  << std::endl;
3428  }
3429 }
3430 
3431 UpdateCommandBufferCommand::UpdateCommandBufferCommand(
3432  QueueImplPtr Queue,
3433  ext::oneapi::experimental::detail::exec_graph_impl *Graph,
3434  std::vector<std::shared_ptr<ext::oneapi::experimental::detail::node_impl>>
3435  Nodes)
3436  : Command(CommandType::UPDATE_CMD_BUFFER, Queue), MGraph(Graph),
3437  MNodes(Nodes) {}
3438 
3439 pi_int32 UpdateCommandBufferCommand::enqueueImp() {
3440  waitForPreparedHostEvents();
3441  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
3442  auto RawEvents = getPiEvents(EventImpls);
3443  flushCrossQueueDeps(EventImpls, getWorkerQueue());
3444 
3445  for (auto &Node : MNodes) {
3446  auto CG = static_cast<CGExecKernel *>(Node->MCommandGroup.get());
3447  for (auto &Arg : CG->MArgs) {
3448  if (Arg.MType != kernel_param_kind_t::kind_accessor) {
3449  continue;
3450  }
3451  // Search through deps to get actual allocation for accessor args.
3452  for (const DepDesc &Dep : MDeps) {
3453  Requirement *Req = static_cast<AccessorImplHost *>(Arg.MPtr);
3454  if (Dep.MDepRequirement == Req) {
3455  if (Dep.MAllocaCmd) {
3456  Req->MData = Dep.MAllocaCmd->getMemAllocation();
3457  } else {
3459  "No allocation available for accessor when "
3460  "updating command buffer!");
3461  }
3462  }
3463  }
3464  }
3465  MGraph->updateImpl(Node);
3466  }
3467 
3468  return PI_SUCCESS;
3469 }
3470 
3471 void UpdateCommandBufferCommand::printDot(std::ostream &Stream) const {
3472  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#8d8f29\", label=\"";
3473 
3474  Stream << "ID = " << this << "\\n";
3475  Stream << "CommandBuffer Command Update"
3476  << "\\n";
3477 
3478  Stream << "\"];" << std::endl;
3479 
3480  for (const auto &Dep : MDeps) {
3481  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
3482  << " [ label = \"Access mode: "
3483  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
3484  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
3485  << std::endl;
3486  }
3487 }
3488 
3489 void UpdateCommandBufferCommand::emitInstrumentationData() {}
3490 bool UpdateCommandBufferCommand::producesPiEvent() const { return false; }
3491 
3492 } // namespace detail
3493 } // namespace _V1
3494 } // namespace sycl