DPC++ Runtime
Runtime libraries for oneAPI DPC++
commands.cpp
//===----------- commands.cpp - SYCL commands -------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <detail/error_handling/error_handling.hpp>

#include <detail/context_impl.hpp>
#include <detail/event_impl.hpp>
#include <detail/event_info.hpp>
#include <detail/host_pipe_map_entry.hpp>
#include <detail/kernel_impl.hpp>
#include <detail/kernel_info.hpp>
#include <detail/memory_manager.hpp>
#include <detail/program_impl.hpp>
#include <detail/program_manager/program_manager.hpp>
#include <detail/queue_impl.hpp>
#include <detail/sampler_impl.hpp>
#include <detail/scheduler/commands.hpp>
#include <detail/scheduler/scheduler.hpp>
#include <detail/stream_impl.hpp>
#include <detail/xpti_registry.hpp>
#include <sycl/access/access.hpp>
#include <sycl/backend_types.hpp>
#include <sycl/detail/cg_types.hpp>
#include <sycl/detail/kernel_desc.hpp>
#include <sycl/sampler.hpp>

#include <cassert>
#include <optional>
#include <string>
#include <vector>

#ifdef __has_include
#if __has_include(<cxxabi.h>)
#define __SYCL_ENABLE_GNU_DEMANGLING
#include <cstdlib>
#include <cxxabi.h>
#include <memory>
#endif
#endif

#ifdef XPTI_ENABLE_INSTRUMENTATION
#include "xpti/xpti_trace_framework.hpp"
#include <detail/xpti_registry.hpp>
#endif

namespace sycl {
inline namespace _V1 {
namespace detail {

#ifdef XPTI_ENABLE_INSTRUMENTATION
// Global graph for the application
extern xpti::trace_event_data_t *GSYCLGraphEvent;

bool CurrentCodeLocationValid() {
  detail::tls_code_loc_t Tls;
  auto CodeLoc = Tls.query();
  auto FileName = CodeLoc.fileName();
  auto FunctionName = CodeLoc.functionName();
  return (FileName && FileName[0] != '\0') ||
         (FunctionName && FunctionName[0] != '\0');
}

void emitInstrumentationGeneral(uint32_t StreamID, uint64_t InstanceID,
                                xpti_td *TraceEvent, uint16_t Type,
                                const void *Addr) {
  if (!(xptiCheckTraceEnabled(StreamID, Type) && TraceEvent))
    return;
  // Trace event notifier that emits a Type event
  xptiNotifySubscribers(StreamID, Type, detail::GSYCLGraphEvent,
                        static_cast<xpti_td *>(TraceEvent), InstanceID, Addr);
}
#endif

#ifdef __SYCL_ENABLE_GNU_DEMANGLING
struct DemangleHandle {
  char *p;
  DemangleHandle(char *ptr) : p(ptr) {}

  DemangleHandle &operator=(const DemangleHandle &) = delete;

  ~DemangleHandle() { std::free(p); }
};

static std::string demangleKernelName(std::string Name) {
  int Status = -1; // some arbitrary value to eliminate the compiler warning
  DemangleHandle result(abi::__cxa_demangle(Name.c_str(), NULL, NULL, &Status));
  return (Status == 0) ? result.p : Name;
}
#else
static std::string demangleKernelName(std::string Name) { return Name; }
#endif
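
// Illustrative note (not in the original source): with GNU demangling
// available, a mangled kernel name is turned into a human-readable form,
// otherwise the mangled string is returned unchanged:
//
//   demangleKernelName("_ZTS8MyKernel");
//   // with <cxxabi.h>:    "typeinfo name for MyKernel"
//   // without <cxxabi.h>: "_ZTS8MyKernel"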

static std::string deviceToString(device Device) {
  if (getSyclObjImpl(Device)->is_host())
    return "HOST";
  else if (Device.is_cpu())
    return "CPU";
  else if (Device.is_gpu())
    return "GPU";
  else if (Device.is_accelerator())
    return "ACCELERATOR";
  else
    return "UNKNOWN";
}

void applyFuncOnFilteredArgs(
    const KernelArgMask *EliminatedArgMask, std::vector<ArgDesc> &Args,
    std::function<void(detail::ArgDesc &Arg, int NextTrueIndex)> Func) {
  if (!EliminatedArgMask) {
    for (ArgDesc &Arg : Args) {
      Func(Arg, Arg.MIndex);
    }
  } else {
    // TODO this is not necessary as long as we can guarantee that the
    // arguments are already sorted (e.g. handle the sorting in handler
    // if necessary due to set_arg(...) usage).
    std::sort(Args.begin(), Args.end(), [](const ArgDesc &A, const ArgDesc &B) {
      return A.MIndex < B.MIndex;
    });
    int LastIndex = -1;
    size_t NextTrueIndex = 0;

    for (ArgDesc &Arg : Args) {
      // Handle potential gaps in set arguments (e.g. if some of them are
      // set on the user side).
      for (int Idx = LastIndex + 1; Idx < Arg.MIndex; ++Idx)
        if (!(*EliminatedArgMask)[Idx])
          ++NextTrueIndex;
      LastIndex = Arg.MIndex;

      if ((*EliminatedArgMask)[Arg.MIndex])
        continue;

      Func(Arg, NextTrueIndex);
      ++NextTrueIndex;
    }
  }
}

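// Worked example of the filtering above (illustrative, not part of the
// runtime): for EliminatedArgMask = {0, 1, 0} (argument 1 eliminated by
// dead-argument elimination) and Args holding MIndex 0, 1 and 2, the loop
// invokes Func(Arg0, 0), skips Arg1, and invokes Func(Arg2, 1) - surviving
// arguments are renumbered densely for the backend's argument indices.
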
#ifdef XPTI_ENABLE_INSTRUMENTATION
static size_t deviceToID(const device &Device) {
  if (getSyclObjImpl(Device)->is_host())
    return 0;
  else
    return reinterpret_cast<size_t>(getSyclObjImpl(Device)->getHandleRef());
}
#endif

static std::string accessModeToString(access::mode Mode) {
  switch (Mode) {
  case access::mode::read:
    return "read";
  case access::mode::write:
    return "write";
  case access::mode::read_write:
    return "read_write";
  case access::mode::discard_write:
    return "discard_write";
  case access::mode::discard_read_write:
    return "discard_read_write";
  default:
    return "unknown";
  }
}

#ifdef XPTI_ENABLE_INSTRUMENTATION
// Using the command group type to create node types for the asynchronous task
// graph modeling
static std::string commandToNodeType(Command::CommandType Type) {
  switch (Type) {
  case Command::CommandType::RUN_CG:
    return "command_group_node";
  case Command::CommandType::COPY_MEMORY:
    return "memory_transfer_node";
  case Command::CommandType::ALLOCA:
    return "memory_allocation_node";
  case Command::CommandType::ALLOCA_SUB_BUF:
    return "sub_buffer_creation_node";
  case Command::CommandType::RELEASE:
    return "memory_deallocation_node";
  case Command::CommandType::MAP_MEM_OBJ:
    return "memory_transfer_node";
  case Command::CommandType::UNMAP_MEM_OBJ:
    return "memory_transfer_node";
  case Command::CommandType::UPDATE_REQUIREMENT:
    return "host_acc_create_buffer_lock_node";
  case Command::CommandType::EMPTY_TASK:
    return "host_acc_destroy_buffer_release_node";
  case Command::CommandType::FUSION:
    return "kernel_fusion_placeholder_node";
  default:
    return "unknown_node";
  }
}

// The generated names and strings are subject to change to something more
// meaningful to end-users, as this will be visible in analysis tools that
// subscribe to this data.
static std::string commandToName(Command::CommandType Type) {
  switch (Type) {
  case Command::CommandType::RUN_CG:
    return "Command Group Action";
  case Command::CommandType::COPY_MEMORY:
    return "Memory Transfer (Copy)";
  case Command::CommandType::ALLOCA:
    return "Memory Allocation";
  case Command::CommandType::ALLOCA_SUB_BUF:
    return "Sub Buffer Creation";
  case Command::CommandType::RELEASE:
    return "Memory Deallocation";
  case Command::CommandType::MAP_MEM_OBJ:
    return "Memory Transfer (Map)";
  case Command::CommandType::UNMAP_MEM_OBJ:
    return "Memory Transfer (Unmap)";
  case Command::CommandType::UPDATE_REQUIREMENT:
    return "Host Accessor Creation/Buffer Lock";
  case Command::CommandType::EMPTY_TASK:
    return "Host Accessor Destruction/Buffer Lock Release";
  case Command::CommandType::FUSION:
    return "Kernel Fusion Placeholder";
  default:
    return "Unknown Action";
  }
}
#endif

std::vector<sycl::detail::pi::PiEvent>
Command::getPiEvents(const std::vector<EventImplPtr> &EventImpls) const {
  std::vector<sycl::detail::pi::PiEvent> RetPiEvents;
  for (auto &EventImpl : EventImpls) {
    if (EventImpl->getHandleRef() == nullptr)
      continue;

    // Do not add redundant event dependencies for in-order queues.
    // At this stage the dependency is definitely a pi task; we still need to
    // check whether the current command is a host task, in which case the pi
    // event must not be skipped, because different task types on an in-order
    // queue use different synchronization mechanisms.
    const QueueImplPtr &WorkerQueue = getWorkerQueue();
    // MWorkerQueue in command is always not null. So the check
    // EventImpl->getWorkerQueue() != nullptr is implicit.
    if (EventImpl->getWorkerQueue() == WorkerQueue &&
        WorkerQueue->isInOrder() && !isHostTask())
      continue;

    RetPiEvents.push_back(EventImpl->getHandleRef());
  }

  return RetPiEvents;
}

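// Illustrative scenario for the in-order filtering above (drawn from the
// comments, no additional runtime behavior implied): if commands C1 and C2
// are submitted back-to-back to the same in-order queue, C2's dependency on
// C1 yields no PI event in the returned list - the queue itself already
// guarantees ordering. The event is kept only when the dependent command is
// a host task, which synchronizes through a different mechanism.
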
// This function is implemented (duplicating getPiEvents a lot) as a
// short-term solution for the issue that a barrier with a wait list could
// not handle empty pi event handles when a kernel is enqueued on host task
// completion.
std::vector<sycl::detail::pi::PiEvent> Command::getPiEventsBlocking(
    const std::vector<EventImplPtr> &EventImpls) const {
  std::vector<sycl::detail::pi::PiEvent> RetPiEvents;
  for (auto &EventImpl : EventImpls) {
    // Throwaway events created with the empty constructor will not have a
    // context (which is set lazily); calling getContextImpl() would set that
    // context, which we wish to avoid as it is expensive.
    // Skip host task and NOP events also.
    if (!EventImpl->isContextInitialized() || EventImpl->is_host() ||
        EventImpl->isNOP())
      continue;
    // In this path a nullptr native event means that the command has not been
    // enqueued. It may happen if async enqueue in a host task is involved.
    if (EventImpl->getHandleRef() == nullptr) {
      if (!EventImpl->getCommand() ||
          !static_cast<Command *>(EventImpl->getCommand())->producesPiEvent())
        continue;
      std::vector<Command *> AuxCmds;
      Scheduler::getInstance().enqueueCommandForCG(EventImpl, AuxCmds,
                                                   BLOCKING);
    }
    // Do not add redundant event dependencies for in-order queues.
    // At this stage the dependency is definitely a pi task; we still need to
    // check whether the current command is a host task, in which case the pi
    // event must not be skipped, because different task types on an in-order
    // queue use different synchronization mechanisms.
    const QueueImplPtr &WorkerQueue = getWorkerQueue();
    // MWorkerQueue in command is always not null. So the check
    // EventImpl->getWorkerQueue() != nullptr is implicit.
    if (EventImpl->getWorkerQueue() == WorkerQueue &&
        WorkerQueue->isInOrder() && !isHostTask())
      continue;

    RetPiEvents.push_back(EventImpl->getHandleRef());
  }

  return RetPiEvents;
}

bool Command::isHostTask() const {
  return (MType == CommandType::RUN_CG) /* host task has this type also */ &&
         ((static_cast<const ExecCGCommand *>(this))->getCG().getType() ==
          CG::CGTYPE::CodeplayHostTask);
}

bool Command::isFusable() const {
  if ((MType != CommandType::RUN_CG)) {
    return false;
  }
  const auto &CG = (static_cast<const ExecCGCommand &>(*this)).getCG();
  return (CG.getType() == CG::CGTYPE::Kernel) &&
         (!static_cast<const CGExecKernel &>(CG).MKernelIsCooperative);
}

static void flushCrossQueueDeps(const std::vector<EventImplPtr> &EventImpls,
                                const QueueImplPtr &Queue) {
  for (auto &EventImpl : EventImpls) {
    EventImpl->flushIfNeeded(Queue);
  }
}

class DispatchHostTask {
  ExecCGCommand *MThisCmd;
  std::vector<interop_handle::ReqToMem> MReqToMem;

  pi_result waitForEvents() const {
    std::map<const PluginPtr, std::vector<EventImplPtr>>
        RequiredEventsPerPlugin;

    for (const EventImplPtr &Event : MThisCmd->MPreparedDepsEvents) {
      const PluginPtr &Plugin = Event->getPlugin();
      RequiredEventsPerPlugin[Plugin].push_back(Event);
    }

    // wait for dependency device events
    // FIXME Current implementation of waiting for events will make the thread
    // 'sleep' until all of the dependency events are complete. We need a more
    // sophisticated waiting mechanism to allow this thread to be utilized for
    // any other available job and resume once all required events are ready.
    for (auto &PluginWithEvents : RequiredEventsPerPlugin) {
      std::vector<sycl::detail::pi::PiEvent> RawEvents =
          MThisCmd->getPiEvents(PluginWithEvents.second);
      try {
        PluginWithEvents.first->call<PiApiKind::piEventsWait>(RawEvents.size(),
                                                              RawEvents.data());
      } catch (const sycl::exception &E) {
        CGHostTask &HostTask = static_cast<CGHostTask &>(MThisCmd->getCG());
        HostTask.MQueue->reportAsyncException(std::current_exception());
        return (pi_result)E.get_cl_code();
      } catch (...) {
        CGHostTask &HostTask = static_cast<CGHostTask &>(MThisCmd->getCG());
        HostTask.MQueue->reportAsyncException(std::current_exception());
        return PI_ERROR_UNKNOWN;
      }
    }

    // Wait for dependency host events.
    // Host events can't throw exceptions so don't try to catch them.
    for (const EventImplPtr &Event : MThisCmd->MPreparedHostDepsEvents) {
      Event->waitInternal();
    }

    return PI_SUCCESS;
  }

public:
  DispatchHostTask(ExecCGCommand *ThisCmd,
                   std::vector<interop_handle::ReqToMem> ReqToMem)
      : MThisCmd{ThisCmd}, MReqToMem(std::move(ReqToMem)) {}

  void operator()() const {
    assert(MThisCmd->getCG().getType() == CG::CGTYPE::CodeplayHostTask);

    CGHostTask &HostTask = static_cast<CGHostTask &>(MThisCmd->getCG());

#ifdef XPTI_ENABLE_INSTRUMENTATION
    // The host task is executed asynchronously, in a separate thread that
    // cannot use the code location data stored in TLS. So we keep the
    // submission code location as a Command field and put it into TLS here so
    // that a thrown exception can query and report it.
    std::unique_ptr<detail::tls_code_loc_t> AsyncCodeLocationPtr;
    if (xptiTraceEnabled() && !CurrentCodeLocationValid()) {
      AsyncCodeLocationPtr.reset(
          new detail::tls_code_loc_t(MThisCmd->MSubmissionCodeLocation));
    }
#endif

    pi_result WaitResult = waitForEvents();
    if (WaitResult != PI_SUCCESS) {
      std::exception_ptr EPtr = std::make_exception_ptr(sycl::runtime_error(
          std::string("Couldn't wait for host-task's dependencies"),
          WaitResult));
      HostTask.MQueue->reportAsyncException(EPtr);
      // reset host-task's lambda and quit
      HostTask.MHostTask.reset();
      Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd);
      return;
    }

    try {
      // we're ready to call the user-defined lambda now
      if (HostTask.MHostTask->isInteropTask()) {
        interop_handle IH{MReqToMem, HostTask.MQueue,
                          HostTask.MQueue->getDeviceImplPtr(),
                          HostTask.MQueue->getContextImplPtr()};

        HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo(), IH);
      } else
        HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo());
    } catch (...) {
      auto CurrentException = std::current_exception();
#ifdef XPTI_ENABLE_INSTRUMENTATION
      // sycl::exception already emits tracing of the message with code
      // location if available. For other types of exception we need to
      // explicitly trigger tracing by calling TraceEventXPTI.
      if (xptiTraceEnabled()) {
        try {
          rethrow_exception(CurrentException);
        } catch (const sycl::exception &) {
          // it is already traced, nothing to care about
        } catch (const std::exception &StdException) {
          GlobalHandler::instance().TraceEventXPTI(StdException.what());
        } catch (...) {
          GlobalHandler::instance().TraceEventXPTI(
              "Host task lambda thrown non standard exception");
        }
      }
#endif
      HostTask.MQueue->reportAsyncException(CurrentException);
    }

    HostTask.MHostTask.reset();

#ifdef XPTI_ENABLE_INSTRUMENTATION
    // The host task is done; clear its submission location so it does not
    // interfere with the submission of subsequent dependent kernels.
    AsyncCodeLocationPtr.reset();
#endif

    try {
      // If we enqueue blocked users - the pi level could throw an exception
      // that should be treated as async now.
      Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd);
    } catch (...) {
      auto CurrentException = std::current_exception();
      HostTask.MQueue->reportAsyncException(CurrentException);
    }
  }
};

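// Sketch of intended use (hedged - the actual dispatch site lives in the
// scheduler/ExecCGCommand code outside this excerpt): the functor is built
// with the command and its accessor-to-memory mapping and then run on a
// worker thread, roughly:
//
//   DispatchHostTask HostTaskDispatcher(ThisCmd, std::move(ReqToMem));
//   Queue->getThreadPool().submit(std::move(HostTaskDispatcher)); // assumed API
//
// operator()() then waits for all dependencies, invokes the user lambda and
// notifies the scheduler so that blocked users can be enqueued.
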
void Command::waitForPreparedHostEvents() const {
  for (const EventImplPtr &HostEvent : MPreparedHostDepsEvents)
    HostEvent->waitInternal();
}

void Command::waitForEvents(QueueImplPtr Queue,
                            std::vector<EventImplPtr> &EventImpls,
                            sycl::detail::pi::PiEvent &Event) {
  if (!EventImpls.empty()) {
    if (Queue->is_host()) {
      // A host queue can wait for events from different contexts, i.e. it may
      // contain events with different contexts in its MPreparedDepsEvents.
      // The OpenCL 2.1 spec says that clWaitForEvents will return
      // CL_INVALID_CONTEXT if events specified in the list do not belong to
      // the same context. Thus we split all the events into a per-context
      // map. An example: we have two queues for the same CPU device, Q1 and
      // Q2, and thus two different contexts for the same CPU device, C1 and
      // C2. We also have the default host queue, which is accessible via the
      // Scheduler. Now assume we have three different events: E1(C1), E2(C1),
      // E3(C2). The command's MPreparedDepsEvents will contain all three
      // events (E1, E2, E3). If piEventsWait were called for all three events
      // at once, it would fail with CL_INVALID_CONTEXT because these events
      // refer to different contexts.
      std::map<context_impl *, std::vector<EventImplPtr>>
          RequiredEventsPerContext;

      for (const EventImplPtr &Event : EventImpls) {
        ContextImplPtr Context = Event->getContextImpl();
        assert(Context.get() &&
               "Only non-host events are expected to be waited for here");
        RequiredEventsPerContext[Context.get()].push_back(Event);
      }

      for (auto &CtxWithEvents : RequiredEventsPerContext) {
        std::vector<sycl::detail::pi::PiEvent> RawEvents =
            getPiEvents(CtxWithEvents.second);
        CtxWithEvents.first->getPlugin()->call<PiApiKind::piEventsWait>(
            RawEvents.size(), RawEvents.data());
      }
    } else {
#ifndef NDEBUG
      for (const EventImplPtr &Event : EventImpls)
        assert(Event->getContextImpl().get() &&
               "Only non-host events are expected to be waited for here");
#endif

      std::vector<sycl::detail::pi::PiEvent> RawEvents = getPiEvents(EventImpls);
      flushCrossQueueDeps(EventImpls, getWorkerQueue());
      const PluginPtr &Plugin = Queue->getPlugin();

      if (MEvent != nullptr)
        MEvent->setHostEnqueueTime();
      Plugin->call<PiApiKind::piEnqueueEventsWait>(
          Queue->getHandleRef(), RawEvents.size(), &RawEvents[0], &Event);
    }
  }
}

/// It is safe to bind MPreparedDepsEvents and MPreparedHostDepsEvents
/// references to event_impl class members because the Command class is
/// guarded by the enqueue mutex.
Command::Command(
    CommandType Type, QueueImplPtr Queue,
    sycl::detail::pi::PiExtCommandBuffer CommandBuffer,
    const std::vector<sycl::detail::pi::PiExtSyncPoint> &SyncPoints)
    : MQueue(std::move(Queue)),
      MEvent(std::make_shared<detail::event_impl>(MQueue)),
      MPreparedDepsEvents(MEvent->getPreparedDepsEvents()),
      MPreparedHostDepsEvents(MEvent->getPreparedHostDepsEvents()), MType(Type),
      MCommandBuffer(CommandBuffer), MSyncPointDeps(SyncPoints) {
  MWorkerQueue = MQueue;
  MEvent->setWorkerQueue(MWorkerQueue);
  MEvent->setSubmittedQueue(MWorkerQueue);
  MEvent->setCommand(this);
  MEvent->setContextImpl(MQueue->getContextImplPtr());
  MEvent->setStateIncomplete();
  MEnqueueStatus = EnqueueResultT::SyclEnqueueReady;

#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiTraceEnabled())
    return;
  // Obtain the stream ID so all commands can emit traces to that stream
  MStreamID = xptiRegisterStream(SYCL_STREAM_NAME);
#endif
}

void Command::emitInstrumentationDataProxy() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  emitInstrumentationData();
#endif
}

void Command::emitEdgeEventForCommandDependence(
    Command *Cmd, void *ObjAddr, bool IsCommand,
    std::optional<access::mode> AccMode) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  // Bail early if either the source or the target node for the given
  // dependency is undefined or NULL
  constexpr uint16_t NotificationTraceType = xpti::trace_edge_create;
  if (!(xptiCheckTraceEnabled(MStreamID, NotificationTraceType) &&
        MTraceEvent && Cmd && Cmd->MTraceEvent))
    return;

  // If all the information we need for creating an edge event is available,
  // then go ahead with creating it; if not, bail early!
  xpti::utils::StringHelper SH;
  std::string AddressStr = SH.addressAsString<void *>(ObjAddr);
  std::string Prefix = AccMode ? accessModeToString(AccMode.value()) : "Event";
  std::string TypeString = SH.nameWithAddressString(Prefix, AddressStr);
  // Create an edge with the dependent buffer address for which a command
  // object has been created as one of the properties of the edge
  xpti::payload_t Payload(TypeString.c_str(), MAddress);
  uint64_t EdgeInstanceNo;
  xpti_td *EdgeEvent =
      xptiMakeEvent(TypeString.c_str(), &Payload, xpti::trace_graph_event,
                    xpti_at::active, &EdgeInstanceNo);
  if (EdgeEvent) {
    xpti_td *SrcEvent = static_cast<xpti_td *>(Cmd->MTraceEvent);
    xpti_td *TgtEvent = static_cast<xpti_td *>(MTraceEvent);
    EdgeEvent->source_id = SrcEvent->unique_id;
    EdgeEvent->target_id = TgtEvent->unique_id;
    if (IsCommand) {
      xpti::addMetadata(EdgeEvent, "access_mode",
                        static_cast<int>(AccMode.value()));
      xpti::addMetadata(EdgeEvent, "memory_object",
                        reinterpret_cast<size_t>(ObjAddr));
    } else {
      xpti::addMetadata(EdgeEvent, "event", reinterpret_cast<size_t>(ObjAddr));
    }
    xptiNotifySubscribers(MStreamID, NotificationTraceType,
                          detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
                          nullptr);
  }
  // General comment - none of these are serious errors, as the
  // instrumentation layer MUST be tolerant of errors. If we need to let the
  // end user know, we may throw exceptions in the future.
#endif
}

void Command::emitEdgeEventForEventDependence(
    Command *Cmd, sycl::detail::pi::PiEvent &PiEventAddr) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  // If we have failed to create an event to represent the Command, then we
  // cannot emit an edge event. Bail early!
  if (!(xptiCheckTraceEnabled(MStreamID) && MTraceEvent))
    return;

  if (Cmd && Cmd->MTraceEvent) {
    // If the event is associated with a command, we use this command's trace
    // event as the source of the edge, hence modeling the control flow
    emitEdgeEventForCommandDependence(Cmd, (void *)PiEventAddr, false);
    return;
  }
  if (PiEventAddr) {
    xpti::utils::StringHelper SH;
    std::string AddressStr =
        SH.addressAsString<sycl::detail::pi::PiEvent>(PiEventAddr);
    // This is the case when an OCL event was enqueued by the user or another
    // event is registered by the runtime as a dependency. The dependency on
    // this occasion is an OCL event, so we build a virtual node in the graph
    // with the event as the metadata for the node
    std::string NodeName = SH.nameWithAddressString("virtual_node", AddressStr);
    // Node name is "virtual_node[<event_addr>]"
    xpti::payload_t VNPayload(NodeName.c_str(), MAddress);
    uint64_t VNodeInstanceNo;
    xpti_td *NodeEvent =
        xptiMakeEvent(NodeName.c_str(), &VNPayload, xpti::trace_graph_event,
                      xpti_at::active, &VNodeInstanceNo);
    // Emit the virtual node first
    xpti::addMetadata(NodeEvent, "kernel_name", NodeName);
    xptiNotifySubscribers(MStreamID, xpti::trace_node_create,
                          detail::GSYCLGraphEvent, NodeEvent, VNodeInstanceNo,
                          nullptr);
    // Create a new event for the edge
    std::string EdgeName = SH.nameWithAddressString("Event", AddressStr);
    xpti::payload_t EdgePayload(EdgeName.c_str(), MAddress);
    uint64_t EdgeInstanceNo;
    xpti_td *EdgeEvent =
        xptiMakeEvent(EdgeName.c_str(), &EdgePayload, xpti::trace_graph_event,
                      xpti_at::active, &EdgeInstanceNo);
    if (EdgeEvent && NodeEvent) {
      // The source node represents the event, and this event needs to be
      // completed before the target node can execute
      xpti_td *TgtEvent = static_cast<xpti_td *>(MTraceEvent);
      EdgeEvent->source_id = NodeEvent->unique_id;
      EdgeEvent->target_id = TgtEvent->unique_id;
      xpti::addMetadata(EdgeEvent, "event",
                        reinterpret_cast<size_t>(PiEventAddr));
      xptiNotifySubscribers(MStreamID, xpti::trace_edge_create,
                            detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
                            nullptr);
    }
    return;
  }
#endif
}

uint64_t Command::makeTraceEventProlog(void *MAddress) {
  uint64_t CommandInstanceNo = 0;
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiCheckTraceEnabled(MStreamID))
    return CommandInstanceNo;

  MTraceEventPrologComplete = true;
  // Setup the member variables with information needed for event notification
  MCommandNodeType = commandToNodeType(MType);
  MCommandName = commandToName(MType);
  xpti::utils::StringHelper SH;
  MAddressString = SH.addressAsString<void *>(MAddress);
  std::string CommandString =
      SH.nameWithAddressString(MCommandName, MAddressString);

  xpti::payload_t p(CommandString.c_str(), MAddress);
  xpti_td *CmdTraceEvent =
      xptiMakeEvent(CommandString.c_str(), &p, xpti::trace_graph_event,
                    xpti_at::active, &CommandInstanceNo);
  MInstanceID = CommandInstanceNo;
  if (CmdTraceEvent) {
    MTraceEvent = (void *)CmdTraceEvent;
    // If we are seeing this event again, then the instance ID will be greater
    // than 1; in this case, we must skip sending a notification to create a
    // node as this node has already been created. We return this value so the
    // epilog method can be called selectively.
    MFirstInstance = (CommandInstanceNo == 1);
  }
#endif
  return CommandInstanceNo;
}

void Command::makeTraceEventEpilog() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
  if (!(xptiCheckTraceEnabled(MStreamID, NotificationTraceType) && MTraceEvent))
    return;
  assert(MTraceEventPrologComplete);
  xptiNotifySubscribers(MStreamID, NotificationTraceType,
                        detail::GSYCLGraphEvent,
                        static_cast<xpti_td *>(MTraceEvent), MInstanceID,
                        static_cast<const void *>(MCommandNodeType.c_str()));
#endif
}

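// Typical usage of the prolog/epilog pair (this exact pattern recurs in the
// emitInstrumentationData() overrides later in this file):
//
//   MAddress = MAllocaCmd->getSYCLMemObj();
//   makeTraceEventProlog(MAddress);   // creates/looks up the node event
//   if (MFirstInstance) {             // only the first instance emits
//     xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
//     xpti::addMetadata(TE, "queue_id", MQueue->getQueueID());
//     makeTraceEventEpilog();         // sends the node_create notification
//   }
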
Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep,
                                  std::vector<Command *> &ToCleanUp) {
  const QueueImplPtr &WorkerQueue = getWorkerQueue();
  const ContextImplPtr &WorkerContext = WorkerQueue->getContextImplPtr();

  // 1. Async work is not supported for the host device.
  // 2. Non-host events can be ignored if they are not fully initialized.
  // 3. Some types of commands do not produce PI events after they are
  //    enqueued (e.g. alloca). Note that we can't check the pi event to make
  //    that distinction, since the command might still be unenqueued at this
  //    point.
  bool PiEventExpected = (!DepEvent->is_host() && DepEvent->isInitialized());
  if (auto *DepCmd = static_cast<Command *>(DepEvent->getCommand()))
    PiEventExpected &= DepCmd->producesPiEvent();

  if (!PiEventExpected) {
    // The call to waitInternal() is in waitForPreparedHostEvents() as it's
    // called from the enqueue process functions
    MPreparedHostDepsEvents.push_back(DepEvent);
    return nullptr;
  }

  Command *ConnectionCmd = nullptr;

  ContextImplPtr DepEventContext = DepEvent->getContextImpl();
  // If contexts don't match, we'll connect them using a host task
  if (DepEventContext != WorkerContext && !WorkerContext->is_host()) {
    Scheduler::GraphBuilder &GB = Scheduler::getInstance().MGraphBuilder;
    ConnectionCmd = GB.connectDepEvent(this, DepEvent, Dep, ToCleanUp);
  } else
    MPreparedDepsEvents.push_back(std::move(DepEvent));

  return ConnectionCmd;
}

const ContextImplPtr &Command::getWorkerContext() const {
  return MQueue->getContextImplPtr();
}

const QueueImplPtr &Command::getWorkerQueue() const {
  assert(MWorkerQueue && "MWorkerQueue must not be nullptr");
  return MWorkerQueue;
}

bool Command::producesPiEvent() const { return true; }

bool Command::supportsPostEnqueueCleanup() const { return true; }

bool Command::readyForCleanup() const {
  return MLeafCounter == 0 &&
         MEnqueueStatus == EnqueueResultT::SyclEnqueueSuccess;
}

Command *Command::addDep(DepDesc NewDep, std::vector<Command *> &ToCleanUp) {
  Command *ConnectionCmd = nullptr;

  if (NewDep.MDepCommand) {
    ConnectionCmd =
        processDepEvent(NewDep.MDepCommand->getEvent(), NewDep, ToCleanUp);
  }
  // ConnectionCmd insertion builds the following dependency structure:
  // this -> emptyCmd (for ConnectionCmd) -> ConnectionCmd -> NewDep
  // which means that this and NewDep are already dependent
  if (!ConnectionCmd) {
    MDeps.push_back(NewDep);
    if (NewDep.MDepCommand)
      NewDep.MDepCommand->addUser(this);
  }

#ifdef XPTI_ENABLE_INSTRUMENTATION
  emitEdgeEventForCommandDependence(NewDep.MDepCommand,
                                    (void *)NewDep.MDepRequirement->MSYCLMemObj,
                                    true, NewDep.MDepRequirement->MAccessMode);
#endif

  return ConnectionCmd;
}

Command *Command::addDep(EventImplPtr Event,
                         std::vector<Command *> &ToCleanUp) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  // We need this just for the instrumentation, so guarding it will prevent
  // unused variable warnings when instrumentation is turned off
  Command *Cmd = (Command *)Event->getCommand();
  sycl::detail::pi::PiEvent &PiEventAddr = Event->getHandleRef();
  // Now make an edge for the dependent event
  emitEdgeEventForEventDependence(Cmd, PiEventAddr);
#endif

  return processDepEvent(std::move(Event), DepDesc{nullptr, nullptr, nullptr},
                         ToCleanUp);
}

void Command::emitEnqueuedEventSignal(sycl::detail::pi::PiEvent &PiEventAddr) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  emitInstrumentationGeneral(
      MStreamID, MInstanceID, static_cast<xpti_td *>(MTraceEvent),
      xpti::trace_signal, static_cast<const void *>(PiEventAddr));
#endif
  std::ignore = PiEventAddr;
}

void Command::emitInstrumentation(uint16_t Type, const char *Txt) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  return emitInstrumentationGeneral(MStreamID, MInstanceID,
                                    static_cast<xpti_td *>(MTraceEvent), Type,
                                    static_cast<const void *>(Txt));
#else
  std::ignore = Type;
  std::ignore = Txt;
#endif
}

bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking,
                      std::vector<Command *> &ToCleanUp) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  // If the command is enqueued from a host task thread - it will not have a
  // valid submission code location set. So we set it manually to properly
  // trace failures if the pi level reports any.
  std::unique_ptr<detail::tls_code_loc_t> AsyncCodeLocationPtr;
  if (xptiTraceEnabled() && !CurrentCodeLocationValid()) {
    AsyncCodeLocationPtr.reset(
        new detail::tls_code_loc_t(MSubmissionCodeLocation));
  }
#endif
  // Exit if already enqueued
  if (MEnqueueStatus == EnqueueResultT::SyclEnqueueSuccess)
    return true;

  // If the command is blocked from enqueueing
  if (MIsBlockable && MEnqueueStatus == EnqueueResultT::SyclEnqueueBlocked) {
    // Exit if enqueue type is not blocking
    if (!Blocking) {
      EnqueueResult = EnqueueResultT(EnqueueResultT::SyclEnqueueBlocked, this);
      return false;
    }

#ifdef XPTI_ENABLE_INSTRUMENTATION
    // Scoped trace event notifier that emits a barrier begin and barrier end
    // event, which models the barrier while enqueuing along with the blocked
    // reason, as determined by the scheduler
    std::string Info = "enqueue.barrier[";
    Info += std::string(getBlockReason()) + "]";
    emitInstrumentation(xpti::trace_barrier_begin, Info.c_str());
#endif

    // Wait if blocking
    while (MEnqueueStatus == EnqueueResultT::SyclEnqueueBlocked)
      ;
#ifdef XPTI_ENABLE_INSTRUMENTATION
    emitInstrumentation(xpti::trace_barrier_end, Info.c_str());
#endif
  }

  std::lock_guard<std::mutex> Lock(MEnqueueMtx);

  // Exit if the command is already enqueued
  if (MEnqueueStatus == EnqueueResultT::SyclEnqueueSuccess)
    return true;

#ifdef XPTI_ENABLE_INSTRUMENTATION
  emitInstrumentation(xpti::trace_task_begin, nullptr);
#endif

  if (MEnqueueStatus == EnqueueResultT::SyclEnqueueFailed) {
    EnqueueResult = EnqueueResultT(EnqueueResultT::SyclEnqueueFailed, this);
    return false;
  }

  // The command status is set to "failed" beforehand, so this command
  // has already been marked as "failed" if enqueueImp throws an exception.
  // This will avoid execution of the same failed command twice.
  MEnqueueStatus = EnqueueResultT::SyclEnqueueFailed;
  MShouldCompleteEventIfPossible = true;
  pi_int32 Res = enqueueImp();

  if (PI_SUCCESS != Res)
    EnqueueResult =
        EnqueueResultT(EnqueueResultT::SyclEnqueueFailed, this, Res);
  else {
    if (MShouldCompleteEventIfPossible &&
        (MEvent->is_host() || MEvent->getHandleRef() == nullptr))
      MEvent->setComplete();

    // Consider the command successfully enqueued if the return code is
    // PI_SUCCESS
    MEnqueueStatus = EnqueueResultT::SyclEnqueueSuccess;
    if (MLeafCounter == 0 && supportsPostEnqueueCleanup() &&
        !SYCLConfig<SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP>::get() &&
        !SYCLConfig<SYCL_DISABLE_POST_ENQUEUE_CLEANUP>::get()) {
      assert(!MMarkedForCleanup);
      MMarkedForCleanup = true;
      ToCleanUp.push_back(this);
    }
  }

  // Emit this correlation signal before the task end
  emitEnqueuedEventSignal(MEvent->getHandleRef());
#ifdef XPTI_ENABLE_INSTRUMENTATION
  emitInstrumentation(xpti::trace_task_end, nullptr);
#endif
  return MEnqueueStatus == EnqueueResultT::SyclEnqueueSuccess;
}

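// Caller's-eye view (illustrative assumption; the real call sites are in the
// scheduler's enqueue logic, not shown in this file):
//
//   EnqueueResultT Result;
//   std::vector<Command *> ToCleanUp;
//   if (!Cmd->enqueue(Result, NON_BLOCKING, ToCleanUp)) {
//     // SyclEnqueueBlocked: retry later, or call again with BLOCKING;
//     // SyclEnqueueFailed:  surface the error to the user.
//   }
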
void Command::resolveReleaseDependencies(std::set<Command *> &DepList) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  assert(MType == CommandType::RELEASE && "Expected release command");
  if (!MTraceEvent)
    return;
  // The current command is the target node for all dependencies, as the
  // source nodes have to be completed first before the current node can begin
  // to execute; these edges model control flow
  xpti_td *TgtTraceEvent = static_cast<xpti_td *>(MTraceEvent);
  // We have all the Commands that must be completed before the release
  // command can be enqueued; here we'll find the command that is an Alloca
  // with the same SYCLMemObject address and create a dependency line (edge)
  // between them in our semantic modeling
  for (auto &Item : DepList) {
    if (Item->MTraceEvent && Item->MAddress == MAddress) {
      xpti::utils::StringHelper SH;
      std::string AddressStr = SH.addressAsString<void *>(MAddress);
      std::string TypeString =
          "Edge:" + SH.nameWithAddressString(commandToName(MType), AddressStr);

      // Create an edge with the dependent buffer address being one of the
      // properties of the edge
      xpti::payload_t p(TypeString.c_str(), MAddress);
      uint64_t EdgeInstanceNo;
      xpti_td *EdgeEvent =
          xptiMakeEvent(TypeString.c_str(), &p, xpti::trace_graph_event,
                        xpti_at::active, &EdgeInstanceNo);
      if (EdgeEvent) {
        xpti_td *SrcTraceEvent = static_cast<xpti_td *>(Item->MTraceEvent);
        EdgeEvent->target_id = TgtTraceEvent->unique_id;
        EdgeEvent->source_id = SrcTraceEvent->unique_id;
        xpti::addMetadata(EdgeEvent, "memory_object",
                          reinterpret_cast<size_t>(MAddress));
        xptiNotifySubscribers(MStreamID, xpti::trace_edge_create,
                              detail::GSYCLGraphEvent, EdgeEvent,
                              EdgeInstanceNo, nullptr);
      }
    }
  }
#endif
}

const char *Command::getBlockReason() const {
  switch (MBlockReason) {
  case BlockReason::HostAccessor:
    return "A Buffer is locked by the host accessor";
  case BlockReason::HostTask:
    return "Blocked by host task";
  }

  return "Unknown block reason";
}

void Command::copySubmissionCodeLocation() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiTraceEnabled())
    return;

  detail::tls_code_loc_t Tls;
  auto TData = Tls.query();
  if (TData.fileName())
    MSubmissionFileName = TData.fileName();
  if (TData.functionName())
    MSubmissionFunctionName = TData.functionName();
  if (MSubmissionFileName.size() || MSubmissionFunctionName.size())
    MSubmissionCodeLocation = {MSubmissionFileName.c_str(),
                               MSubmissionFunctionName.c_str(),
                               (int)TData.lineNumber(),
                               (int)TData.columnNumber()};
#endif
}

AllocaCommandBase::AllocaCommandBase(CommandType Type, QueueImplPtr Queue,
                                     Requirement Req,
                                     AllocaCommandBase *LinkedAllocaCmd,
                                     bool IsConst)
    : Command(Type, Queue), MLinkedAllocaCmd(LinkedAllocaCmd),
      MIsLeaderAlloca(nullptr == LinkedAllocaCmd), MIsConst(IsConst),
      MRequirement(std::move(Req)), MReleaseCmd(Queue, this) {
  MRequirement.MAccessMode = access::mode::read_write;
  emitInstrumentationDataProxy();
}

void AllocaCommandBase::emitInstrumentationData() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiCheckTraceEnabled(MStreamID))
    return;
  // Create a payload with the command name and an event using this payload to
  // emit a node_create
  MAddress = MRequirement.MSYCLMemObj;
  makeTraceEventProlog(MAddress);
  // Set the relevant metadata properties for this command
  if (MTraceEvent && MFirstInstance) {
    xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
    xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_type",
                      deviceToString(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_name",
                      getSyclObjImpl(MQueue->get_device())->getDeviceName());
    xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
    xpti::addMetadata(TE, "queue_id", MQueue->getQueueID());
  }
#endif
}

bool AllocaCommandBase::producesPiEvent() const { return false; }

bool AllocaCommandBase::supportsPostEnqueueCleanup() const { return false; }

bool AllocaCommandBase::readyForCleanup() const { return false; }

AllocaCommand::AllocaCommand(QueueImplPtr Queue, Requirement Req,
                             bool InitFromUserData,
                             AllocaCommandBase *LinkedAllocaCmd, bool IsConst)
    : AllocaCommandBase(CommandType::ALLOCA, std::move(Queue), std::move(Req),
                        LinkedAllocaCmd, IsConst),
      MInitFromUserData(InitFromUserData) {
  // The node event must be created before the dependent edge is added to this
  // node, so this call must be before the addDep() call.
  emitInstrumentationDataProxy();
  // "Nothing to depend on"
  std::vector<Command *> ToCleanUp;
  Command *ConnectionCmd =
      addDep(DepDesc(nullptr, getRequirement(), this), ToCleanUp);
  assert(ConnectionCmd == nullptr);
  assert(ToCleanUp.empty());
  (void)ConnectionCmd;
}

void AllocaCommand::emitInstrumentationData() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiCheckTraceEnabled(MStreamID))
    return;

  // Only if it is the first event do we emit a node create event
  if (MFirstInstance) {
    makeTraceEventEpilog();
  }
#endif
}

pi_int32 AllocaCommand::enqueueImp() {
  waitForPreparedHostEvents();
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;

  sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef();

  void *HostPtr = nullptr;
  if (!MIsLeaderAlloca) {

    if (MQueue->is_host()) {
      // There is no need to make an allocation if we have a linked device
      // allocation
      Command::waitForEvents(MQueue, EventImpls, Event);

      return PI_SUCCESS;
    }
    HostPtr = MLinkedAllocaCmd->getMemAllocation();
  }
  // TODO: Check whether it is correct to use std::move on a stack variable
  // and, if so, delete RawEvents below.
  MMemAllocation = MemoryManager::allocate(
      MQueue->getContextImplPtr(), getSYCLMemObj(), MInitFromUserData, HostPtr,
      std::move(EventImpls), Event);

  return PI_SUCCESS;
}

void AllocaCommand::printDot(std::ostream &Stream) const {
  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\"";

  Stream << "ID = " << this << "\\n";
  Stream << "ALLOCA ON " << deviceToString(MQueue->get_device()) << "\\n";
  Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n";
  Stream << " Link : " << this->MLinkedAllocaCmd << "\\n";
  Stream << "\"];" << std::endl;

  for (const auto &Dep : MDeps) {
    if (Dep.MDepCommand == nullptr)
      continue;
    Stream << "  \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
           << " [ label = \"Access mode: "
           << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
           << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
           << std::endl;
  }
}

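// For reference, the DOT fragment emitted above looks roughly like this
// (addresses are illustrative):
//
//   "0x1f00" [style=filled, fillcolor="#FFD28A", label="ID = 0x1f00\nALLOCA ON GPU\n MemObj : 0x2a00\n Link : 0\n"];
//   "0x1f00" -> "0x1e00" [ label = "Access mode: read_write\nMemObj: 0x2a00 " ]
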
AllocaSubBufCommand::AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req,
                                         AllocaCommandBase *ParentAlloca,
                                         std::vector<Command *> &ToEnqueue,
                                         std::vector<Command *> &ToCleanUp)
    : AllocaCommandBase(CommandType::ALLOCA_SUB_BUF, std::move(Queue),
                        std::move(Req),
                        /*LinkedAllocaCmd*/ nullptr, /*IsConst*/ false),
      MParentAlloca(ParentAlloca) {
  // The node event must be created before the dependent edge
  // is added to this node, so this call must be before
  // the addDep() call.
  emitInstrumentationDataProxy();
  Command *ConnectionCmd = addDep(
      DepDesc(MParentAlloca, getRequirement(), MParentAlloca), ToCleanUp);
  if (ConnectionCmd)
    ToEnqueue.push_back(ConnectionCmd);
}

void AllocaSubBufCommand::emitInstrumentationData() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiCheckTraceEnabled(MStreamID))
    return;

  // Only if it is the first event do we emit a node create event and any
  // metadata that is available for the command
  if (MFirstInstance) {
    xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
    xpti::addMetadata(TE, "offset", this->MRequirement.MOffsetInBytes);
    xpti::addMetadata(TE, "access_range_start",
                      this->MRequirement.MAccessRange[0]);
    xpti::addMetadata(TE, "access_range_end",
                      this->MRequirement.MAccessRange[1]);
    xpti::addMetadata(TE, "queue_id", MQueue->getQueueID());
    makeTraceEventEpilog();
  }
#endif
}

void *AllocaSubBufCommand::getMemAllocation() const {
  // In some cases the parent's memory allocation might change (e.g., after
  // map/unmap operations). If the parent's memory allocation changes, the
  // sub-buffer memory allocation should be changed as well.
  if (MQueue->is_host()) {
    return static_cast<void *>(
        static_cast<char *>(MParentAlloca->getMemAllocation()) +
        MRequirement.MOffsetInBytes);
  }
  return MMemAllocation;
}

pi_int32 AllocaSubBufCommand::enqueueImp() {
  waitForPreparedHostEvents();
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
  sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef();

  MMemAllocation = MemoryManager::allocateMemSubBuffer(
      MQueue->getContextImplPtr(), MParentAlloca->getMemAllocation(),
      MRequirement.MElemSize, MRequirement.MOffsetInBytes,
      MRequirement.MAccessRange, std::move(EventImpls), Event);

  XPTIRegistry::bufferAssociateNotification(MParentAlloca->getSYCLMemObj(),
                                            MMemAllocation);
  return PI_SUCCESS;
}

void AllocaSubBufCommand::printDot(std::ostream &Stream) const {
  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\"";

  Stream << "ID = " << this << "\\n";
  Stream << "ALLOCA SUB BUF ON " << deviceToString(MQueue->get_device())
         << "\\n";
  Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n";
  Stream << " Offset : " << this->MRequirement.MOffsetInBytes << "\\n";
  Stream << " Access range : " << this->MRequirement.MAccessRange[0] << "\\n";
  Stream << "\"];" << std::endl;

  for (const auto &Dep : MDeps) {
    if (Dep.MDepCommand == nullptr)
      continue;
    Stream << "  \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
           << " [ label = \"Access mode: "
           << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
           << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
           << std::endl;
  }
}

ReleaseCommand::ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd)
    : Command(CommandType::RELEASE, std::move(Queue)), MAllocaCmd(AllocaCmd) {
  emitInstrumentationDataProxy();
}

void ReleaseCommand::emitInstrumentationData() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiCheckTraceEnabled(MStreamID))
    return;
  // Create a payload with the command name and an event using this payload to
  // emit a node_create
  MAddress = MAllocaCmd->getSYCLMemObj();
  makeTraceEventProlog(MAddress);

  if (MFirstInstance) {
    xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
    xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_type",
                      deviceToString(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_name",
                      getSyclObjImpl(MQueue->get_device())->getDeviceName());
    xpti::addMetadata(TE, "allocation_type",
                      commandToName(MAllocaCmd->getType()));
    xpti::addMetadata(TE, "queue_id", MQueue->getQueueID());

    makeTraceEventEpilog();
  }
#endif
}

pi_int32 ReleaseCommand::enqueueImp() {
  waitForPreparedHostEvents();
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
  std::vector<sycl::detail::pi::PiEvent> RawEvents = getPiEvents(EventImpls);
  bool SkipRelease = false;

  // On the host side we only allocate memory for full buffers.
  // Thus, deallocating sub-buffers would lead to double freeing of memory.
  SkipRelease |= MQueue->is_host() && MAllocaCmd->getType() == ALLOCA_SUB_BUF;

  const bool CurAllocaIsHost = MAllocaCmd->getQueue()->is_host();
  bool NeedUnmap = false;
  if (MAllocaCmd->MLinkedAllocaCmd) {

    // When releasing one of the "linked" allocations special rules take
    // place:
    // 1. The device allocation should always be released.
    // 2. The host allocation should be released if the host allocation is the
    //    "leader".
    // 3. The device alloca in the pair should be in the active state in order
    //    to be correctly released.

    // There is no actual memory allocation if a host alloca command is
    // created being linked to a device allocation.
    SkipRelease |= CurAllocaIsHost && !MAllocaCmd->MIsLeaderAlloca;

    NeedUnmap |= CurAllocaIsHost == MAllocaCmd->MIsActive;
  }

  if (NeedUnmap) {
    const QueueImplPtr &Queue = CurAllocaIsHost
                                    ? MAllocaCmd->MLinkedAllocaCmd->getQueue()
                                    : MAllocaCmd->getQueue();

    EventImplPtr UnmapEventImpl(new event_impl(Queue));
    UnmapEventImpl->setContextImpl(Queue->getContextImplPtr());
    UnmapEventImpl->setStateIncomplete();
    sycl::detail::pi::PiEvent &UnmapEvent = UnmapEventImpl->getHandleRef();

    void *Src = CurAllocaIsHost
                    ? MAllocaCmd->getMemAllocation()
                    : MAllocaCmd->MLinkedAllocaCmd->getMemAllocation();

    void *Dst = !CurAllocaIsHost
                    ? MAllocaCmd->getMemAllocation()
                    : MAllocaCmd->MLinkedAllocaCmd->getMemAllocation();

    MemoryManager::unmap(MAllocaCmd->getSYCLMemObj(), Dst, Queue, Src,
                         RawEvents, UnmapEvent);

    std::swap(MAllocaCmd->MIsActive, MAllocaCmd->MLinkedAllocaCmd->MIsActive);
    EventImpls.clear();
    EventImpls.push_back(UnmapEventImpl);
  }
  sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef();
  if (SkipRelease)
    Command::waitForEvents(MQueue, EventImpls, Event);
  else {
    MemoryManager::release(
        MQueue->getContextImplPtr(), MAllocaCmd->getSYCLMemObj(),
        MAllocaCmd->getMemAllocation(), std::move(EventImpls), Event);
  }
  return PI_SUCCESS;
}

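// Decision summary for the linked-alloca logic above (restating the code for
// clarity, no new behavior): a non-leader host alloca never owned real
// memory, so its release is skipped entirely; an unmap is required exactly
// when the alloca being released sits on the same side as the currently
// active allocation (CurAllocaIsHost == MIsActive), in which case the unmap
// event replaces the dependency list before the real release happens.
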
void ReleaseCommand::printDot(std::ostream &Stream) const {
  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FF827A\", label=\"";

  Stream << "ID = " << this << " ; ";
  Stream << "RELEASE ON " << deviceToString(MQueue->get_device()) << "\\n";
  Stream << " Alloca : " << MAllocaCmd << "\\n";
  Stream << " MemObj : " << MAllocaCmd->getSYCLMemObj() << "\\n";
  Stream << "\"];" << std::endl;

  for (const auto &Dep : MDeps) {
    Stream << "  \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
           << " [ label = \"Access mode: "
           << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
           << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
           << std::endl;
  }
}

bool ReleaseCommand::producesPiEvent() const { return false; }

bool ReleaseCommand::supportsPostEnqueueCleanup() const { return false; }

bool ReleaseCommand::readyForCleanup() const { return false; }

MapMemObject::MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req,
                           void **DstPtr, QueueImplPtr Queue,
                           access::mode MapMode)
    : Command(CommandType::MAP_MEM_OBJ, std::move(Queue)),
      MSrcAllocaCmd(SrcAllocaCmd), MSrcReq(std::move(Req)), MDstPtr(DstPtr),
      MMapMode(MapMode) {
  emitInstrumentationDataProxy();
}

void MapMemObject::emitInstrumentationData() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiCheckTraceEnabled(MStreamID))
    return;
  // Create a payload with the command name and an event using this payload to
  // emit a node_create
  MAddress = MSrcAllocaCmd->getSYCLMemObj();
  makeTraceEventProlog(MAddress);

  if (MFirstInstance) {
    xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
    xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_type",
                      deviceToString(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_name",
                      getSyclObjImpl(MQueue->get_device())->getDeviceName());
    xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
    xpti::addMetadata(TE, "queue_id", MQueue->getQueueID());

    makeTraceEventEpilog();
  }
#endif
}

pi_int32 MapMemObject::enqueueImp() {
  waitForPreparedHostEvents();
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
  std::vector<sycl::detail::pi::PiEvent> RawEvents = getPiEvents(EventImpls);
  flushCrossQueueDeps(EventImpls, getWorkerQueue());

  sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef();
  *MDstPtr = MemoryManager::map(
      MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(), MQueue,
      MMapMode, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
      MSrcReq.MOffset, MSrcReq.MElemSize, std::move(RawEvents), Event);

  return PI_SUCCESS;
}

void MapMemObject::printDot(std::ostream &Stream) const {
  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#77AFFF\", label=\"";

  Stream << "ID = " << this << " ; ";
  Stream << "MAP ON " << deviceToString(MQueue->get_device()) << "\\n";

  Stream << "\"];" << std::endl;

  for (const auto &Dep : MDeps) {
    Stream << "  \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
           << " [ label = \"Access mode: "
           << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
           << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
           << std::endl;
  }
}

UnMapMemObject::UnMapMemObject(AllocaCommandBase *DstAllocaCmd, Requirement Req,
                               void **SrcPtr, QueueImplPtr Queue)
    : Command(CommandType::UNMAP_MEM_OBJ, std::move(Queue)),
      MDstAllocaCmd(DstAllocaCmd), MDstReq(std::move(Req)), MSrcPtr(SrcPtr) {
  emitInstrumentationDataProxy();
}

void UnMapMemObject::emitInstrumentationData() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiCheckTraceEnabled(MStreamID))
    return;
  // Create a payload with the command name and an event using this payload to
  // emit a node_create
  MAddress = MDstAllocaCmd->getSYCLMemObj();
  makeTraceEventProlog(MAddress);

  if (MFirstInstance) {
    xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
    xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_type",
                      deviceToString(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_name",
                      getSyclObjImpl(MQueue->get_device())->getDeviceName());
    xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
    xpti::addMetadata(TE, "queue_id", MQueue->getQueueID());

    makeTraceEventEpilog();
  }
#endif
}

bool UnMapMemObject::producesPiEvent() const {
  // TODO remove this workaround once the batching issue is addressed in the
  // Level Zero plugin.
  // Consider the following scenario on Level Zero:
  // 1. Kernel A, which uses buffer A, is submitted to queue A.
  // 2. Kernel B, which uses buffer B, is submitted to queue B.
  // 3. queueA.wait().
  // 4. queueB.wait().
  // The DPCPP runtime used to treat unmap/write commands for buffer A/B as
  // host dependencies (i.e. they were waited for prior to enqueueing any
  // command that's dependent on them). This allowed the Level Zero plugin to
  // detect that each queue is idle on steps 1/2 and submit the command list
  // right away. This is no longer the case since we started passing these
  // dependencies in an event waitlist and the Level Zero plugin attempts to
  // batch these commands, so the execution of kernel B starts only on step 4.
  // This workaround restores the old behavior in this case until this is
  // resolved.
  return MQueue->getDeviceImplPtr()->getBackend() !=
             backend::ext_oneapi_level_zero ||
         MEvent->getHandleRef() != nullptr;
}

pi_int32 UnMapMemObject::enqueueImp() {
  waitForPreparedHostEvents();
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
  std::vector<sycl::detail::pi::PiEvent> RawEvents = getPiEvents(EventImpls);
  flushCrossQueueDeps(EventImpls, getWorkerQueue());

  sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef();
  MemoryManager::unmap(MDstAllocaCmd->getSYCLMemObj(),
                       MDstAllocaCmd->getMemAllocation(), MQueue, *MSrcPtr,
                       std::move(RawEvents), Event);

  return PI_SUCCESS;
}

void UnMapMemObject::printDot(std::ostream &Stream) const {
  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#EBC40F\", label=\"";

  Stream << "ID = " << this << " ; ";
  Stream << "UNMAP ON " << deviceToString(MQueue->get_device()) << "\\n";

  Stream << "\"];" << std::endl;

  for (const auto &Dep : MDeps) {
    Stream << "  \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
           << " [ label = \"Access mode: "
           << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
           << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
           << std::endl;
  }
}

MemCpyCommand::MemCpyCommand(Requirement SrcReq,
                             AllocaCommandBase *SrcAllocaCmd,
                             Requirement DstReq,
                             AllocaCommandBase *DstAllocaCmd,
                             QueueImplPtr SrcQueue, QueueImplPtr DstQueue)
    : Command(CommandType::COPY_MEMORY, std::move(DstQueue)),
      MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)),
      MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)),
      MDstAllocaCmd(DstAllocaCmd) {
  if (!MSrcQueue->is_host()) {
    MEvent->setContextImpl(MSrcQueue->getContextImplPtr());
  }

  MWorkerQueue = MQueue->is_host() ? MSrcQueue : MQueue;
  MEvent->setWorkerQueue(MWorkerQueue);

  emitInstrumentationDataProxy();
}

void MemCpyCommand::emitInstrumentationData() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiCheckTraceEnabled(MStreamID))
    return;
  // Create a payload with the command name and an event using this payload to
  // emit a node_create
  MAddress = MSrcAllocaCmd->getSYCLMemObj();
  makeTraceEventProlog(MAddress);

  if (MFirstInstance) {
    xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
    xpti::addMetadata(CmdTraceEvent, "sycl_device",
                      deviceToID(MQueue->get_device()));
    xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
                      deviceToString(MQueue->get_device()));
    xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
                      getSyclObjImpl(MQueue->get_device())->getDeviceName());
    xpti::addMetadata(CmdTraceEvent, "memory_object",
                      reinterpret_cast<size_t>(MAddress));
    xpti::addMetadata(CmdTraceEvent, "copy_from",
                      reinterpret_cast<size_t>(
                          getSyclObjImpl(MSrcQueue->get_device()).get()));
    xpti::addMetadata(
        CmdTraceEvent, "copy_to",
        reinterpret_cast<size_t>(getSyclObjImpl(MQueue->get_device()).get()));
    xpti::addMetadata(CmdTraceEvent, "queue_id", MQueue->getQueueID());

    makeTraceEventEpilog();
  }
#endif
}

const ContextImplPtr &MemCpyCommand::getWorkerContext() const {
  return getWorkerQueue()->getContextImplPtr();
}

bool MemCpyCommand::producesPiEvent() const {
  // TODO remove this workaround once the batching issue is addressed in the
  // Level Zero plugin.
  // Consider the following scenario on Level Zero:
  // 1. Kernel A, which uses buffer A, is submitted to queue A.
  // 2. Kernel B, which uses buffer B, is submitted to queue B.
  // 3. queueA.wait().
  // 4. queueB.wait().
  // The DPCPP runtime used to treat unmap/write commands for buffer A/B as
  // host dependencies (i.e. they were waited for prior to enqueueing any
  // command that's dependent on them). This allowed the Level Zero plugin to
  // detect that each queue is idle on steps 1/2 and submit the command list
  // right away. This is no longer the case since we started passing these
  // dependencies in an event waitlist and the Level Zero plugin attempts to
  // batch these commands, so the execution of kernel B starts only on step 4.
  // This workaround restores the old behavior in this case until this is
  // resolved.
  return MQueue->is_host() ||
         MQueue->getDeviceImplPtr()->getBackend() !=
             backend::ext_oneapi_level_zero ||
         MEvent->getHandleRef() != nullptr;
}

pi_int32 MemCpyCommand::enqueueImp() {
  waitForPreparedHostEvents();
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;

  sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef();

  auto RawEvents = getPiEvents(EventImpls);
  flushCrossQueueDeps(EventImpls, getWorkerQueue());

  MemoryManager::copy(
      MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(),
      MSrcQueue, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
      MSrcReq.MOffset, MSrcReq.MElemSize, MDstAllocaCmd->getMemAllocation(),
      MQueue, MDstReq.MDims, MDstReq.MMemoryRange, MDstReq.MAccessRange,
      MDstReq.MOffset, MDstReq.MElemSize, std::move(RawEvents), Event, MEvent);

  return PI_SUCCESS;
}

void MemCpyCommand::printDot(std::ostream &Stream) const {
  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#C7EB15\" label=\"";

  Stream << "ID = " << this << " ; ";
  Stream << "MEMCPY ON " << deviceToString(MQueue->get_device()) << "\\n";
  Stream << "From: " << MSrcAllocaCmd << " is host: " << MSrcQueue->is_host()
         << "\\n";
  Stream << "To: " << MDstAllocaCmd << " is host: " << MQueue->is_host()
         << "\\n";

  Stream << "\"];" << std::endl;

  for (const auto &Dep : MDeps) {
    Stream << "  \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
           << " [ label = \"Access mode: "
           << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
           << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
           << std::endl;
  }
}

AllocaCommandBase *ExecCGCommand::getAllocaForReq(Requirement *Req) {
  for (const DepDesc &Dep : MDeps) {
    if (Dep.MDepRequirement == Req)
      return Dep.MAllocaCmd;
  }
  // Default constructed accessors do not add dependencies, but they can be
  // passed to commands. Simply return nullptr, since they are empty and don't
  // really require any memory.
  return nullptr;
}

std::vector<std::shared_ptr<const void>>
ExecCGCommand::getAuxiliaryResources() const {
  if (MCommandGroup->getType() == CG::Kernel)
    return ((CGExecKernel *)MCommandGroup.get())->getAuxiliaryResources();
  return {};
}

void ExecCGCommand::clearAuxiliaryResources() {
  if (MCommandGroup->getType() == CG::Kernel)
    ((CGExecKernel *)MCommandGroup.get())->clearAuxiliaryResources();
}

pi_int32 UpdateHostRequirementCommand::enqueueImp() {
  waitForPreparedHostEvents();
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
  sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef();
  Command::waitForEvents(MQueue, EventImpls, Event);

  assert(MSrcAllocaCmd && "Expected valid alloca command");
  assert(MSrcAllocaCmd->getMemAllocation() && "Expected valid source pointer");
  assert(MDstPtr && "Expected valid target pointer");
  *MDstPtr = MSrcAllocaCmd->getMemAllocation();

  return PI_SUCCESS;
}

1602 void UpdateHostRequirementCommand::printDot(std::ostream &Stream) const {
1603  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#f1337f\", label=\"";
1604 
1605  Stream << "ID = " << this << "\\n";
1606  Stream << "UPDATE REQ ON " << deviceToString(MQueue->get_device()) << "\\n";
1607  bool IsReqOnBuffer =
1608  MDstReq.MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer;
1609  Stream << "TYPE: " << (IsReqOnBuffer ? "Buffer" : "Image") << "\\n";
1610  if (IsReqOnBuffer)
1611  Stream << "Is sub buffer: " << std::boolalpha << MDstReq.MIsSubBuffer
1612  << "\\n";
1613 
1614  Stream << "\"];" << std::endl;
1615 
1616  for (const auto &Dep : MDeps) {
1617  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1618  << " [ label = \"Access mode: "
1619  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1620  << "MemObj: " << Dep.MAllocaCmd->getSYCLMemObj() << " \" ]"
1621  << std::endl;
1622  }
1623 }
1624 
1625 MemCpyCommandHost::MemCpyCommandHost(Requirement SrcReq,
1626  AllocaCommandBase *SrcAllocaCmd,
1627  Requirement DstReq, void **DstPtr,
1628  QueueImplPtr SrcQueue,
1629  QueueImplPtr DstQueue)
1630  : Command(CommandType::COPY_MEMORY, std::move(DstQueue)),
1631  MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)),
1632  MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)), MDstPtr(DstPtr) {
1633  if (!MSrcQueue->is_host()) {
1634  MEvent->setContextImpl(MSrcQueue->getContextImplPtr());
1635  }
1636 
1637  MWorkerQueue = MQueue->is_host() ? MSrcQueue : MQueue;
1638  MEvent->setWorkerQueue(MWorkerQueue);
1639 
1640  emitInstrumentationDataProxy();
1641 }
1642 
1643 void MemCpyCommandHost::emitInstrumentationData() {
1644 #ifdef XPTI_ENABLE_INSTRUMENTATION
1645  if (!xptiCheckTraceEnabled(MStreamID))
1646  return;
1647  // Create a payload with the command name and an event using this payload to
1648  // emit a node_create
1649  MAddress = MSrcAllocaCmd->getSYCLMemObj();
1650  makeTraceEventProlog(MAddress);
1651 
1652  if (MFirstInstance) {
1653  xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1654  xpti::addMetadata(CmdTraceEvent, "sycl_device",
1655  deviceToID(MQueue->get_device()));
1656  xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
1657  deviceToString(MQueue->get_device()));
1658  xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
1659  getSyclObjImpl(MQueue->get_device())->getDeviceName());
1660  xpti::addMetadata(CmdTraceEvent, "memory_object",
1661  reinterpret_cast<size_t>(MAddress));
1662  xpti::addMetadata(CmdTraceEvent, "copy_from",
1663  reinterpret_cast<size_t>(
1664  getSyclObjImpl(MSrcQueue->get_device()).get()));
1665  xpti::addMetadata(
1666  CmdTraceEvent, "copy_to",
1667  reinterpret_cast<size_t>(getSyclObjImpl(MQueue->get_device()).get()));
1668  xpti::addMetadata(CmdTraceEvent, "queue_id", MQueue->getQueueID());
1669 
1670  makeTraceEventEpilog();
1671  }
1672 #endif
1673 }
1674 
1675 ContextImplPtr MemCpyCommandHost::getWorkerContext() const {
1676  return getWorkerQueue()->getContextImplPtr();
1677 }
1678 
1679 pi_int32 MemCpyCommandHost::enqueueImp() {
1680  const QueueImplPtr &Queue = getWorkerQueue();
1681  waitForPreparedHostEvents();
1682  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
1683  std::vector<sycl::detail::pi::PiEvent> RawEvents = getPiEvents(EventImpls);
1684 
1685  sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef();
1686  // Omit the copy if the access mode is a discard one.
1687  // TODO: Handle this at graph building time by, for example, creating an
1688  // empty node instead of a memcpy.
1689  if (MDstReq.MAccessMode == access::mode::discard_read_write ||
1690  MDstReq.MAccessMode == access::mode::discard_write) {
1691  Command::waitForEvents(Queue, EventImpls, Event);
1692 
1693  return PI_SUCCESS;
1694  }
1695 
1696  flushCrossQueueDeps(EventImpls, getWorkerQueue());
1697  MemoryManager::copy(
1698  MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(),
1699  MSrcQueue, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
1700  MSrcReq.MOffset, MSrcReq.MElemSize, *MDstPtr, MQueue, MDstReq.MDims,
1701  MDstReq.MMemoryRange, MDstReq.MAccessRange, MDstReq.MOffset,
1702  MDstReq.MElemSize, std::move(RawEvents), MEvent->getHandleRef(), MEvent);
1703 
1704  return PI_SUCCESS;
1705 }
1706 
1707 EmptyCommand::EmptyCommand(QueueImplPtr Queue)
1708  : Command(CommandType::EMPTY_TASK, std::move(Queue)) {
1709  emitInstrumentationDataProxy();
1710 }
1711 
1712 pi_int32 EmptyCommand::enqueueImp() {
1713  waitForPreparedHostEvents();
1714  waitForEvents(MQueue, MPreparedDepsEvents, MEvent->getHandleRef());
1715 
1716  return PI_SUCCESS;
1717 }
1718 
1719 void EmptyCommand::addRequirement(Command *DepCmd, AllocaCommandBase *AllocaCmd,
1720  const Requirement *Req) {
1721  const Requirement &ReqRef = *Req;
1722  MRequirements.emplace_back(ReqRef);
1723  const Requirement *const StoredReq = &MRequirements.back();
1724 
1725  // EmptyCommand is always a host command, so the result of addDep is
1726  // expected to be nil
1727  std::vector<Command *> ToCleanUp;
1728  Command *Cmd = addDep(DepDesc{DepCmd, StoredReq, AllocaCmd}, ToCleanUp);
1729  assert(Cmd == nullptr && "Connection command should be null for EmptyCommand");
1730  assert(ToCleanUp.empty() && "addDep should add a command for cleanup only if "
1731  "there's a connection command");
1732  (void)Cmd;
1733 }
1734 
1735 void EmptyCommand::emitInstrumentationData() {
1736 #ifdef XPTI_ENABLE_INSTRUMENTATION
1737  if (!xptiCheckTraceEnabled(MStreamID))
1738  return;
1739  // Create a payload with the command name and an event using this payload to
1740  // emit a node_create
1741  if (MRequirements.empty())
1742  return;
1743 
1744  Requirement &Req = *MRequirements.begin();
1745 
1746  MAddress = Req.MSYCLMemObj;
1747  makeTraceEventProlog(MAddress);
1748 
1749  if (MFirstInstance) {
1750  xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1751  xpti::addMetadata(CmdTraceEvent, "sycl_device",
1752  deviceToID(MQueue->get_device()));
1753  xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
1754  deviceToString(MQueue->get_device()));
1755  xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
1756  getSyclObjImpl(MQueue->get_device())->getDeviceName());
1757  xpti::addMetadata(CmdTraceEvent, "memory_object",
1758  reinterpret_cast<size_t>(MAddress));
1759  xpti::addMetadata(CmdTraceEvent, "queue_id", MQueue->getQueueID());
1760 
1761  makeTraceEventEpilog();
1762  }
1763 #endif
1764 }
1765 
1766 void EmptyCommand::printDot(std::ostream &Stream) const {
1767  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#8d8f29\", label=\"";
1768 
1769  Stream << "ID = " << this << "\\n";
1770  Stream << "EMPTY NODE"
1771  << "\\n";
1772 
1773  Stream << "\"];" << std::endl;
1774 
1775  for (const auto &Dep : MDeps) {
1776  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1777  << " [ label = \"Access mode: "
1778  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1779  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1780  << std::endl;
1781  }
1782 }
1783 
1784 bool EmptyCommand::producesPiEvent() const { return false; }
1785 
1786 void MemCpyCommandHost::printDot(std::ostream &Stream) const {
1787  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#B6A2EB\", label=\"";
1788 
1789  Stream << "ID = " << this << "\\n";
1790  Stream << "MEMCPY HOST ON " << deviceToString(MQueue->get_device()) << "\\n";
1791 
1792  Stream << "\"];" << std::endl;
1793 
1794  for (const auto &Dep : MDeps) {
1795  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1796  << " [ label = \"Access mode: "
1797  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1798  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1799  << std::endl;
1800  }
1801 }
1802 
1803 UpdateHostRequirementCommand::UpdateHostRequirementCommand(
1804  QueueImplPtr Queue, Requirement Req, AllocaCommandBase *SrcAllocaCmd,
1805  void **DstPtr)
1806  : Command(CommandType::UPDATE_REQUIREMENT, std::move(Queue)),
1807  MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(Req)), MDstPtr(DstPtr) {
1808 
1809  emitInstrumentationDataProxy();
1810 }
1811 
1812 void UpdateHostRequirementCommand::emitInstrumentationData() {
1813 #ifdef XPTI_ENABLE_INSTRUMENTATION
1814  if (!xptiCheckTraceEnabled(MStreamID))
1815  return;
1816  // Create a payload with the command name and an event using this payload to
1817  // emit a node_create
1818  MAddress = MSrcAllocaCmd->getSYCLMemObj();
1819  makeTraceEventProlog(MAddress);
1820 
1821  if (MFirstInstance) {
1822  xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1823  xpti::addMetadata(CmdTraceEvent, "sycl_device",
1824  deviceToID(MQueue->get_device()));
1825  xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
1826  deviceToString(MQueue->get_device()));
1827  xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
1828  getSyclObjImpl(MQueue->get_device())->getDeviceName());
1829  xpti::addMetadata(CmdTraceEvent, "memory_object",
1830  reinterpret_cast<size_t>(MAddress));
1831  xpti::addMetadata(CmdTraceEvent, "queue_id", MQueue->getQueueID());
1832 
1833  makeTraceEventEpilog();
1834  }
1835 #endif
1836 }
1837 
1838 static std::string cgTypeToString(detail::CG::CGTYPE Type) {
1839  switch (Type) {
1840  case detail::CG::Kernel:
1841  return "Kernel";
1842  break;
1843  case detail::CG::UpdateHost:
1844  return "update_host";
1845  break;
1846  case detail::CG::Fill:
1847  return "fill";
1848  break;
1849  case detail::CG::CopyAccToAcc:
1850  return "copy acc to acc";
1851  break;
1852  case detail::CG::CopyAccToPtr:
1853  return "copy acc to ptr";
1854  break;
1855  case detail::CG::CopyPtrToAcc:
1856  return "copy ptr to acc";
1857  break;
1858  case detail::CG::Barrier:
1859  return "barrier";
1860  case detail::CG::BarrierWaitlist:
1861  return "barrier waitlist";
1862  case detail::CG::CopyUSM:
1863  return "copy usm";
1864  break;
1865  case detail::CG::FillUSM:
1866  return "fill usm";
1867  break;
1868  case detail::CG::PrefetchUSM:
1869  return "prefetch usm";
1870  break;
1871  case detail::CG::CodeplayHostTask:
1872  return "host task";
1873  break;
1874  case detail::CG::Copy2DUSM:
1875  return "copy 2d usm";
1876  break;
1877  case detail::CG::Fill2DUSM:
1878  return "fill 2d usm";
1879  break;
1880  case detail::CG::AdviseUSM:
1881  return "advise usm";
1882  case detail::CG::Memset2DUSM:
1883  return "memset 2d usm";
1884  break;
1885  case detail::CG::CopyToDeviceGlobal:
1886  return "copy to device_global";
1887  break;
1888  case detail::CG::CopyFromDeviceGlobal:
1889  return "copy from device_global";
1890  break;
1891  case detail::CG::ReadWriteHostPipe:
1892  return "read_write host pipe";
1893  case detail::CG::ExecCommandBuffer:
1894  return "exec command buffer";
1895  case detail::CG::CopyImage:
1896  return "copy image";
1897  case detail::CG::SemaphoreWait:
1898  return "semaphore wait";
1899  case detail::CG::SemaphoreSignal:
1900  return "semaphore signal";
1901  default:
1902  return "unknown";
1903  break;
1904  }
1905 }
1906 
1907 ExecCGCommand::ExecCGCommand(
1908  std::unique_ptr<detail::CG> CommandGroup, QueueImplPtr Queue,
1909  sycl::detail::pi::PiExtCommandBuffer CommandBuffer,
1910  const std::vector<sycl::detail::pi::PiExtSyncPoint> &Dependencies)
1911  : Command(CommandType::RUN_CG, std::move(Queue), CommandBuffer,
1912  Dependencies),
1913  MCommandGroup(std::move(CommandGroup)) {
1914  if (MCommandGroup->getType() == detail::CG::CodeplayHostTask) {
1915  MEvent->setSubmittedQueue(
1916  static_cast<detail::CGHostTask *>(MCommandGroup.get())->MQueue);
1917  }
1918 
1919  emitInstrumentationDataProxy();
1920 }
1921 
1922 #ifdef XPTI_ENABLE_INSTRUMENTATION
1923 std::string instrumentationGetKernelName(
1924  const std::shared_ptr<detail::kernel_impl> &SyclKernel,
1925  const std::string &FunctionName, const std::string &SyclKernelName,
1926  void *&Address, std::optional<bool> &FromSource) {
1927  std::string KernelName;
1928  if (SyclKernel && SyclKernel->isCreatedFromSource()) {
1929  FromSource = true;
1930  pi_kernel KernelHandle = SyclKernel->getHandleRef();
1931  Address = KernelHandle;
1932  KernelName = FunctionName;
1933  } else {
1934  FromSource = false;
1935  KernelName = demangleKernelName(SyclKernelName);
1936  }
1937  return KernelName;
1938 }
1939 
1940 void instrumentationAddExtraKernelMetadata(
1941  xpti_td *&CmdTraceEvent, const NDRDescT &NDRDesc,
1942  const std::shared_ptr<detail::kernel_bundle_impl> &KernelBundleImplPtr,
1943  const std::string &KernelName,
1944  const std::shared_ptr<detail::kernel_impl> &SyclKernel,
1945  const QueueImplPtr &Queue,
1946  std::vector<ArgDesc> &CGArgs) // CGArgs are not const since they could be
1947  // sorted in this function
1948 {
1949  std::vector<ArgDesc> Args;
1950 
1951  auto FilterArgs = [&Args](detail::ArgDesc &Arg, int NextTrueIndex) {
1952  Args.push_back({Arg.MType, Arg.MPtr, Arg.MSize, NextTrueIndex});
1953  };
1954  sycl::detail::pi::PiProgram Program = nullptr;
1955  sycl::detail::pi::PiKernel Kernel = nullptr;
1956  std::mutex *KernelMutex = nullptr;
1957  const KernelArgMask *EliminatedArgMask = nullptr;
1958 
1959  std::shared_ptr<kernel_impl> SyclKernelImpl;
1960  std::shared_ptr<device_image_impl> DeviceImageImpl;
1961 
1962  // Use kernel_bundle if available unless it is interop.
1963  // Interop bundles can't be used in the first branch, because the
1964  // kernels in interop kernel bundles (if any) do not have kernel_id and
1965  // can therefore not be looked up, but since they are self-contained
1966  // they can simply be launched directly.
1967  if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) {
1968  kernel_id KernelID =
1969  detail::ProgramManager::getInstance().getSYCLKernelID(KernelName);
1970  kernel SyclKernel =
1971  KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr);
1972  std::shared_ptr<kernel_impl> KernelImpl =
1973  detail::getSyclObjImpl(SyclKernel);
1974 
1975  EliminatedArgMask = KernelImpl->getKernelArgMask();
1976  Program = KernelImpl->getDeviceImage()->get_program_ref();
1977  } else if (nullptr != SyclKernel) {
1978  auto SyclProg = SyclKernel->getProgramImpl();
1979  Program = SyclProg->getHandleRef();
1980  if (!SyclKernel->isCreatedFromSource())
1981  EliminatedArgMask = SyclKernel->getKernelArgMask();
1982  } else {
1983  std::tie(Kernel, KernelMutex, EliminatedArgMask, Program) =
1984  sycl::detail::ProgramManager::getInstance().getOrCreateKernel(
1985  Queue->getContextImplPtr(), Queue->getDeviceImplPtr(), KernelName);
1986  }
1987 
1988  applyFuncOnFilteredArgs(EliminatedArgMask, CGArgs, FilterArgs);
1989 
1990  xpti::offload_kernel_enqueue_data_t KernelData{
1991  {NDRDesc.GlobalSize[0], NDRDesc.GlobalSize[1], NDRDesc.GlobalSize[2]},
1992  {NDRDesc.LocalSize[0], NDRDesc.LocalSize[1], NDRDesc.LocalSize[2]},
1993  {NDRDesc.GlobalOffset[0], NDRDesc.GlobalOffset[1],
1994  NDRDesc.GlobalOffset[2]},
1995  Args.size()};
1996  xpti::addMetadata(CmdTraceEvent, "enqueue_kernel_data", KernelData);
1997  for (size_t i = 0; i < Args.size(); i++) {
1998  std::string Prefix("arg");
1999  xpti::offload_kernel_arg_data_t arg{(int)Args[i].MType, Args[i].MPtr,
2000  Args[i].MSize, Args[i].MIndex};
2001  xpti::addMetadata(CmdTraceEvent, Prefix + std::to_string(i), arg);
2002  }
2003 }
2004 
2005 void instrumentationFillCommonData(const std::string &KernelName,
2006  const std::string &FileName, uint64_t Line,
2007  uint64_t Column, const void *const Address,
2008  const QueueImplPtr &Queue,
2009  std::optional<bool> &FromSource,
2010  uint64_t &OutInstanceID,
2011  xpti_td *&OutTraceEvent) {
2012  // Get source file, line number information from the CommandGroup object
2013  // and create payload using name, address, and source info
2014  //
2015  // On Windows, since support for the builtin functions is not available in
2016  // MSVC, MFileName and MLine will be set to nullptr and 0, respectively.
2017  // Handle this condition explicitly here.
2018  bool HasSourceInfo = false;
2019  xpti::payload_t Payload;
2020  if (!FileName.empty()) {
2021  // File name has a valid string
2022  Payload = xpti::payload_t(KernelName.c_str(), FileName.c_str(), Line,
2023  Column, Address);
2024  HasSourceInfo = true;
2025  } else if (Address) {
2026  // We have a valid function name and an address
2027  Payload = xpti::payload_t(KernelName.c_str(), Address);
2028  } else {
2029  // In any case, we will have a valid function name and we'll use that to
2030  // create the hash
2031  Payload = xpti::payload_t(KernelName.c_str());
2032  }
2033  uint64_t CGKernelInstanceNo;
2034  // Create event using the payload
2035  xpti_td *CmdTraceEvent =
2036  xptiMakeEvent("ExecCG", &Payload, xpti::trace_graph_event,
2037  xpti::trace_activity_type_t::active, &CGKernelInstanceNo);
2038  if (CmdTraceEvent) {
2039  OutInstanceID = CGKernelInstanceNo;
2040  OutTraceEvent = CmdTraceEvent;
2041  // If we are seeing this event again, then the instance ID will be greater
2042  // than 1; in this case, we will skip sending a notification to create a
2043  // node as this node has already been created.
2044  if (CGKernelInstanceNo > 1)
2045  return;
2046 
2047  xpti::addMetadata(CmdTraceEvent, "sycl_device",
2048  deviceToID(Queue->get_device()));
2049  xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
2050  deviceToString(Queue->get_device()));
2051  xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
2052  getSyclObjImpl(Queue->get_device())->getDeviceName());
2053  if (!KernelName.empty()) {
2054  xpti::addMetadata(CmdTraceEvent, "kernel_name", KernelName);
2055  }
2056  if (FromSource.has_value()) {
2057  xpti::addMetadata(CmdTraceEvent, "from_source", FromSource.value());
2058  }
2059  if (HasSourceInfo) {
2060  xpti::addMetadata(CmdTraceEvent, "sym_function_name", KernelName);
2061  xpti::addMetadata(CmdTraceEvent, "sym_source_file_name", FileName);
2062  xpti::addMetadata(CmdTraceEvent, "sym_line_no", static_cast<int>(Line));
2063  xpti::addMetadata(CmdTraceEvent, "sym_column_no",
2064  static_cast<int>(Column));
2065  }
2066  xpti::addMetadata(CmdTraceEvent, "queue_id", Queue->getQueueID());
2067  }
2068 }
2069 #endif
2070 
2071 #ifdef XPTI_ENABLE_INSTRUMENTATION
2072 std::pair<xpti_td *, uint64_t> emitKernelInstrumentationData(
2073  int32_t StreamID, const std::shared_ptr<detail::kernel_impl> &SyclKernel,
2074  const detail::code_location &CodeLoc, const std::string &SyclKernelName,
2075  const QueueImplPtr &Queue, const NDRDescT &NDRDesc,
2076  const std::shared_ptr<detail::kernel_bundle_impl> &KernelBundleImplPtr,
2077  std::vector<ArgDesc> &CGArgs) {
2078 
2079  auto XptiObjects = std::make_pair<xpti_td *, uint64_t>(nullptr, -1);
2080  constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
2081  if (!xptiCheckTraceEnabled(StreamID))
2082  return XptiObjects;
2083 
2084  void *Address = nullptr;
2085  std::optional<bool> FromSource;
2086  std::string KernelName = instrumentationGetKernelName(
2087  SyclKernel, std::string(CodeLoc.functionName()), SyclKernelName, Address,
2088  FromSource);
2089 
2090  auto &[CmdTraceEvent, InstanceID] = XptiObjects;
2091 
2092  std::string FileName =
2093  CodeLoc.fileName() ? CodeLoc.fileName() : std::string();
2094  instrumentationFillCommonData(KernelName, FileName, CodeLoc.lineNumber(),
2095  CodeLoc.columnNumber(), Address, Queue,
2096  FromSource, InstanceID, CmdTraceEvent);
2097 
2098  if (CmdTraceEvent) {
2099  instrumentationAddExtraKernelMetadata(CmdTraceEvent, NDRDesc,
2100  KernelBundleImplPtr, SyclKernelName,
2101  SyclKernel, Queue, CGArgs);
2102 
2103  xptiNotifySubscribers(
2104  StreamID, NotificationTraceType, detail::GSYCLGraphEvent, CmdTraceEvent,
2105  InstanceID,
2106  static_cast<const void *>(
2107  commandToNodeType(Command::CommandType::RUN_CG).c_str()));
2108  }
2109 
2110  return XptiObjects;
2111 }
2112 #endif
2113 
2114 void ExecCGCommand::emitInstrumentationData() {
2115 #ifdef XPTI_ENABLE_INSTRUMENTATION
2116  constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
2117  if (!xptiCheckTraceEnabled(MStreamID))
2118  return;
2119 
2120  std::string KernelName;
2121  std::optional<bool> FromSource;
2122  switch (MCommandGroup->getType()) {
2123  case detail::CG::Kernel: {
2124  auto KernelCG =
2125  reinterpret_cast<detail::CGExecKernel *>(MCommandGroup.get());
2126  KernelName = instrumentationGetKernelName(
2127  KernelCG->MSyclKernel, MCommandGroup->MFunctionName,
2128  KernelCG->getKernelName(), MAddress, FromSource);
2129  } break;
2130  default:
2131  KernelName = getTypeString();
2132  break;
2133  }
2134 
2135  xpti_td *CmdTraceEvent = nullptr;
2136  instrumentationFillCommonData(KernelName, MCommandGroup->MFileName,
2137  MCommandGroup->MLine, MCommandGroup->MColumn,
2138  MAddress, MQueue, FromSource, MInstanceID,
2139  CmdTraceEvent);
2140 
2141  if (CmdTraceEvent) {
2142  MTraceEvent = static_cast<void *>(CmdTraceEvent);
2143  if (MCommandGroup->getType() == detail::CG::Kernel) {
2144  auto KernelCG =
2145  reinterpret_cast<detail::CGExecKernel *>(MCommandGroup.get());
2146  instrumentationAddExtraKernelMetadata(
2147  CmdTraceEvent, KernelCG->MNDRDesc, KernelCG->getKernelBundle(),
2148  KernelCG->MKernelName, KernelCG->MSyclKernel, MQueue,
2149  KernelCG->MArgs);
2150  }
2151 
2152  xptiNotifySubscribers(
2153  MStreamID, NotificationTraceType, detail::GSYCLGraphEvent,
2154  CmdTraceEvent, MInstanceID,
2155  static_cast<const void *>(commandToNodeType(MType).c_str()));
2156  }
2157 #endif
2158 }
2159 
2160 void ExecCGCommand::printDot(std::ostream &Stream) const {
2161  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\"";
2162 
2163  Stream << "ID = " << this << "\\n";
2164  Stream << "EXEC CG ON " << deviceToString(MQueue->get_device()) << "\\n";
2165 
2166  switch (MCommandGroup->getType()) {
2167  case detail::CG::Kernel: {
2168  auto KernelCG =
2169  reinterpret_cast<detail::CGExecKernel *>(MCommandGroup.get());
2170  Stream << "Kernel name: ";
2171  if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource())
2172  Stream << "created from source";
2173  else
2174  Stream << demangleKernelName(KernelCG->getKernelName());
2175  Stream << "\\n";
2176  break;
2177  }
2178  default:
2179  Stream << "CG type: " << getTypeString() << "\\n";
2180  break;
2181  }
2182 
2183  Stream << "\"];" << std::endl;
2184 
2185  for (const auto &Dep : MDeps) {
2186  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
2187  << " [ label = \"Access mode: "
2188  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
2189  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
2190  << std::endl;
2191  }
2192 }
2193 
2194 std::string ExecCGCommand::getTypeString() const {
2195  return cgTypeToString(MCommandGroup->getType());
2196 }
2197 
2198 // SYCL has a parallel_for_work_group variant where the only NDRange
2199 // characteristic set by the user is the number of work-groups. This does not
2200 // map to the OpenCL clEnqueueNDRangeKernel API, which requires the global
2201 // work size to be set as well. This function determines the local work size
2202 // based on the device characteristics and the number of work-groups requested
2203 // by the user, then calculates the global work size. SYCL specification
2204 // (from 4.8.5.3): The member function handler::parallel_for_work_group is
2205 // parameterized by the number of work-groups, such that the size of each
2206 // group is chosen by the runtime, or by the number of work-groups and number
2207 // of work-items for users who need more control.
2208 static void adjustNDRangePerKernel(NDRDescT &NDR,
2209  sycl::detail::pi::PiKernel Kernel,
2210  const device_impl &DeviceImpl) {
2211  if (NDR.GlobalSize[0] != 0)
2212  return; // GlobalSize is set - no need to adjust
2213  // check the prerequisites:
2214  assert(NDR.LocalSize[0] == 0);
2215  // TODO might be good to cache this info together with the kernel info to
2216  // avoid get_kernel_work_group_info on every kernel run
2217  range<3> WGSize = get_kernel_device_specific_info<
2218  sycl::info::kernel_device_specific::compile_work_group_size>(
2219  Kernel, DeviceImpl.getHandleRef(), DeviceImpl.getPlugin());
2220 
2221  if (WGSize[0] == 0) {
2222  WGSize = {1, 1, 1};
2223  }
2224  NDR.set(NDR.Dims, nd_range<3>(NDR.NumWorkGroups * WGSize, WGSize));
2225 }
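// Illustrative sketch only (hypothetical user code; Q is an assumed queue): a
// launch that specifies just the number of work-groups, which is the case the
// adjustment above handles. The group size comes from the runtime (or the
// kernel's compile-time work-group size), and the global size becomes
// NumWorkGroups * WGSize.
//
//   Q.submit([&](sycl::handler &CGH) {
//     CGH.parallel_for_work_group(sycl::range<1>{8}, [=](sycl::group<1> G) {
//       G.parallel_for_work_item([&](sycl::h_item<1> Item) {
//         /* per work-item code */
//       });
//     });
//   });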
2226 
2227 // We have the following mapping between dimensions with SPIR-V builtins:
2228 // 1D: id[0] -> x
2229 // 2D: id[0] -> y, id[1] -> x
2230 // 3D: id[0] -> z, id[1] -> y, id[2] -> x
2231 // So, to ensure correctness, we update all the kernel parameters
2232 // accordingly.
2233 // Initially we keep the order of the NDRDescT as provided by the user; this
2234 // simplifies overall handling. The reversal is done only when the kernel is
2235 // enqueued.
2236 void ReverseRangeDimensionsForKernel(NDRDescT &NDR) {
2237  if (NDR.Dims > 1) {
2238  std::swap(NDR.GlobalSize[0], NDR.GlobalSize[NDR.Dims - 1]);
2239  std::swap(NDR.LocalSize[0], NDR.LocalSize[NDR.Dims - 1]);
2240  std::swap(NDR.GlobalOffset[0], NDR.GlobalOffset[NDR.Dims - 1]);
2241  }
2242 }
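// Worked example of the reversal above, assuming a 3D NDRDescT: for
// GlobalSize = {2, 4, 8} in the user's order, swapping index 0 with index
// Dims - 1 yields {8, 4, 2}, so the component that maps to id[2] (x) becomes
// the first dimension expected by the SPIR-V builtins. 1D descriptors are
// left untouched.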
2243 
2244 pi_mem_obj_access AccessModeToPi(access::mode AccessorMode) {
2245  switch (AccessorMode) {
2246  case access::mode::read:
2247  return PI_ACCESS_READ_ONLY;
2248  case access::mode::write:
2249  case access::mode::discard_write:
2250  return PI_ACCESS_WRITE_ONLY;
2251  default:
2252  return PI_ACCESS_READ_WRITE;
2253  }
2254 }
2255 
2256 void SetArgBasedOnType(
2257  const PluginPtr &Plugin, sycl::detail::pi::PiKernel Kernel,
2258  const std::shared_ptr<device_image_impl> &DeviceImageImpl,
2259  const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
2260  const sycl::context &Context, bool IsHost, detail::ArgDesc &Arg,
2261  size_t NextTrueIndex) {
2262  switch (Arg.MType) {
2263  case kernel_param_kind_t::kind_stream:
2264  break;
2265  case kernel_param_kind_t::kind_accessor: {
2266  Requirement *Req = (Requirement *)(Arg.MPtr);
2267 
2268  // getMemAllocationFunc is nullptr when there are no requirements. However,
2269  // we may pass default constructed accessors to a command, which don't add
2270  // requirements. In such a case, getMemAllocationFunc is nullptr, but it is
2271  // a valid case, so we need to handle it properly.
2272  sycl::detail::pi::PiMem MemArg =
2273  getMemAllocationFunc
2274  ? (sycl::detail::pi::PiMem)getMemAllocationFunc(Req)
2275  : nullptr;
2276  if (Context.get_backend() == backend::opencl) {
2277  // clSetKernelArg (corresponding to piKernelSetArg) returns an error
2278  // when MemArg is null, which is the case when zero-sized buffers are
2279  // handled. Below assignment provides later call to clSetKernelArg with
2280  // acceptable arguments.
2281  if (!MemArg)
2282  MemArg = sycl::detail::pi::PiMem();
2283 
2284  Plugin->call<PiApiKind::piKernelSetArg>(
2285  Kernel, NextTrueIndex, sizeof(sycl::detail::pi::PiMem), &MemArg);
2286  } else {
2287  pi_mem_obj_property MemObjData{};
2288  MemObjData.mem_access = AccessModeToPi(Req->MAccessMode);
2289  MemObjData.type = PI_KERNEL_ARG_MEM_OBJ_ACCESS;
2290  Plugin->call<PiApiKind::piextKernelSetArgMemObj>(Kernel, NextTrueIndex,
2291  &MemObjData, &MemArg);
2292  }
2293  break;
2294  }
2295  case kernel_param_kind_t::kind_std_layout: {
2296  Plugin->call<PiApiKind::piKernelSetArg>(Kernel, NextTrueIndex, Arg.MSize,
2297  Arg.MPtr);
2298  break;
2299  }
2300  case kernel_param_kind_t::kind_sampler: {
2301  sampler *SamplerPtr = (sampler *)Arg.MPtr;
2302  sycl::detail::pi::PiSampler Sampler =
2303  detail::getSyclObjImpl(*SamplerPtr)->getOrCreateSampler(Context);
2304  Plugin->call<PiApiKind::piextKernelSetArgSampler>(Kernel, NextTrueIndex,
2305  &Sampler);
2306  break;
2307  }
2308  case kernel_param_kind_t::kind_pointer: {
2309  Plugin->call<PiApiKind::piextKernelSetArgPointer>(Kernel, NextTrueIndex,
2310  Arg.MSize, Arg.MPtr);
2311  break;
2312  }
2313  case kernel_param_kind_t::kind_specialization_constants_buffer: {
2314  if (IsHost) {
2315  throw sycl::exception(
2316  sycl::make_error_code(sycl::errc::feature_not_supported),
2317  "SYCL2020 specialization constants are not yet supported on host "
2318  "device " +
2319  codeToString(PI_ERROR_INVALID_OPERATION));
2320  }
2321  assert(DeviceImageImpl != nullptr);
2322  sycl::detail::pi::PiMem SpecConstsBuffer =
2323  DeviceImageImpl->get_spec_const_buffer_ref();
2324  // Avoid taking an address of nullptr
2325  sycl::detail::pi::PiMem *SpecConstsBufferArg =
2326  SpecConstsBuffer ? &SpecConstsBuffer : nullptr;
2327 
2328  pi_mem_obj_property MemObjData{};
2329  MemObjData.mem_access = PI_ACCESS_READ_ONLY;
2330  MemObjData.type = PI_KERNEL_ARG_MEM_OBJ_ACCESS;
2331  Plugin->call<PiApiKind::piextKernelSetArgMemObj>(
2332  Kernel, NextTrueIndex, &MemObjData, SpecConstsBufferArg);
2333  break;
2334  }
2335  case kernel_param_kind_t::kind_invalid:
2336  throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
2337  "Invalid kernel param kind " +
2338  codeToString(PI_ERROR_INVALID_VALUE));
2339  break;
2340  }
2341 }
2342 
2343 static pi_result SetKernelParamsAndLaunch(
2344  const QueueImplPtr &Queue, std::vector<ArgDesc> &Args,
2345  const std::shared_ptr<device_image_impl> &DeviceImageImpl,
2346  sycl::detail::pi::PiKernel Kernel, NDRDescT &NDRDesc,
2347  std::vector<sycl::detail::pi::PiEvent> &RawEvents,
2348  const detail::EventImplPtr &OutEventImpl,
2349  const KernelArgMask *EliminatedArgMask,
2350  const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
2351  bool IsCooperative) {
2352  const PluginPtr &Plugin = Queue->getPlugin();
2353 
2354  auto setFunc = [&Plugin, Kernel, &DeviceImageImpl, &getMemAllocationFunc,
2355  &Queue](detail::ArgDesc &Arg, size_t NextTrueIndex) {
2356  SetArgBasedOnType(Plugin, Kernel, DeviceImageImpl, getMemAllocationFunc,
2357  Queue->get_context(), Queue->is_host(), Arg,
2358  NextTrueIndex);
2359  };
2360 
2361  applyFuncOnFilteredArgs(EliminatedArgMask, Args, setFunc);
2362 
2363  adjustNDRangePerKernel(NDRDesc, Kernel, *(Queue->getDeviceImplPtr()));
2364 
2365  // Remember this information before the range dimensions are reversed
2366  const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0);
2367 
2368  ReverseRangeDimensionsForKernel(NDRDesc);
2369
2370  size_t RequiredWGSize[3] = {0, 0, 0};
2371  size_t *LocalSize = nullptr;
2372 
2373  if (HasLocalSize)
2374  LocalSize = &NDRDesc.LocalSize[0];
2375  else {
2376  Plugin->call<PiApiKind::piKernelGetGroupInfo>(
2377  Kernel, Queue->getDeviceImplPtr()->getHandleRef(),
2378  PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, sizeof(RequiredWGSize),
2379  RequiredWGSize, /* param_value_size_ret = */ nullptr);
2380 
2381  const bool EnforcedLocalSize =
2382  (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
2383  RequiredWGSize[2] != 0);
2384  if (EnforcedLocalSize)
2385  LocalSize = RequiredWGSize;
2386  }
2387  if (OutEventImpl != nullptr)
2388  OutEventImpl->setHostEnqueueTime();
2389  pi_result Error =
2390  [&](auto... Args) {
2391  if (IsCooperative) {
2392  return Plugin
2393  ->call_nocheck<PiApiKind::piextEnqueueCooperativeKernelLaunch>(
2394  Args...);
2395  }
2396  return Plugin->call_nocheck<PiApiKind::piEnqueueKernelLaunch>(Args...);
2397  }(Queue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
2398  &NDRDesc.GlobalSize[0], LocalSize, RawEvents.size(),
2399  RawEvents.empty() ? nullptr : &RawEvents[0],
2400  OutEventImpl ? &OutEventImpl->getHandleRef() : nullptr);
2401  return Error;
2402 }
2403 
2404 // This function initializes the accessors and calls the lambda.
2405 void DispatchNativeKernel(void *Blob) {
2406  void **CastedBlob = (void **)Blob;
2407 
2408  std::vector<Requirement *> *Reqs =
2409  static_cast<std::vector<Requirement *> *>(CastedBlob[0]);
2410 
2411  std::shared_ptr<HostKernelBase> *HostKernel =
2412  static_cast<std::shared_ptr<HostKernelBase> *>(CastedBlob[1]);
2413 
2414  NDRDescT *NDRDesc = static_cast<NDRDescT *>(CastedBlob[2]);
2415 
2416  // The remaining values are pointers to the buffers.
2417  void **NextArg = CastedBlob + 3;
2418  for (detail::Requirement *Req : *Reqs)
2419  Req->MData = *(NextArg++);
2420 
2421  (*HostKernel)->call(*NDRDesc, nullptr);
2422 
2423  // Ownership of these objects has been passed to us, so clean them up.
2424  delete Reqs;
2425  delete HostKernel;
2426  delete NDRDesc;
2427 }
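// Minimal sketch of how a matching blob could be packed on the enqueue side
// (the names Blob, NumReqs, Kernel, NDR and MemAllocations are assumptions
// for illustration; the real packing lives in the native-kernel enqueue
// path):
//
//   std::vector<void *> Blob(3 + NumReqs);
//   Blob[0] = new std::vector<Requirement *>(Reqs);        // deleted above
//   Blob[1] = new std::shared_ptr<HostKernelBase>(Kernel); // deleted above
//   Blob[2] = new NDRDescT(NDR);                           // deleted above
//   for (size_t I = 0; I < NumReqs; ++I)
//     Blob[3 + I] = MemAllocations[I]; // raw pointers consumed, not owned
//   DispatchNativeKernel(Blob.data());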
2428 
2429 pi_int32 enqueueImpCommandBufferKernel(
2430  context Ctx, DeviceImplPtr DeviceImpl,
2431  sycl::detail::pi::PiExtCommandBuffer CommandBuffer,
2432  const CGExecKernel &CommandGroup,
2433  std::vector<sycl::detail::pi::PiExtSyncPoint> &SyncPoints,
2434  sycl::detail::pi::PiExtSyncPoint *OutSyncPoint,
2435  const std::function<void *(Requirement *Req)> &getMemAllocationFunc) {
2436  auto ContextImpl = sycl::detail::getSyclObjImpl(Ctx);
2437  const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin();
2438  pi_kernel PiKernel = nullptr;
2439  pi_program PiProgram = nullptr;
2440  std::shared_ptr<kernel_impl> SyclKernelImpl = nullptr;
2441  std::shared_ptr<device_image_impl> DeviceImageImpl = nullptr;
2442 
2443  auto Kernel = CommandGroup.MSyclKernel;
2444  auto KernelBundleImplPtr = CommandGroup.MKernelBundle;
2445  const KernelArgMask *EliminatedArgMask = nullptr;
2446 
2447  // Use kernel_bundle if available unless it is interop.
2448  // Interop bundles can't be used in the first branch, because the kernels
2449  // in interop kernel bundles (if any) do not have kernel_id
2450  // and can therefore not be looked up, but since they are self-contained
2451  // they can simply be launched directly.
2452  if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) {
2453  auto KernelName = CommandGroup.MKernelName;
2454  kernel_id KernelID =
2455  sycl::detail::ProgramManager::getInstance().getSYCLKernelID(KernelName);
2456  kernel SyclKernel =
2457  KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr);
2458  SyclKernelImpl = detail::getSyclObjImpl(SyclKernel);
2459  PiKernel = SyclKernelImpl->getHandleRef();
2460  DeviceImageImpl = SyclKernelImpl->getDeviceImage();
2461  PiProgram = DeviceImageImpl->get_program_ref();
2462  EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
2463  } else if (Kernel != nullptr) {
2464  PiKernel = Kernel->getHandleRef();
2465  auto SyclProg = Kernel->getProgramImpl();
2466  PiProgram = SyclProg->getHandleRef();
2467  EliminatedArgMask = Kernel->getKernelArgMask();
2468  } else {
2469  std::tie(PiKernel, std::ignore, EliminatedArgMask, PiProgram) =
2470  sycl::detail::ProgramManager::getInstance().getOrCreateKernel(
2471  ContextImpl, DeviceImpl, CommandGroup.MKernelName);
2472  }
2473 
2474  auto SetFunc = [&Plugin, &PiKernel, &DeviceImageImpl, &Ctx,
2475  &getMemAllocationFunc](sycl::detail::ArgDesc &Arg,
2476  size_t NextTrueIndex) {
2477  sycl::detail::SetArgBasedOnType(Plugin, PiKernel, DeviceImageImpl,
2478  getMemAllocationFunc, Ctx, false, Arg,
2479  NextTrueIndex);
2480  };
2481  // Copy args for modification
2482  auto Args = CommandGroup.MArgs;
2483  sycl::detail::applyFuncOnFilteredArgs(EliminatedArgMask, Args, SetFunc);
2484 
2485  // Remember this information before the range dimensions are reversed
2486  const bool HasLocalSize = (CommandGroup.MNDRDesc.LocalSize[0] != 0);
2487 
2488  // Copy NDRDesc for modification
2489  auto NDRDesc = CommandGroup.MNDRDesc;
2490  // Reverse kernel dims
2491  sycl::detail::ReverseRangeDimensionsForKernel(NDRDesc);
2492
2493  size_t RequiredWGSize[3] = {0, 0, 0};
2494  size_t *LocalSize = nullptr;
2495 
2496  if (HasLocalSize)
2497  LocalSize = &NDRDesc.LocalSize[0];
2498  else {
2499  Plugin->call<PiApiKind::piKernelGetGroupInfo>(
2500  PiKernel, DeviceImpl->getHandleRef(),
2501  PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, sizeof(RequiredWGSize),
2502  RequiredWGSize,
2503  /* param_value_size_ret = */ nullptr);
2504 
2505  const bool EnforcedLocalSize =
2506  (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
2507  RequiredWGSize[2] != 0);
2508  if (EnforcedLocalSize)
2509  LocalSize = RequiredWGSize;
2510  }
2511 
2512  pi_result Res = Plugin->call_nocheck<
2513  PiApiKind::piextCommandBufferNDRangeKernel>(
2514  CommandBuffer, PiKernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
2515  &NDRDesc.GlobalSize[0], LocalSize, SyncPoints.size(),
2516  SyncPoints.size() ? SyncPoints.data() : nullptr, OutSyncPoint);
2517 
2518  if (!SyclKernelImpl && !Kernel) {
2519  Plugin->call<PiApiKind::piKernelRelease>(PiKernel);
2520  Plugin->call<PiApiKind::piProgramRelease>(PiProgram);
2521  }
2522 
2523  if (Res != pi_result::PI_SUCCESS) {
2524  const device_impl &DeviceImplem = *(DeviceImpl);
2525  detail::enqueue_kernel_launch::handleErrorOrWarning(Res, DeviceImplem,
2526  PiKernel, NDRDesc);
2527  }
2528 
2529  return Res;
2530 }
2531 
2532 pi_int32 enqueueImpKernel(
2533  const QueueImplPtr &Queue, NDRDescT &NDRDesc, std::vector<ArgDesc> &Args,
2534  const std::shared_ptr<detail::kernel_bundle_impl> &KernelBundleImplPtr,
2535  const std::shared_ptr<detail::kernel_impl> &MSyclKernel,
2536  const std::string &KernelName,
2537  std::vector<sycl::detail::pi::PiEvent> &RawEvents,
2538  const detail::EventImplPtr &OutEventImpl,
2539  const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
2540  sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig,
2541  const bool KernelIsCooperative) {
2542 
2543  // Run OpenCL kernel
2544  auto ContextImpl = Queue->getContextImplPtr();
2545  auto DeviceImpl = Queue->getDeviceImplPtr();
2546  sycl::detail::pi::PiKernel Kernel = nullptr;
2547  std::mutex *KernelMutex = nullptr;
2548  sycl::detail::pi::PiProgram Program = nullptr;
2549  const KernelArgMask *EliminatedArgMask;
2550 
2551  std::shared_ptr<kernel_impl> SyclKernelImpl;
2552  std::shared_ptr<device_image_impl> DeviceImageImpl;
2553 
2554  // Use kernel_bundle if available unless it is interop.
2555  // Interop bundles can't be used in the first branch, because the kernels
2556  // in interop kernel bundles (if any) do not have kernel_id
2557  // and can therefore not be looked up, but since they are self-contained
2558  // they can simply be launched directly.
2559  if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) {
2560  kernel_id KernelID =
2561  detail::ProgramManager::getInstance().getSYCLKernelID(KernelName);
2562  kernel SyclKernel =
2563  KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr);
2564 
2565  SyclKernelImpl = detail::getSyclObjImpl(SyclKernel);
2566 
2567  Kernel = SyclKernelImpl->getHandleRef();
2568  DeviceImageImpl = SyclKernelImpl->getDeviceImage();
2569 
2570  Program = DeviceImageImpl->get_program_ref();
2571 
2572  EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
2573  KernelMutex = SyclKernelImpl->getCacheMutex();
2574  } else if (nullptr != MSyclKernel) {
2575  assert(MSyclKernel->get_info<info::kernel::context>() ==
2576  Queue->get_context());
2577  Kernel = MSyclKernel->getHandleRef();
2578  auto SyclProg = MSyclKernel->getProgramImpl();
2579  Program = SyclProg->getHandleRef();
2580  // Non-cacheable kernels use mutexes from kernel_impls.
2581  // TODO this can still result in a race condition if multiple SYCL
2582  // kernels are created with the same native handle. To address this,
2583  // we need to either store and use a pi_native_handle -> mutex map or
2584  // reuse and return existing SYCL kernels from make_native to avoid
2585  // their duplication in such cases.
2586  KernelMutex = &MSyclKernel->getNoncacheableEnqueueMutex();
2587  EliminatedArgMask = MSyclKernel->getKernelArgMask();
2588  } else {
2589  std::tie(Kernel, KernelMutex, EliminatedArgMask, Program) =
2590  detail::ProgramManager::getInstance().getOrCreateKernel(
2591  ContextImpl, DeviceImpl, KernelName, NDRDesc);
2592  }
2593 
2594  // We may need more events for the launch, so we make another reference.
2595  std::vector<sycl::detail::pi::PiEvent> &EventsWaitList = RawEvents;
2596 
2597  // Initialize device globals associated with this program.
2598  std::vector<sycl::detail::pi::PiEvent> DeviceGlobalInitEvents =
2599  ContextImpl->initializeDeviceGlobals(Program, Queue);
2600  std::vector<sycl::detail::pi::PiEvent> EventsWithDeviceGlobalInits;
2601  if (!DeviceGlobalInitEvents.empty()) {
2602  EventsWithDeviceGlobalInits.reserve(RawEvents.size() +
2603  DeviceGlobalInitEvents.size());
2604  EventsWithDeviceGlobalInits.insert(EventsWithDeviceGlobalInits.end(),
2605  RawEvents.begin(), RawEvents.end());
2606  EventsWithDeviceGlobalInits.insert(EventsWithDeviceGlobalInits.end(),
2607  DeviceGlobalInitEvents.begin(),
2608  DeviceGlobalInitEvents.end());
2609  EventsWaitList = EventsWithDeviceGlobalInits;
2610  }
2611 
2612  pi_result Error = PI_SUCCESS;
2613  {
2614  // When KernelMutex is null, this means that in-memory caching is
2615  // disabled, which means that the kernel object is not shared, so no
2616  // locking is necessary.
2617  using LockT = std::unique_lock<std::mutex>;
2618  auto Lock = KernelMutex ? LockT(*KernelMutex) : LockT();
2619 
2620  // Set the SLM/cache configuration for the kernel if a non-default value
2621  // is provided.
2622  if (KernelCacheConfig == PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM ||
2623  KernelCacheConfig == PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA) {
2624  const PluginPtr &Plugin = Queue->getPlugin();
2625  Plugin->call<PiApiKind::piKernelSetExecInfo>(
2626  Kernel, PI_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG,
2627  sizeof(sycl::detail::pi::PiKernelCacheConfig), &KernelCacheConfig);
2628  }
2629 
2630  Error = SetKernelParamsAndLaunch(Queue, Args, DeviceImageImpl, Kernel,
2631  NDRDesc, EventsWaitList, OutEventImpl,
2632  EliminatedArgMask, getMemAllocationFunc,
2633  KernelIsCooperative);
2634 
2635  const PluginPtr &Plugin = Queue->getPlugin();
2636  if (!SyclKernelImpl && !MSyclKernel) {
2637  Plugin->call<PiApiKind::piKernelRelease>(Kernel);
2638  Plugin->call<PiApiKind::piProgramRelease>(Program);
2639  }
2640  }
2641  if (PI_SUCCESS != Error) {
2642  // If we have got non-success error code, let's analyze it to emit nice
2643  // exception explaining what was wrong
2644  const device_impl &DeviceImpl = *(Queue->getDeviceImplPtr());
2645  detail::enqueue_kernel_launch::handleErrorOrWarning(Error, DeviceImpl,
2646  Kernel, NDRDesc);
2647  }
2648 
2649  return PI_SUCCESS;
2650 }
2651 
2652 pi_int32
2653 enqueueReadWriteHostPipe(const QueueImplPtr &Queue, const std::string &PipeName,
2654  bool blocking, void *ptr, size_t size,
2655  std::vector<sycl::detail::pi::PiEvent> &RawEvents,
2656  const detail::EventImplPtr &OutEventImpl, bool read) {
2657  detail::HostPipeMapEntry *hostPipeEntry =
2658  ProgramManager::getInstance().getHostPipeEntry(PipeName);
2659
2660  sycl::detail::pi::PiProgram Program = nullptr;
2661  device Device = Queue->get_device();
2662  ContextImplPtr ContextImpl = Queue->getContextImplPtr();
2663  std::optional<sycl::detail::pi::PiProgram> CachedProgram =
2664  ContextImpl->getProgramForHostPipe(Device, hostPipeEntry);
2665  if (CachedProgram)
2666  Program = *CachedProgram;
2667  else {
2668  // If there was no cached program, build one.
2669  device_image_plain devImgPlain =
2670  ProgramManager::getInstance().getDeviceImageFromBinaryImage(
2671  hostPipeEntry->getDevBinImage(), Queue->get_context(),
2672  Queue->get_device());
2673  device_image_plain BuiltImage =
2674  ProgramManager::getInstance().build(devImgPlain, {Device}, {});
2675  Program = getSyclObjImpl(BuiltImage)->get_program_ref();
2676  }
2677  assert(Program && "Program for this hostpipe is not compiled.");
2678 
2679  // Get plugin for calling opencl functions
2680  const PluginPtr &Plugin = Queue->getPlugin();
2681 
2682  pi_queue pi_q = Queue->getHandleRef();
2683  pi_result Error;
2684 
2685  auto OutEvent = OutEventImpl ? &OutEventImpl->getHandleRef() : nullptr;
2686  if (OutEventImpl != nullptr)
2687  OutEventImpl->setHostEnqueueTime();
2688  if (read) {
2689  Error =
2690  Plugin->call_nocheck<PiApiKind::piextEnqueueReadHostPipe>(
2691  pi_q, Program, PipeName.c_str(), blocking, ptr, size,
2692  RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0],
2693  OutEvent);
2694  } else {
2695  Error =
2696  Plugin
2697  ->call_nocheck<PiApiKind::piextEnqueueWriteHostPipe>(
2698  pi_q, Program, PipeName.c_str(), blocking, ptr, size,
2699  RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0],
2700  OutEvent);
2701  }
2702  return Error;
2703 }
2704 
2705 pi_int32 ExecCGCommand::enqueueImpCommandBuffer() {
2706  // Wait on host command dependencies
2707  waitForPreparedHostEvents();
2708
2709  // Any device dependencies need to be waited on here since subsequent
2710  // submissions of the command buffer itself will not receive dependencies on
2711  // them, e.g. initial copies from host to device
2712  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
2713  flushCrossQueueDeps(EventImpls, getWorkerQueue());
2714  std::vector<sycl::detail::pi::PiEvent> RawEvents = getPiEvents(EventImpls);
2715  if (!RawEvents.empty()) {
2716  const PluginPtr &Plugin = MQueue->getPlugin();
2717  Plugin->call<PiApiKind::piEventsWait>(RawEvents.size(), &RawEvents[0]);
2718  }
2719 
2720  sycl::detail::pi::PiEvent *Event =
2721  (MQueue->supportsDiscardingPiEvents() &&
2722  MCommandGroup->getRequirements().size() == 0)
2723  ? nullptr
2724  : &MEvent->getHandleRef();
2725  sycl::detail::pi::PiExtSyncPoint OutSyncPoint;
2726  switch (MCommandGroup->getType()) {
2727  case CG::CGTYPE::Kernel: {
2728  CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
2729 
2730  auto getMemAllocationFunc = [this](Requirement *Req) {
2731  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2732  return AllocaCmd->getMemAllocation();
2733  };
2734 
2735  if (!Event) {
2736  // The kernel only uses assert if it is a non-interop one
2737  bool KernelUsesAssert =
2738  !(ExecKernel->MSyclKernel && ExecKernel->MSyclKernel->isInterop()) &&
2739  ProgramManager::getInstance().kernelUsesAssert(
2740  ExecKernel->MKernelName);
2741  if (KernelUsesAssert) {
2742  Event = &MEvent->getHandleRef();
2743  }
2744  }
2745  auto result = enqueueImpCommandBufferKernel(
2746  MQueue->get_context(), MQueue->getDeviceImplPtr(), MCommandBuffer,
2747  *ExecKernel, MSyncPointDeps, &OutSyncPoint, getMemAllocationFunc);
2748  MEvent->setSyncPoint(OutSyncPoint);
2749  return result;
2750  }
2751  case CG::CGTYPE::CopyUSM: {
2752  CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
2753  MemoryManager::ext_oneapi_copy_usm_cmd_buffer(
2754  MQueue->getContextImplPtr(), Copy->getSrc(), MCommandBuffer,
2755  Copy->getLength(), Copy->getDst(), MSyncPointDeps, &OutSyncPoint);
2756  MEvent->setSyncPoint(OutSyncPoint);
2757  return PI_SUCCESS;
2758  }
2759  case CG::CGTYPE::CopyAccToAcc: {
2760  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2761  Requirement *ReqSrc = (Requirement *)(Copy->getSrc());
2762  Requirement *ReqDst = (Requirement *)(Copy->getDst());
2763 
2764  AllocaCommandBase *AllocaCmdSrc = getAllocaForReq(ReqSrc);
2765  AllocaCommandBase *AllocaCmdDst = getAllocaForReq(ReqDst);
2766 
2767  MemoryManager::ext_oneapi_copyD2D_cmd_buffer(
2768  MQueue->getContextImplPtr(), MCommandBuffer,
2769  AllocaCmdSrc->getSYCLMemObj(), AllocaCmdSrc->getMemAllocation(),
2770  ReqSrc->MDims, ReqSrc->MMemoryRange, ReqSrc->MAccessRange,
2771  ReqSrc->MOffset, ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(),
2772  ReqDst->MDims, ReqDst->MMemoryRange, ReqDst->MAccessRange,
2773  ReqDst->MOffset, ReqDst->MElemSize, std::move(MSyncPointDeps),
2774  &OutSyncPoint);
2775  MEvent->setSyncPoint(OutSyncPoint);
2776  return PI_SUCCESS;
2777  }
2778  case CG::CGTYPE::CopyAccToPtr: {
2779  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2780  Requirement *Req = (Requirement *)Copy->getSrc();
2781  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2782 
2783  MemoryManager::ext_oneapi_copyD2H_cmd_buffer(
2784  MQueue->getContextImplPtr(), MCommandBuffer, AllocaCmd->getSYCLMemObj(),
2785  AllocaCmd->getMemAllocation(), Req->MDims, Req->MMemoryRange,
2786  Req->MAccessRange, Req->MOffset, Req->MElemSize, (char *)Copy->getDst(),
2787  Req->MDims, Req->MAccessRange,
2788  /*DstOffset=*/{0, 0, 0}, Req->MElemSize, std::move(MSyncPointDeps),
2789  &OutSyncPoint);
2790  MEvent->setSyncPoint(OutSyncPoint);
2791  return PI_SUCCESS;
2792  }
2793  case CG::CGTYPE::CopyPtrToAcc: {
2794  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2795  Requirement *Req = (Requirement *)(Copy->getDst());
2796  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2797 
2798  MemoryManager::ext_oneapi_copyH2D_cmd_buffer(
2799  MQueue->getContextImplPtr(), MCommandBuffer, AllocaCmd->getSYCLMemObj(),
2800  (char *)Copy->getSrc(), Req->MDims, Req->MAccessRange,
2801  /*SrcOffset*/ {0, 0, 0}, Req->MElemSize, AllocaCmd->getMemAllocation(),
2802  Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2803  Req->MElemSize, std::move(MSyncPointDeps), &OutSyncPoint);
2804  MEvent->setSyncPoint(OutSyncPoint);
2805  return PI_SUCCESS;
2806  }
2807  case CG::CGTYPE::Fill: {
2808  CGFill *Fill = (CGFill *)MCommandGroup.get();
2809  Requirement *Req = (Requirement *)(Fill->getReqToFill());
2810  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2811 
2812  MemoryManager::ext_oneapi_fill_cmd_buffer(
2813  MQueue->getContextImplPtr(), MCommandBuffer, AllocaCmd->getSYCLMemObj(),
2814  AllocaCmd->getMemAllocation(), Fill->MPattern.size(),
2815  Fill->MPattern.data(), Req->MDims, Req->MMemoryRange, Req->MAccessRange,
2816  Req->MOffset, Req->MElemSize, std::move(MSyncPointDeps), &OutSyncPoint);
2817  MEvent->setSyncPoint(OutSyncPoint);
2818  return PI_SUCCESS;
2819  }
2820  case CG::CGTYPE::FillUSM: {
2821  CGFillUSM *Fill = (CGFillUSM *)MCommandGroup.get();
2822  MemoryManager::ext_oneapi_fill_usm_cmd_buffer(
2823  MQueue->getContextImplPtr(), MCommandBuffer, Fill->getDst(),
2824  Fill->getLength(), Fill->getFill(), std::move(MSyncPointDeps),
2825  &OutSyncPoint);
2826  MEvent->setSyncPoint(OutSyncPoint);
2827  return PI_SUCCESS;
2828  }
2829  case CG::CGTYPE::PrefetchUSM: {
2830  CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get();
2831  MemoryManager::ext_oneapi_prefetch_usm_cmd_buffer(
2832  MQueue->getContextImplPtr(), MCommandBuffer, Prefetch->getDst(),
2833  Prefetch->getLength(), std::move(MSyncPointDeps), &OutSyncPoint);
2834  MEvent->setSyncPoint(OutSyncPoint);
2835  return PI_SUCCESS;
2836  }
2837  case CG::CGTYPE::AdviseUSM: {
2838  CGAdviseUSM *Advise = (CGAdviseUSM *)MCommandGroup.get();
2839  MemoryManager::ext_oneapi_advise_usm_cmd_buffer(
2840  MQueue->getContextImplPtr(), MCommandBuffer, Advise->getDst(),
2841  Advise->getLength(), Advise->getAdvice(), std::move(MSyncPointDeps),
2842  &OutSyncPoint);
2843  MEvent->setSyncPoint(OutSyncPoint);
2844  return PI_SUCCESS;
2845  }
2846 
2847  default:
2848  throw runtime_error("CG type not implemented for command buffers.",
2849  PI_ERROR_INVALID_OPERATION);
2850  }
2851 }
2852 
2853 pi_int32 ExecCGCommand::enqueueImp() {
2854  if (MCommandBuffer) {
2855  return enqueueImpCommandBuffer();
2856  } else {
2857  return enqueueImpQueue();
2858  }
2859 }
2860 
2861 pi_int32 ExecCGCommand::enqueueImpQueue() {
2862  if (getCG().getType() != CG::CGTYPE::CodeplayHostTask)
2863  waitForPreparedHostEvents();
2864  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
2865  auto RawEvents = getPiEvents(EventImpls);
2866  flushCrossQueueDeps(EventImpls, getWorkerQueue());
2867 
2868  bool DiscardPiEvent = (MQueue->supportsDiscardingPiEvents() &&
2869  MCommandGroup->getRequirements().size() == 0);
2870  sycl::detail::pi::PiEvent *Event =
2871  DiscardPiEvent ? nullptr : &MEvent->getHandleRef();
2872  detail::EventImplPtr EventImpl = DiscardPiEvent ? nullptr : MEvent;
2873 
2874  switch (MCommandGroup->getType()) {
2875 
2876  case CG::CGTYPE::UpdateHost: {
2877  throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
2878  "Update host should be handled by the Scheduler. " +
2879  codeToString(PI_ERROR_INVALID_VALUE));
2880  }
2881  case CG::CGTYPE::CopyAccToPtr: {
2882  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2883  Requirement *Req = (Requirement *)Copy->getSrc();
2884  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2885 
2886  MemoryManager::copy(
2887  AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(), MQueue,
2888  Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2889  Req->MElemSize, Copy->getDst(),
2890  Scheduler::getInstance().getDefaultHostQueue(), Req->MDims,
2891  Req->MAccessRange, Req->MAccessRange, /*DstOffset=*/{0, 0, 0},
2892  Req->MElemSize, std::move(RawEvents), MEvent->getHandleRef(), MEvent);
2893 
2894  return PI_SUCCESS;
2895  }
2896  case CG::CGTYPE::CopyPtrToAcc: {
2897  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2898  Requirement *Req = (Requirement *)(Copy->getDst());
2899  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2900 
2901  Scheduler::getInstance().getDefaultHostQueue();
2902
2903  MemoryManager::copy(
2904  AllocaCmd->getSYCLMemObj(), Copy->getSrc(),
2905  Scheduler::getInstance().getDefaultHostQueue(), Req->MDims,
2906  Req->MAccessRange, Req->MAccessRange,
2907  /*SrcOffset*/ {0, 0, 0}, Req->MElemSize, AllocaCmd->getMemAllocation(),
2908  MQueue, Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2909  Req->MElemSize, std::move(RawEvents), MEvent->getHandleRef(), MEvent);
2910 
2911  return PI_SUCCESS;
2912  }
2913  case CG::CGTYPE::CopyAccToAcc: {
2914  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2915  Requirement *ReqSrc = (Requirement *)(Copy->getSrc());
2916  Requirement *ReqDst = (Requirement *)(Copy->getDst());
2917 
2918  AllocaCommandBase *AllocaCmdSrc = getAllocaForReq(ReqSrc);
2919  AllocaCommandBase *AllocaCmdDst = getAllocaForReq(ReqDst);
2920 
2921  MemoryManager::copy(
2922  AllocaCmdSrc->getSYCLMemObj(), AllocaCmdSrc->getMemAllocation(), MQueue,
2923  ReqSrc->MDims, ReqSrc->MMemoryRange, ReqSrc->MAccessRange,
2924  ReqSrc->MOffset, ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(),
2925  MQueue, ReqDst->MDims, ReqDst->MMemoryRange, ReqDst->MAccessRange,
2926  ReqDst->MOffset, ReqDst->MElemSize, std::move(RawEvents),
2927  MEvent->getHandleRef(), MEvent);
2928 
2929  return PI_SUCCESS;
2930  }
2931  case CG::CGTYPE::Fill: {
2932  CGFill *Fill = (CGFill *)MCommandGroup.get();
2933  Requirement *Req = (Requirement *)(Fill->getReqToFill());
2934  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2935 
2936  MemoryManager::fill(
2937  AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(), MQueue,
2938  Fill->MPattern.size(), Fill->MPattern.data(), Req->MDims,
2939  Req->MMemoryRange, Req->MAccessRange, Req->MOffset, Req->MElemSize,
2940  std::move(RawEvents), MEvent->getHandleRef(), MEvent);
2941 
2942  return PI_SUCCESS;
2943  }
2944  case CG::CGTYPE::Kernel: {
2945  CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
2946 
2947  NDRDescT &NDRDesc = ExecKernel->MNDRDesc;
2948  std::vector<ArgDesc> &Args = ExecKernel->MArgs;
2949 
2950  if (MQueue->is_host() || (MQueue->getDeviceImplPtr()->getBackend() ==
2951  backend::ext_intel_esimd_emulator)) {
2952  for (ArgDesc &Arg : Args)
2953  if (kernel_param_kind_t::kind_accessor == Arg.MType) {
2954  Requirement *Req = (Requirement *)(Arg.MPtr);
2955  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2956  Req->MData = AllocaCmd->getMemAllocation();
2957  }
2958  if (!RawEvents.empty()) {
2959  // Assuming that the events are from devices of the same plugin.
2960  const PluginPtr &Plugin = EventImpls[0]->getPlugin();
2961  Plugin->call<PiApiKind::piEventsWait>(RawEvents.size(), &RawEvents[0]);
2962  }
2963 
2964  if (MQueue->is_host()) {
2965  ExecKernel->MHostKernel->call(NDRDesc,
2966  getEvent()->getHostProfilingInfo());
2967  } else {
2968  assert(MQueue->getDeviceImplPtr()->getBackend() ==
2969  backend::ext_intel_esimd_emulator);
2970  if (MEvent != nullptr)
2971  MEvent->setHostEnqueueTime();
2972  MQueue->getPlugin()->call<PiApiKind::piEnqueueKernelLaunch>(
2973  nullptr,
2974  reinterpret_cast<pi_kernel>(ExecKernel->MHostKernel->getPtr()),
2975  NDRDesc.Dims, &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0],
2976  &NDRDesc.LocalSize[0], 0, nullptr, nullptr);
2977  }
2978  return PI_SUCCESS;
2979  }
2980 
2981  auto getMemAllocationFunc = [this](Requirement *Req) {
2982  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2983  // getAllocaForReq may return nullptr if Req is a default constructed
2984  // accessor. Simply return nullptr in such a case.
2985  return AllocaCmd ? AllocaCmd->getMemAllocation() : nullptr;
2986  };
2987 
2988  const std::shared_ptr<detail::kernel_impl> &SyclKernel =
2989  ExecKernel->MSyclKernel;
2990  const std::string &KernelName = ExecKernel->MKernelName;
2991 
2992  if (!EventImpl) {
2993  // The kernel only uses assert if it is a non-interop one
2994  bool KernelUsesAssert =
2995  !(SyclKernel && SyclKernel->isInterop()) &&
2996  ProgramManager::getInstance().kernelUsesAssert(KernelName);
2997  if (KernelUsesAssert) {
2998  EventImpl = MEvent;
2999  }
3000  }
3001 
3002  return enqueueImpKernel(
3003  MQueue, NDRDesc, Args, ExecKernel->getKernelBundle(), SyclKernel,
3004  KernelName, RawEvents, EventImpl, getMemAllocationFunc,
3005  ExecKernel->MKernelCacheConfig, ExecKernel->MKernelIsCooperative);
3006  }
3007  case CG::CGTYPE::CopyUSM: {
3008  CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
3009  MemoryManager::copy_usm(Copy->getSrc(), MQueue, Copy->getLength(),
3010  Copy->getDst(), std::move(RawEvents), Event,
3011  MEvent);
3012 
3013  return PI_SUCCESS;
3014  }
3015  case CG::CGTYPE::FillUSM: {
3016  CGFillUSM *Fill = (CGFillUSM *)MCommandGroup.get();
3017  MemoryManager::fill_usm(Fill->getDst(), MQueue, Fill->getLength(),
3018  Fill->getFill(), std::move(RawEvents), Event,
3019  MEvent);
3020 
3021  return PI_SUCCESS;
3022  }
3023  case CG::CGTYPE::PrefetchUSM: {
3024  CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get();
3025  MemoryManager::prefetch_usm(Prefetch->getDst(), MQueue,
3026  Prefetch->getLength(), std::move(RawEvents),
3027  Event, MEvent);
3028 
3029  return PI_SUCCESS;
3030  }
3031  case CG::CGTYPE::AdviseUSM: {
3032  CGAdviseUSM *Advise = (CGAdviseUSM *)MCommandGroup.get();
3033  MemoryManager::advise_usm(Advise->getDst(), MQueue, Advise->getLength(),
3034  Advise->getAdvice(), std::move(RawEvents), Event,
3035  MEvent);
3036 
3037  return PI_SUCCESS;
3038  }
3039  case CG::CGTYPE::Copy2DUSM: {
3040  CGCopy2DUSM *Copy = (CGCopy2DUSM *)MCommandGroup.get();
3041  MemoryManager::copy_2d_usm(Copy->getSrc(), Copy->getSrcPitch(), MQueue,
3042  Copy->getDst(), Copy->getDstPitch(),
3043  Copy->getWidth(), Copy->getHeight(),
3044  std::move(RawEvents), Event, MEvent);
3045  return PI_SUCCESS;
3046  }
3047  case CG::CGTYPE::Fill2DUSM: {
3048  CGFill2DUSM *Fill = (CGFill2DUSM *)MCommandGroup.get();
3049  MemoryManager::fill_2d_usm(Fill->getDst(), MQueue, Fill->getPitch(),
3050  Fill->getWidth(), Fill->getHeight(),
3051  Fill->getPattern(), std::move(RawEvents), Event,
3052  MEvent);
3053  return PI_SUCCESS;
3054  }
3055  case CG::CGTYPE::Memset2DUSM: {
3056  CGMemset2DUSM *Memset = (CGMemset2DUSM *)MCommandGroup.get();
3057  MemoryManager::memset_2d_usm(Memset->getDst(), MQueue, Memset->getPitch(),
3058  Memset->getWidth(), Memset->getHeight(),
3059  Memset->getValue(), std::move(RawEvents),
3060  Event, MEvent);
3061  return PI_SUCCESS;
3062  }
3063  case CG::CGTYPE::CodeplayHostTask: {
3064  CGHostTask *HostTask = static_cast<CGHostTask *>(MCommandGroup.get());
3065 
3066  for (ArgDesc &Arg : HostTask->MArgs) {
3067  switch (Arg.MType) {
3068  case kernel_param_kind_t::kind_accessor: {
3069  Requirement *Req = static_cast<Requirement *>(Arg.MPtr);
3070  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
3071 
3072  if (AllocaCmd)
3073  Req->MData = AllocaCmd->getMemAllocation();
3074  break;
3075  }
3076  default:
3077  throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
3078  "Unsupported arg type " +
3079  codeToString(PI_ERROR_INVALID_VALUE));
3080  }
3081  }
3082 
3083  std::vector<interop_handle::ReqToMem> ReqToMem;
3084 
3085  if (HostTask->MHostTask->isInteropTask()) {
3086  // Extract the Mem Objects for all Requirements, to ensure they are
3087  // available if a user asks for them inside the interop task scope
3088  const std::vector<Requirement *> &HandlerReq =
3089  HostTask->getRequirements();
3090  auto ReqToMemConv = [&ReqToMem, HostTask](Requirement *Req) {
3091  const std::vector<AllocaCommandBase *> &AllocaCmds =
3092  Req->MSYCLMemObj->MRecord->MAllocaCommands;
3093 
3094  for (AllocaCommandBase *AllocaCmd : AllocaCmds)
3095  if (HostTask->MQueue->getContextImplPtr() ==
3096  AllocaCmd->getQueue()->getContextImplPtr()) {
3097  auto MemArg =
3098  reinterpret_cast<pi_mem>(AllocaCmd->getMemAllocation());
3099  ReqToMem.emplace_back(std::make_pair(Req, MemArg));
3100 
3101  return;
3102  }
3103 
3104  assert(false &&
3105  "Can't get memory object due to no allocation available");
3106 
3107  throw sycl::exception(
3108  sycl::make_error_code(sycl::errc::runtime),
3109  "Can't get memory object due to no allocation available " +
3110  codeToString(PI_ERROR_INVALID_MEM_OBJECT));
3111  };
3112  std::for_each(std::begin(HandlerReq), std::end(HandlerReq), ReqToMemConv);
3113  std::sort(std::begin(ReqToMem), std::end(ReqToMem));
3114  }
3115 
3116  // Host task is executed asynchronously so we should record where it was
3117  // submitted to report exception origin properly.
3118  copySubmissionCodeLocation();
3119
3120  MQueue->getThreadPool().submit<DispatchHostTask>(
3121  DispatchHostTask(this, std::move(ReqToMem)));
3122 
3123  MShouldCompleteEventIfPossible = false;
3124 
3125  return PI_SUCCESS;
3126  }
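For context, a minimal sketch of the user-side construct this case dispatches (not from this file; `q` and `Buf` are assumed). The ReqToMem table built above is what answers the native-memory query inside the host task:

  q.submit([&](sycl::handler &h) {
    sycl::accessor Acc{Buf, h, sycl::read_write};
    h.host_task([=](sycl::interop_handle IH) {
      // Runs on a thread-pool thread via DispatchHostTask; get_native_mem
      // is served from the (Requirement, pi_mem) pairs collected above.
      auto NativeMem = IH.get_native_mem<sycl::backend::opencl>(Acc);
      (void)NativeMem; // ... use with the native API ...
    });
  });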
3127  case CG::CGTYPE::Barrier: {
3128  if (MQueue->getDeviceImplPtr()->is_host()) {
3129  // NOP for host device.
3130  return PI_SUCCESS;
3131  }
3132  const PluginPtr &Plugin = MQueue->getPlugin();
3133  if (MEvent != nullptr)
3134  MEvent->setHostEnqueueTime();
3135  Plugin->call<PiApiKind::piEnqueueEventsWaitWithBarrier>(
3136  MQueue->getHandleRef(), 0, nullptr, Event);
3137 
3138  return PI_SUCCESS;
3139  }
3140  case CG::CGTYPE::BarrierWaitlist: {
3141  CGBarrier *Barrier = static_cast<CGBarrier *>(MCommandGroup.get());
3142  std::vector<detail::EventImplPtr> Events = Barrier->MEventsWaitWithBarrier;
3143  std::vector<sycl::detail::pi::PiEvent> PiEvents =
3144  getPiEventsBlocking(Events);
3145  if (MQueue->getDeviceImplPtr()->is_host() || PiEvents.empty()) {
3146  // NOP for host device.
3147  // If Events is empty, then the barrier has no effect.
3148  return PI_SUCCESS;
3149  }
3150  const PluginPtr &Plugin = MQueue->getPlugin();
3151  if (MEvent != nullptr)
3152  MEvent->setHostEnqueueTime();
3153  Plugin->call<PiApiKind::piEnqueueEventsWaitWithBarrier>(
3154  MQueue->getHandleRef(), PiEvents.size(), &PiEvents[0], Event);
3155 
3156  return PI_SUCCESS;
3157  }
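Both barrier cases originate from queue-level barrier submissions (sycl_ext_oneapi_enqueue_barrier); a hedged sketch, with names invented:

  sycl::event E = q.single_task([] { /* work */ });
  q.ext_oneapi_submit_barrier();                            // -> CGTYPE::Barrier
  q.ext_oneapi_submit_barrier(std::vector<sycl::event>{E}); // -> CGTYPE::BarrierWaitlist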
3158  case CG::CGTYPE::CopyToDeviceGlobal: {
3159  CGCopyToDeviceGlobal *Copy = (CGCopyToDeviceGlobal *)MCommandGroup.get();
3160  MemoryManager::copy_to_device_global(
3161  Copy->getDeviceGlobalPtr(), Copy->isDeviceImageScoped(), MQueue,
3162  Copy->getNumBytes(), Copy->getOffset(), Copy->getSrc(),
3163  std::move(RawEvents), Event, MEvent);
3164 
3165  return PI_SUCCESS;
3166  }
3167  case CG::CGTYPE::CopyFromDeviceGlobal: {
3168  CGCopyFromDeviceGlobal *Copy =
3169  (CGCopyFromDeviceGlobal *)MCommandGroup.get();
3170  MemoryManager::copy_from_device_global(
3171  Copy->getDeviceGlobalPtr(), Copy->isDeviceImageScoped(), MQueue,
3172  Copy->getNumBytes(), Copy->getOffset(), Copy->getDest(),
3173  std::move(RawEvents), Event, MEvent);
3174 
3175  return PI_SUCCESS;
3176  }
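These two cases implement the copy API of sycl_ext_oneapi_device_global. A minimal sketch, assuming that extension (`DG` and the host variables are illustrative):

  namespace exp = sycl::ext::oneapi::experimental;
  exp::device_global<int> DG; // namespace scope, as the extension requires

  void roundtrip(sycl::queue &q) {
    int In = 42, Out = 0;
    q.copy(&In, DG).wait();  // -> CGTYPE::CopyToDeviceGlobal
    q.copy(DG, &Out).wait(); // -> CGTYPE::CopyFromDeviceGlobal
  }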
3177  case CG::CGTYPE::ReadWriteHostPipe: {
3178  CGReadWriteHostPipe *ExecReadWriteHostPipe =
3179  (CGReadWriteHostPipe *)MCommandGroup.get();
3180  std::string pipeName = ExecReadWriteHostPipe->getPipeName();
3181  void *hostPtr = ExecReadWriteHostPipe->getHostPtr();
3182  size_t typeSize = ExecReadWriteHostPipe->getTypeSize();
3183  bool blocking = ExecReadWriteHostPipe->isBlocking();
3184  bool read = ExecReadWriteHostPipe->isReadHostPipe();
3185 
3186  if (!EventImpl) {
3187  EventImpl = MEvent;
3188  }
3189  return enqueueReadWriteHostPipe(MQueue, pipeName, blocking, hostPtr,
3190  typeSize, RawEvents, EventImpl, read);
3191  }
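This case serves host-side pipe traffic. Assuming the experimental FPGA host-pipe interface, where a pipe's static read/write members accept a queue (the pipe name and payload type here are invented):

  using HostPipe = sycl::ext::intel::experimental::pipe<class MyPipeID, int>;

  void talk(sycl::queue &q) {
    HostPipe::write(q, 7);     // blocking host write -> CGTYPE::ReadWriteHostPipe
    int V = HostPipe::read(q); // blocking host read  -> CGTYPE::ReadWriteHostPipe
    (void)V;
  }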
3192  case CG::CGTYPE::ExecCommandBuffer: {
3193  CGExecCommandBuffer *CmdBufferCG =
3194  static_cast<CGExecCommandBuffer *>(MCommandGroup.get());
3195  if (MEvent != nullptr)
3196  MEvent->setHostEnqueueTime();
3197  return MQueue->getPlugin()
3198  ->call_nocheck<PiApiKind::piextEnqueueCommandBuffer>(
3199  CmdBufferCG->MCommandBuffer, MQueue->getHandleRef(),
3200  RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0],
3201  Event);
3202  }
3203  case CG::CGTYPE::CopyImage: {
3204  CGCopyImage *Copy = (CGCopyImage *)MCommandGroup.get();
3205 
3206  sycl::detail::pi::PiMemImageDesc Desc = Copy->getDesc();
3207 
3208  MemoryManager::copy_image_bindless(
3209  Copy->getSrc(), MQueue, Copy->getDst(), Desc, Copy->getFormat(),
3210  Copy->getCopyFlags(), Copy->getSrcOffset(), Copy->getDstOffset(),
3211  Copy->getHostExtent(), Copy->getCopyExtent(), std::move(RawEvents),
3212  Event);
3213  return PI_SUCCESS;
3214  }
3215  case CG::CGTYPE::SemaphoreWait: {
3216  CGSemaphoreWait *SemWait = (CGSemaphoreWait *)MCommandGroup.get();
3217  if (MQueue->getDeviceImplPtr()->is_host()) {
3218  // NOP for host device.
3219  return PI_SUCCESS;
3220  }
3221 
3222  const detail::PluginPtr &Plugin = MQueue->getPlugin();
3223  Plugin->call<PiApiKind::piextWaitExternalSemaphore>(
3224  MQueue->getHandleRef(), SemWait->getInteropSemaphoreHandle(), 0,
3225  nullptr, nullptr);
3226 
3227  return PI_SUCCESS;
3228  }
3229  case CG::CGTYPE::SemaphoreSignal: {
3230  CGSemaphoreSignal *SemSignal = (CGSemaphoreSignal *)MCommandGroup.get();
3231  if (MQueue->getDeviceImplPtr()->is_host()) {
3232  // NOP for host device.
3233  return PI_SUCCESS;
3234  }
3235 
3236  const detail::PluginPtr &Plugin = MQueue->getPlugin();
3237  Plugin->call<PiApiKind::piextSignalExternalSemaphore>(
3238  MQueue->getHandleRef(), SemSignal->getInteropSemaphoreHandle(), 0,
3239  nullptr, nullptr);
3240 
3241  return PI_SUCCESS;
3242  }
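Both semaphore cases come from the bindless-images interop extension. A heavily hedged sketch assuming sycl_ext_oneapi_bindless_images (`SemDesc` and `Ctx` are placeholders for a filled-in external-semaphore descriptor and a context):

  namespace exp = sycl::ext::oneapi::experimental;
  // SemDesc: an external-semaphore descriptor built from a native handle
  // (placeholder); Ctx: a sycl::context (placeholder).
  auto Sem = exp::import_external_semaphore(SemDesc, q.get_device(), Ctx);
  q.ext_oneapi_wait_external_semaphore(Sem);   // -> CGTYPE::SemaphoreWait
  q.ext_oneapi_signal_external_semaphore(Sem); // -> CGTYPE::SemaphoreSignal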
3243  case CG::CGTYPE::None:
3244  throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
3245  "CG type not implemented. " +
3246  codeToString(PI_ERROR_INVALID_OPERATION));
3247  }
3248  return PI_ERROR_INVALID_OPERATION;
3249 }
3250 
3251 bool ExecCGCommand::producesPiEvent() const {
3252  return !MCommandBuffer &&
3253  MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask;
3254 }
3255 
3256 bool ExecCGCommand::supportsPostEnqueueCleanup() const {
3257  // Host tasks are cleaned up upon completion instead.
3258  return Command::supportsPostEnqueueCleanup() &&
3259  (MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask);
3260 }
3261 
3262 bool ExecCGCommand::readyForCleanup() const {
3263  if (MCommandGroup->getType() == CG::CGTYPE::CodeplayHostTask)
3264  return MLeafCounter == 0 && MEvent->isCompleted();
3265  return Command::readyForCleanup();
3266 }
3267 
3268 KernelFusionCommand::KernelFusionCommand(QueueImplPtr Queue)
3269  : Command(Command::CommandType::FUSION, Queue),
3270  MStatus(FusionStatus::ACTIVE) {
3271  emitInstrumentationDataProxy();
3272 }
3273 
3274 std::vector<Command *> &KernelFusionCommand::auxiliaryCommands() {
3275  return MAuxiliaryCommands;
3276 }
3277 
3278 void KernelFusionCommand::addToFusionList(ExecCGCommand *Kernel) {
3279  MFusionList.push_back(Kernel);
3280 }
3281 
3282 std::vector<ExecCGCommand *> &KernelFusionCommand::getFusionList() {
3283  return MFusionList;
3284 }
3285 
3286 bool KernelFusionCommand::producesPiEvent() const { return false; }
3287 
3288 pi_int32 KernelFusionCommand::enqueueImp() {
3289  waitForPreparedHostEvents();
3290  waitForEvents(MQueue, MPreparedDepsEvents, MEvent->getHandleRef());
3291 
3292  // We need to release the queue here because KernelFusionCommands are
3293  // held back by the scheduler and would thus prevent queue deallocation.
3294  resetQueue();
3295  return PI_SUCCESS;
3296 }
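The fusion command's life cycle is driven by the kernel-fusion extension API; a hedged sketch using sycl_ext_codeplay_kernel_fusion (kernel bodies elided):

  namespace fusion = sycl::ext::codeplay::experimental;
  fusion::fusion_wrapper FW{q};
  FW.start_fusion();              // scheduler inserts a KernelFusionCommand (ACTIVE)
  q.single_task<class KA>([] {}); // recorded in the fusion list, not yet run
  q.single_task<class KB>([] {});
  FW.complete_fusion();           // kernels are JIT-fused and enqueued

The command itself performs no device work, which is why enqueueImp above only waits for its dependencies and then releases the queue.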
3297 
3298 void KernelFusionCommand::setFusionStatus(FusionStatus Status) {
3299  MStatus = Status;
3300 }
3301 
3302 void KernelFusionCommand::resetQueue() {
3303  assert(MStatus != FusionStatus::ACTIVE &&
3304  "Cannot release the queue attached to the KernelFusionCommand if it "
3305  "is active.");
3306  MQueue.reset();
3307  MWorkerQueue.reset();
3308 }
3309 
3310 void KernelFusionCommand::emitInstrumentationData() {
3311 #ifdef XPTI_ENABLE_INSTRUMENTATION
3312  constexpr uint16_t NotificationTraceType = xpti::trace_node_create;
3313  if (!xptiCheckTraceEnabled(MStreamID)) {
3314  return;
3315  }
3316  // Create a payload with the command name and an event using this payload to
3317  // emit a node_create
3318  MCommandNodeType = commandToNodeType(MType);
3319  MCommandName = commandToName(MType);
3320 
3321  static unsigned FusionNodeCount = 0;
3322  std::stringstream PayloadStr;
3323  PayloadStr << "Fusion command #" << FusionNodeCount++;
3324  xpti::payload_t Payload = xpti::payload_t(PayloadStr.str().c_str());
3325 
3326  uint64_t CommandInstanceNo = 0;
3327  xpti_td *CmdTraceEvent =
3328  xptiMakeEvent(MCommandName.c_str(), &Payload, xpti::trace_graph_event,
3329  xpti_at::active, &CommandInstanceNo);
3330 
3331  MInstanceID = CommandInstanceNo;
3332  if (CmdTraceEvent) {
3333  MTraceEvent = static_cast<void *>(CmdTraceEvent);
3334  // If we are seeing this event again, then the instance ID
3335  // will be greater
3336  // than 1; in this case, we must skip sending a
3337  // notification to create a node as this node has already
3338  // been created. We return this value so the epilog method
3339  // can be called selectively.
3340  // See makeTraceEventProlog.
3341  MFirstInstance = (CommandInstanceNo == 1);
3342  }
3343 
3344  // This function is called in the constructor of the command. At this point
3345  // the kernel fusion list is still empty, so there is not much information
3346  // we could attach to this node here.
3347  if (MFirstInstance && CmdTraceEvent) {
3348  xpti::addMetadata(CmdTraceEvent, "sycl_device",
3349  deviceToID(MQueue->get_device()));
3350  xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
3351  deviceToString(MQueue->get_device()));
3352  xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
3353  getSyclObjImpl(MQueue->get_device())->getDeviceName());
3354  xpti::addMetadata(CmdTraceEvent, "queue_id", MQueue->getQueueID());
3355  }
3356 
3357  if (MFirstInstance) {
3358  xptiNotifySubscribers(MStreamID, NotificationTraceType,
3359  detail::GSYCLGraphEvent,
3360  static_cast<xpti_td *>(MTraceEvent), MInstanceID,
3361  static_cast<const void *>(MCommandNodeType.c_str()));
3362  }
3363 
3364 #endif
3365 }
3366 
3367 void KernelFusionCommand::printDot(std::ostream &Stream) const {
3368  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\"";
3369 
3370  Stream << "ID = " << this << "\\n";
3371  Stream << "KERNEL FUSION on " << deviceToString(MQueue->get_device()) << "\\n"
3372  << "FUSION LIST: {";
3373  bool Initial = true;
3374  for (auto *Cmd : MFusionList) {
3375  if (!Initial) {
3376  Stream << ",\\n";
3377  }
3378  Initial = false;
3379  auto *KernelCG = static_cast<detail::CGExecKernel *>(&Cmd->getCG());
3380  if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource()) {
3381  Stream << "created from source";
3382  } else {
3383  Stream << demangleKernelName(KernelCG->getKernelName());
3384  }
3385  }
3386  Stream << "}\\n";
3387 
3388  Stream << "\"];" << std::endl;
3389 
3390  for (const auto &Dep : MDeps) {
3391  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
3392  << " [ label = \"Access mode: "
3393  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
3394  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
3395  << std::endl;
3396  }
3397 }
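Given the format above, a fused node in the emitted Graphviz dump looks roughly like this (pointer values and kernel names invented); such dumps are typically obtained by enabling graph printing, e.g. through the SYCL_PRINT_EXECUTION_GRAPH environment variable:

  "0x1f0b2c0" [style=filled, fillcolor="#AFFF82", label="ID = 0x1f0b2c0\nKERNEL FUSION on GPU\nFUSION LIST: {KernelA,\nKernelB}\n"];
  "0x1f0b2c0" -> "0x1e99d80" [ label = "Access mode: read_write\nMemObj: 0x1e87650 " ]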
3398 
3399 } // namespace detail
3400 } // namespace _V1
3401 } // namespace sycl