DPC++ Runtime
Runtime libraries for oneAPI DPC++
commands.cpp
//===----------- commands.cpp - SYCL commands -------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <detail/context_impl.hpp>
#include <detail/event_impl.hpp>
#include <detail/kernel_impl.hpp>
#include <detail/kernel_info.hpp>
#include <detail/memory_manager.hpp>
#include <detail/program_impl.hpp>
#include <detail/program_manager/program_manager.hpp>
#include <detail/queue_impl.hpp>
#include <detail/sampler_impl.hpp>
#include <detail/scheduler/commands.hpp>
#include <detail/scheduler/scheduler.hpp>
#include <detail/stream_impl.hpp>
#include <detail/xpti_registry.hpp>
#include <sycl/access/access.hpp>
#include <sycl/backend_types.hpp>
#include <sycl/detail/cg_types.hpp>
#include <sycl/sampler.hpp>

#include <cassert>
#include <optional>
#include <string>
#include <vector>

#ifdef __has_include
#if __has_include(<cxxabi.h>)
#define __SYCL_ENABLE_GNU_DEMANGLING
#include <cstdlib>
#include <cxxabi.h>
#include <memory>
#endif
#endif

#ifdef XPTI_ENABLE_INSTRUMENTATION
#include "xpti/xpti_trace_framework.hpp"
#include <detail/xpti_registry.hpp>
#endif

namespace sycl {
__SYCL_INLINE_VER_NAMESPACE(_V1) {
namespace detail {

#ifdef XPTI_ENABLE_INSTRUMENTATION
// Global graph for the application
extern xpti::trace_event_data_t *GSYCLGraphEvent;
#endif

#ifdef __SYCL_ENABLE_GNU_DEMANGLING
struct DemangleHandle {
  char *p;
  DemangleHandle(char *ptr) : p(ptr) {}
  ~DemangleHandle() { std::free(p); }
};
static std::string demangleKernelName(std::string Name) {
  int Status = -1; // some arbitrary value to eliminate the compiler warning
  DemangleHandle result(abi::__cxa_demangle(Name.c_str(), NULL, NULL, &Status));
  return (Status == 0) ? result.p : Name;
}
#else
static std::string demangleKernelName(std::string Name) { return Name; }
#endif
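
// Illustrative sketch (not part of the runtime): when <cxxabi.h> is
// available, demangleKernelName() turns an Itanium-ABI mangled name into a
// readable one, e.g.:
//
//   demangleKernelName("_Z3addii");    // -> "add(int, int)"
//   demangleKernelName("not_mangled"); // -> returned unchanged (Status != 0)
//
// The DemangleHandle RAII wrapper frees the malloc()-ed buffer returned by
// abi::__cxa_demangle even when the demangled copy is used for the result.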

static std::string deviceToString(device Device) {
  if (getSyclObjImpl(Device)->is_host())
    return "HOST";
  else if (Device.is_cpu())
    return "CPU";
  else if (Device.is_gpu())
    return "GPU";
  else if (Device.is_accelerator())
    return "ACCELERATOR";
  else
    return "UNKNOWN";
}

static void applyFuncOnFilteredArgs(
    const ProgramManager::KernelArgMask &EliminatedArgMask,
    std::vector<ArgDesc> &Args,
    std::function<void(detail::ArgDesc &Arg, int NextTrueIndex)> Func) {
  if (EliminatedArgMask.empty()) {
    for (ArgDesc &Arg : Args) {
      Func(Arg, Arg.MIndex);
    }
  } else {
    // TODO this is not necessary as long as we can guarantee that the
    // arguments are already sorted (e.g. handle the sorting in handler
    // if necessary due to set_arg(...) usage).
    std::sort(Args.begin(), Args.end(), [](const ArgDesc &A, const ArgDesc &B) {
      return A.MIndex < B.MIndex;
    });
    int LastIndex = -1;
    size_t NextTrueIndex = 0;

    for (ArgDesc &Arg : Args) {
      // Handle potential gaps in set arguments (e.g. if some of them are
      // set on the user side).
      for (int Idx = LastIndex + 1; Idx < Arg.MIndex; ++Idx)
        if (!EliminatedArgMask[Idx])
          ++NextTrueIndex;
      LastIndex = Arg.MIndex;

      if (EliminatedArgMask[Arg.MIndex])
        continue;

      Func(Arg, NextTrueIndex);
      ++NextTrueIndex;
    }
  }
}
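
// Worked example (illustrative, not part of the runtime): assume a kernel was
// compiled with dead-argument elimination and
//   EliminatedArgMask = {false, true, false}  // argument 1 was eliminated
//   Args = { {MIndex=0}, {MIndex=1}, {MIndex=2} }
// Then Func is invoked as:
//   Func(Args[0], 0)   // kept, stays in slot 0
//   (Args[1] skipped)  // mask says eliminated
//   Func(Args[2], 1)   // kept, compacted from index 2 down to slot 1
// i.e. NextTrueIndex renumbers the surviving arguments densely so that they
// match the signature of the kernel that was actually compiled.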

#ifdef XPTI_ENABLE_INSTRUMENTATION
static size_t deviceToID(const device &Device) {
  if (getSyclObjImpl(Device)->is_host())
    return 0;
  else
    return reinterpret_cast<size_t>(getSyclObjImpl(Device)->getHandleRef());
}
#endif

static std::string accessModeToString(access::mode Mode) {
  switch (Mode) {
  case access::mode::read:
    return "read";
  case access::mode::write:
    return "write";
  case access::mode::read_write:
    return "read_write";
  case access::mode::discard_write:
    return "discard_write";
  case access::mode::discard_read_write:
    return "discard_read_write";
  default:
    return "unknown";
  }
}

#ifdef XPTI_ENABLE_INSTRUMENTATION
// Using the command group type to create node types for the asynchronous task
// graph modeling
static std::string commandToNodeType(Command::CommandType Type) {
  switch (Type) {
  case Command::CommandType::RUN_CG:
    return "command_group_node";
  case Command::CommandType::COPY_MEMORY:
    return "memory_transfer_node";
  case Command::CommandType::ALLOCA:
    return "memory_allocation_node";
  case Command::CommandType::ALLOCA_SUB_BUF:
    return "sub_buffer_creation_node";
  case Command::CommandType::RELEASE:
    return "memory_deallocation_node";
  case Command::CommandType::MAP_MEM_OBJ:
    return "memory_transfer_node";
  case Command::CommandType::UNMAP_MEM_OBJ:
    return "memory_transfer_node";
  case Command::CommandType::UPDATE_REQUIREMENT:
    return "host_acc_create_buffer_lock_node";
  case Command::CommandType::EMPTY_TASK:
    return "host_acc_destroy_buffer_release_node";
  default:
    return "unknown_node";
  }
}

// The generated names and strings are subject to change to something more
// meaningful to end users, as this will be visible in analysis tools that
// subscribe to this data
static std::string commandToName(Command::CommandType Type) {
  switch (Type) {
  case Command::CommandType::RUN_CG:
    return "Command Group Action";
  case Command::CommandType::COPY_MEMORY:
    return "Memory Transfer (Copy)";
  case Command::CommandType::ALLOCA:
    return "Memory Allocation";
  case Command::CommandType::ALLOCA_SUB_BUF:
    return "Sub Buffer Creation";
  case Command::CommandType::RELEASE:
    return "Memory Deallocation";
  case Command::CommandType::MAP_MEM_OBJ:
    return "Memory Transfer (Map)";
  case Command::CommandType::UNMAP_MEM_OBJ:
    return "Memory Transfer (Unmap)";
  case Command::CommandType::UPDATE_REQUIREMENT:
    return "Host Accessor Creation/Buffer Lock";
  case Command::CommandType::EMPTY_TASK:
    return "Host Accessor Destruction/Buffer Lock Release";
  default:
    return "Unknown Action";
  }
}
#endif

std::vector<RT::PiEvent>
Command::getPiEvents(const std::vector<EventImplPtr> &EventImpls) const {
  std::vector<RT::PiEvent> RetPiEvents;
  for (auto &EventImpl : EventImpls) {
    if (EventImpl->getHandleRef() == nullptr)
      continue;

    // Do not add redundant event dependencies for in-order queues.
    // At this stage the dependency is definitely a PI task, and we need to
    // check whether the current command is a host task. If it is, we must not
    // skip the PI event, because different task types on an in-order queue
    // use different synchronization mechanisms.
    const QueueImplPtr &WorkerQueue = getWorkerQueue();
    // MWorkerQueue in a command is never null, so the
    // EventImpl->getWorkerQueue() != nullptr check is implicit.
    if (EventImpl->getWorkerQueue() == WorkerQueue &&
        WorkerQueue->isInOrder() && !isHostTask())
      continue;

    RetPiEvents.push_back(EventImpl->getHandleRef());
  }

  return RetPiEvents;
}
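
// Illustrative sketch (not part of the runtime): on an in-order queue the
// queue itself already serializes commands, so an explicit event dependency
// between two device tasks on the same queue is redundant. For
//
//   sycl::queue Q{sycl::property::queue::in_order{}};
//   auto E1 = Q.submit([&](sycl::handler &CGH) { /* kernel A */ });
//   Q.submit([&](sycl::handler &CGH) { CGH.depends_on(E1); /* kernel B */ });
//
// getPiEvents() drops E1 from kernel B's PI waitlist; the in-order queue
// guarantees A completes before B anyway. Host tasks are the exception, since
// they are synchronized by a different mechanism than the PI queue order.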

bool Command::isHostTask() const {
  return (MType == CommandType::RUN_CG) /* host task has this type also */ &&
         ((static_cast<const ExecCGCommand *>(this))->getCG().getType() ==
          CG::CGTYPE::CodeplayHostTask);
}

static void flushCrossQueueDeps(const std::vector<EventImplPtr> &EventImpls,
                                const QueueImplPtr &Queue) {
  for (auto &EventImpl : EventImpls) {
    EventImpl->flushIfNeeded(Queue);
  }
}

class DispatchHostTask {
  ExecCGCommand *MThisCmd;
  std::vector<interop_handle::ReqToMem> MReqToMem;

  pi_result waitForEvents() const {
    std::map<const detail::plugin *, std::vector<EventImplPtr>>
        RequiredEventsPerPlugin;

    for (const EventImplPtr &Event : MThisCmd->MPreparedDepsEvents) {
      const detail::plugin &Plugin = Event->getPlugin();
      RequiredEventsPerPlugin[&Plugin].push_back(Event);
    }

    // Wait for dependency device events.
    // FIXME The current implementation of waiting for events makes the thread
    // 'sleep' until all dependency events are complete. We need a more
    // sophisticated waiting mechanism that allows this thread to be used for
    // other available work and resumes once all required events are ready.
    for (auto &PluginWithEvents : RequiredEventsPerPlugin) {
      std::vector<RT::PiEvent> RawEvents =
          MThisCmd->getPiEvents(PluginWithEvents.second);
      try {
        PluginWithEvents.first->call<PiApiKind::piEventsWait>(RawEvents.size(),
                                                              RawEvents.data());
      } catch (const sycl::exception &E) {
        CGHostTask &HostTask = static_cast<CGHostTask &>(MThisCmd->getCG());
        HostTask.MQueue->reportAsyncException(std::current_exception());
        return (pi_result)E.get_cl_code();
      } catch (...) {
        CGHostTask &HostTask = static_cast<CGHostTask &>(MThisCmd->getCG());
        HostTask.MQueue->reportAsyncException(std::current_exception());
        return PI_ERROR_UNKNOWN;
      }
    }

    // Wait for dependency host events.
    // Host events can't throw exceptions, so don't try to catch them.
    for (const EventImplPtr &Event : MThisCmd->MPreparedHostDepsEvents) {
      Event->waitInternal();
    }

    return PI_SUCCESS;
  }

public:
  DispatchHostTask(ExecCGCommand *ThisCmd,
                   std::vector<interop_handle::ReqToMem> ReqToMem)
      : MThisCmd{ThisCmd}, MReqToMem(std::move(ReqToMem)) {}

  void operator()() const {
    assert(MThisCmd->getCG().getType() == CG::CGTYPE::CodeplayHostTask);

    CGHostTask &HostTask = static_cast<CGHostTask &>(MThisCmd->getCG());

    pi_result WaitResult = waitForEvents();
    if (WaitResult != PI_SUCCESS) {
      std::exception_ptr EPtr = std::make_exception_ptr(sycl::runtime_error(
          std::string("Couldn't wait for host-task's dependencies"),
          WaitResult));
      HostTask.MQueue->reportAsyncException(EPtr);

      // Reset the host-task's lambda and quit.
      HostTask.MHostTask.reset();
      return;
    }

    try {
      // We're ready to call the user-defined lambda now.
      if (HostTask.MHostTask->isInteropTask()) {
        interop_handle IH{MReqToMem, HostTask.MQueue,
                          HostTask.MQueue->getDeviceImplPtr(),
                          HostTask.MQueue->getContextImplPtr()};

        HostTask.MHostTask->call(IH);
      } else
        HostTask.MHostTask->call();
    } catch (...) {
      HostTask.MQueue->reportAsyncException(std::current_exception());
    }

    HostTask.MHostTask.reset();

    // Unblock the user's empty command here.
    EmptyCommand *EmptyCmd = MThisCmd->MEmptyCmd;
    assert(EmptyCmd && "No empty command found");

    Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd, EmptyCmd);
  }
};
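
// Illustrative sketch (not part of the runtime): DispatchHostTask is the
// functor the scheduler hands to a worker thread when the user submits a host
// task. From the application side this corresponds to code such as:
//
//   sycl::queue Q;
//   sycl::buffer<int> Buf{sycl::range{1}};
//   Q.submit([&](sycl::handler &CGH) {
//     auto Acc = Buf.get_access(CGH);
//     CGH.host_task([=](sycl::interop_handle IH) {
//       // Runs on a runtime-managed thread once all dependencies are met;
//       // IH grants access to native backend objects backing Acc.
//     });
//   });
//
// waitForEvents() above blocks that worker thread on the device-event
// dependencies first, then the lambda runs, and finally the scheduler is
// notified so that commands blocked on this host task can be enqueued.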

void Command::waitForPreparedHostEvents() const {
  for (const EventImplPtr &HostEvent : MPreparedHostDepsEvents)
    HostEvent->waitInternal();
}

void Command::waitForEvents(QueueImplPtr Queue,
                            std::vector<EventImplPtr> &EventImpls,
                            RT::PiEvent &Event) {

  if (!EventImpls.empty()) {
    if (Queue->is_host()) {
      // A host queue can wait for events from different contexts, i.e. its
      // MPreparedDepsEvents may contain events with different contexts.
      // The OpenCL 2.1 spec says that clWaitForEvents will return
      // CL_INVALID_CONTEXT if the events in the list do not belong to the
      // same context, so we split all the events into a per-context map.
      // An example: we have two queues for the same CPU device, Q1 and Q2,
      // and thus two different contexts for the same CPU device, C1 and C2.
      // We also have the default host queue, accessible via the Scheduler.
      // Now assume we have three different events, E1(C1), E2(C1), and
      // E3(C2), plus an EmptyCommand to be executed on the host queue. The
      // command's MPreparedDepsEvents will contain all three events (E1, E2,
      // E3). If piEventsWait were called for all three events at once, it
      // would fail with CL_INVALID_CONTEXT because the events refer to
      // different contexts.
      std::map<context_impl *, std::vector<EventImplPtr>>
          RequiredEventsPerContext;

      for (const EventImplPtr &Event : EventImpls) {
        ContextImplPtr Context = Event->getContextImpl();
        assert(Context.get() &&
               "Only non-host events are expected to be waited for here");
        RequiredEventsPerContext[Context.get()].push_back(Event);
      }

      for (auto &CtxWithEvents : RequiredEventsPerContext) {
        std::vector<RT::PiEvent> RawEvents = getPiEvents(CtxWithEvents.second);
        CtxWithEvents.first->getPlugin().call<PiApiKind::piEventsWait>(
            RawEvents.size(), RawEvents.data());
      }
    } else {
#ifndef NDEBUG
      for (const EventImplPtr &Event : EventImpls)
        assert(Event->getContextImpl().get() &&
               "Only non-host events are expected to be waited for here");
#endif

      std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);
      flushCrossQueueDeps(EventImpls, getWorkerQueue());
      const detail::plugin &Plugin = Queue->getPlugin();
      Plugin.call<PiApiKind::piEnqueueEventsWait>(
          Queue->getHandleRef(), RawEvents.size(), &RawEvents[0], &Event);
    }
  }
}
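
// Illustrative sketch (not part of the runtime): the per-context split above
// turns a mixed dependency list into one piEventsWait call per context, e.g.
//
//   E1(C1), E2(C1), E3(C2)
//     -> RequiredEventsPerContext = { C1: [E1, E2], C2: [E3] }
//     -> piEventsWait(2, [E1, E2])   // issued through C1's plugin
//     -> piEventsWait(1, [E3])       // issued through C2's plugin
//
// which satisfies the OpenCL requirement that all events passed to a single
// clWaitForEvents call share one context.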

Command::Command(CommandType Type, QueueImplPtr Queue)
    : MQueue(std::move(Queue)),
      MEvent(std::make_shared<detail::event_impl>(MQueue)),
      MPreparedDepsEvents(MEvent->getPreparedDepsEvents()),
      MPreparedHostDepsEvents(MEvent->getPreparedHostDepsEvents()),
      MType(Type) {
  MWorkerQueue = MQueue;
  MEvent->setWorkerQueue(MWorkerQueue);
  MEvent->setSubmittedQueue(MWorkerQueue);
  MEvent->setCommand(this);
  MEvent->setContextImpl(MQueue->getContextImplPtr());
  MEvent->setStateIncomplete();
  MEnqueueStatus = EnqueueResultT::SyclEnqueueReady;

#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiTraceEnabled())
    return;
  // Obtain the stream ID so all commands can emit traces to that stream
  MStreamID = xptiRegisterStream(SYCL_STREAM_NAME);
#endif
}

void Command::emitInstrumentationDataProxy() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  emitInstrumentationData();
#endif
}

/// Method takes in void * for the address as adding a template function to
/// the command group object may be difficult.
void Command::emitEdgeEventForCommandDependence(
    Command *Cmd, void *ObjAddr, bool IsCommand,
    std::optional<access::mode> AccMode) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  // Bail early if either the source or the target node for the given
  // dependency is undefined or NULL
  if (!(xptiTraceEnabled() && MTraceEvent && Cmd && Cmd->MTraceEvent))
    return;

  // If all the information we need for creating an edge event is available,
  // then go ahead with creating it; if not, bail early!
  xpti::utils::StringHelper SH;
  std::string AddressStr = SH.addressAsString<void *>(ObjAddr);
  std::string Prefix = AccMode ? accessModeToString(AccMode.value()) : "Event";
  std::string TypeString = SH.nameWithAddressString(Prefix, AddressStr);
  // Create an edge with the dependent buffer address for which a command
  // object has been created as one of the properties of the edge
  xpti::payload_t Payload(TypeString.c_str(), MAddress);
  uint64_t EdgeInstanceNo;
  xpti_td *EdgeEvent =
      xptiMakeEvent(TypeString.c_str(), &Payload, xpti::trace_graph_event,
                    xpti_at::active, &EdgeInstanceNo);
  if (EdgeEvent) {
    xpti_td *SrcEvent = static_cast<xpti_td *>(Cmd->MTraceEvent);
    xpti_td *TgtEvent = static_cast<xpti_td *>(MTraceEvent);
    EdgeEvent->source_id = SrcEvent->unique_id;
    EdgeEvent->target_id = TgtEvent->unique_id;
    if (IsCommand) {
      xpti::addMetadata(EdgeEvent, "access_mode",
                        static_cast<int>(AccMode.value()));
      xpti::addMetadata(EdgeEvent, "memory_object",
                        reinterpret_cast<size_t>(ObjAddr));
    } else {
      xpti::addMetadata(EdgeEvent, "event", reinterpret_cast<size_t>(ObjAddr));
    }
    xptiNotifySubscribers(MStreamID, xpti::trace_edge_create,
                          detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
                          nullptr);
  }
  // General comment - none of these are serious errors, as the
  // instrumentation layer MUST be tolerant of errors. If we need to let the
  // end user know, we may throw exceptions in the future.
#endif
}

/// Method takes in void * for the address as adding a template function to
/// the command group object may be difficult.
void Command::emitEdgeEventForEventDependence(Command *Cmd,
                                              RT::PiEvent &PiEventAddr) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  // If we have failed to create an event to represent the Command, then we
  // cannot emit an edge event. Bail early!
  if (!(xptiTraceEnabled() && MTraceEvent))
    return;

  if (Cmd && Cmd->MTraceEvent) {
    // If the event is associated with a command, we use this command's trace
    // event as the source of the edge, hence modeling the control flow
    emitEdgeEventForCommandDependence(Cmd, (void *)PiEventAddr, false);
    return;
  }
  if (PiEventAddr) {
    xpti::utils::StringHelper SH;
    std::string AddressStr = SH.addressAsString<RT::PiEvent>(PiEventAddr);
    // This is the case when an OCL event was enqueued by the user or another
    // event is registered by the runtime as a dependency. The dependency on
    // this occasion is an OCL event, so we build a virtual node in the graph
    // with the event as the metadata for the node
    std::string NodeName = SH.nameWithAddressString("virtual_node", AddressStr);
    // Node name is "virtual_node[<event_addr>]"
    xpti::payload_t VNPayload(NodeName.c_str(), MAddress);
    uint64_t VNodeInstanceNo;
    xpti_td *NodeEvent =
        xptiMakeEvent(NodeName.c_str(), &VNPayload, xpti::trace_graph_event,
                      xpti_at::active, &VNodeInstanceNo);
    // Emit the virtual node first
    xpti::addMetadata(NodeEvent, "kernel_name", NodeName);
    xptiNotifySubscribers(MStreamID, xpti::trace_node_create,
                          detail::GSYCLGraphEvent, NodeEvent, VNodeInstanceNo,
                          nullptr);
    // Create a new event for the edge
    std::string EdgeName = SH.nameWithAddressString("Event", AddressStr);
    xpti::payload_t EdgePayload(EdgeName.c_str(), MAddress);
    uint64_t EdgeInstanceNo;
    xpti_td *EdgeEvent =
        xptiMakeEvent(EdgeName.c_str(), &EdgePayload, xpti::trace_graph_event,
                      xpti_at::active, &EdgeInstanceNo);
    if (EdgeEvent && NodeEvent) {
      // The source node represents the event, and this event needs to be
      // completed before the target node can execute
      xpti_td *TgtEvent = static_cast<xpti_td *>(MTraceEvent);
      EdgeEvent->source_id = NodeEvent->unique_id;
      EdgeEvent->target_id = TgtEvent->unique_id;
      xpti::addMetadata(EdgeEvent, "event",
                        reinterpret_cast<size_t>(PiEventAddr));
      xptiNotifySubscribers(MStreamID, xpti::trace_edge_create,
                            detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
                            nullptr);
    }
    return;
  }
#endif
}

uint64_t Command::makeTraceEventProlog(void *MAddress) {
  uint64_t CommandInstanceNo = 0;
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiTraceEnabled())
    return CommandInstanceNo;

  MTraceEventPrologComplete = true;
  // Set up the member variables with information needed for event
  // notification
  MCommandNodeType = commandToNodeType(MType);
  MCommandName = commandToName(MType);
  xpti::utils::StringHelper SH;
  MAddressString = SH.addressAsString<void *>(MAddress);
  std::string CommandString =
      SH.nameWithAddressString(MCommandName, MAddressString);

  xpti::payload_t p(CommandString.c_str(), MAddress);
  xpti_td *CmdTraceEvent =
      xptiMakeEvent(CommandString.c_str(), &p, xpti::trace_graph_event,
                    xpti_at::active, &CommandInstanceNo);
  MInstanceID = CommandInstanceNo;
  if (CmdTraceEvent) {
    MTraceEvent = (void *)CmdTraceEvent;
    // If we are seeing this event again, then the instance ID will be greater
    // than 1; in this case, we must skip sending a notification to create a
    // node as this node has already been created. We return this value so the
    // epilog method can be called selectively.
    MFirstInstance = (CommandInstanceNo == 1);
  }
#endif
  return CommandInstanceNo;
}

void Command::makeTraceEventEpilog() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!(xptiTraceEnabled() && MTraceEvent))
    return;
  assert(MTraceEventPrologComplete);
  xptiNotifySubscribers(MStreamID, xpti::trace_node_create,
                        detail::GSYCLGraphEvent,
                        static_cast<xpti_td *>(MTraceEvent), MInstanceID,
                        static_cast<const void *>(MCommandNodeType.c_str()));
#endif
}
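
// Illustrative note (not part of the runtime): subclasses emit their
// instrumentation in a prolog/metadata/epilog pattern. A typical
// emitInstrumentationData() override below follows this shape:
//
//   MAddress = /* the memory object this command touches */;
//   makeTraceEventProlog(MAddress);  // creates/reuses the xpti trace event
//   if (MFirstInstance) {            // only decorate the first instance
//     xpti::addMetadata(TE, "sycl_device", ...);  // TE: the trace event
//   }
//   makeTraceEventEpilog();          // sends the node-creation notification
//
// as the Alloca*/Map/UnMap/MemCpy command classes in this file do.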

Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep,
                                  std::vector<Command *> &ToCleanUp) {
  const QueueImplPtr &WorkerQueue = getWorkerQueue();
  const ContextImplPtr &WorkerContext = WorkerQueue->getContextImplPtr();

  // 1. Async work is not supported for host device.
  // 2. Non-host events can be ignored if they are not fully initialized.
  // 3. Some types of commands do not produce PI events after they are
  //    enqueued (e.g. alloca). Note that we can't check the PI event to make
  //    that distinction since the command might still be unenqueued at this
  //    point.
  bool PiEventExpected = (!DepEvent->is_host() && DepEvent->isInitialized()) ||
                         getType() == CommandType::HOST_TASK;
  if (auto *DepCmd = static_cast<Command *>(DepEvent->getCommand()))
    PiEventExpected &= DepCmd->producesPiEvent();

  if (!PiEventExpected) {
    // The call to waitInternal() is in waitForPreparedHostEvents() as it's
    // called from the enqueue process functions
    MPreparedHostDepsEvents.push_back(DepEvent);
    return nullptr;
  }

  Command *ConnectionCmd = nullptr;

  ContextImplPtr DepEventContext = DepEvent->getContextImpl();
  // If contexts don't match, we'll connect them using a host task
  if (DepEventContext != WorkerContext && !WorkerContext->is_host()) {
    Scheduler::GraphBuilder &GB = Scheduler::getInstance().MGraphBuilder;
    ConnectionCmd = GB.connectDepEvent(this, DepEvent, Dep, ToCleanUp);
  } else
    MPreparedDepsEvents.push_back(std::move(DepEvent));

  return ConnectionCmd;
}

const ContextImplPtr &Command::getWorkerContext() const {
  return MQueue->getContextImplPtr();
}

const QueueImplPtr &Command::getWorkerQueue() const {
  assert(MWorkerQueue && "MWorkerQueue must not be nullptr");
  return MWorkerQueue;
}

bool Command::producesPiEvent() const { return true; }

bool Command::supportsPostEnqueueCleanup() const { return true; }

Command *Command::addDep(DepDesc NewDep, std::vector<Command *> &ToCleanUp) {
  Command *ConnectionCmd = nullptr;

  if (NewDep.MDepCommand) {
    ConnectionCmd =
        processDepEvent(NewDep.MDepCommand->getEvent(), NewDep, ToCleanUp);
  }
  // ConnectionCmd insertion builds the following dependency structure:
  //   this -> emptyCmd (for ConnectionCmd) -> ConnectionCmd -> NewDep
  // which means that this and NewDep are already dependent.
  if (!ConnectionCmd) {
    MDeps.push_back(NewDep);
    if (NewDep.MDepCommand)
      NewDep.MDepCommand->addUser(this);
  }

#ifdef XPTI_ENABLE_INSTRUMENTATION
  emitEdgeEventForCommandDependence(NewDep.MDepCommand,
                                    (void *)NewDep.MDepRequirement->MSYCLMemObj,
                                    true, NewDep.MDepRequirement->MAccessMode);
#endif

  return ConnectionCmd;
}

Command *Command::addDep(EventImplPtr Event,
                         std::vector<Command *> &ToCleanUp) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  // We need this only for the instrumentation, so guarding it prevents
  // unused-variable warnings when instrumentation is turned off
  Command *Cmd = (Command *)Event->getCommand();
  RT::PiEvent &PiEventAddr = Event->getHandleRef();
  // Now make an edge for the dependent event
  emitEdgeEventForEventDependence(Cmd, PiEventAddr);
#endif

  return processDepEvent(std::move(Event), DepDesc{nullptr, nullptr, nullptr},
                         ToCleanUp);
}

void Command::emitEnqueuedEventSignal(RT::PiEvent &PiEventAddr) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!(xptiTraceEnabled() && MTraceEvent && PiEventAddr))
    return;
  // Asynchronous call, so send a signal with the event information as
  // user_data
  xptiNotifySubscribers(MStreamID, xpti::trace_signal, detail::GSYCLGraphEvent,
                        static_cast<xpti_td *>(MTraceEvent), MInstanceID,
                        (void *)PiEventAddr);
#endif
}

void Command::emitInstrumentation(uint16_t Type, const char *Txt) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!(xptiTraceEnabled() && MTraceEvent))
    return;
  // Trace event notifier that emits a Type event
  xptiNotifySubscribers(MStreamID, Type, detail::GSYCLGraphEvent,
                        static_cast<xpti_td *>(MTraceEvent), MInstanceID,
                        static_cast<const void *>(Txt));
#endif
}

bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking,
                      std::vector<Command *> &ToCleanUp) {
  // Exit if already enqueued
  if (MEnqueueStatus == EnqueueResultT::SyclEnqueueSuccess)
    return true;

  // If the command is blocked from enqueueing
  if (MIsBlockable && MEnqueueStatus == EnqueueResultT::SyclEnqueueBlocked) {
    // Exit if enqueue type is not blocking
    if (!Blocking) {
      EnqueueResult = EnqueueResultT(EnqueueResultT::SyclEnqueueBlocked, this);
      return false;
    }

#ifdef XPTI_ENABLE_INSTRUMENTATION
    // Scoped trace event notifier that emits a barrier begin and barrier end
    // event, which models the barrier while enqueuing along with the blocked
    // reason, as determined by the scheduler
    std::string Info = "enqueue.barrier[";
    Info += std::string(getBlockReason()) + "]";
    emitInstrumentation(xpti::trace_barrier_begin, Info.c_str());
#endif

    // Wait if blocking
    while (MEnqueueStatus == EnqueueResultT::SyclEnqueueBlocked)
      ;
#ifdef XPTI_ENABLE_INSTRUMENTATION
    emitInstrumentation(xpti::trace_barrier_end, Info.c_str());
#endif
  }

  std::lock_guard<std::mutex> Lock(MEnqueueMtx);

  // Exit if the command is already enqueued
  if (MEnqueueStatus == EnqueueResultT::SyclEnqueueSuccess)
    return true;

#ifdef XPTI_ENABLE_INSTRUMENTATION
  emitInstrumentation(xpti::trace_task_begin, nullptr);
#endif

  if (MEnqueueStatus == EnqueueResultT::SyclEnqueueFailed) {
    EnqueueResult = EnqueueResultT(EnqueueResultT::SyclEnqueueFailed, this);
    return false;
  }

  // Command status is set to "failed" beforehand, so this command has already
  // been marked as "failed" if enqueueImp throws an exception.
  // This avoids executing the same failed command twice.
  MEnqueueStatus = EnqueueResultT::SyclEnqueueFailed;
  pi_int32 Res = enqueueImp();

  if (PI_SUCCESS != Res)
    EnqueueResult =
        EnqueueResultT(EnqueueResultT::SyclEnqueueFailed, this, Res);
  else {
    if (MShouldCompleteEventIfPossible &&
        (MEvent->is_host() || MEvent->getHandleRef() == nullptr))
      MEvent->setComplete();

    // Consider the command successfully enqueued if the return code is
    // PI_SUCCESS
    MEnqueueStatus = EnqueueResultT::SyclEnqueueSuccess;
    if (MLeafCounter == 0 && supportsPostEnqueueCleanup()) {
      assert(!MPostEnqueueCleanup);
      MPostEnqueueCleanup = true;
      ToCleanUp.push_back(this);
    }
  }

  // Emit this correlation signal before the task end
  emitEnqueuedEventSignal(MEvent->getHandleRef());
#ifdef XPTI_ENABLE_INSTRUMENTATION
  emitInstrumentation(xpti::trace_task_end, nullptr);
#endif
  return MEnqueueStatus == EnqueueResultT::SyclEnqueueSuccess;
}
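
// Illustrative sketch (not part of the runtime): the scheduler drives this
// state machine roughly as follows when executing a graph:
//
//   EnqueueResultT Res;
//   std::vector<Command *> ToCleanUp;
//   for (Command *Cmd : ReadyCommands) {  // ReadyCommands is hypothetical
//     if (!Cmd->enqueue(Res, NON_BLOCKING, ToCleanUp)) {
//       // SyclEnqueueBlocked: retry later (e.g. a host accessor holds the
//       // buffer); SyclEnqueueFailed: propagate the error to the user.
//     }
//   }
//   // Commands whose PI event is no longer needed were handed back in
//   // ToCleanUp for post-enqueue cleanup.
//
// Only enqueue(), EnqueueResultT, and the status values above come from this
// file; the surrounding loop is a sketch.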

void Command::resolveReleaseDependencies(std::set<Command *> &DepList) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  assert(MType == CommandType::RELEASE && "Expected release command");
  if (!MTraceEvent)
    return;
  // The current command is the target node for all dependencies, as the
  // source nodes have to be completed first before the current node can begin
  // to execute; these edges model control flow
  xpti_td *TgtTraceEvent = static_cast<xpti_td *>(MTraceEvent);
  // We have all the Commands that must be completed before the release
  // command can be enqueued; here we'll find the command that is an Alloca
  // with the same SYCLMemObject address and create a dependency line (edge)
  // between them in our semantic modeling
  for (auto &Item : DepList) {
    if (Item->MTraceEvent && Item->MAddress == MAddress) {
      xpti::utils::StringHelper SH;
      std::string AddressStr = SH.addressAsString<void *>(MAddress);
      std::string TypeString =
          "Edge:" + SH.nameWithAddressString(commandToName(MType), AddressStr);

      // Create an edge with the dependent buffer address being one of the
      // properties of the edge
      xpti::payload_t p(TypeString.c_str(), MAddress);
      uint64_t EdgeInstanceNo;
      xpti_td *EdgeEvent =
          xptiMakeEvent(TypeString.c_str(), &p, xpti::trace_graph_event,
                        xpti_at::active, &EdgeInstanceNo);
      if (EdgeEvent) {
        xpti_td *SrcTraceEvent = static_cast<xpti_td *>(Item->MTraceEvent);
        EdgeEvent->target_id = TgtTraceEvent->unique_id;
        EdgeEvent->source_id = SrcTraceEvent->unique_id;
        xpti::addMetadata(EdgeEvent, "memory_object",
                          reinterpret_cast<size_t>(MAddress));
        xptiNotifySubscribers(MStreamID, xpti::trace_edge_create,
                              detail::GSYCLGraphEvent, EdgeEvent,
                              EdgeInstanceNo, nullptr);
      }
    }
  }
#endif
}

const char *Command::getBlockReason() const {
  switch (MBlockReason) {
  case BlockReason::HostAccessor:
    return "A Buffer is locked by the host accessor";
  case BlockReason::HostTask:
    return "Blocked by host task";
  }

  return "Unknown block reason";
}

AllocaCommandBase::AllocaCommandBase(CommandType Type, QueueImplPtr Queue,
                                     Requirement Req,
                                     AllocaCommandBase *LinkedAllocaCmd,
                                     bool IsConst)
    : Command(Type, Queue), MLinkedAllocaCmd(LinkedAllocaCmd),
      MIsLeaderAlloca(nullptr == LinkedAllocaCmd), MIsConst(IsConst),
      MRequirement(std::move(Req)), MReleaseCmd(Queue, this) {
  MRequirement.MAccessMode = access::mode::read_write;
  emitInstrumentationDataProxy();
}

void AllocaCommandBase::emitInstrumentationData() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiTraceEnabled())
    return;
  // Create a payload with the command name and an event using this payload to
  // emit a node_create
  MAddress = MRequirement.MSYCLMemObj;
  makeTraceEventProlog(MAddress);
  // Set the relevant meta data properties for this command
  if (MTraceEvent && MFirstInstance) {
    xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
    xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_type",
                      deviceToString(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_name",
                      getSyclObjImpl(MQueue->get_device())->getDeviceName());
    xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
  }
#endif
}

bool AllocaCommandBase::producesPiEvent() const { return false; }

bool AllocaCommandBase::supportsPostEnqueueCleanup() const { return false; }

AllocaCommand::AllocaCommand(QueueImplPtr Queue, Requirement Req,
                             bool InitFromUserData,
                             AllocaCommandBase *LinkedAllocaCmd, bool IsConst)
    : AllocaCommandBase(CommandType::ALLOCA, std::move(Queue), std::move(Req),
                        LinkedAllocaCmd, IsConst),
      MInitFromUserData(InitFromUserData) {
  // Node event must be created before the dependent edge is added to this
  // node, so this call must be before the addDep() call.
  emitInstrumentationDataProxy();
  // "Nothing to depend on"
  std::vector<Command *> ToCleanUp;
  Command *ConnectionCmd =
      addDep(DepDesc(nullptr, getRequirement(), this), ToCleanUp);
  assert(ConnectionCmd == nullptr);
  assert(ToCleanUp.empty());
  (void)ConnectionCmd;
}

void AllocaCommand::emitInstrumentationData() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiTraceEnabled())
    return;

  // Only if it is the first event do we emit a node_create event
  if (MFirstInstance) {
    makeTraceEventEpilog();
  }
#endif
}

pi_int32 AllocaCommand::enqueueImp() {
  waitForPreparedHostEvents();
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;

  RT::PiEvent &Event = MEvent->getHandleRef();

  void *HostPtr = nullptr;
  if (!MIsLeaderAlloca) {

    if (MQueue->is_host()) {
      // We do not need to make an allocation if we have a linked device
      // allocation
      Command::waitForEvents(MQueue, EventImpls, Event);

      return PI_SUCCESS;
    }
    HostPtr = MLinkedAllocaCmd->getMemAllocation();
  }
  // TODO: Check if it is correct to use std::move on a stack variable and
  // delete RawEvents below.
  MMemAllocation = MemoryManager::allocate(
      MQueue->getContextImplPtr(), getSYCLMemObj(), MInitFromUserData, HostPtr,
      std::move(EventImpls), Event);

  return PI_SUCCESS;
}

void AllocaCommand::printDot(std::ostream &Stream) const {
  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\"";

  Stream << "ID = " << this << "\\n";
  Stream << "ALLOCA ON " << deviceToString(MQueue->get_device()) << "\\n";
  Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n";
  Stream << " Link : " << this->MLinkedAllocaCmd << "\\n";
  Stream << "\"];" << std::endl;

  for (const auto &Dep : MDeps) {
    if (Dep.MDepCommand == nullptr)
      continue;
    Stream << "  \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
           << " [ label = \"Access mode: "
           << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
           << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
           << std::endl;
  }
}
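
// Illustrative sketch (not part of the runtime): in a graph dumped via these
// printDot() methods, each command becomes a colored node and each dependency
// an edge, producing DOT text along the lines of (addresses hypothetical):
//
//   "0x1f00" [style=filled, fillcolor="#FFD28A", label="ID = 0x1f00\n
//       ALLOCA ON CPU\n MemObj : 0x2a00\n Link : 0\n"];
//   "0x1f00" -> "0x1e00" [ label = "Access mode: read_write\nMemObj: 0x2a00" ];
//
// which can be rendered with Graphviz, e.g. `dot -Tpng graph.dot -o graph.png`.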

AllocaSubBufCommand::AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req,
                                         AllocaCommandBase *ParentAlloca,
                                         std::vector<Command *> &ToEnqueue,
                                         std::vector<Command *> &ToCleanUp)
    : AllocaCommandBase(CommandType::ALLOCA_SUB_BUF, std::move(Queue),
                        std::move(Req),
                        /*LinkedAllocaCmd*/ nullptr, /*IsConst*/ false),
      MParentAlloca(ParentAlloca) {
  // Node event must be created before the dependent edge
  // is added to this node, so this call must be before
  // the addDep() call.
  emitInstrumentationDataProxy();
  Command *ConnectionCmd = addDep(
      DepDesc(MParentAlloca, getRequirement(), MParentAlloca), ToCleanUp);
  if (ConnectionCmd)
    ToEnqueue.push_back(ConnectionCmd);
}

void AllocaSubBufCommand::emitInstrumentationData() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiTraceEnabled())
    return;

  // Only if it is the first event do we emit a node_create event and any
  // metadata that is available for the command
  if (MFirstInstance) {
    xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
    xpti::addMetadata(TE, "offset", this->MRequirement.MOffsetInBytes);
    xpti::addMetadata(TE, "access_range_start",
                      this->MRequirement.MAccessRange[0]);
    xpti::addMetadata(TE, "access_range_end",
                      this->MRequirement.MAccessRange[1]);
    makeTraceEventEpilog();
  }
#endif
}

void *AllocaSubBufCommand::getMemAllocation() const {
  // In some cases the parent's memory allocation might change (e.g., after
  // map/unmap operations). If the parent's memory allocation changes, the
  // sub-buffer's memory allocation should be changed as well.
  if (MQueue->is_host()) {
    return static_cast<void *>(
        static_cast<char *>(MParentAlloca->getMemAllocation()) +
        MRequirement.MOffsetInBytes);
  }
  return MMemAllocation;
}

pi_int32 AllocaSubBufCommand::enqueueImp() {
  waitForPreparedHostEvents();
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
  RT::PiEvent &Event = MEvent->getHandleRef();

  MMemAllocation = MemoryManager::allocateMemSubBuffer(
      MQueue->getContextImplPtr(), MParentAlloca->getMemAllocation(),
      MRequirement.MElemSize, MRequirement.MOffsetInBytes,
      MRequirement.MAccessRange, std::move(EventImpls), Event);

  return PI_SUCCESS;
}

void AllocaSubBufCommand::printDot(std::ostream &Stream) const {
  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\"";

  Stream << "ID = " << this << "\\n";
  Stream << "ALLOCA SUB BUF ON " << deviceToString(MQueue->get_device())
         << "\\n";
  Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n";
  Stream << " Offset : " << this->MRequirement.MOffsetInBytes << "\\n";
  Stream << " Access range : " << this->MRequirement.MAccessRange[0] << "\\n";
  Stream << "\"];" << std::endl;

  for (const auto &Dep : MDeps) {
    if (Dep.MDepCommand == nullptr)
      continue;
    Stream << "  \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
           << " [ label = \"Access mode: "
           << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
           << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
           << std::endl;
  }
}

ReleaseCommand::ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd)
    : Command(CommandType::RELEASE, std::move(Queue)), MAllocaCmd(AllocaCmd) {
  emitInstrumentationDataProxy();
}

void ReleaseCommand::emitInstrumentationData() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiTraceEnabled())
    return;
  // Create a payload with the command name and an event using this payload to
  // emit a node_create
  MAddress = MAllocaCmd->getSYCLMemObj();
  makeTraceEventProlog(MAddress);

  if (MFirstInstance) {
    xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
    xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_type",
                      deviceToString(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_name",
                      getSyclObjImpl(MQueue->get_device())->getDeviceName());
    xpti::addMetadata(TE, "allocation_type",
                      commandToName(MAllocaCmd->getType()));
    makeTraceEventEpilog();
  }
#endif
}

pi_int32 ReleaseCommand::enqueueImp() {
  waitForPreparedHostEvents();
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
  std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);
  bool SkipRelease = false;

  // On the host side we only allocate memory for full buffers.
  // Thus, deallocating sub-buffers would lead to double memory freeing.
  SkipRelease |= MQueue->is_host() && MAllocaCmd->getType() == ALLOCA_SUB_BUF;

  const bool CurAllocaIsHost = MAllocaCmd->getQueue()->is_host();
  bool NeedUnmap = false;
  if (MAllocaCmd->MLinkedAllocaCmd) {

    // When releasing one of the "linked" allocations, special rules apply:
    // 1. The device allocation should always be released.
    // 2. The host allocation should be released if the host allocation is the
    //    "leader".
    // 3. The device alloca in the pair should be in the active state in order
    //    to be correctly released.

    // There is no actual memory allocation if a host alloca command is
    // created being linked to a device allocation.
    SkipRelease |= CurAllocaIsHost && !MAllocaCmd->MIsLeaderAlloca;

    NeedUnmap |= CurAllocaIsHost == MAllocaCmd->MIsActive;
  }

  if (NeedUnmap) {
    const QueueImplPtr &Queue = CurAllocaIsHost
                                    ? MAllocaCmd->MLinkedAllocaCmd->getQueue()
                                    : MAllocaCmd->getQueue();

    EventImplPtr UnmapEventImpl(new event_impl(Queue));
    UnmapEventImpl->setContextImpl(Queue->getContextImplPtr());
    UnmapEventImpl->setStateIncomplete();
    RT::PiEvent &UnmapEvent = UnmapEventImpl->getHandleRef();

    void *Src = CurAllocaIsHost
                    ? MAllocaCmd->getMemAllocation()
                    : MAllocaCmd->MLinkedAllocaCmd->getMemAllocation();

    void *Dst = !CurAllocaIsHost
                    ? MAllocaCmd->getMemAllocation()
                    : MAllocaCmd->MLinkedAllocaCmd->getMemAllocation();

    MemoryManager::unmap(MAllocaCmd->getSYCLMemObj(), Dst, Queue, Src,
                         RawEvents, UnmapEvent);

    std::swap(MAllocaCmd->MIsActive, MAllocaCmd->MLinkedAllocaCmd->MIsActive);
    EventImpls.clear();
    EventImpls.push_back(UnmapEventImpl);
  }
  RT::PiEvent &Event = MEvent->getHandleRef();
  if (SkipRelease)
    Command::waitForEvents(MQueue, EventImpls, Event);
  else {
    MemoryManager::release(
        MQueue->getContextImplPtr(), MAllocaCmd->getSYCLMemObj(),
        MAllocaCmd->getMemAllocation(), std::move(EventImpls), Event);
  }
  return PI_SUCCESS;
}

void ReleaseCommand::printDot(std::ostream &Stream) const {
  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FF827A\", label=\"";

  Stream << "ID = " << this << " ; ";
  Stream << "RELEASE ON " << deviceToString(MQueue->get_device()) << "\\n";
  Stream << " Alloca : " << MAllocaCmd << "\\n";
  Stream << " MemObj : " << MAllocaCmd->getSYCLMemObj() << "\\n";
  Stream << "\"];" << std::endl;

  for (const auto &Dep : MDeps) {
    Stream << "  \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
           << " [ label = \"Access mode: "
           << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
           << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
           << std::endl;
  }
}

bool ReleaseCommand::producesPiEvent() const { return false; }

bool ReleaseCommand::supportsPostEnqueueCleanup() const { return false; }

MapMemObject::MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req,
                           void **DstPtr, QueueImplPtr Queue,
                           access::mode MapMode)
    : Command(CommandType::MAP_MEM_OBJ, std::move(Queue)),
      MSrcAllocaCmd(SrcAllocaCmd), MSrcReq(std::move(Req)), MDstPtr(DstPtr),
      MMapMode(MapMode) {
  emitInstrumentationDataProxy();
}

void MapMemObject::emitInstrumentationData() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiTraceEnabled())
    return;
  // Create a payload with the command name and an event using this payload to
  // emit a node_create
  MAddress = MSrcAllocaCmd->getSYCLMemObj();
  makeTraceEventProlog(MAddress);

  if (MFirstInstance) {
    xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
    xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_type",
                      deviceToString(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_name",
                      getSyclObjImpl(MQueue->get_device())->getDeviceName());
    xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
    makeTraceEventEpilog();
  }
#endif
}

pi_int32 MapMemObject::enqueueImp() {
  waitForPreparedHostEvents();
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
  std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);
  flushCrossQueueDeps(EventImpls, getWorkerQueue());

  RT::PiEvent &Event = MEvent->getHandleRef();
  *MDstPtr = MemoryManager::map(
      MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(), MQueue,
      MMapMode, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
      MSrcReq.MOffset, MSrcReq.MElemSize, std::move(RawEvents), Event);

  return PI_SUCCESS;
}

void MapMemObject::printDot(std::ostream &Stream) const {
  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#77AFFF\", label=\"";

  Stream << "ID = " << this << " ; ";
  Stream << "MAP ON " << deviceToString(MQueue->get_device()) << "\\n";

  Stream << "\"];" << std::endl;

  for (const auto &Dep : MDeps) {
    Stream << "  \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
           << " [ label = \"Access mode: "
           << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
           << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
           << std::endl;
  }
}

UnMapMemObject::UnMapMemObject(AllocaCommandBase *DstAllocaCmd,
                               Requirement Req, void **SrcPtr,
                               QueueImplPtr Queue)
    : Command(CommandType::UNMAP_MEM_OBJ, std::move(Queue)),
      MDstAllocaCmd(DstAllocaCmd), MDstReq(std::move(Req)), MSrcPtr(SrcPtr) {
  emitInstrumentationDataProxy();
}

void UnMapMemObject::emitInstrumentationData() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiTraceEnabled())
    return;
  // Create a payload with the command name and an event using this payload to
  // emit a node_create
  MAddress = MDstAllocaCmd->getSYCLMemObj();
  makeTraceEventProlog(MAddress);

  if (MFirstInstance) {
    xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
    xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_type",
                      deviceToString(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_name",
                      getSyclObjImpl(MQueue->get_device())->getDeviceName());
    xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
    makeTraceEventEpilog();
  }
#endif
}

bool UnMapMemObject::producesPiEvent() const {
  // TODO remove this workaround once the batching issue is addressed in the
  // Level Zero plugin.
  // Consider the following scenario on Level Zero:
  // 1. Kernel A, which uses buffer A, is submitted to queue A.
  // 2. Kernel B, which uses buffer B, is submitted to queue B.
  // 3. queueA.wait().
  // 4. queueB.wait().
  // The DPC++ runtime used to treat unmap/write commands for buffers A/B as
  // host dependencies (i.e. they were waited for prior to enqueueing any
  // command that's dependent on them). This allowed the Level Zero plugin to
  // detect that each queue is idle on steps 1/2 and submit the command list
  // right away. This is no longer the case since we started passing these
  // dependencies in an event waitlist, and the Level Zero plugin attempts to
  // batch these commands, so the execution of kernel B starts only on step 4.
  // This workaround restores the old behavior in this case until the issue is
  // resolved.
  return MQueue->getPlugin().getBackend() != backend::ext_oneapi_level_zero ||
         MEvent->getHandleRef() != nullptr;
}
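
// Illustrative sketch (not part of the runtime): the scenario the workaround
// above targets looks like this from the application side:
//
//   sycl::queue QA{Dev}, QB{Dev};            // two independent queues
//   QA.submit([&](sycl::handler &H) { /* kernel A, buffer A */ });
//   QB.submit([&](sycl::handler &H) { /* kernel B, buffer B */ });
//   QA.wait();                               // step 3
//   QB.wait();                               // step 4
//
// Returning false here (no PI event produced yet) makes dependent commands
// treat the unmap as a host dependency again, so Level Zero batching cannot
// delay kernel B's execution until step 4.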

pi_int32 UnMapMemObject::enqueueImp() {
  waitForPreparedHostEvents();
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
  std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);
  flushCrossQueueDeps(EventImpls, getWorkerQueue());

  RT::PiEvent &Event = MEvent->getHandleRef();
  MemoryManager::unmap(MDstAllocaCmd->getSYCLMemObj(),
                       MDstAllocaCmd->getMemAllocation(), MQueue, *MSrcPtr,
                       std::move(RawEvents), Event);

  return PI_SUCCESS;
}

void UnMapMemObject::printDot(std::ostream &Stream) const {
  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#EBC40F\", label=\"";

  Stream << "ID = " << this << " ; ";
  Stream << "UNMAP ON " << deviceToString(MQueue->get_device()) << "\\n";

  Stream << "\"];" << std::endl;

  for (const auto &Dep : MDeps) {
    Stream << "  \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
           << " [ label = \"Access mode: "
           << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
           << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
           << std::endl;
  }
}

MemCpyCommand::MemCpyCommand(Requirement SrcReq,
                             AllocaCommandBase *SrcAllocaCmd,
                             Requirement DstReq,
                             AllocaCommandBase *DstAllocaCmd,
                             QueueImplPtr SrcQueue, QueueImplPtr DstQueue)
    : Command(CommandType::COPY_MEMORY, std::move(DstQueue)),
      MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)),
      MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)),
      MDstAllocaCmd(DstAllocaCmd) {
  if (!MSrcQueue->is_host()) {
    MEvent->setContextImpl(MSrcQueue->getContextImplPtr());
  }

  MWorkerQueue = MQueue->is_host() ? MSrcQueue : MQueue;
  MEvent->setWorkerQueue(MWorkerQueue);

  emitInstrumentationDataProxy();
}

void MemCpyCommand::emitInstrumentationData() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiTraceEnabled())
    return;
  // Create a payload with the command name and an event using this payload to
  // emit a node_create
  MAddress = MSrcAllocaCmd->getSYCLMemObj();
  makeTraceEventProlog(MAddress);

  if (MFirstInstance) {
    xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
    xpti::addMetadata(CmdTraceEvent, "sycl_device",
                      deviceToID(MQueue->get_device()));
    xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
                      deviceToString(MQueue->get_device()));
    xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
                      getSyclObjImpl(MQueue->get_device())->getDeviceName());
    xpti::addMetadata(CmdTraceEvent, "memory_object",
                      reinterpret_cast<size_t>(MAddress));
    xpti::addMetadata(CmdTraceEvent, "copy_from",
                      reinterpret_cast<size_t>(
                          getSyclObjImpl(MSrcQueue->get_device()).get()));
    xpti::addMetadata(
        CmdTraceEvent, "copy_to",
        reinterpret_cast<size_t>(getSyclObjImpl(MQueue->get_device()).get()));
    makeTraceEventEpilog();
  }
#endif
}

const ContextImplPtr &MemCpyCommand::getWorkerContext() const {
  return getWorkerQueue()->getContextImplPtr();
}

bool MemCpyCommand::producesPiEvent() const {
  // TODO remove this workaround once the batching issue is addressed in the
  // Level Zero plugin.
  // Consider the following scenario on Level Zero:
  // 1. Kernel A, which uses buffer A, is submitted to queue A.
  // 2. Kernel B, which uses buffer B, is submitted to queue B.
  // 3. queueA.wait().
  // 4. queueB.wait().
  // The DPC++ runtime used to treat unmap/write commands for buffers A/B as
  // host dependencies (i.e. they were waited for prior to enqueueing any
  // command that's dependent on them). This allowed the Level Zero plugin to
  // detect that each queue is idle on steps 1/2 and submit the command list
  // right away. This is no longer the case since we started passing these
  // dependencies in an event waitlist, and the Level Zero plugin attempts to
  // batch these commands, so the execution of kernel B starts only on step 4.
  // This workaround restores the old behavior in this case until the issue is
  // resolved.
  return MQueue->is_host() ||
         MQueue->getPlugin().getBackend() != backend::ext_oneapi_level_zero ||
         MEvent->getHandleRef() != nullptr;
}

pi_int32 MemCpyCommand::enqueueImp() {
  waitForPreparedHostEvents();
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;

  RT::PiEvent &Event = MEvent->getHandleRef();

  auto RawEvents = getPiEvents(EventImpls);
  flushCrossQueueDeps(EventImpls, getWorkerQueue());

  MemoryManager::copy(
      MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(),
      MSrcQueue, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
      MSrcReq.MOffset, MSrcReq.MElemSize, MDstAllocaCmd->getMemAllocation(),
      MQueue, MDstReq.MDims, MDstReq.MMemoryRange, MDstReq.MAccessRange,
      MDstReq.MOffset, MDstReq.MElemSize, std::move(RawEvents), Event);

  return PI_SUCCESS;
}

void MemCpyCommand::printDot(std::ostream &Stream) const {
  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#C7EB15\" label=\"";

  Stream << "ID = " << this << " ; ";
  Stream << "MEMCPY ON " << deviceToString(MQueue->get_device()) << "\\n";
  Stream << "From: " << MSrcAllocaCmd << " is host: " << MSrcQueue->is_host()
         << "\\n";
  Stream << "To: " << MDstAllocaCmd << " is host: " << MQueue->is_host()
         << "\\n";

  Stream << "\"];" << std::endl;

  for (const auto &Dep : MDeps) {
    Stream << "  \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
           << " [ label = \"Access mode: "
           << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
           << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
           << std::endl;
  }
}

AllocaCommandBase *ExecCGCommand::getAllocaForReq(Requirement *Req) {
  for (const DepDesc &Dep : MDeps) {
    if (Dep.MDepRequirement == Req)
      return Dep.MAllocaCmd;
  }
  throw runtime_error("Alloca for command not found",
                      PI_ERROR_INVALID_OPERATION);
}

std::vector<StreamImplPtr> ExecCGCommand::getStreams() const {
  if (MCommandGroup->getType() == CG::Kernel)
    return ((CGExecKernel *)MCommandGroup.get())->getStreams();
  return {};
}

std::vector<std::shared_ptr<const void>>
ExecCGCommand::getAuxiliaryResources() const {
  if (MCommandGroup->getType() == CG::Kernel)
    return ((CGExecKernel *)MCommandGroup.get())->getAuxiliaryResources();
  return {};
}

void ExecCGCommand::clearStreams() {
  if (MCommandGroup->getType() == CG::Kernel)
    ((CGExecKernel *)MCommandGroup.get())->clearStreams();
}

void ExecCGCommand::clearAuxiliaryResources() {
  if (MCommandGroup->getType() == CG::Kernel)
    ((CGExecKernel *)MCommandGroup.get())->clearAuxiliaryResources();
}

pi_int32 UpdateHostRequirementCommand::enqueueImp() {
  waitForPreparedHostEvents();
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
  RT::PiEvent &Event = MEvent->getHandleRef();
  Command::waitForEvents(MQueue, EventImpls, Event);

  assert(MSrcAllocaCmd && "Expected valid alloca command");
  assert(MSrcAllocaCmd->getMemAllocation() && "Expected valid source pointer");
  assert(MDstPtr && "Expected valid target pointer");
  *MDstPtr = MSrcAllocaCmd->getMemAllocation();

  return PI_SUCCESS;
}

void UpdateHostRequirementCommand::printDot(std::ostream &Stream) const {
  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#f1337f\", label=\"";

  Stream << "ID = " << this << "\\n";
  Stream << "UPDATE REQ ON " << deviceToString(MQueue->get_device()) << "\\n";
  bool IsReqOnBuffer =
      MDstReq.MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer;
  Stream << "TYPE: " << (IsReqOnBuffer ? "Buffer" : "Image") << "\\n";
  if (IsReqOnBuffer)
    Stream << "Is sub buffer: " << std::boolalpha << MDstReq.MIsSubBuffer
           << "\\n";

  Stream << "\"];" << std::endl;

  for (const auto &Dep : MDeps) {
    Stream << "  \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
           << " [ label = \"Access mode: "
           << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
           << "MemObj: " << Dep.MAllocaCmd->getSYCLMemObj() << " \" ]"
           << std::endl;
  }
}

MemCpyCommandHost::MemCpyCommandHost(Requirement SrcReq,
                                     AllocaCommandBase *SrcAllocaCmd,
                                     Requirement DstReq, void **DstPtr,
                                     QueueImplPtr SrcQueue,
                                     QueueImplPtr DstQueue)
    : Command(CommandType::COPY_MEMORY, std::move(DstQueue)),
      MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)),
      MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)), MDstPtr(DstPtr) {
  if (!MSrcQueue->is_host()) {
    MEvent->setContextImpl(MSrcQueue->getContextImplPtr());
  }

  MWorkerQueue = MQueue->is_host() ? MSrcQueue : MQueue;
  MEvent->setWorkerQueue(MWorkerQueue);

  emitInstrumentationDataProxy();
}

void MemCpyCommandHost::emitInstrumentationData() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiTraceEnabled())
    return;
  // Create a payload with the command name and an event using this payload to
  // emit a node_create
  MAddress = MSrcAllocaCmd->getSYCLMemObj();
  makeTraceEventProlog(MAddress);

  if (MFirstInstance) {
    xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
    xpti::addMetadata(CmdTraceEvent, "sycl_device",
                      deviceToID(MQueue->get_device()));
    xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
                      deviceToString(MQueue->get_device()));
    xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
                      getSyclObjImpl(MQueue->get_device())->getDeviceName());
    xpti::addMetadata(CmdTraceEvent, "memory_object",
                      reinterpret_cast<size_t>(MAddress));
    xpti::addMetadata(CmdTraceEvent, "copy_from",
                      reinterpret_cast<size_t>(
                          getSyclObjImpl(MSrcQueue->get_device()).get()));
    xpti::addMetadata(
        CmdTraceEvent, "copy_to",
        reinterpret_cast<size_t>(getSyclObjImpl(MQueue->get_device()).get()));
    makeTraceEventEpilog();
  }
#endif
}

const ContextImplPtr &MemCpyCommandHost::getWorkerContext() const {
  return getWorkerQueue()->getContextImplPtr();
}

pi_int32 MemCpyCommandHost::enqueueImp() {
  const QueueImplPtr &Queue = getWorkerQueue();
  waitForPreparedHostEvents();
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
  std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);

  RT::PiEvent &Event = MEvent->getHandleRef();
  // Omit copying if the access mode is a discard one.
  // TODO: Handle this at graph-building time by, for example, creating an
  // empty node instead of a memcpy.
  if (MDstReq.MAccessMode == access::mode::discard_read_write ||
      MDstReq.MAccessMode == access::mode::discard_write) {
    Command::waitForEvents(Queue, EventImpls, Event);

    return PI_SUCCESS;
  }

  flushCrossQueueDeps(EventImpls, getWorkerQueue());
  MemoryManager::copy(
      MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(),
      MSrcQueue, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
      MSrcReq.MOffset, MSrcReq.MElemSize, *MDstPtr, MQueue, MDstReq.MDims,
      MDstReq.MMemoryRange, MDstReq.MAccessRange, MDstReq.MOffset,
      MDstReq.MElemSize, std::move(RawEvents), Event);

  return PI_SUCCESS;
}

EmptyCommand::EmptyCommand(QueueImplPtr Queue)
    : Command(CommandType::EMPTY_TASK, std::move(Queue)) {
  emitInstrumentationDataProxy();
}

pi_int32 EmptyCommand::enqueueImp() {
  waitForPreparedHostEvents();
  waitForEvents(MQueue, MPreparedDepsEvents, MEvent->getHandleRef());

  return PI_SUCCESS;
}

void EmptyCommand::addRequirement(Command *DepCmd, AllocaCommandBase *AllocaCmd,
                                  const Requirement *Req) {
  const Requirement &ReqRef = *Req;
  MRequirements.emplace_back(ReqRef);
  const Requirement *const StoredReq = &MRequirements.back();

  // EmptyCommand is always a host command, so we expect the result of addDep
  // to be null
  std::vector<Command *> ToCleanUp;
  Command *Cmd = addDep(DepDesc{DepCmd, StoredReq, AllocaCmd}, ToCleanUp);
  assert(Cmd == nullptr && "Connection command should be null for EmptyCommand");
  assert(ToCleanUp.empty() && "addDep should add a command for cleanup only if "
                              "there's a connection command");
  (void)Cmd;
}

void EmptyCommand::emitInstrumentationData() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiTraceEnabled())
    return;
  // Create a payload with the command name and an event using this payload to
  // emit a node_create
  if (MRequirements.empty())
    return;

  Requirement &Req = *MRequirements.begin();

  MAddress = Req.MSYCLMemObj;
  makeTraceEventProlog(MAddress);

  if (MFirstInstance) {
    xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
    xpti::addMetadata(CmdTraceEvent, "sycl_device",
                      deviceToID(MQueue->get_device()));
    xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
                      deviceToString(MQueue->get_device()));
    xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
                      getSyclObjImpl(MQueue->get_device())->getDeviceName());
    xpti::addMetadata(CmdTraceEvent, "memory_object",
                      reinterpret_cast<size_t>(MAddress));
    makeTraceEventEpilog();
  }
#endif
}

void EmptyCommand::printDot(std::ostream &Stream) const {
  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#8d8f29\", label=\"";

  Stream << "ID = " << this << "\\n";
  Stream << "EMPTY NODE"
         << "\\n";

  Stream << "\"];" << std::endl;

  for (const auto &Dep : MDeps) {
    Stream << "  \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
           << " [ label = \"Access mode: "
           << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
           << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
           << std::endl;
  }
}

bool EmptyCommand::producesPiEvent() const { return false; }

void MemCpyCommandHost::printDot(std::ostream &Stream) const {
  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#B6A2EB\", label=\"";

  Stream << "ID = " << this << "\\n";
  Stream << "MEMCPY HOST ON " << deviceToString(MQueue->get_device()) << "\\n";

  Stream << "\"];" << std::endl;

  for (const auto &Dep : MDeps) {
    Stream << "  \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
           << " [ label = \"Access mode: "
           << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
           << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
           << std::endl;
  }
}
1632 
1633 UpdateHostRequirementCommand::UpdateHostRequirementCommand(
1634  QueueImplPtr Queue, Requirement Req, AllocaCommandBase *SrcAllocaCmd,
1635  void **DstPtr)
1636  : Command(CommandType::UPDATE_REQUIREMENT, std::move(Queue)),
1637  MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(Req)), MDstPtr(DstPtr) {
1638 
1639  emitInstrumentationDataProxy();
1640 }
1641 
1642 void UpdateHostRequirementCommand::emitInstrumentationData() {
1643 #ifdef XPTI_ENABLE_INSTRUMENTATION
1644  if (!xptiTraceEnabled())
1645  return;
1646  // Create a payload with the command name and an event using this payload to
1647  // emit a node_create
1648  MAddress = MSrcAllocaCmd->getSYCLMemObj();
1649  makeTraceEventProlog(MAddress);
1650 
1651  if (MFirstInstance) {
1652  xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
1653  xpti::addMetadata(CmdTraceEvent, "sycl_device",
1654  deviceToID(MQueue->get_device()));
1655  xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
1656  deviceToString(MQueue->get_device()));
1657  xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
1658  getSyclObjImpl(MQueue->get_device())->getDeviceName());
1659  xpti::addMetadata(CmdTraceEvent, "memory_object",
1660  reinterpret_cast<size_t>(MAddress));
1661  makeTraceEventEpilog();
1662  }
1663 #endif
1664 }
1665 
1666 static std::string cgTypeToString(detail::CG::CGTYPE Type) {
1667  switch (Type) {
1668  case detail::CG::Kernel:
1669  return "Kernel";
1670  break;
1671  case detail::CG::UpdateHost:
1672  return "update_host";
1673  break;
1674  case detail::CG::Fill:
1675  return "fill";
1676  break;
1677  case detail::CG::CopyAccToAcc:
1678  return "copy acc to acc";
1679  break;
1680  case detail::CG::CopyAccToPtr:
1681  return "copy acc to ptr";
1682  break;
1683  case detail::CG::CopyPtrToAcc:
1684  return "copy ptr to acc";
1685  break;
1686  case detail::CG::CopyUSM:
1687  return "copy usm";
1688  break;
1689  case detail::CG::FillUSM:
1690  return "fill usm";
1691  break;
1692  case detail::CG::PrefetchUSM:
1693  return "prefetch usm";
1694  break;
1695  case detail::CG::CodeplayHostTask:
1696  return "host task";
1697  break;
1698  default:
1699  return "unknown";
1700  break;
1701  }
1702 }
1703 
1704 ExecCGCommand::ExecCGCommand(std::unique_ptr<detail::CG> CommandGroup,
1705  QueueImplPtr Queue)
1706  : Command(CommandType::RUN_CG, std::move(Queue)),
1707  MCommandGroup(std::move(CommandGroup)) {
1708  if (MCommandGroup->getType() == detail::CG::CodeplayHostTask) {
1709  MEvent->setSubmittedQueue(
1710  static_cast<detail::CGHostTask *>(MCommandGroup.get())->MQueue);
1711  MEvent->setNeedsCleanupAfterWait(true);
1712  } else if (MCommandGroup->getType() == CG::CGTYPE::Kernel &&
1713  (static_cast<CGExecKernel *>(MCommandGroup.get())->hasStreams() ||
1714  static_cast<CGExecKernel *>(MCommandGroup.get())
1715  ->hasAuxiliaryResources()))
1716  MEvent->setNeedsCleanupAfterWait(true);
1717 
1718  emitInstrumentationDataProxy();
1719 }
1720 
1721 void ExecCGCommand::emitInstrumentationData() {
1722 #ifdef XPTI_ENABLE_INSTRUMENTATION
1723  if (!xptiTraceEnabled())
1724  return;
1725  // Create a payload with the command name and an event using this payload to
1726  // emit a node_create
1727  bool HasSourceInfo = false;
1728  std::string KernelName;
1729  std::optional<bool> FromSource;
1730  switch (MCommandGroup->getType()) {
1731  case detail::CG::Kernel: {
1732  auto KernelCG =
1733  reinterpret_cast<detail::CGExecKernel *>(MCommandGroup.get());
1734 
1735  if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource()) {
1736  FromSource = true;
1737  pi_kernel KernelHandle = KernelCG->MSyclKernel->getHandleRef();
1738  MAddress = KernelHandle;
1739  KernelName = MCommandGroup->MFunctionName;
1740  } else {
1741  FromSource = false;
1742  KernelName = demangleKernelName(KernelCG->getKernelName());
1743  }
1744  } break;
1745  default:
1746  KernelName = cgTypeToString(MCommandGroup->getType());
1747  break;
1748  }
1749  std::string CommandType = commandToNodeType(MType);
1750  // Get source file, line number information from the CommandGroup object
1751  // and create payload using name, address, and source info
1752  //
1753  // On Windows, since support for the builtin functions is not available in
1754  // MSVC, MFileName and MLine will be set to nullptr and 0, respectively.
1755  // Handle this condition explicitly here.
1756  xpti::payload_t Payload;
1757  if (!MCommandGroup->MFileName.empty()) {
1758  // File name has a valid string
1759  Payload =
1760  xpti::payload_t(KernelName.c_str(), MCommandGroup->MFileName.c_str(),
1761  MCommandGroup->MLine, MCommandGroup->MColumn, MAddress);
1762  HasSourceInfo = true;
1763  } else if (MAddress) {
1764  // We have a valid function name and an address
1765  Payload = xpti::payload_t(KernelName.c_str(), MAddress);
1766  } else {
1767  // In any case, we will have a valid function name and we'll use that to
1768  // create the hash
1769  Payload = xpti::payload_t(KernelName.c_str());
1770  }
1771 
1772  uint64_t CGKernelInstanceNo;
1773  // Create event using the payload
1774  xpti_td *CmdTraceEvent =
1775  xptiMakeEvent("ExecCG", &Payload, xpti::trace_graph_event,
1776  xpti::trace_activity_type_t::active, &CGKernelInstanceNo);
1777 
1778  if (CmdTraceEvent) {
1779  MInstanceID = CGKernelInstanceNo;
1780  MTraceEvent = (void *)CmdTraceEvent;
1781  // If we are seeing this event again, then the instance ID will be greater
1782  // than 1; in this case, we will skip sending a notification to create a
1783  // node as this node has already been created.
1784  if (CGKernelInstanceNo > 1)
1785  return;
1786 
1787  xpti::addMetadata(CmdTraceEvent, "sycl_device",
1788  deviceToID(MQueue->get_device()));
1789  xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
1790  deviceToString(MQueue->get_device()));
1791  xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
1792  getSyclObjImpl(MQueue->get_device())->getDeviceName());
1793  if (!KernelName.empty()) {
1794  xpti::addMetadata(CmdTraceEvent, "kernel_name", KernelName);
1795  }
1796  if (FromSource.has_value()) {
1797  xpti::addMetadata(CmdTraceEvent, "from_source", FromSource.value());
1798  }
1799  if (HasSourceInfo) {
1800  xpti::addMetadata(CmdTraceEvent, "sym_function_name", KernelName);
1801  xpti::addMetadata(CmdTraceEvent, "sym_source_file_name",
1802  MCommandGroup->MFileName);
1803  xpti::addMetadata(CmdTraceEvent, "sym_line_no", MCommandGroup->MLine);
1804  xpti::addMetadata(CmdTraceEvent, "sym_column_no", MCommandGroup->MColumn);
1805  }
1806 
1807  if (MCommandGroup->getType() == detail::CG::Kernel) {
1808  auto KernelCG =
1809  reinterpret_cast<detail::CGExecKernel *>(MCommandGroup.get());
1810  auto &NDRDesc = KernelCG->MNDRDesc;
1811  std::vector<ArgDesc> Args;
1812 
1813  auto FilterArgs = [&Args](detail::ArgDesc &Arg, int NextTrueIndex) {
1814  Args.push_back({Arg.MType, Arg.MPtr, Arg.MSize, NextTrueIndex});
1815  };
1816  RT::PiProgram Program = nullptr;
1817  RT::PiKernel Kernel = nullptr;
1818  std::mutex *KernelMutex = nullptr;
1819 
1820  std::shared_ptr<kernel_impl> SyclKernelImpl;
1821  std::shared_ptr<device_image_impl> DeviceImageImpl;
1822  auto KernelBundleImplPtr = KernelCG->getKernelBundle();
1823 
1824  // Use kernel_bundle if available unless it is interop.
1825  // Interop bundles can't be used in the first branch, because the kernels
1826  // in interop kernel bundles (if any) do not have kernel_id
1827  // and can therefore not be looked up, but since they are self-contained
1828  // they can simply be launched directly.
1829  if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) {
1830  kernel_id KernelID =
1831  detail::ProgramManager::getInstance().getSYCLKernelID(
1832  KernelCG->MKernelName);
1833  kernel SyclKernel =
1834  KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr);
1835  Program = detail::getSyclObjImpl(SyclKernel)
1836  ->getDeviceImage()
1837  ->get_program_ref();
1838  } else if (nullptr != KernelCG->MSyclKernel) {
1839  auto SyclProg = KernelCG->MSyclKernel->getProgramImpl();
1840  Program = SyclProg->getHandleRef();
1841  } else {
1842  std::tie(Kernel, KernelMutex, Program) =
1843  detail::ProgramManager::getInstance().getOrCreateKernel(
1844  KernelCG->MOSModuleHandle, MQueue->getContextImplPtr(),
1845  MQueue->getDeviceImplPtr(), KernelCG->MKernelName, nullptr);
1846  }
1847 
1848  ProgramManager::KernelArgMask EliminatedArgMask;
1849  if (nullptr == KernelCG->MSyclKernel ||
1850  !KernelCG->MSyclKernel->isCreatedFromSource()) {
1851  EliminatedArgMask =
1852  detail::ProgramManager::getInstance().getEliminatedKernelArgMask(
1853  KernelCG->MOSModuleHandle, Program, KernelCG->MKernelName);
1854  }
1855 
1856  applyFuncOnFilteredArgs(EliminatedArgMask, KernelCG->MArgs, FilterArgs);
1857 
1858  xpti::offload_kernel_enqueue_data_t KernelData{
1859  {NDRDesc.GlobalSize[0], NDRDesc.GlobalSize[1], NDRDesc.GlobalSize[2]},
1860  {NDRDesc.LocalSize[0], NDRDesc.LocalSize[1], NDRDesc.LocalSize[2]},
1861  {NDRDesc.GlobalOffset[0], NDRDesc.GlobalOffset[1],
1862  NDRDesc.GlobalOffset[2]},
1863  Args.size()};
1864  xpti::addMetadata(CmdTraceEvent, "enqueue_kernel_data", KernelData);
1865  for (size_t i = 0; i < Args.size(); i++) {
1866  std::string Prefix("arg");
1867  xpti::offload_kernel_arg_data_t arg{(int)Args[i].MType, Args[i].MPtr,
1868  Args[i].MSize, Args[i].MIndex};
1869  xpti::addMetadata(CmdTraceEvent, Prefix + std::to_string(i), arg);
1870  }
1871  }
1872 
1873  xptiNotifySubscribers(MStreamID, xpti::trace_node_create,
1874  detail::GSYCLGraphEvent, CmdTraceEvent,
1875  CGKernelInstanceNo,
1876  static_cast<const void *>(CommandType.c_str()));
1877  }
1878 #endif
1879 }
1880 
1881 void ExecCGCommand::printDot(std::ostream &Stream) const {
1882  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\"";
1883 
1884  Stream << "ID = " << this << "\\n";
1885  Stream << "EXEC CG ON " << deviceToString(MQueue->get_device()) << "\\n";
1886 
1887  switch (MCommandGroup->getType()) {
1888  case detail::CG::Kernel: {
1889  auto KernelCG =
1890  reinterpret_cast<detail::CGExecKernel *>(MCommandGroup.get());
1891  Stream << "Kernel name: ";
1892  if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource())
1893  Stream << "created from source";
1894  else
1895  Stream << demangleKernelName(KernelCG->getKernelName());
1896  Stream << "\\n";
1897  break;
1898  }
1899  default:
1900  Stream << "CG type: " << cgTypeToString(MCommandGroup->getType()) << "\\n";
1901  break;
1902  }
1903 
1904  Stream << "\"];" << std::endl;
1905 
1906  for (const auto &Dep : MDeps) {
1907  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1908  << " [ label = \"Access mode: "
1909  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1910  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1911  << std::endl;
1912  }
1913 }
1914 
1915 // SYCL has a parallel_for_work_group variant where the only NDRange
1916 // characteristic set by the user is the number of work-groups. This does not
1917 // map to the OpenCL clEnqueueNDRangeKernel API, which requires the global work
1918 // size to be set as well. This function determines the local work size based
1919 // on the device characteristics and the number of work-groups requested by the
1920 // user, then calculates the global work size.
1921 // SYCL specification (from 4.8.5.3):
1922 // The member function handler::parallel_for_work_group is parameterized by the
1923 // number of work-groups, such that the size of each group is chosen by the
1924 // runtime, or by the number of work-groups and number of work-items for
1925 // users who need more control.
1926 static void adjustNDRangePerKernel(NDRDescT &NDR, RT::PiKernel Kernel,
1927  const device_impl &DeviceImpl) {
1928  if (NDR.GlobalSize[0] != 0)
1929  return; // GlobalSize is set - no need to adjust
1930  // check the prerequisites:
1931  assert(NDR.LocalSize[0] == 0);
1932  // TODO might be good to cache this info together with the kernel info to
1933  // avoid get_kernel_work_group_info on every kernel run
1934  auto WGSize = get_kernel_device_specific_info<
1935  sycl::info::kernel_device_specific::compile_work_group_size>(
1936  Kernel, DeviceImpl.getHandleRef(), DeviceImpl.getPlugin());
1937 
1938  if (WGSize[0] == 0) {
1939  WGSize = {1, 1, 1};
1940  }
1941  NDR.set(NDR.Dims, nd_range<3>(NDR.NumWorkGroups * WGSize, WGSize));
1942 }
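// Worked example (hypothetical values, for illustration only): if the query
// above reports a compile-time work-group size of {8, 1, 1} (e.g. the kernel
// carries a reqd_work_group_size(8, 1, 1) attribute) and the user requested 4
// work-groups, the launch becomes
//   nd_range<3>{NDR.NumWorkGroups * WGSize, WGSize} == {{32, 1, 1}, {8, 1, 1}}
// If no compile-time size is reported, WGSize falls back to {1, 1, 1} and the
// global size equals the number of work-groups.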
1943 
1944 // We have the following mapping between SYCL dimensions and SPIR-V builtins:
1945 // 1D: id[0] -> x
1946 // 2D: id[0] -> y, id[1] -> x
1947 // 3D: id[0] -> z, id[1] -> y, id[2] -> x
1948 // So, to ensure correctness, we update all the kernel
1949 // parameters accordingly.
1950 // Initially we keep the order of NDRDescT as provided by the user; this
1951 // simplifies overall handling, and we do the reversal only when
1952 // the kernel is enqueued.
1954  if (NDR.Dims > 1) {
1955  std::swap(NDR.GlobalSize[0], NDR.GlobalSize[NDR.Dims - 1]);
1956  std::swap(NDR.LocalSize[0], NDR.LocalSize[NDR.Dims - 1]);
1957  std::swap(NDR.GlobalOffset[0], NDR.GlobalOffset[NDR.Dims - 1]);
1958  }
1959 }
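// Example (hypothetical values): a 3D launch stored in user order as
// GlobalSize {4, 8, 16} becomes {16, 8, 4} after the swaps above, so the
// fastest-varying SPIR-V dimension x (work dimension 0 at the PI level)
// corresponds to the user's last dimension, matching the id mapping above.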
1960 
1961 static pi_result SetKernelParamsAndLaunch(
1962  const QueueImplPtr &Queue, std::vector<ArgDesc> &Args,
1963  const std::shared_ptr<device_image_impl> &DeviceImageImpl,
1964  RT::PiKernel Kernel, NDRDescT &NDRDesc, std::vector<RT::PiEvent> &RawEvents,
1965  RT::PiEvent *OutEvent,
1966  const ProgramManager::KernelArgMask &EliminatedArgMask,
1967  const std::function<void *(Requirement *Req)> &getMemAllocationFunc) {
1968  const detail::plugin &Plugin = Queue->getPlugin();
1969 
1970  auto setFunc = [&Plugin, Kernel, &DeviceImageImpl, &getMemAllocationFunc,
1971  &Queue](detail::ArgDesc &Arg, size_t NextTrueIndex) {
1972  switch (Arg.MType) {
1973  case kernel_param_kind_t::kind_stream:
1974  break;
1975  case kernel_param_kind_t::kind_accessor: {
1976  Requirement *Req = (Requirement *)(Arg.MPtr);
1977  if (Req->MAccessRange == range<3>({0, 0, 0}))
1978  break;
1979  if (getMemAllocationFunc == nullptr)
1980  throw sycl::exception(make_error_code(errc::kernel_argument),
1981  "placeholder accessor must be bound by calling "
1982  "handler::require() before it can be used.");
1983 
1984  RT::PiMem MemArg = (RT::PiMem)getMemAllocationFunc(Req);
1985  if (Plugin.getBackend() == backend::opencl) {
1986  Plugin.call<PiApiKind::piKernelSetArg>(Kernel, NextTrueIndex,
1987  sizeof(RT::PiMem), &MemArg);
1988  } else {
1989  Plugin.call<PiApiKind::piextKernelSetArgMemObj>(Kernel, NextTrueIndex,
1990  &MemArg);
1991  }
1992  break;
1993  }
1994  case kernel_param_kind_t::kind_std_layout: {
1995  Plugin.call<PiApiKind::piKernelSetArg>(Kernel, NextTrueIndex, Arg.MSize,
1996  Arg.MPtr);
1997  break;
1998  }
1999  case kernel_param_kind_t::kind_sampler: {
2000  sampler *SamplerPtr = (sampler *)Arg.MPtr;
2001  RT::PiSampler Sampler = detail::getSyclObjImpl(*SamplerPtr)
2002  ->getOrCreateSampler(Queue->get_context());
2003  Plugin.call<PiApiKind::piextKernelSetArgSampler>(Kernel, NextTrueIndex,
2004  &Sampler);
2005  break;
2006  }
2007  case kernel_param_kind_t::kind_pointer: {
2008  Plugin.call<PiApiKind::piextKernelSetArgPointer>(Kernel, NextTrueIndex,
2009  Arg.MSize, Arg.MPtr);
2010  break;
2011  }
2012  case kernel_param_kind_t::kind_specialization_constants_buffer: {
2013  if (Queue->is_host()) {
2014  throw sycl::feature_not_supported(
2015  "SYCL2020 specialization constants are not yet supported on host "
2016  "device",
2017  PI_ERROR_INVALID_OPERATION);
2018  }
2019  assert(DeviceImageImpl != nullptr);
2020  RT::PiMem SpecConstsBuffer = DeviceImageImpl->get_spec_const_buffer_ref();
2021  // Avoid taking an address of nullptr
2022  RT::PiMem *SpecConstsBufferArg =
2023  SpecConstsBuffer ? &SpecConstsBuffer : nullptr;
2024  Plugin.call<PiApiKind::piextKernelSetArgMemObj>(Kernel, NextTrueIndex,
2025  SpecConstsBufferArg);
2026  break;
2027  }
2028  case kernel_param_kind_t::kind_invalid:
2029  throw runtime_error("Invalid kernel param kind", PI_ERROR_INVALID_VALUE);
2030  break;
2031  }
2032  };
2033 
2034  applyFuncOnFilteredArgs(EliminatedArgMask, Args, setFunc);
2035 
2036  adjustNDRangePerKernel(NDRDesc, Kernel, *(Queue->getDeviceImplPtr()));
2037 
2038  // Remember this information before the range dimensions are reversed
2039  const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0);
2040 
2041  ReverseRangeDimensionsForKernel(NDRDesc);
2042 
2043  size_t RequiredWGSize[3] = {0, 0, 0};
2044  size_t *LocalSize = nullptr;
2045 
2046  if (HasLocalSize)
2047  LocalSize = &NDRDesc.LocalSize[0];
2048  else {
2049  Plugin.call<PiApiKind::piKernelGetGroupInfo>(
2050  Kernel, Queue->getDeviceImplPtr()->getHandleRef(),
2051  PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, sizeof(RequiredWGSize),
2052  RequiredWGSize, /* param_value_size_ret = */ nullptr);
2053 
2054  const bool EnforcedLocalSize =
2055  (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
2056  RequiredWGSize[2] != 0);
2057  if (EnforcedLocalSize)
2058  LocalSize = RequiredWGSize;
2059  }
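  // At this point LocalSize is either the user-provided local range, the
  // kernel's compile-time required work-group size (if it specifies one), or
  // nullptr, in which case the backend picks the work-group size itself.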
2060 
2061  pi_result Error = Plugin.call_nocheck<PiApiKind::piEnqueueKernelLaunch>(
2062  Queue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
2063  &NDRDesc.GlobalSize[0], LocalSize, RawEvents.size(),
2064  RawEvents.empty() ? nullptr : &RawEvents[0], OutEvent);
2065  return Error;
2066 }
2067 
2068 // This function initializes accessors and calls the lambda.
2069 // It is used as an argument to piEnqueueNativeKernel, which requires
2070 // that the passed function take a single void* argument.
2071 void DispatchNativeKernel(void *Blob) {
2072  void **CastedBlob = (void **)Blob;
2073 
2074  std::vector<Requirement *> *Reqs =
2075  static_cast<std::vector<Requirement *> *>(CastedBlob[0]);
2076 
2077  std::unique_ptr<HostKernelBase> *HostKernel =
2078  static_cast<std::unique_ptr<HostKernelBase> *>(CastedBlob[1]);
2079 
2080  NDRDescT *NDRDesc = static_cast<NDRDescT *>(CastedBlob[2]);
2081 
2082  // The other values are pointers to the buffers.
2083  void **NextArg = CastedBlob + 3;
2084  for (detail::Requirement *Req : *Reqs)
2085  Req->MData = *(NextArg++);
2086 
2087  (*HostKernel)->call(*NDRDesc, nullptr);
2088 
2089  // Ownership of these objects has been passed to us; clean them up.
2090  delete Reqs;
2091  delete HostKernel;
2092  delete NDRDesc;
2093 }
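// For reference, the blob layout that enqueueImp (the RunOnHostIntel case
// below) builds for DispatchNativeKernel is:
//   CastedBlob[0]  -> std::vector<Requirement *> *      (heap copy)
//   CastedBlob[1]  -> std::unique_ptr<HostKernelBase> * (moved host kernel)
//   CastedBlob[2]  -> NDRDescT *                        (copied ND-range)
//   CastedBlob[3+] -> one entry per requirement, patched by the plugin (or
//                     by enqueueImp itself on the host device) to point at
//                     the actual memory allocations.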
2094 
2095 pi_int32 enqueueImpKernel(
2096  const QueueImplPtr &Queue, NDRDescT &NDRDesc, std::vector<ArgDesc> &Args,
2097  const std::shared_ptr<detail::kernel_bundle_impl> &KernelBundleImplPtr,
2098  const std::shared_ptr<detail::kernel_impl> &MSyclKernel,
2099  const std::string &KernelName, const detail::OSModuleHandle &OSModuleHandle,
2100  std::vector<RT::PiEvent> &RawEvents, RT::PiEvent *OutEvent,
2101  const std::function<void *(Requirement *Req)> &getMemAllocationFunc) {
2102 
2103  // Run OpenCL kernel
2104  auto ContextImpl = Queue->getContextImplPtr();
2105  auto DeviceImpl = Queue->getDeviceImplPtr();
2106  RT::PiKernel Kernel = nullptr;
2107  std::mutex *KernelMutex = nullptr;
2108  RT::PiProgram Program = nullptr;
2109 
2110  std::shared_ptr<kernel_impl> SyclKernelImpl;
2111  std::shared_ptr<device_image_impl> DeviceImageImpl;
2112 
2113  // Use kernel_bundle if available unless it is interop.
2114  // Interop bundles can't be used in the first branch, because the kernels
2115  // in interop kernel bundles (if any) do not have kernel_id
2116  // and can therefore not be looked up, but since they are self-contained
2117  // they can simply be launched directly.
2118  if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) {
2119  kernel_id KernelID =
2120  detail::ProgramManager::getInstance().getSYCLKernelID(KernelName);
2121  kernel SyclKernel =
2122  KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr);
2123 
2124  SyclKernelImpl = detail::getSyclObjImpl(SyclKernel);
2125 
2126  Kernel = SyclKernelImpl->getHandleRef();
2127  DeviceImageImpl = SyclKernelImpl->getDeviceImage();
2128 
2129  Program = DeviceImageImpl->get_program_ref();
2130 
2131  std::tie(Kernel, KernelMutex) =
2132  detail::ProgramManager::getInstance().getOrCreateKernel(
2133  KernelBundleImplPtr->get_context(), KernelName,
2134  /*PropList=*/{}, Program);
2135  } else if (nullptr != MSyclKernel) {
2136  assert(MSyclKernel->get_info<info::kernel::context>() ==
2137  Queue->get_context());
2138  Kernel = MSyclKernel->getHandleRef();
2139  auto SyclProg = MSyclKernel->getProgramImpl();
2140  Program = SyclProg->getHandleRef();
2141  if (SyclProg->is_cacheable()) {
2142  RT::PiKernel FoundKernel = nullptr;
2143  std::tie(FoundKernel, KernelMutex, std::ignore) =
2144  detail::ProgramManager::getInstance().getOrCreateKernel(
2145  OSModuleHandle, ContextImpl, DeviceImpl, KernelName,
2146  SyclProg.get());
2147  assert(FoundKernel == Kernel);
2148  }
2149  } else {
2150  std::tie(Kernel, KernelMutex, Program) =
2151  detail::ProgramManager::getInstance().getOrCreateKernel(
2152  OSModuleHandle, ContextImpl, DeviceImpl, KernelName, nullptr);
2153  }
2154 
2155  pi_result Error = PI_SUCCESS;
2156  ProgramManager::KernelArgMask EliminatedArgMask;
2157  if (nullptr == MSyclKernel || !MSyclKernel->isCreatedFromSource()) {
2158  EliminatedArgMask =
2159  detail::ProgramManager::getInstance().getEliminatedKernelArgMask(
2160  OSModuleHandle, Program, KernelName);
2161  }
2162  if (KernelMutex != nullptr) {
2163  // For cacheable kernels, we use per-kernel mutex
2164  std::lock_guard<std::mutex> Lock(*KernelMutex);
2165  Error = SetKernelParamsAndLaunch(Queue, Args, DeviceImageImpl, Kernel,
2166  NDRDesc, RawEvents, OutEvent,
2167  EliminatedArgMask, getMemAllocationFunc);
2168  } else {
2169  Error = SetKernelParamsAndLaunch(Queue, Args, DeviceImageImpl, Kernel,
2170  NDRDesc, RawEvents, OutEvent,
2171  EliminatedArgMask, getMemAllocationFunc);
2172  }
2173 
2174  if (PI_SUCCESS != Error) {
2175  // If we have got a non-success error code, analyze it to emit a nice
2176  // exception explaining what went wrong
2177  const device_impl &DeviceImpl = *(Queue->getDeviceImplPtr());
2178  detail::enqueue_kernel_launch::handleErrorOrWarning(Error, DeviceImpl,
2179  Kernel, NDRDesc);
2180  }
2181 
2182  return PI_SUCCESS;
2183 }
2184 
2185 pi_int32 ExecCGCommand::enqueueImp() {
2186  if (getCG().getType() != CG::CGTYPE::CodeplayHostTask)
2187  waitForPreparedHostEvents();
2188  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
2189  auto RawEvents = getPiEvents(EventImpls);
2190  flushCrossQueueDeps(EventImpls, getWorkerQueue());
2191 
2192  RT::PiEvent *Event = (MQueue->has_discard_events_support() &&
2193  MCommandGroup->MRequirements.size() == 0)
2194  ? nullptr
2195  : &MEvent->getHandleRef();
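  // Note: a null Event pointer asks the plugin not to produce an event at
  // all; this is only done when the queue supports discarding events (the
  // ext::oneapi::property::queue::discard_events extension) and the command
  // group has no buffer requirements that later commands could depend on.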
2196  switch (MCommandGroup->getType()) {
2197 
2198  case CG::CGTYPE::UpdateHost: {
2199  throw runtime_error("Update host should be handled by the Scheduler.",
2200  PI_ERROR_INVALID_OPERATION);
2201  }
2202  case CG::CGTYPE::CopyAccToPtr: {
2203  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2204  Requirement *Req = (Requirement *)Copy->getSrc();
2205  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2206 
2207  MemoryManager::copy(
2208  AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(), MQueue,
2209  Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2210  Req->MElemSize, Copy->getDst(),
2211  Scheduler::getInstance().getDefaultHostQueue(), Req->MDims,
2212  Req->MAccessRange, Req->MAccessRange, /*DstOffset=*/{0, 0, 0},
2213  Req->MElemSize, std::move(RawEvents), MEvent->getHandleRef());
2214 
2215  return PI_SUCCESS;
2216  }
2217  case CG::CGTYPE::CopyPtrToAcc: {
2218  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2219  Requirement *Req = (Requirement *)(Copy->getDst());
2220  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2221 
2222  Scheduler::getInstance().getDefaultHostQueue();
2223 
2224  MemoryManager::copy(
2225  AllocaCmd->getSYCLMemObj(), Copy->getSrc(),
2226  Scheduler::getInstance().getDefaultHostQueue(), Req->MDims,
2227  Req->MAccessRange, Req->MAccessRange,
2228  /*SrcOffset*/ {0, 0, 0}, Req->MElemSize, AllocaCmd->getMemAllocation(),
2229  MQueue, Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2230  Req->MElemSize, std::move(RawEvents), MEvent->getHandleRef());
2231 
2232  return PI_SUCCESS;
2233  }
2234  case CG::CGTYPE::CopyAccToAcc: {
2235  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2236  Requirement *ReqSrc = (Requirement *)(Copy->getSrc());
2237  Requirement *ReqDst = (Requirement *)(Copy->getDst());
2238 
2239  AllocaCommandBase *AllocaCmdSrc = getAllocaForReq(ReqSrc);
2240  AllocaCommandBase *AllocaCmdDst = getAllocaForReq(ReqDst);
2241 
2242  MemoryManager::copy(
2243  AllocaCmdSrc->getSYCLMemObj(), AllocaCmdSrc->getMemAllocation(), MQueue,
2244  ReqSrc->MDims, ReqSrc->MMemoryRange, ReqSrc->MAccessRange,
2245  ReqSrc->MOffset, ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(),
2246  MQueue, ReqDst->MDims, ReqDst->MMemoryRange, ReqDst->MAccessRange,
2247  ReqDst->MOffset, ReqDst->MElemSize, std::move(RawEvents),
2248  MEvent->getHandleRef());
2249 
2250  return PI_SUCCESS;
2251  }
2252  case CG::CGTYPE::Fill: {
2253  CGFill *Fill = (CGFill *)MCommandGroup.get();
2254  Requirement *Req = (Requirement *)(Fill->getReqToFill());
2255  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2256 
2257  MemoryManager::fill(
2258  AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(), MQueue,
2259  Fill->MPattern.size(), Fill->MPattern.data(), Req->MDims,
2260  Req->MMemoryRange, Req->MAccessRange, Req->MOffset, Req->MElemSize,
2261  std::move(RawEvents), MEvent->getHandleRef());
2262 
2263  return PI_SUCCESS;
2264  }
2265  case CG::CGTYPE::RunOnHostIntel: {
2266  CGExecKernel *HostTask = (CGExecKernel *)MCommandGroup.get();
2267 
2268  // piEnqueueNativeKernel takes an arguments blob which is passed to the
2269  // user function.
2270  // The following items are needed to restore the context in the host task.
2271  // Make a copy on the heap to "detach" from the command group, as it can be
2272  // released before the host task completes.
2273  std::vector<void *> ArgsBlob(HostTask->MArgs.size() + 3);
2274 
2275  std::vector<Requirement *> *CopyReqs =
2276  new std::vector<Requirement *>(HostTask->MRequirements);
2277 
2278  // Not actually a copy, but a move. This should be OK, as MHostKernel is
2279  // not expected to be used elsewhere.
2280  std::unique_ptr<HostKernelBase> *CopyHostKernel =
2281  new std::unique_ptr<HostKernelBase>(std::move(HostTask->MHostKernel));
2282 
2283  NDRDescT *CopyNDRDesc = new NDRDescT(HostTask->MNDRDesc);
2284 
2285  ArgsBlob[0] = (void *)CopyReqs;
2286  ArgsBlob[1] = (void *)CopyHostKernel;
2287  ArgsBlob[2] = (void *)CopyNDRDesc;
2288 
2289  void **NextArg = ArgsBlob.data() + 3;
2290 
2291  if (MQueue->is_host()) {
2292  for (ArgDesc &Arg : HostTask->MArgs) {
2293  assert(Arg.MType == kernel_param_kind_t::kind_accessor);
2294 
2295  Requirement *Req = (Requirement *)(Arg.MPtr);
2296  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2297 
2298  *NextArg = AllocaCmd->getMemAllocation();
2299  NextArg++;
2300  }
2301 
2302  if (!RawEvents.empty()) {
2303  // Assuming that the events are for devices of the same plugin.
2304  const detail::plugin &Plugin = EventImpls[0]->getPlugin();
2305  Plugin.call<PiApiKind::piEventsWait>(RawEvents.size(), &RawEvents[0]);
2306  }
2307  DispatchNativeKernel((void *)ArgsBlob.data());
2308 
2309  return PI_SUCCESS;
2310  }
2311 
2312  std::vector<pi_mem> Buffers;
2313  // piEnqueueNativeKernel requires an additional array of pointers into the
2314  // args blob; the values those pointers point to are replaced with the
2315  // actual memory pointers before the user function is executed.
2316  std::vector<void *> MemLocs;
2317 
2318  for (ArgDesc &Arg : HostTask->MArgs) {
2319  assert(Arg.MType == kernel_param_kind_t::kind_accessor);
2320 
2321  Requirement *Req = (Requirement *)(Arg.MPtr);
2322  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2323  pi_mem MemArg = (pi_mem)AllocaCmd->getMemAllocation();
2324 
2325  Buffers.push_back(MemArg);
2326  MemLocs.push_back(NextArg);
2327  NextArg++;
2328  }
2329  const detail::plugin &Plugin = MQueue->getPlugin();
2330  pi_result Error = Plugin.call_nocheck<PiApiKind::piEnqueueNativeKernel>(
2331  MQueue->getHandleRef(), DispatchNativeKernel, (void *)ArgsBlob.data(),
2332  ArgsBlob.size() * sizeof(ArgsBlob[0]), Buffers.size(), Buffers.data(),
2333  const_cast<const void **>(MemLocs.data()), RawEvents.size(),
2334  RawEvents.empty() ? nullptr : RawEvents.data(), Event);
2335 
2336  switch (Error) {
2337  case PI_ERROR_INVALID_OPERATION:
2338  throw sycl::runtime_error(
2339  "Device doesn't support run_on_host_intel tasks.", Error);
2340  case PI_SUCCESS:
2341  return Error;
2342  default:
2343  throw sycl::runtime_error("Enqueueing run_on_host_intel task has failed.",
2344  Error);
2345  }
2346  }
2347  case CG::CGTYPE::Kernel: {
2348  CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
2349 
2350  NDRDescT &NDRDesc = ExecKernel->MNDRDesc;
2351  std::vector<ArgDesc> &Args = ExecKernel->MArgs;
2352 
2353  if (MQueue->is_host() || (MQueue->getPlugin().getBackend() ==
2354  backend::ext_intel_esimd_emulator)) {
2355  for (ArgDesc &Arg : Args)
2356  if (kernel_param_kind_t::kind_accessor == Arg.MType) {
2357  Requirement *Req = (Requirement *)(Arg.MPtr);
2358  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2359  Req->MData = AllocaCmd->getMemAllocation();
2360  }
2361  if (!RawEvents.empty()) {
2362  // Assuming that the events are for devices of the same plugin.
2363  const detail::plugin &Plugin = EventImpls[0]->getPlugin();
2364  Plugin.call<PiApiKind::piEventsWait>(RawEvents.size(), &RawEvents[0]);
2365  }
2366 
2367  if (MQueue->is_host()) {
2368  ExecKernel->MHostKernel->call(NDRDesc,
2369  getEvent()->getHostProfilingInfo());
2370  } else {
2371  assert(MQueue->getPlugin().getBackend() ==
2372  backend::ext_intel_esimd_emulator);
2373 
2374  MQueue->getPlugin().call<PiApiKind::piEnqueueKernelLaunch>(
2375  nullptr,
2376  reinterpret_cast<pi_kernel>(ExecKernel->MHostKernel->getPtr()),
2377  NDRDesc.Dims, &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0],
2378  &NDRDesc.LocalSize[0], 0, nullptr, nullptr);
2379  }
2380 
2381  return PI_SUCCESS;
2382  }
2383 
2384  auto getMemAllocationFunc = [this](Requirement *Req) {
2385  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2386  return AllocaCmd->getMemAllocation();
2387  };
2388 
2389  const std::shared_ptr<detail::kernel_impl> &SyclKernel =
2390  ExecKernel->MSyclKernel;
2391  const std::string &KernelName = ExecKernel->MKernelName;
2392  const detail::OSModuleHandle &OSModuleHandle = ExecKernel->MOSModuleHandle;
2393 
2394  if (!Event) {
2395  // The kernel only uses assert if it is a non-interop one
2396  bool KernelUsesAssert = !(SyclKernel && SyclKernel->isInterop()) &&
2397  ProgramManager::getInstance().kernelUsesAssert(
2398  OSModuleHandle, KernelName);
2399  if (KernelUsesAssert) {
2400  Event = &MEvent->getHandleRef();
2401  }
2402  }
2403 
2404  return enqueueImpKernel(
2405  MQueue, NDRDesc, Args, ExecKernel->getKernelBundle(), SyclKernel,
2406  KernelName, OSModuleHandle, RawEvents, Event, getMemAllocationFunc);
2407  }
2408  case CG::CGTYPE::CopyUSM: {
2409  CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
2410  MemoryManager::copy_usm(Copy->getSrc(), MQueue, Copy->getLength(),
2411  Copy->getDst(), std::move(RawEvents), Event);
2412 
2413  return PI_SUCCESS;
2414  }
2415  case CG::CGTYPE::FillUSM: {
2416  CGFillUSM *Fill = (CGFillUSM *)MCommandGroup.get();
2417  MemoryManager::fill_usm(Fill->getDst(), MQueue, Fill->getLength(),
2418  Fill->getFill(), std::move(RawEvents), Event);
2419 
2420  return PI_SUCCESS;
2421  }
2422  case CG::CGTYPE::PrefetchUSM: {
2423  CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get();
2424  MemoryManager::prefetch_usm(Prefetch->getDst(), MQueue,
2425  Prefetch->getLength(), std::move(RawEvents),
2426  Event);
2427 
2428  return PI_SUCCESS;
2429  }
2430  case CG::CGTYPE::AdviseUSM: {
2431  CGAdviseUSM *Advise = (CGAdviseUSM *)MCommandGroup.get();
2432  MemoryManager::advise_usm(Advise->getDst(), MQueue, Advise->getLength(),
2433  Advise->getAdvice(), std::move(RawEvents), Event);
2434 
2435  return PI_SUCCESS;
2436  }
2437  case CG::CGTYPE::CodeplayInteropTask: {
2438  const detail::plugin &Plugin = MQueue->getPlugin();
2439  CGInteropTask *ExecInterop = (CGInteropTask *)MCommandGroup.get();
2440  // Wait for dependencies to complete before dispatching work on the host
2441  // TODO: Use a callback to dispatch the interop task instead of waiting for
2442  // the event
2443  if (!RawEvents.empty()) {
2444  Plugin.call<PiApiKind::piEventsWait>(RawEvents.size(), &RawEvents[0]);
2445  }
2446  std::vector<interop_handler::ReqToMem> ReqMemObjs;
2447  // Extract the Mem Objects for all Requirements, to ensure they are
2448  // available if a user asks for them inside the interop task scope
2449  const auto &HandlerReq = ExecInterop->MRequirements;
2450  std::for_each(
2451  std::begin(HandlerReq), std::end(HandlerReq), [&](Requirement *Req) {
2452  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2453  auto MemArg = reinterpret_cast<pi_mem>(AllocaCmd->getMemAllocation());
2454  interop_handler::ReqToMem ReqToMem = std::make_pair(Req, MemArg);
2455  ReqMemObjs.emplace_back(ReqToMem);
2456  });
2457 
2458  std::sort(std::begin(ReqMemObjs), std::end(ReqMemObjs));
2459  interop_handler InteropHandler(std::move(ReqMemObjs), MQueue);
2460  ExecInterop->MInteropTask->call(InteropHandler);
2461  Plugin.call<PiApiKind::piEnqueueEventsWait>(MQueue->getHandleRef(), 0,
2462  nullptr, Event);
2463 
2464  return PI_SUCCESS;
2465  }
2466  case CG::CGTYPE::CodeplayHostTask: {
2467  CGHostTask *HostTask = static_cast<CGHostTask *>(MCommandGroup.get());
2468 
2469  for (ArgDesc &Arg : HostTask->MArgs) {
2470  switch (Arg.MType) {
2471  case kernel_param_kind_t::kind_accessor: {
2472  Requirement *Req = static_cast<Requirement *>(Arg.MPtr);
2473  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2474 
2475  Req->MData = AllocaCmd->getMemAllocation();
2476  break;
2477  }
2478  default:
2479  throw runtime_error("Unsupported arg type", PI_ERROR_INVALID_VALUE);
2480  }
2481  }
2482 
2483  std::vector<interop_handle::ReqToMem> ReqToMem;
2484 
2485  if (HostTask->MHostTask->isInteropTask()) {
2486  // Extract the Mem Objects for all Requirements, to ensure they are
2487  // available if a user asks for them inside the interop task scope
2488  const std::vector<Requirement *> &HandlerReq = HostTask->MRequirements;
2489  auto ReqToMemConv = [&ReqToMem, HostTask](Requirement *Req) {
2490  const std::vector<AllocaCommandBase *> &AllocaCmds =
2491  Req->MSYCLMemObj->MRecord->MAllocaCommands;
2492 
2493  for (AllocaCommandBase *AllocaCmd : AllocaCmds)
2494  if (HostTask->MQueue->getContextImplPtr() ==
2495  AllocaCmd->getQueue()->getContextImplPtr()) {
2496  auto MemArg =
2497  reinterpret_cast<pi_mem>(AllocaCmd->getMemAllocation());
2498  ReqToMem.emplace_back(std::make_pair(Req, MemArg));
2499 
2500  return;
2501  }
2502 
2503  assert(false &&
2504  "Can't get memory object due to no allocation available");
2505 
2506  throw runtime_error(
2507  "Can't get memory object due to no allocation available",
2508  PI_ERROR_INVALID_MEM_OBJECT);
2509  };
2510  std::for_each(std::begin(HandlerReq), std::end(HandlerReq), ReqToMemConv);
2511  std::sort(std::begin(ReqToMem), std::end(ReqToMem));
2512  }
2513 
2514  MQueue->getThreadPool().submit<DispatchHostTask>(
2515  DispatchHostTask(this, std::move(ReqToMem)));
2516 
2517  MShouldCompleteEventIfPossible = false;
2518 
2519  return PI_SUCCESS;
2520  }
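// Host tasks reach the case above from user code along these lines (a
// user-level sketch; the interop variant receives an interop_handle backed
// by the ReqToMem pairs collected above):
//   Q.submit([&](sycl::handler &CGH) {
//     sycl::accessor Acc{Buf, CGH};
//     CGH.host_task([=](sycl::interop_handle IH) {
//       auto NativeMem = IH.get_native_mem<sycl::backend::opencl>(Acc);
//       // ... use NativeMem with the native API ...
//     });
//   });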
2521  case CG::CGTYPE::Barrier: {
2522  if (MQueue->getDeviceImplPtr()->is_host()) {
2523  // NOP for host device.
2524  return PI_SUCCESS;
2525  }
2526  const detail::plugin &Plugin = MQueue->getPlugin();
2527  Plugin.call<PiApiKind::piEnqueueEventsWaitWithBarrier>(
2528  MQueue->getHandleRef(), 0, nullptr, Event);
2529 
2530  return PI_SUCCESS;
2531  }
2532  case CG::CGTYPE::BarrierWaitlist: {
2533  CGBarrier *Barrier = static_cast<CGBarrier *>(MCommandGroup.get());
2534  std::vector<detail::EventImplPtr> Events = Barrier->MEventsWaitWithBarrier;
2535  std::vector<RT::PiEvent> PiEvents = getPiEvents(Events);
2536  if (MQueue->getDeviceImplPtr()->is_host() || PiEvents.empty()) {
2537  // NOP for host device.
2538  // If Events is empty, then the barrier has no effect.
2539  return PI_SUCCESS;
2540  }
2541  const detail::plugin &Plugin = MQueue->getPlugin();
2542  Plugin.call<PiApiKind::piEnqueueEventsWaitWithBarrier>(
2543  MQueue->getHandleRef(), PiEvents.size(), &PiEvents[0], Event);
2544 
2545  return PI_SUCCESS;
2546  }
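// The two barrier cases above are reached through the DPC++ queue barrier
// extension, sketched here:
//   sycl::event E1 = Q.ext_oneapi_submit_barrier();     // Barrier
//   sycl::event E2 = Q.ext_oneapi_submit_barrier({E1}); // BarrierWaitlist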
2547  case CG::CGTYPE::None:
2548  throw runtime_error("CG type not implemented.", PI_ERROR_INVALID_OPERATION);
2549  }
2550  return PI_ERROR_INVALID_OPERATION;
2551 }
2552 
2553 bool ExecCGCommand::producesPiEvent() const {
2554  return MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask;
2555 }
2556 
2557 bool ExecCGCommand::supportsPostEnqueueCleanup() const {
2558  // TODO enable cleaning up host task commands and kernels with streams after
2559  // enqueue
2560  return Command::supportsPostEnqueueCleanup() &&
2561  (MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask) &&
2562  (MCommandGroup->getType() != CG::CGTYPE::Kernel ||
2563  (!static_cast<CGExecKernel *>(MCommandGroup.get())->hasStreams() &&
2564  !static_cast<CGExecKernel *>(MCommandGroup.get())
2565  ->hasAuxiliaryResources()));
2566 }
2567 
2568 } // namespace detail
2569 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
2570 } // namespace sycl