DPC++ Runtime
Runtime libraries for oneAPI DPC++
commands.cpp
Go to the documentation of this file.
1 //===----------- commands.cpp - SYCL commands -------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
10 
14 #include <CL/sycl/detail/cl.h>
17 #include <CL/sycl/program.hpp>
18 #include <CL/sycl/sampler.hpp>
19 #include <detail/context_impl.hpp>
20 #include <detail/event_impl.hpp>
22 #include <detail/kernel_impl.hpp>
23 #include <detail/kernel_info.hpp>
24 #include <detail/program_impl.hpp>
26 #include <detail/queue_impl.hpp>
27 #include <detail/sampler_impl.hpp>
30 #include <detail/stream_impl.hpp>
31 #include <detail/xpti_registry.hpp>
32 
33 #include <cassert>
34 #include <optional>
35 #include <string>
36 #include <vector>
37 
38 #ifdef __has_include
39 #if __has_include(<cxxabi.h>)
40 #define __SYCL_ENABLE_GNU_DEMANGLING
41 #include <cstdlib>
42 #include <cxxabi.h>
43 #include <memory>
44 #endif
45 #endif
46 
47 #ifdef XPTI_ENABLE_INSTRUMENTATION
48 #include "xpti/xpti_trace_framework.hpp"
49 #include <detail/xpti_registry.hpp>
50 #endif
51 
53 namespace sycl {
54 namespace detail {
55 
56 #ifdef XPTI_ENABLE_INSTRUMENTATION
57 // Global graph for the application
58 extern xpti::trace_event_data_t *GSYCLGraphEvent;
59 #endif
60 
61 #ifdef __SYCL_ENABLE_GNU_DEMANGLING
/// RAII wrapper for the malloc'd C string returned by abi::__cxa_demangle;
/// frees it when the handle goes out of scope.
struct DemangleHandle {
  char *p;
  DemangleHandle(char *ptr) : p(ptr) {}
  // Owns a raw malloc'd pointer: copying would lead to a double free, so
  // copies are explicitly disallowed.
  DemangleHandle(const DemangleHandle &) = delete;
  DemangleHandle &operator=(const DemangleHandle &) = delete;
  ~DemangleHandle() { std::free(p); }
};
67 static std::string demangleKernelName(std::string Name) {
68  int Status = -1; // some arbitrary value to eliminate the compiler warning
69  DemangleHandle result(abi::__cxa_demangle(Name.c_str(), NULL, NULL, &Status));
70  return (Status == 0) ? result.p : Name;
71 }
72 #else
// Fallback when GNU demangling support is unavailable: the (possibly
// mangled) name is returned as-is.
static std::string demangleKernelName(std::string Name) {
  return Name;
}
74 #endif
75 
76 static std::string deviceToString(device Device) {
77  if (Device.is_host())
78  return "HOST";
79  else if (Device.is_cpu())
80  return "CPU";
81  else if (Device.is_gpu())
82  return "GPU";
83  else if (Device.is_accelerator())
84  return "ACCELERATOR";
85  else
86  return "UNKNOWN";
87 }
88 
// NOTE(review): the first signature line of this helper was dropped by the
// doxygen extraction (only the trailing parameters are visible). Judging from
// the visible body it invokes Func on each kernel argument with the index the
// argument gets after eliminated arguments are compacted out — confirm
// against the original commands.cpp.
    const ProgramManager::KernelArgMask &EliminatedArgMask,
    std::vector<ArgDesc> &Args,
    std::function<void(detail::ArgDesc &Arg, int NextTrueIndex)> Func) {
  // Fast path: no elimination mask — every argument keeps its own index.
  if (EliminatedArgMask.empty()) {
    for (ArgDesc &Arg : Args) {
      Func(Arg, Arg.MIndex);
    }
  } else {
    // TODO this is not necessary as long as we can guarantee that the
    // arguments are already sorted (e. g. handle the sorting in handler
    // if necessary due to set_arg(...) usage).
    std::sort(Args.begin(), Args.end(), [](const ArgDesc &A, const ArgDesc &B) {
      return A.MIndex < B.MIndex;
    });
    // LastIndex: last original index examined; NextTrueIndex: compacted index
    // assigned to the next non-eliminated argument.
    int LastIndex = -1;
    size_t NextTrueIndex = 0;

    for (ArgDesc &Arg : Args) {
      // Handle potential gaps in set arguments (e. g. if some of them are
      // set on the user side).
      for (int Idx = LastIndex + 1; Idx < Arg.MIndex; ++Idx)
        if (!EliminatedArgMask[Idx])
          ++NextTrueIndex;
      LastIndex = Arg.MIndex;

      // Arguments flagged in the mask were eliminated and are skipped.
      if (EliminatedArgMask[Arg.MIndex])
        continue;

      Func(Arg, NextTrueIndex);
      ++NextTrueIndex;
    }
  }
}
123 
124 #ifdef XPTI_ENABLE_INSTRUMENTATION
125 static size_t deviceToID(const device &Device) {
126  if (Device.is_host())
127  return 0;
128  else
129  return reinterpret_cast<size_t>(getSyclObjImpl(Device)->getHandleRef());
130 }
131 #endif
132 
133 static std::string accessModeToString(access::mode Mode) {
134  switch (Mode) {
135  case access::mode::read:
136  return "read";
137  case access::mode::write:
138  return "write";
139  case access::mode::read_write:
140  return "read_write";
141  case access::mode::discard_write:
142  return "discard_write";
143  case access::mode::discard_read_write:
144  return "discard_read_write";
145  default:
146  return "unknown";
147  }
148 }
149 
150 #ifdef XPTI_ENABLE_INSTRUMENTATION
151 // Using the command group type to create node types for the asynchronous task
152 // graph modeling
153 static std::string commandToNodeType(Command::CommandType Type) {
154  switch (Type) {
155  case Command::CommandType::RUN_CG:
156  return "command_group_node";
157  case Command::CommandType::COPY_MEMORY:
158  return "memory_transfer_node";
159  case Command::CommandType::ALLOCA:
160  return "memory_allocation_node";
161  case Command::CommandType::ALLOCA_SUB_BUF:
162  return "sub_buffer_creation_node";
163  case Command::CommandType::RELEASE:
164  return "memory_deallocation_node";
165  case Command::CommandType::MAP_MEM_OBJ:
166  return "memory_transfer_node";
167  case Command::CommandType::UNMAP_MEM_OBJ:
168  return "memory_transfer_node";
169  case Command::CommandType::UPDATE_REQUIREMENT:
170  return "host_acc_create_buffer_lock_node";
171  case Command::CommandType::EMPTY_TASK:
172  return "host_acc_destroy_buffer_release_node";
173  default:
174  return "unknown_node";
175  }
176 }
177 
178 // Using the names being generated and the string are subject to change to
179 // something more meaningful to end-users as this will be visible in analysis
180 // tools that subscribe to this data
181 static std::string commandToName(Command::CommandType Type) {
182  switch (Type) {
183  case Command::CommandType::RUN_CG:
184  return "Command Group Action";
185  case Command::CommandType::COPY_MEMORY:
186  return "Memory Transfer (Copy)";
187  case Command::CommandType::ALLOCA:
188  return "Memory Allocation";
189  case Command::CommandType::ALLOCA_SUB_BUF:
190  return "Sub Buffer Creation";
191  case Command::CommandType::RELEASE:
192  return "Memory Deallocation";
193  case Command::CommandType::MAP_MEM_OBJ:
194  return "Memory Transfer (Map)";
195  case Command::CommandType::UNMAP_MEM_OBJ:
196  return "Memory Transfer (Unmap)";
197  case Command::CommandType::UPDATE_REQUIREMENT:
198  return "Host Accessor Creation/Buffer Lock";
199  case Command::CommandType::EMPTY_TASK:
200  return "Host Accessor Destruction/Buffer Lock Release";
201  default:
202  return "Unknown Action";
203  }
204 }
205 #endif
206 
207 static std::vector<RT::PiEvent>
208 getPiEvents(const std::vector<EventImplPtr> &EventImpls) {
209  std::vector<RT::PiEvent> RetPiEvents;
210  for (auto &EventImpl : EventImpls) {
211  if (EventImpl->getHandleRef() != nullptr)
212  RetPiEvents.push_back(EventImpl->getHandleRef());
213  }
214 
215  return RetPiEvents;
216 }
217 
218 static void flushCrossQueueDeps(const std::vector<EventImplPtr> &EventImpls,
219  const QueueImplPtr &Queue) {
220  for (auto &EventImpl : EventImpls) {
221  EventImpl->flushIfNeeded(Queue);
222  }
223 }
224 
// NOTE(review): fragment of the DispatchHostTask functor — the class header
// line and the first line of the constructor signature were dropped by the
// doxygen extraction; confirm against the original commands.cpp.
  // Command whose host task this functor runs; also supplies the dependency
  // event lists consulted in waitForEvents().
  ExecCGCommand *MThisCmd;
  // Requirement-to-memory mapping handed to the interop_handle for interop
  // host tasks.
  std::vector<interop_handle::ReqToMem> MReqToMem;

  // Waits for all device dependency events (grouped per plugin, since a
  // single piEventsWait call may only take events of one plugin) and then for
  // all host dependency events. Returns PI_SUCCESS, or the first error code,
  // after reporting the failure on the host-task's queue.
  pi_result waitForEvents() const {
    std::map<const detail::plugin *, std::vector<EventImplPtr>>
        RequiredEventsPerPlugin;

    for (const EventImplPtr &Event : MThisCmd->MPreparedDepsEvents) {
      const detail::plugin &Plugin = Event->getPlugin();
      RequiredEventsPerPlugin[&Plugin].push_back(Event);
    }

    // wait for dependency device events
    // FIXME Current implementation of waiting for events will make the thread
    // 'sleep' until all of dependency events are complete. We need a bit more
    // sophisticated waiting mechanism to allow to utilize this thread for any
    // other available job and resume once all required events are ready.
    for (auto &PluginWithEvents : RequiredEventsPerPlugin) {
      std::vector<RT::PiEvent> RawEvents = getPiEvents(PluginWithEvents.second);
      try {
        PluginWithEvents.first->call<PiApiKind::piEventsWait>(RawEvents.size(),
                                                              RawEvents.data());
      } catch (const sycl::exception &E) {
        // Forward the failure to the queue's async handler and abort waiting.
        CGHostTask &HostTask = static_cast<CGHostTask &>(MThisCmd->getCG());
        HostTask.MQueue->reportAsyncException(std::current_exception());
        return (pi_result)E.get_cl_code();
      } catch (...) {
        CGHostTask &HostTask = static_cast<CGHostTask &>(MThisCmd->getCG());
        HostTask.MQueue->reportAsyncException(std::current_exception());
        return PI_ERROR_UNKNOWN;
      }
    }

    // Wait for dependency host events.
    // Host events can't throw exceptions so don't try to catch it.
    for (const EventImplPtr &Event : MThisCmd->MPreparedHostDepsEvents) {
      Event->waitInternal();
    }

    return PI_SUCCESS;
  }

public:
  // NOTE(review): the leading line of this constructor signature (presumably
  // taking the ExecCGCommand pointer `ThisCmd`) is missing from this view.
                   std::vector<interop_handle::ReqToMem> ReqToMem)
      : MThisCmd{ThisCmd}, MReqToMem(std::move(ReqToMem)) {}

  // Entry point invoked on the host-task thread: waits for dependencies,
  // calls the user lambda (with an interop_handle if it is an interop task),
  // then completes this command's event and unblocks the associated empty
  // command so that dependent commands become enqueueable.
  void operator()() const {
    assert(MThisCmd->getCG().getType() == CG::CGTYPE::CodeplayHostTask);

    CGHostTask &HostTask = static_cast<CGHostTask &>(MThisCmd->getCG());

    pi_result WaitResult = waitForEvents();
    if (WaitResult != PI_SUCCESS) {
      std::exception_ptr EPtr = std::make_exception_ptr(sycl::runtime_error(
          std::string("Couldn't wait for host-task's dependencies"),
          WaitResult));
      HostTask.MQueue->reportAsyncException(EPtr);

      // reset host-task's lambda and quit
      HostTask.MHostTask.reset();
      return;
    }

    try {
      // we're ready to call the user-defined lambda now
      if (HostTask.MHostTask->isInteropTask()) {
        interop_handle IH{MReqToMem, HostTask.MQueue,
                          HostTask.MQueue->getDeviceImplPtr(),
                          HostTask.MQueue->getContextImplPtr()};

        HostTask.MHostTask->call(IH);
      } else
        HostTask.MHostTask->call();
    } catch (...) {
      // User-code exceptions become async exceptions on the queue.
      HostTask.MQueue->reportAsyncException(std::current_exception());
    }

    HostTask.MHostTask.reset();

    // unblock user empty command here
    EmptyCommand *EmptyCmd = MThisCmd->MEmptyCmd;
    assert(EmptyCmd && "No empty command found");

    // Completing command's event along with unblocking enqueue readiness of
    // empty command may lead to quick deallocation of MThisCmd by some cleanup
    // process. Thus we'll copy deps prior to completing of event and unblocking
    // of empty command.
    // Also, it's possible to have record deallocated prior to enqueue process.
    // Thus we employ read-lock of graph.
    std::vector<Command *> ToCleanUp;
    Scheduler &Sched = Scheduler::getInstance();
    {
      Scheduler::ReadLockT Lock(Sched.MGraphLock);

      std::vector<DepDesc> Deps = MThisCmd->MDeps;

      // update self-event status
      MThisCmd->MEvent->setComplete();

      EmptyCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueReady;

      for (const DepDesc &Dep : Deps)
        Scheduler::enqueueLeavesOfReqUnlocked(Dep.MDepRequirement, ToCleanUp);
    }
    Sched.cleanupCommands(ToCleanUp);
  }
};
334 
335 void Command::waitForPreparedHostEvents() const {
336  for (const EventImplPtr &HostEvent : MPreparedHostDepsEvents)
337  HostEvent->waitInternal();
338 }
339 
// NOTE(review): fragment — the first signature line (taking the QueueImplPtr)
// and, near the end, the backend call line that precedes the argument list
// (presumably a Plugin.call<...> enqueue-wait) were dropped by the doxygen
// extraction. Confirm against the original commands.cpp.
    std::vector<EventImplPtr> &EventImpls,
    RT::PiEvent &Event) {

  if (!EventImpls.empty()) {
    if (Queue->is_host()) {
      // Host queue can wait for events from different contexts, i.e. it may
      // contain events with different contexts in its MPreparedDepsEvents.
      // OpenCL 2.1 spec says that clWaitForEvents will return
      // CL_INVALID_CONTEXT if events specified in the list do not belong to
      // the same context. Thus we split all the events into per-context map.
      // An example. We have two queues for the same CPU device: Q1, Q2. Thus
      // we will have two different contexts for the same CPU device: C1, C2.
      // Also we have default host queue. This queue is accessible via
      // Scheduler. Now, let's assume we have three different events: E1(C1),
      // E2(C1), E3(C2). Also, we have an EmptyCommand which is to be executed
      // on host queue. The command's MPreparedDepsEvents will contain all three
      // events (E1, E2, E3). Now, if piEventsWait is called for all three
      // events we'll experience failure with CL_INVALID_CONTEXT 'cause these
      // events refer to different contexts.
      std::map<context_impl *, std::vector<EventImplPtr>>
          RequiredEventsPerContext;

      for (const EventImplPtr &Event : EventImpls) {
        ContextImplPtr Context = Event->getContextImpl();
        assert(Context.get() &&
               "Only non-host events are expected to be waited for here");
        RequiredEventsPerContext[Context.get()].push_back(Event);
      }

      // One piEventsWait call per context, as required by the backend.
      for (auto &CtxWithEvents : RequiredEventsPerContext) {
        std::vector<RT::PiEvent> RawEvents = getPiEvents(CtxWithEvents.second);
        CtxWithEvents.first->getPlugin().call<PiApiKind::piEventsWait>(
            RawEvents.size(), RawEvents.data());
      }
    } else {
#ifndef NDEBUG
      for (const EventImplPtr &Event : EventImpls)
        assert(Event->getContextImpl().get() &&
               "Only non-host events are expected to be waited for here");
#endif

      std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);
      flushCrossQueueDeps(EventImpls, getWorkerQueue());
      const detail::plugin &Plugin = Queue->getPlugin();
      // NOTE(review): the Plugin.call<...>( line that these arguments belong
      // to is missing from this view.
          Queue->getHandleRef(), RawEvents.size(), &RawEvents[0], &Event);
    }
  }
}
390 
// Constructs a command bound to the given queue: creates the command's event,
// wires the event back to this command, and registers the XPTI trace stream
// when instrumentation is enabled.
// NOTE(review): one or more statements in the constructor body were dropped
// by the doxygen extraction (internal numbering is non-contiguous here) —
// confirm against the original commands.cpp.
Command::Command(CommandType Type, QueueImplPtr Queue)
    : MQueue(std::move(Queue)),
      MEvent(std::make_shared<detail::event_impl>(MQueue)),
      MPreparedDepsEvents(MEvent->getPreparedDepsEvents()),
      MPreparedHostDepsEvents(MEvent->getPreparedHostDepsEvents()),
      MType(Type) {
  MEvent->setCommand(this);
  MEvent->setContextImpl(MQueue->getContextImplPtr());

#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiTraceEnabled())
    return;
  // Obtain the stream ID so all commands can emit traces to that stream
  MStreamID = xptiRegisterStream(SYCL_STREAM_NAME);
#endif
}
412 
// NOTE(review): residue of a function body (presumably Command's destructor)
// whose signature and XPTI cleanup statement were dropped by the doxygen
// extraction — confirm against the original commands.cpp.
#ifdef XPTI_ENABLE_INSTRUMENTATION
#endif
}
418 
430  Command *Cmd, void *ObjAddr, bool IsCommand,
431  std::optional<access::mode> AccMode) {
432 #ifdef XPTI_ENABLE_INSTRUMENTATION
433  // Bail early if either the source or the target node for the given dependency
434  // is undefined or NULL
435  if (!(xptiTraceEnabled() && MTraceEvent && Cmd && Cmd->MTraceEvent))
436  return;
437 
438  // If all the information we need for creating an edge event is available,
439  // then go ahead with creating it; if not, bail early!
440  xpti::utils::StringHelper SH;
441  std::string AddressStr = SH.addressAsString<void *>(ObjAddr);
442  std::string Prefix = AccMode ? accessModeToString(AccMode.value()) : "Event";
443  std::string TypeString = SH.nameWithAddressString(Prefix, AddressStr);
444  // Create an edge with the dependent buffer address for which a command
445  // object has been created as one of the properties of the edge
446  xpti::payload_t Payload(TypeString.c_str(), MAddress);
447  uint64_t EdgeInstanceNo;
448  xpti_td *EdgeEvent =
449  xptiMakeEvent(TypeString.c_str(), &Payload, xpti::trace_graph_event,
450  xpti_at::active, &EdgeInstanceNo);
451  if (EdgeEvent) {
452  xpti_td *SrcEvent = static_cast<xpti_td *>(Cmd->MTraceEvent);
453  xpti_td *TgtEvent = static_cast<xpti_td *>(MTraceEvent);
454  EdgeEvent->source_id = SrcEvent->unique_id;
455  EdgeEvent->target_id = TgtEvent->unique_id;
456  if (IsCommand) {
457  xpti::addMetadata(EdgeEvent, "access_mode",
458  static_cast<int>(AccMode.value()));
459  xpti::addMetadata(EdgeEvent, "memory_object",
460  reinterpret_cast<size_t>(ObjAddr));
461  } else {
462  xpti::addMetadata(EdgeEvent, "event", reinterpret_cast<size_t>(ObjAddr));
463  }
464  xptiNotifySubscribers(MStreamID, xpti::trace_edge_create,
465  detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
466  nullptr);
467  }
468  // General comment - None of these are serious errors as the instrumentation
469  // layer MUST be tolerant of errors. If we need to let the end user know, we
470  // throw exceptions in the future
471 #endif
472 }
473 
// NOTE(review): the leading signature line(s) of this member function were
// dropped by the doxygen extraction. The body emits an XPTI edge event for a
// dependency expressed as a raw backend event, creating a virtual graph node
// for events that have no associated command.
    RT::PiEvent &PiEventAddr) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  // If we have failed to create an event to represent the Command, then we
  // cannot emit an edge event. Bail early!
  if (!(xptiTraceEnabled() && MTraceEvent))
    return;

  if (Cmd && Cmd->MTraceEvent) {
    // If the event is associated with a command, we use this command's trace
    // event as the source of edge, hence modeling the control flow
    emitEdgeEventForCommandDependence(Cmd, (void *)PiEventAddr, false);
    return;
  }
  if (PiEventAddr) {
    xpti::utils::StringHelper SH;
    std::string AddressStr = SH.addressAsString<RT::PiEvent>(PiEventAddr);
    // This is the case when it is a OCL event enqueued by the user or another
    // event is registered by the runtime as a dependency The dependency on
    // this occasion is an OCL event; so we build a virtual node in the graph
    // with the event as the metadata for the node
    std::string NodeName = SH.nameWithAddressString("virtual_node", AddressStr);
    // Node name is "virtual_node[<event_addr>]"
    xpti::payload_t VNPayload(NodeName.c_str(), MAddress);
    uint64_t VNodeInstanceNo;
    xpti_td *NodeEvent =
        xptiMakeEvent(NodeName.c_str(), &VNPayload, xpti::trace_graph_event,
                      xpti_at::active, &VNodeInstanceNo);
    // Emit the virtual node first
    xpti::addMetadata(NodeEvent, "kernel_name", NodeName);
    xptiNotifySubscribers(MStreamID, xpti::trace_node_create,
                          detail::GSYCLGraphEvent, NodeEvent, VNodeInstanceNo,
                          nullptr);
    // Create a new event for the edge
    std::string EdgeName = SH.nameWithAddressString("Event", AddressStr);
    xpti::payload_t EdgePayload(EdgeName.c_str(), MAddress);
    uint64_t EdgeInstanceNo;
    xpti_td *EdgeEvent =
        xptiMakeEvent(EdgeName.c_str(), &EdgePayload, xpti::trace_graph_event,
                      xpti_at::active, &EdgeInstanceNo);
    if (EdgeEvent && NodeEvent) {
      // Source node represents the event and this event needs to be completed
      // before target node can execute
      xpti_td *TgtEvent = static_cast<xpti_td *>(MTraceEvent);
      EdgeEvent->source_id = NodeEvent->unique_id;
      EdgeEvent->target_id = TgtEvent->unique_id;
      xpti::addMetadata(EdgeEvent, "event",
                        reinterpret_cast<size_t>(PiEventAddr));
      xptiNotifySubscribers(MStreamID, xpti::trace_edge_create,
                            detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
                            nullptr);
    }
    return;
  }
#endif
}
534 
// Creates (or re-finds) the XPTI trace event describing this command and
// records its instance number; returns that instance number (0 when tracing
// is disabled).
// NOTE(review): one statement between the early return and the setup section
// was dropped by the doxygen extraction — confirm against the original file.
uint64_t Command::makeTraceEventProlog(void *MAddress) {
  uint64_t CommandInstanceNo = 0;
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiTraceEnabled())
    return CommandInstanceNo;

  // Setup the member variables with information needed for event notification
  MCommandNodeType = commandToNodeType(MType);
  MCommandName = commandToName(MType);
  xpti::utils::StringHelper SH;
  MAddressString = SH.addressAsString<void *>(MAddress);
  std::string CommandString =
      SH.nameWithAddressString(MCommandName, MAddressString);

  xpti::payload_t p(CommandString.c_str(), MAddress);
  xpti_td *CmdTraceEvent =
      xptiMakeEvent(CommandString.c_str(), &p, xpti::trace_graph_event,
                    xpti_at::active, &CommandInstanceNo);
  MInstanceID = CommandInstanceNo;
  if (CmdTraceEvent) {
    MTraceEvent = (void *)CmdTraceEvent;
    // If we are seeing this event again, then the instance ID will be greater
    // than 1; in this case, we must skip sending a notification to create a
    // node as this node has already been created. We return this value so the
    // epilog method can be called selectively.
    MFirstInstance = (CommandInstanceNo == 1);
  }
#endif
  return CommandInstanceNo;
}
566 
568 #ifdef XPTI_ENABLE_INSTRUMENTATION
569  if (!(xptiTraceEnabled() && MTraceEvent))
570  return;
572  xptiNotifySubscribers(MStreamID, xpti::trace_node_create,
573  detail::GSYCLGraphEvent,
574  static_cast<xpti_td *>(MTraceEvent), MInstanceID,
575  static_cast<const void *>(MCommandNodeType.c_str()));
576 #endif
577 }
578 
// NOTE(review): fragment — the first signature line (taking the dependency
// event and DepDesc) and the declaration of `GB` used below were dropped by
// the doxygen extraction. The body decides whether a dependency event goes to
// the host-wait list, to the PI event list, or needs a connection command to
// bridge two different contexts.
    std::vector<Command *> &ToCleanUp) {
  const QueueImplPtr &WorkerQueue = getWorkerQueue();
  const ContextImplPtr &WorkerContext = WorkerQueue->getContextImplPtr();

  // 1. Async work is not supported for host device.
  // 2. Some types of commands do not produce PI events after they are enqueued
  // (e.g. alloca). Note that we can't check the pi event to make that
  // distinction since the command might still be unenqueued at this point.
  bool PiEventExpected =
      !DepEvent->is_host() || getType() == CommandType::HOST_TASK;
  if (auto *DepCmd = static_cast<Command *>(DepEvent->getCommand()))
    PiEventExpected &= DepCmd->producesPiEvent();

  if (!PiEventExpected) {
    // call to waitInternal() is in waitForPreparedHostEvents() as it's called
    // from enqueue process functions
    MPreparedHostDepsEvents.push_back(DepEvent);
    return nullptr;
  }

  Command *ConnectionCmd = nullptr;

  // Do not add redundant event dependencies for in-order queues.
  if (Dep.MDepCommand && Dep.MDepCommand->getWorkerQueue() == WorkerQueue &&
      WorkerQueue->has_property<property::queue::in_order>() &&
      getType() != CommandType::HOST_TASK)
    return nullptr;

  ContextImplPtr DepEventContext = DepEvent->getContextImpl();
  // If contexts don't match we'll connect them using host task
  if (DepEventContext != WorkerContext && !WorkerContext->is_host()) {
    // NOTE(review): the line declaring `GB` (presumably the scheduler's graph
    // builder) is missing from this view.
    ConnectionCmd = GB.connectDepEvent(this, DepEvent, Dep, ToCleanUp);
  } else
    MPreparedDepsEvents.push_back(std::move(DepEvent));

  return ConnectionCmd;
}
618 
// NOTE(review): body fragment — the signature line (presumably
// Command::getWorkerContext) was dropped by the doxygen extraction. It simply
// returns the context of this command's queue.
  return MQueue->getContextImplPtr();
}
622 
623 const QueueImplPtr &Command::getWorkerQueue() const { return MQueue; }
624 
625 bool Command::producesPiEvent() const { return true; }
626 
// NOTE(review): body fragment — the signature line (presumably
// Command::supportsPostEnqueueCleanup) was dropped by the doxygen extraction.
// A command is eligible only if it is still connected to the graph.
  // Isolated commands are cleaned up separately
  return !MUsers.empty() || !MDeps.empty();
}
631 
// Registers a requirement-based dependency on this command. May create a
// connection command when the dependency lives in a different context; in
// that case the dependency edge is modeled through the connection command and
// NewDep is not recorded directly.
// NOTE(review): the call line that the two argument lines inside the XPTI
// block belong to (presumably emitEdgeEventForCommandDependence) was dropped
// by the doxygen extraction.
Command *Command::addDep(DepDesc NewDep, std::vector<Command *> &ToCleanUp) {
  Command *ConnectionCmd = nullptr;

  if (NewDep.MDepCommand) {
    ConnectionCmd =
        processDepEvent(NewDep.MDepCommand->getEvent(), NewDep, ToCleanUp);
  }
  // ConnectionCmd insertion builds the following dependency structure:
  // this -> emptyCmd (for ConnectionCmd) -> ConnectionCmd -> NewDep
  // that means that this and NewDep are already dependent
  if (!ConnectionCmd) {
    MDeps.push_back(NewDep);
    if (NewDep.MDepCommand)
      NewDep.MDepCommand->addUser(this);
  }

#ifdef XPTI_ENABLE_INSTRUMENTATION
      (void *)NewDep.MDepRequirement->MSYCLMemObj,
      true, NewDep.MDepRequirement->MAccessMode);
#endif

  return ConnectionCmd;
}
656 
// NOTE(review): the first signature line of this overload (taking the
// dependency event) was dropped by the doxygen extraction. It records an
// event-only dependency (no requirement), emitting an edge trace event first.
    std::vector<Command *> &ToCleanUp) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
  // We need this for just the instrumentation, so guarding it will prevent
  // unused variable warnings when instrumentation is turned off
  Command *Cmd = (Command *)Event->getCommand();
  RT::PiEvent &PiEventAddr = Event->getHandleRef();
  // Now make an edge for the dependent event
  emitEdgeEventForEventDependence(Cmd, PiEventAddr);
#endif

  // No requirement is involved, hence the all-null DepDesc.
  return processDepEvent(std::move(Event), DepDesc{nullptr, nullptr, nullptr},
                         ToCleanUp);
}
671 
673 #ifdef XPTI_ENABLE_INSTRUMENTATION
674  if (!(xptiTraceEnabled() && MTraceEvent && PiEventAddr))
675  return;
676  // Asynchronous call, so send a signal with the event information as
677  // user_data
678  xptiNotifySubscribers(MStreamID, xpti::trace_signal, detail::GSYCLGraphEvent,
679  static_cast<xpti_td *>(MTraceEvent), MInstanceID,
680  (void *)PiEventAddr);
681 #endif
682 }
683 
684 void Command::emitInstrumentation(uint16_t Type, const char *Txt) {
685 #ifdef XPTI_ENABLE_INSTRUMENTATION
686  if (!(xptiTraceEnabled() && MTraceEvent))
687  return;
688  // Trace event notifier that emits a Type event
689  xptiNotifySubscribers(MStreamID, Type, detail::GSYCLGraphEvent,
690  static_cast<xpti_td *>(MTraceEvent), MInstanceID,
691  static_cast<const void *>(Txt));
692 #endif
693 }
694 
// Attempts to enqueue this command. Returns true on success; on failure the
// reason is written to EnqueueResult. With BLOCKING mode it waits for blocked
// commands (unless SYCL_THROW_ON_BLOCK is set, in which case it throws).
// NOTE(review): numerous lines of this function were dropped by the doxygen
// extraction (the "already enqueued"/"blocked"/"failed" guard conditions, the
// blocking wait loop header, parts of the success branch, and the final
// return) — the remaining lines are kept verbatim; confirm against the
// original commands.cpp before relying on this listing.
bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking,
                      std::vector<Command *> &ToCleanUp) {
  // Exit if already enqueued
    return true;

  // If the command is blocked from enqueueing
    // Exit if enqueue type is not blocking
    if (!Blocking) {
      EnqueueResult = EnqueueResultT(EnqueueResultT::SyclEnqueueBlocked, this);
      return false;
    }
    // SYCL_THROW_ON_BLOCK turns silent waiting on blocked commands into an
    // exception, which helps diagnose scheduling deadlocks.
    static bool ThrowOnBlock = getenv("SYCL_THROW_ON_BLOCK") != nullptr;
    if (ThrowOnBlock)
      throw sycl::runtime_error(
          std::string("Waiting for blocked command. Block reason: ") +
          std::string(getBlockReason()),

#ifdef XPTI_ENABLE_INSTRUMENTATION
    // Scoped trace event notifier that emits a barrier begin and barrier end
    // event, which models the barrier while enqueuing along with the blocked
    // reason, as determined by the scheduler
    std::string Info = "enqueue.barrier[";
    Info += std::string(getBlockReason()) + "]";
    emitInstrumentation(xpti::trace_barrier_begin, Info.c_str());
#endif

    // Wait if blocking
      ;
#ifdef XPTI_ENABLE_INSTRUMENTATION
    emitInstrumentation(xpti::trace_barrier_end, Info.c_str());
#endif
  }

  // Serialize concurrent enqueue attempts on this command.
  std::lock_guard<std::mutex> Lock(MEnqueueMtx);

  // Exit if the command is already enqueued
    return true;

#ifdef XPTI_ENABLE_INSTRUMENTATION
  emitInstrumentation(xpti::trace_task_begin, nullptr);
#endif

    EnqueueResult = EnqueueResultT(EnqueueResultT::SyclEnqueueFailed, this);
    return false;
  }

  // Command status set to "failed" beforehand, so this command
  // has already been marked as "failed" if enqueueImp throws an exception.
  // This will avoid execution of the same failed command twice.
  cl_int Res = enqueueImp();

  if (CL_SUCCESS != Res)
    EnqueueResult =
  else {
      (MEvent->is_host() || MEvent->getHandleRef() == nullptr))
      MEvent->setComplete();

    // Consider the command is successfully enqueued if return code is
    // CL_SUCCESS
      assert(!MPostEnqueueCleanup);
      MPostEnqueueCleanup = true;
      ToCleanUp.push_back(this);
    }
  }

  // Emit this correlation signal before the task end
  emitEnqueuedEventSignal(MEvent->getHandleRef());
#ifdef XPTI_ENABLE_INSTRUMENTATION
  emitInstrumentation(xpti::trace_task_end, nullptr);
#endif
}
780 
781 void Command::resolveReleaseDependencies(std::set<Command *> &DepList) {
782 #ifdef XPTI_ENABLE_INSTRUMENTATION
783  assert(MType == CommandType::RELEASE && "Expected release command");
784  if (!MTraceEvent)
785  return;
786  // The current command is the target node for all dependencies as the source
787  // nodes have to be completed first before the current node can begin to
788  // execute; these edges model control flow
789  xpti_td *TgtTraceEvent = static_cast<xpti_td *>(MTraceEvent);
790  // We have all the Commands that must be completed before the release command
791  // can be enqueued; here we'll find the command that is an Alloca with the
792  // same SYCLMemObject address and create a dependency line (edge) between them
793  // in our sematic modeling
794  for (auto &Item : DepList) {
795  if (Item->MTraceEvent && Item->MAddress == MAddress) {
796  xpti::utils::StringHelper SH;
797  std::string AddressStr = SH.addressAsString<void *>(MAddress);
798  std::string TypeString =
799  "Edge:" + SH.nameWithAddressString(commandToName(MType), AddressStr);
800 
801  // Create an edge with the dependent buffer address being one of the
802  // properties of the edge
803  xpti::payload_t p(TypeString.c_str(), MAddress);
804  uint64_t EdgeInstanceNo;
805  xpti_td *EdgeEvent =
806  xptiMakeEvent(TypeString.c_str(), &p, xpti::trace_graph_event,
807  xpti_at::active, &EdgeInstanceNo);
808  if (EdgeEvent) {
809  xpti_td *SrcTraceEvent = static_cast<xpti_td *>(Item->MTraceEvent);
810  EdgeEvent->target_id = TgtTraceEvent->unique_id;
811  EdgeEvent->source_id = SrcTraceEvent->unique_id;
812  xpti::addMetadata(EdgeEvent, "memory_object",
813  reinterpret_cast<size_t>(MAddress));
814  xptiNotifySubscribers(MStreamID, xpti::trace_edge_create,
815  detail::GSYCLGraphEvent, EdgeEvent,
816  EdgeInstanceNo, nullptr);
817  }
818  }
819  }
820 #endif
821 }
822 
// Returns a human-readable explanation of why this command is blocked.
// NOTE(review): the `case` labels of this switch were dropped by the doxygen
// extraction — each return below belongs to a distinct block-reason case;
// confirm against the original commands.cpp.
const char *Command::getBlockReason() const {
  switch (MBlockReason) {
    return "A Buffer is locked by the host accessor";
    return "Blocked by host task";
  }

  return "Unknown block reason";
}
833 
// NOTE(review): fragment of the AllocaCommandBase constructor — its first
// signature line and the final body statements were dropped by the doxygen
// extraction. The visible initializer list records the linked allocation (if
// any), marks this alloca as leader when there is no linked allocation, and
// pre-creates the paired release command.
    Requirement Req,
    AllocaCommandBase *LinkedAllocaCmd)
    : Command(Type, Queue), MLinkedAllocaCmd(LinkedAllocaCmd),
      MIsLeaderAlloca(nullptr == LinkedAllocaCmd), MRequirement(std::move(Req)),
      MReleaseCmd(Queue, this) {
}
843 
845 #ifdef XPTI_ENABLE_INSTRUMENTATION
846  if (!xptiTraceEnabled())
847  return;
848  // Create a payload with the command name and an event using this payload to
849  // emit a node_create
852  // Set the relevant meta data properties for this command
853  if (MTraceEvent && MFirstInstance) {
854  xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
855  xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
856  xpti::addMetadata(TE, "sycl_device_type",
857  deviceToString(MQueue->get_device()));
858  xpti::addMetadata(TE, "sycl_device_name",
859  getSyclObjImpl(MQueue->get_device())->getDeviceName());
860  xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
861  }
862 #endif
863 }
864 
865 bool AllocaCommandBase::producesPiEvent() const { return false; }
866 
867 bool AllocaCommandBase::supportsPostEnqueueCleanup() const { return false; }
868 
// NOTE(review): fragment of the AllocaCommand constructor — its first
// signature line and the call emitting instrumentation data were dropped by
// the doxygen extraction.
    bool InitFromUserData,
    AllocaCommandBase *LinkedAllocaCmd)
    : AllocaCommandBase(CommandType::ALLOCA, std::move(Queue), std::move(Req),
                        LinkedAllocaCmd),
      MInitFromUserData(InitFromUserData) {
  // Node event must be created before the dependent edge is added to this node,
  // so this call must be before the addDep() call.
  // "Nothing to depend on"
  std::vector<Command *> ToCleanUp;
  Command *ConnectionCmd =
      addDep(DepDesc(nullptr, getRequirement(), this), ToCleanUp);
  // A self-dependency can never require a connection command or cleanup.
  assert(ConnectionCmd == nullptr);
  assert(ToCleanUp.empty());
  (void)ConnectionCmd;
}
886 
888 #ifdef XPTI_ENABLE_INSTRUMENTATION
889  if (!xptiTraceEnabled())
890  return;
891 
892  // Only if it is the first event, we emit a node create event
893  if (MFirstInstance) {
895  }
896 #endif
897 }
898 
// Performs the allocation (or, for a non-leader alloca on the host queue,
// just waits for dependencies since the linked device allocation is reused).
// NOTE(review): two statement lines were dropped by the doxygen extraction —
// one before copying the dependency events (presumably a host-dependency
// wait) and the assignment target of the final allocation call; confirm
// against the original commands.cpp.
cl_int AllocaCommand::enqueueImp() {
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;

  RT::PiEvent &Event = MEvent->getHandleRef();

  void *HostPtr = nullptr;
  if (!MIsLeaderAlloca) {

    if (MQueue->is_host()) {
      // Do not need to make allocation if we have a linked device allocation
      Command::waitForEvents(MQueue, EventImpls, Event);

      return CL_SUCCESS;
    }
    // Reuse the leader's allocation as the host pointer for this one.
    HostPtr = MLinkedAllocaCmd->getMemAllocation();
  }
  // TODO: Check if it is correct to use std::move on stack variable and
  // delete it RawEvents below.
      MQueue->getContextImplPtr(), getSYCLMemObj(), MInitFromUserData, HostPtr,
      std::move(EventImpls), Event);

  return CL_SUCCESS;
}
924 
925 void AllocaCommand::printDot(std::ostream &Stream) const {
926  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\"";
927 
928  Stream << "ID = " << this << "\\n";
929  Stream << "ALLOCA ON " << deviceToString(MQueue->get_device()) << "\\n";
930  Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n";
931  Stream << " Link : " << this->MLinkedAllocaCmd << "\\n";
932  Stream << "\"];" << std::endl;
933 
934 
935  for (const auto &Dep : MDeps) {
936  if (Dep.MDepCommand == nullptr)
937  continue;
938  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
939  << " [ label = \"Access mode: "
940  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
941  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
942  << std::endl;
943  }
944 }
945 
// NOTE(review): the first line of this constructor's signature — presumably
// `AllocaSubBufCommand::AllocaSubBufCommand(QueueImplPtr Queue, Requirement
// Req, ...` — is missing from this extraction; parameters continue below.
                                         AllocaCommandBase *ParentAlloca,
                                         std::vector<Command *> &ToEnqueue,
                                         std::vector<Command *> &ToCleanUp)
    : AllocaCommandBase(CommandType::ALLOCA_SUB_BUF, std::move(Queue),
                        std::move(Req),
                        /*LinkedAllocaCmd*/ nullptr),
      MParentAlloca(ParentAlloca) {
  // Node event must be created before the dependent edge
  // is added to this node, so this call must be before
  // the addDep() call.
  // A sub-buffer is carved out of its parent allocation, so depend on the
  // parent; a non-null result is an auxiliary connection command that the
  // caller must also enqueue.
  Command *ConnectionCmd = addDep(
      DepDesc(MParentAlloca, getRequirement(), MParentAlloca), ToCleanUp);
  if (ConnectionCmd)
    ToEnqueue.push_back(ConnectionCmd);
}
963 
// NOTE(review): this extraction dropped the function's signature line and at
// least one interior statement; confirm against the upstream file.
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiTraceEnabled())
    return;

  // Only if it is the first event, we emit a node create event and any meta
  // data that is available for the command
  if (MFirstInstance) {
    xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
    // Sub-buffer-specific metadata: byte offset into the parent plus the
    // first two entries of the access range.
    xpti::addMetadata(TE, "offset", this->MRequirement.MOffsetInBytes);
    xpti::addMetadata(TE, "access_range_start",
                      this->MRequirement.MAccessRange[0]);
    xpti::addMetadata(TE, "access_range_end",
                      this->MRequirement.MAccessRange[1]);
  }
#endif
}
982 
// NOTE(review): the signature line is missing from this extraction —
// presumably `void *AllocaSubBufCommand::getMemAllocation() const`.
  // In some cases parent`s memory allocation might change (e.g., after
  // map/unmap operations). If parent`s memory allocation changes, sub-buffer
  // memory allocation should be changed as well.
  if (MQueue->is_host()) {
    // On host, recompute the pointer from the parent allocation every call.
    // NOTE(review): the offset term of this expression was dropped by the
    // extraction; only the base-pointer part survives.
    return static_cast<void *>(
        static_cast<char *>(MParentAlloca->getMemAllocation()) +
  }
  return MMemAllocation;
}
994 
// Allocates the sub-buffer from the parent allocation.
cl_int AllocaSubBufCommand::enqueueImp() {
  // NOTE(review): several statement-leading lines were dropped by the
  // extraction (a leading wait, the assignment/callee of the allocation call,
  // and the callee of the trailing update call); only trailing argument lines
  // survive below.
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
  RT::PiEvent &Event = MEvent->getHandleRef();

      MQueue->getContextImplPtr(), MParentAlloca->getMemAllocation(),
      MRequirement.MAccessRange, std::move(EventImpls), Event);

      MMemAllocation);
  return CL_SUCCESS;
}
1009 
1010 void AllocaSubBufCommand::printDot(std::ostream &Stream) const {
1011  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\"";
1012 
1013  Stream << "ID = " << this << "\\n";
1014  Stream << "ALLOCA SUB BUF ON " << deviceToString(MQueue->get_device())
1015  << "\\n";
1016  Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n";
1017  Stream << " Offset : " << this->MRequirement.MOffsetInBytes << "\\n";
1018  Stream << " Access range : " << this->MRequirement.MAccessRange[0] << "\\n";
1019  Stream << "\"];" << std::endl;
1020 
1021  for (const auto &Dep : MDeps) {
1022  if (Dep.MDepCommand == nullptr)
1023  continue;
1024  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1025  << " [ label = \"Access mode: "
1026  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1027  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1028  << std::endl;
1029  }
1030 }
1031 
// NOTE(review): the constructor's signature line is missing — presumably
// `ReleaseCommand::ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase
// *AllocaCmd)` — and a trailing statement also appears dropped.
    : Command(CommandType::RELEASE, std::move(Queue)), MAllocaCmd(AllocaCmd) {
}
1036 
// NOTE(review): this extraction dropped the function's signature line and at
// least one interior statement; confirm against the upstream file.
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiTraceEnabled())
    return;
  // Create a payload with the command name and an event using this payload to
  // emit a node_create
  MAddress = MAllocaCmd->getSYCLMemObj();

  if (MFirstInstance) {
    xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
    // Describe the device this release targets and what kind of allocation
    // command is being released.
    xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_type",
                      deviceToString(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_name",
                      getSyclObjImpl(MQueue->get_device())->getDeviceName());
    xpti::addMetadata(TE, "allocation_type",
                      commandToName(MAllocaCmd->getType()));
  }
#endif
}
1059 
// Releases the allocation owned by MAllocaCmd, honouring the special rules
// for linked host/device allocation pairs (possibly unmapping first).
cl_int ReleaseCommand::enqueueImp() {
  // NOTE(review): a leading statement appears dropped here by the extraction.
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
  std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);
  bool SkipRelease = false;

  // On host side we only allocate memory for full buffers.
  // Thus, deallocating sub buffers leads to double memory freeing.
  SkipRelease |= MQueue->is_host() && MAllocaCmd->getType() == ALLOCA_SUB_BUF;

  const bool CurAllocaIsHost = MAllocaCmd->getQueue()->is_host();
  bool NeedUnmap = false;
  if (MAllocaCmd->MLinkedAllocaCmd) {

    // When releasing one of the "linked" allocations special rules take place:
    // 1. Device allocation should always be released.
    // 2. Host allocation should be released if host allocation is "leader".
    // 3. Device alloca in the pair should be in active state in order to be
    //    correctly released.


    // There is no actual memory allocation if a host alloca command is created
    // being linked to a device allocation.
    SkipRelease |= CurAllocaIsHost && !MAllocaCmd->MIsLeaderAlloca;

    // Holds exactly when the host allocation of the linked pair is currently
    // the active one (see rule 3).
    NeedUnmap |= CurAllocaIsHost == MAllocaCmd->MIsActive;
  }

  if (NeedUnmap) {
    // Unmap on the non-host (device) queue of the pair.
    const QueueImplPtr &Queue = CurAllocaIsHost
                                    ? MAllocaCmd->MLinkedAllocaCmd->getQueue()
                                    : MAllocaCmd->getQueue();

    EventImplPtr UnmapEventImpl(new event_impl(Queue));
    UnmapEventImpl->setContextImpl(Queue->getContextImplPtr());
    RT::PiEvent &UnmapEvent = UnmapEventImpl->getHandleRef();

    void *Src = CurAllocaIsHost
                    ? MAllocaCmd->getMemAllocation()
                    : MAllocaCmd->MLinkedAllocaCmd->getMemAllocation();

    void *Dst = !CurAllocaIsHost
                    ? MAllocaCmd->getMemAllocation()
                    : MAllocaCmd->MLinkedAllocaCmd->getMemAllocation();

    MemoryManager::unmap(MAllocaCmd->getSYCLMemObj(), Dst, Queue, Src,
                         RawEvents, UnmapEvent);

    std::swap(MAllocaCmd->MIsActive, MAllocaCmd->MLinkedAllocaCmd->MIsActive);
    // The release below must depend on the unmap only, not the original deps.
    EventImpls.clear();
    EventImpls.push_back(UnmapEventImpl);
  }
  RT::PiEvent &Event = MEvent->getHandleRef();
  if (SkipRelease)
    Command::waitForEvents(MQueue, EventImpls, Event);
  else {
    // NOTE(review): the callee name of this release call was dropped by the
    // extraction; only its arguments survive.
        MQueue->getContextImplPtr(), MAllocaCmd->getSYCLMemObj(),
        MAllocaCmd->getMemAllocation(), std::move(EventImpls), Event);
  }
  return CL_SUCCESS;
}
1122 
1123 void ReleaseCommand::printDot(std::ostream &Stream) const {
1124  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FF827A\", label=\"";
1125 
1126  Stream << "ID = " << this << " ; ";
1127  Stream << "RELEASE ON " << deviceToString(MQueue->get_device()) << "\\n";
1128  Stream << " Alloca : " << MAllocaCmd << "\\n";
1129  Stream << " MemObj : " << MAllocaCmd->getSYCLMemObj() << "\\n";
1130  Stream << "\"];" << std::endl;
1131 
1132  for (const auto &Dep : MDeps) {
1133  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1134  << " [ label = \"Access mode: "
1135  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1136  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1137  << std::endl;
1138  }
1139 }
1140 
1141 bool ReleaseCommand::producesPiEvent() const { return false; }
1142 
1143 bool ReleaseCommand::supportsPostEnqueueCleanup() const { return false; }
1144 
// NOTE(review): the first line of this constructor's signature — presumably
// `MapMemObject::MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement
// Req, ...` — is missing from this extraction, and a trailing statement also
// appears dropped.
                           void **DstPtr, QueueImplPtr Queue,
                           access::mode MapMode)
    : Command(CommandType::MAP_MEM_OBJ, std::move(Queue)),
      MSrcAllocaCmd(SrcAllocaCmd), MSrcReq(std::move(Req)), MDstPtr(DstPtr),
      MMapMode(MapMode) {
}
1153 
// NOTE(review): this extraction dropped the function's signature line and at
// least one interior statement; confirm against the upstream file.
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiTraceEnabled())
    return;
  // Create a payload with the command name and an event using this payload to
  // emit a node_create
  MAddress = MSrcAllocaCmd->getSYCLMemObj();

  if (MFirstInstance) {
    xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
    // Describe the device this map runs on and the memory object it maps.
    xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_type",
                      deviceToString(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_name",
                      getSyclObjImpl(MQueue->get_device())->getDeviceName());
    xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
  }
#endif
}
1175 
// Maps the source allocation and publishes the resulting pointer to MDstPtr.
cl_int MapMemObject::enqueueImp() {
  // NOTE(review): a leading statement appears dropped here by the extraction.
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
  std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);
  flushCrossQueueDeps(EventImpls, getWorkerQueue());

  RT::PiEvent &Event = MEvent->getHandleRef();
  *MDstPtr = MemoryManager::map(
      MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(), MQueue,
      MMapMode, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
      MSrcReq.MOffset, MSrcReq.MElemSize, std::move(RawEvents), Event);

  return CL_SUCCESS;
}
1190 
1191 void MapMemObject::printDot(std::ostream &Stream) const {
1192  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#77AFFF\", label=\"";
1193 
1194  Stream << "ID = " << this << " ; ";
1195  Stream << "MAP ON " << deviceToString(MQueue->get_device()) << "\\n";
1196 
1197  Stream << "\"];" << std::endl;
1198 
1199  for (const auto &Dep : MDeps) {
1200  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1201  << " [ label = \"Access mode: "
1202  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1203  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1204  << std::endl;
1205  }
1206 }
1207 
// NOTE(review): the first line of this constructor's signature — presumably
// `UnMapMemObject::UnMapMemObject(AllocaCommandBase *DstAllocaCmd, Requirement
// Req, ...` — is missing from this extraction, and a trailing statement also
// appears dropped.
                               void **SrcPtr, QueueImplPtr Queue)
    : Command(CommandType::UNMAP_MEM_OBJ, std::move(Queue)),
      MDstAllocaCmd(DstAllocaCmd), MDstReq(std::move(Req)), MSrcPtr(SrcPtr) {
}
1214 
// NOTE(review): this extraction dropped the function's signature line and at
// least one interior statement; confirm against the upstream file.
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiTraceEnabled())
    return;
  // Create a payload with the command name and an event using this payload to
  // emit a node_create
  MAddress = MDstAllocaCmd->getSYCLMemObj();

  if (MFirstInstance) {
    xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
    // Describe the device this unmap runs on and the memory object involved.
    xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_type",
                      deviceToString(MQueue->get_device()));
    xpti::addMetadata(TE, "sycl_device_name",
                      getSyclObjImpl(MQueue->get_device())->getDeviceName());
    xpti::addMetadata(TE, "memory_object", reinterpret_cast<size_t>(MAddress));
  }
#endif
}
1236 
// NOTE(review): the signature line is missing from this extraction —
// presumably `bool UnMapMemObject::producesPiEvent() const`.
  // TODO remove this workaround once the batching issue is addressed in Level
  // Zero plugin.
  // Consider the following scenario on Level Zero:
  // 1. Kernel A, which uses buffer A, is submitted to queue A.
  // 2. Kernel B, which uses buffer B, is submitted to queue B.
  // 3. queueA.wait().
  // 4. queueB.wait().
  // DPCPP runtime used to treat unmap/write commands for buffer A/B as host
  // dependencies (i.e. they were waited for prior to enqueueing any command
  // that's dependent on them). This allowed Level Zero plugin to detect that
  // each queue is idle on steps 1/2 and submit the command list right away.
  // This is no longer the case since we started passing these dependencies in
  // an event waitlist and Level Zero plugin attempts to batch these commands,
  // so the execution of kernel B starts only on step 4. This workaround
  // restores the old behavior in this case until this is resolved.
  return MQueue->getPlugin().getBackend() != backend::ext_oneapi_level_zero ||
         MEvent->getHandleRef() != nullptr;
}
1256 
// Unmaps *MSrcPtr from the destination allocation's memory object.
cl_int UnMapMemObject::enqueueImp() {
  // NOTE(review): a leading statement appears dropped here by the extraction.
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
  std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);
  flushCrossQueueDeps(EventImpls, getWorkerQueue());

  RT::PiEvent &Event = MEvent->getHandleRef();
  MemoryManager::unmap(MDstAllocaCmd->getSYCLMemObj(),
                       MDstAllocaCmd->getMemAllocation(), MQueue, *MSrcPtr,
                       std::move(RawEvents), Event);

  return CL_SUCCESS;
}
1270 
1271 void UnMapMemObject::printDot(std::ostream &Stream) const {
1272  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#EBC40F\", label=\"";
1273 
1274  Stream << "ID = " << this << " ; ";
1275  Stream << "UNMAP ON " << deviceToString(MQueue->get_device()) << "\\n";
1276 
1277  Stream << "\"];" << std::endl;
1278 
1279  for (const auto &Dep : MDeps) {
1280  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1281  << " [ label = \"Access mode: "
1282  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1283  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1284  << std::endl;
1285  }
1286 }
1287 
// NOTE(review): the first line of this constructor's signature — presumably
// `MemCpyCommand::MemCpyCommand(Requirement SrcReq,` — is missing from this
// extraction, and a trailing statement also appears dropped.
                             AllocaCommandBase *SrcAllocaCmd,
                             Requirement DstReq,
                             AllocaCommandBase *DstAllocaCmd,
                             QueueImplPtr SrcQueue, QueueImplPtr DstQueue)
    : Command(CommandType::COPY_MEMORY, std::move(DstQueue)),
      MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)),
      MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)),
      MDstAllocaCmd(DstAllocaCmd) {
  // A non-host source queue contributes its context to this command's event.
  if (!MSrcQueue->is_host())
    MEvent->setContextImpl(MSrcQueue->getContextImplPtr());

}
1302 
// NOTE(review): this extraction dropped the function's signature line and at
// least one interior statement; confirm against the upstream file.
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiTraceEnabled())
    return;
  // Create a payload with the command name and an event using this payload to
  // emit a node_create
  MAddress = MSrcAllocaCmd->getSYCLMemObj();

  if (MFirstInstance) {
    xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
    // Describe the destination device, the memory object, and the two devices
    // involved in the copy.
    xpti::addMetadata(CmdTraceEvent, "sycl_device",
                      deviceToID(MQueue->get_device()));
    xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
                      deviceToString(MQueue->get_device()));
    xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
                      getSyclObjImpl(MQueue->get_device())->getDeviceName());
    xpti::addMetadata(CmdTraceEvent, "memory_object",
                      reinterpret_cast<size_t>(MAddress));
    xpti::addMetadata(CmdTraceEvent, "copy_from",
                      reinterpret_cast<size_t>(
                          getSyclObjImpl(MSrcQueue->get_device()).get()));
    xpti::addMetadata(
        CmdTraceEvent, "copy_to",
        reinterpret_cast<size_t>(getSyclObjImpl(MQueue->get_device()).get()));
  }
#endif
}
1332 
// NOTE(review): the signature line of this context accessor is missing from
// this extraction; it returns the context of the worker queue below.
  return getWorkerQueue()->getContextImplPtr();
}
1336 
// NOTE(review): the signature line is missing from this extraction —
// presumably `QueueImplPtr MemCpyCommand::getWorkerQueue() const`.
  // The copy is performed on whichever side of the transfer is not the host.
  return MQueue->is_host() ? MSrcQueue : MQueue;
}
1340 
// NOTE(review): the signature line is missing from this extraction —
// presumably `bool MemCpyCommand::producesPiEvent() const`.
  // TODO remove this workaround once the batching issue is addressed in Level
  // Zero plugin.
  // Consider the following scenario on Level Zero:
  // 1. Kernel A, which uses buffer A, is submitted to queue A.
  // 2. Kernel B, which uses buffer B, is submitted to queue B.
  // 3. queueA.wait().
  // 4. queueB.wait().
  // DPCPP runtime used to treat unmap/write commands for buffer A/B as host
  // dependencies (i.e. they were waited for prior to enqueueing any command
  // that's dependent on them). This allowed Level Zero plugin to detect that
  // each queue is idle on steps 1/2 and submit the command list right away.
  // This is no longer the case since we started passing these dependencies in
  // an event waitlist and Level Zero plugin attempts to batch these commands,
  // so the execution of kernel B starts only on step 4. This workaround
  // restores the old behavior in this case until this is resolved.
  return MQueue->is_host() ||
         MQueue->getPlugin().getBackend() != backend::ext_oneapi_level_zero ||
         MEvent->getHandleRef() != nullptr;
}
1361 
// Copies between the source and destination allocations of this command.
cl_int MemCpyCommand::enqueueImp() {
  // NOTE(review): a leading statement appears dropped here, and the callee
  // name of the copy call below was also dropped by the extraction.
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;

  RT::PiEvent &Event = MEvent->getHandleRef();

  auto RawEvents = getPiEvents(EventImpls);
  flushCrossQueueDeps(EventImpls, getWorkerQueue());

      MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(),
      MSrcQueue, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
      MSrcReq.MOffset, MSrcReq.MElemSize, MDstAllocaCmd->getMemAllocation(),
      MQueue, MDstReq.MDims, MDstReq.MMemoryRange, MDstReq.MAccessRange,
      MDstReq.MOffset, MDstReq.MElemSize, std::move(RawEvents), Event);

  return CL_SUCCESS;
}
1380 
1381 void MemCpyCommand::printDot(std::ostream &Stream) const {
1382  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#C7EB15\" label=\"";
1383 
1384  Stream << "ID = " << this << " ; ";
1385  Stream << "MEMCPY ON " << deviceToString(MQueue->get_device()) << "\\n";
1386  Stream << "From: " << MSrcAllocaCmd << " is host: " << MSrcQueue->is_host()
1387  << "\\n";
1388  Stream << "To: " << MDstAllocaCmd << " is host: " << MQueue->is_host()
1389  << "\\n";
1390 
1391  Stream << "\"];" << std::endl;
1392 
1393  for (const auto &Dep : MDeps) {
1394  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1395  << " [ label = \"Access mode: "
1396  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1397  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1398  << std::endl;
1399  }
1400 }
1401 
1402 AllocaCommandBase *ExecCGCommand::getAllocaForReq(Requirement *Req) {
1403  for (const DepDesc &Dep : MDeps) {
1404  if (Dep.MDepRequirement == Req)
1405  return Dep.MAllocaCmd;
1406  }
1407  throw runtime_error("Alloca for command not found", PI_INVALID_OPERATION);
1408 }
1409 
1410 std::vector<StreamImplPtr> ExecCGCommand::getStreams() const {
1411  if (MCommandGroup->getType() == CG::Kernel)
1412  return ((CGExecKernel *)MCommandGroup.get())->getStreams();
1413  return {};
1414 }
1415 
// NOTE(review): the declarator line following this return type — presumably
// `ExecCGCommand::getAuxiliaryResources() {` — was dropped by the extraction.
std::vector<std::shared_ptr<const void>>
  // Only kernel command groups carry auxiliary resources.
  if (MCommandGroup->getType() == CG::Kernel)
    return ((CGExecKernel *)MCommandGroup.get())->getAuxiliaryResources();
  return {};
}
1422 
// NOTE(review): the signature line is missing from this extraction —
// presumably `void ExecCGCommand::clearStreams()`.
  if (MCommandGroup->getType() == CG::Kernel)
    ((CGExecKernel *)MCommandGroup.get())->clearStreams();
}
1427 
// NOTE(review): the signature line is missing from this extraction —
// presumably `void ExecCGCommand::clearAuxiliaryResources()`.
  if (MCommandGroup->getType() == CG::Kernel)
    ((CGExecKernel *)MCommandGroup.get())->clearAuxiliaryResources();
}
1432 
// Waits for dependencies, then publishes the source allocation's pointer to
// the host-side location MDstPtr.
cl_int UpdateHostRequirementCommand::enqueueImp() {
  // NOTE(review): a leading statement appears dropped here by the extraction.
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
  RT::PiEvent &Event = MEvent->getHandleRef();
  Command::waitForEvents(MQueue, EventImpls, Event);

  assert(MSrcAllocaCmd && "Expected valid alloca command");
  assert(MSrcAllocaCmd->getMemAllocation() && "Expected valid source pointer");
  assert(MDstPtr && "Expected valid target pointer");
  *MDstPtr = MSrcAllocaCmd->getMemAllocation();

  return CL_SUCCESS;
}
1446 
1447 void UpdateHostRequirementCommand::printDot(std::ostream &Stream) const {
1448  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#f1337f\", label=\"";
1449 
1450  Stream << "ID = " << this << "\\n";
1451  Stream << "UPDATE REQ ON " << deviceToString(MQueue->get_device()) << "\\n";
1452  bool IsReqOnBuffer =
1453  MDstReq.MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer;
1454  Stream << "TYPE: " << (IsReqOnBuffer ? "Buffer" : "Image") << "\\n";
1455  if (IsReqOnBuffer)
1456  Stream << "Is sub buffer: " << std::boolalpha << MDstReq.MIsSubBuffer
1457  << "\\n";
1458 
1459  Stream << "\"];" << std::endl;
1460 
1461  for (const auto &Dep : MDeps) {
1462  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1463  << " [ label = \"Access mode: "
1464  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1465  << "MemObj: " << Dep.MAllocaCmd->getSYCLMemObj() << " \" ]"
1466  << std::endl;
1467  }
1468 }
1469 
// NOTE(review): the first line of this constructor's signature — presumably
// `MemCpyCommandHost::MemCpyCommandHost(Requirement SrcReq,` — is missing
// from this extraction, and a trailing statement also appears dropped.
                                     AllocaCommandBase *SrcAllocaCmd,
                                     Requirement DstReq, void **DstPtr,
                                     QueueImplPtr SrcQueue,
                                     QueueImplPtr DstQueue)
    : Command(CommandType::COPY_MEMORY, std::move(DstQueue)),
      MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)),
      MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)), MDstPtr(DstPtr) {
  // A non-host source queue contributes its context to this command's event.
  if (!MSrcQueue->is_host())
    MEvent->setContextImpl(MSrcQueue->getContextImplPtr());

}
1483 
// NOTE(review): this extraction dropped the function's signature line and at
// least one interior statement; confirm against the upstream file.
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiTraceEnabled())
    return;
  // Create a payload with the command name and an event using this payload to
  // emit a node_create
  MAddress = MSrcAllocaCmd->getSYCLMemObj();

  if (MFirstInstance) {
    xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
    // Describe the destination device, the memory object, and the two devices
    // involved in the copy.
    xpti::addMetadata(CmdTraceEvent, "sycl_device",
                      deviceToID(MQueue->get_device()));
    xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
                      deviceToString(MQueue->get_device()));
    xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
                      getSyclObjImpl(MQueue->get_device())->getDeviceName());
    xpti::addMetadata(CmdTraceEvent, "memory_object",
                      reinterpret_cast<size_t>(MAddress));
    xpti::addMetadata(CmdTraceEvent, "copy_from",
                      reinterpret_cast<size_t>(
                          getSyclObjImpl(MSrcQueue->get_device()).get()));
    xpti::addMetadata(
        CmdTraceEvent, "copy_to",
        reinterpret_cast<size_t>(getSyclObjImpl(MQueue->get_device()).get()));
  }
#endif
}
1513 
// NOTE(review): the signature line of this context accessor is missing from
// this extraction; it returns the context of the worker queue below.
  return getWorkerQueue()->getContextImplPtr();
}
1517 
// NOTE(review): the signature line is missing from this extraction —
// presumably `QueueImplPtr MemCpyCommandHost::getWorkerQueue() const`.
  // The copy is performed on whichever side of the transfer is not the host.
  return MQueue->is_host() ? MSrcQueue : MQueue;
}
1521 
// Copies from the source allocation to the raw host pointer *MDstPtr.
cl_int MemCpyCommandHost::enqueueImp() {
  const QueueImplPtr &Queue = getWorkerQueue();
  // NOTE(review): several lines were dropped by the extraction below — a
  // leading statement, the discard-mode `if` condition, and the callee name
  // of the copy call; only their remnants survive.
  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
  std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);

  RT::PiEvent &Event = MEvent->getHandleRef();
  // Omit copying if mode is discard one.
  // TODO: Handle this at the graph building time by, for example, creating
  // empty node instead of memcpy.
    Command::waitForEvents(Queue, EventImpls, Event);

    return CL_SUCCESS;
  }

  flushCrossQueueDeps(EventImpls, getWorkerQueue());
      MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(),
      MSrcQueue, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
      MSrcReq.MOffset, MSrcReq.MElemSize, *MDstPtr, MQueue, MDstReq.MDims,
      MDstReq.MMemoryRange, MDstReq.MAccessRange, MDstReq.MOffset,
      MDstReq.MElemSize, std::move(RawEvents), Event);

  return CL_SUCCESS;
}
1549 
// NOTE(review): the constructor's signature line is missing — presumably
// `EmptyCommand::EmptyCommand(QueueImplPtr Queue)` — and a trailing statement
// also appears dropped.
    : Command(CommandType::EMPTY_TASK, std::move(Queue)) {
}
1554 
// An empty command does no work of its own; it only waits for its deps.
cl_int EmptyCommand::enqueueImp() {
  // NOTE(review): a leading statement appears dropped here by the extraction.
  waitForEvents(MQueue, MPreparedDepsEvents, MEvent->getHandleRef());

  return CL_SUCCESS;
}
1561 
// NOTE(review): the first signature line is missing from this extraction —
// presumably `void EmptyCommand::addRequirement(Command *DepCmd,
// AllocaCommandBase *AllocaCmd,`; the final parameter continues below.
                                  const Requirement *Req) {
  // Keep a copy of the requirement: the DepDesc below must reference memory
  // owned by this command, not the caller's.
  const Requirement &ReqRef = *Req;
  MRequirements.emplace_back(ReqRef);
  const Requirement *const StoredReq = &MRequirements.back();

  // EmptyCommand is always host one, so we believe that result of addDep is nil
  std::vector<Command *> ToCleanUp;
  Command *Cmd = addDep(DepDesc{DepCmd, StoredReq, AllocaCmd}, ToCleanUp);
  assert(Cmd == nullptr && "Conection command should be null for EmptyCommand");
  assert(ToCleanUp.empty() && "addDep should add a command for cleanup only if "
                              "there's a connection command");
  (void)Cmd;
}
1576 
// NOTE(review): this extraction dropped the function's signature line and at
// least one interior statement; confirm against the upstream file.
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiTraceEnabled())
    return;
  // Create a payload with the command name and an event using this payload to
  // emit a node_create
  // Nothing to describe if no requirement has been added yet.
  if (MRequirements.empty())
    return;

  Requirement &Req = *MRequirements.begin();

  MAddress = Req.MSYCLMemObj;

  if (MFirstInstance) {
    xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
    xpti::addMetadata(CmdTraceEvent, "sycl_device",
                      deviceToID(MQueue->get_device()));
    xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
                      deviceToString(MQueue->get_device()));
    xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
                      getSyclObjImpl(MQueue->get_device())->getDeviceName());
    xpti::addMetadata(CmdTraceEvent, "memory_object",
                      reinterpret_cast<size_t>(MAddress));
  }
#endif
}
1605 
1606 void EmptyCommand::printDot(std::ostream &Stream) const {
1607  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#8d8f29\", label=\"";
1608 
1609  Stream << "ID = " << this << "\\n";
1610  Stream << "EMPTY NODE"
1611  << "\\n";
1612 
1613  Stream << "\"];" << std::endl;
1614 
1615  for (const auto &Dep : MDeps) {
1616  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1617  << " [ label = \"Access mode: "
1618  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1619  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1620  << std::endl;
1621  }
1622 }
1623 
1624 bool EmptyCommand::producesPiEvent() const { return false; }
1625 
1626 void MemCpyCommandHost::printDot(std::ostream &Stream) const {
1627  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#B6A2EB\", label=\"";
1628 
1629  Stream << "ID = " << this << "\\n";
1630  Stream << "MEMCPY HOST ON " << deviceToString(MQueue->get_device()) << "\\n";
1631 
1632  Stream << "\"];" << std::endl;
1633 
1634  for (const auto &Dep : MDeps) {
1635  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1636  << " [ label = \"Access mode: "
1637  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1638  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1639  << std::endl;
1640  }
1641 }
1642 
// NOTE(review): the leading declarator line is missing — presumably
// `UpdateHostRequirementCommand::UpdateHostRequirementCommand(` — and a
// trailing statement also appears dropped.
    QueueImplPtr Queue, Requirement Req, AllocaCommandBase *SrcAllocaCmd,
    void **DstPtr)
    : Command(CommandType::UPDATE_REQUIREMENT, std::move(Queue)),
      MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(Req)), MDstPtr(DstPtr) {

}
1651 
// NOTE(review): this extraction dropped the function's signature line and at
// least one interior statement; confirm against the upstream file.
#ifdef XPTI_ENABLE_INSTRUMENTATION
  if (!xptiTraceEnabled())
    return;
  // Create a payload with the command name and an event using this payload to
  // emit a node_create
  MAddress = MSrcAllocaCmd->getSYCLMemObj();

  if (MFirstInstance) {
    xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
    xpti::addMetadata(CmdTraceEvent, "sycl_device",
                      deviceToID(MQueue->get_device()));
    xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
                      deviceToString(MQueue->get_device()));
    xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
                      getSyclObjImpl(MQueue->get_device())->getDeviceName());
    xpti::addMetadata(CmdTraceEvent, "memory_object",
                      reinterpret_cast<size_t>(MAddress));
  }
#endif
}
1675 
// Maps a command-group type to a human-readable name (used for trace labels).
// NOTE(review): several `case detail::CG::...:` label lines were dropped by
// the extraction between the surviving return statements below; confirm the
// full case list against the upstream file.
static std::string cgTypeToString(detail::CG::CGTYPE Type) {
  switch (Type) {
  case detail::CG::Kernel:
    return "Kernel";
    break;
    return "update_host";
    break;
  case detail::CG::Fill:
    return "fill";
    break;
    return "copy acc to acc";
    break;
    return "copy acc to ptr";
    break;
    return "copy ptr to acc";
    break;
  case detail::CG::CopyUSM:
    return "copy usm";
    break;
  case detail::CG::FillUSM:
    return "fill usm";
    break;
    return "prefetch usm";
    break;
    return "host task";
    break;
  default:
    return "unknown";
    break;
  }
}
1713 
// Takes ownership of the command group to execute. Host tasks record their
// submitted queue; commands whose events must outlive the enqueue (host
// tasks, kernels with streams or auxiliary resources) are flagged for
// cleanup after the wait.
ExecCGCommand::ExecCGCommand(std::unique_ptr<detail::CG> CommandGroup,
                             QueueImplPtr Queue)
    : Command(CommandType::RUN_CG, std::move(Queue)),
      MCommandGroup(std::move(CommandGroup)) {
  if (MCommandGroup->getType() == detail::CG::CodeplayHostTask) {
    MSubmittedQueue =
        static_cast<detail::CGHostTask *>(MCommandGroup.get())->MQueue;
    MEvent->setNeedsCleanupAfterWait(true);
  } else if (MCommandGroup->getType() == CG::CGTYPE::Kernel &&
             (static_cast<CGExecKernel *>(MCommandGroup.get())->hasStreams() ||
              static_cast<CGExecKernel *>(MCommandGroup.get())
                  ->hasAuxiliaryResources()))
    MEvent->setNeedsCleanupAfterWait(true);

  // NOTE(review): a trailing statement appears dropped here by the extraction.
}
1730 
1732 #ifdef XPTI_ENABLE_INSTRUMENTATION
1733  if (!xptiTraceEnabled())
1734  return;
1735  // Create a payload with the command name and an event using this payload to
1736  // emit a node_create
1737  bool HasSourceInfo = false;
1738  std::string KernelName;
1739  std::optional<bool> FromSource;
1740  switch (MCommandGroup->getType()) {
1741  case detail::CG::Kernel: {
1742  auto KernelCG =
1743  reinterpret_cast<detail::CGExecKernel *>(MCommandGroup.get());
1744 
1745  if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource()) {
1746  FromSource = true;
1747  pi_kernel KernelHandle = KernelCG->MSyclKernel->getHandleRef();
1748  MAddress = KernelHandle;
1749  KernelName = MCommandGroup->MFunctionName;
1750  } else {
1751  FromSource = false;
1752  KernelName = demangleKernelName(KernelCG->getKernelName());
1753  }
1754  } break;
1755  default:
1756  KernelName = cgTypeToString(MCommandGroup->getType());
1757  break;
1758  }
1759  std::string CommandType = commandToNodeType(MType);
1760  // Get source file, line number information from the CommandGroup object
1761  // and create payload using name, address, and source info
1762  //
1763  // On Windows, since the support for builtin functions is not available in
1764  // MSVC, the MFileName, MLine will be set to nullptr and "0" respectively.
1765  // Handle this condition explicitly here.
1766  xpti::payload_t Payload;
1767  if (!MCommandGroup->MFileName.empty()) {
1768  // File name has a valid string
1769  Payload =
1770  xpti::payload_t(KernelName.c_str(), MCommandGroup->MFileName.c_str(),
1771  MCommandGroup->MLine, MCommandGroup->MColumn, MAddress);
1772  HasSourceInfo = true;
1773  } else if (MAddress) {
1774  // We have a valid function name and an address
1775  Payload = xpti::payload_t(KernelName.c_str(), MAddress);
1776  } else {
1777  // In any case, we will have a valid function name and we'll use that to
1778  // create the hash
1779  Payload = xpti::payload_t(KernelName.c_str());
1780  }
1781 
1782  uint64_t CGKernelInstanceNo;
1783  // Create event using the payload
1784  xpti_td *CmdTraceEvent =
1785  xptiMakeEvent("ExecCG", &Payload, xpti::trace_graph_event,
1786  xpti::trace_activity_type_t::active, &CGKernelInstanceNo);
1787 
1788  if (CmdTraceEvent) {
1789  MInstanceID = CGKernelInstanceNo;
1790  MTraceEvent = (void *)CmdTraceEvent;
1791  // If we are seeing this event again, then the instance ID will be greater
1792  // than 1; in this case, we will skip sending a notification to create a
1793  // node as this node has already been created.
1794  if (CGKernelInstanceNo > 1)
1795  return;
1796 
1797  xpti::addMetadata(CmdTraceEvent, "sycl_device",
1798  deviceToID(MQueue->get_device()));
1799  xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
1800  deviceToString(MQueue->get_device()));
1801  xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
1802  getSyclObjImpl(MQueue->get_device())->getDeviceName());
1803  if (!KernelName.empty()) {
1804  xpti::addMetadata(CmdTraceEvent, "kernel_name", KernelName);
1805  }
1806  if (FromSource.has_value()) {
1807  xpti::addMetadata(CmdTraceEvent, "from_source", FromSource.value());
1808  }
1809  if (HasSourceInfo) {
1810  xpti::addMetadata(CmdTraceEvent, "sym_function_name", KernelName);
1811  xpti::addMetadata(CmdTraceEvent, "sym_source_file_name",
1812  MCommandGroup->MFileName);
1813  xpti::addMetadata(CmdTraceEvent, "sym_line_no", MCommandGroup->MLine);
1814  xpti::addMetadata(CmdTraceEvent, "sym_column_no", MCommandGroup->MColumn);
1815  }
1816 
1817  if (MCommandGroup->getType() == detail::CG::Kernel) {
1818  auto KernelCG =
1819  reinterpret_cast<detail::CGExecKernel *>(MCommandGroup.get());
1820  auto &NDRDesc = KernelCG->MNDRDesc;
1821  std::vector<ArgDesc> Args;
1822 
1823  auto FilterArgs = [&Args](detail::ArgDesc &Arg, int NextTrueIndex) {
1824  Args.push_back({Arg.MType, Arg.MPtr, Arg.MSize, NextTrueIndex});
1825  };
1826  RT::PiProgram Program = nullptr;
1827  RT::PiKernel Kernel = nullptr;
1828  std::mutex *KernelMutex = nullptr;
1829 
1830  std::shared_ptr<kernel_impl> SyclKernelImpl;
1831  std::shared_ptr<device_image_impl> DeviceImageImpl;
1832  auto KernelBundleImplPtr = KernelCG->getKernelBundle();
1833 
1834  // Use kernel_bundle if available unless it is interop.
1835  // Interop bundles can't be used in the first branch, because the kernels
1836  // in interop kernel bundles (if any) do not have kernel_id
1837  // and can therefore not be looked up, but since they are self-contained
1838  // they can simply be launched directly.
1839  if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) {
1840  kernel_id KernelID =
1842  KernelCG->MKernelName);
1843  kernel SyclKernel =
1844  KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr);
1845  Program = detail::getSyclObjImpl(SyclKernel)
1846  ->getDeviceImage()
1847  ->get_program_ref();
1848  } else if (nullptr != KernelCG->MSyclKernel) {
1849  auto SyclProg = detail::getSyclObjImpl(
1850  KernelCG->MSyclKernel->get_info<info::kernel::program>());
1851  Program = SyclProg->getHandleRef();
1852  } else {
1853  std::tie(Kernel, KernelMutex, Program) =
1855  KernelCG->MOSModuleHandle, MQueue->getContextImplPtr(),
1856  MQueue->getDeviceImplPtr(), KernelCG->MKernelName, nullptr);
1857  }
1858 
1859  ProgramManager::KernelArgMask EliminatedArgMask;
1860  if (nullptr == KernelCG->MSyclKernel ||
1861  !KernelCG->MSyclKernel->isCreatedFromSource()) {
1862  EliminatedArgMask =
1864  KernelCG->MOSModuleHandle, Program, KernelCG->MKernelName);
1865  }
1866 
1867  applyFuncOnFilteredArgs(EliminatedArgMask, KernelCG->MArgs, FilterArgs);
1868 
1869  xpti::offload_kernel_enqueue_data_t KernelData{
1870  {NDRDesc.GlobalSize[0], NDRDesc.GlobalSize[1], NDRDesc.GlobalSize[2]},
1871  {NDRDesc.LocalSize[0], NDRDesc.LocalSize[1], NDRDesc.LocalSize[2]},
1872  {NDRDesc.GlobalOffset[0], NDRDesc.GlobalOffset[1],
1873  NDRDesc.GlobalOffset[2]},
1874  Args.size()};
1875  xpti::addMetadata(CmdTraceEvent, "enqueue_kernel_data", KernelData);
1876  for (size_t i = 0; i < Args.size(); i++) {
1877  std::string Prefix("arg");
1878  xpti::offload_kernel_arg_data_t arg{(int)Args[i].MType, Args[i].MPtr,
1879  Args[i].MSize, Args[i].MIndex};
1880  xpti::addMetadata(CmdTraceEvent, Prefix + std::to_string(i), arg);
1881  }
1882  }
1883 
1884  xptiNotifySubscribers(MStreamID, xpti::trace_node_create,
1885  detail::GSYCLGraphEvent, CmdTraceEvent,
1886  CGKernelInstanceNo,
1887  static_cast<const void *>(CommandType.c_str()));
1888  }
1889 #endif
1890 }
1891 
1892 void ExecCGCommand::printDot(std::ostream &Stream) const {
1893  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\"";
1894 
1895  Stream << "ID = " << this << "\\n";
1896  Stream << "EXEC CG ON " << deviceToString(MQueue->get_device()) << "\\n";
1897 
1898  switch (MCommandGroup->getType()) {
1899  case detail::CG::Kernel: {
1900  auto KernelCG =
1901  reinterpret_cast<detail::CGExecKernel *>(MCommandGroup.get());
1902  Stream << "Kernel name: ";
1903  if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource())
1904  Stream << "created from source";
1905  else
1906  Stream << demangleKernelName(KernelCG->getKernelName());
1907  Stream << "\\n";
1908  break;
1909  }
1910  default:
1911  Stream << "CG type: " << cgTypeToString(MCommandGroup->getType()) << "\\n";
1912  break;
1913  }
1914 
1915  Stream << "\"];" << std::endl;
1916 
1917  for (const auto &Dep : MDeps) {
1918  Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
1919  << " [ label = \"Access mode: "
1920  << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
1921  << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
1922  << std::endl;
1923  }
1924 }
1925 
1926 // SYCL has a parallel_for_work_group variant where the only NDRange
1927 // characteristics set by a user is the number of work groups. This does not map
1928 // to the OpenCL clEnqueueNDRangeAPI, which requires global work size to be set
1929 // as well. This function determines local work size based on the device
1930 // characteristics and the number of work groups requested by the user, then
1931 // calculates the global work size.
1932 // SYCL specification (from 4.8.5.3):
1933 // The member function handler::parallel_for_work_group is parameterized by the
1934 // number of work - groups, such that the size of each group is chosen by the
1935 // runtime, or by the number of work - groups and number of work - items for
1936 // users who need more control.
1938  const device_impl &DeviceImpl) {
1939  if (NDR.GlobalSize[0] != 0)
1940  return; // GlobalSize is set - no need to adjust
1941  // check the prerequisites:
1942  assert(NDR.NumWorkGroups[0] != 0 && NDR.LocalSize[0] == 0);
1943  // TODO might be good to cache this info together with the kernel info to
1944  // avoid get_kernel_work_group_info on every kernel run
  // NOTE(review): the head of the work-group-size query expression was lost
  // during extraction; the result (a range<3>, presumably WGSize) comes from
  // a kernel/device work-group-info query — verify against original source.
1946  range<3>,
1948  get(Kernel, DeviceImpl.getHandleRef(), DeviceImpl.getPlugin());
1949 
  // A zeroed result means the device expressed no preference; fall back to
  // a minimal 1x1x1 group so the global size computed below is well defined.
1950  if (WGSize[0] == 0) {
1951  WGSize = {1, 1, 1};
1952  }
  // GlobalSize = NumWorkGroups * WGSize, as required by the ND-range API.
1953  NDR.set(NDR.Dims, nd_range<3>(NDR.NumWorkGroups * WGSize, WGSize));
1954 }
1955 
1956 // We have the following mapping between dimensions with SPIR-V builtins:
1957 // 1D: id[0] -> x
1958 // 2D: id[0] -> y, id[1] -> x
1959 // 3D: id[0] -> z, id[1] -> y, id[2] -> x
1960 // So in order to ensure the correctness we update all the kernel
1961 // parameters accordingly.
1962 // Initially we keep the order of NDRDescT as it provided by the user, this
1963 // simplifies overall handling and do the reverse only when
1964 // the kernel is enqueued.
  // Swap the first and last *used* dimension (0 <-> Dims-1) of every range
  // component. For 1D ranges there is nothing to do; for 2D/3D this realizes
  // the id[]-to-x/y/z mapping described in the comment above.
1966  if (NDR.Dims > 1) {
1967  std::swap(NDR.GlobalSize[0], NDR.GlobalSize[NDR.Dims - 1]);
1968  std::swap(NDR.LocalSize[0], NDR.LocalSize[NDR.Dims - 1]);
1969  std::swap(NDR.GlobalOffset[0], NDR.GlobalOffset[NDR.Dims - 1]);
1970  }
1971 }
1972 
1974  const QueueImplPtr &Queue, std::vector<ArgDesc> &Args,
1975  const std::shared_ptr<device_image_impl> &DeviceImageImpl,
1976  RT::PiKernel Kernel, NDRDescT &NDRDesc, std::vector<RT::PiEvent> &RawEvents,
1977  RT::PiEvent *OutEvent,
1978  const ProgramManager::KernelArgMask &EliminatedArgMask,
1979  const std::function<void *(Requirement *Req)> &getMemAllocationFunc) {
1980  const detail::plugin &Plugin = Queue->getPlugin();
1981 
  // setFunc binds one kernel argument at its post-elimination index.
  // Each arm of the switch below handles one kernel parameter kind.
  // NOTE(review): the case labels themselves were lost when this listing was
  // extracted — cross-check each arm against the original commands.cpp.
1982  auto setFunc = [&Plugin, Kernel, &DeviceImageImpl, &getMemAllocationFunc,
1983  &Queue](detail::ArgDesc &Arg, size_t NextTrueIndex) {
1984  switch (Arg.MType) {
1986  break;
1988  Requirement *Req = (Requirement *)(Arg.MPtr);
1989  assert(getMemAllocationFunc != nullptr &&
1990  "The function should not be nullptr as we followed the path for "
1991  "which accessors are used");
1992  RT::PiMem MemArg = (RT::PiMem)getMemAllocationFunc(Req);
  // OpenCL consumes mem objects through the generic set-arg entry point;
  // other backends use the dedicated mem-object binding extension.
1993  if (Plugin.getBackend() == backend::opencl) {
1994  Plugin.call<PiApiKind::piKernelSetArg>(Kernel, NextTrueIndex,
1995  sizeof(RT::PiMem), &MemArg);
1996  } else {
1997  Plugin.call<PiApiKind::piextKernelSetArgMemObj>(Kernel, NextTrueIndex,
1998  &MemArg);
1999  }
2000  break;
2001  }
2003  Plugin.call<PiApiKind::piKernelSetArg>(Kernel, NextTrueIndex, Arg.MSize,
2004  Arg.MPtr);
2005  break;
2006  }
2008  sampler *SamplerPtr = (sampler *)Arg.MPtr;
2009  RT::PiSampler Sampler = detail::getSyclObjImpl(*SamplerPtr)
2010  ->getOrCreateSampler(Queue->get_context());
2011  Plugin.call<PiApiKind::piextKernelSetArgSampler>(Kernel, NextTrueIndex,
2012  &Sampler);
2013  break;
2014  }
2016  Plugin.call<PiApiKind::piextKernelSetArgPointer>(Kernel, NextTrueIndex,
2017  Arg.MSize, Arg.MPtr);
2018  break;
2019  }
2021  if (Queue->is_host()) {
2022  throw cl::sycl::feature_not_supported(
2023  "SYCL2020 specialization constants are not yet supported on host "
2024  "device",
2026  }
2027  assert(DeviceImageImpl != nullptr);
2028  RT::PiMem SpecConstsBuffer = DeviceImageImpl->get_spec_const_buffer_ref();
2029  // Avoid taking an address of nullptr
2030  RT::PiMem *SpecConstsBufferArg =
2031  SpecConstsBuffer ? &SpecConstsBuffer : nullptr;
2032  Plugin.call<PiApiKind::piextKernelSetArgMemObj>(Kernel, NextTrueIndex,
2033  SpecConstsBufferArg);
2034  break;
2035  }
2037  throw runtime_error("Invalid kernel param kind", PI_INVALID_VALUE);
2038  break;
2039  }
2040  };
2041 
  // Bind only the arguments that survived dead-argument elimination.
2042  applyFuncOnFilteredArgs(EliminatedArgMask, Args, setFunc);
2043 
2044  adjustNDRangePerKernel(NDRDesc, Kernel, *(Queue->getDeviceImplPtr()));
2045 
2046  // Remember this information before the range dimensions are reversed
2047  const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0);
2048 
2050 
2051  size_t RequiredWGSize[3] = {0, 0, 0};
2052  size_t *LocalSize = nullptr;
2053 
2054  if (HasLocalSize)
2055  LocalSize = &NDRDesc.LocalSize[0];
2056  else {
  // No user-provided local size: if the kernel was compiled with a required
  // work-group size, that exact size must be passed to the launch call.
  // NOTE(review): the group-info query call head was lost in extraction.
2058  Kernel, Queue->getDeviceImplPtr()->getHandleRef(),
2059  PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, sizeof(RequiredWGSize),
2060  RequiredWGSize, /* param_value_size_ret = */ nullptr);
2061 
2062  const bool EnforcedLocalSize =
2063  (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
2064  RequiredWGSize[2] != 0);
2065  if (EnforcedLocalSize)
2066  LocalSize = RequiredWGSize;
2067  }
2068 
  // Launch with the dependency events gathered above; Error is returned to
  // the caller for translation into a SYCL exception on failure.
2070  Queue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
2071  &NDRDesc.GlobalSize[0], LocalSize, RawEvents.size(),
2072  RawEvents.empty() ? nullptr : &RawEvents[0], OutEvent);
2073  return Error;
2074 }
2075 
2076 // The function initialize accessors and calls lambda.
2077 // The function is used as argument to piEnqueueNativeKernel which requires
2078 // that the passed function takes one void* argument.
2079 void DispatchNativeKernel(void *Blob) {
2080  void **CastedBlob = (void **)Blob;
2081 
2082  std::vector<Requirement *> *Reqs =
2083  static_cast<std::vector<Requirement *> *>(CastedBlob[0]);
2084 
2085  std::unique_ptr<HostKernelBase> *HostKernel =
2086  static_cast<std::unique_ptr<HostKernelBase> *>(CastedBlob[1]);
2087 
2088  NDRDescT *NDRDesc = static_cast<NDRDescT *>(CastedBlob[2]);
2089 
2090  // Other value are pointer to the buffers.
2091  void **NextArg = CastedBlob + 3;
2092  for (detail::Requirement *Req : *Reqs)
2093  Req->MData = *(NextArg++);
2094 
2095  (*HostKernel)->call(*NDRDesc, nullptr);
2096 
2097  // The ownership of these objects have been passed to us, need to cleanup
2098  delete Reqs;
2099  delete HostKernel;
2100  delete NDRDesc;
2101 }
2102 
  // NOTE(review): the function head (name/return type) and several call
  // heads in this listing were lost during extraction — verify against the
  // original commands.cpp. The visible logic resolves a PiKernel/PiProgram
  // from (in priority order) a kernel bundle, a pre-created SYCL kernel, or
  // the program manager cache, then launches it.
2104  const QueueImplPtr &Queue, NDRDescT &NDRDesc, std::vector<ArgDesc> &Args,
2105  const std::shared_ptr<detail::kernel_bundle_impl> &KernelBundleImplPtr,
2106  const std::shared_ptr<detail::kernel_impl> &MSyclKernel,
2107  const std::string &KernelName, const detail::OSModuleHandle &OSModuleHandle,
2108  std::vector<RT::PiEvent> &RawEvents, RT::PiEvent *OutEvent,
2109  const std::function<void *(Requirement *Req)> &getMemAllocationFunc) {
2110 
2111  // Run OpenCL kernel
2112  auto ContextImpl = Queue->getContextImplPtr();
2113  auto DeviceImpl = Queue->getDeviceImplPtr();
2114  RT::PiKernel Kernel = nullptr;
2115  std::mutex *KernelMutex = nullptr;
2116  RT::PiProgram Program = nullptr;
2117 
2118  std::shared_ptr<kernel_impl> SyclKernelImpl;
2119  std::shared_ptr<device_image_impl> DeviceImageImpl;
2120 
2121  // Use kernel_bundle if available unless it is interop.
2122  // Interop bundles can't be used in the first branch, because the kernels
2123  // in interop kernel bundles (if any) do not have kernel_id
2124  // and can therefore not be looked up, but since they are self-contained
2125  // they can simply be launched directly.
2126  if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) {
2127  kernel_id KernelID =
2129  kernel SyclKernel =
2130  KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr);
2131 
2132  SyclKernelImpl = detail::getSyclObjImpl(SyclKernel);
2133 
2134  Kernel = SyclKernelImpl->getHandleRef();
2135  DeviceImageImpl = SyclKernelImpl->getDeviceImage();
2136 
2137  Program = DeviceImageImpl->get_program_ref();
2138 
2139  std::tie(Kernel, KernelMutex) =
2141  KernelBundleImplPtr->get_context(), KernelName,
2142  /*PropList=*/{}, Program);
2143  } else if (nullptr != MSyclKernel) {
2144  assert(MSyclKernel->get_info<info::kernel::context>() ==
2145  Queue->get_context());
2146  Kernel = MSyclKernel->getHandleRef();
2147 
2148  auto SyclProg =
2149  detail::getSyclObjImpl(MSyclKernel->get_info<info::kernel::program>());
2150  Program = SyclProg->getHandleRef();
2151  if (SyclProg->is_cacheable()) {
  // Cacheable kernels also pick up their per-kernel mutex from the cache
  // so the set-args/launch sequence can be serialized below.
2152  RT::PiKernel FoundKernel = nullptr;
2153  std::tie(FoundKernel, KernelMutex, std::ignore) =
2155  OSModuleHandle, ContextImpl, DeviceImpl, KernelName,
2156  SyclProg.get());
2157  assert(FoundKernel == Kernel);
2158  }
2159  } else {
2160  std::tie(Kernel, KernelMutex, Program) =
2162  OSModuleHandle, ContextImpl, DeviceImpl, KernelName, nullptr);
2163  }
2164 
2165  pi_result Error = PI_SUCCESS;
2166  ProgramManager::KernelArgMask EliminatedArgMask;
  // Dead-argument elimination info only exists for kernels that went through
  // the normal compilation pipeline (not created from source).
2167  if (nullptr == MSyclKernel || !MSyclKernel->isCreatedFromSource()) {
2168  EliminatedArgMask =
2170  OSModuleHandle, Program, KernelName);
2171  }
2172  if (KernelMutex != nullptr) {
2173  // For cacheable kernels, we use per-kernel mutex
2174  std::lock_guard<std::mutex> Lock(*KernelMutex);
2175  Error = SetKernelParamsAndLaunch(Queue, Args, DeviceImageImpl, Kernel,
2176  NDRDesc, RawEvents, OutEvent,
2177  EliminatedArgMask, getMemAllocationFunc);
2178  } else {
2179  Error = SetKernelParamsAndLaunch(Queue, Args, DeviceImageImpl, Kernel,
2180  NDRDesc, RawEvents, OutEvent,
2181  EliminatedArgMask, getMemAllocationFunc);
2182  }
2183 
2184  if (PI_SUCCESS != Error) {
2185  // If we have got non-success error code, let's analyze it to emit nice
2186  // exception explaining what was wrong
2187  const device_impl &DeviceImpl = *(Queue->getDeviceImplPtr());
2188  return detail::enqueue_kernel_launch::handleError(Error, DeviceImpl, Kernel,
2189  NDRDesc);
2190  }
2191 
2192  return PI_SUCCESS;
2193  }
2194 
2195 cl_int ExecCGCommand::enqueueImp() {
  // Dispatches the command group to the appropriate execution path based on
  // its CG type (memory ops, kernels, USM ops, barriers, host tasks).
  // NOTE(review): multiple lines of this listing (e.g. MemoryManager call
  // heads and some case labels) were lost during extraction — verify any
  // non-obvious statement against the original commands.cpp.
2196  if (getCG().getType() != CG::CGTYPE::CodeplayHostTask)
2198  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
2199  auto RawEvents = getPiEvents(EventImpls);
2200  flushCrossQueueDeps(EventImpls, getWorkerQueue());
2201 
  // Event may legitimately be null: when the queue supports discarded
  // events and this CG has no requirements, no PI event is requested.
2202  RT::PiEvent *Event = (MQueue->has_discard_events_support() &&
2203  MCommandGroup->MRequirements.size() == 0)
2204  ? nullptr
2205  : &MEvent->getHandleRef();
2206  switch (MCommandGroup->getType()) {
2207 
2208  case CG::CGTYPE::UpdateHost: {
2209  throw runtime_error("Update host should be handled by the Scheduler.",
2211  }
2212  case CG::CGTYPE::CopyAccToPtr: {
2213  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2214  Requirement *Req = (Requirement *)Copy->getSrc();
2215  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2216 
2218  AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(), MQueue,
2219  Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2220  Req->MElemSize, Copy->getDst(),
2222  Req->MAccessRange, Req->MAccessRange, /*DstOffset=*/{0, 0, 0},
2223  Req->MElemSize, std::move(RawEvents), MEvent->getHandleRef());
2224 
2225  return CL_SUCCESS;
2226  }
2227  case CG::CGTYPE::CopyPtrToAcc: {
2228  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2229  Requirement *Req = (Requirement *)(Copy->getDst());
2230  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2231 
2233 
2235  AllocaCmd->getSYCLMemObj(), Copy->getSrc(),
2237  Req->MAccessRange, Req->MAccessRange,
2238  /*SrcOffset*/ {0, 0, 0}, Req->MElemSize, AllocaCmd->getMemAllocation(),
2239  MQueue, Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset,
2240  Req->MElemSize, std::move(RawEvents), MEvent->getHandleRef());
2241 
2242  return CL_SUCCESS;
2243  }
2244  case CG::CGTYPE::CopyAccToAcc: {
2245  CGCopy *Copy = (CGCopy *)MCommandGroup.get();
2246  Requirement *ReqSrc = (Requirement *)(Copy->getSrc());
2247  Requirement *ReqDst = (Requirement *)(Copy->getDst());
2248 
2249  AllocaCommandBase *AllocaCmdSrc = getAllocaForReq(ReqSrc);
2250  AllocaCommandBase *AllocaCmdDst = getAllocaForReq(ReqDst);
2251 
2253  AllocaCmdSrc->getSYCLMemObj(), AllocaCmdSrc->getMemAllocation(), MQueue,
2254  ReqSrc->MDims, ReqSrc->MMemoryRange, ReqSrc->MAccessRange,
2255  ReqSrc->MOffset, ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(),
2256  MQueue, ReqDst->MDims, ReqDst->MMemoryRange, ReqDst->MAccessRange,
2257  ReqDst->MOffset, ReqDst->MElemSize, std::move(RawEvents),
2258  MEvent->getHandleRef());
2259 
2260  return CL_SUCCESS;
2261  }
2262  case CG::CGTYPE::Fill: {
2263  CGFill *Fill = (CGFill *)MCommandGroup.get();
2264  Requirement *Req = (Requirement *)(Fill->getReqToFill());
2265  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2266 
2268  AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(), MQueue,
2269  Fill->MPattern.size(), Fill->MPattern.data(), Req->MDims,
2270  Req->MMemoryRange, Req->MAccessRange, Req->MOffset, Req->MElemSize,
2271  std::move(RawEvents), MEvent->getHandleRef());
2272 
2273  return CL_SUCCESS;
2274  }
2275  case CG::CGTYPE::RunOnHostIntel: {
2276  CGExecKernel *HostTask = (CGExecKernel *)MCommandGroup.get();
2277 
2278  // piEnqueueNativeKernel takes arguments blob which is passes to user
2279  // function.
2280  // Need the following items to restore context in the host task.
2281  // Make a copy on heap to "dettach" from the command group as it can be
2282  // released before the host task completes.
2283  std::vector<void *> ArgsBlob(HostTask->MArgs.size() + 3);
2284 
2285  std::vector<Requirement *> *CopyReqs =
2286  new std::vector<Requirement *>(HostTask->MRequirements);
2287 
2288  // Not actually a copy, but move. Should be OK as it's not expected that
2289  // MHostKernel will be used elsewhere.
2290  std::unique_ptr<HostKernelBase> *CopyHostKernel =
2291  new std::unique_ptr<HostKernelBase>(std::move(HostTask->MHostKernel));
2292 
2293  NDRDescT *CopyNDRDesc = new NDRDescT(HostTask->MNDRDesc);
2294 
  // Blob layout consumed by DispatchNativeKernel: [reqs, kernel, ndrange,
  // buffer pointers...]; ownership of the three heap copies transfers to it.
2295  ArgsBlob[0] = (void *)CopyReqs;
2296  ArgsBlob[1] = (void *)CopyHostKernel;
2297  ArgsBlob[2] = (void *)CopyNDRDesc;
2298 
2299  void **NextArg = ArgsBlob.data() + 3;
2300 
2301  if (MQueue->is_host()) {
2302  for (ArgDesc &Arg : HostTask->MArgs) {
2303  assert(Arg.MType == kernel_param_kind_t::kind_accessor);
2304 
2305  Requirement *Req = (Requirement *)(Arg.MPtr);
2306  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2307 
2308  *NextArg = AllocaCmd->getMemAllocation();
2309  NextArg++;
2310  }
2311 
2312  if (!RawEvents.empty()) {
2313  // Assuming that the events are for devices to the same Plugin.
2314  const detail::plugin &Plugin = EventImpls[0]->getPlugin();
2315  Plugin.call<PiApiKind::piEventsWait>(RawEvents.size(), &RawEvents[0]);
2316  }
2317  DispatchNativeKernel((void *)ArgsBlob.data());
2318 
2319  return CL_SUCCESS;
2320  }
2321 
2322  std::vector<pi_mem> Buffers;
2323  // piEnqueueNativeKernel requires additional array of pointers to args blob,
2324  // values that pointers point to are replaced with actual pointers to the
2325  // memory before execution of user function.
2326  std::vector<void *> MemLocs;
2327 
2328  for (ArgDesc &Arg : HostTask->MArgs) {
2329  assert(Arg.MType == kernel_param_kind_t::kind_accessor);
2330 
2331  Requirement *Req = (Requirement *)(Arg.MPtr);
2332  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2333  pi_mem MemArg = (pi_mem)AllocaCmd->getMemAllocation();
2334 
2335  Buffers.push_back(MemArg);
2336  MemLocs.push_back(NextArg);
2337  NextArg++;
2338  }
2339  const detail::plugin &Plugin = MQueue->getPlugin();
2340  pi_result Error = Plugin.call_nocheck<PiApiKind::piEnqueueNativeKernel>(
2341  MQueue->getHandleRef(), DispatchNativeKernel, (void *)ArgsBlob.data(),
2342  ArgsBlob.size() * sizeof(ArgsBlob[0]), Buffers.size(), Buffers.data(),
2343  const_cast<const void **>(MemLocs.data()), RawEvents.size(),
2344  RawEvents.empty() ? nullptr : RawEvents.data(), Event);
2345 
2346  switch (Error) {
2347  case PI_INVALID_OPERATION:
2348  throw cl::sycl::runtime_error(
2349  "Device doesn't support run_on_host_intel tasks.", Error);
2350  case PI_SUCCESS:
2351  return Error;
2352  default:
2353  throw cl::sycl::runtime_error(
2354  "Enqueueing run_on_host_intel task has failed.", Error);
2355  }
2356  }
2357  case CG::CGTYPE::Kernel: {
2358  CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
2359 
2360  NDRDescT &NDRDesc = ExecKernel->MNDRDesc;
2361  std::vector<ArgDesc> &Args = ExecKernel->MArgs;
2362 
  // Host / host-like backends: resolve accessor args to raw allocations,
  // wait for dependencies, then invoke the host kernel directly.
2363  if (MQueue->is_host() || (MQueue->getPlugin().getBackend() ==
2365  for (ArgDesc &Arg : Args)
2366  if (kernel_param_kind_t::kind_accessor == Arg.MType) {
2367  Requirement *Req = (Requirement *)(Arg.MPtr);
2368  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2369  Req->MData = AllocaCmd->getMemAllocation();
2370  }
2371  if (!RawEvents.empty()) {
2372  // Assuming that the events are for devices to the same Plugin.
2373  const detail::plugin &Plugin = EventImpls[0]->getPlugin();
2374  Plugin.call<PiApiKind::piEventsWait>(RawEvents.size(), &RawEvents[0]);
2375  }
2376 
2377  if (MQueue->is_host()) {
2378  ExecKernel->MHostKernel->call(NDRDesc,
2379  getEvent()->getHostProfilingInfo());
2380  } else {
2381  assert(MQueue->getPlugin().getBackend() ==
2383 
2384  MQueue->getPlugin().call<PiApiKind::piEnqueueKernelLaunch>(
2385  nullptr,
2386  reinterpret_cast<pi_kernel>(ExecKernel->MHostKernel->getPtr()),
2387  NDRDesc.Dims, &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0],
2388  &NDRDesc.LocalSize[0], 0, nullptr, nullptr);
2389  }
2390 
2391  return CL_SUCCESS;
2392  }
2393 
  // Device path: lazily resolve each requirement's allocation when the
  // launch helper binds kernel arguments.
2394  auto getMemAllocationFunc = [this](Requirement *Req) {
2395  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2396  return AllocaCmd->getMemAllocation();
2397  };
2398 
2399  const std::shared_ptr<detail::kernel_impl> &SyclKernel =
2400  ExecKernel->MSyclKernel;
2401  const std::string &KernelName = ExecKernel->MKernelName;
2402  const detail::OSModuleHandle &OSModuleHandle = ExecKernel->MOSModuleHandle;
2403 
  // Even if the event was discarded above, an assert-using kernel needs a
  // real event so assert failures can be reported back.
2404  if (!Event) {
2405  // Kernel only uses assert if it's non interop one
2406  bool KernelUsesAssert = !(SyclKernel && SyclKernel->isInterop()) &&
2408  OSModuleHandle, KernelName);
2409  if (KernelUsesAssert) {
2410  Event = &MEvent->getHandleRef();
2411  }
2412  }
2413 
2414  return enqueueImpKernel(
2415  MQueue, NDRDesc, Args, ExecKernel->getKernelBundle(), SyclKernel,
2416  KernelName, OSModuleHandle, RawEvents, Event, getMemAllocationFunc);
2417  }
2418  case CG::CGTYPE::CopyUSM: {
2419  CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
2420  MemoryManager::copy_usm(Copy->getSrc(), MQueue, Copy->getLength(),
2421  Copy->getDst(), std::move(RawEvents), Event);
2422 
2423  return CL_SUCCESS;
2424  }
2425  case CG::CGTYPE::FillUSM: {
2426  CGFillUSM *Fill = (CGFillUSM *)MCommandGroup.get();
2427  MemoryManager::fill_usm(Fill->getDst(), MQueue, Fill->getLength(),
2428  Fill->getFill(), std::move(RawEvents), Event);
2429 
2430  return CL_SUCCESS;
2431  }
2432  case CG::CGTYPE::PrefetchUSM: {
2433  CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get();
2434  MemoryManager::prefetch_usm(Prefetch->getDst(), MQueue,
2435  Prefetch->getLength(), std::move(RawEvents),
2436  Event);
2437 
2438  return CL_SUCCESS;
2439  }
2440  case CG::CGTYPE::AdviseUSM: {
2441  CGAdviseUSM *Advise = (CGAdviseUSM *)MCommandGroup.get();
2442  MemoryManager::advise_usm(Advise->getDst(), MQueue, Advise->getLength(),
2443  Advise->getAdvice(), std::move(RawEvents), Event);
2444 
2445  return CL_SUCCESS;
2446  }
2447  case CG::CGTYPE::CodeplayInteropTask: {
2448  const detail::plugin &Plugin = MQueue->getPlugin();
2449  CGInteropTask *ExecInterop = (CGInteropTask *)MCommandGroup.get();
2450  // Wait for dependencies to complete before dispatching work on the host
2451  // TODO: Use a callback to dispatch the interop task instead of waiting for
2452  // the event
2453  if (!RawEvents.empty()) {
2454  Plugin.call<PiApiKind::piEventsWait>(RawEvents.size(), &RawEvents[0]);
2455  }
2456  std::vector<interop_handler::ReqToMem> ReqMemObjs;
2457  // Extract the Mem Objects for all Requirements, to ensure they are
2458  // available if a user ask for them inside the interop task scope
2459  const auto &HandlerReq = ExecInterop->MRequirements;
2460  std::for_each(
2461  std::begin(HandlerReq), std::end(HandlerReq), [&](Requirement *Req) {
2462  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2463  auto MemArg = reinterpret_cast<pi_mem>(AllocaCmd->getMemAllocation());
2464  interop_handler::ReqToMem ReqToMem = std::make_pair(Req, MemArg);
2465  ReqMemObjs.emplace_back(ReqToMem);
2466  });
2467 
2468  std::sort(std::begin(ReqMemObjs), std::end(ReqMemObjs));
2469  interop_handler InteropHandler(std::move(ReqMemObjs), MQueue);
2470  ExecInterop->MInteropTask->call(InteropHandler);
2471  Plugin.call<PiApiKind::piEnqueueEventsWait>(MQueue->getHandleRef(), 0,
2472  nullptr, Event);
2473 
2474  return CL_SUCCESS;
2475  }
2476  case CG::CGTYPE::CodeplayHostTask: {
2477  CGHostTask *HostTask = static_cast<CGHostTask *>(MCommandGroup.get());
2478 
2479  for (ArgDesc &Arg : HostTask->MArgs) {
2480  switch (Arg.MType) {
2482  Requirement *Req = static_cast<Requirement *>(Arg.MPtr);
2483  AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
2484 
2485  Req->MData = AllocaCmd->getMemAllocation();
2486  break;
2487  }
2488  default:
2489  throw runtime_error("Unsupported arg type", PI_INVALID_VALUE);
2490  }
2491  }
2492 
2493  std::vector<interop_handle::ReqToMem> ReqToMem;
2494 
2495  if (HostTask->MHostTask->isInteropTask()) {
2496  // Extract the Mem Objects for all Requirements, to ensure they are
2497  // available if a user asks for them inside the interop task scope
2498  const std::vector<Requirement *> &HandlerReq = HostTask->MRequirements;
2499  auto ReqToMemConv = [&ReqToMem, HostTask](Requirement *Req) {
2500  const std::vector<AllocaCommandBase *> &AllocaCmds =
2501  Req->MSYCLMemObj->MRecord->MAllocaCommands;
2502 
  // Pick the allocation that lives in the host-task queue's context.
2503  for (AllocaCommandBase *AllocaCmd : AllocaCmds)
2504  if (HostTask->MQueue->getContextImplPtr() ==
2505  AllocaCmd->getQueue()->getContextImplPtr()) {
2506  auto MemArg =
2507  reinterpret_cast<pi_mem>(AllocaCmd->getMemAllocation());
2508  ReqToMem.emplace_back(std::make_pair(Req, MemArg));
2509 
2510  return;
2511  }
2512 
2513  assert(false &&
2514  "Can't get memory object due to no allocation available");
2515 
2516  throw runtime_error(
2517  "Can't get memory object due to no allocation available",
2519  };
2520  std::for_each(std::begin(HandlerReq), std::end(HandlerReq), ReqToMemConv);
2521  std::sort(std::begin(ReqToMem), std::end(ReqToMem));
2522  }
2523 
  // Host tasks run asynchronously on the runtime's thread pool.
2524  MQueue->getThreadPool().submit<DispatchHostTask>(
2525  DispatchHostTask(this, std::move(ReqToMem)));
2526 
2528 
2529  return CL_SUCCESS;
2530  }
2531  case CG::CGTYPE::Barrier: {
2532  if (MQueue->get_device().is_host()) {
2533  // NOP for host device.
2534  return PI_SUCCESS;
2535  }
2536  const detail::plugin &Plugin = MQueue->getPlugin();
2538  MQueue->getHandleRef(), 0, nullptr, Event);
2539 
2540  return PI_SUCCESS;
2541  }
2542  case CG::CGTYPE::BarrierWaitlist: {
2543  CGBarrier *Barrier = static_cast<CGBarrier *>(MCommandGroup.get());
2544  std::vector<detail::EventImplPtr> Events = Barrier->MEventsWaitWithBarrier;
2545  std::vector<RT::PiEvent> PiEvents = getPiEvents(Events);
2546  if (MQueue->get_device().is_host() || PiEvents.empty()) {
2547  // NOP for host device.
2548  // If Events is empty, then the barrier has no effect.
2549  return PI_SUCCESS;
2550  }
2551  const detail::plugin &Plugin = MQueue->getPlugin();
2553  MQueue->getHandleRef(), PiEvents.size(), &PiEvents[0], Event);
2554 
2555  return PI_SUCCESS;
2556  }
2557  case CG::CGTYPE::None:
2558  throw runtime_error("CG type not implemented.", PI_INVALID_OPERATION);
2559  }
2560  return PI_INVALID_OPERATION;
2561 }
2562 
  // Host tasks are submitted to the runtime's thread pool (see enqueueImp)
  // rather than through the PI plugin, so they are the one CG type that
  // produces no PI event.
2564  return MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask;
2565 }
2566 
2568  // TODO enable cleaning up host task commands and kernels with streams after
2569  // enqueue
  // Cleanup right after enqueue is only allowed when the command is neither
  // a host task nor a kernel that holds streams or auxiliary resources —
  // presumably those need the command alive past enqueue (see TODO above).
2571  (MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask) &&
2572  (MCommandGroup->getType() != CG::CGTYPE::Kernel ||
2573  (!static_cast<CGExecKernel *>(MCommandGroup.get())->hasStreams() &&
2574  !static_cast<CGExecKernel *>(MCommandGroup.get())
2575  ->hasAuxiliaryResources()));
2576 }
2577 
2578 } // namespace detail
2579 } // namespace sycl
2580 } // __SYCL_INLINE_NAMESPACE(cl)
cl::sycl::detail::UpdateHostRequirementCommand::emitInstrumentationData
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1652
cl::sycl::nd_range
Defines the iteration domain of both the work-groups and the overall dispatch.
Definition: nd_range.hpp:23
cl::sycl::detail::CG::CopyUSM
@ CopyUSM
Definition: cg.hpp:167
cl::sycl::detail::Scheduler
DPC++ graph scheduler class.
Definition: scheduler.hpp:358
cl::sycl::detail::DispatchNativeKernel
void DispatchNativeKernel(void *Blob)
Definition: commands.cpp:2079
cl::sycl::detail::Command::MTraceEvent
void * MTraceEvent
The event for node_create and task_begin.
Definition: commands.hpp:287
cl::sycl::detail::accessModeToString
static std::string accessModeToString(access::mode Mode)
Definition: commands.cpp:133
cl::sycl::detail::AllocaCommandBase::getMemAllocation
virtual void * getMemAllocation() const =0
cl::sycl::detail::Command::DispatchHostTask
friend class DispatchHostTask
Definition: commands.hpp:248
_pi_mem
PI Mem mapping to CUDA memory allocations, both data and texture/surface.
Definition: pi_cuda.hpp:207
cl::sycl::detail::NDRDescT::GlobalOffset
sycl::id< 3 > GlobalOffset
Definition: cg_types.hpp:118
cl::sycl::detail::ExecCGCommand::producesPiEvent
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
Definition: commands.cpp:2563
cl::sycl::detail::AccessorImplHost::MDims
unsigned int MDims
Definition: accessor_impl.hpp:114
piEnqueueKernelLaunch
pi_result piEnqueueKernelLaunch(pi_queue queue, pi_kernel kernel, pi_uint32 work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_esimd_emulator.cpp:1759
cl::sycl::detail::Command::MEnqueueMtx
std::mutex MEnqueueMtx
Mutex used to protect enqueueing from race conditions.
Definition: commands.hpp:246
cl::sycl::detail::MemoryManager::copy
static void copy(SYCLMemObjI *SYCLMemObj, void *SrcMem, QueueImplPtr SrcQueue, unsigned int DimSrc, sycl::range< 3 > SrcSize, sycl::range< 3 > SrcAccessRange, sycl::id< 3 > SrcOffset, unsigned int SrcElemSize, void *DstMem, QueueImplPtr TgtQueue, unsigned int DimDst, sycl::range< 3 > DstSize, sycl::range< 3 > DstAccessRange, sycl::id< 3 > DstOffset, unsigned int DstElemSize, std::vector< RT::PiEvent > DepEvents, RT::PiEvent &OutEvent)
Definition: memory_manager.cpp:734
event_impl.hpp
cl::sycl::detail::MemCpyCommandHost::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1626
PI_SUCCESS
@ PI_SUCCESS
Definition: pi.h:86
cl::sycl::detail::ExecCGCommand
The exec CG command enqueues execution of kernel or explicit memory operation.
Definition: commands.hpp:539
cl::sycl::detail::EnqueueResultT::SyclEnqueueReady
@ SyclEnqueueReady
Definition: commands.hpp:52
cl::sycl::detail::Command::MUsers
std::unordered_set< Command * > MUsers
Contains list of commands that depend on the command.
Definition: commands.hpp:258
cl::sycl::detail::NDRDescT
Definition: cg_types.hpp:41
cl::sycl::detail::EmptyCommand::emitInstrumentationData
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1577
cl::sycl::detail::ContextImplPtr
std::shared_ptr< detail::context_impl > ContextImplPtr
Definition: memory_manager.hpp:32
piKernelSetArg
pi_result piKernelSetArg(pi_kernel kernel, pi_uint32 arg_index, size_t arg_size, const void *arg_value)
Definition: pi_esimd_emulator.cpp:1353
cl::sycl::detail::AllocaCommandBase::MIsActive
bool MIsActive
Indicates that current alloca is active one.
Definition: commands.hpp:392
cl::sycl::detail::device_impl::getPlugin
const plugin & getPlugin() const
Definition: device_impl.hpp:124
cl::sycl::exception::get_cl_code
cl_int get_cl_code() const
Definition: exception.cpp:114
context_impl.hpp
cl::sycl::interop_handle
Definition: interop_handle.hpp:37
cl::sycl::detail::Command::MPostEnqueueCleanup
bool MPostEnqueueCleanup
Indicates that the node will be freed by cleanup after enqueue.
Definition: commands.hpp:318
cl::sycl::detail::CGExecKernel
"Execute kernel" command group class.
Definition: cg.hpp:241
cl::sycl::detail::BlockingT
BlockingT
Definition: commands.hpp:47
cl::sycl::detail::AllocaSubBufCommand::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1010
PI_INVALID_OPERATION
@ PI_INVALID_OPERATION
Definition: pi.h:88
cl::sycl::detail::ExecCGCommand::emitInstrumentationData
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1731
cl::sycl::detail::Command::getQueue
const QueueImplPtr & getQueue() const
Definition: commands.hpp:149
cl::sycl::detail::AllocaSubBufCommand::AllocaSubBufCommand
AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req, AllocaCommandBase *ParentAlloca, std::vector< Command * > &ToEnqueue, std::vector< Command * > &ToCleanUp)
Definition: commands.cpp:946
cl::sycl::detail::HostTask
Definition: cg_types.hpp:229
cl::sycl::detail::Command::MSubmittedQueue
QueueImplPtr MSubmittedQueue
Definition: commands.hpp:212
cl::sycl::detail::MemCpyCommandHost::MemCpyCommandHost
MemCpyCommandHost(Requirement SrcReq, AllocaCommandBase *SrcAllocaCmd, Requirement DstReq, void **DstPtr, QueueImplPtr SrcQueue, QueueImplPtr DstQueue)
Definition: commands.cpp:1470
cl::sycl::detail::device_impl
Definition: device_impl.hpp:34
cl::sycl::backend::opencl
@ opencl
cl::sycl::detail::ExecCGCommand::MEmptyCmd
EmptyCommand * MEmptyCmd
Definition: commands.hpp:558
cl::sycl::device::is_cpu
bool is_cpu() const
Check if device is a CPU device.
Definition: device.cpp:104
cl::sycl::detail::for_each
Function for_each(Group g, Ptr first, Ptr last, Function f)
Definition: group_algorithm.hpp:148
cg_types.hpp
PI_INVALID_MEM_OBJECT
@ PI_INVALID_MEM_OBJECT
Definition: pi.h:102
cl::sycl::detail::Command::MEnqueueStatus
std::atomic< EnqueueResultT::ResultT > MEnqueueStatus
Describes the status of the command.
Definition: commands.hpp:279
xpti_registry.hpp
cl::sycl::detail::ReleaseCommand::emitInstrumentationData
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1037
cl::sycl::detail::NDRDescT::GlobalSize
sycl::range< 3 > GlobalSize
Definition: cg_types.hpp:116
cl::sycl::device::is_gpu
bool is_gpu() const
Check if device is a GPU device.
Definition: device.cpp:106
cl::sycl::sampler
Encapsulates a configuration for sampling an image accessor.
Definition: sampler.hpp:65
cl::sycl::detail::Command::MDeps
std::vector< DepDesc > MDeps
Contains list of dependencies(edges)
Definition: commands.hpp:256
cl::sycl::detail::MemoryManager::fill_usm
static void fill_usm(void *DstMem, QueueImplPtr Queue, size_t Len, int Pattern, std::vector< RT::PiEvent > DepEvents, RT::PiEvent *OutEvent)
Definition: memory_manager.cpp:888
cl::sycl::detail::ProgramManager::kernelUsesAssert
bool kernelUsesAssert(OSModuleHandle M, const std::string &KernelName) const
Definition: program_manager.cpp:1080
cl::sycl::backend::ext_intel_esimd_emulator
@ ext_intel_esimd_emulator
cl::sycl::detail::SetKernelParamsAndLaunch
static pi_result SetKernelParamsAndLaunch(const QueueImplPtr &Queue, std::vector< ArgDesc > &Args, const std::shared_ptr< device_image_impl > &DeviceImageImpl, RT::PiKernel Kernel, NDRDescT &NDRDesc, std::vector< RT::PiEvent > &RawEvents, RT::PiEvent *OutEvent, const ProgramManager::KernelArgMask &EliminatedArgMask, const std::function< void *(Requirement *Req)> &getMemAllocationFunc)
Definition: commands.cpp:1973
cl::sycl::detail::SYCLConfig
Definition: config.hpp:105
_pi_result
_pi_result
Definition: pi.h:85
cl::sycl::detail::Command::MCommandNodeType
std::string MCommandNodeType
Buffer to build the command node type.
Definition: commands.hpp:298
cl::sycl::detail::DispatchHostTask::operator()
void operator()() const
Definition: commands.cpp:273
cl::sycl::detail::Command::makeTraceEventProlog
uint64_t makeTraceEventProlog(void *MAddress)
Create a trace event of node_create type; this must be guarded by a check for xptiTraceEnabled().
Definition: commands.cpp:535
cl::sycl::detail::Scheduler::GraphBuilder
Graph builder class.
Definition: scheduler.hpp:473
cl::sycl::detail::AccessorImplHost::MData
void * MData
Definition: accessor_impl.hpp:119
cl::sycl::detail::kernel_param_kind_t::kind_sampler
@ kind_sampler
cl::sycl::detail::Command::CommandType
CommandType
Definition: commands.hpp:97
cl::sycl::detail::UpdateHostRequirementCommand::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1447
cl::sycl::detail::AllocaSubBufCommand::getMemAllocation
void * getMemAllocation() const final
Definition: commands.cpp:983
cl::sycl::detail::MemoryManager::advise_usm
static void advise_usm(const void *Ptr, QueueImplPtr Queue, size_t Len, pi_mem_advice Advice, std::vector< RT::PiEvent > DepEvents, RT::PiEvent *OutEvent)
Definition: memory_manager.cpp:930
cl::sycl::detail::Command::MShouldCompleteEventIfPossible
bool MShouldCompleteEventIfPossible
Definition: commands.hpp:314
cl::sycl::detail::AllocaCommandBase::MIsLeaderAlloca
bool MIsLeaderAlloca
Indicates that the command owns memory allocation in case of connected alloca command.
Definition: commands.hpp:396
cl::sycl::detail::Command::emitEnqueuedEventSignal
void emitEnqueuedEventSignal(RT::PiEvent &PiEventAddr)
Creates a signal event with the enqueued kernel event handle.
Definition: commands.cpp:672
cl::sycl::detail::ProgramManager::getSYCLKernelID
kernel_id getSYCLKernelID(const std::string &KernelName)
Definition: program_manager.cpp:1442
cl::sycl::detail::Command::enqueue
virtual bool enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, std::vector< Command * > &ToCleanUp)
Checks if the command is enqueued, and calls enqueueImp.
Definition: commands.cpp:695
cl::sycl::detail::EnqueueResultT::SyclEnqueueSuccess
@ SyclEnqueueSuccess
Definition: commands.hpp:53
cl::sycl::detail::KernelBundleImplPtr
std::shared_ptr< detail::kernel_bundle_impl > KernelBundleImplPtr
Definition: kernel_bundle.hpp:132
cl::sycl::detail::AllocaCommandBase::AllocaCommandBase
AllocaCommandBase(CommandType Type, QueueImplPtr Queue, Requirement Req, AllocaCommandBase *LinkedAllocaCmd)
Definition: commands.cpp:834
sycl
Definition: invoke_simd.hpp:68
cl::sycl::detail::MemCpyCommand::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1381
cl::sycl::detail::kernel_param_kind_t::kind_pointer
@ kind_pointer
cl::sycl::detail::cgTypeToString
static std::string cgTypeToString(detail::CG::CGTYPE Type)
Definition: commands.cpp:1676
cl::sycl::detail::MapMemObject::emitInstrumentationData
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1154
cl::sycl::detail::AllocaCommandBase::MRequirement
Requirement MRequirement
Definition: commands.hpp:399
cl::sycl::property::queue::in_order
Definition: queue_properties.hpp:18
cl::sycl::detail::AllocaCommand::AllocaCommand
AllocaCommand(QueueImplPtr Queue, Requirement Req, bool InitFromUserData=true, AllocaCommandBase *LinkedAllocaCmd=nullptr)
Definition: commands.cpp:869
cl::sycl::detail::EmptyCommand::producesPiEvent
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
Definition: commands.cpp:1624
access.hpp
queue_impl.hpp
cl::sycl::detail::AllocaSubBufCommand::emitInstrumentationData
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:964
cl::sycl::detail::UnMapMemObject::emitInstrumentationData
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1215
cl::sycl::detail::SYCL_STREAM_NAME
constexpr const char * SYCL_STREAM_NAME
Definition: xpti_registry.hpp:28
scheduler.hpp
cl::sycl::detail::plugin::call_nocheck
RT::PiResult call_nocheck(ArgsT... Args) const
Calls the PiApi, traces the call, and returns the result.
Definition: plugin.hpp:170
cl::sycl::detail::write
void write(GlobalBufAccessorT &GlobalFlushBuf, size_t FlushBufferSize, unsigned WIOffset, const char *Str, unsigned Len, unsigned Padding=0)
Definition: stream.hpp:110
cl::sycl::detail::Command::MPreparedHostDepsEvents
std::vector< EventImplPtr > & MPreparedHostDepsEvents
Definition: commands.hpp:218
cl::sycl::range< 3 >
cl::sycl::detail::MemCpyCommand::emitInstrumentationData
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1303
_pi_kernel
Implementation of a PI Kernel for CUDA.
Definition: pi_cuda.hpp:624
cl::sycl::detail::ExecCGCommand::getAuxiliaryResources
std::vector< std::shared_ptr< const void > > getAuxiliaryResources() const
Definition: commands.cpp:1417
cl::sycl::detail::ExecCGCommand::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1892
cl::sycl::detail::Requirement
AccessorImplHost Requirement
Definition: accessor_impl.hpp:218
cl::sycl::detail::Command::getBlockReason
const char * getBlockReason() const
Definition: commands.cpp:823
cl::sycl::detail::SYCLMemObjI::getType
virtual MemObjType getType() const =0
cl::sycl::detail::Command::addUser
void addUser(Command *NewUser)
Definition: commands.hpp:124
cl::sycl::detail::MapMemObject::MapMemObject
MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req, void **DstPtr, QueueImplPtr Queue, access::mode MapMode)
Definition: commands.cpp:1145
cl::sycl::detail::Command::MLeafCounter
unsigned MLeafCounter
Counts the number of memory objects this command is a leaf for.
Definition: commands.hpp:262
cl::sycl::device::is_accelerator
bool is_accelerator() const
Check if device is an accelerator device.
Definition: device.cpp:108
cl::sycl::detail::UnMapMemObject::producesPiEvent
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
Definition: commands.cpp:1237
cl::sycl::detail::flushCrossQueueDeps
static void flushCrossQueueDeps(const std::vector< EventImplPtr > &EventImpls, const QueueImplPtr &Queue)
Definition: commands.cpp:218
cl::sycl::detail::EnqueueResultT
Result of command enqueueing.
Definition: commands.hpp:50
cl::sycl::detail::plugin::getBackend
backend getBackend(void) const
Definition: plugin.hpp:229
cl::sycl::range::size
size_t size() const
Definition: range.hpp:50
cl::sycl::detail::AccessorImplHost::MElemSize
unsigned int MElemSize
Definition: accessor_impl.hpp:115
cl::sycl::detail::CG::CodeplayHostTask
@ CodeplayHostTask
Definition: cg.hpp:171
PI_ERROR_UNKNOWN
@ PI_ERROR_UNKNOWN
Definition: pi.h:131
cl::sycl::detail::CG::CGTYPE
CGTYPE
Type of the command group.
Definition: cg.hpp:156
cl::sycl::detail::CG::FillUSM
@ FillUSM
Definition: cg.hpp:168
cl::sycl::detail::ProgramManager::getEliminatedKernelArgMask
KernelArgMask getEliminatedKernelArgMask(OSModuleHandle M, pi::PiProgram NativePrg, const std::string &KernelName)
Returns the mask for eliminated kernel arguments for the requested kernel within the native program.
Definition: program_manager.cpp:1370
cl::sycl::detail::pi::PiMem
::pi_mem PiMem
Definition: pi.hpp:111
cl::sycl::detail::CG::PrefetchUSM
@ PrefetchUSM
Definition: cg.hpp:169
cl::sycl::detail::ExecCGCommand::getCG
detail::CG & getCG() const
Definition: commands.hpp:552
cl::sycl::detail::Command::MTraceEventPrologComplete
bool MTraceEventPrologComplete
Flag to indicate if makeTraceEventProlog() has been run.
Definition: commands.hpp:302
cl::sycl::detail::AccessorImplHost
Definition: accessor_impl.hpp:74
cl::sycl::detail::ExecCGCommand::getStreams
std::vector< StreamImplPtr > getStreams() const
Definition: commands.cpp:1410
piEventsWait
pi_result piEventsWait(pi_uint32 num_events, const pi_event *event_list)
Definition: pi_esimd_emulator.cpp:1407
program.hpp
cl::sycl::detail::enqueueImpKernel
cl_int enqueueImpKernel(const QueueImplPtr &Queue, NDRDescT &NDRDesc, std::vector< ArgDesc > &Args, const std::shared_ptr< detail::kernel_bundle_impl > &KernelBundleImplPtr, const std::shared_ptr< detail::kernel_impl > &MSyclKernel, const std::string &KernelName, const detail::OSModuleHandle &OSModuleHandle, std::vector< RT::PiEvent > &RawEvents, RT::PiEvent *OutEvent, const std::function< void *(Requirement *Req)> &getMemAllocationFunc)
Definition: commands.cpp:2103
cl::sycl::detail::DepDesc::MDepRequirement
const Requirement * MDepRequirement
Requirement for the dependency.
Definition: commands.hpp:82
cl::sycl::detail::AccessorImplHost::MOffsetInBytes
unsigned int MOffsetInBytes
Definition: accessor_impl.hpp:116
program_impl.hpp
cl::sycl::detail::deviceToString
static std::string deviceToString(device Device)
Definition: commands.cpp:76
cl::sycl::device
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Definition: device.hpp:35
cl::sycl::detail::Command::emitInstrumentationData
virtual void emitInstrumentationData()=0
Instrumentation method which emits telemetry data.
cl::sycl::detail::DepDesc
Dependency between two commands.
Definition: commands.hpp:69
cl::sycl::detail::AllocaCommandBase::producesPiEvent
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
Definition: commands.cpp:865
cl.h
cl::sycl::detail::ExecCGCommand::clearAuxiliaryResources
void clearAuxiliaryResources()
Definition: commands.cpp:1428
cl::sycl::detail::Scheduler::getInstance
static Scheduler & getInstance()
Definition: scheduler.cpp:209
cl::sycl::detail::Command::MCommandName
std::string MCommandName
Buffer to build the command end-user understandable name.
Definition: commands.hpp:300
cl::sycl::detail::AccessorImplHost::MOffset
id< 3 > MOffset
Definition: accessor_impl.hpp:105
cl::sycl::detail::ArgDesc
Definition: cg_types.hpp:27
cl::sycl::detail::AllocaCommandBase::getSYCLMemObj
SYCLMemObjI * getSYCLMemObj() const
Definition: commands.hpp:371
cl::sycl::detail::Command::getType
CommandType getType() const
Definition: commands.hpp:127
cl::sycl::detail::AccessorImplHost::MIsSubBuffer
bool MIsSubBuffer
Definition: accessor_impl.hpp:117
cl::sycl::detail::Command::MFirstInstance
bool MFirstInstance
Flag to indicate if this is the first time we are seeing this payload.
Definition: commands.hpp:304
cl::sycl::detail::CGExecKernel::MNDRDesc
NDRDescT MNDRDesc
Stores ND-range description.
Definition: cg.hpp:244
piKernelGetGroupInfo
pi_result piKernelGetGroupInfo(pi_kernel kernel, pi_device device, pi_kernel_group_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
Definition: pi_esimd_emulator.cpp:1370
cl::sycl::kernel_id
Objects of the class identify kernel is some kernel_bundle related APIs.
Definition: kernel_bundle.hpp:39
cl::sycl::detail::CG::CopyAccToAcc
@ CopyAccToAcc
Definition: cg.hpp:161
cl::sycl::detail::MemCpyCommand::getWorkerQueue
const QueueImplPtr & getWorkerQueue() const final
Get the queue this command will be submitted to.
Definition: commands.cpp:1337
cl::sycl::backend::ext_oneapi_level_zero
@ ext_oneapi_level_zero
cl::sycl::detail::EventImplPtr
std::shared_ptr< detail::event_impl > EventImplPtr
Definition: memory_manager.hpp:31
cl::sycl::detail::applyFuncOnFilteredArgs
static void applyFuncOnFilteredArgs(const ProgramManager::KernelArgMask &EliminatedArgMask, std::vector< ArgDesc > &Args, std::function< void(detail::ArgDesc &Arg, int NextTrueIndex)> Func)
Definition: commands.cpp:89
cl::sycl::detail::CGHostTask
Definition: cg.hpp:482
commands.hpp
cl::sycl::detail::AllocaCommandBase
Base class for memory allocation commands.
Definition: commands.hpp:364
cl::sycl::detail::MapMemObject::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1191
cl::sycl::detail::plugin::call
void call(ArgsT... Args) const
Calls the API, traces the call, checks the result.
Definition: plugin.hpp:217
cl::sycl::detail::kernel_param_kind_t::kind_std_layout
@ kind_std_layout
piEnqueueEventsWaitWithBarrier
pi_result piEnqueueEventsWaitWithBarrier(pi_queue command_queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_esimd_emulator.cpp:1490
cl::sycl::detail::Command::enqueueImp
virtual cl_int enqueueImp()=0
Private interface. Derived classes should implement this method.
cl::sycl::detail::MemoryManager::prefetch_usm
static void prefetch_usm(void *Ptr, QueueImplPtr Queue, size_t Len, std::vector< RT::PiEvent > DepEvents, RT::PiEvent *OutEvent)
Definition: memory_manager.cpp:915
cl::sycl::detail::AllocaCommandBase::MMemAllocation
void * MMemAllocation
Definition: commands.hpp:383
cl::sycl::detail::Command
The Command class represents some action that needs to be performed on one or more memory objects.
Definition: commands.hpp:95
cl
We provide new interfaces for matrix muliply in this patch:
Definition: access.hpp:13
cl::sycl::detail::MemCpyCommandHost::getWorkerContext
const ContextImplPtr & getWorkerContext() const final
Get the context of the queue this command will be submitted to.
Definition: commands.cpp:1514
cl::sycl::detail::Scheduler::getDefaultHostQueue
QueueImplPtr getDefaultHostQueue()
Definition: scheduler.hpp:442
error_handling.hpp
cl::sycl::info::kernel::context
@ context
cl::sycl::detail::kernel_param_kind_t::kind_specialization_constants_buffer
@ kind_specialization_constants_buffer
cl::sycl::detail::waitForEvents
static void waitForEvents(const std::vector< EventImplPtr > &Events)
Definition: memory_manager.cpp:113
cl::sycl::detail::ReleaseCommand::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1123
cl::sycl::detail::ReverseRangeDimensionsForKernel
static void ReverseRangeDimensionsForKernel(NDRDescT &NDR)
Definition: commands.cpp:1965
cl::sycl::detail::XPTIRegistry::bufferAssociateNotification
static void bufferAssociateNotification(const void *, const void *)
Definition: xpti_registry.cpp:69
cl::sycl::detail::MemCpyCommand::producesPiEvent
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
Definition: commands.cpp:1341
cl::sycl::detail::EnqueueResultT::SyclEnqueueBlocked
@ SyclEnqueueBlocked
Definition: commands.hpp:54
cl::sycl::detail::tie
auto tie(Ts &... Args)
Definition: tuple.hpp:40
cl::sycl::detail::MemCpyCommandHost::emitInstrumentationData
void emitInstrumentationData() final
Instrumentation method which emits telemetry data.
Definition: commands.cpp:1484
cl::sycl::detail::AccessorImplHost::MSYCLMemObj
detail::SYCLMemObjI * MSYCLMemObj
Definition: accessor_impl.hpp:112
cl::sycl::access::mode::discard_read_write
@ discard_read_write
pi_mem
_pi_mem * pi_mem
Definition: pi.h:925
cl::sycl::detail::EnqueueResultT::SyclEnqueueFailed
@ SyclEnqueueFailed
Definition: commands.hpp:55
cl::sycl::detail::Command::BlockReason::HostTask
@ HostTask
cl::sycl::detail::ProgramManager::getInstance
static ProgramManager & getInstance()
Definition: program_manager.cpp:63
_pi_program
Implementation of PI Program on CUDA Module object.
Definition: pi_cuda.hpp:569
_pi_sampler
Implementation of samplers for CUDA.
Definition: pi_cuda.hpp:795
cl::sycl::detail::Command::MInstanceID
uint64_t MInstanceID
Instance ID tracked for the command.
Definition: commands.hpp:306
cl::sycl::detail::plugin
The plugin class provides a unified interface to the underlying low-level runtimes for the device-agn...
Definition: plugin.hpp:90
cl::sycl::detail::kernel_param_kind_t::kind_stream
@ kind_stream
program_manager.hpp
cl::sycl::detail::Command::getWorkerQueue
virtual const QueueImplPtr & getWorkerQueue() const
Get the queue this command will be submitted to.
Definition: commands.cpp:623
cl::sycl::detail::MemoryManager::release
static void release(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, void *MemAllocation, std::vector< EventImplPtr > DepEvents, RT::PiEvent &OutEvent)
Definition: memory_manager.cpp:234
cl::sycl::detail::ArgDesc::MIndex
int MIndex
Definition: cg_types.hpp:36
piextKernelSetArgSampler
pi_result piextKernelSetArgSampler(pi_kernel kernel, pi_uint32 arg_index, const pi_sampler *arg_value)
Definition: pi_esimd_emulator.cpp:1362
cl::sycl::detail::MemCpyCommandHost::getWorkerQueue
const QueueImplPtr & getWorkerQueue() const final
Get the queue this command will be submitted to.
Definition: commands.cpp:1518
cl::sycl::detail::demangleKernelName
static std::string demangleKernelName(std::string Name)
Definition: commands.cpp:73
piextKernelSetArgMemObj
pi_result piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, const pi_mem *arg_value)
Definition: pi_esimd_emulator.cpp:1357
cl::sycl::detail::ReleaseCommand::producesPiEvent
bool producesPiEvent() const final
Returns true iff the command produces a PI event on non-host devices.
Definition: commands.cpp:1141
cl::sycl::detail::kernel_param_kind_t::kind_accessor
@ kind_accessor
cl::sycl::detail::ArgDesc::MType
cl::sycl::detail::kernel_param_kind_t MType
Definition: cg_types.hpp:33
cl::sycl::detail::Scheduler::MGraphBuilder
GraphBuilder MGraphBuilder
Definition: scheduler.hpp:767
cl::sycl::detail::QueueImplPtr
std::shared_ptr< detail::queue_impl > QueueImplPtr
Definition: memory_manager.hpp:30
cl::sycl::detail::Command::MQueue
QueueImplPtr MQueue
Definition: commands.hpp:211
cl::sycl::detail::AllocaCommandBase::emitInstrumentationData
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:844
_pi_event::get
native_type get() const noexcept
Definition: pi_cuda.hpp:468
cl::sycl::detail::MemCpyCommand::getWorkerContext
const ContextImplPtr & getWorkerContext() const final
Get the context of the queue this command will be submitted to.
Definition: commands.cpp:1333
cl::sycl::detail::enqueue_kernel_launch::handleError
bool handleError(pi_result Error, const device_impl &DeviceImpl, pi_kernel Kernel, const NDRDescT &NDRDesc)
Analyzes error code and arguments of piEnqueueKernelLaunch to emit user-friendly exception describing...
Definition: enqueue_kernel.cpp:296
cl::sycl::detail::Command::MAddressString
std::string MAddressString
Buffer to build the address string.
Definition: commands.hpp:296
cl::sycl::detail::AllocaCommandBase::getRequirement
const Requirement * getRequirement() const final
Definition: commands.hpp:375
std::get
constexpr tuple_element< I, tuple< Types... > >::type & get(cl::sycl::detail::tuple< Types... > &Arg) noexcept
Definition: tuple.hpp:199
cl::sycl::detail::EmptyCommand::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1606
cl::sycl::detail::MemoryManager::map
static void * map(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue, access::mode AccessMode, unsigned int Dim, sycl::range< 3 > Size, sycl::range< 3 > AccessRange, sycl::id< 3 > AccessOffset, unsigned int ElementSize, std::vector< RT::PiEvent > DepEvents, RT::PiEvent &OutEvent)
Definition: memory_manager.cpp:800
cl::sycl::detail::Command::MIsBlockable
bool MIsBlockable
Indicates whether the command can be blocked from enqueueing.
Definition: commands.hpp:260
cl::sycl::detail::Command::MAddress
void * MAddress
Reserved for storing the object address such as SPIR-V or memory object address.
Definition: commands.hpp:294
cl::sycl::detail::DispatchHostTask::DispatchHostTask
DispatchHostTask(ExecCGCommand *ThisCmd, std::vector< interop_handle::ReqToMem > ReqToMem)
Definition: commands.cpp:269
cl::sycl::detail::OSModuleHandle
intptr_t OSModuleHandle
Uniquely identifies an operating system module (executable or a dynamic library)
Definition: os_util.hpp:48
cl::sycl::detail::Command::resolveReleaseDependencies
void resolveReleaseDependencies(std::set< Command * > &list)
Looks at all the dependencies for the release command and enables instrumentation to report these dep...
Definition: commands.cpp:781
cl::sycl::detail::EmptyCommand::EmptyCommand
EmptyCommand(QueueImplPtr Queue)
Definition: commands.cpp:1550
kernel_info.hpp
cl::sycl::detail::NDRDescT::Dims
size_t Dims
Definition: cg_types.hpp:123
cl::sycl::detail::AccessorImplHost::MAccessMode
access::mode MAccessMode
Definition: accessor_impl.hpp:110
cl::sycl::detail::Command::emitInstrumentation
void emitInstrumentation(uint16_t Type, const char *Txt=nullptr)
Emits an event of Type.
Definition: commands.cpp:684
cl::sycl::detail::ProgramManager::KernelArgMask
std::vector< bool > KernelArgMask
Definition: program_manager.hpp:74
cl::sycl::detail::device_impl::getHandleRef
RT::PiDevice & getHandleRef()
Get reference to PI device.
Definition: device_impl.hpp:63
cl::sycl::detail::Command::BlockReason::HostAccessor
@ HostAccessor
PI_INVALID_VALUE
@ PI_INVALID_VALUE
Definition: pi.h:91
cl::sycl::detail::CG::UpdateHost
@ UpdateHost
Definition: cg.hpp:165
cl::sycl::detail::Scheduler::ReadLockT
std::shared_lock< RWLockT > ReadLockT
Definition: scheduler.hpp:453
cl::sycl::detail::usm::free
void free(void *Ptr, const context &Ctxt, const code_location &CL)
Definition: usm_impl.cpp:181
cl::sycl::detail::NDRDescT::NumWorkGroups
sycl::range< 3 > NumWorkGroups
Number of workgroups, used to record the number of workgroups from the simplest form of parallel_for_...
Definition: cg_types.hpp:122
cl::sycl::detail::get_kernel_device_specific_info
Definition: kernel_info.hpp:79
cl::sycl::detail::Command::supportsPostEnqueueCleanup
virtual bool supportsPostEnqueueCleanup() const
Returns true iff this command can be freed by post enqueue cleanup.
Definition: commands.cpp:627
piEnqueueEventsWait
pi_result piEnqueueEventsWait(pi_queue command_queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_esimd_emulator.cpp:1485
cl::sycl::detail::Command::MType
CommandType MType
The type of the command.
Definition: commands.hpp:244
cl::sycl::detail::getSyclObjImpl
decltype(Obj::impl) getSyclObjImpl(const Obj &SyclObject)
Definition: common.hpp:198
backend_types.hpp
cl::sycl::detail::ExecCGCommand::clearStreams
void clearStreams()
Definition: commands.cpp:1423
cl::sycl::cl_int
std::int32_t cl_int
Definition: aliases.hpp:82
cl::sycl::detail::DispatchHostTask
Definition: commands.cpp:225
cl::sycl::detail::Scheduler::cleanupCommands
void cleanupCommands(const std::vector< Command * > &Cmds)
Definition: scheduler.cpp:440
cl::sycl::detail::MemoryManager::allocateMemSubBuffer
static void * allocateMemSubBuffer(ContextImplPtr TargetContext, void *ParentMemObj, size_t ElemSize, size_t Offset, range< 3 > Range, std::vector< EventImplPtr > DepEvents, RT::PiEvent &OutEvent)
Definition: memory_manager.cpp:420
cl::sycl::detail::MemoryManager::unmap
static void unmap(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue, void *MappedPtr, std::vector< RT::PiEvent > DepEvents, RT::PiEvent &OutEvent)
Definition: memory_manager.cpp:845
cl::sycl::detail::CG::CopyPtrToAcc
@ CopyPtrToAcc
Definition: cg.hpp:160
cl::sycl::detail::AllocaCommand::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:925
cl::sycl::detail::Command::MBlockReason
BlockReason MBlockReason
Definition: commands.hpp:276
cl::sycl::detail::Command::MEvent
EventImplPtr MEvent
Definition: commands.hpp:213
cl::sycl::detail::Command::emitInstrumentationDataProxy
void emitInstrumentationDataProxy()
Proxy method which calls emitInstrumentationData.
Definition: commands.cpp:413
cl::sycl::detail::ReleaseCommand::ReleaseCommand
ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd)
Definition: commands.cpp:1032
std
Definition: accessor.hpp:2616
kernel_desc.hpp
_pi_event
PI Event mapping to CUevent.
Definition: pi_cuda.hpp:458
cl::sycl::detail::NDRDescT::LocalSize
sycl::range< 3 > LocalSize
Definition: cg_types.hpp:117
cl::sycl::detail::AccessorImplHost::MMemoryRange
range< 3 > MMemoryRange
Definition: accessor_impl.hpp:109
cl::sycl::detail::Command::waitForEvents
void waitForEvents(QueueImplPtr Queue, std::vector< EventImplPtr > &RawEvents, RT::PiEvent &Event)
Definition: commands.cpp:340
cl::sycl::detail::EmptyCommand::addRequirement
void addRequirement(Command *DepCmd, AllocaCommandBase *AllocaCmd, const Requirement *Req)
Definition: commands.cpp:1562
cl::sycl::detail::NDRDescT::set
void set(sycl::range< Dims_ > NumWorkItems)
Definition: cg_types.hpp:57
cl::sycl::detail::kernel_param_kind_t::kind_invalid
@ kind_invalid
cl::sycl::detail::CG::CopyAccToPtr
@ CopyAccToPtr
Definition: cg.hpp:159
cl::sycl::detail::DepDesc::MDepCommand
Command * MDepCommand
The actual dependency command.
Definition: commands.hpp:80
sampler.hpp
cl::sycl::detail::AllocaCommand::emitInstrumentationData
void emitInstrumentationData() override
Instrumentation method which emits telemetry data.
Definition: commands.cpp:887
cl::sycl::detail::ExecCGCommand::ExecCGCommand
ExecCGCommand(std::unique_ptr< detail::CG > CommandGroup, QueueImplPtr Queue)
Definition: commands.cpp:1714
cl::sycl::access::mode::discard_write
@ discard_write
cl::sycl::detail::HostKernel
Definition: cg_types.hpp:246
cl::sycl::detail::MemCpyCommand::MemCpyCommand
MemCpyCommand(Requirement SrcReq, AllocaCommandBase *SrcAllocaCmd, Requirement DstReq, AllocaCommandBase *DstAllocaCmd, QueueImplPtr SrcQueue, QueueImplPtr DstQueue)
Definition: commands.cpp:1288
cl::sycl::detail::UnMapMemObject::UnMapMemObject
UnMapMemObject(AllocaCommandBase *DstAllocaCmd, Requirement Req, void **SrcPtr, QueueImplPtr Queue)
Definition: commands.cpp:1208
cl::sycl::detail::MemoryManager::copy_usm
static void copy_usm(const void *SrcMem, QueueImplPtr Queue, size_t Len, void *DstMem, std::vector< RT::PiEvent > DepEvents, RT::PiEvent *OutEvent)
Definition: memory_manager.cpp:858
kernel_bundle_impl.hpp
cl::sycl::detail::CG::Kernel
@ Kernel
Definition: cg.hpp:158
cl::sycl::detail::AllocaCommandBase::supportsPostEnqueueCleanup
bool supportsPostEnqueueCleanup() const final
Returns true iff this command can be freed by post enqueue cleanup.
Definition: commands.cpp:867
cl::sycl::detail::Command::addDep
Command * addDep(DepDesc NewDep, std::vector< Command * > &ToCleanUp)
Definition: commands.cpp:632
piextKernelSetArgPointer
pi_result piextKernelSetArgPointer(pi_kernel kernel, pi_uint32 arg_index, size_t arg_size, const void *arg_value)
Sets up pointer arguments for CL kernels.
Definition: pi_esimd_emulator.cpp:1913
cl::sycl::detail::MemoryManager::allocate
static void * allocate(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, bool InitFromUserData, void *HostPtr, std::vector< EventImplPtr > DepEvents, RT::PiEvent &OutEvent)
Definition: memory_manager.cpp:271
cl::sycl::detail::Command::waitForPreparedHostEvents
void waitForPreparedHostEvents() const
Definition: commands.cpp:335
cl::sycl::detail::Scheduler::MGraphLock
RWLockT MGraphLock
Definition: scheduler.hpp:768
cl::sycl::access::mode::read_write
@ read_write
cl::sycl::detail::CG::Fill
@ Fill
Definition: cg.hpp:164
cl::sycl::detail::ExecCGCommand::supportsPostEnqueueCleanup
bool supportsPostEnqueueCleanup() const final
Returns true iff this command can be freed by post enqueue cleanup.
Definition: commands.cpp:2567
cl::sycl::detail::Command::getEvent
const EventImplPtr & getEvent() const
Definition: commands.hpp:153
cl::sycl::detail::Command::processDepEvent
Command * processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, std::vector< Command * > &ToCleanUp)
Perform glueing of events from different contexts.
Definition: commands.cpp:579
cl::sycl::detail::Command::MPreparedDepsEvents
std::vector< EventImplPtr > & MPreparedDepsEvents
Dependency events prepared for waiting by backend.
Definition: commands.hpp:217
piEnqueueNativeKernel
pi_result piEnqueueNativeKernel(pi_queue queue, void(*user_func)(void *), void *args, size_t cb_args, pi_uint32 num_mem_objects, const pi_mem *mem_list, const void **args_mem_loc, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event)
Definition: pi_esimd_emulator.cpp:1829
cl::sycl::exception
Definition: exception.hpp:63
kernel_impl.hpp
cl::sycl::detail::ArgDesc::MSize
int MSize
Definition: cg_types.hpp:35
cl::sycl::detail::UnMapMemObject::printDot
void printDot(std::ostream &Stream) const final
Definition: commands.cpp:1271
cl::sycl::detail::event_impl
Definition: event_impl.hpp:33
cl::sycl::access::mode
mode
Definition: access.hpp:28
cl::sycl::detail::AccessorImplHost::MAccessRange
range< 3 > MAccessRange
Definition: accessor_impl.hpp:107
cl::sycl::detail::adjustNDRangePerKernel
static void adjustNDRangePerKernel(NDRDescT &NDR, RT::PiKernel Kernel, const device_impl &DeviceImpl)
Definition: commands.cpp:1937
cl::sycl::detail::Command::producesPiEvent
virtual bool producesPiEvent() const
Returns true iff the command produces a PI event on non-host devices.
Definition: commands.cpp:625
cl::sycl::detail::Command::getWorkerContext
virtual const ContextImplPtr & getWorkerContext() const
Get the context of the queue this command will be submitted to.
Definition: commands.cpp:619
cl::sycl::detail::ArgDesc::MPtr
void * MPtr
Definition: cg_types.hpp:34
cl::sycl::detail::UpdateHostRequirementCommand::UpdateHostRequirementCommand
UpdateHostRequirementCommand(QueueImplPtr Queue, Requirement Req, AllocaCommandBase *SrcAllocaCmd, void **DstPtr)
Definition: commands.cpp:1643
cl::sycl::detail::ReleaseCommand::supportsPostEnqueueCleanup
bool supportsPostEnqueueCleanup() const final
Returns true iff this command can be freed by post enqueue cleanup.
Definition: commands.cpp:1143
cl::sycl::detail::AllocaCommandBase::MLinkedAllocaCmd
AllocaCommandBase * MLinkedAllocaCmd
Alloca command linked with current command.
Definition: commands.hpp:390
cl::sycl::detail::Command::makeTraceEventEpilog
void makeTraceEventEpilog()
If prolog has been run, run epilog; this must be guarded by a check for xptiTraceEnabled().
Definition: commands.cpp:567
cl::sycl::kernel
Provides an abstraction of a SYCL kernel.
Definition: kernel.hpp:67
cl::sycl::detail::Command::ALLOCA_SUB_BUF
@ ALLOCA_SUB_BUF
Definition: commands.hpp:101
cl::sycl::detail::Command::emitEdgeEventForEventDependence
void emitEdgeEventForEventDependence(Command *Cmd, RT::PiEvent &EventAddr)
Creates an edge event when the dependency is an event.
Definition: commands.cpp:478
cl::sycl::detail::Command::emitEdgeEventForCommandDependence
void emitEdgeEventForCommandDependence(Command *Cmd, void *ObjAddr, bool IsCommand, std::optional< access::mode > AccMode=std::nullopt)
Creates an edge event when the dependency is a command.
Definition: commands.cpp:429
cl::sycl::detail::EmptyCommand
The empty command does nothing during enqueue.
Definition: commands.hpp:323
cl::sycl::detail::ProgramManager::getOrCreateKernel
std::tuple< RT::PiKernel, std::mutex *, RT::PiProgram > getOrCreateKernel(OSModuleHandle M, const ContextImplPtr &ContextImpl, const DeviceImplPtr &DeviceImpl, const std::string &KernelName, const program_impl *Prg)
Definition: program_manager.cpp:599
cl::sycl::detail::getPiEvents
static std::vector< RT::PiEvent > getPiEvents(const std::vector< EventImplPtr > &EventImpls)
Definition: commands.cpp:208
stream_impl.hpp
PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE
@ PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE
Definition: pi.h:371
sampler_impl.hpp
cl::sycl::device::is_host
bool is_host() const
Check whether this device is a host device.
Definition: device.cpp:102
cl::sycl::detail::Command::MStreamID
int32_t MStreamID
The stream under which the traces are emitted.
Definition: commands.hpp:291
cl::sycl::detail::Scheduler::GraphBuilder::connectDepEvent
Command * connectDepEvent(Command *const Cmd, EventImplPtr DepEvent, const DepDesc &Dep, std::vector< Command * > &ToCleanUp)
Perform connection of events in multiple contexts.
Definition: graph_builder.cpp:1299
cl::sycl::info::kernel_device_specific::compile_work_group_size
@ compile_work_group_size
__SYCL_INLINE_NAMESPACE
#define __SYCL_INLINE_NAMESPACE(X)
Definition: defines_elementary.hpp:12
cl::sycl::detail::MemoryManager::fill
static void fill(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue, size_t PatternSize, const char *Pattern, unsigned int Dim, sycl::range< 3 > Size, sycl::range< 3 > AccessRange, sycl::id< 3 > AccessOffset, unsigned int ElementSize, std::vector< RT::PiEvent > DepEvents, RT::PiEvent &OutEvent)
Definition: memory_manager.cpp:774
cl::sycl::detail::CG::getType
CGTYPE getType()
Definition: cg.hpp:197
memory_manager.hpp